diff --git a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java index c81b1190e9d6..6fe25ecdcd5c 100644 --- a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java +++ b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java @@ -733,10 +733,10 @@ public enum CassandraRelevantProperties /** Class used to discover/load the proper SAI index components file for a given sstable. */ SAI_ANN_USE_SYNTHETIC_SCORE("cassandra.sai.ann_use_synthetic_score", "false"), - + /** The current version of the SAI on-disk index format. */ SAI_CURRENT_VERSION("cassandra.sai.latest.version", "ec"), - + SAI_CUSTOM_COMPONENTS_DISCOVERY_CLASS("cassandra.sai.custom_components_discovery_class"), SAI_ENABLE_EDGES_CACHE("cassandra.sai.enable_edges_cache", "false"), SAI_ENABLE_GENERAL_ORDER_BY("cassandra.sai.general_order_by", "true"), @@ -874,8 +874,7 @@ public enum CassandraRelevantProperties // NVQ number of subvectors. This isn't really expected to change much so we're only exposing // it as a global variable in case it's needed. SAI_VECTOR_NVQ_NUM_SUB_VECTORS("cassandra.sai.vector.nvq_num_sub_vectors", "2"), - - // The allowed ratio of extra rows (that map to "holes" in the ordinal space) to total rows indexed in the graph +// The allowed ratio of extra rows (that map to "holes" in the ordinal space) to total rows indexed in the graph // Higher percentages will result in more memory utilized to store the extra postings mappings and larger graph // file sizes to store the empty nodes. SAI_VECTOR_ORDINAL_HOLE_DENSITY_LIMIT("cassandra.sai.vector.ordinal_hole_density_limit", "0.01"), @@ -890,7 +889,6 @@ public enum CassandraRelevantProperties * build a potential result set for search-then-sort query execution. 
*/ SAI_VECTOR_SEARCH_MAX_MATERIALIZE_KEYS("cassandra.sai.vector_search.max_materialized_keys", "16000"), - /** Controls the maximum top-k limit for vector search */ SAI_VECTOR_SEARCH_MAX_TOP_K("cassandra.sai.vector_search.max_top_k", "1000"), SAI_VECTOR_USE_PRUNING_DEFAULT("cassandra.sai.jvector.use_pruning_default", "1000"), @@ -1059,6 +1057,8 @@ public enum CassandraRelevantProperties // i.e. that all replicas except for at most one in the cluster (across all DCs) must accept the write for it to be successful. THREE_MEANS_ALL_BUT_ONE("dse.consistency_level.three_means_all_but_one", "false"), TOLERATE_SSTABLE_SIZE("cassandra.tolerate_sstable_size"), + /** To be used for tests: whether trie cursors should be verified for correctness. */ + TRIE_DEBUG("cassandra.debug_tries"), /** * Allows to set custom current trie index format. This node will produce sstables in this format. */ diff --git a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java index 3bfcabe5c1ea..646ecb2b09d1 100644 --- a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java @@ -494,7 +494,7 @@ private void updatePerBatchMetrics(Collection mutations) for (PartitionUpdate update : mutation.getPartitionUpdates()) { for (Row row : update.rows()) - nrUpdatedColumns += row.columns().size(); + nrUpdatedColumns += row.columnCount(); } } metrics.update(type, nrUpdatedPartitions, nrUpdatedColumns); diff --git a/src/java/org/apache/cassandra/db/DeletionTime.java b/src/java/org/apache/cassandra/db/DeletionTime.java index 5c8153886604..cdfc0872ba2c 100644 --- a/src/java/org/apache/cassandra/db/DeletionTime.java +++ b/src/java/org/apache/cassandra/db/DeletionTime.java @@ -24,6 +24,7 @@ import org.apache.cassandra.cache.IMeasurableMemory; import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellData; import 
org.apache.cassandra.io.ISerializer; import org.apache.cassandra.io.sstable.format.Version; import org.apache.cassandra.io.util.DataInputPlus; @@ -68,9 +69,9 @@ static DeletionTime buildUnsafeWithUnsignedInteger(long markedForDeleteAt, int l : new DeletionTime(markedForDeleteAt, localDeletionTimeUnsignedInteger); } - private DeletionTime(long markedForDeleteAt, long localDeletionTime) + protected DeletionTime(long markedForDeleteAt, long localDeletionTime) { - this(markedForDeleteAt, Cell.deletionTimeLongToUnsignedInteger(localDeletionTime)); + this(markedForDeleteAt, CellData.deletionTimeLongToUnsignedInteger(localDeletionTime)); } private DeletionTime(long markedForDeleteAt, int localDeletionTimeUnsignedInteger) @@ -95,7 +96,7 @@ public long markedForDeleteAt() */ public long localDeletionTime() { - return Cell.deletionTimeUnsignedIntegerToLong(localDeletionTimeUnsignedInteger); + return CellData.deletionTimeUnsignedIntegerToLong(localDeletionTimeUnsignedInteger); } /** @@ -170,7 +171,7 @@ public boolean deletes(LivenessInfo info) return deletes(info.timestamp()); } - public boolean deletes(Cell cell) + public boolean deletes(CellData cell) { return deletes(cell.timestamp()); } diff --git a/src/java/org/apache/cassandra/db/ExpirationDateOverflowHandling.java b/src/java/org/apache/cassandra/db/ExpirationDateOverflowHandling.java index 35c24805bc8b..52dbb4fe63a2 100644 --- a/src/java/org/apache/cassandra/db/ExpirationDateOverflowHandling.java +++ b/src/java/org/apache/cassandra/db/ExpirationDateOverflowHandling.java @@ -28,6 +28,7 @@ import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.db.rows.BufferCell; import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellData; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ClientWarn; @@ -79,7 +80,7 @@ public static void 
maybeApplyExpirationDateOverflowPolicy(TableMetadata metadata // Check for localExpirationTime overflow (CASSANDRA-14092) to apply a policy if needed long nowInSecs = currentTimeMillis() / 1000; - if (((long) ttl + nowInSecs) > Cell.getVersionedMaxDeletiontionTime()) + if (((long) ttl + nowInSecs) > CellData.getVersionedMaxDeletiontionTime()) { switch (policy) { @@ -121,13 +122,13 @@ public static long computeLocalExpirationTime(long nowInSec, int timeToLive) { long localExpirationTime = (long) (nowInSec + timeToLive); - long cellMaxDeletionTime = Cell.getVersionedMaxDeletiontionTime(); + long cellMaxDeletionTime = CellData.getVersionedMaxDeletiontionTime(); return localExpirationTime <= cellMaxDeletionTime ? localExpirationTime : cellMaxDeletionTime; } private static String getMaxExpirationDateTS() { - return Cell.getVersionedMaxDeletiontionTime() == Cell.MAX_DELETION_TIME_2038_LEGACY_CAP ? "2038-01-19T03:14:06+00:00" + return CellData.getVersionedMaxDeletiontionTime() == Cell.MAX_DELETION_TIME_2038_LEGACY_CAP ? "2038-01-19T03:14:06+00:00" : "2106-02-07T06:28:13+00:00"; } } diff --git a/src/java/org/apache/cassandra/db/IDataSize.java b/src/java/org/apache/cassandra/db/IDataSize.java new file mode 100644 index 000000000000..8611a7b1e3e1 --- /dev/null +++ b/src/java/org/apache/cassandra/db/IDataSize.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db; + +/// Shared interface for providing data size information +public interface IDataSize +{ + int dataSize(); +} diff --git a/src/java/org/apache/cassandra/db/LivenessInfo.java b/src/java/org/apache/cassandra/db/LivenessInfo.java index 9bf480aa87d3..804dbfe0b769 100644 --- a/src/java/org/apache/cassandra/db/LivenessInfo.java +++ b/src/java/org/apache/cassandra/db/LivenessInfo.java @@ -21,6 +21,7 @@ import org.apache.cassandra.cache.IMeasurableMemory; import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellData; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.utils.ObjectSizes; @@ -37,7 +38,7 @@ * unaffected (of course, the rest of said row data might be ttl'ed on its own but this is * separate). 
*/ -public class LivenessInfo implements IMeasurableMemory +public class LivenessInfo implements IMeasurableMemory, IDataSize { public static final long NO_TIMESTAMP = Long.MIN_VALUE; public static final int NO_TTL = Cell.NO_TTL; @@ -329,13 +330,13 @@ public LivenessInfo withUpdatedTimestamp(long newTimestamp) } } - private static class ExpiringLivenessInfo extends LivenessInfo + protected static class ExpiringLivenessInfo extends LivenessInfo { private final int ttl; private final long localExpirationTime; private static final long UNSHARED_HEAP_SIZE = ObjectSizes.measure(new ExpiringLivenessInfo(-1, -1, -1)); - private ExpiringLivenessInfo(long timestamp, int ttl, long localExpirationTime) + protected ExpiringLivenessInfo(long timestamp, int ttl, long localExpirationTime) { super(timestamp); assert ttl != NO_TTL && localExpirationTime != NO_EXPIRATION_TIME; @@ -375,7 +376,7 @@ public void digest(Digest digest) // As of 5.0, local expiration times are encoded as unsigned integers on disk, so we can do the // same thing here to populate the digest. This supports extended TTLs, but also maintains digest // compatibility with previous versions, avoiding false digest mismatches during upgrades. 
- digest.updateWithInt(Cell.deletionTimeLongToUnsignedInteger(localExpirationTime)); + digest.updateWithInt(CellData.deletionTimeLongToUnsignedInteger(localExpirationTime)); digest.updateWithInt(ttl); } diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java index a180bc3c5849..c352b3db30e2 100644 --- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java +++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java @@ -682,7 +682,7 @@ public UnfilteredRowIterator queryMemtableAndDisk(ColumnFamilyStore cfs, ReadExe public UnfilteredRowIterator queryMemtableAndDisk(ColumnFamilyStore cfs, ColumnFamilyStore.ViewFragment view, - Function>> rowTransformer, + Function>> rowTransformer, ReadExecutionController executionController) { assert executionController != null && executionController.validForReadOn(cfs); @@ -700,7 +700,7 @@ private UnfilteredRowIterator queryMemtableAndDiskInternal(ColumnFamilyStore cfs private UnfilteredRowIterator queryMemtableAndDiskInternal(ColumnFamilyStore cfs, ColumnFamilyStore.ViewFragment view, - Function>> rowTransformer, + Function>> rowTransformer, ReadExecutionController controller, long startTimeNanos) { @@ -985,7 +985,7 @@ private boolean queriesMulticellType() * no collection or counters are included). * This method assumes the filter is a {@code ClusteringIndexNamesFilter}. 
*/ - private UnfilteredRowIterator queryMemtableAndSSTablesInTimestampOrder(ColumnFamilyStore cfs, ColumnFamilyStore.ViewFragment view, Function>> rowTransformer, ClusteringIndexNamesFilter filter, ReadExecutionController controller, long startTimeNanos) + private UnfilteredRowIterator queryMemtableAndSSTablesInTimestampOrder(ColumnFamilyStore cfs, ColumnFamilyStore.ViewFragment view, Function>> rowTransformer, ClusteringIndexNamesFilter filter, ReadExecutionController controller, long startTimeNanos) { if (Tracing.traceSinglePartitions()) Tracing.trace("Acquiring sstable references"); diff --git a/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java b/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java index a76a760de917..d081eab4eb6b 100644 --- a/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java +++ b/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java @@ -36,6 +36,7 @@ import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.rows.Unfiltered; import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.schema.TableMetadataRef; import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.FBUtilities; @@ -86,7 +87,7 @@ public abstract class AbstractAllocatorMemtable extends AbstractMemtableWithComm * the estimate is updated only whenever the number of operations on the memtable increases significantly from the * last update. This estimate is not very accurate but should be ok for planning or diagnostic purposes. 
*/ - private volatile MemtableAverageRowSize estimatedAverageRowSize; + protected volatile MemtableAverageRowSize estimatedAverageRowSize; @VisibleForTesting static MemtablePool createMemtableAllocatorPool() @@ -128,9 +129,10 @@ public static MemtablePool createMemtableAllocatorPoolInternal(Config.MemtableAl public AbstractAllocatorMemtable(AtomicReference commitLogLowerBound, TableMetadataRef metadataRef, Owner owner) { super(metadataRef, commitLogLowerBound); - this.allocator = MEMORY_POOL.newAllocator(metadataRef.toString()); - this.initialComparator = metadata.get().comparator; - this.initialFactory = metadata().params.memtable.factory(); + TableMetadata tableMetadata = metadataRef.get(); + this.allocator = MEMORY_POOL.newAllocator(tableMetadata.toString()); + this.initialComparator = tableMetadata.comparator; + this.initialFactory = tableMetadata.params.memtable.factory(); this.owner = owner; scheduleFlush(); } @@ -170,8 +172,9 @@ public boolean shouldSwitch(ColumnFamilyStore.FlushReason reason) switch (reason) { case SCHEMA_CHANGE: - return initialComparator != metadata().comparator // If the CF comparator has changed, because our partitions reference the old one - || !initialFactory.equals(metadata().params.memtable.factory()); // If a different type of memtable is requested + TableMetadata tableMetadata = metadata.get(); // do not use metadata() as this may be overridden + return initialComparator != tableMetadata.comparator // If the CF comparator has changed, because our partitions reference the old one + || !initialFactory.equals(tableMetadata.params.memtable.factory()); // If a different type of memtable is requested case OWNED_RANGES_CHANGE: return false; // by default we don't use the local ranges, thus this has no effect default: diff --git a/src/java/org/apache/cassandra/db/memtable/MemtableAverageRowSize.java b/src/java/org/apache/cassandra/db/memtable/MemtableAverageRowSize.java index 24afc3df6b03..7d309c45afd2 100644 --- 
a/src/java/org/apache/cassandra/db/memtable/MemtableAverageRowSize.java +++ b/src/java/org/apache/cassandra/db/memtable/MemtableAverageRowSize.java @@ -19,10 +19,15 @@ package org.apache.cassandra.db.memtable; import org.apache.cassandra.db.DataRange; +import org.apache.cassandra.db.IDataSize; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.TrieTombstoneMarker; import org.apache.cassandra.db.rows.Unfiltered; import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.tries.DeletionAwareTrie; +import org.apache.cassandra.db.tries.Direction; +import org.apache.cassandra.db.tries.Trie; import org.apache.cassandra.io.sstable.SSTableReadsListener; class MemtableAverageRowSize @@ -32,6 +37,62 @@ class MemtableAverageRowSize public final long rowSize; public final long operations; + static class SizeCalculator implements DeletionAwareTrie.ValueConsumer + { + long totalSize = 0; + long count = 0; + + @Override + public void content(Object o) + { + if (o instanceof IDataSize) + { + totalSize += ((IDataSize) o).dataSize(); + ++count; + } + } + + @Override + public void deletionMarker(TrieTombstoneMarker marker) + { + // Count one side of the marker + TrieTombstoneMarker.Covering startedDeletion = marker.rightDeletion(); + if (startedDeletion != null) + { + totalSize += startedDeletion.dataSize(); + ++count; + } + } + } + + public MemtableAverageRowSize(Memtable memtable, DeletionAwareTrie trie) + { + // If this is a trie-based memtable, get the row sizes from the trie elements. This achieves two things: + // - makes sure the size used is the size reflected in the memtable's dataSize + // (which e.g. excludes clustering keys) + // - avoids the conversion to Row, which has non-trivial cost + + SizeCalculator sizeCalculator = new SizeCalculator(); + trie.process(Direction.FORWARD, sizeCalculator); + + this.rowSize = sizeCalculator.count > 0 ? 
sizeCalculator.totalSize / sizeCalculator.count : 0; + this.operations = memtable.operationCount(); + } + + public MemtableAverageRowSize(Memtable memtable, Trie trie) + { + // If this is a trie-based memtable, get the row sizes from the trie elements. This achieves two things: + // - makes sure the size used is the size reflected in the memtable's dataSize + // (which e.g. excludes clustering keys) + // - avoids the conversion to Row, which has non-trivial cost + + + SizeCalculator sizeCalculator = new SizeCalculator(); + trie.process(Direction.FORWARD, sizeCalculator); + + this.rowSize = sizeCalculator.count > 0 ? sizeCalculator.totalSize / sizeCalculator.count : 0; + this.operations = memtable.operationCount(); + } public MemtableAverageRowSize(Memtable memtable) { diff --git a/src/java/org/apache/cassandra/db/memtable/TrieCellData.java b/src/java/org/apache/cassandra/db/memtable/TrieCellData.java new file mode 100644 index 000000000000..cf00596f5cac --- /dev/null +++ b/src/java/org/apache/cassandra/db/memtable/TrieCellData.java @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.memtable; + +import java.nio.ByteBuffer; + +import org.agrona.concurrent.UnsafeBuffer; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.db.rows.AbstractBufferCellData; +import org.apache.cassandra.db.rows.CellData; +import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; + +/// [CellData] objects stored in in-memory tries. +/// Uses one 32-byte cell of an in-memory trie's buffer to store the data of a cell (without path and column id). This +/// includes liveness (timestamp/ttl/local deletion time) and value. If the value is small enough to fit, it is placed +/// directly inside the 32-byte cell; otherwise we use the given saver/loader to map it to a long integer handle and +/// store the handle. +public abstract class TrieCellData extends AbstractBufferCellData +{ + public interface ExternalBufferHandler + { + /// Store the data in the given buffer and return an integer handle for it (e.g. a native address). + long store(ByteBuffer buffer, int length) throws TrieSpaceExhaustedException; + + /// Load the data from the given handle (e.g. a native address) and return it in a buffer. + ByteBuffer load(long handle, int length); + + /// Release a handle which will no longer be used. + void release(long handle, int length); + } + + public static final int OFFSET_TIMESTAMP = 24; + public static final int OFFSET_LOCAL_DELETION_TIME = 20; + public static final int OFFSET_TTL = 16; + /// If the value fits, it is placed starting from this offset in the trie cell. + public static final int OFFSET_DATA = 0; + /// If the value does not fit, these 8 bytes hold its external handle. + public static final int OFFSET_EXTERNAL_HANDLE = 8; + /// If the value does not fit, these 4 bytes hold its length. 
+ public static final int OFFSET_EXTERNAL_LENGTH = 4; + /// Byte storing whether an externally-stored cell is a counter + public static final int OFFSET_EXTERNAL_IS_COUNTER = 0; + + /// Length of an embedded counter + private static final int OFFSET_COUNTER_LENGTH = 15; + + final UnsafeBuffer buffer; + final int inBufferPos; + + /// Store the given cell data in the 32 bytes of `buffer` starting at position `inBufferPos`. If the value cannot fit in + /// this space, use the given external saver to store it, and save the resulting handle and the length of the value. + public static int serialize(CellData cell, + UnsafeBuffer buffer, int inBufferPos, + ExternalBufferHandler externalBufferSaver) + throws TrieSpaceExhaustedException + { + ByteBuffer value = cell.buffer(); + int length = value.remaining(); + buffer.putLongOrdered(inBufferPos + OFFSET_TIMESTAMP, cell.timestamp()); + buffer.putIntOrdered(inBufferPos + OFFSET_LOCAL_DELETION_TIME, CellData.deletionTimeLongToUnsignedInteger(cell.localDeletionTime())); + buffer.putIntOrdered(inBufferPos + OFFSET_TTL, cell.ttl()); + + boolean isCounterCell = cell.isCounterCell(); + if (isCounterCell && length <= OFFSET_COUNTER_LENGTH) + { + assert length <= OFFSET_COUNTER_LENGTH; + buffer.putByte(inBufferPos + OFFSET_COUNTER_LENGTH, (byte) length); + buffer.putBytes(inBufferPos + OFFSET_DATA, value, value.position(), length); + return TrieMemtable.TrieSerializer.TYPE_CELL_COUNTER; + } + + if (!isCounterCell && cellValueCanBeEmbedded(cell, length)) + { + // Storing value embedded in trie cell. This may overwrite the TTL/local deletion, which we won't read if + // the length is above OFFSET_TTL. 
+ + // using the inBufferPos, length version of putBytes to make sure the source buffer's position is not touched + buffer.putBytes(inBufferPos + OFFSET_DATA, value, value.position(), length); + return length; + } + + // stored externally + long handle = externalBufferSaver.store(value, length); + buffer.putLongOrdered(inBufferPos + OFFSET_EXTERNAL_HANDLE, handle); + buffer.putIntOrdered(inBufferPos + OFFSET_EXTERNAL_LENGTH, length); + buffer.putByte(inBufferPos + OFFSET_EXTERNAL_IS_COUNTER, (byte) (isCounterCell ? 1 : 0)); + return TrieMemtable.TrieSerializer.TYPE_CELL_EXTERNAL_VALUE; + } + + private static boolean cellValueCanBeEmbedded(CellData cell, int length) + { + // No expiration time implies no TTL + return length <= OFFSET_TTL || length <= OFFSET_TIMESTAMP && cell.localDeletionTime() == NO_DELETION_TIME; + } + + TrieCellData(UnsafeBuffer buffer, int inBufferPos) + { + this.buffer = buffer; + this.inBufferPos = inBufferPos; + } + + @Override + public boolean isCounterCell() + { + return false; + } + + @Override + public long timestamp() + { + return buffer.getLong(inBufferPos + OFFSET_TIMESTAMP); + } + + @Override + public int ttl() + { + return buffer.getInt(inBufferPos + OFFSET_TTL); + } + + @Override + public int localDeletionTimeAsUnsignedInt() + { + return buffer.getInt(inBufferPos + OFFSET_LOCAL_DELETION_TIME); + } + + @Override + public long unsharedHeapSizeExcludingData() + { + // Managed separately by trie/external handler + return 0; + } + + @Override + public long unsharedHeapSize() + { + // Managed separately by trie/external handler + return 0; + } + + /// Bytes of the value that `serialize` stores outside the trie cell (0 when embedded); counter values are embedded only up to `OFFSET_COUNTER_LENGTH` bytes. + public static long offTrieSize(CellData cell) + { + int sz = cell.valueSize(); + boolean embedded = cell.isCounterCell() ? sz <= OFFSET_COUNTER_LENGTH : cellValueCanBeEmbedded(cell, sz); + return embedded ? 0 : sz;
+ } + + @Override + public int dataSize() + { + return TypeSizes.LONG_SIZE + + TypeSizes.INT_SIZE + + TypeSizes.INT_SIZE + + valueSize(); + } + + public static TrieCellData embedded(UnsafeBuffer buffer, int inBufferPos, int length) + { + return length <= OFFSET_TTL ? new Embedded(buffer, inBufferPos, length) + : new EmbeddedNoTTL(buffer, inBufferPos, length); + } + + public static class Embedded extends TrieCellData + { + final int length; + + public Embedded(UnsafeBuffer buffer, int inBufferPos, int length) + { + super(buffer, inBufferPos); + this.length = length; + } + + + @Override + public int valueSize() + { + return length; + } + + @Override + public ByteBuffer value() + { + ByteBuffer buf = buffer.byteBuffer().duplicate(); + buf.position(inBufferPos + OFFSET_DATA); + buf.limit(inBufferPos + OFFSET_DATA + length); + return buf; // we don't need to slice + } + } + + public static class EmbeddedNoTTL extends Embedded + { + public EmbeddedNoTTL(UnsafeBuffer buffer, int inBufferPos, int length) + { + super(buffer, inBufferPos, length); + } + + @Override + public int ttl() + { + return NO_TTL; + } + + @Override + public int localDeletionTimeAsUnsignedInt() + { + return NO_DELETION_TIME_UNSIGNED_INTEGER; + } + } + + public static class Counter extends Embedded + { + public Counter(UnsafeBuffer buffer, int inBufferPos) + { + super(buffer, inBufferPos, buffer.getByte(inBufferPos + OFFSET_COUNTER_LENGTH)); + } + + @Override + public boolean isCounterCell() + { + return true; + } + } + + public static class External extends TrieCellData + { + final ExternalBufferHandler handler; + final boolean isCounterCell; + + public External(UnsafeBuffer buffer, int inBufferPos, ExternalBufferHandler handler) + { + super(buffer, inBufferPos); + this.handler = handler; + this.isCounterCell = buffer.getByte(inBufferPos + OFFSET_EXTERNAL_IS_COUNTER) != 0; + } + + @Override + public boolean isCounterCell() + { + return isCounterCell; + } + + @Override + public int valueSize() 
+ { + return buffer.getInt(inBufferPos + OFFSET_EXTERNAL_LENGTH); + } + + @Override + public ByteBuffer value() + { + long handle = buffer.getLong(inBufferPos + OFFSET_EXTERNAL_HANDLE); + int length = buffer.getInt(inBufferPos + OFFSET_EXTERNAL_LENGTH); + return handler.load(handle, length); + } + + public static void release(UnsafeBuffer buffer, int inBufferPos, ExternalBufferHandler handler) + { + long handle = buffer.getLong(inBufferPos + OFFSET_EXTERNAL_HANDLE); + int length = buffer.getInt(inBufferPos + OFFSET_EXTERNAL_LENGTH); + handler.release(handle, length); + } + } +} diff --git a/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java index 7be17982fb00..7b86e921f3b6 100644 --- a/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java +++ b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java @@ -17,7 +17,9 @@ */ package org.apache.cassandra.db.memtable; +import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -27,34 +29,47 @@ import java.util.function.Predicate; import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Predicates; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.agrona.concurrent.UnsafeBuffer; import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.BufferDecoratedKey; +import org.apache.cassandra.db.Clustering; import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Columns; import org.apache.cassandra.db.DataRange; import org.apache.cassandra.db.DecoratedKey; -import org.apache.cassandra.db.DeletionInfo; -import org.apache.cassandra.db.MutableDeletionInfo; +import org.apache.cassandra.db.LivenessInfo; import org.apache.cassandra.db.PartitionPosition; import 
org.apache.cassandra.db.RegularAndStaticColumns; import org.apache.cassandra.db.Slices; import org.apache.cassandra.db.commitlog.CommitLogPosition; import org.apache.cassandra.db.filter.ClusteringIndexFilter; import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; import org.apache.cassandra.db.partitions.AbstractUnfilteredPartitionIterator; import org.apache.cassandra.db.partitions.Partition; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.db.partitions.TrieBackedPartition; import org.apache.cassandra.db.partitions.TriePartitionUpdate; import org.apache.cassandra.db.partitions.TriePartitionUpdater; +import org.apache.cassandra.db.partitions.TriePartitionUpdaterLegacyIndex; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellData; import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.TrieBackedRow; +import org.apache.cassandra.db.rows.TrieTombstoneMarker; import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.tries.ContentManagerPojo; +import org.apache.cassandra.db.tries.ContentSerializer; +import org.apache.cassandra.db.tries.DeletionAwareTrie; import org.apache.cassandra.db.tries.Direction; -import org.apache.cassandra.db.tries.InMemoryTrie; -import org.apache.cassandra.db.tries.Trie; +import org.apache.cassandra.db.tries.InMemoryBaseTrie; +import org.apache.cassandra.db.tries.InMemoryDeletionAwareTrie; +import org.apache.cassandra.db.tries.TrieDumperWithPath; import org.apache.cassandra.db.tries.TrieEntriesWalker; import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; import org.apache.cassandra.db.tries.TrieTailsIterator; @@ -68,16 +83,19 @@ import org.apache.cassandra.metrics.TrieMemtableMetricsView; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.schema.TableMetadataRef; +import org.apache.cassandra.utils.ByteBufferUtil; 
import org.apache.cassandra.utils.Clock; -import org.apache.cassandra.utils.ObjectSizes; -import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.FastByteOperations; import org.apache.cassandra.utils.MBeanWrapper; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.bytecomparable.ByteSource; import org.apache.cassandra.utils.concurrent.OpOrder; import org.apache.cassandra.utils.memory.EnsureOnHeap; -import org.apache.cassandra.utils.memory.HeapCloner; +import org.apache.cassandra.utils.memory.MemoryUtil; import org.apache.cassandra.utils.memory.MemtableAllocator; +import org.apache.cassandra.utils.memory.MemtableBufferAllocator; +import org.apache.cassandra.utils.memory.NativeAllocator; import org.github.jamm.Unmetered; /** @@ -94,22 +112,19 @@ public class TrieMemtable extends AbstractShardedMemtable { private static final Logger logger = LoggerFactory.getLogger(TrieMemtable.class); - /** Buffer type to use for memtable tries (on- vs off-heap) */ + /// Buffer type to use for memtable tries (on- vs off-heap) public static final BufferType BUFFER_TYPE = DatabaseDescriptor.getMemtableAllocationType().toBufferType(); - /** - * Force copy checker (see InMemoryTrie.ApplyState) ensuring all modifications apply atomically and consistently to - * the whole partition. - */ - public static final Predicate> FORCE_COPY_PARTITION_BOUNDARY = features -> isPartitionBoundary(features.content()); - - public static final Predicate IS_PARTITION_BOUNDARY = TrieMemtable::isPartitionBoundary; + /// Force copy checker (see [InMemoryTrie#apply]) ensuring all modifications apply atomically and consistently to + /// the whole partition. 
+ public static final Predicate> FORCE_COPY_PARTITION_BOUNDARY = + features -> TrieBackedPartition.isPartitionBoundary(features.content()); public static volatile int SHARD_COUNT = CassandraRelevantProperties.TRIE_MEMTABLE_SHARD_COUNT.getInt(autoShardCount()); public static volatile boolean SHARD_LOCK_FAIRNESS = CassandraRelevantProperties.TRIE_MEMTABLE_SHARD_LOCK_FAIRNESS.getBoolean(); public static final String TRIE_MEMTABLE_CONFIG_OBJECT_NAME = "org.apache.cassandra.db:type=TrieMemtableConfig"; - + static { MBeanWrapper.instance.registerMBean(new TrieMemtableConfig(), TRIE_MEMTABLE_CONFIG_OBJECT_NAME, MBeanWrapper.OnException.LOG); @@ -119,35 +134,28 @@ public class TrieMemtable extends AbstractShardedMemtable // thread calls cfs.switchMemtableIfCurrent. private final AtomicBoolean switchRequested = new AtomicBoolean(false); - /** - * Sharded memtable sections. Each is responsible for a contiguous range of the token space (between boundaries[i] - * and boundaries[i+1]) and is written to by one thread at a time, while reads are carried out concurrently - * (including with any write). - */ + /// Sharded memtable sections. Each is responsible for a contiguous range of the token space (between `boundaries[i]` + /// and `boundaries[i+1]`) and is written to by one thread at a time, while reads are carried out concurrently + /// (including with any write). private final MemtableShard[] shards; - /** - * A merged view of the memtable map. Used for partition range queries and flush. - * For efficiency we serve single partition requests off the shard which offers more direct InMemoryTrie methods. - */ - private final Trie mergedTrie; + /// A merged view of the memtable map. Used for partition range queries and flush. + /// For efficiency we serve single partition requests off the shard which offers more direct [InMemoryTrie] methods. 
+ @VisibleForTesting + final DeletionAwareTrie mergedTrie; @Unmetered private final TrieMemtableMetricsView metrics; - /** - * Keeps an estimate of the average row size in this memtable, computed from a small sample of rows. - * Because computing this estimate is potentially costly, as it requires iterating the rows, - * the estimate is updated only whenever the number of operations on the memtable increases significantly from the - * last update. This estimate is not very accurate but should be ok for planning or diagnostic purposes. - */ - private volatile MemtableAverageRowSize estimatedAverageRowSize; + @Unmetered + private final TableMetadata actualMetadata; TrieMemtable(AtomicReference commitLogLowerBound, TableMetadataRef metadataRef, Owner owner, Integer shardCountOption) { super(commitLogLowerBound, metadataRef, owner, shardCountOption); this.metrics = TrieMemtableMetricsView.getOrCreate(metadataRef.keyspace, metadataRef.name); - this.shards = generatePartitionShards(boundaries.shardCount(), metadataRef, metrics, owner.readOrdering()); + this.actualMetadata = metadataRef.get(); + this.shards = generatePartitionShards(boundaries.shardCount(), actualMetadata, metrics, owner.readOrdering()); this.mergedTrie = makeMergedTrie(shards); logger.trace("Created memtable with {} shards", this.shards.length); } @@ -158,7 +166,7 @@ private static int autoShardCount() } private static MemtableShard[] generatePartitionShards(int splits, - TableMetadataRef metadata, + TableMetadata metadata, TrieMemtableMetricsView metrics, OpOrder opOrder) { @@ -172,12 +180,29 @@ private static MemtableShard[] generatePartitionShards(int splits, return partitionMapContainer; } - private static Trie makeMergedTrie(MemtableShard[] shards) + private static DeletionAwareTrie makeMergedTrie(MemtableShard[] shards) { - List> tries = new ArrayList<>(shards.length); + List> tries = new ArrayList<>(shards.length); for (MemtableShard shard : shards) tries.add(shard.data); - return 
Trie.mergeDistinct(tries); + return DeletionAwareTrie.mergeDistinct(tries); + } + + @Override + public TableMetadata metadata() + { + return actualMetadata; + } + + @Override + public boolean shouldSwitch(ColumnFamilyStore.FlushReason reason) + { + if (super.shouldSwitch(reason)) + return true; + if (reason != ColumnFamilyStore.FlushReason.SCHEMA_CHANGE) + return false; + // If the columns definition changes, we need to flush as we would need to remap column indexes. + return !actualMetadata.regularAndStaticColumns().equals(metadata.get().regularAndStaticColumns()); } @Override @@ -217,12 +242,11 @@ public void discard() } } - /** - * Should only be called by ColumnFamilyStore.apply via Keyspace.apply, which supplies the appropriate - * OpOrdering. - * - * commitLogSegmentPosition should only be null if this is a secondary index, in which case it is *expected* to be null - */ + /// Should only be called by [ColumnFamilyStore#apply] via `Keyspace#apply`, which supplies the appropriate + /// [OpOrder.Group]. + /// + /// `commitLogSegmentPosition` should only be null if this is a secondary index, in which case it is *expected* to + /// be null. @Override public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup) { @@ -291,13 +315,19 @@ public int getShardCount() return shards.length; } - /** - * Returns the minTS if one available, otherwise NO_MIN_TIMESTAMP. - * - * EncodingStats uses a synthetic epoch TS at 2015. We don't want to leak that (CASSANDRA-18118) so we return NO_MIN_TIMESTAMP instead. - * - * @return The minTS or NO_MIN_TIMESTAMP if none available - */ + @Override + public long getEstimatedAverageRowSize() + { + if (estimatedAverageRowSize == null || currentOperations.get() > estimatedAverageRowSize.operations * 1.5) + estimatedAverageRowSize = new MemtableAverageRowSize(this, mergedTrie); + return estimatedAverageRowSize.rowSize; + } + + /// Returns the minimum timestamp if one available, otherwise `NO_MIN_TIMESTAMP`. 
+ /// [EncodingStats] uses a synthetic epoch TS at 2015. We don't want to leak that (CASSANDRA-18118) so we return + /// `NO_MIN_TIMESTAMP` instead. + /// + /// @return The minTS or `NO_MIN_TIMESTAMP` if none available @Override public long getMinTimestamp() { @@ -356,13 +386,6 @@ EncodingStats encodingStats() return statsCollector.get(); } - static boolean isPartitionBoundary(Object content) - { - // In the trie we use PartitionData for the root of a partition, but PartitionUpdates come with DeletionInfo. - // Both are descendants of DeletionInfo. - return content instanceof DeletionInfo; - } - @Override public MemtableUnfilteredPartitionIterator partitionIterator(final ColumnFilter columnFilter, final DataRange dataRange, @@ -374,10 +397,12 @@ public MemtableUnfilteredPartitionIterator partitionIterator(final ColumnFilter boolean includeStart = isBound || keyRange instanceof IncludingExcludingBounds; boolean includeStop = isBound || keyRange instanceof Range; - Trie subMap = mergedTrie.subtrie(toComparableBound(keyRange.left, includeStart), - toComparableBound(keyRange.right, !includeStop)); + DeletionAwareTrie subMap = + mergedTrie.subtrie(toComparableBound(keyRange.left, includeStart), + toComparableBound(keyRange.right, !includeStop)); - return new MemtableUnfilteredPartitionIterator(metadata(), + return new MemtableUnfilteredPartitionIterator(actualMetadata, + metadata.get(), allocator.ensureOnHeap(), subMap, columnFilter, @@ -390,17 +415,17 @@ public MemtableUnfilteredPartitionIterator partitionIterator(final ColumnFilter private static ByteComparable toComparableBound(PartitionPosition position, boolean before) { - return position.isMinimum() ? null : position.asComparableBound(before); + return position == null || position.isMinimum() ? 
null : position.asComparableBound(before); } public Partition getPartition(DecoratedKey key) { int shardIndex = boundaries.getShardForKey(key); - Trie trie = shards[shardIndex].data.tailTrie(key); - return createPartition(metadata(), allocator.ensureOnHeap(), key, trie); + DeletionAwareTrie trie = shards[shardIndex].data.tailTrie(key); + return createPartition(metadata(), metadata.get(), allocator.ensureOnHeap(), key, trie); } - private static TrieBackedPartition createPartition(TableMetadata metadata, EnsureOnHeap ensureOnHeap, DecoratedKey key, Trie trie) + private static TrieBackedPartition createPartition(TableMetadata metadata, TableMetadata droppedColumnsSource, EnsureOnHeap ensureOnHeap, DecoratedKey key, DeletionAwareTrie trie) { if (trie == null) return null; @@ -414,8 +439,10 @@ private static TrieBackedPartition createPartition(TableMetadata metadata, Ensur holder.columns(), holder.stats(), holder.rowCountIncludingStatic(), + holder.tombstoneCount(), trie, metadata, + droppedColumnsSource, ensureOnHeap); } @@ -443,36 +470,38 @@ private static DecoratedKey getPartitionKeyFromPath(TableMetadata metadata, Byte metadata.partitioner); } - /** - * Metadata object signifying the root node of a partition. Holds the deletion information as well as a link - * to the owning subrange, which is used for compiling statistics and column sets. - * - * Descends from MutableDeletionInfo to permit tail tries to be passed directly to TrieBackedPartition. - */ - public static class PartitionData extends MutableDeletionInfo + /// Make a textual representation of the trie, linking content with its type and key. See + /// [TrieMemtable.md](TrieMemtable.md) for an example of the output. 
+ public String dump() { - @Unmetered - public final MemtableShard owner; + return mergedTrie.process(Direction.FORWARD, new Dumper(metadata())); + } - private int rowCountIncludingStatic; + static final int PARTITIONDATA_OFFSET_ROW_COUNT = 0; + static final int PARTITIONDATA_OFFSET_TOMBSTONE_COUNT = 4; + + /// Metadata object signifying the root node of a partition. Stores row and tombstone counts in a trie cell, + /// as well as a link to the owning subrange, which is used for compiling encoding statistics and column sets. + /// + /// Descends from [TrieBackedPartition.PartitionMarker] to permit tail tries to be passed directly to + /// [TrieBackedPartition]. + public static class PartitionData implements TrieBackedPartition.PartitionMarker + { + public final MemtableShard owner; - public static final long HEAP_SIZE = ObjectSizes.measure(new PartitionData(DeletionInfo.LIVE, null)); + UnsafeBuffer buffer; + int inBufferPos; - public PartitionData(DeletionInfo deletion, - MemtableShard owner) + public PartitionData(MemtableShard owner) { - super(deletion.getPartitionDeletion(), deletion.copyRanges(HeapCloner.instance)); - this.owner = owner; - this.rowCountIncludingStatic = 0; + this(owner, null, 0); } - public PartitionData(PartitionData existing, - DeletionInfo update) + public PartitionData(MemtableShard owner, UnsafeBuffer buffer, int inBufferPos) { - // Start with the update content, to properly copy it - this(update, existing.owner); - rowCountIncludingStatic = existing.rowCountIncludingStatic; - add(existing); + this.owner = owner; + this.buffer = buffer; + this.inBufferPos = inBufferPos; } public RegularAndStaticColumns columns() @@ -487,27 +516,41 @@ public EncodingStats stats() public int rowCountIncludingStatic() { - return rowCountIncludingStatic; + return buffer.getInt(inBufferPos + PARTITIONDATA_OFFSET_ROW_COUNT); + } + + public int tombstoneCount() + { + return buffer.getInt(inBufferPos + PARTITIONDATA_OFFSET_TOMBSTONE_COUNT); } public void 
markInsertedRows(int howMany) { - rowCountIncludingStatic += howMany; + buffer.addIntOrdered(inBufferPos + PARTITIONDATA_OFFSET_ROW_COUNT, howMany); + } + + public void markAddedTombstones(int howMany) + { + buffer.addIntOrdered(inBufferPos + PARTITIONDATA_OFFSET_TOMBSTONE_COUNT, howMany); } @Override public String toString() { - return "partition " + super.toString(); + return String.format("partition with %d rows and %d tombstones", rowCountIncludingStatic(), tombstoneCount()); } - @Override public long unsharedHeapSize() { - return super.unsharedHeapSize() + HEAP_SIZE - MutableDeletionInfo.EMPTY_SIZE; + return 0; } - } + public void clearStats() + { + buffer.putIntOrdered(inBufferPos + PARTITIONDATA_OFFSET_ROW_COUNT, 0); + buffer.putIntOrdered(inBufferPos + PARTITIONDATA_OFFSET_TOMBSTONE_COUNT, 0); + } + } class KeySizeAndCountCollector extends TrieEntriesWalker { @@ -533,16 +576,17 @@ protected void content(Object content, byte[] bytes, int byteLength) } } + @Override public FlushablePartitionSet getFlushSet(PartitionPosition from, PartitionPosition to) { - Trie toFlush = mergedTrie.subtrie(from, true, to, false); + DeletionAwareTrie toFlush = mergedTrie.subtrie(toComparableBound(from, true), toComparableBound(to, true)); var counter = new KeySizeAndCountCollector(); // need to jump over tails keys - toFlush.processSkippingBranches(counter, Direction.FORWARD); + toFlush.processSkippingBranches(Direction.FORWARD, counter); int partitionCount = counter.keyCount; long partitionKeySize = counter.keySize; - return new AbstractFlushablePartitionSet() + return new AbstractFlushablePartitionSet<>() { public Memtable memtable() { @@ -566,7 +610,7 @@ public long partitionCount() public Iterator iterator() { - return new PartitionIterator(toFlush, metadata(), EnsureOnHeap.NOOP); + return new PartitionIterator(toFlush, actualMetadata, metadata.get(), EnsureOnHeap.NOOP); } public long partitionKeysSize() @@ -585,8 +629,6 @@ public static class MemtableShard // The smallest 
timestamp for all partitions stored in this shard private volatile long minTimestamp = Long.MAX_VALUE; - private volatile long minLocalDeletionTime = Long.MAX_VALUE; - private volatile long liveDataSize = 0; private volatile long currentOperations = 0; @@ -596,20 +638,21 @@ public static class MemtableShard @Unmetered private final ReentrantLock writeLock = new ReentrantLock(SHARD_LOCK_FAIRNESS); - // Content map for the given shard. This is implemented as a memtable trie which uses the prefix-free - // byte-comparable ByteSource representations of the keys to address the partitions. - // - // This map is used in a single-producer, multi-consumer fashion: only one thread will insert items but - // several threads may read from it and iterate over it. Iterators (especially partition range iterators) - // may operate for a long period of time and thus iterators should not throw ConcurrentModificationExceptions - // if the underlying map is modified during iteration, they should provide a weakly consistent view of the map - // instead. - // - // Also, this data is backed by memtable memory, when accessing it callers must specify if it can be accessed - // unsafely, meaning that the memtable will not be discarded as long as the data is used, or whether the data - // should be copied on heap for off-heap allocators. + /// Content map for the given shard. This is implemented as an in-memory trie which uses the prefix-free + /// byte-comparable [ByteSource] representations of keys to address partitions and individual rows within + /// partitions. + /// + /// This map is used in a single-producer, multi-consumer fashion: only one thread will insert items but + /// several threads may read from it and iterate over it. 
Iterators (especially partition range iterators) + /// may operate for a long period of time and thus iterators should not throw `ConcurrentModificationException`s + /// if the underlying map is modified during iteration, they should provide a weakly consistent view of the map + /// instead. + /// + /// Also, this data is backed by memtable memory, when accessing it callers must specify if it can be accessed + /// unsafely, meaning that the memtable will not be discarded as long as the data is used, or whether the data + /// should be copied on heap for off-heap allocators. @VisibleForTesting - final InMemoryTrie data; + final InMemoryDeletionAwareTrie data; RegularAndStaticColumns columns; @@ -618,30 +661,59 @@ public static class MemtableShard @Unmetered // total pool size should not be included in memtable's deep size private final MemtableAllocator allocator; + private final CellDataBufferManager cellDataBufferManager; + @Unmetered private final TrieMemtableMetricsView metrics; - private final TableMetadataRef metadata; + private final TableMetadata metadata; + + private TriePartitionUpdater noIndexUpdater; + private TriePartitionUpdaterLegacyIndex legacyIndexUpdater; - MemtableShard(TableMetadataRef metadata, TrieMemtableMetricsView metrics, OpOrder opOrder) + MemtableShard(TableMetadata metadata, TrieMemtableMetricsView metrics, OpOrder opOrder) { this(metadata, AbstractAllocatorMemtable.MEMORY_POOL.newAllocator(metadata.toString()), metrics, opOrder); } @VisibleForTesting - MemtableShard(TableMetadataRef metadata, MemtableAllocator allocator, TrieMemtableMetricsView metrics, OpOrder opOrder) + MemtableShard(TableMetadata metadata, MemtableAllocator allocator, TrieMemtableMetricsView metrics, OpOrder opOrder) { this.metadata = metadata; - this.data = InMemoryTrie.longLived(TrieBackedPartition.BYTE_COMPARABLE_VERSION, BUFFER_TYPE, opOrder); + this.allocator = allocator; + if (this.allocator instanceof NativeAllocator) + this.cellDataBufferManager = new 
NativeBufferManager((NativeAllocator) allocator); + else + this.cellDataBufferManager = new SlabBufferManager((MemtableBufferAllocator) allocator, + opOrder, + BUFFER_TYPE.onHeapSizeWithoutData()); + + this.data = InMemoryDeletionAwareTrie.longLived(TrieBackedPartition.BYTE_COMPARABLE_VERSION, BUFFER_TYPE, opOrder, + new TrieSerializer(cellDataBufferManager, this)); this.columns = RegularAndStaticColumns.NONE; this.stats = EncodingStats.NO_STATS; - this.allocator = allocator; this.metrics = metrics; } + private TriePartitionUpdater getAndConfigureUpdater(PartitionUpdate update, UpdateTransaction indexer) + { + if (indexer == UpdateTransaction.NO_OP) + { + if (noIndexUpdater == null) + noIndexUpdater = new TriePartitionUpdater(this, data); + return noIndexUpdater; + } + else + { + if (legacyIndexUpdater == null) + legacyIndexUpdater = new TriePartitionUpdaterLegacyIndex(this, data, metadata); + legacyIndexUpdater.setIndexContext(indexer); + return legacyIndexUpdater; + } + } + public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup) { - TriePartitionUpdater updater = new TriePartitionUpdater(allocator.cloner(opGroup), indexer, metadata.get(), this); boolean locked = writeLock.tryLock(); if (locked) { @@ -654,31 +726,20 @@ public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group writeLock.lock(); metrics.contentionTime.addNano(Clock.Global.nanoTime() - lockStartTime); } + + TriePartitionUpdater updater = getAndConfigureUpdater(update, indexer); try { try { - indexer.start(); - // Add the initial trie size on the first operation. This technically isn't correct (other shards - // do take their memory share even if they are empty) but doing it during construction may cause - // the allocator to block while we are trying to flush a memtable and become a deadlock. - long onHeap = data.isEmpty() ? 0 : data.usedSizeOnHeap(); - long offHeap = data.isEmpty() ? 
0 : data.usedSizeOffHeap(); - // Use the fast recursive put if we know the key is small enough to not cause a stack overflow. - try - { - data.apply(TriePartitionUpdate.asMergableTrie(update), - updater, - FORCE_COPY_PARTITION_BOUNDARY); - } - catch (TrieSpaceExhaustedException e) - { - // This should never really happen as a flush would be triggered long before this limit is reached. - throw new AssertionError(e); - } - allocator.offHeap().adjust(data.usedSizeOffHeap() - offHeap, opGroup); - allocator.onHeap().adjust((data.usedSizeOnHeap() - onHeap) + updater.heapSize, opGroup); - partitionCount += updater.partitionsAdded; + this.cellDataBufferManager.opOrderGroup = opGroup; + int partitionsAdded = mergeUpdate(data, + allocator, + TriePartitionUpdate.asMergableTrie(update), + indexer, + opGroup, + updater); + partitionCount += partitionsAdded; } finally { @@ -711,12 +772,12 @@ private void updateMinTimestamp(long timestamp) void updateLiveDataSize(long size) { - liveDataSize = liveDataSize + size; + liveDataSize += size; } private void updateCurrentOperations(long op) { - currentOperations = currentOperations + op; + currentOperations += op; } public int partitionCount() @@ -736,12 +797,13 @@ long currentOperations() private DecoratedKey firstPartitionKey(Direction direction) { + // Note: there is no need to skip tails here as this will only be run until we find the first partition. Iterator> iter = data.filteredEntryIterator(direction, PartitionData.class); if (!iter.hasNext()) return null; Map.Entry entry = iter.next(); - return getPartitionKeyFromPath(metadata.get(), entry.getKey()); + return getPartitionKeyFromPath(metadata, entry.getKey()); } public DecoratedKey minPartitionKey() @@ -755,19 +817,52 @@ public DecoratedKey maxPartitionKey() } } - static class PartitionIterator extends TrieTailsIterator + /// Merge an update into the given data trie using the given helpers. Extracted to separate method for testing. 
+ @VisibleForTesting + public static int mergeUpdate(InMemoryDeletionAwareTrie dataTrie, + MemtableAllocator allocator, + DeletionAwareTrie updateTrie, + UpdateTransaction indexer, + OpOrder.Group opGroup, + TriePartitionUpdater updater) + { + indexer.start(); + // Add the initial trie size on the first operation. This technically isn't correct (other shards + // do take their memory share even if they are empty) but doing it during construction may cause + // the allocator to block while we are trying to flush a memtable and become a deadlock. + long onHeap = dataTrie.isEmpty() ? 0 : dataTrie.usedSizeOnHeap(); + long offHeap = dataTrie.isEmpty() ? 0 : dataTrie.usedSizeOffHeap(); + try + { + updater.mergeUpdate(updateTrie); + } + catch (TrieSpaceExhaustedException e) + { + // This should never really happen as a flush would be triggered long before this limit is reached. + throw new AssertionError(e); + } + allocator.offHeap().adjust(dataTrie.usedSizeOffHeap() - offHeap, opGroup); + allocator.onHeap().adjust((dataTrie.usedSizeOnHeap() - onHeap), opGroup); + return updater.partitionsAdded; + } + + /// Iterator over partitions of the given trie. Looks for partition markers and presents the branch of each + /// partition marker as a [TrieBackedPartition]. 
+ static class PartitionIterator extends TrieTailsIterator.DeletionAwareWithoutCoveringDeletions { final TableMetadata metadata; + final TableMetadata droppedColumnsSource; final EnsureOnHeap ensureOnHeap; - PartitionIterator(Trie source, TableMetadata metadata, EnsureOnHeap ensureOnHeap) + PartitionIterator(DeletionAwareTrie source, TableMetadata metadata, TableMetadata droppedColumnsSource, EnsureOnHeap ensureOnHeap) { - super(source, Direction.FORWARD, PartitionData.class::isInstance); + super(source, Direction.FORWARD, TrieBackedPartition.IS_PARTITION_BOUNDARY); this.metadata = metadata; + this.droppedColumnsSource = droppedColumnsSource; this.ensureOnHeap = ensureOnHeap; } @Override - protected TrieBackedPartition mapContent(Object content, Trie tailTrie, byte[] bytes, int byteLength) + protected TrieBackedPartition mapContent(Object content, DeletionAwareTrie tailTrie, byte[] bytes, int byteLength) { PartitionData pd = (PartitionData) content; DecoratedKey key = getPartitionKeyFromPath(metadata, @@ -777,13 +872,18 @@ protected TrieBackedPartition mapContent(Object content, Trie tailTrie, pd.columns(), pd.stats(), pd.rowCountIncludingStatic(), + pd.tombstoneCount(), tailTrie, metadata, + droppedColumnsSource, ensureOnHeap); } } - static class MemtableUnfilteredPartitionIterator extends AbstractUnfilteredPartitionIterator implements Memtable.MemtableUnfilteredPartitionIterator + /// The implementation of [UnfilteredPartitionIterator] used to walk partition ranges. 
+ static class MemtableUnfilteredPartitionIterator + extends AbstractUnfilteredPartitionIterator + implements Memtable.MemtableUnfilteredPartitionIterator { private final TableMetadata metadata; private final Iterator iter; @@ -792,13 +892,14 @@ static class MemtableUnfilteredPartitionIterator extends AbstractUnfilteredParti private final long minLocalDeletionTime; public MemtableUnfilteredPartitionIterator(TableMetadata metadata, + TableMetadata droppedColumnsSource, EnsureOnHeap ensureOnHeap, - Trie source, + DeletionAwareTrie source, ColumnFilter columnFilter, DataRange dataRange, long minLocalDeletionTime) { - this.iter = new PartitionIterator(source, metadata, ensureOnHeap); + this.iter = new PartitionIterator(source, metadata, droppedColumnsSource, ensureOnHeap); this.metadata = metadata; this.columnFilter = columnFilter; this.dataRange = dataRange; @@ -850,10 +951,8 @@ public long unusedReservedOnHeapMemory() return size; } - /** - * Release all recycled content references, including the ones waiting in still incomplete recycling lists. - * This is a test method and can cause null pointer exceptions if used on a live trie. - */ + /// Release all recycled content references, including the ones waiting in still incomplete recycling lists. + /// This is a test method and can cause null pointer exceptions if used on a live trie. @VisibleForTesting void releaseReferencesUnsafe() { @@ -875,7 +974,7 @@ public void setShardCount(String shardCount) { try { - SHARD_COUNT = Integer.valueOf(shardCount); + SHARD_COUNT = Integer.parseInt(shardCount); CassandraRelevantProperties.TRIE_MEMTABLE_SHARD_COUNT.setInt(SHARD_COUNT); } catch (NumberFormatException ex) @@ -908,4 +1007,579 @@ public String getLockFairness() return "" + SHARD_LOCK_FAIRNESS; } } + + /// Trie serializer, used for mapping trie data cells to and from the various database objects. 
+ @VisibleForTesting + public static class TrieSerializer implements ContentSerializer + { + final CellDataBufferManager manager; + final MemtableShard owner; + + // Singletons mapped to special trie values (i.e. negative trie pointers) + + /// [TrieBackedRow#COMPLEX_COLUMN_MARKER] + static final int COMPLEX_COLUMN_ID = 0; + /// [LivenessInfo#EMPTY] + static final int EMPTY_LIVENESS_ID = 1; + /// [TrieTombstoneMarker.LevelMarker#ROW] on the left. + static final int TOMBSTONE_ROW_MARKER_BEFORE_BRANCH = 2; + /// [TrieTombstoneMarker.LevelMarker#ROW] on the right. + static final int TOMBSTONE_ROW_MARKER_AFTER_BRANCH = 3; + + // Offset values + + // Offsets 0 to 24 are cells where the length is given in the offset. Lengths above 16 mean cell has no TTL. + + /// Counter cell whose length is stored in byte 15. + static final byte TYPE_CELL_COUNTER = 0x19; + + /// Cell whose value does not fit and is stored externally. + static final byte TYPE_CELL_EXTERNAL_VALUE = 0x1A; + + /// Content is [LivenessInfo] + static final byte TYPE_LIVENESS_INFO = 0x1B; + /// Content is [TrieTombstoneMarker], to be presented _before_ branch + static final byte TYPE_TOMBSTONE_MARKER_BEFORE_BRANCH = 0x1C; + /// Content is [TrieTombstoneMarker], to be presented _after_ branch + static final byte TYPE_TOMBSTONE_MARKER_AFTER_BRANCH = 0x1D; + /// Content is [PartitionData] + static final byte TYPE_PARTITION_DATA = 0x1E; + // 0x1F for OFFSET_SPECIAL + + // Tombstone flags + + // The three values below match the ones in [TrieCellData], but they don't necessarily have to + // (the equivalence is only used in [#dumpContent]). 
+ static final int OFFSET_TIMESTAMP = 0x18; + static final int OFFSET_LOCAL_DELETION_TIME = 0x14; + static final int OFFSET_TTL = 0x10; + static final int OFFSET_TOMBSTONE_KIND = 0x13; + + /// Offset to add to the above for the left side of a tombstone marker + static final int OFFSET_TOMBSTONE_LEFT = -0x00; + /// Offset to add to the above for the right side of a tombstone marker + static final int OFFSET_TOMBSTONE_RIGHT = -0x10; + + /// Byte that stores whether the tombstone marker is also a row marker + static final int OFFSET_TOMBSTONE_IS_ROW_MARKER = 0x00; + + @VisibleForTesting + public TrieSerializer(CellDataBufferManager manager, MemtableShard owner) + { + this.manager = manager; + this.owner = owner; + } + + @Override + public int idIfSpecial(Object content, boolean shouldPresentAfterBranch) + { + if (content == TrieBackedRow.COMPLEX_COLUMN_MARKER) + { + assert !shouldPresentAfterBranch; + return COMPLEX_COLUMN_ID; + } + if (content == LivenessInfo.EMPTY || content instanceof LivenessInfo && LivenessInfo.EMPTY.equals(content)) + { + assert !shouldPresentAfterBranch; + return EMPTY_LIVENESS_ID; + } + if (content == TrieTombstoneMarker.LevelMarker.ROW) + return shouldPresentAfterBranch ? TOMBSTONE_ROW_MARKER_AFTER_BRANCH : TOMBSTONE_ROW_MARKER_BEFORE_BRANCH; + + // Everything else takes a trie cell. 
+ return -1; + } + + @Override + public Object special(int id) + { + switch (id) + { + case COMPLEX_COLUMN_ID: + return TrieBackedRow.COMPLEX_COLUMN_MARKER; + case EMPTY_LIVENESS_ID: + return LivenessInfo.EMPTY; + case TOMBSTONE_ROW_MARKER_BEFORE_BRANCH: + case TOMBSTONE_ROW_MARKER_AFTER_BRANCH: + return TrieTombstoneMarker.LevelMarker.ROW; + default: + throw new AssertionError("Unknown special ID " + id); + } + } + + @Override + public boolean shouldPresentSpecialAfterBranch(int id) + { + return id == TOMBSTONE_ROW_MARKER_AFTER_BRANCH; + } + + @Override + public boolean shouldPreserveSpecialWithoutChildren(int id) + { + // All our specials are level markers that should not survive if the branch becomes empty. + return false; + } + + @Override + public boolean shouldPreserveWithoutChildren(int offset) + { + // Row markers that fall under a deletion turn into x->x markers with level id. If there is no + // substructure (e.g. a complex column deletion), these should disappear. + return offset != TYPE_TOMBSTONE_MARKER_BEFORE_BRANCH && offset != TYPE_TOMBSTONE_MARKER_AFTER_BRANCH; + } + + @Override + public boolean shouldPreserveWithoutChildren(UnsafeBuffer buffer, int inBufferPos, int offsetBits) + { + if (buffer.getByte(inBufferPos + OFFSET_TOMBSTONE_IS_ROW_MARKER) == 0) + return true; + if (buffer.getLong(inBufferPos + OFFSET_TOMBSTONE_LEFT + OFFSET_TIMESTAMP) != + buffer.getLong(inBufferPos + OFFSET_TOMBSTONE_RIGHT + OFFSET_TIMESTAMP)) + return true; + if (buffer.getInt(inBufferPos + OFFSET_TOMBSTONE_LEFT + OFFSET_LOCAL_DELETION_TIME) != + buffer.getInt(inBufferPos + OFFSET_TOMBSTONE_RIGHT + OFFSET_LOCAL_DELETION_TIME)) + return true; + if (buffer.getByte(inBufferPos + OFFSET_TOMBSTONE_LEFT + OFFSET_TOMBSTONE_KIND) != + buffer.getByte(inBufferPos + OFFSET_TOMBSTONE_RIGHT + OFFSET_TOMBSTONE_KIND)) + return true; + return false; + } + + @Override + public int serialize(Object content, boolean shouldPresentAfterBranch, UnsafeBuffer buffer, int inBufferPos) + throws 
TrieSpaceExhaustedException + { + assert !shouldPresentAfterBranch || content instanceof TrieTombstoneMarker; + // most common first + if (content instanceof CellData) + return TrieCellData.serialize((CellData) content, buffer, inBufferPos, manager); + else if (content instanceof LivenessInfo) + return serializeLivenessInfo((LivenessInfo) content, buffer, inBufferPos); + else if (content instanceof TrieTombstoneMarker) + return serializeTombstoneMarker((TrieTombstoneMarker) content, shouldPresentAfterBranch, buffer, inBufferPos); + else if (content instanceof PartitionData) + return serializePartitionData((PartitionData) content, buffer, inBufferPos); + else + throw new AssertionError("Unknown trie content type: " + content); + } + + private int serializeLivenessInfo(LivenessInfo livenessInfo, UnsafeBuffer buffer, int inBufferPos) + { + buffer.putLongOrdered(inBufferPos + OFFSET_TIMESTAMP, livenessInfo.timestamp()); + buffer.putIntOrdered(inBufferPos + OFFSET_LOCAL_DELETION_TIME, CellData.deletionTimeLongToUnsignedInteger(livenessInfo.localExpirationTime())); + buffer.putIntOrdered(inBufferPos + OFFSET_TTL, livenessInfo.ttl()); + return TYPE_LIVENESS_INFO; + } + + private int serializeTombstoneMarker(TrieTombstoneMarker marker, boolean shouldPresentAfterBranch, UnsafeBuffer buffer, int inBufferPos) + { + assert marker.isBoundary(); + TrieTombstoneMarker.Covering left = marker.leftDeletion(); + TrieTombstoneMarker.Covering right = marker.rightDeletion(); + serializeTombstoneSide(buffer, inBufferPos + OFFSET_TOMBSTONE_LEFT, left); + serializeTombstoneSide(buffer, inBufferPos + OFFSET_TOMBSTONE_RIGHT, right); + buffer.putByte(inBufferPos + OFFSET_TOMBSTONE_IS_ROW_MARKER, (byte) (marker.hasLevelMarker(TrieTombstoneMarker.LevelMarker.ROW) ? 1 : 0)); + return shouldPresentAfterBranch ? 
TYPE_TOMBSTONE_MARKER_AFTER_BRANCH : TYPE_TOMBSTONE_MARKER_BEFORE_BRANCH; + } + + private void serializeTombstoneSide(UnsafeBuffer buffer, int inBufferPos, TrieTombstoneMarker.Covering markerSide) + { + if (markerSide != null) + { + buffer.putLongOrdered(inBufferPos + OFFSET_TIMESTAMP, markerSide.markedForDeleteAt()); + buffer.putIntOrdered(inBufferPos + OFFSET_LOCAL_DELETION_TIME, CellData.deletionTimeLongToUnsignedInteger(markerSide.localDeletionTime())); + buffer.putByte(inBufferPos + OFFSET_TOMBSTONE_KIND, (byte) markerSide.deletionKind().ordinal()); + } + else + { + buffer.putByte(inBufferPos + OFFSET_TOMBSTONE_KIND, (byte) -1); + } + } + + private int serializePartitionData(PartitionData partitionData, UnsafeBuffer buffer, int inBufferPos) + { + if (partitionData.buffer == null) + { + // We are creating a new partition. Link this buffer/inBufferPos with the argument, so that we can add + // statistics as we descend into the partition. + partitionData.buffer = buffer; + partitionData.inBufferPos = inBufferPos; + // we don't need to set anything else as the buffer is filled with 0s when allocated + } + else + { + // We are making a copy of another PartitionData object. + buffer.putLongOrdered(inBufferPos + PARTITIONDATA_OFFSET_ROW_COUNT, partitionData.rowCountIncludingStatic()); + buffer.putIntOrdered(inBufferPos + PARTITIONDATA_OFFSET_TOMBSTONE_COUNT, partitionData.tombstoneCount()); + } + return TYPE_PARTITION_DATA; + } + + @Override + public int updateInPlace(UnsafeBuffer buffer, int inBufferPos, int offsetBits, Object newContent) throws TrieSpaceExhaustedException + { + // We can always set in place, but we may need to release previously held buffer. 
+ if (releaseNeeded(offsetBits)) + release(buffer, inBufferPos, offsetBits); + + return serialize(newContent, shouldPresentAfterBranch(offsetBits), buffer, inBufferPos); + } + + @Override + public void releaseSpecial(int id) + { + // nothing to do, our specials are fixed + } + + @Override + public Object deserialize(UnsafeBuffer buffer, int inBufferPos, int offsetBits) + { + switch (offsetBits) + { + case TYPE_CELL_COUNTER: + return new TrieCellData.Counter(buffer, inBufferPos); + case TYPE_CELL_EXTERNAL_VALUE: + return new TrieCellData.External(buffer, inBufferPos, manager); + case TYPE_LIVENESS_INFO: + return deserializeLivenessInfo(buffer, inBufferPos); + case TYPE_TOMBSTONE_MARKER_BEFORE_BRANCH: + case TYPE_TOMBSTONE_MARKER_AFTER_BRANCH: + return deserializeTombstoneMarker(buffer, inBufferPos); + case TYPE_PARTITION_DATA: + return new PartitionData(owner, buffer, inBufferPos); + default: + return TrieCellData.embedded(buffer, inBufferPos, offsetBits); + } + } + + private LivenessInfo deserializeLivenessInfo(UnsafeBuffer buffer, int inBufferPos) + { + long timestamp = buffer.getLong(inBufferPos + OFFSET_TIMESTAMP); + long localExpirationTime = CellData.deletionTimeUnsignedIntegerToLong(buffer.getInt(inBufferPos + OFFSET_LOCAL_DELETION_TIME)); + int ttl = buffer.getInt(inBufferPos + OFFSET_TTL); + return LivenessInfo.withExpirationTime(timestamp, ttl, localExpirationTime); + } + + private TrieTombstoneMarker deserializeTombstoneMarker(UnsafeBuffer buffer, int inBufferPos) + { + TrieTombstoneMarker.Covering left = deserializeTombstoneSide(buffer, inBufferPos + OFFSET_TOMBSTONE_LEFT); + TrieTombstoneMarker.Covering right = deserializeTombstoneSide (buffer, inBufferPos + OFFSET_TOMBSTONE_RIGHT); + TrieTombstoneMarker.LevelMarker levelMarker = buffer.getByte(inBufferPos + OFFSET_TOMBSTONE_IS_ROW_MARKER) != 0 + ? 
TrieTombstoneMarker.LevelMarker.ROW + : null; + return TrieTombstoneMarker.make(left, right, levelMarker); + } + + private TrieTombstoneMarker.Covering deserializeTombstoneSide(UnsafeBuffer buffer, int inBufferPos) + { + byte kind = buffer.getByte(inBufferPos + OFFSET_TOMBSTONE_KIND); + if (kind < 0) + return null; + return TrieTombstoneMarker.covering(buffer.getLong(inBufferPos + OFFSET_TIMESTAMP), + CellData.deletionTimeUnsignedIntegerToLong(buffer.getInt(inBufferPos + OFFSET_LOCAL_DELETION_TIME)), + TrieTombstoneMarker.Kind.values()[kind]); + } + + @Override + public boolean shouldPresentAfterBranch(int offsetBits) + { + // only markers can be after branch + return offsetBits == TYPE_TOMBSTONE_MARKER_AFTER_BRANCH; + } + + @Override + public boolean releaseNeeded(int offsetBits) + { + return manager.releaseNeeded() && offsetBits == TYPE_CELL_EXTERNAL_VALUE; + } + + @Override + public void release(UnsafeBuffer buffer, int inBufferPos, int offsetBits) + { + TrieCellData.External.release(buffer, inBufferPos, manager); + } + + @Override + public void completeMutation() + { + manager.completeMutation(); + } + + @Override + public void abortMutation() + { + manager.abortMutation(); + } + + @Override + public long usedSizeOnHeap() + { + return manager.onHeapSize(); + } + + @Override + public long usedSizeOffHeap() + { + // managed separately in allocator + return 0; + } + + @Override + public long unusedReservedOnHeapMemory() + { + return manager.unusedReservedOnHeapMemory(); + } + + @Override + public void releaseReferencesUnsafe() + { + manager.releaseReferencesUnsafe(); + } + + @Override + public String dumpSpecial(int id) + { + return "Payload: " + special(id).toString(); + } + + @Override + public String dumpContent(UnsafeBuffer buffer, int inBufferPos, int offsetBits) + { + return String.format("Payload: length/type %02x data %s ttl %08x ldt %08x timestamp %016x", + offsetBits, + ByteBufferUtil.bytesToHex(buffer.byteBuffer() + .duplicate() + .position(inBufferPos 
+ 0) + .limit(inBufferPos + 16)), + buffer.getInt(inBufferPos + OFFSET_TTL), + buffer.getInt(inBufferPos + OFFSET_LOCAL_DELETION_TIME), + buffer.getLong(inBufferPos + OFFSET_TIMESTAMP) + ); + } + } + + /// Buffer manager for cell data, used to store data that does not fit the 15 bytes for value in the trie block. + @VisibleForTesting + public static abstract class CellDataBufferManager implements TrieCellData.ExternalBufferHandler + { + OpOrder.Group opOrderGroup; + + /// On-heap size of any additional structures used to store the references to data + abstract long onHeapSize(); + + /// If true, the release method will be called when a value is no longer in use + abstract boolean releaseNeeded(); + + /// See [MemoryManager#completeMutation] + abstract void completeMutation(); + /// See [MemoryManager#abortMutation] + abstract void abortMutation(); + + /// See [MemoryManager#unusedReservedOnHeapMemory] + abstract long unusedReservedOnHeapMemory(); + + /// See [ContentManager#releaseReferencesUnsafe] + abstract void releaseReferencesUnsafe(); + } + + /// Buffer manager for cell data, used to store data that does not fit the 15 bytes for value in the trie block. + /// + /// This option stores data in ByteBuffers allocated by the given [MemtableBufferAllocator] and keeps a list of the + /// ByteBuffers it returned in a long-lived [ContentManagerPojo]. + /// It has on-heap presence that is proportional to the number of large data values. 
+ @VisibleForTesting + public static class SlabBufferManager extends CellDataBufferManager + { + final MemtableBufferAllocator allocator; + final long bufferSizeOnHeap; + final ContentManagerPojo buffers; + + @VisibleForTesting + public SlabBufferManager(MemtableBufferAllocator allocator, OpOrder opOrder, long bufferSizeOnHeap) + { + this.allocator = allocator; + this.bufferSizeOnHeap = bufferSizeOnHeap; + this.buffers = new ContentManagerPojo<>(Predicates.alwaysTrue(), InMemoryBaseTrie.ExpectedLifetime.LONG, + opOrder); + } + + @Override + public long store(ByteBuffer buffer, int length) throws TrieSpaceExhaustedException + { + ByteBuffer cloned = allocator.allocate(length, opOrderGroup); + FastByteOperations.copy(buffer, buffer.position(), cloned, cloned.position(), length); + return buffers.addContent(cloned, false); + } + + @Override + public ByteBuffer load(long handle, int length) + { + return buffers.getContent((int) handle); + } + + @Override + long onHeapSize() + { + return buffers.usedSizeOnHeap() + buffers.valuesCount() * bufferSizeOnHeap; + } + + @Override + public boolean releaseNeeded() + { + return true; + } + + @Override + public void release(long handle, int length) + { + buffers.releaseContent((int) handle); + } + + @Override + public void completeMutation() + { + buffers.completeMutation(); + } + + @Override + public void abortMutation() + { + buffers.abortMutation(); + } + + @Override + long unusedReservedOnHeapMemory() + { + return buffers.unusedReservedOnHeapMemory(); + } + + @Override + void releaseReferencesUnsafe() + { + buffers.releaseReferencesUnsafe(); + } + } + + /// Buffer manager for cell data, used to store data that does not fit the 15 bytes for value in the trie block. + /// + /// This option stores data in native memory using [NativeAllocator] and returns the memory address as handle. + /// This storage method has no on-heap presence and is as efficient as it gets. Used when the memtable allocation + /// type is `offheap_objects`. 
+ @VisibleForTesting + public static class NativeBufferManager extends CellDataBufferManager + { + final NativeAllocator allocator; + + @VisibleForTesting + public NativeBufferManager(NativeAllocator allocator) + { + this.allocator = allocator; + } + + @Override + public long store(ByteBuffer buffer, int length) + { + long address = allocator.allocate(length, opOrderGroup); + MemoryUtil.setBytes(address, buffer); + return address; + } + + @Override + public ByteBuffer load(long address, int length) + { + return MemoryUtil.getByteBuffer(address, length); + } + + @Override + long onHeapSize() + { + return 0; + } + + @Override + public boolean releaseNeeded() + { + return false; + } + + @Override + public void release(long handle, int length) + { + // Nothing to do as we can't release data in the allocator. Trie will remove its cells as needed. + } + + @Override + public void completeMutation() + { + // Nothing needed as we can't recycle allocator memory + } + + @Override + public void abortMutation() + { + // Nothing needed as we can't recycle allocator memory + } + + @Override + long unusedReservedOnHeapMemory() + { + return 0; + } + + @Override + void releaseReferencesUnsafe() + { + // no references held + } + } + + /// Trie dumper attaching paths and types to cells and a translation of the key for rows and partitions. 
+ static class Dumper extends TrieDumperWithPath.DeletionAware + { + final TableMetadata metadata; + Columns columns; + int rowKeyLength = 0; + int partitionKeyLength = 0; + + Dumper(TableMetadata metadata) + { + this.metadata = metadata; + } + + @Override + public String contentToString(Object content) + { + if (content instanceof TrieCellData) + { + byte[] cellPath = Arrays.copyOfRange(keyBytes, rowKeyLength, keyPos); + Cell asCell = TrieBackedRow.cellFromCellData((TrieCellData) content, cellPath, cellPath.length, columns); + return asCell.toString(); + } + else if (content instanceof LivenessInfo) + { + rowKeyLength = keyPos; + Clustering clustering = metadata.comparator.clusteringFromByteComparable(ByteBufferAccessor.instance, + ByteComparable.preencoded(TrieBackedPartition.BYTE_COMPARABLE_VERSION, keyBytes, partitionKeyLength, keyPos - partitionKeyLength), + TrieBackedPartition.BYTE_COMPARABLE_VERSION); + columns = metadata.regularAndStaticColumns().columns(clustering == Clustering.STATIC_CLUSTERING); + return content.toString() + " at " + clustering.toString(metadata); + } + else if (content instanceof PartitionData) + { + partitionKeyLength = keyPos; + BufferDecoratedKey key = BufferDecoratedKey.fromByteComparable(ByteComparable.preencoded(TrieBackedPartition.BYTE_COMPARABLE_VERSION, keyBytes, 0, keyPos), + TrieBackedPartition.BYTE_COMPARABLE_VERSION, + metadata.partitioner); + return content.toString() + " at " + metadata.partitionKeyType.getString(key.getKey()); + } + + return content.toString(); + } + + @Override + public String deletionToString(TrieTombstoneMarker deletionMarker) + { + return deletionMarker.toString(); + } + } } diff --git a/src/java/org/apache/cassandra/db/memtable/TrieMemtable.md b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.md new file mode 100644 index 000000000000..65b9808b1c65 --- /dev/null +++ b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.md @@ -0,0 +1,775 @@ +# TrieMemtable + +This file describes the 
implementation of `TrieMemtable` and the various trie-backed containers that we use +to convert the trie representation to legacy structures. + +Trie memtables store the structure of the memtable in one single trie. If viewed as a simple map, this trie +maps a cell key to its data, where the cell key is composed by concatenating the byte-comparable +representations of all key components: +- token +- partition key +- clustering key +- column id +- cell path + +To maintain the correct order, we use the +[byte-comparable representation](../../utils/bytecomparable/ByteComparable.md) +of the keys everywhere in tries +and thus will omit "byte-comparable representation" in the text below (in other words, when we say e.g. +"indexed by the cell path" below we mean "indexed by the byte comparable representation of the cell path"). + +Because tries naturally perform prefix compression, the leading components of these keys are not repeated +and the storage and processing is at least as efficient as having a hierarchy of containers, but crucially +the trie machinery that operates on these does not need to understand the different types of keys or use +separate containers. Additionally, because of this prefix compressed structure we can easily find the points +of origin of various levels of the hierarchy, and can thus view their branches (i.e. everything that has the +same prefix, e.g. same decorated partition key, with that prefix removed; we call this a "tail trie" for that +prefix) as an implementation of the legacy container they map to. + +The trie memtable maintains separate deletion paths which originate at the partition level and contain the +hierarchy of deletions and the specific deletion time applicable to any point in the trie. See +[the deletion-aware tries section in Trie.md](../tries/Trie.md#deletion-aware-tries) for information about +how deletion branches work. + +The details of these mappings will be given below. 
+ +## Structure + +### Cell + +The cell is the lowest level of the data hierarchy (stored at the leaves of the trie) and contains: +- value +- timestamp +- ttl / local deletion time + +If a cell is part of a complex column, it also needs the cell path by which it is reached inside the +complex column (e.g. the map key). When a cell is deleted (which can happen either because it was explicitly +deleted or because its ttl expired), its value is removed and the cell becomes a tombstone. + +`TrieMemtable` uses trie memory to store cell data, which is represented by a `TrieCellData` object (the actual +byte content of which will be described in [the next section](#data-storage)); to be turned into a `Cell`, `CellData` needs +to be combined with its column definition and, if the column is complex, a cell path. Both of these can +be obtained from the last part of the path used to reach the cell's position in the memtable trie (see below). + +If a cell is deleted (which is a rare occurrence for cells in a memtable), we still store it as a +`TrieCellData` (rather than a tombstone) on the live part of the trie[^1]. + +[^1]: The main reason for this is the fact that cells can become deleted without any change other than time +elapsed, and we will always have the possibility of deleted cells being present in the live branches. +Because expiration should be very rare for data in memtables, we don't expect the accumulation of this +kind of tombstones to become a problem. This point is to be revisited when we implement on-disk tries and +compaction. + +### Complex column + +Complex columns are collections of cells with cell paths. We map this to tries where the `TrieCellData` +objects are stored in a trie map with the cell paths as keys. When we need to return a cell +from these containers to a legacy consumer, we combine the `Cell` object with the path used to reach +it. 
To make it easier to list the columns contained in a row, we mark the root of a complex column with +a special `COMPLEX_COLUMN_MARKER`, a singleton object that contains no data. + +Complex columns can be deleted as a whole (i.e. have the so-called "complex deletion"). We store complex +deletions as a deleted branch at the root of the complex column. Note that a complex column can be +in the live path, in both, but also in the deletion path only, if it has been deleted and no newer value +has been added. + +The class `TrieBackedComplexColumn` implements the mapping between a trie branch and the legacy complex +column concept. These complex columns cannot be constructed on their own and are always taken from larger +trie objects (e.g. a row or a memtable) to represent the column when a legacy consumer needs this form. + +#### Example + +The example below shows the trie that describes a complex column of type `map`, which is +created with the following insert statement: +``` +INSERT INTO %s (..., purchases) values (..., {"d79012af-8b34-4fb4-9799-6c0d29ca4e2f" : 88.67, + "830b82ce-a7f2-4939-9ea1-46b9d3714848" : 168.01}) +``` +``` +*** Start deletion branch + -> LIVE -> deletedAt=345[COLUMN] +↑ -> deletedAt=345[COLUMN] -> LIVE +*** End deletion branch + -> COMPLEX_COLUMN_MARKER +404830b82cea7f29399ea146b9d371484838 -> [?=40650051eb851eb8 ts=346] + 4d79012af8b34fb497996c0d29ca4e2f38 -> [?=40562ae147ae147b ts=346] +``` +The above is what we store in the trie. 
For clarity, we can also dump the trie in a way which combines +the `TrieCellData` with its path and column definitions to be able to see the column names, map keys and +interpreted values: +``` +*** Start deletion branch + -> LIVE -> deletedAt=345[COLUMN] +↑ -> deletedAt=345[COLUMN] -> LIVE +*** End deletion branch + -> COMPLEX_COLUMN_MARKER +404830b82cea7f29399ea146b9d371484838 -> [purchases[830b82ce-a7f2-4939-9ea1-46b9d3714848]=168.01 ts=346] + 4d79012af8b34fb497996c0d29ca4e2f38 -> [purchases[d79012af-8b34-4fb4-9799-6c0d29ca4e2f]=88.67 ts=346] +``` +When we insert a value (rather than update) in a complex column, Cassandra always creates a tombstone with +a smaller timestamp. Here we see this as a deletion branch which starts a deletion before the root of the +trie and ends it after the root, covering any data that may have previously existed for the complex column. + +On the live side of the trie, we have a `COMPLEX_COLUMN_MARKER` at the root and two trie branches for the +two entries in the map. The UUID keys are converted to byte-ordered by moving the UUID type digit first +and are only present in the path in the trie. When we convert these to cells this path would be converted +back to a UUID. + +### Row + +The row is the central concept in Cassandra's CQL data model. A row is a collection of typed columns. The +type and order of columns is predefined in the table's metadata, and thus each column can be identified by +a simple integer id[^2], which we represent as a variable length unsigned integer, usually taking just one +byte. Some of the columns are simple, mapping to a single cell, and some may be complex, +where the id maps to the complex column marker, and the individual cells can be reached by following the +cell path from the position of the marker. A row may also contain "liveness info", which tells Cassandra +if the row should be listed as live even if all cells that it contains have been deleted. 
We store this +liveness info as a `LivenessInfo` object at the root of a row and use it as a marker to list rows within +a partition. + +[^2]: Provided that the metadata does not change, which we can be guaranteed for the lifetime of a memtable. + +Rows can have a row deletion, which is represented as a branch deletion over the root of the row in the +deletion branch of the trie. Like complex columns, it is also possible for rows to exist in the deletion +branch alone, and to be able to recognize such rows as rows we mark the root of a row in a deletion branch +with a row level marker, which is a special `TrieTombstoneMarker` that has no effect other than as metadata +to mark a level. + +There are two ways that the content of a row can be listed: +- by cell, in which case we present all the cells/leaves of the trie, combining complex column cells with + their cell path. +- by column, in which case we present simple cells directly, and also look for `COMPLEX_COLUMN_MARKER` or + a deletion marker below the root of the row. If one of these is found, we take the tail trie (which + includes both the live and deleted part) and use it to form a `TrieBackedComplexColumn`. + +Rows are implemented by the class `TrieBackedRow`. They can be taken from a bigger structure or constructed +by inserting cells into a standalone row in a short-lived in-memory trie using a row builder. We use these +standalone tries as the building blocks to make the partition update objects that we insert into a memtable. 
+ +#### Example + +The full row for the example above, where the insert statement also sets the `total` column: +``` +INSERT INTO %s (..., total, purchases) values (..., 256.68, {"d79012af-8b34-4fb4-9799-6c0d29ca4e2f" : 88.67, + "830b82ce-a7f2-4939-9ea1-46b9d3714848" : 168.01}) +``` +is the following: +``` + -> [ts=346] +*** Start deletion branch + -> Level ROW +01 -> LIVE -> deletedAt=345[COLUMN] +01↑ -> deletedAt=345[COLUMN] -> LIVE +↑ -> Level ROW +*** End deletion branch +00 -> [?=40700ae147ae147b ts=346] +01 -> COMPLEX_COLUMN_MARKER + 404830b82cea7f29399ea146b9d371484838 -> [?=40650051eb851eb8 ts=346] + 4d79012af8b34fb497996c0d29ca4e2f38 -> [?=40562ae147ae147b ts=346] +``` + +This trie contains row level markers in both the live (liveness info `[ts=346]`) and deletion +(`Level ROW`)[^3] branches; a complex column deletion for the column `purchases` with index `01`; +a cell for the timestamp and value of the simple column `total` with index `00`, and two cells with path +for the entries of the `purchases` complex column. + +[^3]: Note that the deletion marker must be presented both before and after the branch because +it needs to be returned before the content of the branch in both forward and reverse direction. + +When this trie is presented as an iterator of `ColumnData`, `TrieBackedRow` uses `tailTries` to stop on +cells, deletions and complex column markers and view the trie above as: +``` +00 -> [total=256.68 ts=346] +01 -> [complex column] +``` +where the stop at `01` is given the complex column trie as shown in the example in the previous section as +the tail trie. + +### Partition + +A partition is an ordered collection of rows, where each row is indexed by a "clustering key", formed of +one or more columns of a pre-specified type. 
We represent these as tries which can be seen as: +- a collection of cells, indexed by the clustering key, column id and optionally a cell path, with some + metadata added at the trie nodes that start each row and complex column; or +- a collection of row tries, indexed by the clustering key, stored together in a single trie object. + +As before, the root of a partition is marked by an instance of the `PartitionMarker` interface. This marker +interface has no methods; in standalone partitions we use the singleton `PARTITION_MARKER`, but in partitions +that are part of a memtable we use objects that are also used to collect statistics. + +If the partition has a static row, we do not treat it differently, i.e. we use the `STATIC_CLUSTERING` it +reports as the key for the static row subtrie inside the partition. + +Partitions can be deleted as a whole, which we do by creating a branch deletion for the partition root. +Importantly, ranges of rows inside a partition can also be deleted using a range tombstone. We implement +the latter as range deletions in the deletion branch of the partition; the byte-comparable mapping of +range tombstone bounds and boundaries is chosen in a way that makes sure such deletion ranges cover the +trie sections containing any of the deleted rows. + +The deletion information for a partition is stored separately in the trie in a "deletion branch", which +is presented at the root of the partition. This separation is needed for several reasons: +- To be able to find the applicable deletion (the most recent of the applicable partition, range, row or + complex column tombstone) for any point in the trie by finding the closest deletion in the deletion + branch. This closest deletion may be millions of live entries away, and if the two were stored together + we would have to walk over all these live entries to find it. 
+- To be able to find the closest live entry to a given position easily, when there may be millions of + tombstones between that position and the live entry. By separating the deletion branch we can simply + advance in the live part of the trie. + +Note that when we take a branch of the trie representing a smaller container, e.g. a row, we follow both +the live and deletion branch and present any data we have together in the tail trie. + +Listing the content of a partition is usually accomplished by taking a so-called "unfiltered" row iterator +between a set of pairs of clustering bounds, containing all the rows and deletions applicable to that set. +We perform this by taking the intersection of the partition trie with the set between the bounds[^4], +and then walking the combination of the live and deletion trie to find: +- row markers (i.e. `LivenessInfo` objects or deletion boundaries with a row marker); when we find one we + take the tail trie and wrap it into a `TrieBackedRow`, or +- deletion boundaries, which we map into the corresponding `RangeTombstoneBound(ary)`. + +[^4]: Inclusivity does not matter for this because clustering bounds do not match row clustering keys; they +include a component that adjusts them to be just before or just after a row key. + +Note that if a deletion range applies over a wider range than the query, the result of this intersection +needs to restrict the deletion to the queried range. The trie code ensures that this is done. + +The partition representation is implemented in `TrieBackedPartition`. It can be created over the tail trie in +a memtable, or standalone in a short-lived in-memory trie. The most common usage of standalone partitions is +`TriePartitionUpdate`, which is built by adding rows into an initially empty trie. When Cassandra executes a +write request, it first turns it into one or more `TriePartitionUpdate` objects, and then merges these into +the current memtable. 
+ +#### Example + +The example below is constructed by the following statements: +``` +INSERT INTO %s (..., date, total, purchases) + VALUES (..., '2026-02-12', 324.83, {"82b4ce57-d6a0-4470-8747-1c2aa4fc5961": 324.83}) +DELETE FROM %s WHERE ... AND date = '2026-02-09' +DELETE FROM %s WHERE ... AND date <= '2026-01-31' AND date >= '2026-01-01' +``` + +``` + -> partition with 1 rows and 8 tombstones +*** Start deletion branch +4080004fe620 -> LIVE -> deletedAt=412[RANGE] + 500460 -> deletedAt=412[RANGE] -> LIVE + 0d38 -> Level ROW + LIVE -> deletedAt=329[ROW] + 38↑ -> Level ROW + deletedAt=329[ROW] -> LIVE + 1038 -> Level ROW + 01 -> LIVE -> deletedAt=366[COLUMN] + 01↑ -> deletedAt=366[COLUMN] -> LIVE + 38↑ -> Level ROW +*** End deletion branch +408000501038 -> [ts=367] + 00 -> [?=40744d47ae147ae1 ts=367] + 01 -> COMPLEX_COLUMN_MARKER + 40482b4ce57d6a047087471c2aa4fc596138 -> [?=40744d47ae147ae1 ts=367] +``` + +The trie here has a partition marker with some collected statistics ("partition with ...") +and is first indexed by the row clustering key (the `date` column), which is encoded as an integer +and wrapped in a clustering sequence container (seen as the `40` leading byte and `38`/`20`/`60` terminators). + +Here we have one live row at `408000501038`, including a deletion for its complex column, and two types of +row deletions: a deleted row at `408000500d38` and a range tombstone between `4080004fe620` and `408000500460`. +Notice how as we move to the bigger partition container the deletion branches are moved to be split at the +higher point, repeating the clustering key path — in the memtable trie all deletion branches split at +the partition level to be efficiently processed (see +[section in Trie.md](../tries/Trie.md#why-predetermined-deletion-levels-deletionsatfixedpoints-are-important) +for the reasons for this choice). + +Live rows are recognized by their `LivenessInfo` marker `[ts=367]`. 
Deleted rows (partial or not) have a level +marker shown above as `Level ROW`. Full row deletions are applied as a branch deletion covering the row +(boundary `LIVE -> deleted` at `408000500d38` and `deleted -> LIVE` at `408000500d38↑` (i.e. on the return path, +after the children of that point)); these boundaries combine with the level marker and are reported together. +Range tombstones use the clustering bound terminators `20` (before row) and `60` (after row) and are +expressed as range boundaries to span the trie sections between the two ends[^5]. + +[^5]: It is possible to express row deletions using these clustering bound terminators as well, having +the effect of converting the deletion from a `key = X` restriction to `key >= X AND key <= X`. Cassandra +could just as well work with the latter only; we tried this approach for a while and gave it up for two +reasons: the existing test suite needs to make a distinction between the two types of deletion; and having +the start and end at the same point in the trie presents some optimization opportunities. + +A `TrieBackedPartition` is usually consumed by converting it to an `UnfilteredRowIterator`. This is +achieved by calling `tailTries` and recognizing the row and range tombstone markers as described in the +previous paragraph, viewing the partition as: +``` +4080004fe620 -> LIVE -> deletedAt=412 (range tombstone start) + 500460 -> deletedAt=412 -> LIVE (range tombstone end) + 0d38 -> [row recognized by the marker "Level ROW" with tail] + 1038 -> [row recognized by the marker "ts=367" with tail] +``` +where each row is turned into a `TrieBackedRow` by passing in the tail trie originating at that point, and the +path that leads to the point to be converted to a clustering key object. 
+ +For example, the tail given for the `408000501038` row is +``` +-> [ts=367] +*** Start deletion branch +-> Level ROW +01 -> LIVE -> deletedAt=366[COLUMN] +01↑ -> deletedAt=366[COLUMN] -> LIVE +↑ -> Level ROW +*** End deletion branch +00 -> [?=40744d47ae147ae1 ts=367] +01 -> COMPLEX_COLUMN_MARKER + 40482b4ce57d6a047087471c2aa4fc596138 -> [?=40744d47ae147ae1 ts=367] +``` + +### Memtable + +A memtable is a giant trie containing a map of all the cells indexed by the concatenation of all cell key +components. In other words, it is a map of partition tries, indexed by the partition's decorated partition +key (i.e. token + serialized partition key), stored together in one giant structure. + +This partition map does not have any deletion component, because in Cassandra we cannot have deletions that +go above the level of individual partitions. Deletion branches are always rooted at the partition level. + +The memtable is split into memtable shards, i.e. separate tries that split the served token range in equal +ranges, each holding a long-lived in-memory trie. This is done to improve parallelism, because individual +in-memory tries cannot be modified by multiple threads concurrently. By splitting the space into shards we +allow per-shard parallelism, which typically improves the parallelism of the whole structure by the number +of shards because the randomized nature of the tokens usually results in well-distributed accesses to the +individual shards. + +While writes are handled by the individual shards, the memtable also contains a merged view of the shards +which is used to serve reads — given a key or bounds to query the merge can automatically handle the +question of finding the relevant shard that contains the data. Since tries can be read concurrently by +multiple threads including while they are being modified, we do not need to lock the shard to perform a read. 
+ +#### Example +The trie below +``` +40a9e72bd32b9f1ba24041434d450038 -> partition with 2 rows and 4 tombstones + *** Start deletion branch + 408000500e38 -> Level ROW + 01 -> LIVE -> deletedAt=345[COLUMN] + 01↑ -> deletedAt=345[COLUMN] -> LIVE + 38↑ -> Level ROW + *** End deletion branch + 408000500e38 -> [ts=346] + 00 -> [?=40700ae147ae147b ts=346] + 01 -> COMPLEX_COLUMN_MARKER + 404830b82cea7f29399ea146b9d371484838 -> [?=40650051eb851eb8 ts=346] + 4d79012af8b34fb497996c0d29ca4e2f38 -> [?=40562ae147ae147b ts=346] + 1138 -> [ts=385] + 00 -> [?=4058ceb851eb851f ts=385] + 01 -> COMPLEX_COLUMN_MARKER + 404dab4819dc6f5c05b5754a057d78c99a38 -> [?=4058ceb851eb851f ts=385] + ca8e7ee71a25ce664049424d0038 -> partition with 1 rows and 4 tombstones + *** Start deletion branch + 408000500f38 -> Level ROW + 01 -> LIVE -> deletedAt=351[COLUMN] + 01↑ -> deletedAt=351[COLUMN] -> LIVE + 38↑ -> Level ROW + *** End deletion branch + 408000500f38 -> [ts=352] + 00 -> [?=4080f651eb851eb8 ts=352] + 01 -> COMPLEX_COLUMN_MARKER + 40435441ee93ac90d098e89c75b414addb38 -> [?=407a4ab851eb851e ts=352] + edf143aa8d178f86b4d83a72a7517038 -> [?=405e87ae147ae148 ts=352] + cd0a37fd8f053c6c404170706c650038 -> partition with 1 rows and 8 tombstones + *** Start deletion branch + 4080004fe620 -> LIVE -> deletedAt=412[RANGE] + 500460 -> deletedAt=412[RANGE] -> LIVE + 0d38 -> Level ROW + LIVE -> deletedAt=329[ROW] + 38↑ -> Level ROW + deletedAt=329[ROW] -> LIVE + 1038 -> Level ROW + 01 -> LIVE -> deletedAt=366[COLUMN] + 01↑ -> deletedAt=366[COLUMN] -> LIVE + 38↑ -> Level ROW + *** End deletion branch + 408000501038 -> [ts=367] + 00 -> [?=40744d47ae147ae1 ts=367] + 01 -> COMPLEX_COLUMN_MARKER + 40482b4ce57d6a047087471c2aa4fc596138 -> [?=40744d47ae147ae1 ts=367] +``` +is constructed by running the code in `TrieMemtableDocTrieMakerTest.java` and represents the full trie for a small +table with three partitions. 
The leading part of this trie is the decorated partition key, composed of a token +and serialization of the partition key (the `company` column, containing a string), wrapped in a sequence encoding +(see [description in ByteComparable.md](../../utils/bytecomparable/ByteComparable.md#multi-component-sequences-partition-or-clustering-keys-tuples-bounds-and-nulls)). +The first byte `40` starts the sequence encoding, followed by 8 bytes of Murmur3-generated token, another `40` byte +to start the next value in the sequence, a `00`-terminated string for the company name, and a `38` sequence +terminator. + +At each partition key we have a partition marker that also collects statistics about the partition, and each +partition is as described in the previous section, with deletion branch splitting at the partition root. + +When the memtable is consumed (e.g. on flush or to list a range of partitions), we once again use `tailTries` +recognizing these partition markers to view it as: +``` +40a9e72bd32b9f1ba24041434d450038 -> [partition with tail] + ca8e7ee71a25ce664049424d0038 -> [partition with tail] + cd0a37fd8f053c6c404170706c650038 -> [partition with tail] +``` +and we use the tail trie and the path used to reach the point to form a `TrieBackedPartition`. E.g. the example +trie in the previous section is given as the tail for the key `40cd0a37fd8f053c6c404170706c650038`, which is +translated to the partition key "Apple".
+ +Here is another representation of the trie above, where we have combined the cells, rows and partitions with their +type definitions and keys to make it easier to see what the content is describing: +``` +40a9e72bd32b9f1ba24041434d450038 -> partition with 2 rows and 4 tombstones at ACME + *** Start deletion branch + 408000500e38 -> Level ROW + 01 -> LIVE -> deletedAt=345[COLUMN] + 01↑ -> deletedAt=345[COLUMN] -> LIVE + 38↑ -> Level ROW + *** End deletion branch + 408000500e38 -> [ts=346] at date=2026-02-10 + 00 -> [total=256.68 ts=346] + 01 -> COMPLEX_COLUMN_MARKER + 404830b82cea7f29399ea146b9d371484838 -> [purchases[830b82ce-a7f2-4939-9ea1-46b9d3714848]=168.01 ts=346] + 4d79012af8b34fb497996c0d29ca4e2f38 -> [purchases[d79012af-8b34-4fb4-9799-6c0d29ca4e2f]=88.67 ts=346] + 1138 -> [ts=385] at date=2026-02-13 + 00 -> [total=99.23 ts=385] + 01 -> COMPLEX_COLUMN_MARKER + 404dab4819dc6f5c05b5754a057d78c99a38 -> [purchases[dab4819d-c6f5-4c05-b575-4a057d78c99a]=99.23 ts=385] + ca8e7ee71a25ce664049424d0038 -> partition with 1 rows and 4 tombstones at IBM + *** Start deletion branch + 408000500f38 -> Level ROW + 01 -> LIVE -> deletedAt=351[COLUMN] + 01↑ -> deletedAt=351[COLUMN] -> LIVE + 38↑ -> Level ROW + *** End deletion branch + 408000500f38 -> [ts=352] at date=2026-02-11 + 00 -> [total=542.79 ts=352] + 01 -> COMPLEX_COLUMN_MARKER + 40435441ee93ac90d098e89c75b414addb38 -> [purchases[35441ee9-3ac9-40d0-98e8-9c75b414addb]=420.66999999999996 ts=352] + edf143aa8d178f86b4d83a72a7517038 -> [purchases[3edf143a-a8d1-478f-86b4-d83a72a75170]=122.12 ts=352] + cd0a37fd8f053c6c404170706c650038 -> partition with 1 rows and 8 tombstones at Apple + *** Start deletion branch + 4080004fe620 -> LIVE -> deletedAt=412[RANGE] + 500460 -> deletedAt=412[RANGE] -> LIVE + 0d38 -> Level ROW + LIVE -> deletedAt=329[ROW] + 38↑ -> Level ROW + deletedAt=329[ROW] -> LIVE + 1038 -> Level ROW + 01 -> LIVE -> deletedAt=366[COLUMN] + 01↑ -> deletedAt=366[COLUMN] -> LIVE + 38↑ -> Level ROW + *** End 
deletion branch + 408000501038 -> [ts=367] at date=2026-02-12 + 00 -> [total=324.83 ts=367] + 01 -> COMPLEX_COLUMN_MARKER + 40482b4ce57d6a047087471c2aa4fc596138 -> [purchases[82b4ce57-d6a0-4470-8747-1c2aa4fc5961]=324.83 ts=367] +``` + +## Data storage + +The content of the tries in the current version of the trie memtables is stored as bytes in trie cells and +converted to the various content objects when it is requested by a consumer. To accomplish this, `TrieMemtable` +constructs its in-memory trie with a `ContentSerializer` that can write and read content from 32-byte trie +cells. + +We also have three types of markers that may appear multiple times and carry no additional information. For +these (`LivenessInfo.EMPTY`, `TrieTombstoneMarker.Level.ROW` and `COMPLEX_COLUMN_MARKER`) we use special +content ids that use no trie cells. + +The tables below describe how the data is stored with an example for each. + +### TrieCellData.Embedded for cells with values up to 16 bytes in length + +Cell data that has values up to 16 bytes in length is stored by setting `offsetBits` to the length of the +value and then filling the cell with: + +| bytes | content | example | example decoding | +|-------|------------------------------|-------------------|--------------------| +| 0-15 | value | 40744d47 ae147ae1 | 324.83 | +| 16-19 | ttl | 00000000 | NO_TTL | +| 20-23 | unsigned local deletion time | FFFFFFFF | NO_EXPIRATION_TIME | +| 24-31 | timestamp | 00000000 0000016F | 367 | + +The example, with `offsetBits == 8`, encodes the cell data `[?=40744d47ae147ae1 ts=367]` for the cell +`[total=324.83 ts=367]` from above. + +### TrieCellData.EmbeddedNoTTL for cells with values between 17 and 24 bytes in length with no TTL + +If a cell is not expiring or expired/deleted, it has empty TTL and local deletion/expiration time.
+In this case we can use 8 extra bytes for value: + +| bytes | content | example | example decoding | +|-------|-----------------------------|---------------------------------------------------------------------------------|-------------------------| +| 0-23 | value | 53 61 6D 70 6C 65 20 74
65 78 74 20 6F 66 20 32
33 20 62 79 74 65 73 00 | Sample text of 23 bytes | +| 24-31 | timestamp | 00000000 00000160 | 352 | + +The example, with `offsetBits == 23`, encodes cell data `[?=53616D706C652074657874206F66203233206279746573 ts=352]` +containing an ASCII string. + +### TrieCellData.Counter for counter cells up to 15 bytes in length + +For counters that can be embedded (usually with empty value), we use `offsetBits == 0x19` and store the value +length in the cell. + +| bytes | content | example | example decoding | +|-------|------------------------------|-------------------|--------------------| +| 0-14 | value | | | +| 15 | value length | 00 | empty value | +| 16-19 | ttl | 00000000 | NO_TTL | +| 20-23 | unsigned local deletion time | FFFFFFFF | NO_EXPIRATION_TIME | +| 24-31 | timestamp | 00000000 0000016D | 365 | + + +### TrieCellData.External for cells with values that can't be fitted in the available trie bytes + +Externally stored cell values use `offsetBits == 0x1A` and the following content: + +| bytes | content | example | example decoding | +|-------|------------------------------|-------------------|--------------------------| +| 0 | is counter | 00 | non-counter | +| 1-3 | _unused_ | | | +| 4-7 | value length | 20 | 32 bytes | +| 8-15 | external value handle | 12345678 90ABCDEF | address in direct memory | +| 16-19 | ttl | 00000000 | NO_TTL | +| 20-23 | unsigned local deletion time | FFFFFFFF | NO_EXPIRATION_TIME | +| 24-31 | timestamp | 00000000 0000016C | 364 | + +The example encodes a cell with timestamp 364, no expiration, and a 32-byte value stored in direct memory. + +### LivenessInfo + +We use `offsetBits == 0x1B` for liveness info.
+ +| bytes | content | example | example decoding | +|-------|------------------------------|-------------------|--------------------| +| 0-15 | _unused_ | | | +| 16-19 | ttl | 00000000 | NO_TTL | +| 20-23 | unsigned local deletion time | FFFFFFFF | NO_EXPIRATION_TIME | +| 24-31 | timestamp | 00000000 0000016F | 367 | + +The example encodes the `[ts=367]` liveness info object from above. + +### TrieTombstoneMarker + +Tombstone markers use two offset bit values, depending on whether they are to be presented before (`0x1C`) +or after (`0x1D`) the child branch. + +| bytes | content | example | example decoding | +|-------|---------------------------|-------------------|------------------------------------| +| 0 | has row level marker | 00 | no row level marker | +| 1-2 | _unused_ | | | +| 3 | right tombstone kind | 01 | COLUMN (kind ordinal = 1) | +| 4-7 | right local deletion time | 69CFB5AF | 1775220143 | +| 8-15 | right timestamp | 00000000 00000159 | 345 (right deletion timestamp) | +| 16-18 | _unused_ | | | +| 19 | left tombstone kind | FF | not present | +| 20-23 | left local deletion time | | | +| 24-31 | left timestamp | | | + +The example, with `offsetBits == 0x1C`, encodes `LIVE -> deletedAt=345, localDeletion=1775220143[COLUMN]`, a +tombstone boundary that starts a column deletion with timestamp 345. + +### PartitionData + +For partition data we use `offsetBits = 0x1E`. + +| bytes | content | example | example decoding | +|-------|---------------------|----------|-------------------------------------| +| 0-3 | row count | 00000001 | 1 row (including static) | +| 4-7 | tombstone count | 00000008 | 8 tombstones | +| 8-31 | _unused_ | | | + +The example encodes partition metadata for a partition with 1 row and 8 tombstones, as seen in the +"Apple" partition example above. + +The `PartitionData` object encapsulates this and stores the buffer and pointer.
The mutation code uses this +to modify the values in the buffer directly when a row/tombstone is added to the partition or removed. + +### External buffer storage for large values + +If a cell value can fit within the 15 bytes in the `TrieCellData` block, it is directly stored there. + +Values that don't fit are stored according to the memtable allocation mode. When the allocation mode is +`offheap_objects`, the allocator hands out memory addresses from a direct memory slab. The memtable +will in this case directly store the memory address in the `TrieCellData` serialization, and wrap it +in a direct buffer when requested. This is the most efficient mode of operation of the memtable where +all content and metadata is stored in off-heap memory; the on-heap presence of a memtable is constant +and somewhere on the order of 100 KiB. + +In the other allocation modes, the allocator returns byte buffers which we must store as Java objects. +To do this, the memtable defers to the in-memory tries' `ContentManagerPojo` that maintains lists of java +objects and maps them into integer handles. To store, we ask the allocator for a buffer, fill it in, give +it to the content manager and save the id it returns in the handle field. To read, we get the id from +the handle field and ask the manager for the buffer. In this mode of operation every large cell has +additional on-heap presence in the form of one byte buffer and a list entry for it. + +## Other key points + +### Write atomicity and monotonicity + +Cassandra provides some consistency guarantees for writes to the same partition on the same node, namely that +such writes are atomic (i.e. a reader cannot see part of an update and miss something else that was modified +with the same update) and, if requests are made to the same node, that writes to the same partition are +monotonic (i.e. if one process issues two writes one after the other, no reader can see the result of the +latter write without the result of the former). 
+ +To ensure these two properties, the memtable performs trie mutations with a force-copy predicate that +recognizes partition boundaries. The effect of this predicate is that whenever a mutation reaches the partition +level, every change to a node at that level or below it in the trie is performed by making a copy of the +trie cells rather than modifying them in place. + +This has the effect of practically maintaining a snapshot of the partition's state before the mutation and +letting any concurrent reader that has reached a node inside that snapshot observe it unchanged. Eventually +the mutation will reach a point above the partition level and modify a pointer to the new version of the +partition, which will swap in the new version of the whole partition for later readers to see. From this point +on the old snapshot can no longer be found, and eventually all readers that could be inside it will finish +their work and the old snapshot can be thrown away. + +See also [the Atomicity and Consistency sections in InMemoryTrie.md](../tries/InMemoryTrie.md#atomicity). + +### Preventing corruption from reuse + +As the memtable trie is long-lived, it should be able to identify cells of the trie that have been replaced +with newer versions and reuse them. For this to work, it needs to be able to tell if all operations started +before a given point in time have been completed. If e.g. it issues such a "read op barrier" after a write +wires in a new version of a partition, and that barrier "expires" (i.e. guarantees that all operations +started before the barrier's issue have completed), then it can be certain that none of the currently active or future +operations on the memtable can see the previous snapshot of that partition and all its nodes can thus be +recycled and reused. + +This doesn't need to be granular to a partition or individual cells; we can form batches of cells to reuse +and issue a single barrier for the whole group.
To do this, we use the `readOrdering` `OpOrder` that +`ColumnFamilyStore` already provides. Every read on a table marks itself in this op order and closes it +when it completes, and it gives us the required signal for reusing space in the memtable tries. + +There is an additional situation where memory containing memtable data could be reused if we are not careful; +this is a feature shared by all memtable implementations: if data is stored off-heap, it can happen +that some results of a read reference this data long after the whole memtable is deleted and has released +its memory. This can occur, for example, if this node is the coordinator for a request with multiple +replicas and it has to wait for responses from other nodes. As it is not a good idea to block reuse for the +long periods that such responses can require, we need a different way to ensure that such data is alive. + +The latter problem is solved by copying data to on-heap objects for temporary storage before it is given +to the coordination layer to keep. This has a pretty high overhead and will likely be replaced with trie +serialization in later stages of the work on trie-based interfaces. + +### Unfiltered and filtered row iterators + +When serving a query, unfiltered row iterators from multiple sources are merged to form a single stream and +then "filtered", i.e. processed to remove all deletions and report the final result. To aid the filtering +process, a trie-backed partition's iterator can be asked to `stopIssuingTombstones`, which tells it to stop +looking at the deletion branch and makes it possible to quickly return live data without having to skip over +tombstones that may be between it and the current position of the iterator. + +This currently only works for partitions that are present only in a memtable, because the deletions need +to be applied to data from other sources. This should be improved with later work on on-disk tries and query +interfaces.
+ +### Applying deletions and handling dangling markers + +When the memtable receives a mutation that contains deletions, the trie code applies these deletions to all +content that they apply to / cover. This means that anything with a lower timestamp than the deletion time +is removed from the memtable, including cells, liveness info or earlier tombstones. When such a deletion is +applied, the substructure that leads to the deleted cells is also removed, which also means that bigger +branches like rows can become empty and should be removed as well. + +There is a little complication here caused by level markers. If we don't do anything special about them, a +deletion, for example, that deletes a partition would remove all cells, the rows' liveness info, all range +or row tombstones, but would keep an empty liveness info object as a marker for the root of each row that +existed before. This marker would have no substructure and represent an empty row, but unfortunately this +marker would also force the path leading to it to be retained, inflating the size of the memtable and +complicating later walks that have to pass over it. + +To avoid this problem and make sure that we delete markers that no longer serve any purpose, the in-memory +trie uses a `shouldPreserveWithoutChildren` predicate that is used to check if a trie content/metadata +entry makes sense when it has no substructure. This checker is called every time the mutation code notices +that it is building a leaf node, and drops the content (and with it the whole leaf and path leading to it) +if the predicate returns `false`. By using a suitable predicate, recognizing `LivenessInfo.EMPTY`, +`COMPLEX_COLUMN_MARKER` and `Level ROW`, the trie memtable makes sure that unproductive markers are dropped +without affecting things like cells or non-empty liveness info that are meaningful without substructure. 
+ +Note that when data in the trie and paths are deleted, the trie will drop and reuse the trie cells that stored +them, but cannot do anything about non-trie memory. This means, for example, that, while we can fully release +deletion markers, liveness info and cells with values of 15 bytes or fewer, large cell values stored in +on- or off-heap slab buffers cannot be released. This fact is a feature of Cassandra memtables that is not +fully resolved by the current trie memtable implementation. + +#### Example + +Suppose we issue a partition deletion for the 'Apple' partition using +``` +DELETE FROM ... USING TIMESTAMP 513 WHERE company = 'Apple'; +``` + +This deletion is represented by the partition update +``` +-> PARTITION_MARKER +*** Start deletion branch +-> LIVE -> deletedAt=513[PARTITION] +↑ -> deletedAt=513[PARTITION] -> LIVE +*** End deletion branch +``` + +To merge it into the trie, we attach it at the path corresponding to its decorated partition key: +``` +40cd0a37fd8f053c6c404170706c650038 -> PARTITION_MARKER + *** Start deletion branch + -> LIVE -> deletedAt=513[PARTITION] + ↑ -> deletedAt=513[PARTITION] -> LIVE + *** End deletion branch +``` +and then we call the trie code to merge it in. + +The trie code walks this trie in parallel with the in-memory memtable trie.
When it reaches the "Apple" +partition it will see something similar to: +``` +40cd0a37fd8f053c6c404170706c650038 -> partition with 1 rows and 8 tombstones + *** Start deletion branch + -> TO APPLY: LIVE -> deletedAt=513[PARTITION] + 4080004fe620 -> LIVE -> deletedAt=412[RANGE] + 500460 -> deletedAt=412[RANGE] -> LIVE + 0d38 -> Level ROW + LIVE -> deletedAt=329[ROW] + 38↑ -> Level ROW + deletedAt=329[ROW] -> LIVE + 1038 -> Level ROW + 01 -> LIVE -> deletedAt=366[COLUMN] + 01↑ -> deletedAt=366[COLUMN] -> LIVE + 38↑ -> Level ROW + ↑ -> TO APPLY: deletedAt=513[PARTITION] -> LIVE + *** End deletion branch + -> TO APPLY: LIVE -> deletedAt=513[PARTITION] + 408000501038 -> [ts=367] + 00 -> [?=40744d47ae147ae1 ts=367] + 01 -> COMPLEX_COLUMN_MARKER + 40482b4ce57d6a047087471c2aa4fc596138 -> [?=40744d47ae147ae1 ts=367] + ↑ -> TO APPLY: deletedAt=513[PARTITION] -> LIVE +``` + +Everything between the "TO APPLY" bounds that has a timestamp smaller than 513 is removed from the trie, +resulting in: +``` +40cd0a37fd8f053c6c404170706c650038 -> partition with 1 rows and 2 tombstones + *** Start deletion branch + -> LIVE -> deletedAt=513[PARTITION] + 408000500d38 -> Level ROW + 38↑ -> Level ROW + 1038 -> Level ROW + 38↑ -> Level ROW + ↑ -> deletedAt=513[PARTITION] -> LIVE + *** End deletion branch + 408000501038 -> [ts=EMPTY] + 01 -> COMPLEX_COLUMN_MARKER +``` +As part of the process of modifying the in-memory trie, the mutation code recognizes that the `COMPLEX_COLUMN_MARKER` +and the `Level ROW` markers have no children and calls the `shouldPreserveWithoutChildren` predicate. As that returns +false, the markers and the paths leading to them are removed. +``` +40cd0a37fd8f053c6c404170706c650038 -> partition with 1 rows and 2 tombstones + *** Start deletion branch + -> LIVE -> deletedAt=513[PARTITION] + ↑ -> deletedAt=513[PARTITION] -> LIVE + *** End deletion branch + 408000501038 -> [ts=EMPTY] +``` +Now `[ts=EMPTY]` (i.e.
`LivenessInfo.EMPTY`) has no children and has the `shouldPreserveWithoutChildren` predicate +called, which tells the trie code to remove it and the path leading to it, resulting in the final +``` +40cd0a37fd8f053c6c404170706c650038 -> partition with 0 rows and 2 tombstones + *** Start deletion branch + -> LIVE -> deletedAt=513[PARTITION] + ↑ -> deletedAt=513[PARTITION] -> LIVE + *** End deletion branch +``` +The intermediate stages shown above are not actually materialized and just given for clarification: the real +process yields the final state directly by walking the trie in parallel with the deletions and recursively +setting pointers to null as they are deleted or become empty. \ No newline at end of file diff --git a/src/java/org/apache/cassandra/db/memtable/TrieMemtableStage1.java b/src/java/org/apache/cassandra/db/memtable/TrieMemtableStage1.java index ffc3535c655f..f9fd2c105be4 100644 --- a/src/java/org/apache/cassandra/db/memtable/TrieMemtableStage1.java +++ b/src/java/org/apache/cassandra/db/memtable/TrieMemtableStage1.java @@ -79,26 +79,23 @@ import static org.apache.cassandra.io.sstable.SSTableReadsListener.NOOP_LISTENER; -/** - * Previous TrieMemtable implementation, provided for two reasons: - *
    - *
  • to easily compare current and earlier implementations of the trie memtable - *
  • to have an option to change a database back to the older implementation if we find a bug or a performance problem - * with the new code. - *
- *

- * To switch a table to this version, use - *

- *   ALTER TABLE ... WITH memtable = {'class': 'TrieMemtableStage1'}
- * 
- * or add - *
- *   memtable:
- *     class: TrieMemtableStage1
- * 
- * in cassandra.yaml to switch a node to it as default. - * - */ +/// Previous TrieMemtable implementation, provided for two reasons: +/// +/// - to easily compare current and earlier implementations of the trie memtable +/// - to have an option to change a database back to the older implementation if we find a bug or a performance +/// problem with the new code. +/// +/// +/// To switch a table to this version, use +/// ``` +/// ALTER TABLE ... WITH memtable = {'class': 'TrieMemtableStage1'} +/// ``` +/// or add +/// ``` +/// memtable: +/// class: TrieMemtableStage1 +/// ``` +/// in `cassandra.yaml` to switch a node to it as default. public class TrieMemtableStage1 extends AbstractAllocatorMemtable { private static final Logger logger = LoggerFactory.getLogger(TrieMemtableStage1.class); @@ -138,14 +135,6 @@ public class TrieMemtableStage1 extends AbstractAllocatorMemtable @Unmetered private final TrieMemtableMetricsView metrics; - /** - * Keeps an estimate of the average row size in this memtable, computed from a small sample of rows. - * Because computing this estimate is potentially costly, as it requires iterating the rows, - * the estimate is updated only whenever the number of operations on the memtable increases significantly from the - * last update. This estimate is not very accurate but should be ok for planning or diagnostic purposes. 
- */ - private volatile MemtableAverageRowSize estimatedAverageRowSize; - // only to be used by init(), to setup the very first memtable for the cfs TrieMemtableStage1(AtomicReference commitLogLowerBound, TableMetadataRef metadataRef, Owner owner) { @@ -307,14 +296,6 @@ public long rowCount(final ColumnFilter columnFilter, final DataRange dataRange) return total; } - @Override - public long getEstimatedAverageRowSize() - { - if (estimatedAverageRowSize == null || currentOperations.get() > estimatedAverageRowSize.operations * 1.5) - estimatedAverageRowSize = new MemtableAverageRowSize(this); - return estimatedAverageRowSize.rowSize; - } - @Override public UnfilteredRowIterator rowIterator(DecoratedKey key, Slices slices, ColumnFilter columnFilter, boolean reversed, SSTableReadsListener listener) { @@ -406,7 +387,8 @@ public MemtableUnfilteredPartitionIterator partitionIterator(final ColumnFilter boolean includeStart = isBound || keyRange instanceof IncludingExcludingBounds; boolean includeStop = isBound || keyRange instanceof Range; - Trie subMap = mergedTrie.subtrie(left, includeStart, right, includeStop); + Trie subMap = mergedTrie.subtrie(toComparableBound(left, includeStart), + toComparableBound(right, !includeStop)); return new MemtableUnfilteredPartitionIterator(metadata(), allocator.ensureOnHeap(), @@ -415,6 +397,11 @@ public MemtableUnfilteredPartitionIterator partitionIterator(final ColumnFilter dataRange); } + private static ByteComparable toComparableBound(PartitionPosition position, boolean before) + { + return position == null || position.isMinimum() ? 
null : position.asComparableBound(before); + } + public Partition getPartition(DecoratedKey key) { int shardIndex = boundaries.getShardForKey(key); @@ -445,7 +432,7 @@ private static DecoratedKey getPartitionKeyFromPath(TableMetadata metadata, Byte public FlushablePartitionSet getFlushSet(PartitionPosition from, PartitionPosition to) { - Trie toFlush = mergedTrie.subtrie(from, true, to, false); + Trie toFlush = mergedTrie.subtrie(toComparableBound(from, true), toComparableBound(to, true)); long keySize = 0; int keyCount = 0; diff --git a/src/java/org/apache/cassandra/db/memtable/TrieMemtableStage2.java b/src/java/org/apache/cassandra/db/memtable/TrieMemtableStage2.java new file mode 100644 index 000000000000..0ac9f6059a5b --- /dev/null +++ b/src/java/org/apache/cassandra/db/memtable/TrieMemtableStage2.java @@ -0,0 +1,881 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.db.memtable; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; +import java.util.concurrent.locks.ReentrantLock; +import java.util.function.Predicate; + +import com.google.common.annotations.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.BufferDecoratedKey; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DataRange; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionInfo; +import org.apache.cassandra.db.MutableDeletionInfo; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.Slices; +import org.apache.cassandra.db.commitlog.CommitLogPosition; +import org.apache.cassandra.db.filter.ClusteringIndexFilter; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.partitions.AbstractUnfilteredPartitionIterator; +import org.apache.cassandra.db.partitions.Partition; +import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.db.partitions.TrieBackedPartitionStage2; +import org.apache.cassandra.db.partitions.TriePartitionUpdateStage2; +import org.apache.cassandra.db.partitions.TriePartitionUpdaterStage2; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.tries.Direction; +import org.apache.cassandra.db.tries.InMemoryBaseTrie; +import org.apache.cassandra.db.tries.InMemoryTrie; +import org.apache.cassandra.db.tries.Trie; +import org.apache.cassandra.db.tries.TrieEntriesWalker; +import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; +import 
org.apache.cassandra.db.tries.TrieTailsIterator; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.Bounds; +import org.apache.cassandra.dht.IncludingExcludingBounds; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.index.transactions.UpdateTransaction; +import org.apache.cassandra.io.compress.BufferType; +import org.apache.cassandra.io.sstable.SSTableReadsListener; +import org.apache.cassandra.metrics.TableMetrics; +import org.apache.cassandra.metrics.TrieMemtableMetricsView; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.TableMetadataRef; +import org.apache.cassandra.utils.Clock; +import org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.concurrent.OpOrder; +import org.apache.cassandra.utils.memory.EnsureOnHeap; +import org.apache.cassandra.utils.memory.HeapCloner; +import org.apache.cassandra.utils.memory.MemtableAllocator; +import org.github.jamm.Unmetered; + +/// Previous TrieMemtable implementation, provided for two reasons: +/// +/// - to easily compare current and earlier implementations of the trie memtable +/// - to have an option to change a database back to the older implementation if we find a bug or a performance +/// problem with the new code. +/// +/// +/// To switch a table to this version, use +/// ``` +/// ALTER TABLE ... WITH memtable = {'class': 'TrieMemtableStage2'} +/// ``` +/// or add +/// ``` +/// memtable: +/// class: TrieMemtableStage2 +/// ``` +/// in `cassandra.yaml` to switch a node to it as default. 
+public class TrieMemtableStage2 extends AbstractShardedMemtable +{ + private static final Logger logger = LoggerFactory.getLogger(TrieMemtableStage2.class); + + /** Buffer type to use for memtable tries (on- vs off-heap) */ + public static final BufferType BUFFER_TYPE = DatabaseDescriptor.getMemtableAllocationType().toBufferType(); + + /** + * Force copy checker (see InMemoryTrie.ApplyState) ensuring all modifications apply atomically and consistently to + * the whole partition. + */ + public static final Predicate> FORCE_COPY_PARTITION_BOUNDARY = features -> isPartitionBoundary(features.content()); + + public static final Predicate IS_PARTITION_BOUNDARY = TrieMemtableStage2::isPartitionBoundary; + + // Set to true when the memtable requests a switch (e.g. for trie size limit being reached) to ensure only one + // thread calls cfs.switchMemtableIfCurrent. + private final AtomicBoolean switchRequested = new AtomicBoolean(false); + + /** + * Sharded memtable sections. Each is responsible for a contiguous range of the token space (between boundaries[i] + * and boundaries[i+1]) and is written to by one thread at a time, while reads are carried out concurrently + * (including with any write). + */ + private final MemtableShard[] shards; + + /** + * A merged view of the memtable map. Used for partition range queries and flush. + * For efficiency we serve single partition requests off the shard which offers more direct InMemoryTrie methods. 
+ */ + private final Trie mergedTrie; + + @Unmetered + private final TrieMemtableMetricsView metrics; + + TrieMemtableStage2(AtomicReference commitLogLowerBound, TableMetadataRef metadataRef, Owner owner, Integer shardCountOption) + { + super(commitLogLowerBound, metadataRef, owner, shardCountOption); + this.metrics = TrieMemtableMetricsView.getOrCreate(metadataRef.keyspace, metadataRef.name); + this.shards = generatePartitionShards(boundaries.shardCount(), metadataRef, metrics, owner.readOrdering()); + this.mergedTrie = makeMergedTrie(shards); + logger.trace("Created memtable with {} shards", this.shards.length); + } + + private static MemtableShard[] generatePartitionShards(int splits, + TableMetadataRef metadata, + TrieMemtableMetricsView metrics, + OpOrder opOrder) + { + if (splits == 1) + return new MemtableShard[] { new MemtableShard(metadata, metrics, opOrder) }; + + MemtableShard[] partitionMapContainer = new MemtableShard[splits]; + for (int i = 0; i < splits; i++) + partitionMapContainer[i] = new MemtableShard(metadata, metrics, opOrder); + + return partitionMapContainer; + } + + private static Trie makeMergedTrie(MemtableShard[] shards) + { + List> tries = new ArrayList<>(shards.length); + for (MemtableShard shard : shards) + tries.add(shard.data); + return Trie.mergeDistinct(tries); + } + + @Override + public boolean isClean() + { + for (MemtableShard shard : shards) + if (!shard.isClean()) + return false; + return true; + } + + @VisibleForTesting + @Override + public void switchOut(OpOrder.Barrier writeBarrier, AtomicReference commitLogUpperBound) + { + super.switchOut(writeBarrier, commitLogUpperBound); + + for (MemtableShard shard : shards) + shard.allocator.setDiscarding(); + } + + @Override + public void discard() + { + super.discard(); + // metrics here are not thread safe, but I think we can live with that + metrics.lastFlushShardDataSizes.reset(); + for (MemtableShard shard : shards) + { + 
metrics.lastFlushShardDataSizes.update(shard.liveDataSize()); + } + // the buffer release is a longer-running process, do it in a separate loop to not make the metrics update wait + for (MemtableShard shard : shards) + { + shard.allocator.setDiscarded(); + shard.data.discardBuffers(); + } + } + + /** + * Should only be called by ColumnFamilyStore.apply via Keyspace.apply, which supplies the appropriate + * OpOrdering. + * + * commitLogSegmentPosition should only be null if this is a secondary index, in which case it is *expected* to be null + */ + @Override + public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup) + { + DecoratedKey key = update.partitionKey(); + MemtableShard shard = shards[boundaries.getShardForKey(key)]; + long colUpdateTimeDelta = shard.put(update, indexer, opGroup); + + if (shard.data.reachedAllocatedSizeThreshold()) + signalFlushRequired(ColumnFamilyStore.FlushReason.TRIE_LIMIT, true); + + return colUpdateTimeDelta; + } + + @Override + public void signalFlushRequired(ColumnFamilyStore.FlushReason flushReason, boolean skipIfSignaled) + { + if (!switchRequested.getAndSet(true) || !skipIfSignaled) + { + logger.info("Scheduling flush for table {} due to {}", this.metadata.get(), flushReason); + owner.signalFlushRequired(this, flushReason); + } + } + + @Override + public void addMemoryUsageTo(MemoryUsage stats) + { + super.addMemoryUsageTo(stats); + for (MemtableShard shard : shards) + { + stats.ownsOnHeap += shard.allocator.onHeap().owns(); + stats.ownsOffHeap += shard.allocator.offHeap().owns(); + stats.ownershipRatioOnHeap += shard.allocator.onHeap().ownershipRatio(); + stats.ownershipRatioOffHeap += shard.allocator.offHeap().ownershipRatio(); + } + } + + @Override + public long getLiveDataSize() + { + long total = 0L; + for (MemtableShard shard : shards) + total += shard.liveDataSize(); + return total; + } + + @Override + public long operationCount() + { + long total = 0L; + for (MemtableShard shard : shards) 
+ total += shard.currentOperations(); + return total; + } + + @Override + public long partitionCount() + { + int total = 0; + for (MemtableShard shard : shards) + total += shard.partitionCount(); + return total; + } + + public int getShardCount() + { + return shards.length; + } + + @Override + public long getEstimatedAverageRowSize() + { + if (estimatedAverageRowSize == null || currentOperations.get() > estimatedAverageRowSize.operations * 1.5) + estimatedAverageRowSize = new MemtableAverageRowSize(this, mergedTrie); + return estimatedAverageRowSize.rowSize; + } + + /** + * Returns the minTS if one available, otherwise NO_MIN_TIMESTAMP. + * + * EncodingStats uses a synthetic epoch TS at 2015. We don't want to leak that (CASSANDRA-18118) so we return NO_MIN_TIMESTAMP instead. + * + * @return The minTS or NO_MIN_TIMESTAMP if none available + */ + @Override + public long getMinTimestamp() + { + long min = Long.MAX_VALUE; + for (MemtableShard shard : shards) + min = EncodingStats.mergeMinTimestamp(min, shard.stats); + return min != EncodingStats.NO_STATS.minTimestamp ? 
min : NO_MIN_TIMESTAMP; + } + + @Override + public long getMinLocalDeletionTime() + { + long min = Long.MAX_VALUE; + for (MemtableShard shard : shards) + min = EncodingStats.mergeMinLocalDeletionTime(min, shard.stats); + return min; + } + + @Override + public DecoratedKey minPartitionKey() + { + for (int i = 0; i < shards.length; i++) + { + MemtableShard shard = shards[i]; + if (!shard.isClean()) + return shard.minPartitionKey(); + } + return null; + } + + @Override + public DecoratedKey maxPartitionKey() + { + for (int i = shards.length - 1; i >= 0; i--) + { + MemtableShard shard = shards[i]; + if (!shard.isClean()) + return shard.maxPartitionKey(); + } + return null; + } + + @Override + RegularAndStaticColumns columns() + { + for (MemtableShard shard : shards) + columnsCollector.update(shard.columns); + return columnsCollector.get(); + } + + @Override + EncodingStats encodingStats() + { + for (MemtableShard shard : shards) + statsCollector.update(shard.stats); + return statsCollector.get(); + } + + static boolean isPartitionBoundary(Object content) + { + // In the trie we use PartitionData for the root of a partition, but PartitionUpdates come with DeletionInfo. + // Both are descendants of DeletionInfo. 
+ return content instanceof DeletionInfo; + } + + @Override + public MemtableUnfilteredPartitionIterator partitionIterator(final ColumnFilter columnFilter, + final DataRange dataRange, + SSTableReadsListener readsListener) + { + AbstractBounds keyRange = dataRange.keyRange(); + + boolean isBound = keyRange instanceof Bounds; + boolean includeStart = isBound || keyRange instanceof IncludingExcludingBounds; + boolean includeStop = isBound || keyRange instanceof Range; + + Trie subMap = mergedTrie.subtrie(toComparableBound(keyRange.left, includeStart), + toComparableBound(keyRange.right, !includeStop)); + + return new MemtableUnfilteredPartitionIterator(metadata(), + allocator.ensureOnHeap(), + subMap, + columnFilter, + dataRange, + getMinLocalDeletionTime()); + // Note: the minLocalDeletionTime reported by the iterator is the memtable's minLocalDeletionTime. This is okay + // because we only need to report a lower bound that will eventually advance, and calculating a more precise + // bound would be an unnecessary expense. + } + + private static ByteComparable toComparableBound(PartitionPosition position, boolean before) + { + return position == null || position.isMinimum() ? 
null : position.asComparableBound(before); + } + + public Partition getPartition(DecoratedKey key) + { + int shardIndex = boundaries.getShardForKey(key); + Trie trie = shards[shardIndex].data.tailTrie(key); + return createPartition(metadata(), allocator.ensureOnHeap(), key, trie); + } + + private static TrieBackedPartitionStage2 createPartition(TableMetadata metadata, EnsureOnHeap ensureOnHeap, DecoratedKey key, Trie trie) + { + if (trie == null) + return null; + PartitionData holder = (PartitionData) trie.get(ByteComparable.EMPTY); + // If we found a matching path in the trie, it must be the root of this partition (because partition keys are + // prefix-free, it can't be a prefix for a different path, or have another partition key as prefix) and contain + // PartitionData (because the attachment of a new or modified partition to the trie is atomic). + assert holder != null : "Entry for " + key + " without associated PartitionData"; + + return TrieBackedPartitionStage2.create(key, + holder.columns(), + holder.stats(), + holder.rowCountIncludingStatic(), + trie, + metadata, + ensureOnHeap); + } + + @Override + public UnfilteredRowIterator rowIterator(DecoratedKey key, Slices slices, ColumnFilter selectedColumns, boolean reversed, SSTableReadsListener listener) + { + Partition p = getPartition(key); + if (p == null) + return null; + else + return p.unfilteredIterator(selectedColumns, slices, reversed); + } + + @Override + public UnfilteredRowIterator rowIterator(DecoratedKey key) + { + Partition p = getPartition(key); + return p != null ? p.unfilteredIterator() : null; + } + + private static DecoratedKey getPartitionKeyFromPath(TableMetadata metadata, ByteComparable path) + { + return BufferDecoratedKey.fromByteComparable(path, + TrieBackedPartitionStage2.BYTE_COMPARABLE_VERSION, + metadata.partitioner); + } + + /** + * Metadata object signifying the root node of a partition. 
Holds the deletion information as well as a link + * to the owning subrange, which is used for compiling statistics and column sets. + * + * Descends from MutableDeletionInfo to permit tail tries to be passed directly to TrieBackedPartitionStage2. + */ + public static class PartitionData extends MutableDeletionInfo + { + @Unmetered + public final MemtableShard owner; + + private int rowCountIncludingStatic; + + public static final long HEAP_SIZE = ObjectSizes.measure(new PartitionData(DeletionInfo.LIVE, null)); + + public PartitionData(DeletionInfo deletion, + MemtableShard owner) + { + super(deletion.getPartitionDeletion(), deletion.copyRanges(HeapCloner.instance)); + this.owner = owner; + this.rowCountIncludingStatic = 0; + } + + public PartitionData(PartitionData existing, + DeletionInfo update) + { + // Start with the update content, to properly copy it + this(update, existing.owner); + rowCountIncludingStatic = existing.rowCountIncludingStatic; + add(existing); + } + + public RegularAndStaticColumns columns() + { + return owner.columns; + } + + public EncodingStats stats() + { + return owner.stats; + } + + public int rowCountIncludingStatic() + { + return rowCountIncludingStatic; + } + + public void markInsertedRows(int howMany) + { + rowCountIncludingStatic += howMany; + } + + @Override + public String toString() + { + return "partition " + super.toString(); + } + + @Override + public long unsharedHeapSize() + { + return super.unsharedHeapSize() + HEAP_SIZE - MutableDeletionInfo.EMPTY_SIZE; + } + } + + + class KeySizeAndCountCollector extends TrieEntriesWalker + { + long keySize = 0; + int keyCount = 0; + + @Override + public Void complete() + { + return null; + } + + @Override + protected void content(Object content, byte[] bytes, int byteLength) + { + // This is used with processSkippingBranches which should ensure that we only see the partition roots. 
+ assert content instanceof PartitionData; + ++keyCount; + byte[] keyBytes = DecoratedKey.keyFromByteSource(ByteSource.preencoded(bytes, 0, byteLength), + TrieBackedPartitionStage2.BYTE_COMPARABLE_VERSION, + metadata().partitioner); + keySize += keyBytes.length; + } + } + + public FlushablePartitionSet getFlushSet(PartitionPosition from, PartitionPosition to) + { + Trie toFlush = mergedTrie.subtrie(toComparableBound(from, true), toComparableBound(to, true)); + + var counter = new KeySizeAndCountCollector(); // need to jump over tails keys + toFlush.processSkippingBranches(Direction.FORWARD, counter); + int partitionCount = counter.keyCount; + long partitionKeySize = counter.keySize; + + return new AbstractFlushablePartitionSet() + { + public Memtable memtable() + { + return TrieMemtableStage2.this; + } + + public PartitionPosition from() + { + return from; + } + + public PartitionPosition to() + { + return to; + } + + public long partitionCount() + { + return partitionCount; + } + + public Iterator iterator() + { + return new PartitionIterator(toFlush, metadata(), EnsureOnHeap.NOOP); + } + + public long partitionKeysSize() + { + return partitionKeySize; + } + }; + } + + public static class MemtableShard + { + // The following fields are volatile as we have to make sure that when we + // collect results from all sub-ranges, the thread accessing the value + // is guaranteed to see the changes to the values. + + // The smallest timestamp for all partitions stored in this shard + private volatile long minTimestamp = Long.MAX_VALUE; + + private volatile long liveDataSize = 0; + + private volatile long currentOperations = 0; + + private volatile int partitionCount = 0; + + @Unmetered + private final ReentrantLock writeLock = new ReentrantLock(TrieMemtable.SHARD_LOCK_FAIRNESS); + + // Content map for the given shard. This is implemented as a memtable trie which uses the prefix-free + // byte-comparable ByteSource representations of the keys to address the partitions. 
+ // + // This map is used in a single-producer, multi-consumer fashion: only one thread will insert items but + // several threads may read from it and iterate over it. Iterators (especially partition range iterators) + // may operate for a long period of time and thus iterators should not throw ConcurrentModificationExceptions + // if the underlying map is modified during iteration, they should provide a weakly consistent view of the map + // instead. + // + // Also, this data is backed by memtable memory, when accessing it callers must specify if it can be accessed + // unsafely, meaning that the memtable will not be discarded as long as the data is used, or whether the data + // should be copied on heap for off-heap allocators. + @VisibleForTesting + final InMemoryTrie data; + + RegularAndStaticColumns columns; + + EncodingStats stats; + + @Unmetered // total pool size should not be included in memtable's deep size + private final MemtableAllocator allocator; + + @Unmetered + private final TrieMemtableMetricsView metrics; + + private final TableMetadataRef metadata; + + MemtableShard(TableMetadataRef metadata, TrieMemtableMetricsView metrics, OpOrder opOrder) + { + this(metadata, AbstractAllocatorMemtable.MEMORY_POOL.newAllocator(metadata.toString()), metrics, opOrder); + } + + @VisibleForTesting + MemtableShard(TableMetadataRef metadata, MemtableAllocator allocator, TrieMemtableMetricsView metrics, OpOrder opOrder) + { + this.metadata = metadata; + this.data = InMemoryTrie.longLived(TrieBackedPartitionStage2.BYTE_COMPARABLE_VERSION, BUFFER_TYPE, opOrder); + this.columns = RegularAndStaticColumns.NONE; + this.stats = EncodingStats.NO_STATS; + this.allocator = allocator; + this.metrics = metrics; + } + + public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup) + { + TriePartitionUpdaterStage2 updater = new TriePartitionUpdaterStage2(allocator.cloner(opGroup), indexer, metadata.get(), this); + boolean locked = writeLock.tryLock(); 
+ if (locked) + { + metrics.uncontendedPuts.inc(); + } + else + { + metrics.contendedPuts.inc(); + long lockStartTime = Clock.Global.nanoTime(); + writeLock.lock(); + metrics.contentionTime.addNano(Clock.Global.nanoTime() - lockStartTime); + } + try + { + try + { + indexer.start(); + // Add the initial trie size on the first operation. This technically isn't correct (other shards + // do take their memory share even if they are empty) but doing it during construction may cause + // the allocator to block while we are trying to flush a memtable and become a deadlock. + long onHeap = data.isEmpty() ? 0 : data.usedSizeOnHeap(); + long offHeap = data.isEmpty() ? 0 : data.usedSizeOffHeap(); + + try + { + updater.apply(data, TriePartitionUpdateStage2.asMergableTrie(update)); + } + catch (TrieSpaceExhaustedException e) + { + // This should never really happen as a flush would be triggered long before this limit is reached. + throw new AssertionError(e); + } + allocator.offHeap().adjust(data.usedSizeOffHeap() - offHeap, opGroup); + allocator.onHeap().adjust((data.usedSizeOnHeap() - onHeap) + updater.heapSize, opGroup); + partitionCount += updater.partitionsAdded; + } + finally + { + indexer.commit(); + updateMinTimestamp(update.stats().minTimestamp); + updateLiveDataSize(updater.dataSize); + updateCurrentOperations(update.operationCount()); + + columns = columns.mergeTo(update.columns()); + stats = stats.mergeWith(update.stats()); + } + } + finally + { + writeLock.unlock(); + } + return updater.colUpdateTimeDelta; + } + + public boolean isClean() + { + return data.isEmpty(); + } + + private void updateMinTimestamp(long timestamp) + { + if (timestamp < minTimestamp) + minTimestamp = timestamp; + } + + void updateLiveDataSize(long size) + { + liveDataSize = liveDataSize + size; + } + + private void updateCurrentOperations(long op) + { + currentOperations = currentOperations + op; + } + + public int partitionCount() + { + return partitionCount; + } + + long liveDataSize() + { 
+ return liveDataSize; + } + + long currentOperations() + { + return currentOperations; + } + + private DecoratedKey firstPartitionKey(Direction direction) + { + Iterator> iter = data.filteredEntryIterator(direction, PartitionData.class); + if (!iter.hasNext()) + return null; + + Map.Entry entry = iter.next(); + return getPartitionKeyFromPath(metadata.get(), entry.getKey()); + } + + public DecoratedKey minPartitionKey() + { + return firstPartitionKey(Direction.FORWARD); + } + + public DecoratedKey maxPartitionKey() + { + return firstPartitionKey(Direction.REVERSE); + } + } + + static class PartitionIterator extends TrieTailsIterator.Plain + { + final TableMetadata metadata; + final EnsureOnHeap ensureOnHeap; + PartitionIterator(Trie source, TableMetadata metadata, EnsureOnHeap ensureOnHeap) + { + super(source, Direction.FORWARD, PartitionData.class::isInstance); + this.metadata = metadata; + this.ensureOnHeap = ensureOnHeap; + } + + @Override + protected TrieBackedPartitionStage2 mapContent(Object content, Trie tailTrie, byte[] bytes, int byteLength) + { + PartitionData pd = (PartitionData) content; + DecoratedKey key = getPartitionKeyFromPath(metadata, + ByteComparable.preencoded(TrieBackedPartitionStage2.BYTE_COMPARABLE_VERSION, + bytes, 0, byteLength)); + return TrieBackedPartitionStage2.create(key, + pd.columns(), + pd.stats(), + pd.rowCountIncludingStatic(), + tailTrie, + metadata, + ensureOnHeap); + } + } + + static class MemtableUnfilteredPartitionIterator extends AbstractUnfilteredPartitionIterator implements Memtable.MemtableUnfilteredPartitionIterator + { + private final TableMetadata metadata; + private final Iterator iter; + private final ColumnFilter columnFilter; + private final DataRange dataRange; + private final long minLocalDeletionTime; + + public MemtableUnfilteredPartitionIterator(TableMetadata metadata, + EnsureOnHeap ensureOnHeap, + Trie source, + ColumnFilter columnFilter, + DataRange dataRange, + long minLocalDeletionTime) + { + this.iter = 
new PartitionIterator(source, metadata, ensureOnHeap); + this.metadata = metadata; + this.columnFilter = columnFilter; + this.dataRange = dataRange; + this.minLocalDeletionTime = minLocalDeletionTime; + } + + public long getMinLocalDeletionTime() + { + return minLocalDeletionTime; + } + + public TableMetadata metadata() + { + return metadata; + } + + public boolean hasNext() + { + return iter.hasNext(); + } + + public UnfilteredRowIterator next() + { + Partition partition = iter.next(); + DecoratedKey key = partition.partitionKey(); + ClusteringIndexFilter filter = dataRange.clusteringIndexFilter(key); + + return filter.getUnfilteredRowIterator(columnFilter, partition); + } + } + + public static Memtable.Factory factory(Map optionsCopy) + { + String shardsString = optionsCopy.remove(SHARDS_OPTION); + Integer shardCount = shardsString != null ? Integer.parseInt(shardsString) : null; + return new Factory(shardCount); + } + + + static class Factory implements Memtable.Factory + { + final Integer shardCount; + + Factory(Integer shardCount) + { + this.shardCount = shardCount; + } + + public Memtable create(AtomicReference commitLogLowerBound, + TableMetadataRef metadaRef, + Owner owner) + { + return new TrieMemtableStage2(commitLogLowerBound, metadaRef, owner, shardCount); + } + + @Override + public PartitionUpdate.Factory partitionUpdateFactory() + { + return TriePartitionUpdateStage2.FACTORY; + } + + @Override + public TableMetrics.ReleasableMetric createMemtableMetrics(TableMetadataRef metadataRef) + { + TrieMemtableMetricsView metrics = TrieMemtableMetricsView.getOrCreate(metadataRef.keyspace, metadataRef.name); + return metrics::release; + } + } + + @Override + public long unusedReservedOnHeapMemory() + { + long size = 0; + for (MemtableShard shard : shards) + { + size += shard.data.unusedReservedOnHeapMemory(); + size += shard.allocator.unusedReservedOnHeapMemory(); + } + size += this.allocator.unusedReservedOnHeapMemory(); + return size; + } + + /** + * Release 
all recycled content references, including the ones waiting in still incomplete recycling lists. + * This is a test method and can cause null pointer exceptions if used on a live trie. + */ + @VisibleForTesting + void releaseReferencesUnsafe() + { + for (MemtableShard shard : shards) + shard.data.releaseReferencesUnsafe(); + } +} diff --git a/src/java/org/apache/cassandra/db/memtable/TrieMemtableStage3.java b/src/java/org/apache/cassandra/db/memtable/TrieMemtableStage3.java new file mode 100644 index 000000000000..7fb0d013aad6 --- /dev/null +++ b/src/java/org/apache/cassandra/db/memtable/TrieMemtableStage3.java @@ -0,0 +1,862 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.db.memtable; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; +import java.util.concurrent.locks.ReentrantLock; +import java.util.function.Predicate; + +import com.google.common.annotations.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.BufferDecoratedKey; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DataRange; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.Slices; +import org.apache.cassandra.db.commitlog.CommitLogPosition; +import org.apache.cassandra.db.filter.ClusteringIndexFilter; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.partitions.AbstractUnfilteredPartitionIterator; +import org.apache.cassandra.db.partitions.Partition; +import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.db.partitions.TrieBackedPartitionStage3; +import org.apache.cassandra.db.partitions.TriePartitionUpdateStage3; +import org.apache.cassandra.db.partitions.TriePartitionUpdaterStage3; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.TrieTombstoneMarker; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.tries.DeletionAwareTrie; +import org.apache.cassandra.db.tries.Direction; +import org.apache.cassandra.db.tries.InMemoryBaseTrie; +import org.apache.cassandra.db.tries.InMemoryDeletionAwareTrie; +import org.apache.cassandra.db.tries.TrieEntriesWalker; +import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; +import 
org.apache.cassandra.db.tries.TrieTailsIterator; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.Bounds; +import org.apache.cassandra.dht.IncludingExcludingBounds; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.index.transactions.UpdateTransaction; +import org.apache.cassandra.io.compress.BufferType; +import org.apache.cassandra.io.sstable.SSTableReadsListener; +import org.apache.cassandra.metrics.TableMetrics; +import org.apache.cassandra.metrics.TrieMemtableMetricsView; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.TableMetadataRef; +import org.apache.cassandra.utils.Clock; +import org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.concurrent.OpOrder; +import org.apache.cassandra.utils.memory.EnsureOnHeap; +import org.apache.cassandra.utils.memory.MemtableAllocator; +import org.github.jamm.Unmetered; + +/** + * Trie memtable implementation. Improves memory usage, garbage collection efficiency and lookup performance. + * The implementation is described in detail in the paper: + * https://www.vldb.org/pvldb/vol15/p3359-lambov.pdf + * + * The configuration takes a single parameter: + * - shards: the number of shards to split into, defaulting to the number of CPU cores. + * + * Also see Memtable_API.md. + */ +public class TrieMemtableStage3 extends AbstractShardedMemtable +{ + private static final Logger logger = LoggerFactory.getLogger(TrieMemtableStage3.class); + + /// Buffer type to use for memtable tries (on- vs off-heap) + public static final BufferType BUFFER_TYPE = DatabaseDescriptor.getMemtableAllocationType().toBufferType(); + + /// Force copy checker (see [InMemoryTrie#apply]) ensuring all modifications apply atomically and consistently to + /// the whole partition. 
+ public static final Predicate> FORCE_COPY_PARTITION_BOUNDARY = + features -> TrieBackedPartitionStage3.isPartitionBoundary(features.content()); + + /// Set to true when the memtable requests a switch (e.g. for trie size limit being reached) to ensure only one + /// thread calls cfs.switchMemtableIfCurrent. + private final AtomicBoolean switchRequested = new AtomicBoolean(false); + + /// Sharded memtable sections. Each is responsible for a contiguous range of the token space (between `boundaries[i]` + /// and `boundaries[i+1]`) and is written to by one thread at a time, while reads are carried out concurrently + /// (including with any write). + private final MemtableShard[] shards; + + /// A merged view of the memtable map. Used for partition range queries and flush. + /// For efficiency we serve single partition requests off the shard which offers more direct [InMemoryTrie] methods. + private final DeletionAwareTrie mergedTrie; + + @Unmetered + private final TrieMemtableMetricsView metrics; + + TrieMemtableStage3(AtomicReference commitLogLowerBound, TableMetadataRef metadataRef, Owner owner, Integer shardCountOption) + { + super(commitLogLowerBound, metadataRef, owner, shardCountOption); + this.metrics = TrieMemtableMetricsView.getOrCreate(metadataRef.keyspace, metadataRef.name); + this.shards = generatePartitionShards(boundaries.shardCount(), metadataRef, metrics, owner.readOrdering()); + this.mergedTrie = makeMergedTrie(shards); + logger.trace("Created memtable with {} shards", this.shards.length); + } + + private static MemtableShard[] generatePartitionShards(int splits, + TableMetadataRef metadata, + TrieMemtableMetricsView metrics, + OpOrder opOrder) + { + if (splits == 1) + return new MemtableShard[] { new MemtableShard(metadata, metrics, opOrder) }; + + MemtableShard[] partitionMapContainer = new MemtableShard[splits]; + for (int i = 0; i < splits; i++) + partitionMapContainer[i] = new MemtableShard(metadata, metrics, opOrder); + + return 
partitionMapContainer; + } + + private static DeletionAwareTrie makeMergedTrie(MemtableShard[] shards) + { + List> tries = new ArrayList<>(shards.length); + for (MemtableShard shard : shards) + tries.add(shard.data); + return DeletionAwareTrie.mergeDistinct(tries); + } + + @Override + public boolean isClean() + { + for (MemtableShard shard : shards) + if (!shard.isClean()) + return false; + return true; + } + + @VisibleForTesting + @Override + public void switchOut(OpOrder.Barrier writeBarrier, AtomicReference commitLogUpperBound) + { + super.switchOut(writeBarrier, commitLogUpperBound); + + for (MemtableShard shard : shards) + shard.allocator.setDiscarding(); + } + + @Override + public void discard() + { + super.discard(); + // metrics here are not thread safe, but I think we can live with that + metrics.lastFlushShardDataSizes.reset(); + for (MemtableShard shard : shards) + { + metrics.lastFlushShardDataSizes.update(shard.liveDataSize()); + } + // the buffer release is a longer-running process, do it in a separate loop to not make the metrics update wait + for (MemtableShard shard : shards) + { + shard.allocator.setDiscarded(); + shard.data.discardBuffers(); + } + } + + /// Should only be called by [ColumnFamilyStore#apply] via `Keyspace#apply`, which supplies the appropriate + /// [OpOrder.Group]. + /// + /// `commitLogSegmentPosition` should only be null if this is a secondary index, in which case it is *expected* to + /// be null. 
+    @Override
+    public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup)
+    {
+        // Route the update to the shard that owns the partition key.
+        DecoratedKey key = update.partitionKey();
+        MemtableShard shard = shards[boundaries.getShardForKey(key)];
+        long colUpdateTimeDelta = shard.put(update, indexer, opGroup);
+
+        // Request a flush once this shard's trie reports it reached its allocated size threshold; repeated
+        // requests are skipped because skipIfSignaled is true.
+        if (shard.data.reachedAllocatedSizeThreshold())
+            signalFlushRequired(ColumnFamilyStore.FlushReason.TRIE_LIMIT, true);
+
+        return colUpdateTimeDelta;
+    }
+
+    /// Asks the owner to flush this memtable.
+    ///
+    /// `switchRequested` is set unconditionally. The owner is notified when the flag was not set before this call,
+    /// or regardless of the flag when `skipIfSignaled` is false.
+    @Override
+    public void signalFlushRequired(ColumnFamilyStore.FlushReason flushReason, boolean skipIfSignaled)
+    {
+        if (!switchRequested.getAndSet(true) || !skipIfSignaled)
+        {
+            logger.info("Scheduling flush for table {} due to {}", this.metadata.get(), flushReason);
+            owner.signalFlushRequired(this, flushReason);
+        }
+    }
+
+    /// Adds the on-heap and off-heap ownership figures of every shard allocator to `stats`, on top of whatever
+    /// the superclass contributes.
+    @Override
+    public void addMemoryUsageTo(MemoryUsage stats)
+    {
+        super.addMemoryUsageTo(stats);
+        for (MemtableShard shard : shards)
+        {
+            stats.ownsOnHeap += shard.allocator.onHeap().owns();
+            stats.ownsOffHeap += shard.allocator.offHeap().owns();
+            stats.ownershipRatioOnHeap += shard.allocator.onHeap().ownershipRatio();
+            stats.ownershipRatioOffHeap += shard.allocator.offHeap().ownershipRatio();
+        }
+    }
+
+    /// @return the sum of the live data sizes reported by all shards
+    @Override
+    public long getLiveDataSize()
+    {
+        long total = 0L;
+        for (MemtableShard shard : shards)
+            total += shard.liveDataSize();
+        return total;
+    }
+
+    /// @return the sum of the operation counts reported by all shards
+    @Override
+    public long operationCount()
+    {
+        long total = 0L;
+        for (MemtableShard shard : shards)
+            total += shard.currentOperations();
+        return total;
+    }
+
+    /// @return the sum of the partition counts reported by all shards
+    @Override
+    public long partitionCount()
+    {
+        // Accumulate in a long, matching the declared return type, so that the sum of the per-shard int counts
+        // cannot wrap around before it is widened.
+        long total = 0L;
+        for (MemtableShard shard : shards)
+            total += shard.partitionCount();
+        return total;
+    }
+
+    /// @return the number of shards this memtable was created with
+    public int getShardCount()
+    {
+        return shards.length;
+    }
+
+    /// Returns the estimated average row size, recomputing the estimate from the content of the merged trie when
+    /// there is none yet or when the current operation count exceeds 1.5 times the operation count recorded with
+    /// the existing estimate.
+    @Override
+    public long getEstimatedAverageRowSize()
+    {
+        if (estimatedAverageRowSize == null || currentOperations.get() > estimatedAverageRowSize.operations * 1.5)
+            estimatedAverageRowSize = new MemtableAverageRowSize(this, mergedTrie.contentOnlyTrie());
+        return estimatedAverageRowSize.rowSize;
+    }
+
+    /// Returns the minimum timestamp if one is available, otherwise `NO_MIN_TIMESTAMP`.
+    /// [EncodingStats] uses a synthetic epoch TS at 2015. We don't want to leak that (CASSANDRA-18118) so we return
+    /// `NO_MIN_TIMESTAMP` instead.
+    ///
+    /// @return The minTS or `NO_MIN_TIMESTAMP` if none available
+    @Override
+    public long getMinTimestamp()
+    {
+        long min = Long.MAX_VALUE;
+        for (MemtableShard shard : shards)
+            min = EncodingStats.mergeMinTimestamp(min, shard.stats);
+        return min != EncodingStats.NO_STATS.minTimestamp ? min : NO_MIN_TIMESTAMP;
+    }
+
+    /// @return the minimum local deletion time, merged over the encoding stats of all shards starting from
+    /// `Long.MAX_VALUE`
+    @Override
+    public long getMinLocalDeletionTime()
+    {
+        long min = Long.MAX_VALUE;
+        for (MemtableShard shard : shards)
+            min = EncodingStats.mergeMinLocalDeletionTime(min, shard.stats);
+        return min;
+    }
+
+    /// @return the smallest partition key of the first non-clean shard in shard order, or null when all shards are
+    /// clean
+    @Override
+    public DecoratedKey minPartitionKey()
+    {
+        for (int i = 0; i < shards.length; i++)
+        {
+            MemtableShard shard = shards[i];
+            if (!shard.isClean())
+                return shard.minPartitionKey();
+        }
+        return null;
+    }
+
+    /// @return the largest partition key of the last non-clean shard in shard order, or null when all shards are
+    /// clean
+    @Override
+    public DecoratedKey maxPartitionKey()
+    {
+        for (int i = shards.length - 1; i >= 0; i--)
+        {
+            MemtableShard shard = shards[i];
+            if (!shard.isClean())
+                return shard.maxPartitionKey();
+        }
+        return null;
+    }
+
+    /// Feeds the column set of every shard into `columnsCollector` and returns the collector's current result.
+    @Override
+    RegularAndStaticColumns columns()
+    {
+        for (MemtableShard shard : shards)
+            columnsCollector.update(shard.columns);
+        return columnsCollector.get();
+    }
+
+    /// Feeds the encoding stats of every shard into `statsCollector` and returns the collector's current result.
+    @Override
+    EncodingStats encodingStats()
+    {
+        for (MemtableShard shard : shards)
+            statsCollector.update(shard.stats);
+        return statsCollector.get();
+    }
+
+    /// Returns an iterator over the partitions of the merged trie that fall within the key range of `dataRange`.
+    ///
+    /// `readsListener` is not used by this implementation.
+    @Override
+    public MemtableUnfilteredPartitionIterator partitionIterator(final ColumnFilter columnFilter,
+                                                                 final DataRange dataRange,
+                                                                 SSTableReadsListener readsListener)
+    {
+        AbstractBounds keyRange = dataRange.keyRange();
+
+        // Bounds is treated as inclusive on both sides, IncludingExcludingBounds as inclusive on the left only,
+        // and Range as inclusive on the right only.
+        boolean isBound = keyRange instanceof Bounds;
+        boolean includeStart = isBound || keyRange instanceof IncludingExcludingBounds;
+        boolean includeStop = isBound || keyRange instanceof Range;
+
+        DeletionAwareTrie subMap =
+            mergedTrie.subtrie(toComparableBound(keyRange.left, includeStart),
+                               toComparableBound(keyRange.right, !includeStop));
+
+        return new MemtableUnfilteredPartitionIterator(metadata(),
+                                                       allocator.ensureOnHeap(),
+                                                       subMap,
+                                                       columnFilter,
+                                                       dataRange,
+                                                       getMinLocalDeletionTime());
+        // Note: the minLocalDeletionTime reported by the iterator is the memtable's minLocalDeletionTime. This is okay
+        // because we only need to report a lower bound that will eventually advance, and calculating a more precise
+        // bound would be an unnecessary expense.
+    }
+
+    /// Converts a partition position into a trie bound. Returns null for a null or minimum position, otherwise the
+    /// position's comparable bound on the requested side.
+    // NOTE(review): presumably a null bound means "unbounded" to `subtrie` -- confirm.
+    private static ByteComparable toComparableBound(PartitionPosition position, boolean before)
+    {
+        return position == null || position.isMinimum() ? null : position.asComparableBound(before);
+    }
+
+    /// Looks up a single partition directly in the shard that owns `key`.
+    ///
+    /// @return the partition, or null when the shard's trie has no tail for the key
+    public Partition getPartition(DecoratedKey key)
+    {
+        int shardIndex = boundaries.getShardForKey(key);
+        DeletionAwareTrie trie = shards[shardIndex].data.tailTrie(key);
+        return createPartition(metadata(), allocator.ensureOnHeap(), key, trie);
+    }
+
+    /// Wraps the tail trie of a partition in a [TrieBackedPartitionStage3], taking columns, stats and counts from the
+    /// [PartitionData] stored at the root of the tail.
+    ///
+    /// @return the partition, or null when `trie` is null
+    private static TrieBackedPartitionStage3 createPartition(TableMetadata metadata, EnsureOnHeap ensureOnHeap, DecoratedKey key, DeletionAwareTrie trie)
+    {
+        if (trie == null)
+            return null;
+        PartitionData holder = (PartitionData) trie.get(ByteComparable.EMPTY);
+        // If we found a matching path in the trie, it must be the root of this partition (because partition keys are
+        // prefix-free, it can't be a prefix for a different path, or have another partition key as prefix) and contain
+        // PartitionData (because the attachment of a new or modified partition to the trie is atomic).
+        assert holder != null : "Entry for " + key + " without associated PartitionData";
+
+        return TrieBackedPartitionStage3.create(key,
+                                                holder.columns(),
+                                                holder.stats(),
+                                                holder.rowCountIncludingStatic(),
+                                                holder.tombstoneCount(),
+                                                trie,
+                                                metadata,
+                                                ensureOnHeap);
+    }
+
+    /// Returns an unfiltered iterator over the requested slices of one partition, or null when the partition is not
+    /// present. `listener` is not used by this implementation.
+    @Override
+    public UnfilteredRowIterator rowIterator(DecoratedKey key, Slices slices, ColumnFilter selectedColumns, boolean reversed, SSTableReadsListener listener)
+    {
+        Partition p = getPartition(key);
+        if (p == null)
+            return null;
+        else
+            return p.unfilteredIterator(selectedColumns, slices, reversed);
+    }
+
+    /// Returns an unfiltered iterator over one whole partition, or null when the partition is not present.
+    @Override
+    public UnfilteredRowIterator rowIterator(DecoratedKey key)
+    {
+        Partition p = getPartition(key);
+        return p != null ? p.unfilteredIterator() : null;
+    }
+
+    /// Decodes a trie path back into the partition key it represents, using the table's partitioner.
+    private static DecoratedKey getPartitionKeyFromPath(TableMetadata metadata, ByteComparable path)
+    {
+        return BufferDecoratedKey.fromByteComparable(path,
+                                                     TrieBackedPartitionStage3.BYTE_COMPARABLE_VERSION,
+                                                     metadata.partitioner);
+    }
+
+    /// Metadata object signifying the root node of a partition. Holds row and tombstone counts as well as a link
+    /// to the owning subrange, which is used for compiling encoding statistics and column sets.
+    ///
+    /// Descends from [TrieBackedPartitionStage3.PartitionMarker] to permit tail tries to be passed directly to
+    /// [TrieBackedPartitionStage3].
+    public static class PartitionData implements TrieBackedPartitionStage3.PartitionMarker
+    {
+        /// The shard this partition lives in; the source of the column set and encoding stats returned below.
+        @Unmetered
+        public final MemtableShard owner;
+
+        private int rowCountIncludingStatic;
+        private int tombstoneCount;
+
+        /// Heap size of one instance, measured once on an instance without owner.
+        public static final long HEAP_SIZE = ObjectSizes.measure(new PartitionData(null));
+
+        /// Creates the marker with both counts at zero.
+        public PartitionData(MemtableShard owner)
+        {
+            this.owner = owner;
+            this.rowCountIncludingStatic = 0;
+            this.tombstoneCount = 0;
+        }
+
+        /// @return the column set of the owning shard (not of this partition alone)
+        public RegularAndStaticColumns columns()
+        {
+            return owner.columns;
+        }
+
+        /// @return the encoding stats of the owning shard (not of this partition alone)
+        public EncodingStats stats()
+        {
+            return owner.stats;
+        }
+
+        public int rowCountIncludingStatic()
+        {
+            return rowCountIncludingStatic;
+        }
+
+        public int tombstoneCount()
+        {
+            return tombstoneCount;
+        }
+
+        /// Adds `howMany` to the row count.
+        public void markInsertedRows(int howMany)
+        {
+            rowCountIncludingStatic += howMany;
+        }
+
+        /// Adds `howMany` to the tombstone count.
+        public void markAddedTombstones(int howMany)
+        {
+            tombstoneCount += howMany;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("partition with %d rows and %d tombstones", rowCountIncludingStatic, tombstoneCount);
+        }
+
+        /// @return the constant [#HEAP_SIZE]
+        public long unsharedHeapSize()
+        {
+            return HEAP_SIZE;
+        }
+
+        /// Resets both counts to zero.
+        public void clearStats()
+        {
+            rowCountIncludingStatic = 0;
+            tombstoneCount = 0;
+        }
+    }
+
+    /// Walker that counts the entries it is shown and sums the byte lengths of the partition keys decoded from their
+    /// paths. It is an inner (non-static) class because it reads the partitioner through `metadata()`.
+    class KeySizeAndCountCollector extends TrieEntriesWalker
+    {
+        long keySize = 0;
+        int keyCount = 0;
+
+        /// No result object is produced; callers read `keySize` and `keyCount` directly.
+        @Override
+        public Void complete()
+        {
+            return null;
+        }
+
+        @Override
+        protected void content(Object content, byte[] bytes, int byteLength)
+        {
+            // This is used with processSkippingBranches which should ensure that we only see the partition roots.
+            assert content instanceof PartitionData;
+            ++keyCount;
+            byte[] keyBytes = DecoratedKey.keyFromByteSource(ByteSource.preencoded(bytes, 0, byteLength),
+                                                             TrieBackedPartitionStage3.BYTE_COMPARABLE_VERSION,
+                                                             metadata().partitioner);
+            keySize += keyBytes.length;
+        }
+    }
+
+    /// Returns the set of partitions to flush for the given position range, taken from the merged trie.
+    ///
+    /// The partition count and total key size are computed eagerly here with a [KeySizeAndCountCollector]; the
+    /// partitions themselves are produced lazily by a new [PartitionIterator] each time `iterator()` is called.
+    @Override
+    public FlushablePartitionSet getFlushSet(PartitionPosition from, PartitionPosition to)
+    {
+        DeletionAwareTrie toFlush = mergedTrie.subtrie(toComparableBound(from, true), toComparableBound(to, true));
+
+        var counter = new KeySizeAndCountCollector(); // only partition roots are to be visited; the tails below them are skipped
+        toFlush.processSkippingBranches(Direction.FORWARD, counter);
+        int partitionCount = counter.keyCount;
+        long partitionKeySize = counter.keySize;
+
+        return new AbstractFlushablePartitionSet<>()
+        {
+            public Memtable memtable()
+            {
+                return TrieMemtableStage3.this;
+            }
+
+            public PartitionPosition from()
+            {
+                return from;
+            }
+
+            public PartitionPosition to()
+            {
+                return to;
+            }
+
+            public long partitionCount()
+            {
+                return partitionCount;
+            }
+
+            public Iterator iterator()
+            {
+                return new PartitionIterator(toFlush, metadata(), EnsureOnHeap.NOOP);
+            }
+
+            public long partitionKeysSize()
+            {
+                return partitionKeySize;
+            }
+        };
+    }
+
+    public static class MemtableShard
+    {
+        // The following fields are volatile as we have to make sure that when we
+        // collect results from all sub-ranges, the thread accessing the value
+        // is guaranteed to see the changes to the values.
+
+        // The smallest timestamp for all partitions stored in this shard
+        private volatile long minTimestamp = Long.MAX_VALUE;
+
+        private volatile long liveDataSize = 0;
+
+        private volatile long currentOperations = 0;
+
+        private volatile int partitionCount = 0;
+
+        /// Serializes writers of this shard; taken for the whole duration of [#put].
+        @Unmetered
+        private final ReentrantLock writeLock = new ReentrantLock(TrieMemtable.SHARD_LOCK_FAIRNESS);
+
+        /// Content map for the given shard. This is implemented as an in-memory trie which uses the prefix-free
+        /// byte-comparable [ByteSource] representations of keys to address partitions and individual rows within
+        /// partitions.
+        ///
+        /// This map is used in a single-producer, multi-consumer fashion: only one thread will insert items but
+        /// several threads may read from it and iterate over it. Iterators (especially partition range iterators)
+        /// may operate for a long period of time and thus iterators should not throw `ConcurrentModificationException`s
+        /// if the underlying map is modified during iteration, they should provide a weakly consistent view of the map
+        /// instead.
+        ///
+        /// Also, this data is backed by memtable memory, when accessing it callers must specify if it can be accessed
+        /// unsafely, meaning that the memtable will not be discarded as long as the data is used, or whether the data
+        /// should be copied on heap for off-heap allocators.
+        @VisibleForTesting
+        final InMemoryDeletionAwareTrie data;
+
+        /// Union of the column sets of all updates applied to this shard; replaced in [#put].
+        RegularAndStaticColumns columns;
+
+        /// Encoding stats merged over all updates applied to this shard; replaced in [#put].
+        EncodingStats stats;
+
+        @Unmetered // total pool size should not be included in memtable's deep size
+        private final MemtableAllocator allocator;
+
+        @Unmetered
+        private final TrieMemtableMetricsView metrics;
+
+        private final TableMetadataRef metadata;
+
+        /// Creates a shard with a new allocator obtained from the shared memtable memory pool.
+        MemtableShard(TableMetadataRef metadata, TrieMemtableMetricsView metrics, OpOrder opOrder)
+        {
+            this(metadata, AbstractAllocatorMemtable.MEMORY_POOL.newAllocator(metadata.toString()), metrics, opOrder);
+        }
+
+        /// Creates a shard with the given allocator, a new long-lived trie, no columns and `NO_STATS`.
+        @VisibleForTesting
+        MemtableShard(TableMetadataRef metadata, MemtableAllocator allocator, TrieMemtableMetricsView metrics, OpOrder opOrder)
+        {
+            this.metadata = metadata;
+            this.data = InMemoryDeletionAwareTrie.longLived(TrieBackedPartitionStage3.BYTE_COMPARABLE_VERSION, BUFFER_TYPE, opOrder);
+            this.columns = RegularAndStaticColumns.NONE;
+            this.stats = EncodingStats.NO_STATS;
+            this.allocator = allocator;
+            this.metrics = metrics;
+        }
+
+        /// Applies one partition update to this shard's trie while holding the shard's write lock, and updates the
+        /// allocator accounting and the shard statistics.
+        ///
+        /// @return the `colUpdateTimeDelta` computed by the updater
+        public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup)
+        {
+            TriePartitionUpdaterStage3 updater = new TriePartitionUpdaterStage3(data, allocator.cloner(opGroup), indexer, metadata.get(), this);
+            // Try the lock without blocking first so that contended and uncontended puts are counted separately;
+            // the wait time is measured only on the blocking path.
+            boolean locked = writeLock.tryLock();
+            if (locked)
+            {
+                metrics.uncontendedPuts.inc();
+            }
+            else
+            {
+                metrics.contendedPuts.inc();
+                long lockStartTime = Clock.Global.nanoTime();
+                writeLock.lock();
+                metrics.contentionTime.addNano(Clock.Global.nanoTime() - lockStartTime);
+            }
+            try
+            {
+                try
+                {
+                    indexer.start();
+                    // Add the initial trie size on the first operation. This technically isn't correct (other shards
+                    // do take their memory share even if they are empty) but doing it during construction may cause
+                    // the allocator to block while we are trying to flush a memtable and become a deadlock.
+                    long onHeap = data.isEmpty() ? 0 : data.usedSizeOnHeap();
+                    long offHeap = data.isEmpty() ? 0 : data.usedSizeOffHeap();
+                    try
+                    {
+                        updater.apply(TriePartitionUpdateStage3.asMergableTrie(update));
+                    }
+                    catch (TrieSpaceExhaustedException e)
+                    {
+                        // This should never really happen as a flush would be triggered long before this limit is reached.
+                        throw new AssertionError(e);
+                    }
+                    // Charge the allocator with the growth of the trie since the sizes were sampled above, plus the
+                    // heap size reported by the updater.
+                    allocator.offHeap().adjust(data.usedSizeOffHeap() - offHeap, opGroup);
+                    allocator.onHeap().adjust((data.usedSizeOnHeap() - onHeap) + updater.heapSize, opGroup);
+                    partitionCount += updater.partitionsAdded;
+                }
+                finally
+                {
+                    // This block runs whether or not the apply above completed normally.
+                    indexer.commit();
+                    updateMinTimestamp(update.stats().minTimestamp);
+                    updateLiveDataSize(updater.dataSize);
+                    updateCurrentOperations(update.operationCount());
+
+                    columns = columns.mergeTo(update.columns());
+                    stats = stats.mergeWith(update.stats());
+                }
+            }
+            finally
+            {
+                writeLock.unlock();
+            }
+            return updater.colUpdateTimeDelta;
+        }
+
+        /// @return true when the shard's trie is empty
+        public boolean isClean()
+        {
+            return data.isEmpty();
+        }
+
+        /// Lowers `minTimestamp` to `timestamp` when that is smaller.
+        private void updateMinTimestamp(long timestamp)
+        {
+            if (timestamp < minTimestamp)
+                minTimestamp = timestamp;
+        }
+
+        /// Adds `size` to the live data size.
+        // NOTE(review): `+=` on a volatile field is not atomic; `put` calls this while holding `writeLock` -- confirm
+        // that any other caller does the same.
+        void updateLiveDataSize(long size)
+        {
+            liveDataSize += size;
+        }
+
+        /// Adds `op` to the operation count.
+        private void updateCurrentOperations(long op)
+        {
+            currentOperations += op;
+        }
+
+        public int partitionCount()
+        {
+            return partitionCount;
+        }
+
+        long liveDataSize()
+        {
+            return liveDataSize;
+        }
+
+        long currentOperations()
+        {
+            return currentOperations;
+        }
+
+        /// Returns the key of the first partition met when iterating the shard's trie in the given direction, or null
+        /// when the iteration yields no [PartitionData] entry.
+        private DecoratedKey firstPartitionKey(Direction direction)
+        {
+            // Note: there is no need to skip tails here as this will only be run until we find the first partition.
+            Iterator> iter = data.filteredEntryIterator(direction, PartitionData.class);
+            if (!iter.hasNext())
+                return null;
+
+            Map.Entry entry = iter.next();
+            return getPartitionKeyFromPath(metadata.get(), entry.getKey());
+        }
+
+        /// @return the first partition key in forward order, or null when there is none
+        public DecoratedKey minPartitionKey()
+        {
+            return firstPartitionKey(Direction.FORWARD);
+        }
+
+        /// @return the first partition key in reverse order, or null when there is none
+        public DecoratedKey maxPartitionKey()
+        {
+            return firstPartitionKey(Direction.REVERSE);
+        }
+    }
+
+    /// Forward iterator over the partitions of a trie: every entry holding a [PartitionData] is mapped, together with
+    /// its tail trie, to a [TrieBackedPartitionStage3].
+    static class PartitionIterator extends TrieTailsIterator.DeletionAwareWithoutCoveringDeletions
+    {
+        final TableMetadata metadata;
+        final EnsureOnHeap ensureOnHeap;
+        PartitionIterator(DeletionAwareTrie source, TableMetadata metadata, EnsureOnHeap ensureOnHeap)
+        {
+            super(source, Direction.FORWARD, PartitionData.class::isInstance);
+            this.metadata = metadata;
+            this.ensureOnHeap = ensureOnHeap;
+        }
+
+        /// Builds the partition for one matched entry: the key is decoded from the first `byteLength` bytes of
+        /// `bytes`, and columns, stats and counts come from the entry's [PartitionData].
+        @Override
+        protected TrieBackedPartitionStage3 mapContent(Object content, DeletionAwareTrie tailTrie, byte[] bytes, int byteLength)
+        {
+            PartitionData pd = (PartitionData) content;
+            DecoratedKey key = getPartitionKeyFromPath(metadata,
+                                                       ByteComparable.preencoded(TrieBackedPartitionStage3.BYTE_COMPARABLE_VERSION,
+                                                                                 bytes, 0, byteLength));
+            return TrieBackedPartitionStage3.create(key,
+                                                    pd.columns(),
+                                                    pd.stats(),
+                                                    pd.rowCountIncludingStatic(),
+                                                    pd.tombstoneCount(),
+                                                    tailTrie,
+                                                    metadata,
+                                                    ensureOnHeap);
+        }
+    }
+
+    /// Partition iterator that walks a trie with a [PartitionIterator] and applies, to each partition, the clustering
+    /// index filter that `dataRange` supplies for its key.
+    static class MemtableUnfilteredPartitionIterator extends AbstractUnfilteredPartitionIterator implements Memtable.MemtableUnfilteredPartitionIterator
+    {
+        private final TableMetadata metadata;
+        private final Iterator iter;
+        private final ColumnFilter columnFilter;
+        private final DataRange dataRange;
+        private final long minLocalDeletionTime;
+
+        public MemtableUnfilteredPartitionIterator(TableMetadata metadata,
+                                                   EnsureOnHeap ensureOnHeap,
+                                                   DeletionAwareTrie source,
+                                                   ColumnFilter columnFilter,
+                                                   DataRange dataRange,
+                                                   long minLocalDeletionTime)
+        {
+            this.iter = new PartitionIterator(source, metadata, ensureOnHeap);
+            this.metadata = metadata;
+            this.columnFilter = columnFilter;
+            this.dataRange = dataRange;
+            this.minLocalDeletionTime = minLocalDeletionTime;
+        }
+
+        /// @return the value supplied at construction
+        public long getMinLocalDeletionTime()
+        {
+            return minLocalDeletionTime;
+        }
+
+        public TableMetadata metadata()
+        {
+            return metadata;
+        }
+
+        public boolean hasNext()
+        {
+            return iter.hasNext();
+        }
+
+        /// Takes the next partition and returns its rows filtered by the clustering index filter for its key.
+        public UnfilteredRowIterator next()
+        {
+            Partition partition = iter.next();
+            DecoratedKey key = partition.partitionKey();
+            ClusteringIndexFilter filter = dataRange.clusteringIndexFilter(key);
+
+            return filter.getUnfilteredRowIterator(columnFilter, partition);
+        }
+    }
+
+    /// Creates a memtable factory from the given options. The `SHARDS_OPTION` entry is removed from `optionsCopy`
+    /// and parsed with `Integer.parseInt` (which throws `NumberFormatException` for a non-numeric value); when the
+    /// entry is absent the factory is created with a null shard count.
+    public static Memtable.Factory factory(Map optionsCopy)
+    {
+        String shardsString = optionsCopy.remove(SHARDS_OPTION);
+        Integer shardCount = shardsString != null ? Integer.parseInt(shardsString) : null;
+        return new Factory(shardCount);
+    }
+
+
+    /// Factory producing [TrieMemtableStage3] instances with a fixed shard count option (may be null).
+    static class Factory implements Memtable.Factory
+    {
+        final Integer shardCount;
+
+        Factory(Integer shardCount)
+        {
+            this.shardCount = shardCount;
+        }
+
+        public Memtable create(AtomicReference commitLogLowerBound,
+                               TableMetadataRef metadaRef,
+                               Owner owner)
+        {
+            return new TrieMemtableStage3(commitLogLowerBound, metadaRef, owner, shardCount);
+        }
+
+        @Override
+        public PartitionUpdate.Factory partitionUpdateFactory()
+        {
+            return TriePartitionUpdateStage3.FACTORY;
+        }
+
+        /// Obtains the metrics view for the table and returns a handle whose release action releases that view.
+        @Override
+        public TableMetrics.ReleasableMetric createMemtableMetrics(TableMetadataRef metadataRef)
+        {
+            TrieMemtableMetricsView metrics = TrieMemtableMetricsView.getOrCreate(metadataRef.keyspace, metadataRef.name);
+            return metrics::release;
+        }
+    }
+
+    /// @return the unused reserved on-heap memory summed over every shard's trie and allocator, plus this memtable's
+    /// own allocator
+    @Override
+    public long unusedReservedOnHeapMemory()
+    {
+        long size = 0;
+        for (MemtableShard shard : shards)
+        {
+            size += shard.data.unusedReservedOnHeapMemory();
+            size += shard.allocator.unusedReservedOnHeapMemory();
+        }
+        size += this.allocator.unusedReservedOnHeapMemory();
+        return size;
+    }
+
+    /// Release all recycled content references, including the ones waiting in
still incomplete recycling lists. + /// This is a test method and can cause null pointer exceptions if used on a live trie. + @VisibleForTesting + void releaseReferencesUnsafe() + { + for (MemtableShard shard : shards) + shard.data.releaseReferencesUnsafe(); + } +} diff --git a/src/java/org/apache/cassandra/db/partitions/BTreePartitionUpdate.java b/src/java/org/apache/cassandra/db/partitions/BTreePartitionUpdate.java index 7dedd2d66544..df1ec175e9ea 100644 --- a/src/java/org/apache/cassandra/db/partitions/BTreePartitionUpdate.java +++ b/src/java/org/apache/cassandra/db/partitions/BTreePartitionUpdate.java @@ -36,6 +36,7 @@ import org.apache.cassandra.db.RangeTombstone; import org.apache.cassandra.db.RegularAndStaticColumns; import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.rows.BTreeRow; import org.apache.cassandra.db.rows.Cell; import org.apache.cassandra.db.rows.ColumnData; import org.apache.cassandra.db.rows.EncodingStats; @@ -354,15 +355,23 @@ public List collectCounterMarks() return marks; } - private static void addMarksForRow(Row row, List marks) + private void addMarksForRow(Row row, List marks) { for (Cell cell : row.cells()) { if (cell.isCounterCell()) - marks.add(new CounterMark(row, cell.column(), cell.path())); + marks.add(new CounterMark(this, row, cell.column(), cell.path())); } } + @Override + public void setCounterMarkValue(CounterMark mark, ByteBuffer value) + { + // Please read the warning in BTreeRow.setValue before using this method. 
+ BTreeRow row = (BTreeRow) mark.row(); + row.setValue(mark.column(), mark.path(), value); + } + @Override public void validateIndexedColumns(ClientState state) { diff --git a/src/java/org/apache/cassandra/db/partitions/BTreePartitionUpdater.java b/src/java/org/apache/cassandra/db/partitions/BTreePartitionUpdater.java index aa708ab23819..da73bc34fb7d 100644 --- a/src/java/org/apache/cassandra/db/partitions/BTreePartitionUpdater.java +++ b/src/java/org/apache/cassandra/db/partitions/BTreePartitionUpdater.java @@ -20,9 +20,9 @@ import org.apache.cassandra.db.DeletionInfo; import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.rows.BTreeRow; import org.apache.cassandra.db.rows.EncodingStats; import org.apache.cassandra.db.rows.Row; -import org.apache.cassandra.db.rows.Rows; import org.apache.cassandra.index.transactions.UpdateTransaction; import org.apache.cassandra.utils.btree.BTree; import org.apache.cassandra.utils.btree.UpdateFunction; @@ -63,7 +63,7 @@ public Row insert(Row insert) @Override public Row merge(Row existing, Row update) { - Row reconciled = Rows.merge(existing, update, this); + Row reconciled = ((BTreeRow) existing).mergeWith((BTreeRow) update, this); indexer.onUpdated(existing, reconciled); return reconciled; diff --git a/src/java/org/apache/cassandra/db/partitions/Partition.java b/src/java/org/apache/cassandra/db/partitions/Partition.java index 9b6dace1d00e..270d883621b1 100644 --- a/src/java/org/apache/cassandra/db/partitions/Partition.java +++ b/src/java/org/apache/cassandra/db/partitions/Partition.java @@ -82,10 +82,12 @@ default Iterable rows() Row lastRow(); /** - * Returns the row corresponding to the provided clustering, or null if there is not such row. + * Returns the row corresponding to the provided clustering, or null if there is no such row. * * @param clustering clustering key to search - * @return row corresponding to the clustering, it's either null or non-empty row. 
+ * @return Row corresponding to the clustering, it's either null or non-empty row. Note that the returned row can + * be fully deleted (i.e. contain only a row deletion timestamp). The method will return a deleted row also in + * the case where no row exists for the given clustering, but it is covered under a range deletion. */ public @Nullable Row getRow(Clustering clustering); diff --git a/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java b/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java index 2b663077de7e..4dbcfb62b640 100644 --- a/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java +++ b/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java @@ -38,7 +38,6 @@ import org.apache.cassandra.db.SimpleBuilders; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.filter.ColumnFilter; -import org.apache.cassandra.db.rows.BTreeRow; import org.apache.cassandra.db.rows.CellPath; import org.apache.cassandra.db.rows.ColumnData; import org.apache.cassandra.db.rows.DeserializationHelper; @@ -82,6 +81,7 @@ public interface PartitionUpdate extends Partition @SuppressWarnings("Convert2MethodRef") public static final PartitionUpdateSerializer serializer = new PartitionUpdateSerializer(tableId -> Schema.instance.getExistingTableMetadata(tableId)); + // FIXME This must be removed DeletionInfo deletionInfo(); /** @@ -101,6 +101,11 @@ public interface PartitionUpdate extends Partition */ int dataSize(); + /** + * The heap memory held by this update that is not shared with other objects (as the method name indicates; NOTE(review): confirm the exact accounting against the implementations). + * + * @return the unshared heap size of this update. + */ long unsharedHeapSize(); @Override @@ -148,6 +153,20 @@ default void validateIndexedColumns(ClientState state) IndexRegistry.obtain(metadata()).validate(this, state); } + /** + * Modify this update to set every timestamp for live data to {@code timestamp} and + * every deletion timestamp to {@code timestamp - 1}. 
+ * + * There is no reason to use this except on the Paxos code path, where we need to ensure that + * anything inserted uses the ballot timestamp (to respect the order of updates decided by + * the Paxos algorithm). We use {@code timestamp - 1} for deletions because tombstones + * always win on timestamp equality and we don't want to delete our own insertions + * (typically, when we overwrite a collection, we first set a complex deletion to delete the + * previous collection before adding new elements. If we were to set that complex deletion + * to the same timestamp as the new elements, it would delete those elements). And since + * tombstones always win on timestamp equality, using -1 guarantees our deletion will still + * delete anything from a previous update. + */ PartitionUpdate withUpdatedTimestamps(long timestamp); static Builder builder(TableMetadata metadata, DecoratedKey partitionKey, RegularAndStaticColumns columns, int initialRowCapacity) @@ -533,12 +552,14 @@ public long serializedSize(PartitionUpdate update, int version) */ class CounterMark { + private final PartitionUpdate update; private final Row row; private final ColumnMetadata column; private final CellPath path; - protected CounterMark(Row row, ColumnMetadata column, CellPath path) + protected CounterMark(PartitionUpdate update, Row row, ColumnMetadata column, CellPath path) { + this.update = update; this.row = row; this.column = column; this.path = path; @@ -549,6 +570,11 @@ public Clustering clustering() return row.clustering(); } + Row row() + { + return row; + } + public ColumnMetadata column() { return column; @@ -570,11 +596,17 @@ public void setValue(ByteBuffer value) { // This is a bit of a giant hack as this is the only place where we mutate a Row object. This makes it more efficient // for counters however and this won't be needed post-#6506 so that's probably fine. 
- assert row instanceof BTreeRow; - ((BTreeRow)row).setValue(column, path, value); + update.setCounterMarkValue(this, value); } } + /** + * This method should be used only by CounterMark to efficiently update counter values. + * + * This method violates the immutability expectations of PartitionUpdate. Use with extreme care. + */ + void setCounterMarkValue(CounterMark mark, ByteBuffer value); + /** * Builder for PartitionUpdates * @@ -613,10 +645,59 @@ interface Builder interface Factory { Builder builder(TableMetadata metadata, DecoratedKey partitionKey, RegularAndStaticColumns columns, int initialRowCapacity); + + /** + * Creates an empty immutable partition update. + * + * @param metadata the metadata for the created update. + * @param partitionKey the partition key for the created update. + * + * @return the newly created empty (and immutable) update. + */ PartitionUpdate emptyUpdate(TableMetadata metadata, DecoratedKey partitionKey); - PartitionUpdate singleRowUpdate(TableMetadata metadata, DecoratedKey valueKey, Row row); + + /** + * Creates an immutable partition update that contains a single row update. + * + * @param metadata the metadata for the created update. + * @param key the partition key for the partition to update. + * @param row the row for the update, may be a regular or static row and cannot be null. + * + * @return the newly created partition update containing only {@code row}. + */ + PartitionUpdate singleRowUpdate(TableMetadata metadata, DecoratedKey key, Row row); + + /** + * Creates an immutable partition update that entirely deletes a given partition. + * + * @param metadata the metadata for the created update. + * @param key the partition key for the partition that the created update should delete. + * @param timestamp the timestamp for the deletion. + * @param nowInSec the current time in seconds to use as local deletion time for the partition deletion. + * + * @return the newly created partition deletion update. 
+ */ PartitionUpdate fullPartitionDelete(TableMetadata metadata, DecoratedKey key, long timestamp, long nowInSec); + + /** + * Turns the given iterator into an update. + * + * @param iterator the iterator to turn into updates. + * + * Warning: this method does not close the provided iterator, it is up to + * the caller to close it. + */ PartitionUpdate fromIterator(UnfilteredRowIterator iterator); + + /** + * Turns the given iterator into an update, filtering data through the given column filter. + * + * @param iterator the iterator to turn into updates. + * @param filter the column filter to apply (e.g. queried columns). + * + * Warning: this method does not close the provided iterator, it is up to + * the caller to close it. + */ PartitionUpdate fromIterator(UnfilteredRowIterator iterator, ColumnFilter filter); /** diff --git a/src/java/org/apache/cassandra/db/partitions/TrieBackedPartition.java b/src/java/org/apache/cassandra/db/partitions/TrieBackedPartition.java index ad0c75b8662d..9492f6b5ee26 100644 --- a/src/java/org/apache/cassandra/db/partitions/TrieBackedPartition.java +++ b/src/java/org/apache/cassandra/db/partitions/TrieBackedPartition.java @@ -20,72 +20,57 @@ import java.util.Iterator; import java.util.NavigableSet; +import java.util.function.BiFunction; +import java.util.function.Predicate; -import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Predicates; import com.google.common.primitives.Ints; import org.apache.cassandra.db.Clustering; -import org.apache.cassandra.db.ClusteringBound; import org.apache.cassandra.db.ClusteringComparator; import org.apache.cassandra.db.ClusteringPrefix; import org.apache.cassandra.db.DecoratedKey; -import org.apache.cassandra.db.DeletionInfo; import org.apache.cassandra.db.DeletionTime; import org.apache.cassandra.db.LivenessInfo; -import org.apache.cassandra.db.MutableDeletionInfo; -import org.apache.cassandra.db.RangeTombstone; import 
org.apache.cassandra.db.RegularAndStaticColumns; import org.apache.cassandra.db.Slice; import org.apache.cassandra.db.Slices; import org.apache.cassandra.db.filter.ColumnFilter; -import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.ByteBufferAccessor; -import org.apache.cassandra.db.rows.AbstractUnfilteredRowIterator; -import org.apache.cassandra.db.rows.BTreeRow; -import org.apache.cassandra.db.rows.ColumnData; import org.apache.cassandra.db.rows.EncodingStats; import org.apache.cassandra.db.rows.RangeTombstoneMarker; import org.apache.cassandra.db.rows.Row; -import org.apache.cassandra.db.rows.RowAndDeletionMergeIterator; import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.TrieBackedRow; +import org.apache.cassandra.db.rows.TrieTombstoneMarker; import org.apache.cassandra.db.rows.Unfiltered; import org.apache.cassandra.db.rows.UnfilteredRowIterator; import org.apache.cassandra.db.rows.UnfilteredRowIterators; +import org.apache.cassandra.db.tries.DeletionAwareTrie; import org.apache.cassandra.db.tries.Direction; +import org.apache.cassandra.db.tries.InMemoryBaseTrie; +import org.apache.cassandra.db.tries.InMemoryDeletionAwareTrie; import org.apache.cassandra.db.tries.InMemoryTrie; -import org.apache.cassandra.db.tries.Trie; -import org.apache.cassandra.db.tries.TrieEntriesIterator; +import org.apache.cassandra.db.tries.RangeTrie; +import org.apache.cassandra.db.tries.TrieSet; import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; +import org.apache.cassandra.db.tries.TrieTailsIterator; import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.utils.AbstractIterator; -import org.apache.cassandra.utils.ObjectSizes; -import org.apache.cassandra.utils.btree.BTree; import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.bytecomparable.ByteSource; -import org.apache.cassandra.utils.memory.Cloner; import 
org.apache.cassandra.utils.memory.EnsureOnHeap; -/** - * In-memory partition backed by a trie. The rows of the partition are values in the leaves of the trie, where the key - * to the row is only stored as the path to reach that leaf; static rows are also treated as a row with STATIC_CLUSTERING - * path; the deletion information is placed as a metadata object at the root of the trie -- this matches how Memtable - * stores partitions within the larger map, so that TrieBackedPartition objects can be created directly from Memtable - * tail tries. - * - * This object also holds the partition key, as well as some metadata (columns and statistics). - * - * Currently all descendants and instances of this class are immutable (even tail tries from mutable memtables are - * guaranteed to not change as we use forced copying below the partition level), though this may change in the future. - */ +/// In-memory partition backed by a deletion-aware trie. The rows of the partition are values in the leaves of the trie, +/// where the key to the row is only stored as the path to reach that leaf; static rows are also treated as a row with +/// `STATIC_CLUSTERING` path; the deletion information is placed in a deletion branch of the trie which starts at the +/// root of the partition. This matches how `TrieMemtable` stores partitions within the larger map, so that +/// `TrieBackedPartition` objects can be created directly from `TrieMemtable` tail tries. +/// +/// This object also holds the partition key, as well as some metadata (columns and statistics). +/// Currently, all descendants and instances of this class are immutable (even tail tries from mutable memtables are +/// guaranteed to not change as we use forced copying below the partition level), though this may change in the future. public class TrieBackedPartition implements Partition { - /** - * If keys are below this length, we will use a recursive procedure for inserting data when building the backing - * trie. 
- */ - @VisibleForTesting - public static final int MAX_RECURSIVE_KEY_LENGTH = 128; - public static final ByteComparable.Version BYTE_COMPARABLE_VERSION = ByteComparable.Version.OSS50; /** Pre-made path for STATIC_CLUSTERING, to avoid creating path object when querying static path. */ @@ -93,86 +78,46 @@ public class TrieBackedPartition implements Partition /** Pre-made path for BOTTOM, to avoid creating path object when iterating rows. */ public static final ByteComparable BOTTOM_PATH = v -> ByteSource.oneByte(ClusteringPrefix.Kind.INCL_START_BOUND.asByteComparableValue(v)); - /** - * The representation of a row stored at the leaf of a trie. Does not contain the row key. - * - * The methods toRow and copyToOnHeapRow combine this with a clustering for the represented Row. - */ - public static class RowData - { - final Object[] columnsBTree; - final LivenessInfo livenessInfo; - final DeletionTime deletion; - final long minLocalDeletionTime; - - RowData(Object[] columnsBTree, LivenessInfo livenessInfo, DeletionTime deletion) - { - this(columnsBTree, livenessInfo, deletion, BTreeRow.minDeletionTime(columnsBTree, livenessInfo, deletion)); - } - - RowData(Object[] columnsBTree, LivenessInfo livenessInfo, DeletionTime deletion, long minLocalDeletionTime) - { - this.columnsBTree = columnsBTree; - this.livenessInfo = livenessInfo; - this.deletion = deletion; - this.minLocalDeletionTime = minLocalDeletionTime; - } - - Row toRow(Clustering clustering) - { - return BTreeRow.create(clustering, - livenessInfo, - Row.Deletion.regular(deletion), - columnsBTree, - minLocalDeletionTime); - } - - public int dataSize() - { - int dataSize = livenessInfo.dataSize() + deletion.dataSize(); - - return Ints.checkedCast(BTree.accumulate(columnsBTree, (ColumnData cd, long v) -> v + cd.dataSize(), dataSize)); - } - - public long unsharedHeapSizeExcludingData() - { - long heapSize = EMPTY_ROWDATA_SIZE - + BTree.sizeOfStructureOnHeap(columnsBTree) - + livenessInfo.unsharedHeapSize() - + 
deletion.unsharedHeapSize(); - - return BTree.accumulate(columnsBTree, (ColumnData cd, long v) -> v + cd.unsharedHeapSizeExcludingData(), heapSize); - } + /// Interface implemented by partition markers, both the singleton below used for standalone [TrieBackedPartition], + /// and the marker used in tail tries in `TrieMemtable`s. + public interface PartitionMarker {} + /// Singleton partition marker used for standalone [TrieBackedPartition] and [TriePartitionUpdate] objects. + public static final PartitionMarker PARTITION_MARKER = new PartitionMarker() + { public String toString() { - return "row " + livenessInfo + " size " + dataSize(); + return "PARTITION_MARKER"; } + }; - public RowData clone(Cloner cloner) - { - Object[] tree = BTree.transform(columnsBTree, c -> c.clone(cloner)); - return new RowData(tree, livenessInfo, deletion, minLocalDeletionTime); - } - } + /// Predicate to identify partition boundaries in tries. This accepts any [PartitionMarker], not just the + /// [#PARTITION_MARKER] used for standalone trie-backed partitions. + public static final Predicate IS_PARTITION_BOUNDARY = TrieBackedPartition::isPartitionBoundary; - private static final long EMPTY_ROWDATA_SIZE = ObjectSizes.measure(new RowData(null, null, null, 0)); + /// Returns true if the given content is a partition marker. + public static boolean isPartitionBoundary(Object content) + { + return content instanceof TrieBackedPartition.PartitionMarker; + } - protected final Trie trie; + protected final DeletionAwareTrie trie; protected final DecoratedKey partitionKey; protected final TableMetadata metadata; protected final RegularAndStaticColumns columns; protected final EncodingStats stats; + /// Number of rows in the partition. This will count only the rows that have live data. protected final int rowCountIncludingStatic; - protected final boolean canHaveShadowedData; + /// Number of tombstone boundary pairs on the row-level or above: partition, range and row tombstones. 
+ protected final int tombstoneCount; public TrieBackedPartition(DecoratedKey partitionKey, RegularAndStaticColumns columns, EncodingStats stats, int rowCountIncludingStatic, - Trie trie, - TableMetadata metadata, - boolean canHaveShadowedData) + int tombstoneCount, + DeletionAwareTrie trie, + TableMetadata metadata) { this.partitionKey = partitionKey; this.trie = trie; @@ -180,9 +125,8 @@ public TrieBackedPartition(DecoratedKey partitionKey, this.columns = columns; this.stats = stats; this.rowCountIncludingStatic = rowCountIncludingStatic; - this.canHaveShadowedData = canHaveShadowedData; - // There must always be deletion info metadata. - // Note: we can't use deletionInfo() because WithEnsureOnHeap's override is not yet set up. + this.tombstoneCount = tombstoneCount; + // There must always be a partition marker. assert trie.get(ByteComparable.EMPTY) != null; assert stats != null; } @@ -194,9 +138,9 @@ public static TrieBackedPartition fromIterator(UnfilteredRowIterator iterator) iterator.columns(), iterator.stats(), builder.rowCountIncludingStatic(), + builder.tombstoneCount(), builder.trie(), - iterator.metadata(), - false); + iterator.metadata()); } protected static ContentBuilder build(UnfilteredRowIterator iterator, boolean collectDataSize) @@ -218,142 +162,190 @@ protected static ContentBuilder build(UnfilteredRowIterator iterator, boolean co } } - /** - * Create a row with the given properties and content, making sure to copy all off-heap data to keep it alive when - * the given access mode requires it. - */ + /// Create a row with the given properties and content, making sure to copy all off-heap data to keep it alive when + /// the given access mode requires it. 
public static TrieBackedPartition create(DecoratedKey partitionKey, RegularAndStaticColumns columnMetadata, EncodingStats encodingStats, int rowCountIncludingStatic, - Trie trie, + int tombstoneCount, + DeletionAwareTrie trie, TableMetadata metadata, + TableMetadata droppedColumnsSource, EnsureOnHeap ensureOnHeap) { return ensureOnHeap == EnsureOnHeap.NOOP - ? new TrieBackedPartition(partitionKey, columnMetadata, encodingStats, rowCountIncludingStatic, trie, metadata, true) - : new WithEnsureOnHeap(partitionKey, columnMetadata, encodingStats, rowCountIncludingStatic, trie, metadata, true, ensureOnHeap); + ? new WithDroppedColumnsSource(partitionKey, columnMetadata, encodingStats, rowCountIncludingStatic, tombstoneCount, trie, metadata, droppedColumnsSource) + : new WithEnsureOnHeap(partitionKey, columnMetadata, encodingStats, rowCountIncludingStatic, tombstoneCount, trie, metadata, droppedColumnsSource, ensureOnHeap); } - class RowIterator extends TrieEntriesIterator + /// Implementation of an iterator over rows. Note that because the legacy containers store deleted rows as [Row]s + /// with a row deletion time and empty content and present them here, we must also present fully deleted rows + /// (but not range deletions). + class RowIterator extends TrieTailsIterator.DeletionAware { - public RowIterator(Trie trie, Direction direction) + public RowIterator(DeletionAwareTrie trie, Direction direction) { - super(trie, direction, RowData.class::isInstance); + super(trie, + direction, + (live, marker) -> + live instanceof LivenessInfo ? live + : marker != null && marker.hasLevelMarker(TrieTombstoneMarker.LevelMarker.ROW) ? 
marker + : null, + false); } @Override - protected Row mapContent(Object content, byte[] bytes, int byteLength) + protected Row mapContent(Object content, DeletionAwareTrie tailTrie, byte[] bytes, int byteLength) { - var rd = (RowData) content; - return toRow(rd, - metadata.comparator.clusteringFromByteComparable( - ByteBufferAccessor.instance, - ByteComparable.preencoded(BYTE_COMPARABLE_VERSION, bytes, 0, byteLength))); + return toRow(tailTrie, getClustering(bytes, byteLength)); } } - private Iterator rowIterator(Trie trie, Direction direction) + private Iterator rowIterator(DeletionAwareTrie trie, Direction direction) { return new RowIterator(trie, direction); } - static RowData rowToData(Row row) + /// Conversion from row branch to [Row]. [WithEnsureOnHeap] overrides this to do the necessary copying + /// (hence the non-static method). + Row toRow(DeletionAwareTrie rowContent, Clustering clustering) { - BTreeRow brow = (BTreeRow) row; - return new RowData(brow.getBTree(), row.primaryKeyLivenessInfo(), row.deletion().time(), brow.getMinLocalDeletionTime()); + return rowContent != null ? TrieBackedRow.create(metadata, clustering, rowContent) : null; } - /** - * Conversion from RowData to Row. TrieBackedPartitionOnHeap overrides this to do the necessary copying - * (hence the non-static method). - */ - Row toRow(RowData data, Clustering clustering) + /// Put the given row in the trie, used by methods to build stand-alone partitions. 
+ /// + /// @param comparator for converting key to byte-comparable + /// @param trie destination + /// @param untypedRow content to put + protected static void putInTrie(TableMetadata metadata, + ClusteringComparator comparator, + InMemoryDeletionAwareTrie trie, + Row untypedRow) + throws TrieSpaceExhaustedException { - return data.toRow(clustering); + TrieBackedRow row; + if (untypedRow instanceof TrieBackedRow) + row = (TrieBackedRow) untypedRow; + else + row = TrieBackedRow.from(metadata, untypedRow); + + ByteComparable comparableClustering = comparator.asByteComparable(row.clustering()); + makeMutator(trie).apply(row.trie().prefixedBySeparately(comparableClustering, true)); } - /** - * Put the given unfiltered in the trie. - * @param comparator for converting key to byte-comparable - * @param useRecursive whether the key length is guaranteed short and recursive put can be used - * @param trie destination - * @param row content to put - */ - protected static void putInTrie(ClusteringComparator comparator, boolean useRecursive, InMemoryTrie trie, Row row) throws TrieSpaceExhaustedException + private static + InMemoryDeletionAwareTrie.Mutator + makeMutator(InMemoryDeletionAwareTrie trie) { - trie.putSingleton(comparator.asByteComparable(row.clustering()), rowToData(row), NO_CONFLICT_RESOLVER, useRecursive); + return trie.mutator(noConflictInData(), + mergeTombstoneRanges(), + noIncomingSelfDeletion(), + noExistingSelfDeletion(), + true, + Predicates.alwaysFalse()); } - /** - * Check if we can use recursive operations when putting a value in tries. - * True if all types in the clustering keys are fixed length, and total size is small enough. 
- */ - protected static boolean useRecursive(ClusteringComparator comparator) + protected static void putRangeDeletionInTrie(ClusteringComparator comparator, + InMemoryDeletionAwareTrie trie, + RangeTombstoneMarker openMarker, + RangeTombstoneMarker closeMarker) { - int length = 1; // terminator - for (AbstractType type : comparator.subtypes()) - if (!type.isValueLengthFixed()) - return false; - else - length += 1 + type.valueLengthIfFixed(); // separator + value + DeletionTime deletionTime = openMarker.openDeletionTime(false); + assert deletionTime.equals(closeMarker.closeDeletionTime(false)); + ByteComparable start = comparator.asByteComparable(openMarker.clustering()); + ByteComparable end = comparator.asByteComparable(closeMarker.clustering()); + assert !deletionTime.isLive(); + try + { + makeMutator(trie).delete(RangeTrie.range(start, true, + end, false, + BYTE_COMPARABLE_VERSION, + TrieTombstoneMarker.covering(deletionTime, TrieTombstoneMarker.Kind.RANGE))); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } - return length <= MAX_RECURSIVE_KEY_LENGTH; + protected static void putPartitionDeletionInTrie(InMemoryDeletionAwareTrie trie, + DeletionTime deletionTime) + { + if (deletionTime.isLive()) + return; + try + { + makeMutator(trie).delete(RangeTrie.branch(ByteComparable.EMPTY, + BYTE_COMPARABLE_VERSION, + TrieTombstoneMarker.covering(deletionTime, TrieTombstoneMarker.Kind.PARTITION))); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } } + + @Override public TableMetadata metadata() { return metadata; } + @Override public DecoratedKey partitionKey() { return partitionKey; } + @Override public DeletionTime partitionLevelDeletion() { - return deletionInfo().getPartitionDeletion(); + return TrieTombstoneMarker.applicableDeletionOrLive(trie, ByteComparable.EMPTY); } + @Override public RegularAndStaticColumns columns() { return columns; } + @Override public EncodingStats stats() { return stats; 
} + /// @inheritDoc + /// Trie-backed partitions only count rows with live data. If a row only contains a row deletion, or a column-level + /// deletion with no overriding live cells, it will not be counted in this number. Though this is somewhat wrong, + /// the uses of the row count do not require the number to be precise, and getting the right count can be expensive. + @Override public int rowCount() { return rowCountIncludingStatic - (hasStaticRow() ? 1 : 0); } - public DeletionInfo deletionInfo() - { - return (DeletionInfo) trie.get(ByteComparable.EMPTY); - } - - public ByteComparable path(ClusteringPrefix clustering) + public ByteComparable path(ClusteringPrefix clustering) { return metadata.comparator.asByteComparable(clustering); } + @Override public Row staticRow() { - RowData staticRow = (RowData) trie.get(STATIC_CLUSTERING_PATH); - - if (staticRow != null) - return toRow(staticRow, Clustering.STATIC_CLUSTERING); - else - return Rows.EMPTY_STATIC_ROW; + // Unlike getRow, this method does not apply any covering deletion to the returned row. + DeletionAwareTrie staticRow = trie.tailTrie(STATIC_CLUSTERING_PATH, false); + Row row = toRow(staticRow, Clustering.STATIC_CLUSTERING); + return row != null ? row : Rows.EMPTY_STATIC_ROW; } + @Override public boolean isEmpty() { - return rowCountIncludingStatic == 0 && deletionInfo().isLive(); + return rowCountIncludingStatic + tombstoneCount == 0; } private boolean hasStaticRow() @@ -361,20 +353,19 @@ private boolean hasStaticRow() return trie.get(STATIC_CLUSTERING_PATH) != null; } + @Override public boolean hasRows() { return rowCountIncludingStatic > 1 || rowCountIncludingStatic > 0 && !hasStaticRow(); } - /** - * Provides read access to the trie for users that can take advantage of it directly (e.g. Memtable). - */ - public Trie trie() + /// Provides read access to the trie for users that can take advantage of it directly (e.g. `TrieMemtable`). 
+ public DeletionAwareTrie trie() { return trie; } - private Trie nonStaticSubtrie() + private DeletionAwareTrie nonStaticSubtrie() { // skip static row if present - the static clustering sorts before BOTTOM so that it's never included in // any slices (we achieve this by using the byte ByteSource.EXCLUDED for its representation, which is lower @@ -382,11 +373,13 @@ private Trie nonStaticSubtrie() return trie.subtrie(BOTTOM_PATH, null); } + @Override public Iterator rowIterator() { return rowIterator(nonStaticSubtrie(), Direction.FORWARD); } + /// Iterator over all rows of the partition including the static one. public Iterator rowsIncludingStatic() { return rowIterator(trie, Direction.FORWARD); @@ -399,126 +392,201 @@ public Row lastRow() return reverseIterator.hasNext() ? reverseIterator.next() : null; } - public Row getRow(Clustering clustering) + @Override + public Row getRow(Clustering clustering) + { + // getRow must return range and partition deletion applicable to the row + return toRow(trie.tailTrie(path(clustering), true), clustering); + } + + @Override + public UnfilteredRowIterator unfilteredIterator() { - RowData data = (RowData) trie.get(path(clustering)); + return unfilteredIterator(ColumnFilter.selection(columns()), Slices.ALL, false); + } - DeletionInfo deletionInfo = deletionInfo(); - RangeTombstone rt = deletionInfo.rangeCovering(clustering); + private Clustering getClustering(byte[] bytes, int byteLength) + { + return metadata.comparator.clusteringFromByteComparable(ByteBufferAccessor.instance, + ByteComparable.preencoded(BYTE_COMPARABLE_VERSION, + bytes, 0, byteLength), + BYTE_COMPARABLE_VERSION); + } - // The trie only contains rows, so it doesn't allow to directly account for deletion that should apply to row - // (the partition deletion or the deletion of a range tombstone that covers it). So if needs be, reuse the row - // deletion to carry the proper deletion on the row. 
- DeletionTime partitionDeletion = deletionInfo.getPartitionDeletion(); - DeletionTime activeDeletion = partitionDeletion; - if (rt != null && rt.deletionTime().supersedes(activeDeletion)) - activeDeletion = rt.deletionTime(); + /// Combine live and deletion branch markers to identify row roots in the trie. Must return non-null when a row + /// is identified; the actual value is passed on to [UnfilteredIterator#mapContent] below. + private static Object combineDataAndDeletionForUnfilteredIterator(Object data, TrieTombstoneMarker deletion) + { + if (data instanceof LivenessInfo) + return data; // We don't need to return the deletion marker as it will be included in the tail trie. - if (data == null) + if (deletion != null) { - // this means our partition level deletion supersedes all other deletions and we don't have to keep the row deletions - if (activeDeletion == partitionDeletion) - return null; - // no need to check activeDeletion.isLive here - if anything superseedes the partitionDeletion - // it must be non-live - return BTreeRow.emptyDeletedRow(clustering, Row.Deletion.regular(activeDeletion)); + // There are several ways we can end up here: + // - The partition deletion starts or ends. This is treated like a range deletion here but thrown away in + // mapContent. + // - A range deletion starts or ends. In this case we don't care about the tail trie as it only contains + // the marker we return. + // - We have a row level marker in the deletion path for a row that has no live data but row, column or cell + // deletion. We will return this branch as a deleted row; since we skip the branch, we will also skip the + // return path marker. + + if (deletion.hasLevelMarker(TrieTombstoneMarker.LevelMarker.ROW)) + return LivenessInfo.EMPTY; // Treat this branch as a row. + else + return deletion; // Range or partition deletion marker. 
} - Row row = toRow(data, clustering); - if (!activeDeletion.isLive()) - row = row.filter(ColumnFilter.selection(columns()), activeDeletion, true, metadata()); - return row; + return null; } - public UnfilteredRowIterator unfilteredIterator() + /// Source of dropped columns, if they could be different from the metadata in use. + /// To be overridden by memtable partitions. + protected TableMetadata droppedColumnsSource() { - return unfilteredIterator(ColumnFilter.selection(columns()), Slices.ALL, false); + return metadata; } - public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, Slices slices, boolean reversed) + /// Implementation of [UnfilteredRowIterator] for this partition. + /// + /// Looks for row and tombstone markers in the trie and presents each branch as a [TrieBackedRow] or + /// [RangeTombstoneMarker]. Because the legacy convention is that the static row and partition-level deletion are + /// given separately by methods of the iterator, they are filtered out from the returned content -- the former by + /// excluding the static key from the translation of [Slice#ALL], and the latter by excluding covering deletions + /// in the returned trie and filtering out the non-range sides of all tombstone boundaries. 
+ class UnfilteredIterator + extends TrieTailsIterator.DeletionAware + implements UnfilteredRowIterator { - Row staticRow = staticRow(selection, false); - if (slices.size() == 0) + final boolean reversed; + final ColumnFilter selection; + final Row staticRow; + + protected UnfilteredIterator(ColumnFilter selection, DeletionAwareTrie trie, boolean reversed) { - DeletionTime partitionDeletion = deletionInfo().getPartitionDeletion(); - return UnfilteredRowIterators.noRowsIterator(metadata(), partitionKey(), staticRow, partitionDeletion, reversed); + super(trie, Direction.fromBoolean(reversed), TrieBackedPartition::combineDataAndDeletionForUnfilteredIterator, false); + this.selection = selection; + this.reversed = reversed; + Row staticRow = TrieBackedPartition.this.staticRow().filter(selection, droppedColumnsSource()); + this.staticRow = staticRow != null ? staticRow : Rows.EMPTY_STATIC_ROW; } - return slices.size() == 1 - ? sliceIterator(selection, slices.get(0), reversed, staticRow) - : new SlicesIterator(selection, slices, reversed, staticRow); - } + @Override + protected Unfiltered mapContent(Object content, DeletionAwareTrie tailTrie, byte[] bytes, int byteLength) + { + // content is as prepared by combineDataAndDeletionForUnfilteredIterator above. + if (content instanceof LivenessInfo) + { + // Row. + Row row = toRow(tailTrie, getClustering(bytes, byteLength)); + return row != null ? row.filter(selection, droppedColumnsSource()) : null; + } + else + { + // Range or partition deletion marker. + // The method below will present only range tombstones, identified by their marker kind. 
+ return ((TrieTombstoneMarker) content).toRangeTombstoneMarker( + ByteComparable.preencoded(BYTE_COMPARABLE_VERSION, bytes, 0, byteLength), + BYTE_COMPARABLE_VERSION, + metadata.comparator); + } + } - public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, NavigableSet> clusteringsInQueryOrder, boolean reversed) - { - Row staticRow = staticRow(selection, false); - if (clusteringsInQueryOrder.isEmpty()) + @Override + public DeletionTime partitionLevelDeletion() { - DeletionTime partitionDeletion = deletionInfo().getPartitionDeletion(); - return UnfilteredRowIterators.noRowsIterator(metadata(), partitionKey(), staticRow, partitionDeletion, reversed); + return TrieTombstoneMarker.applicableDeletionOrLive(trie, ByteComparable.EMPTY); } - Iterator rowIter = new AbstractIterator() { + @Override + public EncodingStats stats() + { + return stats; + } - Iterator> clusterings = clusteringsInQueryOrder.iterator(); + @Override + public TableMetadata metadata() + { + return metadata; + } - @Override - protected Row computeNext() - { - while (clusterings.hasNext()) - { - Clustering clustering = clusterings.next(); - Object rowData = trie.get(path(clustering)); - if (rowData instanceof RowData) - return toRow((RowData) rowData, clustering); - } - return endOfData(); - } - }; + @Override + public boolean isReverseOrder() + { + return reversed; + } - // not using DeletionInfo.rangeCovering(Clustering), because it returns the original range tombstone, - // but we need DeletionInfo.rangeIterator(Set) that generates tombstones based on given clustering bound. 
- Iterator deleteIter = deletionInfo().rangeIterator(clusteringsInQueryOrder, reversed); + @Override + public RegularAndStaticColumns columns() + { + return selection.fetchedColumns(); + } - return merge(rowIter, deleteIter, selection, reversed, staticRow); - } + @Override + public DecoratedKey partitionKey() + { + return partitionKey; + } - private UnfilteredRowIterator sliceIterator(ColumnFilter selection, Slice slice, boolean reversed, Row staticRow) - { - ClusteringBound start = slice.start(); - ClusteringBound end = slice.end() == ClusteringBound.TOP ? null : slice.end(); - Iterator rowIter = slice(start, end, reversed); - Iterator deleteIter = deletionInfo().rangeIterator(slice, reversed); - return merge(rowIter, deleteIter, selection, reversed, staticRow); - } + @Override + public Row staticRow() + { + return staticRow; + } - private Iterator slice(ClusteringBound start, ClusteringBound end, boolean reversed) - { - ByteComparable endPath = end != null ? path(end) : null; - // use BOTTOM as bound to skip over static rows - ByteComparable startPath = start != null ? 
path(start) : BOTTOM_PATH; - return rowIterator(trie.subtrie(startPath, endPath), Direction.fromBoolean(reversed)); + @Override + public void close() + { + // nothing to close + } + + @Override + public boolean stopIssuingTombstones() + { + stopIssuingDeletions(current -> !current.isRow() || ((Row) current).isEmptyAfterDeletion()); + return true; + } } - private Row staticRow(ColumnFilter columns, boolean setActiveDeletionToRow) + private UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, ByteComparable[] bounds, boolean reversed) { - DeletionTime partitionDeletion = deletionInfo().getPartitionDeletion(); - Row staticRow = staticRow(); - if (columns.fetchedColumns().statics.isEmpty() || (staticRow.isEmpty() && partitionDeletion.isLive())) - return Rows.EMPTY_STATIC_ROW; + if (bounds.length == 0) + return UnfilteredRowIterators.noRowsIterator(metadata, partitionKey, staticRow(), partitionLevelDeletion(), reversed); - Row row = staticRow.filter(columns, partitionDeletion, setActiveDeletionToRow, metadata()); - return row == null ? 
Rows.EMPTY_STATIC_ROW : row; + DeletionAwareTrie slicedTrie = + trie.intersect(TrieSet.ranges(BYTE_COMPARABLE_VERSION, bounds)); + return new UnfilteredIterator(selection, slicedTrie, reversed); } - private RowAndDeletionMergeIterator merge(Iterator rowIter, Iterator deleteIter, - ColumnFilter selection, boolean reversed, Row staticRow) + @Override + public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, Slices slices, boolean reversed) { - return new RowAndDeletionMergeIterator(metadata(), partitionKey(), deletionInfo().getPartitionDeletion(), - selection, staticRow, reversed, stats(), - rowIter, deleteIter, canHaveShadowedData); + ByteComparable[] bounds = new ByteComparable[slices.size() * 2]; + int index = 0; + for (Slice slice : slices) + { + bounds[index++] = metadata.comparator.asByteComparable(slice.start()); + bounds[index++] = metadata.comparator.asByteComparable(slice.end()); + } + return unfilteredIterator(selection, bounds, reversed); } + @Override + public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, NavigableSet> clusteringsInQueryOrder, boolean reversed) + { + ByteComparable[] bounds = new ByteComparable[clusteringsInQueryOrder.size() * 2]; + // Trie intersection requires the boundaries to be given in forward order. Our clusterings are given in query + // order, which is why we have to reverse them if we are making a reversed iterator. + int index = reversed ? (clusteringsInQueryOrder.size() - 1) * 2 : 0; + int indexInc = reversed ? 
-2 : +2; + for (Clustering clustering : clusteringsInQueryOrder) + { + bounds[index + 0] = metadata.comparator.asByteComparable(clustering.asStartBound()); + bounds[index + 1] = metadata.comparator.asByteComparable(clustering.asEndBound()); + index += indexInc; + } + return unfilteredIterator(selection, bounds, reversed); + } @Override public String toString() @@ -526,88 +594,55 @@ public String toString() return Partition.toString(this); } - class SlicesIterator extends AbstractUnfilteredRowIterator + private static class WithDroppedColumnsSource extends TrieBackedPartition { - private final Slices slices; + final TableMetadata droppedColumnsSource; - private int idx; - private Iterator currentSlice; - private final ColumnFilter selection; - - private SlicesIterator(ColumnFilter selection, - Slices slices, - boolean isReversed, - Row staticRow) + public WithDroppedColumnsSource(DecoratedKey partitionKey, RegularAndStaticColumns columns, EncodingStats stats, int rowCountIncludingStatic, int tombstoneCount, DeletionAwareTrie trie, TableMetadata metadata, TableMetadata droppedColumnsSource) { - super(TrieBackedPartition.this.metadata(), TrieBackedPartition.this.partitionKey(), - TrieBackedPartition.this.partitionLevelDeletion(), - selection.fetchedColumns(), staticRow, isReversed, TrieBackedPartition.this.stats()); - this.selection = selection; - this.slices = slices; + super(partitionKey, columns, stats, rowCountIncludingStatic, tombstoneCount, trie, metadata); + this.droppedColumnsSource = droppedColumnsSource; } - protected Unfiltered computeNext() + @Override + protected TableMetadata droppedColumnsSource() { - while (true) - { - if (currentSlice == null) - { - if (idx >= slices.size()) - return endOfData(); - - int sliceIdx = isReverseOrder ? 
slices.size() - idx - 1 : idx; - currentSlice = sliceIterator(selection, slices.get(sliceIdx), isReverseOrder, Rows.EMPTY_STATIC_ROW); - idx++; - } - - if (currentSlice.hasNext()) - return currentSlice.next(); - - currentSlice = null; - } + return droppedColumnsSource; } } - - /** - * An snapshot of the current TrieBackedPartition data, copied on heap when retrieved. - */ - private static final class WithEnsureOnHeap extends TrieBackedPartition + /// A snapshot of the current [TrieBackedPartition] data, copied on heap when retrieved. + private static final class WithEnsureOnHeap extends WithDroppedColumnsSource { - final DeletionInfo onHeapDeletion; EnsureOnHeap ensureOnHeap; public WithEnsureOnHeap(DecoratedKey partitionKey, RegularAndStaticColumns columns, EncodingStats stats, int rowCountIncludingStatic, - Trie trie, + int tombstoneCount, + DeletionAwareTrie trie, TableMetadata metadata, - boolean canHaveShadowedData, + TableMetadata droppedColumnsSource, EnsureOnHeap ensureOnHeap) { - super(partitionKey, columns, stats, rowCountIncludingStatic, trie, metadata, canHaveShadowedData); + super(partitionKey, columns, stats, rowCountIncludingStatic, tombstoneCount, trie, metadata, droppedColumnsSource); this.ensureOnHeap = ensureOnHeap; - this.onHeapDeletion = ensureOnHeap.applyToDeletionInfo(super.deletionInfo()); - } - - @Override - public Row toRow(RowData data, Clustering clustering) - { - return ensureOnHeap.applyToRow(super.toRow(data, clustering)); } @Override - public DeletionInfo deletionInfo() + public Row toRow(DeletionAwareTrie data, Clustering clustering) { - return onHeapDeletion; + Row row = super.toRow(data, clustering); + if (row == null) + return null; + return ensureOnHeap.applyToRow(row); } } - /** - * Resolver for operations with trie-backed partitions. We don't permit any overwrites/merges. - */ - public static final InMemoryTrie.UpsertTransformer NO_CONFLICT_RESOLVER = + /// Resolver for operations with trie-backed partitions. 
We don't permit any overwrites/merges. + @SuppressWarnings("rawtypes") + private static final InMemoryTrie.UpsertTransformer NO_CONFLICT_RESOLVER = (existing, update) -> { if (existing != null) @@ -615,40 +650,79 @@ public DeletionInfo deletionInfo() return update; }; - /** - * Helper class for constructing tries and deletion info from an iterator or flowable partition. - * - * Note: This is not collecting any stats or columns! - */ + /// Resolver for data in trie-backed partitions. We don't permit any overwrites/merges. + @SuppressWarnings("unchecked") + public static InMemoryTrie.UpsertTransformer noConflictInData() + { + return NO_CONFLICT_RESOLVER; + } + + /// Tombstone merging resolver. Even though we don't support overwrites, we get requests to add the two sides + /// of a boundary separately and must join them. + private static final InMemoryTrie.UpsertTransformer MERGE_TOMBSTONE_RANGES = + (existing, update) -> existing != null ? existing.mergeWith(update) : update; + + /// Tombstone merging resolver. Even though we don't support overwrites, we get requests to add the two sides + /// of a boundary separately and must join them. + public static InMemoryTrie.UpsertTransformer mergeTombstoneRanges() + { + return MERGE_TOMBSTONE_RANGES; + } + + private static final InMemoryBaseTrie.UpsertTransformer IGNORE_UPDATE = (left, right) -> left; + + /// Resolver for applying incoming deletions to existing data in trie-backed partitions. We assume that the data is + /// not affected by the deletion. + public static InMemoryTrie.UpsertTransformer noIncomingSelfDeletion() + { + return IGNORE_UPDATE; + } + + private static final BiFunction IGNORE_EXISTING = (left, right) -> right; + + /// Resolver for applying existing deletions to incoming data in trie-backed partitions. We assume that the data is + /// not affected by the deletion. 
+ public static BiFunction noExistingSelfDeletion() + { + return IGNORE_EXISTING; + } + + /// Helper class for constructing tries and deletion info from an iterator. public static class ContentBuilder { final TableMetadata metadata; final ClusteringComparator comparator; - private final MutableDeletionInfo.Builder deletionBuilder; - private final InMemoryTrie trie; + private final InMemoryDeletionAwareTrie trie; - private final boolean useRecursive; private final boolean collectDataSize; private int rowCountIncludingStatic; + private int tombstoneCount; private long dataSize; + private RangeTombstoneMarker openMarker = null; + private final boolean isReverseOrder; + public ContentBuilder(TableMetadata metadata, DeletionTime partitionLevelDeletion, boolean isReverseOrder, boolean collectDataSize) { this.metadata = metadata; this.comparator = metadata.comparator; - this.deletionBuilder = MutableDeletionInfo.builder(partitionLevelDeletion, - comparator, - isReverseOrder); - this.trie = InMemoryTrie.shortLived(BYTE_COMPARABLE_VERSION); + this.trie = InMemoryDeletionAwareTrie.shortLived(BYTE_COMPARABLE_VERSION); - this.useRecursive = useRecursive(comparator); this.collectDataSize = collectDataSize; rowCountIncludingStatic = 0; + tombstoneCount = 0; dataSize = 0; + this.isReverseOrder = isReverseOrder; + + if (!partitionLevelDeletion.isLive()) + { + putPartitionDeletionInTrie(trie, partitionLevelDeletion); + ++tombstoneCount; + } } public ContentBuilder addStatic(Row staticRow) throws TrieSpaceExhaustedException @@ -661,16 +735,34 @@ public ContentBuilder addStatic(Row staticRow) throws TrieSpaceExhaustedExceptio public ContentBuilder addRow(Row row) throws TrieSpaceExhaustedException { - putInTrie(comparator, useRecursive, trie, row); + putInTrie(metadata, comparator, trie, row); ++rowCountIncludingStatic; if (collectDataSize) dataSize += row.dataSize(); + if (!row.deletion().isLive()) + ++tombstoneCount; return this; } public ContentBuilder 
addRangeTombstoneMarker(RangeTombstoneMarker unfiltered) { - deletionBuilder.add(unfiltered); + if (openMarker != null) + { + // This will check that unfiltered closes openMarker + putRangeDeletionInTrie(comparator, trie, + isReverseOrder ? unfiltered : openMarker, + isReverseOrder ? openMarker : unfiltered); + ++tombstoneCount; // we only count one side of a range, to match DeletionInfo.rangeCount + if (unfiltered.isOpen(isReverseOrder)) + openMarker = unfiltered; + else + openMarker = null; + } + else + { + assert unfiltered.isOpen(isReverseOrder); + openMarker = unfiltered; + } return this; } @@ -684,13 +776,12 @@ public ContentBuilder addUnfiltered(Unfiltered unfiltered) throws TrieSpaceExhau public ContentBuilder complete() throws TrieSpaceExhaustedException { - MutableDeletionInfo deletionInfo = deletionBuilder.build(); - trie.putRecursive(ByteComparable.EMPTY, deletionInfo, NO_CONFLICT_RESOLVER); // will throw if called more than once - // dataSize does not include the deletion info bytes + assert openMarker == null; + trie.putRecursive(ByteComparable.EMPTY, PARTITION_MARKER, noConflictInData()); // will throw if called more than once return this; } - public Trie trie() + public InMemoryDeletionAwareTrie trie() { return trie; } @@ -700,6 +791,11 @@ public int rowCountIncludingStatic() return rowCountIncludingStatic; } + public int tombstoneCount() + { + return tombstoneCount; + } + public int dataSize() { assert collectDataSize; diff --git a/src/java/org/apache/cassandra/db/partitions/TrieBackedPartitionStage2.java b/src/java/org/apache/cassandra/db/partitions/TrieBackedPartitionStage2.java new file mode 100644 index 000000000000..da8f2478734f --- /dev/null +++ b/src/java/org/apache/cassandra/db/partitions/TrieBackedPartitionStage2.java @@ -0,0 +1,710 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.partitions; + +import java.util.Iterator; +import java.util.NavigableSet; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.primitives.Ints; + +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ClusteringBound; +import org.apache.cassandra.db.ClusteringComparator; +import org.apache.cassandra.db.ClusteringPrefix; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionInfo; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.IDataSize; +import org.apache.cassandra.db.LivenessInfo; +import org.apache.cassandra.db.MutableDeletionInfo; +import org.apache.cassandra.db.RangeTombstone; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.Slice; +import org.apache.cassandra.db.Slices; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.db.rows.AbstractUnfilteredRowIterator; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.ColumnData; +import org.apache.cassandra.db.rows.EncodingStats; +import 
org.apache.cassandra.db.rows.RangeTombstoneMarker; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.RowAndDeletionMergeIterator; +import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.Unfiltered; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; +import org.apache.cassandra.db.tries.Direction; +import org.apache.cassandra.db.tries.InMemoryTrie; +import org.apache.cassandra.db.tries.Trie; +import org.apache.cassandra.db.tries.TrieEntriesIterator; +import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.AbstractIterator; +import org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.btree.BTree; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.memory.Cloner; +import org.apache.cassandra.utils.memory.EnsureOnHeap; + +/** + * In-memory partition backed by a trie. The rows of the partition are values in the leaves of the trie, where the key + * to the row is only stored as the path to reach that leaf; static rows are also treated as a row with STATIC_CLUSTERING + * path; the deletion information is placed as a metadata object at the root of the trie -- this matches how Memtable + * stores partitions within the larger map, so that TrieBackedPartition objects can be created directly from Memtable + * tail tries. + * + * This object also holds the partition key, as well as some metadata (columns and statistics). + * + * Currently all descendants and instances of this class are immutable (even tail tries from mutable memtables are + * guaranteed to not change as we use forced copying below the partition level), though this may change in the future. 
+ */ +public class TrieBackedPartitionStage2 implements Partition +{ + /** + * If keys are no longer than this length, we will use a recursive procedure for inserting data when building the backing + * trie. + */ + @VisibleForTesting + public static final int MAX_RECURSIVE_KEY_LENGTH = 128; + + public static final ByteComparable.Version BYTE_COMPARABLE_VERSION = ByteComparable.Version.OSS50; + + /** Pre-made path for STATIC_CLUSTERING, to avoid creating path object when querying static path. */ + public static final ByteComparable STATIC_CLUSTERING_PATH = v -> ByteSource.oneByte(ClusteringPrefix.Kind.STATIC_CLUSTERING.asByteComparableValue(v)); + /** Pre-made path for BOTTOM, to avoid creating path object when iterating rows. */ + public static final ByteComparable BOTTOM_PATH = v -> ByteSource.oneByte(ClusteringPrefix.Kind.INCL_START_BOUND.asByteComparableValue(v)); + + /** + * The representation of a row stored at the leaf of a trie. Does not contain the row key. + * + * The method toRow combines this with a clustering for the represented Row. 
+ */ + public static class RowData implements IDataSize + { + final Object[] columnsBTree; + final LivenessInfo livenessInfo; + final DeletionTime deletion; + final long minLocalDeletionTime; + + RowData(Object[] columnsBTree, LivenessInfo livenessInfo, DeletionTime deletion) + { + this(columnsBTree, livenessInfo, deletion, BTreeRow.minDeletionTime(columnsBTree, livenessInfo, deletion)); + } + + RowData(Object[] columnsBTree, LivenessInfo livenessInfo, DeletionTime deletion, long minLocalDeletionTime) + { + this.columnsBTree = columnsBTree; + this.livenessInfo = livenessInfo; + this.deletion = deletion; + this.minLocalDeletionTime = minLocalDeletionTime; + } + + Row toRow(Clustering clustering) + { + return BTreeRow.create(clustering, + livenessInfo, + Row.Deletion.regular(deletion), + columnsBTree, + minLocalDeletionTime); + } + + public int dataSize() + { + int dataSize = livenessInfo.dataSize() + deletion.dataSize(); + + return Ints.checkedCast(BTree.accumulate(columnsBTree, (ColumnData cd, long v) -> v + cd.dataSize(), dataSize)); + } + + public long unsharedHeapSizeExcludingData() + { + long heapSize = EMPTY_ROWDATA_SIZE + + BTree.sizeOfStructureOnHeap(columnsBTree) + + livenessInfo.unsharedHeapSize() + + deletion.unsharedHeapSize(); + + return BTree.accumulate(columnsBTree, (ColumnData cd, long v) -> v + cd.unsharedHeapSizeExcludingData(), heapSize); + } + + public String toString() + { + return "row " + livenessInfo + " size " + dataSize(); + } + + public RowData clone(Cloner cloner) + { + Object[] tree = BTree.transform(columnsBTree, c -> c.clone(cloner)); + return new RowData(tree, livenessInfo, deletion, minLocalDeletionTime); + } + } + + private static final long EMPTY_ROWDATA_SIZE = ObjectSizes.measure(new RowData(null, null, null, 0)); + + protected final Trie trie; + protected final DecoratedKey partitionKey; + protected final TableMetadata metadata; + protected final RegularAndStaticColumns columns; + protected final EncodingStats stats; + protected 
final int rowCountIncludingStatic; + protected final boolean canHaveShadowedData; + + public TrieBackedPartitionStage2(DecoratedKey partitionKey, + RegularAndStaticColumns columns, + EncodingStats stats, + int rowCountIncludingStatic, + Trie trie, + TableMetadata metadata, + boolean canHaveShadowedData) + { + this.partitionKey = partitionKey; + this.trie = trie; + this.metadata = metadata; + this.columns = columns; + this.stats = stats; + this.rowCountIncludingStatic = rowCountIncludingStatic; + this.canHaveShadowedData = canHaveShadowedData; + // There must always be deletion info metadata. + // Note: we can't use deletionInfo() because WithEnsureOnHeap's override is not yet set up. + assert trie.get(ByteComparable.EMPTY) != null; + assert stats != null; + } + + public static TrieBackedPartitionStage2 fromIterator(UnfilteredRowIterator iterator) + { + ContentBuilder builder = build(iterator, false); + return new TrieBackedPartitionStage2(iterator.partitionKey(), + iterator.columns(), + iterator.stats(), + builder.rowCountIncludingStatic(), + builder.trie(), + iterator.metadata(), + false); + } + + protected static ContentBuilder build(UnfilteredRowIterator iterator, boolean collectDataSize) + { + try + { + ContentBuilder builder = new ContentBuilder(iterator.metadata(), iterator.partitionLevelDeletion(), iterator.isReverseOrder(), collectDataSize); + + builder.addStatic(iterator.staticRow()); + + while (iterator.hasNext()) + builder.addUnfiltered(iterator.next()); + + return builder.complete(); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + + /** + * Create a partition with the given properties and content, making sure to copy all off-heap data to keep it alive when + * the given access mode requires it. 
+ */ + public static TrieBackedPartitionStage2 create(DecoratedKey partitionKey, + RegularAndStaticColumns columnMetadata, + EncodingStats encodingStats, + int rowCountIncludingStatic, + Trie trie, + TableMetadata metadata, + EnsureOnHeap ensureOnHeap) + { + return ensureOnHeap == EnsureOnHeap.NOOP + ? new TrieBackedPartitionStage2(partitionKey, columnMetadata, encodingStats, rowCountIncludingStatic, trie, metadata, true) + : new WithEnsureOnHeap(partitionKey, columnMetadata, encodingStats, rowCountIncludingStatic, trie, metadata, true, ensureOnHeap); + } + + class RowIterator extends TrieEntriesIterator + { + public RowIterator(Trie trie, Direction direction) + { + super(trie, direction, RowData.class::isInstance); + } + + @Override + protected Row mapContent(Object content, byte[] bytes, int byteLength) + { + var rd = (RowData) content; + return toRow(rd, + metadata.comparator.clusteringFromByteComparable( + ByteBufferAccessor.instance, + ByteComparable.preencoded(BYTE_COMPARABLE_VERSION, bytes, 0, byteLength))); + } + } + + private Iterator rowIterator(Trie trie, Direction direction) + { + return new RowIterator(trie, direction); + } + + static RowData rowToData(Row row) + { + BTreeRow brow = (BTreeRow) row; + return new RowData(brow.getBTree(), row.primaryKeyLivenessInfo(), row.deletion().time(), brow.getMinLocalDeletionTime()); + } + + /** + * Conversion from RowData to Row. WithEnsureOnHeap overrides this to do the necessary copying + * (hence the non-static method). + */ + Row toRow(RowData data, Clustering clustering) + { + return data.toRow(clustering); + } + + /** + * Put the given row in the trie. 
+ * @param comparator for converting key to byte-comparable + * @param useRecursive whether the key length is guaranteed short and recursive put can be used + * @param trie destination + * @param row content to put + */ + protected static void putInTrie(ClusteringComparator comparator, boolean useRecursive, InMemoryTrie trie, Row row) throws TrieSpaceExhaustedException + { + trie.putSingleton(comparator.asByteComparable(row.clustering()), rowToData(row), NO_CONFLICT_RESOLVER, useRecursive); + } + + /** + * Check if we can use recursive operations when putting a value in tries. + * True if all types in the clustering keys are fixed length, and total size is small enough. + */ + protected static boolean useRecursive(ClusteringComparator comparator) + { + int length = 1; // terminator + for (AbstractType type : comparator.subtypes()) + if (!type.isValueLengthFixed()) + return false; + else + length += 1 + type.valueLengthIfFixed(); // separator + value + + return length <= MAX_RECURSIVE_KEY_LENGTH; + } + + public TableMetadata metadata() + { + return metadata; + } + + public DecoratedKey partitionKey() + { + return partitionKey; + } + + public DeletionTime partitionLevelDeletion() + { + return deletionInfo().getPartitionDeletion(); + } + + public RegularAndStaticColumns columns() + { + return columns; + } + + public EncodingStats stats() + { + return stats; + } + + public int rowCount() + { + return rowCountIncludingStatic - (hasStaticRow() ? 
1 : 0); + } + + public DeletionInfo deletionInfo() + { + return (DeletionInfo) trie.get(ByteComparable.EMPTY); + } + + public ByteComparable path(ClusteringPrefix clustering) + { + return metadata.comparator.asByteComparable(clustering); + } + + public Row staticRow() + { + RowData staticRow = (RowData) trie.get(STATIC_CLUSTERING_PATH); + + if (staticRow != null) + return toRow(staticRow, Clustering.STATIC_CLUSTERING); + else + return Rows.EMPTY_STATIC_ROW; + } + + public boolean isEmpty() + { + return rowCountIncludingStatic == 0 && deletionInfo().isLive(); + } + + private boolean hasStaticRow() + { + return trie.get(STATIC_CLUSTERING_PATH) != null; + } + + public boolean hasRows() + { + return rowCountIncludingStatic > 1 || rowCountIncludingStatic > 0 && !hasStaticRow(); + } + + /** + * Provides read access to the trie for users that can take advantage of it directly (e.g. Memtable). + */ + public Trie trie() + { + return trie; + } + + private Trie nonStaticSubtrie() + { + // skip static row if present - the static clustering sorts before BOTTOM so that it's never included in + // any slices (we achieve this by using the byte ByteSource.EXCLUDED for its representation, which is lower + // than BOTTOM's ByteSource.LT_NEXT_COMPONENT). + return trie.subtrie(BOTTOM_PATH, null); + } + + public Iterator rowIterator() + { + return rowIterator(nonStaticSubtrie(), Direction.FORWARD); + } + + public Iterator rowsIncludingStatic() + { + return rowIterator(trie, Direction.FORWARD); + } + + @Override + public Row lastRow() + { + Iterator reverseIterator = rowIterator(nonStaticSubtrie(), Direction.REVERSE); + return reverseIterator.hasNext() ? 
reverseIterator.next() : null; + } + + public Row getRow(Clustering clustering) + { + RowData data = (RowData) trie.get(path(clustering)); + + DeletionInfo deletionInfo = deletionInfo(); + RangeTombstone rt = deletionInfo.rangeCovering(clustering); + + // The trie only contains rows, so it cannot directly account for a deletion that should apply to a row + // (the partition deletion or the deletion of a range tombstone that covers it). So if need be, reuse the row + // deletion to carry the proper deletion on the row. + DeletionTime partitionDeletion = deletionInfo.getPartitionDeletion(); + DeletionTime activeDeletion = partitionDeletion; + if (rt != null && rt.deletionTime().supersedes(activeDeletion)) + activeDeletion = rt.deletionTime(); + + if (data == null) + { + // this means our partition level deletion supersedes all other deletions and we don't have to keep the row deletions + if (activeDeletion == partitionDeletion) + return null; + // no need to check activeDeletion.isLive here - if anything supersedes the partitionDeletion + // it must be non-live + return BTreeRow.emptyDeletedRow(clustering, Row.Deletion.regular(activeDeletion)); + } + + Row row = toRow(data, clustering); + if (!activeDeletion.isLive()) + row = row.filter(ColumnFilter.selection(columns()), activeDeletion, true, metadata()); + return row; + } + + public UnfilteredRowIterator unfilteredIterator() + { + return unfilteredIterator(ColumnFilter.selection(columns()), Slices.ALL, false); + } + + public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, Slices slices, boolean reversed) + { + Row staticRow = staticRow(selection, false); + if (slices.size() == 0) + { + DeletionTime partitionDeletion = deletionInfo().getPartitionDeletion(); + return UnfilteredRowIterators.noRowsIterator(metadata(), partitionKey(), staticRow, partitionDeletion, reversed); + } + + return slices.size() == 1 + ? 
sliceIterator(selection, slices.get(0), reversed, staticRow) + : new SlicesIterator(selection, slices, reversed, staticRow); + } + + public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, NavigableSet> clusteringsInQueryOrder, boolean reversed) + { + Row staticRow = staticRow(selection, false); + if (clusteringsInQueryOrder.isEmpty()) + { + DeletionTime partitionDeletion = deletionInfo().getPartitionDeletion(); + return UnfilteredRowIterators.noRowsIterator(metadata(), partitionKey(), staticRow, partitionDeletion, reversed); + } + + Iterator rowIter = new AbstractIterator() { + + Iterator> clusterings = clusteringsInQueryOrder.iterator(); + + @Override + protected Row computeNext() + { + while (clusterings.hasNext()) + { + Clustering clustering = clusterings.next(); + Object rowData = trie.get(path(clustering)); + if (rowData instanceof RowData) + return toRow((RowData) rowData, clustering); + } + return endOfData(); + } + }; + + // not using DeletionInfo.rangeCovering(Clustering), because it returns the original range tombstone, + // but we need DeletionInfo.rangeIterator(Set) that generates tombstones based on given clustering bound. + Iterator deleteIter = deletionInfo().rangeIterator(clusteringsInQueryOrder, reversed); + + return merge(rowIter, deleteIter, selection, reversed, staticRow); + } + + private UnfilteredRowIterator sliceIterator(ColumnFilter selection, Slice slice, boolean reversed, Row staticRow) + { + ClusteringBound start = slice.start(); + ClusteringBound end = slice.end() == ClusteringBound.TOP ? null : slice.end(); + Iterator rowIter = slice(start, end, reversed); + Iterator deleteIter = deletionInfo().rangeIterator(slice, reversed); + return merge(rowIter, deleteIter, selection, reversed, staticRow); + } + + private Iterator slice(ClusteringBound start, ClusteringBound end, boolean reversed) + { + ByteComparable endPath = end != null ? 
path(end) : null; + // use BOTTOM as bound to skip over static rows + ByteComparable startPath = start != null ? path(start) : BOTTOM_PATH; + return rowIterator(trie.subtrie(startPath, endPath), Direction.fromBoolean(reversed)); + } + + private Row staticRow(ColumnFilter columns, boolean setActiveDeletionToRow) + { + DeletionTime partitionDeletion = deletionInfo().getPartitionDeletion(); + Row staticRow = staticRow(); + if (columns.fetchedColumns().statics.isEmpty() || (staticRow.isEmpty() && partitionDeletion.isLive())) + return Rows.EMPTY_STATIC_ROW; + + Row row = staticRow.filter(columns, partitionDeletion, setActiveDeletionToRow, metadata()); + return row == null ? Rows.EMPTY_STATIC_ROW : row; + } + + private RowAndDeletionMergeIterator merge(Iterator rowIter, Iterator deleteIter, + ColumnFilter selection, boolean reversed, Row staticRow) + { + return new RowAndDeletionMergeIterator(metadata(), partitionKey(), deletionInfo().getPartitionDeletion(), + selection, staticRow, reversed, stats(), + rowIter, deleteIter, canHaveShadowedData); + } + + + @Override + public String toString() + { + return Partition.toString(this); + } + + class SlicesIterator extends AbstractUnfilteredRowIterator + { + private final Slices slices; + + private int idx; + private Iterator currentSlice; + private final ColumnFilter selection; + + private SlicesIterator(ColumnFilter selection, + Slices slices, + boolean isReversed, + Row staticRow) + { + super(TrieBackedPartitionStage2.this.metadata(), TrieBackedPartitionStage2.this.partitionKey(), + TrieBackedPartitionStage2.this.partitionLevelDeletion(), + selection.fetchedColumns(), staticRow, isReversed, TrieBackedPartitionStage2.this.stats()); + this.selection = selection; + this.slices = slices; + } + + protected Unfiltered computeNext() + { + while (true) + { + if (currentSlice == null) + { + if (idx >= slices.size()) + return endOfData(); + + int sliceIdx = isReverseOrder ? 
slices.size() - idx - 1 : idx; + currentSlice = sliceIterator(selection, slices.get(sliceIdx), isReverseOrder, Rows.EMPTY_STATIC_ROW); + idx++; + } + + if (currentSlice.hasNext()) + return currentSlice.next(); + + currentSlice = null; + } + } + } + + + /** + * A snapshot of the current TrieBackedPartition data, copied on heap when retrieved. + */ + private static final class WithEnsureOnHeap extends TrieBackedPartitionStage2 + { + final DeletionInfo onHeapDeletion; + EnsureOnHeap ensureOnHeap; + + public WithEnsureOnHeap(DecoratedKey partitionKey, + RegularAndStaticColumns columns, + EncodingStats stats, + int rowCountIncludingStatic, + Trie trie, + TableMetadata metadata, + boolean canHaveShadowedData, + EnsureOnHeap ensureOnHeap) + { + super(partitionKey, columns, stats, rowCountIncludingStatic, trie, metadata, canHaveShadowedData); + this.ensureOnHeap = ensureOnHeap; + this.onHeapDeletion = ensureOnHeap.applyToDeletionInfo(super.deletionInfo()); + } + + @Override + public Row toRow(RowData data, Clustering clustering) + { + return ensureOnHeap.applyToRow(super.toRow(data, clustering)); + } + + @Override + public DeletionInfo deletionInfo() + { + return onHeapDeletion; + } + } + + /** + * Resolver for operations with trie-backed partitions. We don't permit any overwrites/merges. + */ + public static final InMemoryTrie.UpsertTransformer NO_CONFLICT_RESOLVER = + (existing, update) -> + { + if (existing != null) + throw new AssertionError("Unique rows expected."); + return update; + }; + + /** + * Helper class for constructing tries and deletion info from an iterator or flowable partition. + * + * Note: This is not collecting any stats or columns! 
+ */ + public static class ContentBuilder + { + final TableMetadata metadata; + final ClusteringComparator comparator; + + private final MutableDeletionInfo.Builder deletionBuilder; + private final InMemoryTrie trie; + + private final boolean useRecursive; + private final boolean collectDataSize; + + private int rowCountIncludingStatic; + private long dataSize; + + public ContentBuilder(TableMetadata metadata, DeletionTime partitionLevelDeletion, boolean isReverseOrder, boolean collectDataSize) + { + this.metadata = metadata; + this.comparator = metadata.comparator; + + this.deletionBuilder = MutableDeletionInfo.builder(partitionLevelDeletion, + comparator, + isReverseOrder); + this.trie = InMemoryTrie.shortLived(BYTE_COMPARABLE_VERSION); + + this.useRecursive = useRecursive(comparator); + this.collectDataSize = collectDataSize; + + rowCountIncludingStatic = 0; + dataSize = 0; + } + + public ContentBuilder addStatic(Row staticRow) throws TrieSpaceExhaustedException + { + if (!staticRow.isEmpty()) + return addRow(staticRow); + else + return this; + } + + public ContentBuilder addRow(Row row) throws TrieSpaceExhaustedException + { + putInTrie(comparator, useRecursive, trie, row); + ++rowCountIncludingStatic; + if (collectDataSize) + dataSize += row.dataSize(); + return this; + } + + public ContentBuilder addRangeTombstoneMarker(RangeTombstoneMarker unfiltered) + { + deletionBuilder.add(unfiltered); + return this; + } + + public ContentBuilder addUnfiltered(Unfiltered unfiltered) throws TrieSpaceExhaustedException + { + if (unfiltered.kind() == Unfiltered.Kind.ROW) + return addRow((Row) unfiltered); + else + return addRangeTombstoneMarker((RangeTombstoneMarker) unfiltered); + } + + public ContentBuilder complete() throws TrieSpaceExhaustedException + { + MutableDeletionInfo deletionInfo = deletionBuilder.build(); + trie.putRecursive(ByteComparable.EMPTY, deletionInfo, NO_CONFLICT_RESOLVER); // will throw if called more than once + // dataSize does not include the 
deletion info bytes + return this; + } + + public Trie trie() + { + return trie; + } + + public int rowCountIncludingStatic() + { + return rowCountIncludingStatic; + } + + public int dataSize() + { + assert collectDataSize; + return Ints.saturatedCast(dataSize); + } + } +} diff --git a/src/java/org/apache/cassandra/db/partitions/TrieBackedPartitionStage3.java b/src/java/org/apache/cassandra/db/partitions/TrieBackedPartitionStage3.java new file mode 100644 index 000000000000..48dbaa7129dc --- /dev/null +++ b/src/java/org/apache/cassandra/db/partitions/TrieBackedPartitionStage3.java @@ -0,0 +1,902 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.partitions; + +import java.util.Iterator; +import java.util.NavigableSet; +import java.util.function.BiFunction; +import java.util.function.Predicate; + +import com.google.common.base.Predicates; +import com.google.common.primitives.Ints; + +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ClusteringComparator; +import org.apache.cassandra.db.ClusteringPrefix; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.IDataSize; +import org.apache.cassandra.db.LivenessInfo; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.Slice; +import org.apache.cassandra.db.Slices; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.db.rows.BTreeComplexColumn; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.ColumnData; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.RangeTombstoneMarker; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.TrieTombstoneMarker; +import org.apache.cassandra.db.rows.Unfiltered; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; +import org.apache.cassandra.db.tries.DeletionAwareTrie; +import org.apache.cassandra.db.tries.Direction; +import org.apache.cassandra.db.tries.InMemoryBaseTrie; +import org.apache.cassandra.db.tries.InMemoryDeletionAwareTrie; +import org.apache.cassandra.db.tries.InMemoryTrie; +import org.apache.cassandra.db.tries.RangeTrie; +import org.apache.cassandra.db.tries.TrieEntriesIterator; +import org.apache.cassandra.db.tries.TrieSet; +import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; +import 
org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.btree.BTree; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.memory.Cloner; +import org.apache.cassandra.utils.memory.EnsureOnHeap; + +/// In-memory partition backed by a deletion-aware trie. The rows of the partition are values in the leaves of the trie, +/// where the key to the row is only stored as the path to reach that leaf; static rows are also treated as a row with +/// `STATIC_CLUSTERING` path; the deletion information is placed in a deletion branch of the trie which starts at the +/// root of the partition. This matches how `TrieMemtable` stores partitions within the larger map, so that +/// `TrieBackedPartition` objects can be created directly from `TrieMemtable` tail tries. +/// +/// This object also holds the partition key, as well as some metadata (columns and statistics). +/// Currently, all descendants and instances of this class are immutable (even tail tries from mutable memtables are +/// guaranteed to not change as we use forced copying below the partition level), though this may change in the future. +public class TrieBackedPartitionStage3 implements Partition +{ + public static final ByteComparable.Version BYTE_COMPARABLE_VERSION = ByteComparable.Version.OSS50; + + /** Pre-made path for STATIC_CLUSTERING, to avoid creating path object when querying static path. */ + public static final ByteComparable STATIC_CLUSTERING_PATH = v -> ByteSource.oneByte(ClusteringPrefix.Kind.STATIC_CLUSTERING.asByteComparableValue(v)); + /** Pre-made path for BOTTOM, to avoid creating path object when iterating rows. 
*/ + public static final ByteComparable BOTTOM_PATH = v -> ByteSource.oneByte(ClusteringPrefix.Kind.INCL_START_BOUND.asByteComparableValue(v)); + + /// Interface implemented by partition markers, both the singleton below used for standalone [TrieBackedPartitionStage3], + /// and the marker used in tail tries in `TrieMemtable`s. + public interface PartitionMarker {} + + /// Singleton partition marker used for standalone [TrieBackedPartitionStage3] and [TriePartitionUpdateStage3] objects. + public static final PartitionMarker PARTITION_MARKER = new PartitionMarker() + { + public String toString() + { + return "PARTITION_MARKER"; + } + }; + + /// Predicate to identify partition boundaries in tries. This accepts any [PartitionMarker], not just the + /// [#PARTITION_MARKER] used for standalone trie-backed partitions. + public static final Predicate IS_PARTITION_BOUNDARY = TrieBackedPartitionStage3::isPartitionBoundary; + + /// Returns true if the given content is a partition marker. + public static boolean isPartitionBoundary(Object content) + { + return content instanceof TrieBackedPartitionStage3.PartitionMarker; + } + + /// The representation of a row stored at the leaf of a trie. Does not contain the row key. + /// + /// The method [#toRow] combines this with a clustering for the represented [Row]. 
+ public static class RowData implements IDataSize + { + final Object[] columnsBTree; + final LivenessInfo livenessInfo; + final long minLocalDeletionTime; + + RowData(Object[] columnsBTree, LivenessInfo livenessInfo) + { + this(columnsBTree, livenessInfo, BTreeRow.minDeletionTime(columnsBTree, livenessInfo, DeletionTime.LIVE)); + } + + RowData(Object[] columnsBTree, LivenessInfo livenessInfo, long minLocalDeletionTime) + { + this.columnsBTree = columnsBTree; + this.livenessInfo = livenessInfo; + this.minLocalDeletionTime = minLocalDeletionTime; + } + + Row toRow(Clustering clustering, DeletionTime deletion) + { + return BTreeRow.create(clustering, + livenessInfo, + Row.Deletion.regular(deletion), + columnsBTree, + minLocalDeletionTime); + } + + public int dataSize() + { + int dataSize = livenessInfo.dataSize(); + + return Ints.checkedCast(BTree.accumulate(columnsBTree, (ColumnData cd, long v) -> v + cd.dataSize(), dataSize)); + } + + public long unsharedHeapSizeExcludingData() + { + long heapSize = EMPTY_ROWDATA_SIZE + + BTree.sizeOfStructureOnHeap(columnsBTree) + + livenessInfo.unsharedHeapSize(); + + return BTree.accumulate(columnsBTree, (ColumnData cd, long v) -> v + cd.unsharedHeapSizeExcludingData(), heapSize); + } + + public String toString() + { + return "row " + livenessInfo + " size " + dataSize() + ": " + BTree.toString(columnsBTree); + } + + public RowData clone(Cloner cloner) + { + Object[] tree = BTree.transform(columnsBTree, c -> c.clone(cloner)); + return new RowData(tree, livenessInfo, minLocalDeletionTime); + } + + public RowData delete(DeletionTime activeDeletion) + { + LivenessInfo newLiveness = livenessInfo; + if (activeDeletion.deletes(livenessInfo.timestamp())) + newLiveness = LivenessInfo.EMPTY; + + Object[] newBTree = BTree.transformAndFilter(columnsBTree, cd -> + { + ColumnMetadata column = cd.column(); + if (column.isComplex()) + return ((BTreeComplexColumn) cd).delete(activeDeletion); + + Cell cell = (Cell) cd; + return 
activeDeletion.deletes(cell) ? null : cell; + }); + + if (newLiveness == livenessInfo && newBTree == columnsBTree) + return this; + if (newLiveness.isEmpty() && newBTree == BTree.empty()) + return null; + return new RowData(newBTree, newLiveness); + } + } + + private static final long EMPTY_ROWDATA_SIZE = ObjectSizes.measure(new RowData(null, null, 0)); + + protected final DeletionAwareTrie trie; + protected final DecoratedKey partitionKey; + protected final TableMetadata metadata; + protected final RegularAndStaticColumns columns; + protected final EncodingStats stats; + protected final int rowCountIncludingStatic; + protected final int tombstoneCount; + + public TrieBackedPartitionStage3(DecoratedKey partitionKey, + RegularAndStaticColumns columns, + EncodingStats stats, + int rowCountIncludingStatic, + int tombstoneCount, + DeletionAwareTrie trie, + TableMetadata metadata) + { + this.partitionKey = partitionKey; + this.trie = trie; + this.metadata = metadata; + this.columns = columns; + this.stats = stats; + this.rowCountIncludingStatic = rowCountIncludingStatic; + this.tombstoneCount = tombstoneCount; + // There must always be a partition marker. 
+ assert trie.get(ByteComparable.EMPTY) != null; + assert stats != null; + } + + public static TrieBackedPartitionStage3 fromIterator(UnfilteredRowIterator iterator) + { + ContentBuilder builder = build(iterator, false); + return new TrieBackedPartitionStage3(iterator.partitionKey(), + iterator.columns(), + iterator.stats(), + builder.rowCountIncludingStatic(), + builder.tombstoneCount(), + builder.trie(), + iterator.metadata()); + } + + protected static ContentBuilder build(UnfilteredRowIterator iterator, boolean collectDataSize) + { + try + { + ContentBuilder builder = new ContentBuilder(iterator.metadata(), iterator.partitionLevelDeletion(), iterator.isReverseOrder(), collectDataSize); + + builder.addStatic(iterator.staticRow()); + + while (iterator.hasNext()) + builder.addUnfiltered(iterator.next()); + + return builder.complete(); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + + /// Create a partition with the given properties and content, making sure to copy all off-heap data to keep it alive when + /// the given access mode requires it. + public static TrieBackedPartitionStage3 create(DecoratedKey partitionKey, + RegularAndStaticColumns columnMetadata, + EncodingStats encodingStats, + int rowCountIncludingStatic, + int tombstoneCount, + DeletionAwareTrie trie, + TableMetadata metadata, + EnsureOnHeap ensureOnHeap) + { + return ensureOnHeap == EnsureOnHeap.NOOP + ? new TrieBackedPartitionStage3(partitionKey, columnMetadata, encodingStats, rowCountIncludingStatic, tombstoneCount, trie, metadata) + : new WithEnsureOnHeap(partitionKey, columnMetadata, encodingStats, rowCountIncludingStatic, tombstoneCount, trie, metadata, ensureOnHeap); + } + + class RowIterator extends TrieEntriesIterator.WithNullFiltering + { + public RowIterator(DeletionAwareTrie trie, Direction direction) + { + // Even though this is a row iterator, it must list deleted rows. 
+ super(trie.mergedTrie(TrieBackedPartitionStage3::combineDataAndDeletion), direction); + } + + @Override + protected Row mapContent(Object content, byte[] bytes, int byteLength) + { + if (content instanceof RowData) + return toRow((RowData) content, + getClustering(bytes, byteLength)); + if (content instanceof Row) + { + BTreeRow row = (BTreeRow) content; + return BTreeRow.create(getClustering(bytes, byteLength), + row.primaryKeyLivenessInfo(), + row.deletion(), + row.getBTree(), + row.getMinLocalDeletionTime()); + } + + TrieTombstoneMarker marker = (TrieTombstoneMarker) content; + DeletionTime pointDeletion = marker.pointDeletion(); + if (pointDeletion != null) + return BTreeRow.emptyDeletedRow(getClustering(bytes, byteLength), + Row.Deletion.regular(marker.pointDeletion())); + else + return null; + } + } + + private Iterator rowIterator(DeletionAwareTrie trie, Direction direction) + { + return new RowIterator(trie, direction); + } + + static RowData rowToData(Row row) + { + BTreeRow brow = (BTreeRow) row; + return new RowData(brow.getBTree(), row.primaryKeyLivenessInfo(), brow.getMinLocalDeletionTime()); + } + + /// Conversion from [RowData] to [Row]. [WithEnsureOnHeap] overrides this to do the necessary copying + /// (hence the non-static method). + Row toRow(RowData data, Clustering clustering) + { + return data.toRow(clustering, DeletionTime.LIVE); + } + + /// Put the given unfiltered in the trie, used by methods to build stand-alone partitions. + /// + /// @param comparator for converting key to byte-comparable + /// @param trie destination + /// @param row content to put + protected static void putInTrie(ClusteringComparator comparator, InMemoryDeletionAwareTrie trie, Row row) + throws TrieSpaceExhaustedException + { + // We do not look for atomicity here, so can do the two steps separately. 
+ Clustering clustering = row.clustering(); + DeletionTime deletionTime = row.deletion().time(); + InMemoryDeletionAwareTrie.Mutator m = makeMutator(trie); + + ByteComparable comparableClustering = comparator.asByteComparable(clustering); + if (!deletionTime.isLive()) + { + m.delete(RangeTrie.point(comparableClustering, + BYTE_COMPARABLE_VERSION, + true, + TrieTombstoneMarker.point(deletionTime, TrieTombstoneMarker.Kind.ROW))); + } + if (!row.isEmptyAfterDeletion()) + m.apply(DeletionAwareTrie.singleton(comparableClustering, BYTE_COMPARABLE_VERSION, rowToData(row))); + } + + private static InMemoryDeletionAwareTrie.Mutator makeMutator(InMemoryDeletionAwareTrie trie) throws TrieSpaceExhaustedException + { + return trie.mutator(noConflictInData(), + mergeTombstoneRanges(), + noIncomingSelfDeletion(), + noExistingSelfDeletion(), + true, + Predicates.alwaysFalse()); + } + + protected static void putMarkerInTrie(ClusteringComparator comparator, + InMemoryDeletionAwareTrie trie, + RangeTombstoneMarker openMarker, + RangeTombstoneMarker closeMarker) + { + DeletionTime deletionTime = openMarker.openDeletionTime(false); + assert deletionTime.equals(closeMarker.closeDeletionTime(false)); + putDeletionInTrie(trie, + comparator.asByteComparable(openMarker.clustering()), + comparator.asByteComparable(closeMarker.clustering()), + deletionTime); + } + + protected static void putPartitionDeletionInTrie(InMemoryDeletionAwareTrie trie, + DeletionTime deletionTime) + { + try + { + makeMutator(trie).delete(RangeTrie.branch(ByteComparable.EMPTY, + BYTE_COMPARABLE_VERSION, + TrieTombstoneMarker.covering(deletionTime, TrieTombstoneMarker.Kind.PARTITION))); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + + static void putDeletionInTrie(InMemoryDeletionAwareTrie trie, + ByteComparable start, + ByteComparable end, + DeletionTime deletionTime) + { + try + { + makeMutator(trie).delete(RangeTrie.range(start, true, + end, false, + 
BYTE_COMPARABLE_VERSION, + TrieTombstoneMarker.covering(deletionTime, TrieTombstoneMarker.Kind.RANGE))); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + + + public TableMetadata metadata() + { + return metadata; + } + + public DecoratedKey partitionKey() + { + return partitionKey; + } + + public DeletionTime partitionLevelDeletion() + { + return TrieTombstoneMarker.applicableDeletionOrLive(trie, ByteComparable.EMPTY); + } + + public RegularAndStaticColumns columns() + { + return columns; + } + + public EncodingStats stats() + { + return stats; + } + + public int rowCount() + { + return rowCountIncludingStatic - (hasStaticRow() ? 1 : 0); + } + + public ByteComparable path(ClusteringPrefix clustering) + { + return metadata.comparator.asByteComparable(clustering); + } + + public Row staticRow() + { + // Static rows can only be deleted via the partition deletion. There is no need to check and apply that here. + RowData staticRow = (RowData) trie.get(STATIC_CLUSTERING_PATH); + return staticRow != null ? staticRow.toRow(Clustering.STATIC_CLUSTERING, DeletionTime.LIVE) : Rows.EMPTY_STATIC_ROW; + } + + public boolean isEmpty() + { + return rowCountIncludingStatic + tombstoneCount == 0; + } + + private boolean hasStaticRow() + { + return trie.get(STATIC_CLUSTERING_PATH) != null; + } + + public boolean hasRows() + { + return rowCountIncludingStatic > 1 || rowCountIncludingStatic > 0 && !hasStaticRow(); + } + + /// Provides read access to the trie for users that can take advantage of it directly (e.g. `TrieMemtable`). + public DeletionAwareTrie trie() + { + return trie; + } + + private DeletionAwareTrie nonStaticSubtrie() + { + // skip static row if present - the static clustering sorts before BOTTOM so that it's never included in + // any slices (we achieve this by using the byte ByteSource.EXCLUDED for its representation, which is lower + // than BOTTOM's ByteSource.LT_NEXT_COMPONENT). 
+ return trie.subtrie(BOTTOM_PATH, null); + } + + public Iterator rowIterator() + { + return rowIterator(nonStaticSubtrie(), Direction.FORWARD); + } + + public Iterator rowsIncludingStatic() + { + return rowIterator(trie, Direction.FORWARD); + } + + @Override + public Row lastRow() + { + Iterator reverseIterator = rowIterator(nonStaticSubtrie(), Direction.REVERSE); + return reverseIterator.hasNext() ? reverseIterator.next() : null; + } + + public Row getRow(Clustering clustering) + { + return getRow(clustering, path(clustering)); + } + + public Row getRow(Clustering clustering, ByteComparable path) + { + RowData data = (RowData) trie.get(path); + DeletionTime deletion = TrieTombstoneMarker.applicableDeletionOrLive(trie, path); + if (data != null) + return data.toRow(clustering, deletion); + else if (!deletion.isLive()) + return BTreeRow.emptyDeletedRow(clustering, Row.Deletion.regular(deletion)); + else + return null; + } + + public UnfilteredRowIterator unfilteredIterator() + { + return unfilteredIterator(ColumnFilter.selection(columns()), Slices.ALL, false); + } + + public static Object combineDataAndDeletion(Object data, TrieTombstoneMarker deletion) + { + if (data == null) + return deletion; // Range or partitions tombstones will follow this path. + // drop the PartitionMarker + if (data instanceof PartitionMarker) + return deletion; + + if (deletion == null) + return data; + // mergedTrie will give the covering deletion for any row it reports (i.e. active range or partition deletion); + // ignore it as we don't want to change rows' deletion time to apply it. + if (!deletion.isBoundary()) + return data; + // Tombstone boundaries have different clustering positions than rows; the only boundary that can match the + // position of a row is a point deletion. + DeletionTime delTime = deletion.pointDeletion(); + assert delTime != null + : "Deletion tombstone boundary " + deletion + " clashes with row " + data; + + // This is a row combined with a point deletion. 
+ RowData rowData = (RowData) data; + return rowData.toRow(Clustering.EMPTY, delTime); + } + + private Clustering getClustering(byte[] bytes, int byteLength) + { + return metadata.comparator.clusteringFromByteComparable(ByteBufferAccessor.instance, + ByteComparable.preencoded(BYTE_COMPARABLE_VERSION, + bytes, 0, byteLength), + BYTE_COMPARABLE_VERSION); + } + + /// Implementation of [UnfilteredRowIterator] for this partition. + /// + /// Currently, this implementation has to revert the transformation done to partition-level deletions. To do that, + /// we extract the partition-level deletion from its coverage of the static row and filter out tombstone ranges that + /// switch to it. + class UnfilteredIterator + extends TrieEntriesIterator.WithNullFiltering + implements UnfilteredRowIterator + { + final boolean reversed; + final ColumnFilter selection; + final DeletionTime partitionLevelDeletion; + final DeletionAwareTrie trie; + final Row staticRow; + + protected UnfilteredIterator(ColumnFilter selection, DeletionAwareTrie trie, boolean reversed) + { + this(selection, trie, reversed, TrieBackedPartitionStage3.this.partitionLevelDeletion()); + } + + private UnfilteredIterator(ColumnFilter selection, DeletionAwareTrie trie, boolean reversed, DeletionTime partitionLevelDeletion) + { + super(trie.mergedTrieSwitchable(TrieBackedPartitionStage3::combineDataAndDeletion), + Direction.fromBoolean(reversed)); + this.trie = trie; + this.selection = selection; + this.reversed = reversed; + this.partitionLevelDeletion = partitionLevelDeletion; + Row staticRow = TrieBackedPartitionStage3.this.staticRow().filter(selection, metadata()); + this.staticRow = staticRow != null ? 
staticRow : Rows.EMPTY_STATIC_ROW; + } + + @Override + protected Unfiltered mapContent(Object content, byte[] bytes, int byteLength) + { + if (content instanceof RowData) + return toRow((RowData) content, + getClustering(bytes, byteLength)) // deletion is given as range tombstone + .filter(selection, metadata()); + if (content instanceof Row) + { + BTreeRow row = (BTreeRow) content; + return BTreeRow.create(getClustering(bytes, byteLength), + row.primaryKeyLivenessInfo(), + row.deletion(), + row.getBTree(), + row.getMinLocalDeletionTime()) + .filter(selection, metadata()); + } + + TrieTombstoneMarker marker = (TrieTombstoneMarker) content; + DeletionTime pointDeletion = marker.pointDeletion(); + if (pointDeletion != null) + return BTreeRow.emptyDeletedRow(getClustering(bytes, byteLength), + Row.Deletion.regular(pointDeletion)); + else if (byteLength > 0) + return ((TrieTombstoneMarker) content).toRangeTombstoneMarker( + ByteComparable.preencoded(BYTE_COMPARABLE_VERSION, bytes, 0, byteLength), + BYTE_COMPARABLE_VERSION, + metadata.comparator); + else // partition deletion markers do not need to be presented + return null; + } + + @Override + public DeletionTime partitionLevelDeletion() + { + return partitionLevelDeletion; + } + + @Override + public EncodingStats stats() + { + return stats; + } + + @Override + public TableMetadata metadata() + { + return metadata; + } + + @Override + public boolean isReverseOrder() + { + return reversed; + } + + @Override + public RegularAndStaticColumns columns() + { + return selection.fetchedColumns(); + } + + @Override + public DecoratedKey partitionKey() + { + return partitionKey; + } + + @Override + public Row staticRow() + { + return staticRow; + } + + @Override + public void close() + { + // nothing to close + } + + @Override + public boolean stopIssuingTombstones() + { + ((DeletionAwareTrie.DeletionsStopControl) cursor).stopIssuingDeletions(this); + + Unfiltered next = peekNextIfAvailable(); + if (next != null && 
next.isRangeTombstoneMarker()) + consumeNext(); + return true; + } + } + + public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, ByteComparable[] bounds, boolean reversed) + { + if (bounds.length == 0) + return UnfilteredRowIterators.noRowsIterator(metadata, partitionKey, staticRow(), partitionLevelDeletion(), reversed); + + DeletionAwareTrie slicedTrie = + trie.intersect(TrieSet.ranges(BYTE_COMPARABLE_VERSION, bounds)); + return new UnfilteredIterator(selection, slicedTrie, reversed); + } + + public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, Slices slices, boolean reversed) + { + ByteComparable[] bounds = new ByteComparable[slices.size() * 2]; + int index = 0; + for (Slice slice : slices) + { + bounds[index++] = metadata.comparator.asByteComparable(slice.start()); + bounds[index++] = metadata.comparator.asByteComparable(slice.end()); + } + return unfilteredIterator(selection, bounds, reversed); + } + + public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, NavigableSet<Clustering<?>> clusteringsInQueryOrder, boolean reversed) + { + ByteComparable[] bounds = new ByteComparable[clusteringsInQueryOrder.size() * 2]; + // Trie intersection requires the boundaries to be given in forward order. Our clusterings are given in query + // order, which is why we have to reverse them if we are making a reversed iterator. + int index = reversed ? (clusteringsInQueryOrder.size() - 1) * 2 : 0; + int indexInc = reversed ? -2 : +2; + for (Clustering clustering : clusteringsInQueryOrder) + { + bounds[index + 0] = metadata.comparator.asByteComparable(clustering.asStartBound()); + bounds[index + 1] = metadata.comparator.asByteComparable(clustering.asEndBound()); + index += indexInc; + } + return unfilteredIterator(selection, bounds, reversed); + } + + @Override + public String toString() + { + return Partition.toString(this); + } + + /// A snapshot of the current [TrieBackedPartitionStage3] data, copied on heap when retrieved. 
+ private static final class WithEnsureOnHeap extends TrieBackedPartitionStage3 + { + private final EnsureOnHeap ensureOnHeap; + + public WithEnsureOnHeap(DecoratedKey partitionKey, + RegularAndStaticColumns columns, + EncodingStats stats, + int rowCountIncludingStatic, + int tombstoneCount, + DeletionAwareTrie trie, + TableMetadata metadata, + EnsureOnHeap ensureOnHeap) + { + super(partitionKey, columns, stats, rowCountIncludingStatic, tombstoneCount, trie, metadata); + this.ensureOnHeap = ensureOnHeap; + } + + @Override + public Row toRow(RowData data, Clustering clustering) + { + return ensureOnHeap.applyToRow(super.toRow(data, clustering)); + } + } + + /// Resolver for operations with trie-backed partitions. We don't permit any overwrites/merges. + @SuppressWarnings("rawtypes") + private static final InMemoryTrie.UpsertTransformer NO_CONFLICT_RESOLVER = + (existing, update) -> + { + if (existing != null) + throw new AssertionError("Unique rows expected."); + return update; + }; + + /// Resolver for data in trie-backed partitions. We don't permit any overwrites/merges. + @SuppressWarnings("unchecked") + public static InMemoryTrie.UpsertTransformer noConflictInData() + { + return NO_CONFLICT_RESOLVER; + } + + /// Tombstone merging resolver. Even though we don't support overwrites, we get requests to add the two sides + /// of a boundary separately and must join them. + private static final InMemoryTrie.UpsertTransformer MERGE_TOMBSTONE_RANGES = + (existing, update) -> existing != null ? existing.mergeWith(update) : update; + + /// Tombstone merging resolver. Even though we don't support overwrites, we get requests to add the two sides + /// of a boundary separately and must join them. 
+ public static InMemoryTrie.UpsertTransformer mergeTombstoneRanges() + { + return MERGE_TOMBSTONE_RANGES; + } + + private static final InMemoryBaseTrie.UpsertTransformer IGNORE_UPDATE = (left, right) -> left; + + /// Resolver for applying incoming deletions to existing data in trie-backed partitions. We assume that the data is + /// not affected by the deletion. + public static InMemoryTrie.UpsertTransformer noIncomingSelfDeletion() + { + return IGNORE_UPDATE; + } + + private static final BiFunction IGNORE_EXISTING = (left, right) -> right; + + /// Resolver for applying existing deletions to incoming data in trie-backed partitions. We assume that the data is + /// not affected by the deletion. + public static BiFunction noExistingSelfDeletion() + { + return IGNORE_EXISTING; + } + + /// Helper class for constructing tries and deletion info from an iterator. + public static class ContentBuilder + { + final TableMetadata metadata; + final ClusteringComparator comparator; + + private final InMemoryDeletionAwareTrie trie; + + private final boolean collectDataSize; + + private int rowCountIncludingStatic; + private int tombstoneCount; + private long dataSize; + + private RangeTombstoneMarker openMarker = null; + private final boolean isReverseOrder; + + public ContentBuilder(TableMetadata metadata, DeletionTime partitionLevelDeletion, boolean isReverseOrder, boolean collectDataSize) + { + this.metadata = metadata; + this.comparator = metadata.comparator; + + this.trie = InMemoryDeletionAwareTrie.shortLived(BYTE_COMPARABLE_VERSION); + + this.collectDataSize = collectDataSize; + + rowCountIncludingStatic = 0; + tombstoneCount = 0; + dataSize = 0; + this.isReverseOrder = isReverseOrder; + + if (!partitionLevelDeletion.isLive()) + { + putPartitionDeletionInTrie(trie, partitionLevelDeletion); + ++tombstoneCount; + } + } + + public ContentBuilder addStatic(Row staticRow) throws TrieSpaceExhaustedException + { + if (!staticRow.isEmpty()) + return addRow(staticRow); + else + 
return this; + } + + public ContentBuilder addRow(Row row) throws TrieSpaceExhaustedException + { + putInTrie(comparator, trie, row); + ++rowCountIncludingStatic; + if (collectDataSize) + dataSize += row.dataSize(); + if (!row.deletion().isLive()) + ++tombstoneCount; + return this; + } + + public ContentBuilder addRangeTombstoneMarker(RangeTombstoneMarker unfiltered) + { + if (openMarker != null) + { + // This will check that unfiltered closes openMarker + putMarkerInTrie(comparator, trie, + isReverseOrder ? unfiltered : openMarker, + isReverseOrder ? openMarker : unfiltered); + ++tombstoneCount; + if (unfiltered.isOpen(isReverseOrder)) + openMarker = unfiltered; + else + openMarker = null; + } + else + { + assert unfiltered.isOpen(isReverseOrder); + openMarker = unfiltered; + } + return this; + } + + public ContentBuilder addUnfiltered(Unfiltered unfiltered) throws TrieSpaceExhaustedException + { + if (unfiltered.kind() == Unfiltered.Kind.ROW) + return addRow((Row) unfiltered); + else + return addRangeTombstoneMarker((RangeTombstoneMarker) unfiltered); + } + + public ContentBuilder complete() throws TrieSpaceExhaustedException + { + assert openMarker == null; + trie.putRecursive(ByteComparable.EMPTY, PARTITION_MARKER, noConflictInData()); // will throw if called more than once + return this; + } + + public DeletionAwareTrie trie() + { + return trie; + } + + public int rowCountIncludingStatic() + { + return rowCountIncludingStatic; + } + + public int tombstoneCount() + { + return tombstoneCount; + } + + public int dataSize() + { + assert collectDataSize; + return Ints.saturatedCast(dataSize); + } + } +} diff --git a/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdate.java b/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdate.java index bd5e120b9dd0..8c477532a360 100644 --- a/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdate.java +++ b/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdate.java @@ -19,17 +19,21 @@ 
import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.EnumSet; import java.util.HashSet; import java.util.Iterator; import java.util.List; +import java.util.Map; import java.util.Set; -import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Predicates; import com.google.common.collect.Iterators; import com.google.common.primitives.Ints; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import net.openhft.chronicle.values.NotNull; +import org.apache.cassandra.db.Clustering; import org.apache.cassandra.db.Columns; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.DeletionInfo; @@ -40,19 +44,26 @@ import org.apache.cassandra.db.RegularAndStaticColumns; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellPath; +import org.apache.cassandra.db.rows.Cells; import org.apache.cassandra.db.rows.ColumnData; import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.RangeTombstoneMarker; import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.TrieBackedRow; +import org.apache.cassandra.db.rows.TrieTombstoneMarker; import org.apache.cassandra.db.rows.UnfilteredRowIterator; import org.apache.cassandra.db.rows.UnfilteredRowIterators; -import org.apache.cassandra.db.tries.InMemoryTrie; -import org.apache.cassandra.db.tries.Trie; +import org.apache.cassandra.db.tries.DeletionAwareTrie; +import org.apache.cassandra.db.tries.Direction; +import org.apache.cassandra.db.tries.InMemoryDeletionAwareTrie; +import org.apache.cassandra.db.tries.RangeTrie; import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.utils.btree.BTree; import org.apache.cassandra.utils.bytecomparable.ByteComparable; 
+import org.apache.cassandra.utils.bytecomparable.ByteSource; /** * A trie-backed PartitionUpdate. Immutable. @@ -67,6 +78,9 @@ public class TriePartitionUpdate extends TrieBackedPartition implements Partitio public static final Factory FACTORY = new TrieFactory(); + private static EnumSet ROW_DELETION_KINDS = + EnumSet.of(TrieTombstoneMarker.Kind.ROW, TrieTombstoneMarker.Kind.RANGE, TrieTombstoneMarker.Kind.PARTITION); + final int dataSize; private TriePartitionUpdate(TableMetadata metadata, @@ -74,11 +88,11 @@ private TriePartitionUpdate(TableMetadata metadata, RegularAndStaticColumns columns, EncodingStats stats, int rowCountIncludingStatic, + int tombstoneCount, int dataSize, - Trie trie, - boolean canHaveShadowedData) + InMemoryDeletionAwareTrie trie) { - super(key, columns, stats, rowCountIncludingStatic, trie, metadata, canHaveShadowedData); + super(key, columns, stats, rowCountIncludingStatic, tombstoneCount, trie, metadata); this.dataSize = dataSize; } @@ -88,6 +102,7 @@ public boolean equals(Object obj) if (!(obj instanceof TriePartitionUpdate)) return false; + // FIXME TriePartitionUpdate that = (TriePartitionUpdate) obj; return partitionKey.equals(that.partitionKey) && metadata().id.equals(that.metadata().id) @@ -97,12 +112,12 @@ && staticRow().equals(that.staticRow()) } - private static InMemoryTrie newTrie(DeletionInfo deletion) + private static InMemoryDeletionAwareTrie newTrie() { - InMemoryTrie trie = InMemoryTrie.shortLived(BYTE_COMPARABLE_VERSION); + InMemoryDeletionAwareTrie trie = TrieBackedRow.newTrie(); try { - trie.putRecursive(ByteComparable.EMPTY, deletion, NO_CONFLICT_RESOLVER); + trie.putRecursive(ByteComparable.EMPTY, PARTITION_MARKER, noConflictInData()); } catch (TrieSpaceExhaustedException e) { @@ -111,14 +126,7 @@ private static InMemoryTrie newTrie(DeletionInfo deletion) return trie; } - /** - * Creates a empty immutable partition update. - * - * @param metadata the metadata for the created update. 
- * @param key the partition key for the created update. - * - * @return the newly created empty (and immutable) update. - */ + /** @see PartitionUpdate.Factory#emptyUpdate */ public static TriePartitionUpdate emptyUpdate(TableMetadata metadata, DecoratedKey key) { return new TriePartitionUpdate(metadata, @@ -127,46 +135,30 @@ public static TriePartitionUpdate emptyUpdate(TableMetadata metadata, DecoratedK EncodingStats.NO_STATS, 0, 0, - newTrie(MutableDeletionInfo.live()), - false); + 0, + newTrie()); } - /** - * Creates an immutable partition update that entirely deletes a given partition. - * - * @param metadata the metadata for the created update. - * @param key the partition key for the partition that the created update should delete. - * @param timestamp the timestamp for the deletion. - * @param nowInSec the current time in seconds to use as local deletion time for the partition deletion. - * - * @return the newly created partition deletion update. - */ + /** @see PartitionUpdate.Factory#fullPartitionDelete */ public static TriePartitionUpdate fullPartitionDelete(TableMetadata metadata, DecoratedKey key, long timestamp, long nowInSec) { - MutableDeletionInfo deletion = new MutableDeletionInfo(timestamp, nowInSec); + InMemoryDeletionAwareTrie trie = newTrie(); + putPartitionDeletionInTrie(trie, DeletionTime.build(timestamp, nowInSec)); return new TriePartitionUpdate(metadata, key, RegularAndStaticColumns.NONE, new EncodingStats(timestamp, nowInSec, LivenessInfo.NO_TTL), 0, + 1, 0, - newTrie(deletion), - false); + trie); } - /** - * Creates an immutable partition update that contains a single row update. - * - * @param metadata the metadata for the created update. - * @param key the partition key for the partition to update. - * @param row the row for the update, may be a regular or static row and cannot be null. - * - * @return the newly created partition update containing only {@code row}. 
- */ + /** @see PartitionUpdate.Factory#singleRowUpdate */ public static TriePartitionUpdate singleRowUpdate(TableMetadata metadata, DecoratedKey key, Row row) { - EncodingStats stats = EncodingStats.Collector.forRow(row); - InMemoryTrie trie = newTrie(DeletionInfo.LIVE); + EncodingStats stats = row.isEmpty() ? EncodingStats.NO_STATS : EncodingStats.Collector.forRow(row); + InMemoryDeletionAwareTrie trie = newTrie(); RegularAndStaticColumns columns; if (row.isStatic()) @@ -176,38 +168,17 @@ public static TriePartitionUpdate singleRowUpdate(TableMetadata metadata, Decora try { - putInTrie(metadata.comparator, useRecursive(metadata.comparator), trie, row); + putInTrie(metadata, metadata.comparator, trie, row); } catch (TrieSpaceExhaustedException e) { throw new AssertionError(e); } - return new TriePartitionUpdate(metadata, key, columns, stats, 1, row.dataSize(), trie, false); + return new TriePartitionUpdate(metadata, key, columns, stats, 1, row.deletion().isLive() ? 0 : 1, row.dataSize(), trie); } - /** - * Creates an immutable partition update that contains a single row update. - * - * @param metadata the metadata for the created update. - * @param key the partition key for the partition to update. - * @param row the row for the update. - * - * @return the newly created partition update containing only {@code row}. - */ - public static TriePartitionUpdate singleRowUpdate(TableMetadata metadata, ByteBuffer key, Row row) - { - return singleRowUpdate(metadata, metadata.partitioner.decorateKey(key), row); - } - - /** - * Turns the given iterator into an update. - * - * @param iterator the iterator to turn into updates. - * - * Warning: this method does not close the provided iterator, it is up to - * the caller to close it. 
- */ + /** @see PartitionUpdate.Factory#fromIterator(UnfilteredRowIterator) */ @SuppressWarnings("resource") public static TriePartitionUpdate fromIterator(UnfilteredRowIterator iterator) { @@ -218,11 +189,14 @@ public static TriePartitionUpdate fromIterator(UnfilteredRowIterator iterator) iterator.columns(), iterator.stats(), builder.rowCountIncludingStatic(), + builder.tombstoneCount(), builder.dataSize(), - builder.trie(), - false); + builder.trie()); } + /** + * Convert the given update (with unknown implementation type) to a TriePartitionUpdate for insertion in a trie. + */ public static TriePartitionUpdate asTrieUpdate(PartitionUpdate update) { if (update instanceof TriePartitionUpdate) @@ -234,128 +208,147 @@ public static TriePartitionUpdate asTrieUpdate(PartitionUpdate update) } } - public static Trie asMergableTrie(PartitionUpdate update) + /** + * Convert the given update (with unknown implementation type) to its trie representation, including the partition + * key prefix. + */ + public static DeletionAwareTrie asMergableTrie(PartitionUpdate update) { return asTrieUpdate(update).trie.prefixedBy(update.partitionKey()); } - /** - * Modify this update to set every timestamp for live data to {@code newTimestamp} and - * every deletion timestamp to {@code newTimestamp - 1}. - * - * There is no reason to use that except on the Paxos code path, where we need to ensure that - * anything inserted uses the ballot timestamp (to respect the order of updates decided by - * the Paxos algorithm). We use {@code newTimestamp - 1} for deletions because tombstones - * always win on timestamp equality and we don't want to delete our own insertions - * (typically, when we overwrite a collection, we first set a complex deletion to delete the - * previous collection before adding new elements. If we were to set that complex deletion - * to the same timestamp that the new elements, it would delete those elements). 
And since - * tombstones always wins on timestamp equality, using -1 guarantees our deletion will still - * delete anything from a previous update. - */ @Override public TriePartitionUpdate withUpdatedTimestamps(long newTimestamp) { - InMemoryTrie t = InMemoryTrie.shortLived(BYTE_COMPARABLE_VERSION); + InMemoryDeletionAwareTrie t = TrieBackedRow.newTrie(); try { - t.apply(trie, new InMemoryTrie.UpsertTransformer() - { - public Object apply(Object shouldBeNull, Object o) - { - assert shouldBeNull == null; - if (o instanceof RowData) - return applyRowData((RowData) o); - else - return applyDeletion((DeletionInfo) o); - } - - public RowData applyRowData(RowData update) - { - LivenessInfo newInfo = update.livenessInfo.isEmpty() - ? update.livenessInfo - : update.livenessInfo.withUpdatedTimestamp(newTimestamp); - DeletionTime newDeletion = update.deletion.isLive() - ? DeletionTime.LIVE - : DeletionTime.build(newTimestamp - 1, update.deletion.localDeletionTime()); - - return new RowData(BTree.transformAndFilter(update.columnsBTree, - (ColumnData cd) -> cd.updateAllTimestamp(newTimestamp)), - newInfo, newDeletion); - } - - public DeletionInfo applyDeletion(DeletionInfo update) - { - if (update.isLive()) - return update; - - MutableDeletionInfo mdi = update.mutableCopy(); - mdi.updateAllTimestamp(newTimestamp - 1); - return mdi; - } - }, x -> false); + t.apply(trie, + (shouldBeNull, o) -> + { + assert shouldBeNull == null; + if (o instanceof Cell) + return ((Cell) o).updateAllTimestamp(newTimestamp); + + if (o == TrieBackedRow.COMPLEX_COLUMN_MARKER) + return o; + + if (o == LivenessInfo.EMPTY) + return o; // Empty liveness should remain unchanged. 
+ + if (o instanceof LivenessInfo) + return ((LivenessInfo) o).withUpdatedTimestamp(newTimestamp); + + if (o instanceof PartitionMarker) + return o; + + throw new AssertionError("Unexpected data in trie: " + o); + }, + (shouldBeNull, o) -> + { + assert shouldBeNull == null; + return o.withUpdatedTimestamp(newTimestamp - 1); + }, + noIncomingSelfDeletion(), + noExistingSelfDeletion(), + true, + x -> false); } catch (TrieSpaceExhaustedException e) { throw new AssertionError(e); } - return new TriePartitionUpdate(metadata, partitionKey, columns, stats, rowCountIncludingStatic, dataSize, t, canHaveShadowedData); + return new TriePartitionUpdate(metadata, partitionKey, columns, stats, rowCountIncludingStatic, tombstoneCount, dataSize, t); } - /** - * The number of "operations" contained in the update. - *

- * This is used by {@code Memtable} to approximate how much work this update does. In practice, this - * count how many rows are updated and how many ranges are deleted by the partition update. - * - * @return the number of "operations" performed by the update. - */ + @Override + public DeletionInfo deletionInfo() + { + // Collect deletion info from the trie. + DeletionTime partitionLevelDeletion = partitionLevelDeletion(); + MutableDeletionInfo.Builder builder = MutableDeletionInfo.builder(partitionLevelDeletion, metadata.comparator, false); + for (Map.Entry entry : trie.deletionBranchAtRoot().entrySet()) + { + RangeTombstoneMarker marker = entry.getValue().toRangeTombstoneMarker(entry.getKey(), BYTE_COMPARABLE_VERSION, metadata.comparator); + if (marker != null) + builder.add(marker); + } + return builder.build(); + } + + /// @inheritDoc + /// Note: This will not count rows that contain only column-level deletions, as these are not represented in either + /// the live row count or the tombstone count. As this method is meant to get an approximation, we would rather + /// spare the cost of correcting this. @Override public int operationCount() { - return rowCountIncludingStatic - + deletionInfo().rangeCount() - + (deletionInfo().getPartitionDeletion().isLive() ? 0 : 1); + return rowCountIncludingStatic + tombstoneCount; + } + + + /// @inheritDoc + /// Note: This will not count rows that contain only column-level deletions, as these are not represented in either + /// the live row count or the tombstone count. As this method is meant to get an approximation, we would rather + /// spare the cost of correcting this. + @Override + public int affectedRowCount() + { + // If there is a partition-level deletion, we intend to delete at least the columns of one row. 
+ if (!partitionLevelDeletion().isLive()) + return 1; + + return rowCountIncludingStatic + tombstoneCount; + } + + @Override + public int affectedColumnCount() + { + // If there is a partition-level deletion, we intend to delete at least the columns of one row. + if (!partitionLevelDeletion().isLive()) + return metadata().regularAndStaticColumns().size(); + + return TrieBackedRow.countColumns(trie) + + // Each range delete should correspond to at least one intended row deletion, and with it, its regular columns. + tombstoneCount * metadata().regularColumns().size(); } - /** - * The size of the data contained in this update. - * - * @return the size of the data contained in this update. - */ @Override public int dataSize() { return dataSize; } - /** - * The size of the data contained in this update. - * - * @return the size of the data contained in this update. - */ @Override public long unsharedHeapSize() { - assert trie instanceof InMemoryTrie; - InMemoryTrie inMemoryTrie = (InMemoryTrie) trie; - long heapSize = inMemoryTrie.usedSizeOnHeap(); - for (Object o : inMemoryTrie.values()) + assert trie instanceof InMemoryDeletionAwareTrie; + InMemoryDeletionAwareTrie inMemoryTrie = (InMemoryDeletionAwareTrie) trie; + class Collector implements DeletionAwareTrie.ValueConsumer { - if (o instanceof RowData) - heapSize += ((RowData) o).unsharedHeapSizeExcludingData(); - else - heapSize += ((DeletionInfo) o).unsharedHeapSize(); + long heapSize = inMemoryTrie.usedSizeOnHeap(); + + @Override + public void deletionMarker(TrieTombstoneMarker marker) + { + heapSize += marker.unsharedHeapSize(); + } + + @Override + public void content(Object o) + { + if (o instanceof Cell) + heapSize += ((Cell) o).unsharedHeapSize(); + else if (o instanceof LivenessInfo) + heapSize += ((LivenessInfo) o).unsharedHeapSize(); + } } - return heapSize; + Collector collector = new Collector(); + inMemoryTrie.process(Direction.FORWARD, collector); + return collector.heapSize; } - /** - * Validates the 
data contained in this update. - * - * @throws org.apache.cassandra.serializers.MarshalException if some of the data contained in this update is corrupted. - */ @Override public void validate() { @@ -363,32 +356,32 @@ public void validate() { Row row = it.next(); metadata().comparator.validate(row.clustering()); - for (ColumnData cd : row) - cd.validate(); + + for (Cell cell : row.cells()) + cell.validate(); } } - /** - * The maximum timestamp used in this update. - * - * @return the maximum timestamp used in this update. - */ @Override public long maxTimestamp() { - long maxTimestamp = deletionInfo().maxTimestamp(); + long maxTimestamp = LivenessInfo.NO_TIMESTAMP; + for (Iterator it = trie.deletionBranchAtRoot().valueIterator(); it.hasNext();) + { + TrieTombstoneMarker next = it.next(); + DeletionTime pointDeletion = next.pointDeletion(); + if (pointDeletion != null) + maxTimestamp = Math.max(maxTimestamp, pointDeletion.markedForDeleteAt()); + DeletionTime rightDeletion = next.rightDeletion(); // we can ignore left side as it has appeared on the right first + if (rightDeletion != null) + maxTimestamp = Math.max(maxTimestamp, rightDeletion.markedForDeleteAt()); + } for (Iterator it = rowsIncludingStatic(); it.hasNext();) maxTimestamp = Math.max(maxTimestamp, Rows.collectMaxTimestamp(it.next())); return maxTimestamp; } - /** - * For an update on a counter table, returns a list containing a {@code CounterMark} for - * every counter contained in the update. - * - * @return a list with counter marks for every counter in this update. 
- */ @Override public List collectCounterMarks() { @@ -404,12 +397,40 @@ public List collectCounterMarks() return marks; } - private static void addMarksForRow(Row row, List marks) + private void addMarksForRow(Row row, List marks) { for (Cell cell : row.cells()) { if (cell.isCounterCell()) - marks.add(new CounterMark(row, cell.column(), cell.path())); + marks.add(new CounterMark(this, row, cell.column(), cell.path())); + } + } + + @Override + public void setCounterMarkValue(CounterMark mark, ByteBuffer value) + { + Row row = mark.row(); + ColumnMetadata column = mark.column(); + CellPath path = mark.path(); + ByteComparable key = v -> + ByteSource.concat(metadata.comparator.asByteComparable(row.clustering()).asComparableBytes(v), + TrieBackedRow.columnKey(columns.columns(row.isStatic()), column), + path != null ? TrieBackedRow.cellPathKey(column, path, v) : ByteSource.EMPTY); + try + { + ((InMemoryDeletionAwareTrie) trie).apply( + DeletionAwareTrie.singleton(key, BYTE_COMPARABLE_VERSION, value), + (c, v) -> ((Cell) c).withUpdatedValue(v), + (x, y) -> x, + (x, y) -> x, + (x, y) -> y, + true, + Predicates.alwaysFalse() + ); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); } } @@ -423,7 +444,14 @@ public PartitionUpdate withOnlyPresentColumns() columnSet.add(column.column()); RegularAndStaticColumns columns = RegularAndStaticColumns.builder().addAll(columnSet).build(); - return new TriePartitionUpdate(metadata, partitionKey, columns, stats, rowCountIncludingStatic, dataSize, trie, false); + return new TriePartitionUpdate(metadata, + partitionKey, + columns, + stats, + rowCountIncludingStatic, + tombstoneCount, + dataSize, + (InMemoryDeletionAwareTrie) trie); } /** @@ -434,154 +462,296 @@ public PartitionUpdate withOnlyPresentColumns() public static class Builder implements PartitionUpdate.Builder { private final TableMetadata metadata; + private final ColumnFilter cf; private final DecoratedKey key; - private final MutableDeletionInfo 
deletionInfo; - private final boolean canHaveShadowedData; private final RegularAndStaticColumns columns; - private final InMemoryTrie trie = InMemoryTrie.shortLived(BYTE_COMPARABLE_VERSION); + private final InMemoryDeletionAwareTrie trie = TrieBackedRow.newTrie(); + private final InMemoryDeletionAwareTrie.Mutator mutator; private final EncodingStats.Collector statsCollector = new EncodingStats.Collector(); - private final boolean useRecursive; private int rowCountIncludingStatic; + private int tombstoneCount; private long dataSize; public Builder(TableMetadata metadata, DecoratedKey key, RegularAndStaticColumns columns) - { - this(metadata, key, columns, true, Rows.EMPTY_STATIC_ROW, DeletionInfo.LIVE); - } - - private Builder(TableMetadata metadata, - DecoratedKey key, - RegularAndStaticColumns columns, - boolean canHaveShadowedData, - Row staticRow, - DeletionInfo deletionInfo) { this.metadata = metadata; this.key = key; this.columns = columns; - this.canHaveShadowedData = canHaveShadowedData; - this.deletionInfo = deletionInfo.mutableCopy(); - useRecursive = useRecursive(metadata.comparator); rowCountIncludingStatic = 0; + tombstoneCount = 0; dataSize = 0; - add(staticRow); + cf = ColumnFilter.all(metadata); + mutator = trie.mutator(this::mergeIncomingData, + this::mergeTombstones, + this::applyIncomingTombstone, + this::applyExistingTombstoneToIncomingRow, + true, + x -> false); } - // This is wasteful, only to be used for testing. 
- @VisibleForTesting - public Builder(TriePartitionUpdate base) + void putInTrie(Row untypedRow) + throws TrieSpaceExhaustedException { - this(base.metadata, base.partitionKey, base.columns(), base.canHaveShadowedData, Rows.EMPTY_STATIC_ROW, base.deletionInfo()); - for (Iterator it = base.rowsIncludingStatic(); it.hasNext();) - add(it.next()); + TrieBackedRow row; + if (untypedRow instanceof TrieBackedRow) + row = (TrieBackedRow) untypedRow; + else + row = TrieBackedRow.from(metadata, untypedRow); + + Clustering clustering = row.clustering(); + ByteComparable comparableClustering = metadata.comparator.asByteComparable(clustering); + + mutator.apply(row.trie().prefixedBySeparately(comparableClustering, true)); } - /** - * Adds a row to this update. - *

- * There is no particular assumption made on the order of row added to a partition update. It is further - * allowed to add the same row (more precisely, multiple row objects for the same clustering). - *

- * Note however that the columns contained in the added row must be a subset of the columns used when - * creating this update. - * - * @param row the row to add. - */ + @Override public void add(Row row) { if (row.isEmpty()) return; - // this assert is expensive, and possibly of limited value; we should consider removing it - // or introducing a new class of assertions for test purposes - assert (row.isStatic() ? columns().statics : columns().regulars).containsAll(row.columns()) - : (row.isStatic() ? columns().statics : columns().regulars) + " is not superset of " + row.columns(); + try + { + putInTrie(row); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + + private void putPartitionDeletionInTrie(DeletionTime deletionTime) + { + try + { + mutator.delete(RangeTrie.branch(ByteComparable.EMPTY, BYTE_COMPARABLE_VERSION, TrieTombstoneMarker.covering(deletionTime, TrieTombstoneMarker.Kind.PARTITION))); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + private void putDeletionInTrie(ByteComparable start, ByteComparable end, DeletionTime deletionTime) + { try { - trie.putSingleton(metadata.comparator.asByteComparable(row.clustering()), - row, - this::merge, - useRecursive); + mutator.delete(RangeTrie.range(start, true, + end, false, + BYTE_COMPARABLE_VERSION, + TrieTombstoneMarker.covering(deletionTime, TrieTombstoneMarker.Kind.RANGE))); + statsCollector.update(deletionTime); } catch (TrieSpaceExhaustedException e) { throw new AssertionError(e); } - Rows.collectStats(row, statsCollector); } + @Override public void addPartitionDeletion(DeletionTime deletionTime) { - deletionInfo.add(deletionTime); + if (!deletionTime.isLive()) + putPartitionDeletionInTrie(deletionTime); } + @Override public void add(RangeTombstone range) { - deletionInfo.add(range, metadata.comparator); + putDeletionInTrie(metadata.comparator.asByteComparable(range.deletedSlice().start()), + 
metadata.comparator.asByteComparable(range.deletedSlice().end()), + range.deletionTime()); } + @Override public DecoratedKey partitionKey() { return key; } + @Override public TableMetadata metadata() { return metadata; } + @Override public TriePartitionUpdate build() { try { - trie.putRecursive(ByteComparable.EMPTY, deletionInfo, NO_CONFLICT_RESOLVER); + trie.putRecursive(ByteComparable.EMPTY, PARTITION_MARKER, noConflictInData()); } catch (TrieSpaceExhaustedException e) { throw new AssertionError(e); } - deletionInfo.collectStats(statsCollector); - TriePartitionUpdate pu = new TriePartitionUpdate(metadata, - partitionKey(), - columns, - statsCollector.get(), - rowCountIncludingStatic, - Ints.saturatedCast(dataSize), - trie, - canHaveShadowedData); - return pu; + return new TriePartitionUpdate(metadata, + partitionKey(), + columns, + statsCollector.get(), + rowCountIncludingStatic, + tombstoneCount, + Ints.saturatedCast(dataSize), + trie); } - RowData merge(Object existing, Row update) + /** Merge in live data from the update trie. This can be various markers, liveness info or cells. 
*/
+        private Object mergeIncomingData(Object existing, Object update)
         {
-            if (existing != null)
+            if (update instanceof Cell)
             {
-                // this is not expected to happen much, so going through toRow and the existing size is okay
-                RowData rowData = (RowData) existing;
-                update = Rows.merge(rowData.toRow(update.clustering()), update);
-                dataSize += update.dataSize() - rowData.dataSize();
+                assert existing == null || existing instanceof Cell;
+                Cell updateCell = (Cell) update;
+                Cell existingCell = (Cell) existing;
+                Cells.collectStats(updateCell, statsCollector);
+                Cell reconciled;
+                if (existingCell == null)
+                {
+                    reconciled = updateCell;
+                    dataSize += reconciled.dataSize();
+                }
+                else
+                {
+                    reconciled = Cells.reconcile(existingCell, updateCell);
+                    if (reconciled != existingCell)
+                        dataSize += reconciled.dataSize() - existingCell.dataSize();
+                }
+                return reconciled;
+            }
+            else if (update == TrieBackedRow.COMPLEX_COLUMN_MARKER)
+            {
+                assert existing == null || existing == TrieBackedRow.COMPLEX_COLUMN_MARKER;
+                return update;
+            }
+            else if (update instanceof LivenessInfo)
+            {
+                assert existing == null || existing instanceof LivenessInfo;
+                LivenessInfo rowUpdate = (LivenessInfo) update;
+                LivenessInfo existingRow = (LivenessInfo) existing;
+                statsCollector.update(rowUpdate);
+                // NOTE(review): presumably LivenessInfo.merge returns one of its arguments; the earlier mention of RowData here looks stale - confirm
+                LivenessInfo reconciled;
+                // A new row adds its full liveness size and bumps the row count; a merge must only add the size delta.
+                if (existingRow == null)
+                {
+                    ++rowCountIncludingStatic;
+                    dataSize += rowUpdate.dataSize();
+                    reconciled = rowUpdate;
+                }
+                else
+                {
+                    reconciled = LivenessInfo.merge(existingRow, rowUpdate);
+                    dataSize += reconciled.dataSize() - existingRow.dataSize();
+                }
+                return reconciled;
+            }
+            else if (update instanceof PartitionMarker)
+            {
+                assert update == PARTITION_MARKER;
+                assert existing == null || existing == PARTITION_MARKER;
+                return PARTITION_MARKER;
+            }
+
+            throw new AssertionError("Unknown data in trie: " + update);
+        }
+
+        /** Apply an existing tombstone to incoming
data before merging that data in the trie. */ + private Object applyExistingTombstoneToIncomingRow(TrieTombstoneMarker marker, Object o) + { + // This is done before merging the data; we will reflect size changes when the data is merged if it survives. + return applyTombstone(marker, o, false); + } + + /** Apply an incoming tombstone to existing data, possibly removing it from the trie. */ + private Object applyIncomingTombstone(Object o, TrieTombstoneMarker marker) + { + return applyTombstone(marker, o, true); + } + + private Object applyTombstone(TrieTombstoneMarker marker, Object o, boolean updateDataSize) + { + DeletionTime deletion = marker.applicableToPointForward(); + if (deletion == null) + return o; + + if (o instanceof Cell) + { + Cell cell = (Cell) o; + if (!deletion.deletes(cell)) + return o; + if (updateDataSize) + dataSize -= cell.dataSize(); + return null; + } + else if (o == TrieBackedRow.COMPLEX_COLUMN_MARKER) + { + return o; + } + else if (o instanceof LivenessInfo) + { + LivenessInfo info = (LivenessInfo) o; + if (!deletion.deletes(info)) + return o; + + if (updateDataSize) + dataSize -= info.dataSize(); + return LivenessInfo.EMPTY; + } + else if (o instanceof PartitionMarker) + { + return o; + } + throw new AssertionError("Unknown data in trie: " + o); + } + + /** + * Merge an incoming tombstone with existing deletions. + * This will be called for all boundary tombstones in the update, but also for all existing boundaries that are + * covered by an incoming range. + */ + private TrieTombstoneMarker mergeTombstones(TrieTombstoneMarker existing, @NotNull TrieTombstoneMarker update) + { + if (existing == null) + { + // We are adding a new tombstone. We are counting tombstones on the row level, so ones that introduce + // or close column deletions should not count. + // We will only count one of the sides as we want to increase the count by one for each pair. 
+ if (hasKind(update.rightDeletion(), ROW_DELETION_KINDS)) + ++tombstoneCount; + return update; } else { - ++rowCountIncludingStatic; - dataSize += update.dataSize(); + TrieTombstoneMarker merged = update.mergeWith(existing); + int hadTombstone = existing.isBoundary() && hasKind(existing.rightDeletion(), ROW_DELETION_KINDS) ? 1 : 0; + int hasTombstone = merged != null && merged.isBoundary() && hasKind(merged.rightDeletion(), ROW_DELETION_KINDS) ? 1 : 0; + tombstoneCount += hasTombstone - hadTombstone; + return merged; } + } + + private static boolean hasKind(TrieTombstoneMarker.Covering side, EnumSet kinds) + { + if (side == null) + return false; - return rowToData(update); + return kinds.contains(side.deletionKind()); } + @Override public RegularAndStaticColumns columns() { return columns; } + @Override public DeletionTime partitionLevelDeletion() { - return deletionInfo.getPartitionDeletion(); + return TrieTombstoneMarker.applicableDeletionOrLive(trie, ByteComparable.EMPTY); } @Override @@ -590,8 +760,6 @@ public String toString() return "Builder{" + "metadata=" + metadata + ", key=" + key + - ", deletionInfo=" + deletionInfo + - ", canHaveShadowedData=" + canHaveShadowedData + ", columns=" + columns + '}'; } diff --git a/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdateStage2.java b/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdateStage2.java new file mode 100644 index 000000000000..c1b0feb2b68b --- /dev/null +++ b/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdateStage2.java @@ -0,0 +1,649 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.db.partitions; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Predicates; +import com.google.common.collect.Iterators; +import com.google.common.primitives.Ints; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.db.Columns; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionInfo; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.LivenessInfo; +import org.apache.cassandra.db.MutableDeletionInfo; +import org.apache.cassandra.db.RangeTombstone; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.ColumnData; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; +import org.apache.cassandra.db.tries.InMemoryTrie; +import org.apache.cassandra.db.tries.Trie; +import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; +import 
org.apache.cassandra.utils.btree.BTree; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +/** + * A trie-backed PartitionUpdate. Immutable. + *

+ * Provides factories for simple variations (e.g. singleRowUpdate) and a mutable builder for constructing one. + * The builder holds a mutable trie to which content may be added in any order, also taking care of + * merging any duplicate rows, and keeping track of statistics and column coverage. + */ +public class TriePartitionUpdateStage2 extends TrieBackedPartitionStage2 implements PartitionUpdate +{ + protected static final Logger logger = LoggerFactory.getLogger(TriePartitionUpdateStage2.class); + + public static final Factory FACTORY = new TrieFactory(); + + /** The value reported by {@link #dataSize()}. */ + final int dataSize; + + private TriePartitionUpdateStage2(TableMetadata metadata, + DecoratedKey key, + RegularAndStaticColumns columns, + EncodingStats stats, + int rowCountIncludingStatic, + int dataSize, + Trie trie, + boolean canHaveShadowedData) + { + super(key, columns, stats, rowCountIncludingStatic, trie, metadata, canHaveShadowedData); + this.dataSize = dataSize; + } + + /** + * Compares this update with another {@code TriePartitionUpdateStage2} by partition key, table id, deletion info, + * static row and the sequence of rows returned by {@code rowIterator()}. Any other kind of object is not equal. + * NOTE(review): no matching {@code hashCode()} is visible in this class; confirm the superclass provides one. + */ + @Override + public boolean equals(Object obj) + { + if (!(obj instanceof TriePartitionUpdateStage2)) + return false; + + TriePartitionUpdateStage2 that = (TriePartitionUpdateStage2) obj; + return partitionKey.equals(that.partitionKey) + && metadata().id.equals(that.metadata().id) + && deletionInfo().equals(that.deletionInfo()) + && staticRow().equals(that.staticRow()) + && Iterators.elementsEqual(rowIterator(), that.rowIterator()); + } + + + /** + * Creates a short-lived in-memory trie that stores the given deletion info under the empty key. + * A {@code TrieSpaceExhaustedException} raised while doing so is rethrown as an {@code AssertionError}. + */ + private static InMemoryTrie newTrie(DeletionInfo deletion) + { + InMemoryTrie trie = InMemoryTrie.shortLived(BYTE_COMPARABLE_VERSION); + try + { + trie.putRecursive(ByteComparable.EMPTY, deletion, NO_CONFLICT_RESOLVER); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + return trie; + } + + /** + * Creates an empty immutable partition update. + * + * @param metadata the metadata for the created update. + * @param key the partition key for the created update. + * + * @return the newly created empty (and immutable) update.
+ */ + public static TriePartitionUpdateStage2 emptyUpdate(TableMetadata metadata, DecoratedKey key) + { + return new TriePartitionUpdateStage2(metadata, + key, + RegularAndStaticColumns.NONE, + EncodingStats.NO_STATS, + 0, + 0, + newTrie(MutableDeletionInfo.live()), + false); + } + + /** + * Creates an immutable partition update that entirely deletes a given partition. + * + * @param metadata the metadata for the created update. + * @param key the partition key for the partition that the created update should delete. + * @param timestamp the timestamp for the deletion. + * @param nowInSec the current time in seconds to use as local deletion time for the partition deletion. + * + * @return the newly created partition deletion update. + */ + public static TriePartitionUpdateStage2 fullPartitionDelete(TableMetadata metadata, DecoratedKey key, long timestamp, long nowInSec) + { + MutableDeletionInfo deletion = new MutableDeletionInfo(timestamp, nowInSec); + return new TriePartitionUpdateStage2(metadata, + key, + RegularAndStaticColumns.NONE, + new EncodingStats(timestamp, nowInSec, LivenessInfo.NO_TTL), + 0, + 0, + newTrie(deletion), + false); + } + + /** + * Creates an immutable partition update that contains a single row update. + * + * @param metadata the metadata for the created update. + * @param key the partition key for the partition to update. + * @param row the row for the update, may be a regular or static row and cannot be null. + * + * @return the newly created partition update containing only {@code row}. + */ + public static TriePartitionUpdateStage2 singleRowUpdate(TableMetadata metadata, DecoratedKey key, Row row) + { + EncodingStats stats = row.isEmpty() ?
EncodingStats.NO_STATS : EncodingStats.Collector.forRow(row); + InMemoryTrie trie = newTrie(DeletionInfo.LIVE); + + RegularAndStaticColumns columns; + if (row.isStatic()) + columns = new RegularAndStaticColumns(Columns.from(row.columns()), Columns.NONE); + else + columns = new RegularAndStaticColumns(Columns.NONE, Columns.from(row.columns())); + + try + { + putInTrie(metadata.comparator, useRecursive(metadata.comparator), trie, row); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + + return new TriePartitionUpdateStage2(metadata, key, columns, stats, 1, row.dataSize(), trie, false); + } + + /** + * Creates an immutable partition update that contains a single row update. + * + * @param metadata the metadata for the created update. + * @param key the undecorated partition key for the partition to update; it is decorated with the table's partitioner. + * @param row the row for the update. + * + * @return the newly created partition update containing only {@code row}. + */ + public static TriePartitionUpdateStage2 singleRowUpdate(TableMetadata metadata, ByteBuffer key, Row row) + { + return singleRowUpdate(metadata, metadata.partitioner.decorateKey(key), row); + } + + /** + * Turns the given iterator into an update. + * + * @param iterator the iterator to turn into an update. + * + * Warning: this method does not close the provided iterator, it is up to + * the caller to close it.
+ */ + @SuppressWarnings("resource") + public static TriePartitionUpdateStage2 fromIterator(UnfilteredRowIterator iterator) + { + ContentBuilder builder = build(iterator, true); + + return new TriePartitionUpdateStage2(iterator.metadata(), + iterator.partitionKey(), + iterator.columns(), + iterator.stats(), + builder.rowCountIncludingStatic(), + builder.dataSize(), + builder.trie(), + false); + } + + /** + * Returns the given update itself if it is already a {@code TriePartitionUpdateStage2}; otherwise converts it + * with {@code fromIterator}, reading its unfiltered iterator and closing that iterator before returning. + */ + public static TriePartitionUpdateStage2 asTrieUpdate(PartitionUpdate update) + { + if (update instanceof TriePartitionUpdateStage2) + return (TriePartitionUpdateStage2) update; + + try (UnfilteredRowIterator iterator = update.unfilteredIterator()) + { + return fromIterator(iterator); + } + } + + /** + * Returns the trie of the given update (converted with {@code asTrieUpdate} if necessary), prefixed by the + * update's partition key. + */ + public static Trie asMergableTrie(PartitionUpdate update) + { + return asTrieUpdate(update).trie.prefixedBy(update.partitionKey()); + } + + /** + * Returns a copy of this update, built in a new trie, with every timestamp for live data set to {@code newTimestamp} and + * every deletion timestamp set to {@code newTimestamp - 1}. + * + * There is no reason to use that except on the Paxos code path, where we need to ensure that + * anything inserted uses the ballot timestamp (to respect the order of updates decided by + * the Paxos algorithm). We use {@code newTimestamp - 1} for deletions because tombstones + * always win on timestamp equality and we don't want to delete our own insertions + * (typically, when we overwrite a collection, we first set a complex deletion to delete the + * previous collection before adding new elements. If we were to set that complex deletion + * to the same timestamp as the new elements, it would delete those elements). And since + * tombstones always win on timestamp equality, using -1 guarantees our deletion will still + * delete anything from a previous update.
+ */ + @Override + public TriePartitionUpdateStage2 withUpdatedTimestamps(long newTimestamp) + { + + InMemoryTrie t = InMemoryTrie.shortLived(BYTE_COMPARABLE_VERSION); + try + { + t.apply(trie, new InMemoryTrie.UpsertTransformer() + { + public Object apply(Object shouldBeNull, Object o) + { + assert shouldBeNull == null; + if (o instanceof RowData) + return applyRowData((RowData) o); + else + return applyDeletion((DeletionInfo) o); + } + + public RowData applyRowData(RowData update) + { + LivenessInfo newInfo = update.livenessInfo.isEmpty() + ? update.livenessInfo + : update.livenessInfo.withUpdatedTimestamp(newTimestamp); + DeletionTime newDeletion = update.deletion.isLive() + ? DeletionTime.LIVE + : DeletionTime.build(newTimestamp - 1, update.deletion.localDeletionTime()); + + return new RowData(BTree.transformAndFilter(update.columnsBTree, + (ColumnData cd) -> cd.updateAllTimestamp(newTimestamp)), + newInfo, newDeletion); + } + + public DeletionInfo applyDeletion(DeletionInfo update) + { + if (update.isLive()) + return update; + + MutableDeletionInfo mdi = update.mutableCopy(); + mdi.updateAllTimestamp(newTimestamp - 1); + return mdi; + } + }, Predicates.alwaysFalse()); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + return new TriePartitionUpdateStage2(metadata, partitionKey, columns, stats, rowCountIncludingStatic, dataSize, t, canHaveShadowedData); + } + + /** + * The number of "operations" contained in the update. + *

+ * This is used by {@code Memtable} to approximate how much work this update does. In practice, this + * counts the rows updated, the ranges deleted and, if it is not live, the partition-level deletion. + * + * @return the number of "operations" performed by the update. + */ + @Override + public int operationCount() + { + return rowCountIncludingStatic + + deletionInfo().rangeCount() + + (deletionInfo().getPartitionDeletion().isLive() ? 0 : 1); + } + + /** + * The size of the data contained in this update. + * + * @return the size of the data contained in this update. + */ + @Override + public int dataSize() + { + return dataSize; + } + + /** + * The on-heap size of the backing trie plus, for every value stored in it, + * {@code RowData.unsharedHeapSizeExcludingData()} or {@code DeletionInfo.unsharedHeapSize()}. + * The backing trie is asserted to be an {@code InMemoryTrie}. + * + * @return the sum described above. + */ + @Override + public long unsharedHeapSize() + { + assert trie instanceof InMemoryTrie; + InMemoryTrie inMemoryTrie = (InMemoryTrie) trie; + long heapSize = inMemoryTrie.usedSizeOnHeap(); + for (Object o : inMemoryTrie.values()) + { + if (o instanceof RowData) + heapSize += ((RowData) o).unsharedHeapSizeExcludingData(); + else + heapSize += ((DeletionInfo) o).unsharedHeapSize(); + } + return heapSize; + } + + /** + * Validates the data contained in this update. + * + * @throws org.apache.cassandra.serializers.MarshalException if some of the data contained in this update is corrupted. + */ + @Override + public void validate() + { + for (Iterator it = rowsIncludingStatic(); it.hasNext();) + { + Row row = it.next(); + metadata().comparator.validate(row.clustering()); + for (ColumnData cd : row) + cd.validate(); + } + } + + /** + * The maximum timestamp used in this update. + * + * @return the maximum timestamp used in this update.
+ */ + @Override + public long maxTimestamp() + { + long maxTimestamp = deletionInfo().maxTimestamp(); + for (Iterator it = rowsIncludingStatic(); it.hasNext();) + maxTimestamp = Math.max(maxTimestamp, Rows.collectMaxTimestamp(it.next())); + + return maxTimestamp; + } + + /** + * For an update on a counter table, returns a list containing a {@code CounterMark} for + * every counter contained in the update. + * + * @return a list with counter marks for every counter in this update. + */ + @Override + public List collectCounterMarks() + { + assert metadata().isCounter(); + // We will take aliases on the rows of this update, and update them in-place. So we should be sure the + // update is now immutable for all intents and purposes. + List marks = new ArrayList<>(); + for (Iterator it = rowsIncludingStatic(); it.hasNext();) + { + Row row = it.next(); + addMarksForRow(row, marks); + } + return marks; + } + + /** + * Appends to {@code marks} a {@code CounterMark} for every counter cell of {@code row}. + */ + private void addMarksForRow(Row row, List marks) + { + for (Cell cell : row.cells()) + { + if (cell.isCounterCell()) + marks.add(new CounterMark(this, row, cell.column(), cell.path())); + } + } + + /** + * Sets {@code value} on the cell identified by the mark's column and path, in the mark's row + * (which is cast to {@code BTreeRow}). + */ + @Override + public void setCounterMarkValue(CounterMark mark, ByteBuffer value) + { + // Please read the warning in BTreeRow.setValue before using this method. + BTreeRow row = (BTreeRow) mark.row(); + row.setValue(mark.column(), mark.path(), value); + } + + /** + * Returns an update over the same trie whose declared columns are limited to the columns that appear in + * the rows returned by {@code rows()}. The result is created with {@code canHaveShadowedData} set to false. + */ + @Override + public PartitionUpdate withOnlyPresentColumns() + { + Set columnSet = new HashSet<>(); + + for (Row row : rows()) + for (ColumnData column : row) + columnSet.add(column.column()); + + RegularAndStaticColumns columns = RegularAndStaticColumns.builder().addAll(columnSet).build(); + return new TriePartitionUpdateStage2(metadata, partitionKey, columns, stats, rowCountIncludingStatic, dataSize, trie, false); + } + + /** + * Builder for PartitionUpdates + * + * This class is not thread safe, but the PartitionUpdate it produces is (since it is immutable).
+ */ + public static class Builder implements PartitionUpdate.Builder + { + private final TableMetadata metadata; + private final DecoratedKey key; + private final MutableDeletionInfo deletionInfo; + private final boolean canHaveShadowedData; + private final RegularAndStaticColumns columns; + private final InMemoryTrie trie = InMemoryTrie.shortLived(BYTE_COMPARABLE_VERSION); + private final EncodingStats.Collector statsCollector = new EncodingStats.Collector(); + private final boolean useRecursive; + private int rowCountIncludingStatic; + private long dataSize; + + /** + * Creates a builder with an empty static row and a live deletion info; {@code canHaveShadowedData} is set to true. + */ + public Builder(TableMetadata metadata, + DecoratedKey key, + RegularAndStaticColumns columns) + { + this(metadata, key, columns, true, Rows.EMPTY_STATIC_ROW, DeletionInfo.LIVE); + } + + /** + * Creates a builder that starts from a mutable copy of {@code deletionInfo} and adds {@code staticRow} + * (which is ignored if empty, see {@code add(Row)}). + */ + private Builder(TableMetadata metadata, + DecoratedKey key, + RegularAndStaticColumns columns, + boolean canHaveShadowedData, + Row staticRow, + DeletionInfo deletionInfo) + { + this.metadata = metadata; + this.key = key; + this.columns = columns; + this.canHaveShadowedData = canHaveShadowedData; + this.deletionInfo = deletionInfo.mutableCopy(); + useRecursive = useRecursive(metadata.comparator); + rowCountIncludingStatic = 0; + dataSize = 0; + add(staticRow); + } + + // This is wasteful, only to be used for testing. + @VisibleForTesting + public Builder(TriePartitionUpdateStage2 base) + { + this(base.metadata, base.partitionKey, base.columns(), base.canHaveShadowedData, Rows.EMPTY_STATIC_ROW, base.deletionInfo()); + for (Iterator it = base.rowsIncludingStatic(); it.hasNext();) + add(it.next()); + } + + /** + * Adds a row to this update. + *

+ * There is no particular assumption made on the order of rows added to a partition update. It is further + * allowed to add the same row more than once (more precisely, multiple row objects for the same clustering); + * such rows are merged with {@code Rows.merge}. + *

+ * Note however that the columns contained in the added row must be a subset of the columns used when + * creating this update. Empty rows are ignored. + * + * @param row the row to add. + */ + public void add(Row row) + { + if (row.isEmpty()) + return; + + // this assert is expensive, and possibly of limited value; we should consider removing it + // or introducing a new class of assertions for test purposes + assert (row.isStatic() ? columns().statics : columns().regulars).containsAll(row.columns()) + : (row.isStatic() ? columns().statics : columns().regulars) + " is not superset of " + row.columns(); + + try + { + trie.putSingleton(metadata.comparator.asByteComparable(row.clustering()), + row, + this::merge, + useRecursive); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + Rows.collectStats(row, statsCollector); + } + + /** + * Adds a partition-level deletion to the deletion info accumulated by this builder. + */ + public void addPartitionDeletion(DeletionTime deletionTime) + { + deletionInfo.add(deletionTime); + } + + /** + * Adds a range tombstone to the deletion info accumulated by this builder, using the table's comparator. + */ + public void add(RangeTombstone range) + { + deletionInfo.add(range, metadata.comparator); + } + + public DecoratedKey partitionKey() + { + return key; + } + + public TableMetadata metadata() + { + return metadata; + } + + /** + * Stores the accumulated deletion info under the empty key of the trie, adds its statistics to the + * collected stats and creates the update from this builder's trie. The accumulated data size is + * converted to {@code int} with {@code Ints.saturatedCast}. + */ + public TriePartitionUpdateStage2 build() + { + try + { + trie.putRecursive(ByteComparable.EMPTY, deletionInfo, NO_CONFLICT_RESOLVER); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + deletionInfo.collectStats(statsCollector); + TriePartitionUpdateStage2 pu = new TriePartitionUpdateStage2(metadata, + partitionKey(), + columns, + statsCollector.get(), + rowCountIncludingStatic, + Ints.saturatedCast(dataSize), + trie, + canHaveShadowedData); + + return pu; + } + + /** + * Merge callback passed to the trie by {@code add(Row)}: merges the incoming row with the row data already + * stored for the same clustering (if any), keeps the row count and data size up to date, and returns the + * row data to store. + */ + RowData merge(Object existing, Row update) + { + if (existing != null) + { + // this is not expected to happen much, so going through toRow and the existing size is okay + RowData rowData = (RowData) existing; + update = Rows.merge(rowData.toRow(update.clustering()), update); + dataSize += update.dataSize() -
rowData.dataSize(); + } + else + { + ++rowCountIncludingStatic; + dataSize += update.dataSize(); + } + + return rowToData(update); + } + + public RegularAndStaticColumns columns() + { + return columns; + } + + public DeletionTime partitionLevelDeletion() + { + return deletionInfo.getPartitionDeletion(); + } + + @Override + public String toString() + { + return "Builder{" + + "metadata=" + metadata + + ", key=" + key + + ", deletionInfo=" + deletionInfo + + ", canHaveShadowedData=" + canHaveShadowedData + + ", columns=" + columns + + '}'; + } + } + + /** + * {@code PartitionUpdate.Factory} that delegates to the static factories and the {@code Builder} of + * {@code TriePartitionUpdateStage2}. + */ + public static class TrieFactory implements PartitionUpdate.Factory + { + + /** Creates a new {@code Builder}; {@code initialRowCapacity} is not used. */ + @Override + public PartitionUpdate.Builder builder(TableMetadata metadata, DecoratedKey partitionKey, RegularAndStaticColumns columns, int initialRowCapacity) + { + return new TriePartitionUpdateStage2.Builder(metadata, partitionKey, columns); + } + + @Override + public PartitionUpdate emptyUpdate(TableMetadata metadata, DecoratedKey partitionKey) + { + return TriePartitionUpdateStage2.emptyUpdate(metadata, partitionKey); + } + + @Override + public PartitionUpdate singleRowUpdate(TableMetadata metadata, DecoratedKey valueKey, Row row) + { + return TriePartitionUpdateStage2.singleRowUpdate(metadata, valueKey, row); + } + + @Override + public PartitionUpdate fullPartitionDelete(TableMetadata metadata, DecoratedKey key, long timestamp, long nowInSec) + { + return TriePartitionUpdateStage2.fullPartitionDelete(metadata, key, timestamp, nowInSec); + } + + @Override + public PartitionUpdate fromIterator(UnfilteredRowIterator iterator) + { + return TriePartitionUpdateStage2.fromIterator(iterator); + } + + /** Builds the update from {@code iterator} restricted to the data queried by {@code filter}. */ + @Override + public PartitionUpdate fromIterator(UnfilteredRowIterator iterator, ColumnFilter filter) + { + return TriePartitionUpdateStage2.fromIterator(UnfilteredRowIterators.withOnlyQueriedData(iterator, filter)); + } + } +} diff --git a/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdateStage3.java
b/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdateStage3.java new file mode 100644 index 000000000000..3fe576123d2c --- /dev/null +++ b/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdateStage3.java @@ -0,0 +1,745 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.db.partitions; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import com.google.common.base.Predicates; +import com.google.common.collect.Iterators; +import com.google.common.primitives.Ints; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.Columns; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionInfo; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.LivenessInfo; +import org.apache.cassandra.db.MutableDeletionInfo; +import org.apache.cassandra.db.RangeTombstone; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.ColumnData; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.RangeTombstoneMarker; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.TrieTombstoneMarker; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; +import org.apache.cassandra.db.tries.DeletionAwareTrie; +import org.apache.cassandra.db.tries.InMemoryDeletionAwareTrie; +import org.apache.cassandra.db.tries.RangeTrie; +import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.btree.BTree; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +/** + * A trie-backed PartitionUpdate. Immutable. + *

+ * Provides factories for simple variations (e.g. singleRowUpdate) and a mutable builder for constructing one. + * The builder holds a mutable trie to which content may be added in any order, also taking care of + * merging any duplicate rows, and keeping track of statistics and column coverage. + */ +public class TriePartitionUpdateStage3 extends TrieBackedPartitionStage3 implements PartitionUpdate +{ + protected static final Logger logger = LoggerFactory.getLogger(TriePartitionUpdateStage3.class); + + public static final Factory FACTORY = new TrieFactory(); + + final int dataSize; + + private TriePartitionUpdateStage3(TableMetadata metadata, + DecoratedKey key, + RegularAndStaticColumns columns, + EncodingStats stats, + int rowCountIncludingStatic, + int tombstoneCount, + int dataSize, + DeletionAwareTrie trie) + { + super(key, columns, stats, rowCountIncludingStatic, tombstoneCount, trie, metadata); + this.dataSize = dataSize; + } + + @Override + public boolean equals(Object obj) + { + if (!(obj instanceof TriePartitionUpdate)) + return false; + + TriePartitionUpdate that = (TriePartitionUpdate) obj; + return partitionKey.equals(that.partitionKey) + && metadata().id.equals(that.metadata().id) + && deletionInfo().equals(that.deletionInfo()) + && staticRow().equals(that.staticRow()) + && Iterators.elementsEqual(rowIterator(), that.rowIterator()); + } + + + private static InMemoryDeletionAwareTrie newTrie() + { + InMemoryDeletionAwareTrie trie = InMemoryDeletionAwareTrie.shortLived(BYTE_COMPARABLE_VERSION); + try + { + trie.putRecursive(ByteComparable.EMPTY, PARTITION_MARKER, noConflictInData()); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + return trie; + } + + /** + * Creates a empty immutable partition update. + * + * @param metadata the metadata for the created update. + * @param key the partition key for the created update. + * + * @return the newly created empty (and immutable) update. 
+ */ + public static TriePartitionUpdateStage3 emptyUpdate(TableMetadata metadata, DecoratedKey key) + { + return new TriePartitionUpdateStage3(metadata, + key, + RegularAndStaticColumns.NONE, + EncodingStats.NO_STATS, + 0, + 0, + 0, + newTrie()); + } + + /** + * Creates an immutable partition update that entirely deletes a given partition. + * + * @param metadata the metadata for the created update. + * @param key the partition key for the partition that the created update should delete. + * @param timestamp the timestamp for the deletion. + * @param nowInSec the current time in seconds to use as local deletion time for the partition deletion. + * + * @return the newly created partition deletion update. + */ + public static TriePartitionUpdateStage3 fullPartitionDelete(TableMetadata metadata, DecoratedKey key, long timestamp, long nowInSec) + { + InMemoryDeletionAwareTrie trie = newTrie(); + putPartitionDeletionInTrie(trie, DeletionTime.build(timestamp, nowInSec)); + return new TriePartitionUpdateStage3(metadata, + key, + RegularAndStaticColumns.NONE, + new EncodingStats(timestamp, nowInSec, LivenessInfo.NO_TTL), + 0, + 1, + 0, + trie); + } + + /** + * Creates an immutable partition update that contains a single row update. + * + * @param metadata the metadata for the created update. + * @param key the partition key for the partition to update. + * @param row the row for the update, may be a regular or static row and cannot be null. + * + * @return the newly created partition update containing only {@code row}. + */ + public static TriePartitionUpdateStage3 singleRowUpdate(TableMetadata metadata, DecoratedKey key, Row row) + { + EncodingStats stats = row.isEmpty() ? 
EncodingStats.NO_STATS : EncodingStats.Collector.forRow(row); + InMemoryDeletionAwareTrie trie = newTrie(); + + RegularAndStaticColumns columns; + if (row.isStatic()) + columns = new RegularAndStaticColumns(Columns.from(row.columns()), Columns.NONE); + else + columns = new RegularAndStaticColumns(Columns.NONE, Columns.from(row.columns())); + + try + { + putInTrie(metadata.comparator, trie, row); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + + return new TriePartitionUpdateStage3(metadata, key, columns, stats, 1, row.deletion().isLive() ? 0 : 1, row.dataSize(), trie); + } + + /** + * Creates an immutable partition update that contains a single row update. + * + * @param metadata the metadata for the created update. + * @param key the partition key for the partition to update. + * @param row the row for the update. + * + * @return the newly created partition update containing only {@code row}. + */ + public static TriePartitionUpdateStage3 singleRowUpdate(TableMetadata metadata, ByteBuffer key, Row row) + { + return singleRowUpdate(metadata, metadata.partitioner.decorateKey(key), row); + } + + /** + * Turns the given iterator into an update. + * + * @param iterator the iterator to turn into updates. + * + * Warning: this method does not close the provided iterator, it is up to + * the caller to close it. 
+ */ + @SuppressWarnings("resource") + public static TriePartitionUpdateStage3 fromIterator(UnfilteredRowIterator iterator) + { + ContentBuilder builder = build(iterator, true); + + return new TriePartitionUpdateStage3(iterator.metadata(), + iterator.partitionKey(), + iterator.columns(), + iterator.stats(), + builder.rowCountIncludingStatic(), + builder.tombstoneCount(), + builder.dataSize(), + builder.trie()); + } + + public static TriePartitionUpdateStage3 asTrieUpdate(PartitionUpdate update) + { + if (update instanceof TriePartitionUpdateStage3) + return (TriePartitionUpdateStage3) update; + + try (UnfilteredRowIterator iterator = update.unfilteredIterator()) + { + return fromIterator(iterator); + } + } + + public static DeletionAwareTrie asMergableTrie(PartitionUpdate update) + { + return asTrieUpdate(update).trie.prefixedBy(update.partitionKey()); + } + + /** + * Modify this update to set every timestamp for live data to {@code newTimestamp} and + * every deletion timestamp to {@code newTimestamp - 1}. + * + * There is no reason to use that except on the Paxos code path, where we need to ensure that + * anything inserted uses the ballot timestamp (to respect the order of updates decided by + * the Paxos algorithm). We use {@code newTimestamp - 1} for deletions because tombstones + * always win on timestamp equality and we don't want to delete our own insertions + * (typically, when we overwrite a collection, we first set a complex deletion to delete the + * previous collection before adding new elements. If we were to set that complex deletion + * to the same timestamp that the new elements, it would delete those elements). And since + * tombstones always wins on timestamp equality, using -1 guarantees our deletion will still + * delete anything from a previous update. 
+ */ + @Override + public TriePartitionUpdateStage3 withUpdatedTimestamps(long newTimestamp) + { + + InMemoryDeletionAwareTrie t = InMemoryDeletionAwareTrie.shortLived(BYTE_COMPARABLE_VERSION); + try + { + t.apply(trie, + (shouldBeNull, o) -> + { + assert shouldBeNull == null; + if (!(o instanceof RowData)) + return o; + RowData update = (RowData) o; + + LivenessInfo newInfo = update.livenessInfo.isEmpty() + ? update.livenessInfo + : update.livenessInfo.withUpdatedTimestamp(newTimestamp); + + return new RowData(BTree.transformAndFilter(update.columnsBTree, + (ColumnData cd) -> cd.updateAllTimestamp(newTimestamp)), + newInfo); + }, + (shouldBeNull, o) -> + { + assert shouldBeNull == null; + return o.withUpdatedTimestamp(newTimestamp - 1); + }, + noIncomingSelfDeletion(), + noExistingSelfDeletion(), + true, + Predicates.alwaysFalse()); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + return new TriePartitionUpdateStage3(metadata, partitionKey, columns, stats, rowCountIncludingStatic, tombstoneCount, dataSize, t); + } + + @Override + public DeletionInfo deletionInfo() + { + // Collect deletion info from the trie. + DeletionTime partitionLevelDeletion = partitionLevelDeletion(); + MutableDeletionInfo.Builder builder = MutableDeletionInfo.builder(partitionLevelDeletion, metadata.comparator, false); + for (Map.Entry entry : trie.deletionBranchAtRoot().entrySet()) + { + RangeTombstoneMarker marker = entry.getValue().toRangeTombstoneMarker(entry.getKey(), BYTE_COMPARABLE_VERSION, metadata.comparator); + if (marker != null) + builder.add(marker); + } + return builder.build(); + } + + /** + * The number of "operations" contained in the update. + *

+ * This is used by {@code Memtable} to approximate how much work this update does. In practice, this + * count how many rows are updated and how many ranges are deleted by the partition update. + * + * @return the number of "operations" performed by the update. + */ + @Override + public int operationCount() + { + return rowCountIncludingStatic + tombstoneCount; + } + + /** + * The size of the data contained in this update. + * + * @return the size of the data contained in this update. + */ + @Override + public int dataSize() + { + return dataSize; + } + + /** + * The size of the data contained in this update. + * + * @return the size of the data contained in this update. + */ + @Override + public long unsharedHeapSize() + { + assert trie instanceof InMemoryDeletionAwareTrie; + InMemoryDeletionAwareTrie inMemoryTrie = (InMemoryDeletionAwareTrie) trie; + long heapSize = inMemoryTrie.usedSizeOnHeap(); + for (Object o : inMemoryTrie.values()) + { + if (o instanceof RowData) + heapSize += ((RowData) o).unsharedHeapSizeExcludingData(); + else + heapSize += ((DeletionInfo) o).unsharedHeapSize(); + } + return heapSize; + } + + /** + * Validates the data contained in this update. + * + * @throws org.apache.cassandra.serializers.MarshalException if some of the data contained in this update is corrupted. + */ + @Override + public void validate() + { + for (Iterator it = rowsIncludingStatic(); it.hasNext();) + { + Row row = it.next(); + metadata().comparator.validate(row.clustering()); + for (ColumnData cd : row) + cd.validate(); + } + } + + /** + * The maximum timestamp used in this update. + * + * @return the maximum timestamp used in this update. 
+ */ + @Override + public long maxTimestamp() + { + long maxTimestamp = LivenessInfo.NO_TIMESTAMP; + for (Iterator it = trie.deletionBranchAtRoot().valueIterator(); it.hasNext();) + { + TrieTombstoneMarker next = it.next(); + DeletionTime pointDeletion = next.pointDeletion(); + if (pointDeletion != null) + maxTimestamp = Math.max(maxTimestamp, pointDeletion.markedForDeleteAt()); + DeletionTime rightDeletion = next.rightDeletion(); // we can ignore left side as it has appeared on the right first + if (rightDeletion != null) + maxTimestamp = Math.max(maxTimestamp, rightDeletion.markedForDeleteAt()); + } + for (Iterator it = rowsIncludingStatic(); it.hasNext();) + maxTimestamp = Math.max(maxTimestamp, Rows.collectMaxTimestamp(it.next())); + + return maxTimestamp; + } + + /** + * For an update on a counter table, returns a list containing a {@code CounterMark} for + * every counter contained in the update. + * + * @return a list with counter marks for every counter in this update. + */ + @Override + public List collectCounterMarks() + { + assert metadata().isCounter(); + // We will take aliases on the rows of this update, and update them in-place. So we should be sure the + // update is now immutable for all intent and purposes. + List marks = new ArrayList<>(); + for (Iterator it = rowsIncludingStatic(); it.hasNext();) + { + Row row = it.next(); + addMarksForRow(row, marks); + } + return marks; + } + + private void addMarksForRow(Row row, List marks) + { + for (Cell cell : row.cells()) + { + if (cell.isCounterCell()) + marks.add(new CounterMark(this, row, cell.column(), cell.path())); + } + } + + @Override + public void setCounterMarkValue(CounterMark mark, ByteBuffer value) + { + // Please read the warning in BTreeRow.setValue before using this method. 
+ BTreeRow row = (BTreeRow) mark.row(); + row.setValue(mark.column(), mark.path(), value); + } + + @Override + public PartitionUpdate withOnlyPresentColumns() + { + Set columnSet = new HashSet<>(); + + for (Row row : rows()) + for (ColumnData column : row) + columnSet.add(column.column()); + + RegularAndStaticColumns columns = RegularAndStaticColumns.builder().addAll(columnSet).build(); + return new TriePartitionUpdateStage3(metadata, partitionKey, columns, stats, rowCountIncludingStatic, tombstoneCount, dataSize, trie); + } + + /** + * Builder for PartitionUpdates + * + * This class is not thread safe, but the PartitionUpdate it produces is (since it is immutable). + */ + public static class Builder implements PartitionUpdate.Builder + { + private final TableMetadata metadata; + private final ColumnFilter cf; + private final DecoratedKey key; + private final RegularAndStaticColumns columns; + private final InMemoryDeletionAwareTrie trie = InMemoryDeletionAwareTrie.shortLived(BYTE_COMPARABLE_VERSION); + private final InMemoryDeletionAwareTrie.Mutator mutator; + private final EncodingStats.Collector statsCollector = new EncodingStats.Collector(); + private int rowCountIncludingStatic; + private int tombstoneCount; + private long dataSize; + + public Builder(TableMetadata metadata, + DecoratedKey key, + RegularAndStaticColumns columns) + { + this.metadata = metadata; + this.key = key; + this.columns = columns; + rowCountIncludingStatic = 0; + tombstoneCount = 0; + dataSize = 0; + cf = ColumnFilter.all(metadata); + mutator = trie.mutator(this::mergeIncomingRow, + this::mergeTombstones, + this::applyIncomingTombstone, + this::applyExistingTombstoneToIncomingRow, + true, + Predicates.alwaysFalse(), + Predicates.alwaysFalse()); + } + + /** + * Adds a row to this update. + *

+ * There is no particular assumption made on the order of rows added to a partition update. It is further + * allowed to add the same row more than once (more precisely, multiple row objects for the same clustering). + *

+ * Note however that the columns contained in the added row must be a subset of the columns used when + * creating this update. + * + * @param row the row to add. + */ + public void add(Row row) + { + if (row.isEmpty()) + return; + + // this assert is expensive, and possibly of limited value; we should consider removing it + // or introducing a new class of assertions for test purposes + assert (row.isStatic() ? columns().statics : columns().regulars).containsAll(row.columns()) + : (row.isStatic() ? columns().statics : columns().regulars) + " is not superset of " + row.columns(); + + try + { + // We do not look for atomicity here, so can do the two steps separately. + // TODO: Direct insertion methods (singleton known to not be deleted, deletion known to not delete anything) + Clustering clustering = row.clustering(); + DeletionTime deletionTime = row.deletion().time(); + + ByteComparable comparableClustering = metadata.comparator.asByteComparable(clustering); + if (!deletionTime.isLive()) + { + putRowDeletionInTrie(comparableClustering, + deletionTime); + } + if (!row.isEmptyAfterDeletion()) + { + mutator.apply(DeletionAwareTrie.singleton(comparableClustering, BYTE_COMPARABLE_VERSION, row)); + } + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + Rows.collectStats(row, statsCollector); + } + + private void putRowDeletionInTrie(ByteComparable key, + DeletionTime deletionTime) + { + try + { + mutator.delete(RangeTrie.point(key, + BYTE_COMPARABLE_VERSION, + true, + TrieTombstoneMarker.point(deletionTime, TrieTombstoneMarker.Kind.ROW))); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + + private void putPartitionDeletionInTrie(DeletionTime deletionTime) + { + try + { + mutator.delete(RangeTrie.branch(ByteComparable.EMPTY, + BYTE_COMPARABLE_VERSION, + TrieTombstoneMarker.covering(deletionTime, TrieTombstoneMarker.Kind.PARTITION))); + } + catch (TrieSpaceExhaustedException e) + { + throw new 
AssertionError(e); + } + } + + private void putDeletionInTrie(ByteComparable start, ByteComparable end, DeletionTime deletionTime) + { + try + { + mutator.delete(RangeTrie.range(start, true, + end, false, + BYTE_COMPARABLE_VERSION, + TrieTombstoneMarker.covering(deletionTime, TrieTombstoneMarker.Kind.RANGE))); + statsCollector.update(deletionTime); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + + public void addPartitionDeletion(DeletionTime deletionTime) + { + if (!deletionTime.isLive()) + putPartitionDeletionInTrie(deletionTime); + } + + public void add(RangeTombstone range) + { + putDeletionInTrie(metadata.comparator.asByteComparable(range.deletedSlice().start()), + metadata.comparator.asByteComparable(range.deletedSlice().end()), + range.deletionTime()); + } + + public DecoratedKey partitionKey() + { + return key; + } + + public TableMetadata metadata() + { + return metadata; + } + + public TriePartitionUpdateStage3 build() + { + try + { + trie.putRecursive(ByteComparable.EMPTY, PARTITION_MARKER, noConflictInData()); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + TriePartitionUpdateStage3 pu = new TriePartitionUpdateStage3(metadata, + partitionKey(), + columns, + statsCollector.get(), + rowCountIncludingStatic, + tombstoneCount, + Ints.saturatedCast(dataSize), + trie); + + return pu; + } + + RowData mergeIncomingRow(Object existing, Row update) + { + if (existing != null) + { + // this is not expected to happen much, so going through toRow and the existing size is okay + RowData rowData = (RowData) existing; + update = Rows.merge(rowData.toRow(update.clustering(), DeletionTime.LIVE), update); + dataSize += update.dataSize() - rowData.dataSize(); + } + else + { + ++rowCountIncludingStatic; + dataSize += update.dataSize(); + } + + return rowToData(update); + } + + private Row applyExistingTombstoneToIncomingRow(TrieTombstoneMarker trieTombstoneMarker, Row o) + { + DeletionTime 
deletion = trieTombstoneMarker.applicableToPointForward(); + if (deletion == null) + return o; + return o.filter(cf, deletion, false, metadata); + } + + private Object applyIncomingTombstone(Object o, TrieTombstoneMarker trieTombstoneMarker) + { + DeletionTime deletion = trieTombstoneMarker.applicableToPointForward(); + if (deletion == null) + return o; + RowData row = (RowData) o; + return row.delete(deletion); + } + + private TrieTombstoneMarker mergeTombstones(TrieTombstoneMarker existing, TrieTombstoneMarker update) + { + if (existing == null) + { + // We are adding a new tombstone. + ++tombstoneCount; + return update; + } + else + { + TrieTombstoneMarker merged = update.mergeWith(existing); + if (merged == null || !merged.isBoundary()) + --tombstoneCount; // dropped the existing tombstone (covered by a newer one) + return merged; + } + } + + public RegularAndStaticColumns columns() + { + return columns; + } + + @Override + public DeletionTime partitionLevelDeletion() + { + return TrieTombstoneMarker.applicableDeletionOrLive(trie, ByteComparable.EMPTY); + } + + @Override + public String toString() + { + return "Builder{" + + "metadata=" + metadata + + ", key=" + key + + ", columns=" + columns + + '}'; + } + } + + public static class TrieFactory implements Factory + { + + @Override + public PartitionUpdate.Builder builder(TableMetadata metadata, DecoratedKey partitionKey, RegularAndStaticColumns columns, int initialRowCapacity) + { + return new TriePartitionUpdateStage3.Builder(metadata, partitionKey, columns); + } + + @Override + public PartitionUpdate emptyUpdate(TableMetadata metadata, DecoratedKey partitionKey) + { + return TriePartitionUpdateStage3.emptyUpdate(metadata, partitionKey); + } + + @Override + public PartitionUpdate singleRowUpdate(TableMetadata metadata, DecoratedKey valueKey, Row row) + { + return TriePartitionUpdateStage3.singleRowUpdate(metadata, valueKey, row); + } + + @Override + public PartitionUpdate fullPartitionDelete(TableMetadata 
metadata, DecoratedKey key, long timestamp, long nowInSec) + { + return TriePartitionUpdateStage3.fullPartitionDelete(metadata, key, timestamp, nowInSec); + } + + @Override + public PartitionUpdate fromIterator(UnfilteredRowIterator iterator) + { + return TriePartitionUpdateStage3.fromIterator(iterator); + } + + @Override + public PartitionUpdate fromIterator(UnfilteredRowIterator iterator, ColumnFilter filter) + { + return TriePartitionUpdateStage3.fromIterator(UnfilteredRowIterators.withOnlyQueriedData(iterator, filter)); + } + } +} diff --git a/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdater.java b/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdater.java index 832c38dbf3de..110140f691b3 100644 --- a/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdater.java +++ b/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdater.java @@ -18,152 +18,219 @@ package org.apache.cassandra.db.partitions; -import org.apache.cassandra.db.Clustering; -import org.apache.cassandra.db.DeletionInfo; +import javax.annotation.Nullable; + import org.apache.cassandra.db.DeletionTime; import org.apache.cassandra.db.LivenessInfo; -import org.apache.cassandra.db.marshal.ByteArrayAccessor; import org.apache.cassandra.db.memtable.TrieMemtable; -import org.apache.cassandra.db.rows.BTreeRow; -import org.apache.cassandra.db.tries.InMemoryTrie; -import org.apache.cassandra.index.transactions.UpdateTransaction; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.utils.bytecomparable.ByteComparable; -import org.apache.cassandra.utils.memory.Cloner; - -import static org.apache.cassandra.db.partitions.TrieBackedPartition.RowData; - -/** - * The function we provide to the trie utilities to perform any partition and row inserts and updates - */ -public final class TriePartitionUpdater -extends BasePartitionUpdater -implements InMemoryTrie.UpsertTransformerWithKeyProducer +import org.apache.cassandra.db.rows.Cell; +import 
org.apache.cassandra.db.rows.CellData; +import org.apache.cassandra.db.rows.Cells; +import org.apache.cassandra.db.rows.TrieBackedRow; +import org.apache.cassandra.db.memtable.TrieCellData; +import org.apache.cassandra.db.rows.TrieTombstoneMarker; +import org.apache.cassandra.db.tries.DeletionAwareTrie; +import org.apache.cassandra.db.tries.InMemoryDeletionAwareTrie; +import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; + +import static org.apache.cassandra.db.memtable.TrieMemtable.PartitionData; + +/// The function we provide to the trie utilities to perform any partition and row inserts and updates. +/// This version is used when no secondary index is applied, which makes the process quite a bit simpler. +public class TriePartitionUpdater { - private final UpdateTransaction indexer; - private final TableMetadata metadata; - private TrieMemtable.PartitionData currentPartition; private final TrieMemtable.MemtableShard owner; - public int partitionsAdded = 0; + protected final InMemoryDeletionAwareTrie.Mutator mutator; - public TriePartitionUpdater(Cloner cloner, - UpdateTransaction indexer, - TableMetadata metadata, - TrieMemtable.MemtableShard owner) + public long dataSize; + public long colUpdateTimeDelta; + public int partitionsAdded; + + /// Holds a reference to the current partition's statistics, used to update them when merging data. 
+ protected PartitionData currentPartition; + + public TriePartitionUpdater(TrieMemtable.MemtableShard owner, + InMemoryDeletionAwareTrie data) { - super(cloner); - this.indexer = indexer; - this.metadata = metadata; this.owner = owner; + this.mutator = data.mutator(this::mergeData, + this::mergeMarkers, + this::applyIncomingMarker, + this::applyExistingMarkerToIncomingRow, + true, + TrieMemtable.FORCE_COPY_PARTITION_BOUNDARY, + x -> { throw new AssertionError("Force copy should already be in effect for all range tries"); }); } - @Override - public Object apply(Object existing, Object update, InMemoryTrie.KeyProducer keyState) + /// Merge the given update into the data trie. + public void mergeUpdate(DeletionAwareTrie update) throws TrieSpaceExhaustedException { - if (update instanceof RowData) - return applyRow((RowData) existing, (RowData) update, keyState); - else if (update instanceof DeletionInfo) - return applyDeletion((TrieMemtable.PartitionData) existing, (DeletionInfo) update); + this.currentPartition = null; + this.partitionsAdded = 0; + this.dataSize = 0; + this.colUpdateTimeDelta = Long.MAX_VALUE; + + mutator.apply(update); + } + + /// Merge incoming live data (cell, liveness info or various level markers) with existing content. + Object mergeData(@Nullable Object existing, Object update) + { + // Most common case first + if (update instanceof CellData) + return applyCell((TrieCellData) existing, (CellData) update); + else if (update == TrieBackedRow.COMPLEX_COLUMN_MARKER) + return update; + else if (update instanceof LivenessInfo) + return applyIncomingRowMarker((LivenessInfo) existing, (LivenessInfo) update); + else if (update == TrieBackedPartition.PARTITION_MARKER) + return mergePartitionMarkers((PartitionData) existing); else throw new AssertionError("Unexpected update type: " + update.getClass()); } - /** - * Called when a row needs to be copied to the Memtable trie. 
- * - * @param existing Existing RowData for this clustering, or null if there isn't any. - * @param insert RowData to be inserted. - * @param keyState Used to obtain the path through which this node was reached. - * @return the insert row, or the merged row, copied using our allocator - */ - private RowData applyRow(RowData existing, RowData insert, InMemoryTrie.KeyProducer keyState) + /// Merge an incoming tombstone with existing deletions. + /// This will be called for all boundary tombstones in the update, but also for all existing boundaries that are + /// covered by an incoming range. + TrieTombstoneMarker mergeMarkers(@Nullable TrieTombstoneMarker existing, TrieTombstoneMarker update) { if (existing == null) { - RowData data = insert.clone(cloner); - - if (indexer != UpdateTransaction.NO_OP) - indexer.onInserted(data.toRow(clusteringFor(keyState))); - - this.dataSize += data.dataSize(); - this.heapSize += data.unsharedHeapSizeExcludingData(); - currentPartition.markInsertedRows(1); // null pointer here means a problem in applyDeletion - return data; + currentPartition.markAddedTombstones(1); + return update; } else { - // data and heap size are updated during merge through the PostReconciliationFunction interface - RowData reconciled = merge(existing, insert); + TrieTombstoneMarker merged = update.mergeWith(existing); + return merged; + } + } - if (indexer != UpdateTransaction.NO_OP) - { - Clustering clustering = clusteringFor(keyState); - indexer.onUpdated(existing.toRow(clustering), reconciled.toRow(clustering)); - } + /// Apply an incoming tombstone to existing data, possibly removing it from the trie. 
+ Object applyIncomingMarker(Object existingContent, TrieTombstoneMarker updateMarker) + { + DeletionTime deletion = updateMarker.applicableToPointForward(); + if (deletion == null) + return existingContent; + + // Most common case first + if (existingContent instanceof CellData) + return applyCellDeletion((CellData) existingContent, deletion); + else if (existingContent == TrieBackedRow.COMPLEX_COLUMN_MARKER) + return existingContent; + else if (existingContent instanceof LivenessInfo) + return applyRowDeletion((LivenessInfo) existingContent, deletion); + else if (existingContent instanceof PartitionData) + return applyPartitionDeletion((PartitionData) existingContent, deletion); + else + throw new AssertionError("Unexpected content in trie " + existingContent + " for deletion " + updateMarker); + } - return reconciled; - } + protected CellData applyCellDeletion(CellData existingContent, DeletionTime deletion) + { + if (!deletion.deletes(existingContent)) + return existingContent; + dataSize -= existingContent.valueSize(); + return null; } - private RowData merge(RowData existing, RowData update) + Object applyPartitionDeletion(PartitionData existing, DeletionTime unused) { + existing.clearStats(); + return existing; + } - LivenessInfo livenessInfo = LivenessInfo.merge(update.livenessInfo, existing.livenessInfo); - DeletionTime deletion = DeletionTime.merge(update.deletion, existing.deletion); - if (deletion.deletes(livenessInfo)) - livenessInfo = LivenessInfo.EMPTY; + LivenessInfo applyRowDeletion(LivenessInfo existing, DeletionTime deletion) + { + if (deletion.deletes(existing)) + { + return LivenessInfo.EMPTY; + // TODO: and also do currentPartition.markInsertedRows(-1) in that case? + // TODO: Does strict row liveness apply here? How do we drop tail trie if it does? 
+ } + return existing; + } - Object[] tree = BTreeRow.mergeRowBTrees(this, - existing.columnsBTree, update.columnsBTree, - deletion, existing.deletion); - return new RowData(tree, livenessInfo, deletion); + /// Apply an existing tombstone to incoming data before merging that data in the trie. + Object applyExistingMarkerToIncomingRow(TrieTombstoneMarker marker, Object content) + { + DeletionTime rowDeletion = marker.applicableToPointForward(); + if (rowDeletion == null) + return content; // there is no row deletion here + + // No size tracking is needed, because the result of this gets applied to the trie with applyRow. + if (content instanceof Cell) + return rowDeletion.deletes((Cell) content) ? null : content; + else if (content == TrieBackedRow.COMPLEX_COLUMN_MARKER) + return content; + else if (content instanceof LivenessInfo) + { + if (!rowDeletion.deletes((LivenessInfo) content)) + return content; + else + return LivenessInfo.EMPTY; + } + else if (content instanceof PartitionData) + return content; + else + throw new AssertionError("Unexpected content in trie " + content + " for deletion " + marker); } - private Clustering clusteringFor(InMemoryTrie.KeyProducer keyState) + LivenessInfo applyIncomingRowMarker(@Nullable LivenessInfo existing, LivenessInfo insert) { - return metadata.comparator.clusteringFromByteComparable( - ByteArrayAccessor.instance, - ByteComparable.preencoded(TrieBackedPartition.BYTE_COMPARABLE_VERSION, - keyState.getBytes(TrieMemtable.IS_PARTITION_BOUNDARY))); + if (existing == null) + { + this.dataSize += insert.dataSize(); + currentPartition.markInsertedRows(1); // null pointer here means a problem in applyDeletion + return insert; + } + else + { + LivenessInfo reconciled = LivenessInfo.merge(existing, insert); + if (reconciled != existing) + this.dataSize += reconciled.dataSize() - existing.dataSize(); + + return reconciled; + } } - /** - * Called at the partition boundary to merge the existing and new metadata associated with the 
partition. This needs - * to update the deletion time with any new deletion introduced by the update, but also make sure that the - * statistics we track for the partition (dataSize) are updated for the changes caused by merging the update's rows - * (note that this is called _after_ the rows of the partition have been merged, on the return path of the - * recursion). - * - * @param existing Any partition data already associated with the partition. - * @param update The update, always non-null. - * @return the combined partition data, copying any updated deletion information to heap. - */ - private TrieMemtable.PartitionData applyDeletion(TrieMemtable.PartitionData existing, DeletionInfo update) + CellData applyCell(@Nullable TrieCellData existing, CellData update) { - if (indexer != UpdateTransaction.NO_OP) + if (existing == null) + { + this.dataSize += update.valueSize(); + return update; + } + else { - if (!update.getPartitionDeletion().isLive()) - indexer.onPartitionDeletion(update.getPartitionDeletion()); - if (update.hasRanges()) - update.rangeIterator(false).forEachRemaining(indexer::onRangeTombstone); + CellData reconciled = Cells.reconcile(existing, update); + if (reconciled != existing) + { + long timeDelta = Math.abs(reconciled.timestamp() - existing.timestamp()); + if (timeDelta < colUpdateTimeDelta) + colUpdateTimeDelta = timeDelta; + this.dataSize += reconciled.valueSize() - existing.valueSize(); + } + return reconciled; } + } + /// Called at the partition boundary to merge the existing and new metadata associated with the partition. This needs + /// to make sure that the statistics we track for the partition (dataSize) are updated for the changes caused by + /// merging the update's rows. + /// + /// @param existing Any partition data already associated with the partition. + /// @return the combined partition data, creating a new marker if one did not already exist. 
+ protected PartitionData mergePartitionMarkers(@Nullable PartitionData existing) + { if (existing == null) { - // Note: Always on-heap, regardless of cloner - TrieMemtable.PartitionData newRef = new TrieMemtable.PartitionData(update, owner); - this.heapSize += newRef.unsharedHeapSize(); + PartitionData newRef = new PartitionData(owner); ++this.partitionsAdded; return currentPartition = newRef; } assert owner == existing.owner; - if (update.isLive() || !update.mayModify(existing)) - return currentPartition = existing; - - // Note: Always on-heap, regardless of cloner - TrieMemtable.PartitionData merged = new TrieMemtable.PartitionData(existing, update); - this.heapSize += merged.unsharedHeapSize() - existing.unsharedHeapSize(); - return currentPartition = merged; + return currentPartition = existing; } } diff --git a/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdaterLegacyIndex.java b/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdaterLegacyIndex.java new file mode 100644 index 000000000000..3506daa6b4e9 --- /dev/null +++ b/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdaterLegacyIndex.java @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.partitions; + +import javax.annotation.Nullable; + +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ClusteringBound; +import org.apache.cassandra.db.Columns; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.LivenessInfo; +import org.apache.cassandra.db.RangeTombstone; +import org.apache.cassandra.db.Slice; +import org.apache.cassandra.db.marshal.ByteArrayAccessor; +import org.apache.cassandra.db.memtable.TrieCellData; +import org.apache.cassandra.db.memtable.TrieMemtable; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellData; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.TrieBackedRow; +import org.apache.cassandra.db.rows.TrieTombstoneMarker; +import org.apache.cassandra.db.tries.InMemoryDeletionAwareTrie; +import org.apache.cassandra.index.transactions.UpdateTransaction; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +import static org.apache.cassandra.db.memtable.TrieMemtable.PartitionData; + +/// The function we provide to the trie utilities to perform any partition and row inserts and updates when a legacy +/// secondary index is in use. This builds on the plain [TriePartitionUpdater] and prepares legacy interpretations of +/// the changes for the indexing calls. +public final class TriePartitionUpdaterLegacyIndex extends TriePartitionUpdater +{ + private UpdateTransaction indexer; + private TableMetadata metadata; + /// When a tombstone boundary opens a range, we store the position here to report a range tombstone when it closes. + private ClusteringBound rangeTombstoneOpenPosition; + /// The depth at which we saw the root of the current partition. Used to obtain clustering keys of modified rows. 
+ private int currentPartitionDepth; + /// The depth at which we saw the root of the current row. Used to obtain cell columns and paths. + private int currentRowDepth; + /// The column set for the current row (differs for static rows) + private Columns currentColumns; + + public TriePartitionUpdaterLegacyIndex(TrieMemtable.MemtableShard owner, + InMemoryDeletionAwareTrie data, + TableMetadata metadata) + { + super(owner, data); + this.metadata = metadata; + } + + /// Set the update transaction to use + public void setIndexContext(UpdateTransaction indexer) + { + this.indexer = indexer; + this.rangeTombstoneOpenPosition = null; + assert indexer != UpdateTransaction.NO_OP; + } + + @Override + public TrieTombstoneMarker mergeMarkers(@Nullable TrieTombstoneMarker existing, TrieTombstoneMarker update) + { + TrieTombstoneMarker merged = super.mergeMarkers(existing, update); + if (merged == null) + return merged; + + if (merged.hasLevelMarker(TrieTombstoneMarker.LevelMarker.ROW)) + processRowDeletionUpdate(existing != null ? existing.applicableToPointForward() : null, merged.applicableToPointForward()); + else if (update.isBoundary()) + processMarkerBoundary(merged); + + return merged; + } + + private void processRowDeletionUpdate(DeletionTime existing, DeletionTime updated) + { + Clustering clustering = metadata.comparator.clusteringFromByteComparable( + ByteArrayAccessor.instance, + byteComparableForCurrentDeletionBranchKey()); + + Row.Deletion updatedDeletion = updated != null ? 
Row.Deletion.regular(updated) : Row.Deletion.LIVE; + if (existing == null) + indexer.startRow(clustering, null, null, LivenessInfo.EMPTY, updatedDeletion); + else + indexer.startRow(clustering, LivenessInfo.EMPTY, Row.Deletion.regular(existing), LivenessInfo.EMPTY, updatedDeletion); + + currentRowDepth = mutator.getDeletionBranchDepth(); + currentColumns = metadata.regularAndStaticColumns().columns(clustering == Clustering.STATIC_CLUSTERING); + } + + private void processMarkerBoundary(TrieTombstoneMarker update) + { + TrieTombstoneMarker.Covering leftSide = update.leftDeletion(); + TrieTombstoneMarker.Covering rightSide = update.rightDeletion(); + TrieTombstoneMarker.Kind leftKind = leftSide != null ? leftSide.deletionKind() : null; + TrieTombstoneMarker.Kind rightKind = rightSide != null ? rightSide.deletionKind() : null; + + assert leftKind != TrieTombstoneMarker.Kind.ROW && rightKind != TrieTombstoneMarker.Kind.ROW + : "Row deletion without row level marker: " + update; + + // We need to report column deletions. Do so by issuing it on the open side. + // Indexer ignores existing deletions, so we don't need to report them here. + if (rightKind == TrieTombstoneMarker.Kind.COLUMN) + { + byte[] cellPath = mutator.getDeletionBranchKeyBytes(currentRowDepth); + ColumnMetadata column = TrieBackedRow.columnMetadataFromPath(cellPath, cellPath.length, currentColumns); + indexer.onComplexColumnDeletion(column, rightSide); + return; + } + // For range tombstones, we should only report when they start and stop. This means ignoring all switches + // that include a lower-level change. + if (leftKind == TrieTombstoneMarker.Kind.COLUMN) + return; + + // We should also skip the sides that switch to or from the partition deletion. 
+ if (leftKind == TrieTombstoneMarker.Kind.PARTITION) + leftSide = null; + if (rightKind == TrieTombstoneMarker.Kind.PARTITION) + rightSide = null; + + if (leftSide != null || rightSide != null) + processRangeTombstoneMarker(leftSide, rightSide); + } + + private void processRangeTombstoneMarker(TrieTombstoneMarker.Covering leftSide, TrieTombstoneMarker.Covering rightSide) + { + ByteComparable deletionBranchKey = byteComparableForCurrentDeletionBranchKey(); + if (rangeTombstoneOpenPosition != null) + { + // We have an active range. The incoming marker's left side must close it. Combine with the start + // position to form the tombstone range we report to the indexer. + assert leftSide != null; // open markers are always closed + ClusteringBound bound = metadata.comparator.boundFromByteComparable( + ByteArrayAccessor.instance, + deletionBranchKey, + true); + indexer.onRangeTombstone(new RangeTombstone(Slice.make(rangeTombstoneOpenPosition, + bound), + leftSide)); + } + else + assert leftSide == null; + + if (rightSide != null) + { + // The right side of the marker tells us if this boundary opens a new deletion. If so, store the + // position to report the range when it closes. + // Note: we don't need to save the deletion time as the closing side will repeat it. + rangeTombstoneOpenPosition = metadata.comparator.boundFromByteComparable( + ByteArrayAccessor.instance, + deletionBranchKey, + false); + } + else + rangeTombstoneOpenPosition = null; + } + + @Override + Object applyIncomingMarker(Object existingContent, TrieTombstoneMarker updateMarker) + { + // We override this to make sure we mark the row start when we start deleting from it, so that we can report + // all removed cells. 
+ if (existingContent instanceof LivenessInfo) + return applyRowDeletion((LivenessInfo) existingContent, updateMarker.applicableToPointForward()); + else + return super.applyIncomingMarker(existingContent, updateMarker); + } + + @Override + protected CellData applyCellDeletion(CellData existingContent, DeletionTime deletion) + { + CellData mergedCellData = super.applyCellDeletion(existingContent, deletion); + if (mergedCellData == existingContent) + return mergedCellData; + + byte[] cellPath = mutator.getCurrentKeyBytes(currentRowDepth); + Cell existingAsCell = TrieBackedRow.cellFromCellData(existingContent, cellPath, cellPath.length, currentColumns); + assert mergedCellData == null; + indexer.onCellUpdate(existingAsCell, null); + return mergedCellData; + } + + @Override + public LivenessInfo applyRowDeletion(LivenessInfo existing, DeletionTime deletion) + { + LivenessInfo mergedInfo = deletion != null ? super.applyRowDeletion(existing, deletion) : existing; + Clustering clustering = clusteringForCurrentKey(); + indexer.startRow(clustering, existing, Row.Deletion.LIVE, mergedInfo, deletion != null ? 
Row.Deletion.regular(deletion) : Row.Deletion.LIVE); + + currentRowDepth = mutator.currentDepth(); + currentColumns = metadata.regularAndStaticColumns().columns(clustering == Clustering.STATIC_CLUSTERING); + return mergedInfo; + } + + @Override + Object applyPartitionDeletion(PartitionData existing, DeletionTime deletion) + { + indexer.onPartitionDeletion(deletion); + return super.applyPartitionDeletion(existing, deletion); + } + + @Override + protected PartitionData mergePartitionMarkers(@Nullable PartitionData existing) + { + currentPartitionDepth = mutator.currentDepth(); + return super.mergePartitionMarkers(existing); + } + + @Override + LivenessInfo applyIncomingRowMarker(@Nullable LivenessInfo existing, LivenessInfo insert) + { + Clustering clustering = clusteringForCurrentKey(); + LivenessInfo mergedInfo = super.applyIncomingRowMarker(existing, insert); + indexer.startRow(clustering, existing, Row.Deletion.LIVE, mergedInfo, Row.Deletion.LIVE); + currentRowDepth = mutator.currentDepth(); + currentColumns = metadata.regularAndStaticColumns().columns(clustering == Clustering.STATIC_CLUSTERING); + return mergedInfo; + } + + @Override + CellData applyCell(@Nullable TrieCellData existing, CellData update) + { + CellData mergedCellData = super.applyCell(existing, update); + byte[] cellPath = mutator.getCurrentKeyBytes(currentRowDepth); + Cell existingAsCell = null; + Cell mergedAsCell = null; + if (mergedCellData != null) + { + mergedAsCell = TrieBackedRow.cellFromCellData(mergedCellData, cellPath, cellPath.length, currentColumns); + if (existing != null) + existingAsCell = existing.toCell(mergedAsCell.column(), mergedAsCell.path()); + } + else + { + assert existing != null; + existingAsCell = TrieBackedRow.cellFromCellData(existing, cellPath, cellPath.length, currentColumns); + } + indexer.onCellUpdate(existingAsCell, mergedAsCell); + + return mergedCellData; + } + + private ByteComparable byteComparableForCurrentDeletionBranchKey() + { + return 
ByteComparable.preencoded(mutator.byteComparableVersion(), + mutator.getDeletionBranchKeyBytes()); + } + + private Clustering clusteringForCurrentKey() + { + return metadata.comparator.clusteringFromByteComparable( + ByteArrayAccessor.instance, + ByteComparable.preencoded(mutator.byteComparableVersion(), + mutator.getCurrentKeyBytes(currentPartitionDepth))); + } +} diff --git a/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdaterStage2.java b/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdaterStage2.java new file mode 100644 index 000000000000..1660abdaffa2 --- /dev/null +++ b/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdaterStage2.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.partitions; + +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.DeletionInfo; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.LivenessInfo; +import org.apache.cassandra.db.marshal.ByteArrayAccessor; +import org.apache.cassandra.db.memtable.TrieMemtableStage2; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.tries.InMemoryBaseTrie; +import org.apache.cassandra.db.tries.InMemoryTrie; +import org.apache.cassandra.db.tries.Trie; +import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; +import org.apache.cassandra.index.transactions.UpdateTransaction; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.memory.Cloner; + +import static org.apache.cassandra.db.partitions.TrieBackedPartitionStage2.RowData; + +/** + * The function we provide to the trie utilities to perform any partition and row inserts and updates + */ +public final class TriePartitionUpdaterStage2 +extends BasePartitionUpdater +implements InMemoryBaseTrie.UpsertTransformer +{ + private final UpdateTransaction indexer; + private final TableMetadata metadata; + private TrieMemtableStage2.PartitionData currentPartition; + private int currentPartitionDepth; + private final TrieMemtableStage2.MemtableShard owner; + public int partitionsAdded = 0; + private InMemoryTrie.Mutator mutator; + + public TriePartitionUpdaterStage2(Cloner cloner, + UpdateTransaction indexer, + TableMetadata metadata, + TrieMemtableStage2.MemtableShard owner) + { + super(cloner); + this.indexer = indexer; + this.metadata = metadata; + this.owner = owner; + } + + public void apply(InMemoryTrie data, Trie update) throws TrieSpaceExhaustedException + { + mutator = data.mutator(this, TrieMemtableStage2.FORCE_COPY_PARTITION_BOUNDARY); + mutator.apply(update); + } + + @Override + public Object apply(Object 
existing, Object update) + { + if (update instanceof RowData) + return applyRow((RowData) existing, (RowData) update); + else if (update instanceof DeletionInfo) + return applyDeletion((TrieMemtableStage2.PartitionData) existing, (DeletionInfo) update); + else + throw new AssertionError("Unexpected update type: " + update.getClass()); + } + + /** + * Called when a row needs to be copied to the Memtable trie. + * + * @param existing Existing RowData for this clustering, or null if there isn't any. + * @param insert RowData to be inserted. + * @return the insert row, or the merged row, copied using our allocator + */ + private RowData applyRow(RowData existing, RowData insert) + { + if (existing == null) + { + RowData data = insert.clone(cloner); + + if (indexer != UpdateTransaction.NO_OP) + indexer.onInserted(data.toRow(clusteringForCurrentKey())); + + this.dataSize += data.dataSize(); + this.heapSize += data.unsharedHeapSizeExcludingData(); + currentPartition.markInsertedRows(1); // null pointer here means a problem in applyDeletion + return data; + } + else + { + // data and heap size are updated during merge through the PostReconciliationFunction interface + RowData reconciled = merge(existing, insert); + + if (indexer != UpdateTransaction.NO_OP) + { + Clustering clustering = clusteringForCurrentKey(); + indexer.onUpdated(existing.toRow(clustering), reconciled.toRow(clustering)); + } + + return reconciled; + } + } + + private RowData merge(RowData existing, RowData update) + { + + LivenessInfo livenessInfo = LivenessInfo.merge(update.livenessInfo, existing.livenessInfo); + DeletionTime deletion = DeletionTime.merge(update.deletion, existing.deletion); + if (deletion.deletes(livenessInfo)) + livenessInfo = LivenessInfo.EMPTY; + + Object[] tree = BTreeRow.mergeRowBTrees(this, + existing.columnsBTree, update.columnsBTree, + deletion, existing.deletion); + return new RowData(tree, livenessInfo, deletion); + } + + private Clustering clusteringForCurrentKey() + { + 
return metadata.comparator.clusteringFromByteComparable( + ByteArrayAccessor.instance, + ByteComparable.preencoded(mutator.byteComparableVersion(), + mutator.getCurrentKeyBytes(currentPartitionDepth))); + } + + /** + * Called at the partition boundary to merge the existing and new metadata associated with the partition. This needs + * to update the deletion time with any new deletion introduced by the update, but also set currentPartition to make + * sure that the statistics we track for the partition (dataSize) are updated for the changes caused by merging the + * update's rows and currentPartitionDepth to make it possible to construct clustering keys from the bytes of the + * trie path (note that this is called _before_ the rows of the partition have been merged, on the descent path of + * the recursion). + * + * @param existing Any partition data already associated with the partition. + * @param update The update, always non-null. + * @return the combined partition data, copying any updated deletion information to heap. 
+ */ + private TrieMemtableStage2.PartitionData applyDeletion(TrieMemtableStage2.PartitionData existing, DeletionInfo update) + { + if (indexer != UpdateTransaction.NO_OP) + { + if (!update.getPartitionDeletion().isLive()) + indexer.onPartitionDeletion(update.getPartitionDeletion()); + if (update.hasRanges()) + update.rangeIterator(false).forEachRemaining(indexer::onRangeTombstone); + } + + currentPartitionDepth = mutator.currentDepth(); + if (existing == null) + { + // Note: Always on-heap, regardless of cloner + TrieMemtableStage2.PartitionData newRef = new TrieMemtableStage2.PartitionData(update, owner); + this.heapSize += newRef.unsharedHeapSize(); + ++this.partitionsAdded; + return currentPartition = newRef; + } + + assert owner == existing.owner; + if (update.isLive() || !update.mayModify(existing)) + return currentPartition = existing; + + // Note: Always on-heap, regardless of cloner + TrieMemtableStage2.PartitionData merged = new TrieMemtableStage2.PartitionData(existing, update); + this.heapSize += merged.unsharedHeapSize() - existing.unsharedHeapSize(); + return currentPartition = merged; + } +} diff --git a/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdaterStage3.java b/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdaterStage3.java new file mode 100644 index 000000000000..21ea34631313 --- /dev/null +++ b/src/java/org/apache/cassandra/db/partitions/TriePartitionUpdaterStage3.java @@ -0,0 +1,305 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.partitions; + +import javax.annotation.Nullable; + +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ClusteringBound; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.LivenessInfo; +import org.apache.cassandra.db.RangeTombstone; +import org.apache.cassandra.db.Slice; +import org.apache.cassandra.db.marshal.ByteArrayAccessor; +import org.apache.cassandra.db.memtable.TrieMemtableStage3; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.TrieTombstoneMarker; +import org.apache.cassandra.db.tries.DeletionAwareTrie; +import org.apache.cassandra.db.tries.InMemoryBaseTrie; +import org.apache.cassandra.db.tries.InMemoryDeletionAwareTrie; +import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; +import org.apache.cassandra.index.transactions.UpdateTransaction; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.memory.Cloner; + +import static org.apache.cassandra.db.partitions.TrieBackedPartitionStage3.RowData; + +/** + * The function we provide to the trie utilities to perform any partition and row inserts and updates + */ +public final class TriePartitionUpdaterStage3 +extends BasePartitionUpdater +implements InMemoryBaseTrie.UpsertTransformer +{ + private final UpdateTransaction indexer; + private final TableMetadata metadata; + private TrieMemtableStage3.PartitionData 
currentPartition; + private int currentPartitionDepth; + private final TrieMemtableStage3.MemtableShard owner; + private ClusteringBound rangeTombstoneOpenPosition = null; + private InMemoryDeletionAwareTrie.Mutator mutator; + public int partitionsAdded = 0; + + public TriePartitionUpdaterStage3(InMemoryDeletionAwareTrie data, + Cloner cloner, + UpdateTransaction indexer, + TableMetadata metadata, + TrieMemtableStage3.MemtableShard owner) + { + super(cloner); + this.indexer = indexer; + this.metadata = metadata; + this.owner = owner; + this.mutator = data.mutator(this, + this::mergeMarkers, + this::applyIncomingMarker, + this::applyExistingMarkerToIncomingRow, + true, + TrieMemtableStage3.FORCE_COPY_PARTITION_BOUNDARY); + } + + public void apply(DeletionAwareTrie update) + throws TrieSpaceExhaustedException + { + mutator.apply(update); + } + + @Override + public Object apply(@Nullable Object existing, Object update) + { + if (update == TrieBackedPartitionStage3.PARTITION_MARKER) + return mergePartitionMarkers((TrieMemtableStage3.PartitionData) existing); + else if (update instanceof RowData) + return applyIncomingRow((RowData) existing, (RowData) update); + else + throw new AssertionError("Unexpected update type: " + update.getClass()); + } + + public TrieTombstoneMarker mergeMarkers(@Nullable TrieTombstoneMarker existing, TrieTombstoneMarker update) + { + if (indexer != UpdateTransaction.NO_OP) + { + DeletionTime updatePointDeletion = update.pointDeletion(); + if (updatePointDeletion != null) + { + Clustering clustering = metadata.comparator.clusteringFromByteComparable( + ByteArrayAccessor.instance, + byteComparableForCurrentDeletionBranchKey()); + DeletionTime existingPointDeletion = existing != null ? 
existing.pointDeletion() : null; + if (existingPointDeletion != null) + indexer.onUpdated(BTreeRow.emptyDeletedRow(clustering, Row.Deletion.regular(existingPointDeletion)), + BTreeRow.emptyDeletedRow(clustering, Row.Deletion.regular(updatePointDeletion))); + else + indexer.onInserted(BTreeRow.emptyDeletedRow(clustering, Row.Deletion.regular(updatePointDeletion))); + } + else if (update.isBoundary()) + { + if (rangeTombstoneOpenPosition != null) + { + // We have an active range. The incoming marker's left side (preceding in forward direction) must + // close it. Combine with the start position to form the tombstone range we report to the indexer. + DeletionTime deletionTime = update.leftDeletion(); + assert deletionTime != null; // open markers are always closed + ClusteringBound bound = metadata.comparator.boundFromByteComparable( + ByteArrayAccessor.instance, + byteComparableForCurrentDeletionBranchKey(), + true); + indexer.onRangeTombstone(new RangeTombstone(Slice.make(rangeTombstoneOpenPosition, + bound), + deletionTime)); + } + + // The right side (preceding in reverse direction) of the marker tells us if this boundary opens a new + // deletion. If so, store the position to report the range when it closes. + // Note: we don't need to save the deletion time as the closing side will repeat it. + TrieTombstoneMarker.Covering succeeding = update.rightDeletion(); + // Ignore the partition deletion. + if (succeeding != null && succeeding.deletionKind() == TrieTombstoneMarker.Kind.RANGE) + { + rangeTombstoneOpenPosition = metadata.comparator.boundFromByteComparable( + ByteArrayAccessor.instance, + byteComparableForCurrentDeletionBranchKey(), + false); + } + else + { + rangeTombstoneOpenPosition = null; + } + } + } + + if (existing == null) + { + currentPartition.markAddedTombstones(1); + this.heapSize += update.unsharedHeapSize(); + return update; + } + else + { + TrieTombstoneMarker merged = update.mergeWith(existing); + this.heapSize += (merged != null ? 
merged.unsharedHeapSize() : 0) - existing.unsharedHeapSize(); + return merged; + } + } + + public Object applyIncomingMarker(Object existingContent, TrieTombstoneMarker updateMarker) + { + DeletionTime deletion = updateMarker.applicableToPointForward(); + if (deletion == null) + return existingContent; + + if (existingContent instanceof TrieMemtableStage3.PartitionData) + return applyPartitionDeletion((TrieMemtableStage3.PartitionData) existingContent, deletion); + else if (existingContent instanceof RowData) + return applyRowDeletion((RowData) existingContent, deletion); + else + throw new AssertionError("Unexpected content in trie: " + existingContent); + } + + public Object applyPartitionDeletion(TrieMemtableStage3.PartitionData existing, DeletionTime deletion) + { + indexer.onPartitionDeletion(deletion); + existing.clearStats(); + return existing; + } + + public Object applyRowDeletion(RowData existing, DeletionTime deletion) + { + RowData updated = existing.delete(deletion); + if (updated != existing) + this.heapSize += (updated != null ? updated.unsharedHeapSizeExcludingData() : 0) - existing.unsharedHeapSizeExcludingData(); + if (updated == null) + currentPartition.markInsertedRows(-1); + + if (indexer != UpdateTransaction.NO_OP && updated != existing) + { + Clustering clustering = clusteringForCurrentKey(); + if (updated != null) + indexer.onUpdated(existing.toRow(clustering, DeletionTime.LIVE), + updated.toRow(clustering, DeletionTime.LIVE)); + else + indexer.onUpdated(existing.toRow(clustering, DeletionTime.LIVE), + BTreeRow.emptyDeletedRow(clustering, Row.Deletion.regular(deletion))); + } + return updated; + } + + public Object applyExistingMarkerToIncomingRow(TrieTombstoneMarker marker, Object content) + { + DeletionTime deletion = marker.applicableToPointForward(); + if (deletion == null) + return content; + + // This is called to apply an existing tombstone to incoming data, before applyIncomingRow is called on the result.
+ // No size tracking is needed, because the result of this then gets applied to the trie with applyIncomingRow. + assert content instanceof RowData; // must be non-null, and can't be partition root + return ((RowData) content).delete(deletion); + } + + /** + * Called when a row needs to be copied to the Memtable trie. + * + * @param existing Existing RowData for this clustering, or null if there isn't any. + * @param insert RowData to be inserted. + * @return the insert row, or the merged row, copied using our allocator + */ + private RowData applyIncomingRow(@Nullable RowData existing, RowData insert) + { + if (existing == null) + { + RowData data = insert.clone(cloner); + + if (indexer != UpdateTransaction.NO_OP) + indexer.onInserted(data.toRow(clusteringForCurrentKey(), DeletionTime.LIVE)); + + this.dataSize += data.dataSize(); + this.heapSize += data.unsharedHeapSizeExcludingData(); + currentPartition.markInsertedRows(1); // null pointer here means a problem in mergePartitionMarkers + return data; + } + else + { + // data and heap size are updated during merge through the PostReconciliationFunction interface + RowData reconciled = merge(existing, insert); + + if (indexer != UpdateTransaction.NO_OP) + { + Clustering clustering = clusteringForCurrentKey(); + indexer.onUpdated(existing.toRow(clustering, DeletionTime.LIVE), + reconciled.toRow(clustering, DeletionTime.LIVE)); + } + + return reconciled; + } + } + + private RowData merge(RowData existing, RowData update) + { + + LivenessInfo existingLiveness = existing.livenessInfo; + LivenessInfo livenessInfo = LivenessInfo.merge(update.livenessInfo, existingLiveness); + this.heapSize += livenessInfo.unsharedHeapSize() - existingLiveness.unsharedHeapSize(); + + Object[] tree = BTreeRow.mergeRowBTrees(this, + existing.columnsBTree, update.columnsBTree, + DeletionTime.LIVE, DeletionTime.LIVE); + return new RowData(tree, livenessInfo); + } + + /** + * Called at the partition boundary to merge the existing and new metadata associated
with the partition. This needs + * to make sure that the statistics we track for the partition (dataSize) are updated for the changes caused by + * merging the update's rows. + * + * @param existing Any partition data already associated with the partition. + * @return the combined partition data, creating a new marker if one did not already exist. + */ + private TrieMemtableStage3.PartitionData mergePartitionMarkers(@Nullable TrieMemtableStage3.PartitionData existing) + { + currentPartitionDepth = mutator.currentDepth(); + + if (existing == null) + { + // Note: Always on-heap, regardless of cloner + TrieMemtableStage3.PartitionData newRef = new TrieMemtableStage3.PartitionData(owner); + this.heapSize += newRef.unsharedHeapSize(); + ++this.partitionsAdded; + return currentPartition = newRef; + } + + assert owner == existing.owner; + return currentPartition = existing; + } + + private ByteComparable byteComparableForCurrentDeletionBranchKey() + { + return ByteComparable.preencoded(mutator.byteComparableVersion(), + mutator.getDeletionBranchKeyBytes()); + } + + private Clustering clusteringForCurrentKey() + { + return metadata.comparator.clusteringFromByteComparable( + ByteArrayAccessor.instance, + ByteComparable.preencoded(mutator.byteComparableVersion(), + mutator.getCurrentKeyBytes(currentPartitionDepth))); + } +} diff --git a/src/java/org/apache/cassandra/db/rows/AbstractBufferCellData.java b/src/java/org/apache/cassandra/db/rows/AbstractBufferCellData.java new file mode 100644 index 000000000000..f814f763224c --- /dev/null +++ b/src/java/org/apache/cassandra/db/rows/AbstractBufferCellData.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.rows; + +import java.nio.ByteBuffer; + +import org.apache.cassandra.db.DeletionPurger; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.db.context.CounterContext; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.db.marshal.ValueAccessor; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.memory.ByteBufferCloner; +import org.apache.cassandra.utils.memory.Cloner; + +/// Base class for pathless [CellData] objects. +public abstract class AbstractBufferCellData implements CellData +{ + @Override + public ValueAccessor accessor() + { + return ByteBufferAccessor.instance; + } + + @Override + public BufferCellData withNewData(long timestamp, long localDeletionTime, int ttl, ByteBuffer value) + { + return new BufferCellData(value, timestamp, localDeletionTime, ttl, isCounterCell()); + } + + @Override + public BufferCellData withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, long newLocalDeletionTime) + { + return new BufferCellData(value(), newTimestamp, newLocalDeletionTime, ttl(), isCounterCell()); + } + + @Override + public BufferCellData updateAllTimestamp(long newTimestamp) + { + return new BufferCellData(value(), isTombstone() ? 
newTimestamp - 1 : newTimestamp, localDeletionTime(), ttl(), isCounterCell()); + } + + @Override + public BufferCellData withSkippedValue() + { + return new BufferCellData(ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp(), localDeletionTime(), ttl(), isCounterCell()); + } + + @Override + public AbstractBufferCellData clone(Cloner cloner) + { + if (!(cloner instanceof ByteBufferCloner)) + throw new AssertionError("Only byte buffer cloner supported for transient CellData."); + + return clone((ByteBufferCloner) cloner); + } + + @Override + public AbstractBufferCellData clone(ByteBufferCloner cloner) + { + ByteBuffer value = value(); + ByteBuffer newBuffer = cloner.clone(value); + return newBuffer == value ? this : new BufferCellData(newBuffer, timestamp(), localDeletionTime(), ttl(), isCounterCell()); + } + + @Override + public AbstractBufferCellData purge(DeletionPurger purger, long nowInSec) + { + if (!isLive(nowInSec)) + { + if (purger.shouldPurge(timestamp(), localDeletionTime())) + return null; + + // We slightly hijack purging to convert expired but not purgeable columns to tombstones. The reason we do that is + // that once a column has expired it is equivalent to a tombstone but actually using a tombstone is more compact since + // we don't keep the column value. The reason we do it here is that 1) it's somewhat related to dealing with tombstones + // so hopefully not too surprising and 2) we want to do this and purging at the same places, so it's simpler/more efficient + // to do both here. + if (isExpiring()) + { + // Note that as long as the expiring column and the tombstone put together live longer than GC grace seconds, + // we'll fulfil our responsibility to repair.
See discussion at + // http://cassandra-user-incubator-apache-org.3065146.n2.nabble.com/repair-compaction-and-tombstone-rows-td7583481.html + return BufferCellData.tombstone(timestamp(), localDeletionTime() - ttl()).purge(purger, nowInSec); + } + } + return this; + } + + public AbstractBufferCellData purgeDataOlderThan(long timestamp) + { + return timestamp() < timestamp ? null : this; + } + + @Override + public AbstractBufferCellData markCounterLocalToBeCleared() + { + if (!isCounterCell()) + return this; + + ByteBuffer value = buffer(); + ByteBuffer marked = CounterContext.instance().markLocalToBeCleared(value); + return marked == value ? this : new BufferCellData(marked, timestamp(), localDeletionTime(), ttl(), true); + } + + @Override + public int dataSize() + { + return TypeSizes.sizeof(timestamp()) + + TypeSizes.sizeof(ttl()) + + TypeSizes.sizeof(localDeletionTime()) + + valueSize(); + } + + + @Override + public String toString() + { + if (isCounterCell()) + return String.format("[?=%d ts=%d]", CounterContext.instance().total(value(), accessor()), timestamp()); + if (isTombstone()) + return String.format("[?= %s]", livenessInfoString()); + else + return String.format("[?=%s %s]", ByteBufferUtil.bytesToHex(buffer()), livenessInfoString()); + } + + private String livenessInfoString() + { + if (isExpiring()) + return String.format("ts=%d ttl=%d ldt=%d", timestamp(), ttl(), localDeletionTime()); + else if (isTombstone()) + return String.format("ts=%d ldt=%d", timestamp(), localDeletionTime()); + else + return String.format("ts=%d", timestamp()); + } + + @Override + public BufferCell toCell(ColumnMetadata column, CellPath path) + { + return new BufferCell(column, timestamp(), ttl(), localDeletionTime(), value(), path); + } +} diff --git a/src/java/org/apache/cassandra/db/rows/AbstractCell.java b/src/java/org/apache/cassandra/db/rows/AbstractCell.java index e4c306965d1e..82be713e444b 100644 --- a/src/java/org/apache/cassandra/db/rows/AbstractCell.java +++ 
b/src/java/org/apache/cassandra/db/rows/AbstractCell.java @@ -26,6 +26,7 @@ import org.apache.cassandra.db.context.CounterContext; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.CollectionType; +import org.apache.cassandra.db.marshal.MultiCellCapableType; import org.apache.cassandra.db.marshal.ValueAccessor; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.serializers.MarshalException; @@ -50,21 +51,6 @@ public boolean isCounterCell() return !isTombstone() && column.isCounterColumn(); } - public boolean isLive(long nowInSec) - { - return localDeletionTime() == NO_DELETION_TIME || (ttl() != NO_TTL && nowInSec < localDeletionTime()); - } - - public boolean isTombstone() - { - return localDeletionTime() != NO_DELETION_TIME && ttl() == NO_TTL; - } - - public boolean isExpiring() - { - return ttl() != NO_TTL; - } - public Cell markCounterLocalToBeCleared() { if (!isCounterCell()) @@ -98,12 +84,6 @@ public Cell purge(DeletionPurger purger, long nowInSec) return this; } - - public Cell purgeDataOlderThan(long timestamp) - { - return this.timestamp() < timestamp ? null : this; - } - @Override public Cell clone(ByteBufferCloner cloner) { @@ -117,6 +97,7 @@ public Cell updateAllTimestamp(long newTimestamp) return new BufferCell(column, isTombstone() ? 
newTimestamp - 1 : newTimestamp, ttl(), localDeletionTime(), buffer(), path()); } + @Override public int dataSize() { CellPath path = path(); @@ -190,7 +171,18 @@ public static boolean equals(Cell left, Cell right) && left.ttl() == right.ttl() && left.localDeletionTime() == right.localDeletionTime() && ValueAccessor.equals(left.value(), left.accessor(), right.value(), right.accessor()) - && Objects.equals(left.path(), right.path()); + && pathsEqual(left.column, left.path(), right.path()); + } + + private static boolean pathsEqual(ColumnMetadata column, CellPath path1, CellPath path2) + { + if (path1 == path2) + return true; + if (path1 == null || path2 == null) + return false; // exactly one path is null here; the both-null case already returned true above + + assert column.isComplex(); + return ((MultiCellCapableType)column.type).nameComparator().compare(path1.get(0), path2.get(0)) == 0; } @Override @@ -223,7 +215,7 @@ public String toString() CollectionType ct = (CollectionType) type; return String.format("[%s[%s]=%s %s]", column().name, - ct.nameComparator().getString(path().get(0)), + path() == null ? "?" : ct.nameComparator().getString(path().get(0)), isTombstone() ?
"" : ct.valueComparator().getString(value(), accessor()), livenessInfoString()); } diff --git a/src/java/org/apache/cassandra/db/rows/AbstractRow.java b/src/java/org/apache/cassandra/db/rows/AbstractRow.java index 470f5dbb4b03..ff91727be92d 100644 --- a/src/java/org/apache/cassandra/db/rows/AbstractRow.java +++ b/src/java/org/apache/cassandra/db/rows/AbstractRow.java @@ -115,7 +115,7 @@ public boolean hasInvalidDeletions() public String toString() { - return columnData().toString(); + return Iterables.toString(this); } public String toString(TableMetadata metadata) diff --git a/src/java/org/apache/cassandra/db/rows/ArrayCell.java b/src/java/org/apache/cassandra/db/rows/ArrayCell.java index 90c20e978c9f..c8d5ff3cfb78 100644 --- a/src/java/org/apache/cassandra/db/rows/ArrayCell.java +++ b/src/java/org/apache/cassandra/db/rows/ArrayCell.java @@ -46,7 +46,7 @@ public class ArrayCell extends AbstractCell // available. public ArrayCell(ColumnMetadata column, long timestamp, int ttl, long localDeletionTime, byte[] value, CellPath path) { - this(column, timestamp, ttl, Cell.deletionTimeLongToUnsignedInteger(localDeletionTime), value, path); + this(column, timestamp, ttl, CellData.deletionTimeLongToUnsignedInteger(localDeletionTime), value, path); } public ArrayCell(ColumnMetadata column, long timestamp, int ttl, int localDeletionTimeUnsignedInteger, byte[] value, CellPath path) @@ -105,6 +105,12 @@ public Cell withSkippedValue() return new ArrayCell(column, timestamp, ttl, localDeletionTimeUnsignedInteger, EMPTY_BYTE_ARRAY, path); } + @Override + public Cell withPath(CellPath path) + { + return new ArrayCell(column, timestamp, ttl, localDeletionTimeUnsignedInteger, value, path); + } + @Override public Cell clone(ByteBufferCloner cloner) { diff --git a/src/java/org/apache/cassandra/db/rows/BTreeComplexColumn.java b/src/java/org/apache/cassandra/db/rows/BTreeComplexColumn.java new file mode 100644 index 000000000000..3bc4e7962926 --- /dev/null +++ 
b/src/java/org/apache/cassandra/db/rows/BTreeComplexColumn.java @@ -0,0 +1,398 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.db.rows; + +import java.nio.ByteBuffer; +import java.util.Iterator; +import java.util.Objects; +import java.util.function.Function; + +import com.google.common.annotations.VisibleForTesting; + +import org.apache.cassandra.db.DeletionPurger; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.Digest; +import org.apache.cassandra.db.LivenessInfo; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.marshal.ByteType; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.DroppedColumn; +import org.apache.cassandra.utils.BiLongAccumulator; +import org.apache.cassandra.utils.LongAccumulator; +import org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.btree.BTree; +import org.apache.cassandra.utils.memory.Cloner; + +/** + * The data for a complex column, that is its cells and potential complex + * deletion time. 
+ */ +public class BTreeComplexColumn extends ComplexColumnData +{ + static final Cell[] NO_CELLS = new Cell[0]; + + private static final long EMPTY_SIZE = ObjectSizes.measure( + new BTreeComplexColumn(ColumnMetadata.regularColumn("", + "", + "", + SetType.getInstance(ByteType.instance, true)), + NO_CELLS, + DeletionTime.build(0, 0))); + + // The cells for 'column' sorted by cell path. + private final Object[] cells; + + private final DeletionTime complexDeletion; + + BTreeComplexColumn(ColumnMetadata column, Object[] cells, DeletionTime complexDeletion) + { + super(column); + assert column.isComplex(); + assert cells.length > 0 || !complexDeletion.isLive(); + this.cells = cells; + this.complexDeletion = complexDeletion; + } + + @Override + public boolean hasCells() { + return !BTree.isEmpty(this.cells); + } + + @Override + public int cellsCount() + { + return BTree.size(cells); + } + + @Override + public Cell getCell(CellPath path) + { + return (Cell) BTree.find(cells, column.asymmetricCellPathComparator(), path); + } + + public R reduce(R seed, BTree.ReduceFunction reducer) + { + return BTree.reduce(cells, seed, reducer); + } + + @Override + public Cell getCellByIndex(int idx) + { + return BTree.findByIndex(cells, idx); + } + + @Override + public DeletionTime complexDeletion() + { + return complexDeletion; + } + + Object[] tree() + { + return cells; + } + + @Override + public Iterator> iterator() + { + return BTree.iterator(cells); + } + + @Override + public Iterator> reverseIterator() + { + return BTree.iterator(cells, BTree.Dir.DESC); + } + + @Override + public long accumulate(LongAccumulator> accumulator, long initialValue) + { + return BTree.accumulate(cells, accumulator, initialValue); + } + + @Override + public long accumulate(BiLongAccumulator> accumulator, A arg, long initialValue) + { + return BTree.accumulate(cells, accumulator, arg, initialValue); + } + + @Override + public int dataSize() + { + int size = complexDeletion.dataSize(); + for (Cell cell : 
this) + size += cell.dataSize(); + return size; + } + + @Override + public int liveDataSize(long nowInSec) + { + return complexDeletion.isLive() ? dataSize() : 0; + } + + @Override + public long unsharedHeapSize() + { + long heapSize = EMPTY_SIZE + BTree.sizeOnHeapOf(cells) + complexDeletion.unsharedHeapSize(); + return BTree.accumulate(cells, (cell, value) -> value + cell.unsharedHeapSize(), heapSize); + } + + @Override + public long unsharedHeapSizeExcludingData() + { + long heapSize = EMPTY_SIZE + BTree.sizeOnHeapOf(cells); + // TODO: this can be turned into a simple multiplication, at least while we have only one Cell implementation + for (Cell cell : this) + heapSize += cell.unsharedHeapSizeExcludingData(); + return heapSize; + } + + @Override + public void validate() + { + for (Cell cell : this) + cell.validate(); + } + + @Override + public void digest(Digest digest) + { + if (!complexDeletion.isLive()) + complexDeletion.digest(digest); + + for (Cell cell : this) + cell.digest(digest); + } + + @Override + public boolean hasInvalidDeletions() + { + if (!complexDeletion.validate()) + return true; + for (Cell cell : this) + if (cell.hasInvalidDeletions()) + return true; + return false; + } + + @Override + public BTreeComplexColumn markCounterLocalToBeCleared() + { + return transformAndFilter(complexDeletion, Cell::markCounterLocalToBeCleared); + } + + public BTreeComplexColumn filter(ColumnFilter filter, DeletionTime activeDeletion, DroppedColumn dropped, LivenessInfo rowLiveness) + { + ColumnFilter.Tester cellTester = filter.newTester(column); + boolean isQueriedColumn = filter.fetchedColumnIsQueried(column); + if (cellTester == null && activeDeletion.isLive() && dropped == null && isQueriedColumn) + return this; + + DeletionTime newDeletion = activeDeletion.supersedes(complexDeletion) ? 
DeletionTime.LIVE : complexDeletion; + return transformAndFilter(newDeletion, (cell) -> + { + CellPath path = cell.path(); + boolean isForDropped = dropped != null && cell.timestamp() <= dropped.droppedTime; + boolean isShadowed = activeDeletion.deletes(cell); + boolean isFetchedCell = cellTester == null || cellTester.fetches(path); + boolean isQueriedCell = isQueriedColumn && isFetchedCell && (cellTester == null || cellTester.fetchedCellIsQueried(path)); + boolean isSkippableCell = !isFetchedCell || (!isQueriedCell && cell.timestamp() < rowLiveness.timestamp()); + if (isForDropped || isShadowed || isSkippableCell) + return null; + // We should apply the same "optimization" as in Cell.deserialize to avoid discrepancies + // between sstables and memtables data, i.e. resulting in a digest mismatch. + return isQueriedCell ? cell : cell.withSkippedValue(); + }); + } + + public BTreeComplexColumn delete(DeletionTime activeDeletion) + { + // Assuming this column does not keep any shadowed data, there's nothing to change if we already apply the same + // deletion or newer. + if (activeDeletion.isLive() || !activeDeletion.supersedes(complexDeletion)) + return this; + + // The new deletion supersedes the existing, thus we can drop the complex deletion (the assumption is that the + // deletion that is being applied here will be stored at a higher level of the hierarchy (e.g. row/partition)). + return transformAndFilter(DeletionTime.LIVE, (cell) -> activeDeletion.deletes(cell) ? null : cell); + } + + @Override + public BTreeComplexColumn purge(DeletionPurger purger, long nowInSec) + { + DeletionTime newDeletion = complexDeletion.isLive() || purger.shouldPurge(complexDeletion) ? DeletionTime.LIVE : complexDeletion; + return transformAndFilter(newDeletion, (cell) -> cell.purge(purger, nowInSec)); + } + + @Override + public BTreeComplexColumn purgeDataOlderThan(long timestamp) + { + DeletionTime newDeletion = complexDeletion.markedForDeleteAt() < timestamp ? 
DeletionTime.LIVE : complexDeletion; + return transformAndFilter(newDeletion, (cell) -> cell.purgeDataOlderThan(timestamp)); + } + + public BTreeComplexColumn withOnlyQueriedData(ColumnFilter filter) + { + return transformAndFilter(complexDeletion, (cell) -> filter.fetchedCellIsQueried(column, cell.path()) ? null : cell); + } + + private BTreeComplexColumn update(DeletionTime newDeletion, Object[] newCells) + { + if (cells == newCells && newDeletion == complexDeletion) + return this; + + if (newDeletion == DeletionTime.LIVE && BTree.isEmpty(newCells)) + return null; + + return new BTreeComplexColumn(column, newCells, newDeletion); + } + + public BTreeComplexColumn transformAndFilter(Row.CellTransformer function) + { + return update(complexDeletion, BTree., Cell>transformAndFilter(cells, function::apply)); + } + + public BTreeComplexColumn transformAndFilter(DeletionTime newDeletion, Function function) + { + return update(newDeletion, BTree.transformAndFilter(cells, function)); + } + + public BTreeComplexColumn transform(Function, ? extends Cell> function) + { + return update(complexDeletion, BTree.transform(cells, function)); + } + + @Override + public ColumnData clone(Cloner cloner) + { + return transform(c -> cloner.clone(c)); + } + + @Override + public BTreeComplexColumn updateAllTimestamp(long newTimestamp) + { + DeletionTime newDeletion = complexDeletion.isLive() ? complexDeletion : DeletionTime.build(newTimestamp - 1, complexDeletion.localDeletionTime()); + return transformAndFilter(newDeletion, (cell) -> (Cell) cell.updateAllTimestamp(newTimestamp)); + } + + @Override + public long maxTimestamp() + { + long timestamp = complexDeletion.markedForDeleteAt(); + for (Cell cell : this) + timestamp = Math.max(timestamp, cell.timestamp()); + return timestamp; + } + + @Override + public long minTimestamp() + { + long timestamp = complexDeletion.isLive() + ? 
Long.MAX_VALUE + : complexDeletion.markedForDeleteAt(); + for (Cell cell : this) + timestamp = Math.min(timestamp, cell.timestamp()); + return timestamp; + } + + // This is the partner in crime of BTreeRow.setValue. The exact same warnings apply. The short + // version is: "don't use that method". + void setValue(CellPath path, ByteBuffer value) + { + Cell current = (Cell) BTree.find(cells, column.asymmetricCellPathComparator(), path); + BTree.replaceInSitu(cells, column.cellComparator(), current, current.withUpdatedValue(value)); + } + + @Override + public boolean equals(Object other) + { + if (this == other) + return true; + + if(!(other instanceof BTreeComplexColumn)) + return false; + + BTreeComplexColumn that = (BTreeComplexColumn)other; + return this.column().equals(that.column()) + && this.complexDeletion().equals(that.complexDeletion) + && BTree.equals(this.cells, that.cells); + } + + @Override + public int hashCode() + { + return Objects.hash(column(), complexDeletion(), BTree.hashCode(cells)); + } + + @Override + public String toString() + { + return String.format("[%s=%s %s]", + column().name, + complexDeletion.toString(), + BTree.toString(cells)); + } + + @VisibleForTesting + public static BTreeComplexColumn unsafeConstruct(ColumnMetadata column, Object[] cells, DeletionTime complexDeletion) + { + return new BTreeComplexColumn(column, cells, complexDeletion); + } + + public static Builder builder() + { + return new Builder(); + } + + public static class Builder + { + private DeletionTime complexDeletion; + private ColumnMetadata column; + private BTree.Builder> builder; + + public void newColumn(ColumnMetadata column) + { + this.column = column; + this.complexDeletion = DeletionTime.LIVE; // default if addComplexDeletion is not called + if (builder == null) + builder = BTree.builder(column.cellComparator()); + else + builder.reuse(column.cellComparator()); + } + + public void addComplexDeletion(DeletionTime complexDeletion) + { + this.complexDeletion 
= complexDeletion; + } + + public void addCell(Cell cell) + { + builder.add(cell); + } + + public BTreeComplexColumn build() + { + if (complexDeletion.isLive() && builder.isEmpty()) + return null; + + return new BTreeComplexColumn(column, builder.build(), complexDeletion); + } + } +} diff --git a/src/java/org/apache/cassandra/db/rows/BTreeRow.java b/src/java/org/apache/cassandra/db/rows/BTreeRow.java index 252c6d440014..1f4b3a51cf95 100644 --- a/src/java/org/apache/cassandra/db/rows/BTreeRow.java +++ b/src/java/org/apache/cassandra/db/rows/BTreeRow.java @@ -18,11 +18,9 @@ package org.apache.cassandra.db.rows; import java.nio.ByteBuffer; - import java.util.AbstractCollection; import java.util.Arrays; import java.util.Collection; -import java.util.Collections; import java.util.Comparator; import java.util.Iterator; import java.util.Map; @@ -32,7 +30,6 @@ import java.util.function.Predicate; import com.google.common.collect.Collections2; -import com.google.common.collect.Iterators; import com.google.common.primitives.Ints; import org.apache.cassandra.db.Clustering; @@ -40,15 +37,11 @@ import org.apache.cassandra.db.DeletionPurger; import org.apache.cassandra.db.DeletionTime; import org.apache.cassandra.db.LivenessInfo; -import org.apache.cassandra.db.marshal.AbstractType; -import org.apache.cassandra.db.marshal.UTF8Type; -import org.apache.cassandra.schema.ColumnMetadata; -import org.apache.cassandra.schema.TableMetadata; - import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.DroppedColumn; - +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.AbstractIterator; import org.apache.cassandra.utils.BiLongAccumulator; import org.apache.cassandra.utils.BulkIterator; @@ -58,6 +51,7 @@ import org.apache.cassandra.utils.btree.BTree; import org.apache.cassandra.utils.btree.BTreeSearchIterator; import 
org.apache.cassandra.utils.btree.UpdateFunction; +import org.apache.cassandra.utils.caching.TinyThreadLocalPool; import org.apache.cassandra.utils.memory.Cloner; import static org.apache.cassandra.utils.btree.BTree.STOP_SENTINEL_VALUE; @@ -76,8 +70,8 @@ public class BTreeRow extends AbstractRow // The data for each columns present in this row in column sorted order. private final Object[] btree; - private static final ColumnData FIRST_COMPLEX_STATIC = new ComplexColumnData(Columns.FIRST_COMPLEX_STATIC, new Object[0], DeletionTime.build(0, 0)); - private static final ColumnData FIRST_COMPLEX_REGULAR = new ComplexColumnData(Columns.FIRST_COMPLEX_REGULAR, new Object[0], DeletionTime.build(0, 0)); + private static final ColumnData FIRST_COMPLEX_STATIC = new BTreeComplexColumn(Columns.FIRST_COMPLEX_STATIC, new Object[0], DeletionTime.build(0, 0)); + private static final ColumnData FIRST_COMPLEX_REGULAR = new BTreeComplexColumn(Columns.FIRST_COMPLEX_REGULAR, new Object[0], DeletionTime.build(0, 0)); private static final Comparator COLUMN_COMPARATOR = (cd1, cd2) -> cd1.column.compareTo(cd2.column); @@ -144,7 +138,7 @@ public static BTreeRow singleCellRow(Clustering clustering, Cell cell) if (cell.column().isSimple()) return new BTreeRow(clustering, BTree.singleton(cell), minDeletionTime(cell)); - ComplexColumnData complexData = new ComplexColumnData(cell.column(), new Cell[]{ cell }, DeletionTime.LIVE); + BTreeComplexColumn complexData = new BTreeComplexColumn(cell.column(), new Cell[]{ cell }, DeletionTime.LIVE); return new BTreeRow(clustering, BTree.singleton(complexData), minDeletionTime(cell)); } @@ -179,7 +173,7 @@ private static long minDeletionTime(DeletionTime dt) return dt.isLive() ? 
Cell.MAX_DELETION_TIME : Long.MIN_VALUE; } - private static long minDeletionTime(ComplexColumnData cd) + private static long minDeletionTime(BTreeComplexColumn cd) { long min = minDeletionTime(cd.complexDeletion()); for (Cell cell : cd) @@ -193,7 +187,7 @@ private static long minDeletionTime(ComplexColumnData cd) private static long minDeletionTime(ColumnData cd) { - return cd.column().isSimple() ? minDeletionTime((Cell) cd) : minDeletionTime((ComplexColumnData)cd); + return cd.column().isSimple() ? minDeletionTime((Cell) cd) : minDeletionTime((BTreeComplexColumn)cd); } public void apply(Consumer function) @@ -287,6 +281,11 @@ && deletion().isLive() && BTree.isEmpty(btree); } + public boolean isEmptyAfterDeletion() + { + return primaryKeyLivenessInfo().isEmpty() && BTree.isEmpty(btree); + } + public Deletion deletion() { return deletion; @@ -301,16 +300,16 @@ public Cell getCell(ColumnMetadata c) public Cell getCell(ColumnMetadata c, CellPath path) { assert c.isComplex(); - ComplexColumnData cd = getComplexColumnData(c); + BTreeComplexColumn cd = getComplexColumnData(c); if (cd == null) return null; return cd.getCell(path); } - public ComplexColumnData getComplexColumnData(ColumnMetadata c) + public BTreeComplexColumn getComplexColumnData(ColumnMetadata c) { assert c.isComplex(); - return (ComplexColumnData) getColumnData(c); + return (BTreeComplexColumn) getColumnData(c); } public ColumnData getColumnData(ColumnMetadata c) @@ -318,8 +317,7 @@ public ColumnData getColumnData(ColumnMetadata c) return (ColumnData) BTree.find(btree, ColumnMetadata.asymmetricColumnDataComparator, c); } - @Override - public Collection columnData() + private Collection columnData() { return new AbstractCollection() { @@ -375,7 +373,7 @@ public Row filter(ColumnFilter filter, DeletionTime activeDeletion, boolean setA Predicate inclusionTester = columns.inOrderInclusionTester(); Predicate queriedByUserTester = filter.queriedColumns().columns(isStatic()).inOrderInclusionTester(); final 
LivenessInfo rowLiveness = newInfo; - return transformAndFilter(newInfo, newDeletion, (cd) -> { + return transformAndFilterColumns(newInfo, newDeletion, (cd) -> { ColumnMetadata column = cd.column(); if (!inclusionTester.test(column)) @@ -383,7 +381,7 @@ public Row filter(ColumnFilter filter, DeletionTime activeDeletion, boolean setA DroppedColumn dropped = droppedColumns.get(column.name.bytes); if (column.isComplex()) - return ((ComplexColumnData) cd).filter(filter, mayHaveShadowed ? activeDeletion : DeletionTime.LIVE, dropped, rowLiveness); + return ((BTreeComplexColumn) cd).filter(filter, mayHaveShadowed ? activeDeletion : DeletionTime.LIVE, dropped, rowLiveness); Cell cell = (Cell) cd; // We include the cell unless it is 1) shadowed, 2) for a dropped column or 3) skippable. @@ -407,29 +405,19 @@ public Row withOnlyQueriedData(ColumnFilter filter) if (filter.allFetchedColumnsAreQueried()) return this; - return transformAndFilter(primaryKeyLivenessInfo, deletion, (cd) -> { + return transformAndFilterColumns(primaryKeyLivenessInfo, deletion, (cd) -> { ColumnMetadata column = cd.column(); if (column.isComplex()) - return ((ComplexColumnData)cd).withOnlyQueriedData(filter); + return ((BTreeComplexColumn)cd).withOnlyQueriedData(filter); return filter.fetchedColumnIsQueried(column) ? cd : null; }); } - public boolean hasComplex() - { - if (BTree.isEmpty(btree)) - return false; - - int size = BTree.size(btree); - ColumnData last = BTree.findByIndex(btree, size - 1); - return last.column.isComplex(); - } - public boolean hasComplexDeletion() { - long result = accumulate((cd, v) -> ((ComplexColumnData) cd).complexDeletion().isLive() ? 0 : STOP_SENTINEL_VALUE, + long result = accumulate((cd, v) -> ((BTreeComplexColumn) cd).complexDeletion().isLive() ? 0 : STOP_SENTINEL_VALUE, COLUMN_COMPARATOR, isStatic() ? FIRST_COMPLEX_STATIC : FIRST_COMPLEX_REGULAR, 0L); return result == STOP_SENTINEL_VALUE; } @@ -469,7 +457,7 @@ public Row updateAllTimestamp(long newTimestamp) ? 
Deletion.LIVE : new Deletion(DeletionTime.build(newTimestamp - 1, deletion.time().localDeletionTime()), deletion.isShadowable()); - return transformAndFilter(newInfo, newDeletion, (cd) -> cd.updateAllTimestamp(newTimestamp)); + return transformAndFilterColumns(newInfo, newDeletion, (cd) -> cd.updateAllTimestamp(newTimestamp)); } public Row withRowDeletion(DeletionTime newDeletion) @@ -509,13 +497,7 @@ public Row purgeDataOlderThan(long timestamp, boolean enforceStrictLiveness) if (enforceStrictLiveness && newDeletion.isLive() && newInfo.isEmpty()) return null; - return transformAndFilter(newInfo, newDeletion, cd -> cd.purgeDataOlderThan(timestamp)); - } - - @Override - public Row transformAndFilter(LivenessInfo info, Deletion deletion, Function function) - { - return update(info, deletion, BTree.transformAndFilter(btree, function)); + return transformAndFilterColumns(newInfo, newDeletion, cd -> cd.purgeDataOlderThan(timestamp)); } private Row update(LivenessInfo info, Deletion deletion, Object[] newTree) @@ -531,10 +513,18 @@ private Row update(LivenessInfo info, Deletion deletion, Object[] newTree) return BTreeRow.create(clustering, info, deletion, newTree, minDeletionTime); } + public Row transformAndFilterColumns(LivenessInfo info, Deletion deletion, Function function) + { + return update(info, deletion, BTree.transformAndFilter(btree, function)); + } + @Override - public Row transformAndFilter(Function function) + public Row transformAndFilter(Function infoFunction, CellTransformer cellFunction) { - return transformAndFilter(primaryKeyLivenessInfo, deletion, function); + return update(infoFunction.apply(primaryKeyLivenessInfo), deletion, BTree.transformAndFilter( + btree, + cd -> cd.column.isSimple() ? 
cellFunction.apply((Cell) cd) + : ((BTreeComplexColumn)cd).transformAndFilter(cellFunction))); } public Row transform(Function function) @@ -611,24 +601,38 @@ public void setValue(ColumnMetadata column, CellPath path, ByteBuffer value) if (column.isSimple()) BTree.replaceInSitu(btree, ColumnData.comparator, current, ((Cell) current).withUpdatedValue(value)); else - ((ComplexColumnData) current).setValue(path, value); + ((BTreeComplexColumn) current).setValue(path, value); } - public Iterable> cellsInLegacyOrder(TableMetadata metadata, boolean reversed) + /** + * Exposed for TrieBackedPartitionStage2. + */ + public Object[] getBTree() { - return () -> new CellInLegacyOrderIterator(metadata, reversed); + return btree; } - public static Row merge(BTreeRow existing, - BTreeRow update, - ColumnData.PostReconciliationFunction reconcileF) + public long getMinLocalDeletionTime() + { + return minLocalDeletionTime; + } + + @Override + public Row mergeWith(Row updateAsRow) + { + if (!(updateAsRow instanceof BTreeRow)) + throw new IllegalArgumentException("Merging different row types."); + return mergeWith((BTreeRow) updateAsRow, ColumnData.noOp); + } + + public Row mergeWith(BTreeRow update, ColumnData.PostReconciliationFunction reconcileF) { - Object[] existingBtree = existing.btree; + Object[] existingBtree = this.btree; Object[] updateBtree = update.btree; - LivenessInfo livenessInfo = LivenessInfo.merge(update.primaryKeyLivenessInfo(), existing.primaryKeyLivenessInfo()); + LivenessInfo livenessInfo = LivenessInfo.merge(update.primaryKeyLivenessInfo(), this.primaryKeyLivenessInfo()); - Row.Deletion rowDeletion = existing.deletion().supersedes(update.deletion()) ? existing.deletion() : update.deletion(); + Row.Deletion rowDeletion = this.deletion().supersedes(update.deletion()) ? 
this.deletion() : update.deletion(); if (rowDeletion.deletes(livenessInfo)) livenessInfo = LivenessInfo.EMPTY; @@ -636,15 +640,15 @@ else if (rowDeletion.isShadowedBy(livenessInfo)) rowDeletion = Row.Deletion.LIVE; DeletionTime deletion = rowDeletion.time(); - Object[] tree = mergeRowBTrees(reconcileF, existingBtree, updateBtree, deletion, existing.deletion().time()); - return new BTreeRow(existing.clustering, livenessInfo, rowDeletion, tree, minDeletionTime(tree, livenessInfo, deletion)); + Object[] tree = mergeRowBTrees(reconcileF, existingBtree, updateBtree, deletion, this.deletion().time()); + return new BTreeRow(this.clustering, livenessInfo, rowDeletion, tree, minDeletionTime(tree, livenessInfo, deletion)); } public static Object[] mergeRowBTrees(ColumnData.PostReconciliationFunction reconcileF, Object[] existingBtree, Object[] updateBtree, DeletionTime deletion, DeletionTime existingDeletion) { - try (ColumnData.Reconciler reconciler = ColumnData.reconciler(reconcileF, deletion)) + try (Reconciler reconciler = reconciler(reconcileF, deletion)) { if (!deletion.isLive()) { @@ -662,101 +666,155 @@ public static Object[] mergeRowBTrees(ColumnData.PostReconciliationFunction reco } /** - * Exposed for TrieBackedPartition. + * Construct an UpdateFunction for reconciling normal ColumnData + * (i.e. 
not suitable for ComplexColumnDeletion sentinels, but suitable for ComplexColumnData or Cell) + * + * @param updateF a consumer receiving all pairs of reconciled cells + * @param activeDeletion the row or partition deletion time to use for purging */ - public Object[] getBTree() + public static Reconciler reconciler(ColumnData.PostReconciliationFunction updateF, DeletionTime activeDeletion) { - return btree; + TinyThreadLocalPool.TinyPool pool = Reconciler.POOL.get(); + Reconciler reconciler = pool.poll(); + if (reconciler == null) + reconciler = new Reconciler(); + reconciler.init(updateF, activeDeletion); + reconciler.pool = pool; + return reconciler; } - public long getMinLocalDeletionTime() + public static class Reconciler implements UpdateFunction, AutoCloseable { - return minLocalDeletionTime; - } + private static final TinyThreadLocalPool POOL = new TinyThreadLocalPool<>(); + private ColumnData.PostReconciliationFunction postReconcile; + private DeletionTime activeDeletion; + private TinyThreadLocalPool.TinyPool pool; - private class CellIterator extends AbstractIterator> - { - private Iterator columnData = iterator(); - private Iterator> complexCells; + private void init(ColumnData.PostReconciliationFunction postReconcile, DeletionTime activeDeletion) + { + this.postReconcile = postReconcile; + this.activeDeletion = activeDeletion; + } - protected Cell computeNext() + public ColumnData merge(ColumnData existing, ColumnData update) { - while (true) + if (!(existing instanceof ComplexColumnData)) { - if (complexCells != null) - { - if (complexCells.hasNext()) - return complexCells.next(); + Cell existingCell = (Cell) existing, updateCell = (Cell) update; + Cell result = Cells.reconcile(existingCell, updateCell); - complexCells = null; - } + return postReconcile.merge(existingCell, result); + } + else + { + BTreeComplexColumn existingComplex = (BTreeComplexColumn) existing; + BTreeComplexColumn updateComplex = (BTreeComplexColumn) update; - if 
(!columnData.hasNext()) - return endOfData(); + DeletionTime existingDeletion = existingComplex.complexDeletion(); + DeletionTime updateDeletion = updateComplex.complexDeletion(); + DeletionTime maxComplexDeletion = existingDeletion.supersedes(updateDeletion) ? existingDeletion : updateDeletion; - ColumnData cd = columnData.next(); - if (cd.column().isComplex()) - complexCells = ((ComplexColumnData)cd).iterator(); - else - return (Cell)cd; + Object[] existingTree = existingComplex.tree(); + Object[] updateTree = updateComplex.tree(); + + Object[] cells; + + try (Reconciler reconciler = reconciler(postReconcile, maxComplexDeletion)) + { + if (!maxComplexDeletion.isLive()) + { + if (maxComplexDeletion == existingDeletion) + { + updateTree = BTree.transformAndFilter(updateTree, reconciler::removeShadowed); + } + else + { + Object[] retained = BTree.transformAndFilter(existingTree, reconciler::retain); + if (existingTree != retained) + { + onAllocatedOnHeap(BTree.sizeOnHeapOf(retained) - BTree.sizeOnHeapOf(existingTree)); + existingTree = retained; + } + } + } + cells = BTree.update(existingTree, updateTree, existingComplex.column.cellComparator(), (UpdateFunction) reconciler); + } + return new BTreeComplexColumn(existingComplex.column, cells, maxComplexDeletion); } } - } - private class CellInLegacyOrderIterator extends AbstractIterator> - { - private final Comparator comparator; - private final boolean reversed; - private final int firstComplexIdx; - private int simpleIdx; - private int complexIdx; - private Iterator> complexCells; - private final Object[] data; - - private CellInLegacyOrderIterator(TableMetadata metadata, boolean reversed) + @Override + public void onAllocatedOnHeap(long heapSize) { - AbstractType nameComparator = UTF8Type.instance; - this.comparator = reversed ? 
Collections.reverseOrder(nameComparator) : nameComparator; - this.reversed = reversed; - - // copy btree into array for simple separate iteration of simple and complex columns - this.data = new Object[BTree.size(btree)]; - BTree.toArray(btree, data, 0); - - int idx = Iterators.indexOf(Iterators.forArray(data), cd -> cd instanceof ComplexColumnData); - this.firstComplexIdx = idx < 0 ? data.length : idx; - this.complexIdx = firstComplexIdx; + postReconcile.onAllocatedOnHeap(heapSize); } - private int getSimpleIdx() + @Override + public ColumnData insert(ColumnData insert) { - return reversed ? firstComplexIdx - simpleIdx - 1 : simpleIdx; + return postReconcile.insert(insert); } - private int getSimpleIdxAndIncrement() + /** + * Checks if the specified value should be deleted or not. + * + * @param existing the existing value to check + * @return {@code null} if the value should be removed from the BTree or the existing value if it should not. + */ + public ColumnData retain(ColumnData existing) { - int idx = getSimpleIdx(); - ++simpleIdx; - return idx; + return removeShadowed(existing, postReconcile); } - private int getComplexIdx() + private ColumnData removeShadowed(ColumnData existing) { - return reversed ? data.length + firstComplexIdx - complexIdx - 1 : complexIdx; + return removeShadowed(existing, ColumnData.noOp); } - private int getComplexIdxAndIncrement() + /** + * Checks if the specified value should be deleted or not. + * + * @param existing the existing value to check + * @return {@code null} if the value should be removed from the BTree or the existing value if it should not. 
+ */ + private ColumnData removeShadowed(ColumnData existing, ColumnData.PostReconciliationFunction recordDeletion) { - int idx = getComplexIdx(); - ++complexIdx; - return idx; + if (!(existing instanceof ComplexColumnData)) + { + if (activeDeletion.deletes((Cell) existing)) + { + recordDeletion.delete(existing); + return null; + } + } + else + { + BTreeComplexColumn existingComplex = (BTreeComplexColumn) existing; + if (activeDeletion.supersedes(existingComplex.complexDeletion())) + { + Object[] cells = BTree.transformAndFilter(existingComplex.tree(), (ColumnData cd) -> removeShadowed(cd, recordDeletion)); + return BTree.isEmpty(cells) ? null : new BTreeComplexColumn(existingComplex.column, cells, DeletionTime.LIVE); + } + } + + return existing; } - private Iterator> makeComplexIterator(Object complexData) + public void close() { - ComplexColumnData ccd = (ComplexColumnData)complexData; - return reversed ? ccd.reverseIterator() : ccd.iterator(); + activeDeletion = null; + postReconcile = null; + + TinyThreadLocalPool.TinyPool tmp = pool; + pool = null; + tmp.offer(this); } + } + + private class CellIterator extends AbstractIterator> + { + private Iterator columnData = iterator(); + private Iterator> complexCells; protected Cell computeNext() { @@ -770,23 +828,14 @@ protected Cell computeNext() complexCells = null; } - if (simpleIdx >= firstComplexIdx) - { - if (complexIdx >= data.length) - return endOfData(); + if (!columnData.hasNext()) + return endOfData(); - complexCells = makeComplexIterator(data[getComplexIdxAndIncrement()]); - } + ColumnData cd = columnData.next(); + if (cd.column().isComplex()) + complexCells = ((BTreeComplexColumn)cd).iterator(); else - { - if (complexIdx >= data.length) - return (Cell)data[getSimpleIdxAndIncrement()]; - - if (comparator.compare(((ColumnData) data[getSimpleIdx()]).column().name.bytes, ((ColumnData) data[getComplexIdx()]).column().name.bytes) < 0) - return (Cell)data[getSimpleIdxAndIncrement()]; - else - complexCells = 
makeComplexIterator(data[getComplexIdxAndIncrement()]); - } + return (Cell)cd; } } } @@ -862,7 +911,7 @@ public ColumnData resolve(Object[] cells, int lb, int ub) try (BulkIterator iterator = BulkIterator.of(buildFrom)) { Object[] btree = BTree.build(iterator, buildFromCount, UpdateFunction.noOp()); - return new ComplexColumnData(column, btree, deletion); + return new BTreeComplexColumn(column, btree, deletion); } } } diff --git a/src/java/org/apache/cassandra/db/rows/BufferCell.java b/src/java/org/apache/cassandra/db/rows/BufferCell.java index 85a2e3aeb458..cc7372e520f4 100644 --- a/src/java/org/apache/cassandra/db/rows/BufferCell.java +++ b/src/java/org/apache/cassandra/db/rows/BufferCell.java @@ -28,8 +28,6 @@ import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.memory.ByteBufferCloner; -import static java.lang.String.format; - public class BufferCell extends AbstractCell { private static final long EMPTY_SIZE = ObjectSizes.measure(new BufferCell(ColumnMetadata.regularColumn("", "", "", ByteType.instance), 0L, 0, 0, ByteBufferUtil.EMPTY_BYTE_BUFFER, null)); @@ -46,14 +44,15 @@ public class BufferCell extends AbstractCell // available. public BufferCell(ColumnMetadata column, long timestamp, int ttl, long localDeletionTime, ByteBuffer value, CellPath path) { - this(column, timestamp, ttl, deletionTimeLongToUnsignedInteger(localDeletionTime), value, path); + this(column, timestamp, ttl, CellData.deletionTimeLongToUnsignedInteger(localDeletionTime), value, path); } public BufferCell(ColumnMetadata column, long timestamp, int ttl, int localDeletionTimeUnsignedInteger, ByteBuffer value, CellPath path) { super(column); assert !column.isPrimaryKeyColumn(); - assert column.isComplex() == (path != null) : format("Column %s.%s(%s: %s) isComplex: %b with cellpath: %s", column.ksName, column.cfName, column.name, column.type.toString(), column.isComplex(), path); + // Trie-backed rows store path-less complex cells. 
+ // assert column.isComplex() == (path != null) : format("Column %s.%s(%s: %s) isComplex: %b with cellpath: %s", column.ksName, column.cfName, column.name, column.type.toString(), column.isComplex(), path); this.timestamp = timestamp; this.ttl = ttl; this.localDeletionTimeUnsignedInteger = localDeletionTimeUnsignedInteger; @@ -137,6 +136,12 @@ public Cell withSkippedValue() return withUpdatedValue(ByteBufferUtil.EMPTY_BYTE_BUFFER); } + @Override + public Cell withPath(CellPath path) + { + return new BufferCell(column, timestamp, ttl, localDeletionTimeUnsignedInteger, value, path); + } + @Override public long unsharedHeapSize() { diff --git a/src/java/org/apache/cassandra/db/rows/BufferCellData.java b/src/java/org/apache/cassandra/db/rows/BufferCellData.java new file mode 100644 index 000000000000..cd20865040d3 --- /dev/null +++ b/src/java/org/apache/cassandra/db/rows/BufferCellData.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.rows; + +import java.nio.ByteBuffer; + +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ObjectSizes; + +public class BufferCellData extends AbstractBufferCellData +{ + static final long EMPTY_SIZE = ObjectSizes.measure(new BufferCellData(null, 0, 0, 0, false)); + + final ByteBuffer value; + final long timestamp; + final int localDeletionTimeAsUnsignedInt; + final int ttl; + final boolean isCounterCell; + + public BufferCellData(ByteBuffer value, long timestamp, long localDeletionTime, int ttl, boolean isCounterCell) + { + this.value = value; + this.timestamp = timestamp; + this.localDeletionTimeAsUnsignedInt = CellData.deletionTimeLongToUnsignedInteger(localDeletionTime); + this.ttl = ttl; + this.isCounterCell = isCounterCell; + } + + public static BufferCellData tombstone(long timestamp, long localDeletionTime) + { + return new BufferCellData(ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp, localDeletionTime, NO_TTL, false); + } + + @Override + public boolean isCounterCell() + { + return isCounterCell; + } + + @Override + public ByteBuffer value() + { + return value; + } + + @Override + public long timestamp() + { + return timestamp; + } + + @Override + public int ttl() + { + return ttl; + } + + @Override + public int localDeletionTimeAsUnsignedInt() + { + return localDeletionTimeAsUnsignedInt; + } + + @Override + public long unsharedHeapSize() + { + return ObjectSizes.sizeOnHeapOf(value) + EMPTY_SIZE; + } + + @Override + public long unsharedHeapSizeExcludingData() + { + return EMPTY_SIZE; + } +} diff --git a/src/java/org/apache/cassandra/db/rows/Cell.java b/src/java/org/apache/cassandra/db/rows/Cell.java index e630ec0c38a2..1eabe0930baa 100644 --- a/src/java/org/apache/cassandra/db/rows/Cell.java +++ b/src/java/org/apache/cassandra/db/rows/Cell.java @@ -30,7 +30,6 @@ import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.net.MessagingService; import 
org.apache.cassandra.schema.ColumnMetadata; -import org.apache.cassandra.utils.CassandraUInt; import org.apache.cassandra.utils.memory.ByteBufferCloner; import org.apache.cassandra.utils.memory.Cloner; @@ -43,19 +42,8 @@ * 2) expiring cells: on top of regular cells, those have a ttl and a local deletion time (when they are expired). * 3) tombstone cells: those won't have value, but they have a local deletion time (when the tombstone was created). */ -public abstract class Cell extends ColumnData +public abstract class Cell extends ColumnData implements CellData> { - public static final int NO_TTL = 0; - public static final long NO_DELETION_TIME = Long.MAX_VALUE; - public static final int NO_DELETION_TIME_UNSIGNED_INTEGER = CassandraUInt.MAX_VALUE_UINT; - public static final long MAX_DELETION_TIME = CassandraUInt.MAX_VALUE_LONG - 2; - public static final int MAX_DELETION_TIME_UNSIGNED_INTEGER = CassandraUInt.fromLong(MAX_DELETION_TIME); - - // Since C14227 we only support Uints, negative ldts (corruption, overflow) get converted to this - public static final long INVALID_DELETION_TIME = CassandraUInt.MAX_VALUE_LONG - 1; - // Do not use. Only for legacy ser/deser pre CASSANDRA-14227 and backwards compatible CAP policies - public static final int MAX_DELETION_TIME_2038_LEGACY_CAP = Integer.MAX_VALUE - 1; - public final static Comparator> comparator = (c1, c2) -> { int cmp = c1.column().compareTo(c2.column()); @@ -78,100 +66,6 @@ protected Cell(ColumnMetadata column) super(column); } - public static int deletionTimeLongToUnsignedInteger(long deletionTime) - { - return deletionTime == NO_DELETION_TIME ? NO_DELETION_TIME_UNSIGNED_INTEGER : CassandraUInt.fromLong(deletionTime); - } - - public static long deletionTimeUnsignedIntegerToLong(int deletionTimeUnsignedInteger) - { - return deletionTimeUnsignedInteger == NO_DELETION_TIME_UNSIGNED_INTEGER ? 
NO_DELETION_TIME : CassandraUInt.toLong(deletionTimeUnsignedInteger); - } - - public static long getVersionedMaxDeletiontionTime() - { - if (DatabaseDescriptor.getStorageCompatibilityMode().disabled()) - // The whole cluster is 2016, we're out of the 2038/2106 mixed cluster scenario. Shortcut to avoid the 'minClusterVersion' volatile read - return Cell.MAX_DELETION_TIME; - else - return MessagingService.Version.supportsExtendedDeletionTime(MessagingService.instance().versions.minClusterVersion) - ? Cell.MAX_DELETION_TIME - : Cell.MAX_DELETION_TIME_2038_LEGACY_CAP; - } - - /** - * Whether the cell is a counter cell or not.CassandraUInt - * - * @return whether the cell is a counter cell or not. - */ - public abstract boolean isCounterCell(); - - public abstract V value(); - - public abstract ValueAccessor accessor(); - - public int valueSize() - { - return accessor().size(value()); - } - - public ByteBuffer buffer() - { - return accessor().toBuffer(value()); - } - - /** - * The cell timestamp. - *

- * @return the cell timestamp. - */ - public abstract long timestamp(); - - /** - * The cell ttl. - * - * @return the cell ttl, or {@code NO_TTL} if the cell isn't an expiring one. - */ - public abstract int ttl(); - - /** - * The cell local deletion time. - * - * @return the cell local deletion time, or {@code NO_DELETION_TIME} if the cell is neither - * a tombstone nor an expiring one. - */ - public long localDeletionTime() - { - return deletionTimeUnsignedIntegerToLong(localDeletionTimeAsUnsignedInt()); - } - - /** - * Whether the cell is a tombstone or not. - * - * @return whether the cell is a tombstone or not. - */ - public abstract boolean isTombstone(); - - /** - * Whether the cell is an expiring one or not. - *

- * Note that this only correspond to whether the cell liveness info - * have a TTL or not, but doesn't tells whether the cell is already expired - * or not. You should use {@link #isLive} for that latter information. - * - * @return whether the cell is an expiring one or not. - */ - public abstract boolean isExpiring(); - - /** - * Whether the cell is live or not given the current time. - * - * @param nowInSec the current time in seconds. This is used to - * decide if an expiring cell is expired or live. - * @return whether the cell is live or not at {@code nowInSec}. - */ - public abstract boolean isLive(long nowInSec); - /** * For cells belonging to complex types (non-frozen collection and UDT), the * path to the cell. @@ -186,6 +80,9 @@ public long localDeletionTime() public abstract Cell withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, long newLocalDeletionTime); + @Override + public abstract Cell updateAllTimestamp(long newTimestamp); + /** * Used to apply the same optimization as in {@link Cell.Serializer#deserialize} when * the column is not queried but eventhough it's used for digest calculation. @@ -193,6 +90,8 @@ public long localDeletionTime() */ public abstract Cell withSkippedValue(); + public abstract Cell withPath(CellPath path); + @Override public final Cell clone(Cloner cloner) { @@ -210,8 +109,11 @@ public final Cell clone(Cloner cloner) public abstract Cell purge(DeletionPurger purger, long nowInSec); @Override - // Overrides super type to provide a more precise return type. - public abstract Cell purgeDataOlderThan(long timestamp); + public Cell purgeDataOlderThan(long timestamp) + { + return timestamp() < timestamp ? null : this; + } + public abstract int localDeletionTimeAsUnsignedInt(); @@ -227,7 +129,7 @@ public static long decodeLocalDeletionTime(long localDeletionTime, int ttl, Dese { // Overflown signed int, decode to long. 
The result is guaranteed > ttl (and any signed int) return MessagingService.Version.supportsExtendedDeletionTime(helper.version) - ? deletionTimeUnsignedIntegerToLong((int) localDeletionTime) : INVALID_DELETION_TIME; + ? CellData.deletionTimeUnsignedIntegerToLong((int) localDeletionTime) : INVALID_DELETION_TIME; } if (ttl == LivenessInfo.EXPIRED_LIVENESS_TTL) @@ -237,6 +139,19 @@ public static long decodeLocalDeletionTime(long localDeletionTime, int ttl, Dese // timestamp on expiry. } + @Override + public Cell withNewData(long timestamp, long localDeletionTime, int ttl, ByteBuffer value) + { + return new BufferCell(column(), timestamp, ttl, localDeletionTime, value, path()); + } + + @Override + public Cell toCell(ColumnMetadata column, CellPath cellPath) + { + assert false : "toCell should not be called when CellData is already a cell."; + return this; + } + /** * The serialization format for cell is: * [ flags ][ timestamp ][ deletion time ][ ttl ][ path size ][ path ][ value size ][ value ] diff --git a/src/java/org/apache/cassandra/db/rows/CellData.java b/src/java/org/apache/cassandra/db/rows/CellData.java new file mode 100644 index 000000000000..7b6970ec1062 --- /dev/null +++ b/src/java/org/apache/cassandra/db/rows/CellData.java @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.rows; + +import java.nio.ByteBuffer; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.DeletionPurger; +import org.apache.cassandra.db.IDataSize; +import org.apache.cassandra.db.marshal.ValueAccessor; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.utils.CassandraUInt; +import org.apache.cassandra.utils.memory.ByteBufferCloner; +import org.apache.cassandra.utils.memory.Cloner; + +public interface CellData> extends IDataSize +{ + public static final int NO_TTL = 0; + public static final long NO_DELETION_TIME = Long.MAX_VALUE; + public static final int NO_DELETION_TIME_UNSIGNED_INTEGER = CassandraUInt.MAX_VALUE_UINT; + public static final long MAX_DELETION_TIME = CassandraUInt.MAX_VALUE_LONG - 2; + public static final int MAX_DELETION_TIME_UNSIGNED_INTEGER = CassandraUInt.fromLong(MAX_DELETION_TIME); + + // Since C14227 we only support Uints, negative ldts (corruption, overflow) get converted to this + public static final long INVALID_DELETION_TIME = CassandraUInt.MAX_VALUE_LONG - 1; + // Do not use. Only for legacy ser/deser pre CASSANDRA-14227 and backwards compatible CAP policies + public static final int MAX_DELETION_TIME_2038_LEGACY_CAP = Integer.MAX_VALUE - 1; + + public static int deletionTimeLongToUnsignedInteger(long deletionTime) + { + return deletionTime == NO_DELETION_TIME ? NO_DELETION_TIME_UNSIGNED_INTEGER : CassandraUInt.fromLong(deletionTime); + } + + public static long deletionTimeUnsignedIntegerToLong(int deletionTimeUnsignedInteger) + { + return deletionTimeUnsignedInteger == NO_DELETION_TIME_UNSIGNED_INTEGER ? 
+ NO_DELETION_TIME : CassandraUInt.toLong(deletionTimeUnsignedInteger); + } + + public static long getVersionedMaxDeletiontionTime() + { + if (DatabaseDescriptor.getStorageCompatibilityMode().disabled()) + // The whole cluster is 2106, we're out of the 2038/2106 mixed cluster scenario. Shortcut to avoid the 'minClusterVersion' volatile read + return Cell.MAX_DELETION_TIME; + else + return MessagingService.Version.supportsExtendedDeletionTime(MessagingService.instance().versions.minClusterVersion) + ? Cell.MAX_DELETION_TIME + : Cell.MAX_DELETION_TIME_2038_LEGACY_CAP; + } + + /** + * Whether the cell is a counter cell or not. + * + * @return whether the cell is a counter cell or not. + */ + boolean isCounterCell(); + + V value(); + + ValueAccessor accessor(); + + default int valueSize() + { + return accessor().size(value()); + } + + default ByteBuffer buffer() + { + return accessor().toBuffer(value()); + } + + /** + * The cell timestamp. + *

+ * @return the cell timestamp. + */ + long timestamp(); + + /** + * The cell ttl. + * + * @return the cell ttl, or {@code NO_TTL} if the cell isn't an expiring one. + */ + int ttl(); + + /** + * The cell local deletion time. + * + * @return the cell local deletion time, or {@code NO_DELETION_TIME} if the cell is neither + * a tombstone nor an expiring one. + */ + default long localDeletionTime() + { + return deletionTimeUnsignedIntegerToLong(localDeletionTimeAsUnsignedInt()); + } + + int localDeletionTimeAsUnsignedInt(); + + /** + * Whether the cell is a tombstone or not. + * + * @return whether the cell is a tombstone or not. + */ + default boolean isTombstone() + { + return localDeletionTimeAsUnsignedInt() != NO_DELETION_TIME_UNSIGNED_INTEGER && ttl() == NO_TTL; + } + + + /** + * Whether the cell is an expiring one or not. + *

+ * Note that this only corresponds to whether the cell liveness info + * has a TTL or not, but doesn't tell whether the cell is already expired + * or not. You should use {@link #isLive} for that latter information. + * + * @return whether the cell is an expiring one or not. + */ + default boolean isExpiring() + { + return ttl() != NO_TTL; + } + + /** + * Whether the cell is live or not given the current time. + * + * @param nowInSec the current time in seconds. This is used to + * decide if an expiring cell is expired or live. + * @return whether the cell is live or not at {@code nowInSec}. + */ + default boolean isLive(long nowInSec) + { + return localDeletionTimeAsUnsignedInt() == NO_DELETION_TIME_UNSIGNED_INTEGER + || (ttl() != NO_TTL && nowInSec < localDeletionTime()); + } + + default boolean hasInvalidDeletions() + { + if (ttl() < 0 || localDeletionTime() < 0 || (isExpiring() && localDeletionTimeAsUnsignedInt() == NO_DELETION_TIME_UNSIGNED_INTEGER)) + return true; + return false; + } + + long unsharedHeapSize(); + long unsharedHeapSizeExcludingData(); + + + C withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, long newLocalDeletionTime); + + C updateAllTimestamp(long newTimestamp); + + /** + * Used to apply the same optimization as in {@link Cell.Serializer#deserialize} when + * the column is not queried but even though it's used for digest calculation. + * @return a cell with an empty buffer as value + */ + C withSkippedValue(); + + C clone(Cloner cloner); + + C clone(ByteBufferCloner cloner); + + C purge(DeletionPurger purger, long nowInSec); + C purgeDataOlderThan(long timestamp); + + C markCounterLocalToBeCleared(); + + /** + * Returns a cell with the same column and path as this one, but with new data (timestamps and value). + * Note that this can and will return a cell/CellData of a different type.
+ */ + C withNewData(long timestamp, long localDeletionTime, int ttl, ByteBuffer value); + + /** + * Binds a CellData object to the given column and cell path to turn it into a Cell. + */ + Cell toCell(ColumnMetadata column, CellPath cellPath); +} diff --git a/src/java/org/apache/cassandra/db/rows/Cells.java b/src/java/org/apache/cassandra/db/rows/Cells.java index 48331a73a655..23e279ea108b 100644 --- a/src/java/org/apache/cassandra/db/rows/Cells.java +++ b/src/java/org/apache/cassandra/db/rows/Cells.java @@ -65,7 +65,7 @@ public static void collectStats(Cell cell, PartitionStatisticsCollector colle * For non-counter cells, this will always be either {@code c1} or {@code c2}, but for * counter cells this can be a newly allocated cell. */ - public static Cell reconcile(Cell c1, Cell c2) + public static > C reconcile(C c1, C c2) { if (c1 == null || c2 == null) return c2 == null ? c1 : c2; @@ -76,7 +76,7 @@ public static Cell reconcile(Cell c1, Cell c2) return resolveRegular(c1, c2); } - private static Cell resolveRegular(Cell left, Cell right) + private static > C resolveRegular(C left, C right) { long leftTimestamp = left.timestamp(); long rightTimestamp = right.timestamp(); @@ -128,7 +128,7 @@ private static Cell resolveRegular(Cell left, Cell right) return compareValues(left, right) >= 0 ? left : right; } - private static Cell resolveCounter(Cell left, Cell right) + private static > C resolveCounter(C left, C right) { long leftTimestamp = left.timestamp(); long rightTimestamp = right.timestamp(); @@ -168,9 +168,69 @@ private static Cell resolveCounter(Cell left, Cell right) else if (merged == rightValue && timestamp == rightTimestamp) return right; else // merge clocks and timestamps. 
- return new BufferCell(left.column(), timestamp, Cell.NO_TTL, Cell.NO_DELETION_TIME, merged, left.path()); + return left.withNewData(timestamp, Cell.NO_DELETION_TIME, Cell.NO_TTL, merged); } + /** + * Computes the reconciliation of a complex column given its pre-existing + * cells and the ones it is updated with, and generating index update if + * appropriate. + *

+ * Note that this method assumes that the provided cells can meaningfully + * be reconciled together, that is, that the cells are for the same row and same + * complex column. + *

+ * Also note that which cells are provided as {@code existing} and which are + * provided as {@code update} matters for index updates. + * + * @param column the complex column the cells are for. + * @param existing the pre-existing cells, the ones that are updated. This can be + * {@code null} if this reconciliation corresponds to an insertion. + * @param update the newly added cells, the update. This can be {@code null} out + * of convenience, in which case this function simply copies the cells from + * {@code existing} to {@code builder}. + * @param deletion the deletion time that applies to the cells being considered. + * This deletion time may delete cells in both {@code existing} and {@code update}. + * @param builder the row builder to which the result of the reconciliation is written. + */ + public static void reconcileComplex(ColumnMetadata column, + Iterator> existing, + Iterator> update, + DeletionTime deletion, + Row.Builder builder) + { + Comparator comparator = column.cellPathComparator(); + Cell nextExisting = getNext(existing); + Cell nextUpdate = getNext(update); + while (nextExisting != null || nextUpdate != null) + { + int cmp = nextExisting == null ? 1 + : (nextUpdate == null ? -1 + : comparator.compare(nextExisting.path(), nextUpdate.path())); + if (cmp < 0) + { + if (!deletion.deletes(nextExisting)) + builder.addCell(nextExisting); + nextExisting = getNext(existing); + } + else if (cmp > 0) + { + if (!deletion.deletes(nextUpdate)) + builder.addCell(nextUpdate); + nextUpdate = getNext(update); + } + else + { + Cell merged = Cells.reconcile(nextExisting, nextUpdate); + if (!deletion.deletes(merged)) + builder.addCell(merged); + nextExisting = getNext(existing); + nextUpdate = getNext(update); + } + } + } + + /** * Adds to the builder a representation of the given existing cell that, when merged/reconciled with the given * update cell, produces the same result as merging the original with the update.
@@ -248,16 +308,12 @@ private static Cell getNext(Iterator> iterator) return iterator == null || !iterator.hasNext() ? null : iterator.next(); } - private static int compareValues(Cell left, Cell right) + @SuppressWarnings("rawtypes") + private static int compareValues(CellData left, CellData right) { return ValueAccessor.compare(left.value(), left.accessor(), right.value(), right.accessor()); } - public static boolean valueEqual(Cell left, Cell right) - { - return ValueAccessor.equals(left.value(), left.accessor(), right.value(), right.accessor()); - } - public static T composeValue(Cell cell, AbstractType type) { return type.compose(cell.value(), cell.accessor()); diff --git a/src/java/org/apache/cassandra/db/rows/ColumnData.java b/src/java/org/apache/cassandra/db/rows/ColumnData.java index a23cbaf60f35..f51bc83717d6 100644 --- a/src/java/org/apache/cassandra/db/rows/ColumnData.java +++ b/src/java/org/apache/cassandra/db/rows/ColumnData.java @@ -23,12 +23,8 @@ import org.apache.cassandra.db.Digest; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.db.DeletionPurger; -import org.apache.cassandra.db.DeletionTime; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.serializers.MarshalException; -import org.apache.cassandra.utils.btree.BTree; -import org.apache.cassandra.utils.btree.UpdateFunction; -import org.apache.cassandra.utils.caching.TinyThreadLocalPool; import org.apache.cassandra.utils.memory.Cloner; /** @@ -42,24 +38,6 @@ public abstract class ColumnData implements IMeasurableMemory public static final Comparator comparator = (cd1, cd2) -> cd1.column().compareTo(cd2.column()); - /** - * Construct an UpdateFunction for reconciling normal ColumnData - * (i.e. 
not suitable for ComplexColumnDeletion sentinels, but suitable ComplexColumnData or Cell) - * - * @param updateF a consumer receiving all pairs of reconciled cells - * @param activeDeletion the row or partition deletion time to use for purging - */ - public static Reconciler reconciler(PostReconciliationFunction updateF, DeletionTime activeDeletion) - { - TinyThreadLocalPool.TinyPool pool = Reconciler.POOL.get(); - Reconciler reconciler = pool.poll(); - if (reconciler == null) - reconciler = new Reconciler(); - reconciler.init(updateF, activeDeletion); - reconciler.pool = pool; - return reconciler; - } - public static PostReconciliationFunction noOp = new PostReconciliationFunction() { @Override @@ -96,135 +74,6 @@ public interface PostReconciliationFunction void onAllocatedOnHeap(long delta); } - public static class Reconciler implements UpdateFunction, AutoCloseable - { - private static final TinyThreadLocalPool POOL = new TinyThreadLocalPool<>(); - private PostReconciliationFunction postReconcile; - private DeletionTime activeDeletion; - private TinyThreadLocalPool.TinyPool pool; - - private void init(PostReconciliationFunction postReconcile, DeletionTime activeDeletion) - { - this.postReconcile = postReconcile; - this.activeDeletion = activeDeletion; - } - - public ColumnData merge(ColumnData existing, ColumnData update) - { - if (!(existing instanceof ComplexColumnData)) - { - Cell existingCell = (Cell) existing, updateCell = (Cell) update; - Cell result = Cells.reconcile(existingCell, updateCell); - - return postReconcile.merge(existingCell, result); - } - else - { - ComplexColumnData existingComplex = (ComplexColumnData) existing; - ComplexColumnData updateComplex = (ComplexColumnData) update; - - DeletionTime existingDeletion = existingComplex.complexDeletion(); - DeletionTime updateDeletion = updateComplex.complexDeletion(); - DeletionTime maxComplexDeletion = existingDeletion.supersedes(updateDeletion) ? 
existingDeletion : updateDeletion; - - Object[] existingTree = existingComplex.tree(); - Object[] updateTree = updateComplex.tree(); - - Object[] cells; - - try (Reconciler reconciler = reconciler(postReconcile, maxComplexDeletion)) - { - if (!maxComplexDeletion.isLive()) - { - if (maxComplexDeletion == existingDeletion) - { - updateTree = BTree.transformAndFilter(updateTree, reconciler::removeShadowed); - } - else - { - Object[] retained = BTree.transformAndFilter(existingTree, reconciler::retain); - if (existingTree != retained) - { - onAllocatedOnHeap(BTree.sizeOnHeapOf(retained) - BTree.sizeOnHeapOf(existingTree)); - existingTree = retained; - } - } - } - cells = BTree.update(existingTree, updateTree, existingComplex.column.cellComparator(), (UpdateFunction) reconciler); - } - return new ComplexColumnData(existingComplex.column, cells, maxComplexDeletion); - } - } - - @Override - public void onAllocatedOnHeap(long heapSize) - { - postReconcile.onAllocatedOnHeap(heapSize); - } - - @Override - public ColumnData insert(ColumnData insert) - { - return postReconcile.insert(insert); - } - - /** - * Checks if the specified value should be deleted or not. - * - * @param existing the existing value to check - * @return {@code null} if the value should be removed from the BTree or the existing value if it should not. - */ - public ColumnData retain(ColumnData existing) - { - return removeShadowed(existing, postReconcile); - } - - private ColumnData removeShadowed(ColumnData existing) - { - return removeShadowed(existing, ColumnData.noOp); - } - - /** - * Checks if the specified value should be deleted or not. - * - * @param existing the existing value to check - * @return {@code null} if the value should be removed from the BTree or the existing value if it should not. 
- */ - private ColumnData removeShadowed(ColumnData existing, PostReconciliationFunction recordDeletion) - { - if (!(existing instanceof ComplexColumnData)) - { - if (activeDeletion.deletes((Cell) existing)) - { - recordDeletion.delete(existing); - return null; - } - } - else - { - ComplexColumnData existingComplex = (ComplexColumnData) existing; - if (activeDeletion.supersedes(existingComplex.complexDeletion())) - { - Object[] cells = BTree.transformAndFilter(existingComplex.tree(), (ColumnData cd) -> removeShadowed(cd, recordDeletion)); - return BTree.isEmpty(cells) ? null : new ComplexColumnData(existingComplex.column, cells, DeletionTime.LIVE); - } - } - - return existing; - } - - public void close() - { - activeDeletion = null; - postReconcile = null; - - TinyThreadLocalPool.TinyPool tmp = pool; - pool = null; - tmp.offer(this); - - } - } - protected final ColumnMetadata column; protected ColumnData(ColumnMetadata column) { diff --git a/src/java/org/apache/cassandra/db/rows/ComplexColumnData.java b/src/java/org/apache/cassandra/db/rows/ComplexColumnData.java index 5cf23e8bdc1a..e9ea8e22a2bc 100644 --- a/src/java/org/apache/cassandra/db/rows/ComplexColumnData.java +++ b/src/java/org/apache/cassandra/db/rows/ComplexColumnData.java @@ -17,83 +17,34 @@ */ package org.apache.cassandra.db.rows; -import java.nio.ByteBuffer; import java.util.Iterator; -import java.util.Objects; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Function; import org.apache.cassandra.db.DeletionPurger; import org.apache.cassandra.db.DeletionTime; -import org.apache.cassandra.db.Digest; -import org.apache.cassandra.db.LivenessInfo; -import org.apache.cassandra.db.filter.ColumnFilter; -import org.apache.cassandra.db.marshal.ByteType; -import org.apache.cassandra.db.marshal.SetType; import org.apache.cassandra.schema.ColumnMetadata; -import org.apache.cassandra.schema.DroppedColumn; import org.apache.cassandra.utils.BiLongAccumulator; import 
org.apache.cassandra.utils.LongAccumulator; -import org.apache.cassandra.utils.ObjectSizes; -import org.apache.cassandra.utils.SearchIterator; -import org.apache.cassandra.utils.btree.BTree; -import org.apache.cassandra.utils.memory.Cloner; /** * The data for a complex column, that is it's cells and potential complex * deletion time. */ -public class ComplexColumnData extends ColumnData implements Iterable> +public abstract class ComplexColumnData extends ColumnData implements Iterable> { - static final Cell[] NO_CELLS = new Cell[0]; - - private static final long EMPTY_SIZE = ObjectSizes.measure(new ComplexColumnData(ColumnMetadata.regularColumn("", - "", - "", - SetType.getInstance(ByteType.instance, - true)), - NO_CELLS, - DeletionTime.build(0, 0))); - - // The cells for 'column' sorted by cell path. - private final Object[] cells; - - private final DeletionTime complexDeletion; - - ComplexColumnData(ColumnMetadata column, Object[] cells, DeletionTime complexDeletion) + public ComplexColumnData(ColumnMetadata column) { super(column); assert column.isComplex(); - assert cells.length > 0 || !complexDeletion.isLive(); - this.cells = cells; - this.complexDeletion = complexDeletion; } // Used by CNDB - public boolean hasCells() { - return !BTree.isEmpty(this.cells); - } + public abstract boolean hasCells(); - public int cellsCount() - { - return BTree.size(cells); - } + public abstract int cellsCount(); - public Cell getCell(CellPath path) - { - return (Cell) BTree.find(cells, column.asymmetricCellPathComparator(), path); - } + public abstract Cell getCell(CellPath path); - public R reduce(R seed, BTree.ReduceFunction reducer) - { - return BTree.reduce(cells, seed, reducer); - } - - public Cell getCellByIndex(int idx) - { - return BTree.findByIndex(cells, idx); - } + public abstract Cell getCellByIndex(int idx); /** * The complex deletion time of the complex column. 
@@ -106,279 +57,19 @@ public Cell getCellByIndex(int idx) * @return the complex deletion time for the column this is the data of or {@code DeletionTime.LIVE} * if the column is not deleted. */ - public DeletionTime complexDeletion() - { - return complexDeletion; - } - - Object[] tree() - { - return cells; - } - - public Iterator> iterator() - { - return BTree.iterator(cells); - } - - public SearchIterator searchIterator() - { - return BTree.slice(cells, column().asymmetricCellPathComparator(), BTree.Dir.ASC); - } - - public Iterator> reverseIterator() - { - return BTree.iterator(cells, BTree.Dir.DESC); - } - - public long accumulate(LongAccumulator> accumulator, long initialValue) - { - return BTree.accumulate(cells, accumulator, initialValue); - } - - public long accumulate(BiLongAccumulator> accumulator, A arg, long initialValue) - { - return BTree.accumulate(cells, accumulator, arg, initialValue); - } - - public int dataSize() - { - int size = complexDeletion.dataSize(); - for (Cell cell : this) - size += cell.dataSize(); - return size; - } - - @Override - public int liveDataSize(long nowInSec) - { - return complexDeletion.isLive() ? 
dataSize() : 0; - } - - @Override - public long unsharedHeapSize() - { - long heapSize = EMPTY_SIZE + BTree.sizeOnHeapOf(cells) + complexDeletion.unsharedHeapSize(); - return BTree.accumulate(cells, (cell, value) -> value + cell.unsharedHeapSize(), heapSize); - } - - public long unsharedHeapSizeExcludingData() - { - long heapSize = EMPTY_SIZE + BTree.sizeOnHeapOf(cells); - // TODO: this can be turned into a simple multiplication, at least while we have only one Cell implementation - for (Cell cell : this) - heapSize += cell.unsharedHeapSizeExcludingData(); - return heapSize; - } - - public void validate() - { - for (Cell cell : this) - cell.validate(); - } - - public void digest(Digest digest) - { - if (!complexDeletion.isLive()) - complexDeletion.digest(digest); - - for (Cell cell : this) - cell.digest(digest); - } - - public boolean hasInvalidDeletions() - { - if (!complexDeletion.validate()) - return true; - for (Cell cell : this) - if (cell.hasInvalidDeletions()) - return true; - return false; - } - - public ComplexColumnData markCounterLocalToBeCleared() - { - return transformAndFilter(complexDeletion, Cell::markCounterLocalToBeCleared); - } + public abstract DeletionTime complexDeletion(); - public ComplexColumnData filter(ColumnFilter filter, DeletionTime activeDeletion, DroppedColumn dropped, LivenessInfo rowLiveness) - { - ColumnFilter.Tester cellTester = filter.newTester(column); - boolean isQueriedColumn = filter.fetchedColumnIsQueried(column); - if (cellTester == null && activeDeletion.isLive() && dropped == null && isQueriedColumn) - return this; - - DeletionTime newDeletion = activeDeletion.supersedes(complexDeletion) ? 
DeletionTime.LIVE : complexDeletion; - return transformAndFilter(newDeletion, (cell) -> - { - CellPath path = cell.path(); - boolean isForDropped = dropped != null && cell.timestamp() <= dropped.droppedTime; - boolean isShadowed = activeDeletion.deletes(cell); - boolean isFetchedCell = cellTester == null || cellTester.fetches(path); - boolean isQueriedCell = isQueriedColumn && isFetchedCell && (cellTester == null || cellTester.fetchedCellIsQueried(path)); - boolean isSkippableCell = !isFetchedCell || (!isQueriedCell && cell.timestamp() < rowLiveness.timestamp()); - if (isForDropped || isShadowed || isSkippableCell) - return null; - // We should apply the same "optimization" as in Cell.deserialize to avoid discrepances - // between sstables and memtables data, i.e resulting in a digest mismatch. - return isQueriedCell ? cell : cell.withSkippedValue(); - }); - } - - public ComplexColumnData purge(DeletionPurger purger, long nowInSec) - { - DeletionTime newDeletion = complexDeletion.isLive() || purger.shouldPurge(complexDeletion) ? DeletionTime.LIVE : complexDeletion; - return transformAndFilter(newDeletion, (cell) -> cell.purge(purger, nowInSec)); - } - - public ComplexColumnData withOnlyQueriedData(ColumnFilter filter) - { - return transformAndFilter(complexDeletion, (cell) -> filter.fetchedCellIsQueried(column, cell.path()) ? null : cell); - } - - public ComplexColumnData purgeDataOlderThan(long timestamp) - { - DeletionTime newDeletion = complexDeletion.markedForDeleteAt() < timestamp ? 
DeletionTime.LIVE : complexDeletion; - return transformAndFilter(newDeletion, (cell) -> cell.purgeDataOlderThan(timestamp)); - } - - private ComplexColumnData update(DeletionTime newDeletion, Object[] newCells) - { - if (cells == newCells && newDeletion == complexDeletion) - return this; - - if (newDeletion == DeletionTime.LIVE && BTree.isEmpty(newCells)) - return null; - - return new ComplexColumnData(column, newCells, newDeletion); - } - - public ComplexColumnData transformAndFilter(Function, ? extends Cell> function) - { - return update(complexDeletion, BTree.transformAndFilter(cells, function)); - } - - public ComplexColumnData transformAndFilter(DeletionTime newDeletion, Function function) - { - return update(newDeletion, BTree.transformAndFilter(cells, function)); - } - - public ComplexColumnData transform(Function, ? extends Cell> function) - { - return update(complexDeletion, BTree.transform(cells, function)); - } - - @Override - public ColumnData clone(Cloner cloner) - { - return transform(c -> cloner.clone(c)); - } - - public ComplexColumnData updateAllTimestamp(long newTimestamp) - { - DeletionTime newDeletion = complexDeletion.isLive() ? complexDeletion : DeletionTime.build(newTimestamp - 1, complexDeletion.localDeletionTime()); - return transformAndFilter(newDeletion, (cell) -> (Cell) cell.updateAllTimestamp(newTimestamp)); - } - - public long maxTimestamp() - { - long timestamp = complexDeletion.markedForDeleteAt(); - for (Cell cell : this) - timestamp = Math.max(timestamp, cell.timestamp()); - return timestamp; - } - - public long minTimestamp() - { - long timestamp = complexDeletion.isLive() - ? Long.MAX_VALUE - : complexDeletion.markedForDeleteAt(); - for (Cell cell : this) - timestamp = Math.min(timestamp, cell.timestamp()); - return timestamp; - } - - // This is the partner in crime of ArrayBackedRow.setValue. The exact warning apply. The short - // version is: "don't use that method". 
- void setValue(CellPath path, ByteBuffer value) - { - Cell current = (Cell) BTree.find(cells, column.asymmetricCellPathComparator(), path); - BTree.replaceInSitu(cells, column.cellComparator(), current, current.withUpdatedValue(value)); - } - - @Override - public boolean equals(Object other) - { - if (this == other) - return true; - - if(!(other instanceof ComplexColumnData)) - return false; - - ComplexColumnData that = (ComplexColumnData)other; - return this.column().equals(that.column()) - && this.complexDeletion().equals(that.complexDeletion) - && BTree.equals(this.cells, that.cells); - } + public abstract Iterator> reverseIterator(); - @Override - public int hashCode() - { - return Objects.hash(column(), complexDeletion(), BTree.hashCode(cells)); - } + public abstract int liveDataSize(long nowInSec); - @Override - public String toString() - { - return String.format("[%s=%s %s]", - column().name, - complexDeletion.toString(), - BTree.toString(cells)); - } + public abstract long unsharedHeapSize(); - @VisibleForTesting - public static ComplexColumnData unsafeConstruct(ColumnMetadata column, Object[] cells, DeletionTime complexDeletion) - { - return new ComplexColumnData(column, cells, complexDeletion); - } + public abstract long accumulate(LongAccumulator> accumulator, long initialValue); - public static Builder builder() - { - return new Builder(); - } - - public static class Builder - { - private DeletionTime complexDeletion; - private ColumnMetadata column; - private BTree.Builder> builder; - - public void newColumn(ColumnMetadata column) - { - this.column = column; - this.complexDeletion = DeletionTime.LIVE; // default if writeComplexDeletion is not called - if (builder == null) - builder = BTree.builder(column.cellComparator()); - else - builder.reuse(column.cellComparator()); - } + public abstract long accumulate(BiLongAccumulator> accumulator, A arg, long initialValue); - public void addComplexDeletion(DeletionTime complexDeletion) - { - 
this.complexDeletion = complexDeletion; - } + public abstract ComplexColumnData purge(DeletionPurger purger, long nowInSec); - public void addCell(Cell cell) - { - builder.add(cell); - } - - public ComplexColumnData build() - { - if (complexDeletion.isLive() && builder.isEmpty()) - return null; - - return new ComplexColumnData(column, builder.build(), complexDeletion); - } - } + public abstract ComplexColumnData purgeDataOlderThan(long timestamp); } diff --git a/src/java/org/apache/cassandra/db/rows/NativeCell.java b/src/java/org/apache/cassandra/db/rows/NativeCell.java index 59feb9706d99..c09873824ce2 100644 --- a/src/java/org/apache/cassandra/db/rows/NativeCell.java +++ b/src/java/org/apache/cassandra/db/rows/NativeCell.java @@ -74,7 +74,7 @@ public NativeCell(NativeAllocator allocator, ByteBuffer value, CellPath path) { - this(allocator, writeOp, column, timestamp, ttl, deletionTimeLongToUnsignedInteger(localDeletionTime), value, path); + this(allocator, writeOp, column, timestamp, ttl, CellData.deletionTimeLongToUnsignedInteger(localDeletionTime), value, path); } public NativeCell(NativeAllocator allocator, @@ -90,7 +90,8 @@ public NativeCell(NativeAllocator allocator, long size = offHeapSizeWithoutPath(value.remaining()); assert value.order() == ByteOrder.BIG_ENDIAN; - assert column.isComplex() == (path != null); + // Trie-backed rows store path-less cells. 
+ // assert column.isComplex() == (path != null); if (path != null) { assert path.size() == 1 : String.format("Expected path size to be 1 but was not; %s", path); @@ -187,6 +188,12 @@ public long unsharedHeapSize() return EMPTY_SIZE; } + @Override + public Cell withPath(CellPath path) + { + return new BufferCell(column, timestamp(), ttl(), localDeletionTime(), value(), path); + } + @Override public long unsharedHeapSizeExcludingData() { diff --git a/src/java/org/apache/cassandra/db/rows/Row.java b/src/java/org/apache/cassandra/db/rows/Row.java index f4d9f59f64b6..09fbd35fc264 100644 --- a/src/java/org/apache/cassandra/db/rows/Row.java +++ b/src/java/org/apache/cassandra/db/rows/Row.java @@ -21,7 +21,6 @@ import java.util.Arrays; import java.util.Collection; import java.util.Collections; -import java.util.Comparator; import java.util.Iterator; import java.util.List; import java.util.Objects; @@ -34,6 +33,7 @@ import org.apache.cassandra.db.DeletionPurger; import org.apache.cassandra.db.DeletionTime; import org.apache.cassandra.db.Digest; +import org.apache.cassandra.db.IDataSize; import org.apache.cassandra.db.LivenessInfo; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.schema.ColumnMetadata; @@ -44,7 +44,6 @@ import org.apache.cassandra.utils.MergeIterator; import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.Reducer; -import org.apache.cassandra.utils.SearchIterator; import org.apache.cassandra.utils.btree.BTree; import org.apache.cassandra.utils.memory.Cloner; @@ -62,7 +61,7 @@ * it's own data. For instance, a {@code Row} cannot contains a cell that is deleted by its own * row deletion. */ -public interface Row extends Unfiltered, Iterable, IMeasurableMemory +public interface Row extends Unfiltered, Iterable, IMeasurableMemory, IDataSize { /** * The clustering values for this row. 
@@ -124,6 +123,14 @@ public interface Row extends Unfiltered, Iterable, IMeasurableMemory */ public boolean isEmpty(); + /** + * Whether the row has no live data. This means no PK liveness info, no cells + * and no complex deletion info. + * + * @return {@code true} if the row has no data, {@code false} otherwise. + */ + public boolean isEmptyAfterDeletion(); + /** * Whether the row has some live information (i.e. it's not just deletion informations). * @@ -178,38 +185,11 @@ public interface Row extends Unfiltered, Iterable, IMeasurableMemory */ public Iterable> cells(); - /** - * A collection of the ColumnData representation of this row, for columns with some data (possibly not live) present - *

- * The data is returned in column order. - * - * @return a Collection of the non-empty ColumnData for this row. - */ - public Collection columnData(); - - /** - * An iterable over the cells of this row that return cells in "legacy order". - *

- * In 3.0+, columns are sorted so that all simple columns are before all complex columns. Previously - * however, the cells where just sorted by the column name. This iterator return cells in that - * legacy order. It's only ever meaningful for backward/thrift compatibility code. - * - * @param metadata the table this is a row of. - * @param reversed if cells should returned in reverse order. - * @return an iterable over the cells of this row in "legacy order". - */ - public Iterable> cellsInLegacyOrder(TableMetadata metadata, boolean reversed); - /** * Whether the row stores any (non-live) complex deletion for any complex column. */ public boolean hasComplexDeletion(); - /** - * Whether the row stores any (non-RT) data for any complex column. - */ - boolean hasComplex(); - /** * Whether the row has any deletion info (row deletion, cell tombstone, expired cell or complex deletion). * @@ -217,13 +197,6 @@ public interface Row extends Unfiltered, Iterable, IMeasurableMemory */ public boolean hasDeletion(long nowInSec); - /** - * An iterator to efficiently search data for a given column. - * - * @return a search iterator for the cells of this row. - */ - public SearchIterator searchIterator(); - /** * Returns a copy of this row that: * 1) only includes the data for the column included by {@code filter}. @@ -240,24 +213,27 @@ public interface Row extends Unfiltered, Iterable, IMeasurableMemory */ public Row filter(ColumnFilter filter, DeletionTime activeDeletion, boolean setActiveDeletionToRow, TableMetadata metadata); - /** - * Requires that {@code function} returns either {@code null} or {@code ColumnData} for the same column. 
- * - * Returns a copy of this row that: - * 1) {@code function} has been applied to the members of - * 2) doesn't include any {@code null} results of {@code function} - * 3) has precisely the provided {@code LivenessInfo} and {@code Deletion} - */ - public Row transformAndFilter(LivenessInfo info, Deletion deletion, Function function); + + /// Interface used for cell transformations. Because we don't know the type of cell data that the row uses (it can + /// be a full [Cell] including path and column, or just [CellData]), the given function must convert the type the + /// row uses to the same type. + /// + /// This interface unfortunately cannot be given as a lambda, but we can use method references e.g. + /// `CellData::markCounterLocalToBeCleared`. + public interface CellTransformer + { + > C apply(C cellOrCellData); + } /** - * Requires that {@code function} returns either {@code null} or {@code ColumnData} for the same column. + * Requires that {@code cellFunction} returns either {@code null} or {@code Cell} for the same cell. 
* * Returns a copy of this row that: - * 1) {@code function} has been applied to the members of - * 2) doesn't include any {@code null} results of {@code function} + * 1) {@code cellFunction} has been applied to the members of + * 2) doesn't include any {@code null} results of {@code cellFunction} + * 3) has its {@code LivenessInfo} mapped through the given {@code infoFunction} */ - public Row transformAndFilter(Function function); + public Row transformAndFilter(Function infoFunction, CellTransformer cellFunction); public Row clone(Cloner cloner); @@ -354,11 +330,16 @@ public interface Row extends Unfiltered, Iterable, IMeasurableMemory public long accumulate(LongAccumulator accumulator, long initialValue); - public long accumulate(LongAccumulator accumulator, Comparator comparator, ColumnData from, long initialValue); - public long accumulate(BiLongAccumulator accumulator, A arg, long initialValue); - public long accumulate(BiLongAccumulator accumulator, A arg, Comparator comparator, ColumnData from, long initialValue); + /** + * Merge this row with the given update and return the result. + * + * @param update Row to merge in. Must be the same type (b-tree vs trie) as this. + * Rows of different types should instead be merged through {@code Rows.merge}. + * @return The merged row. + */ + public Row mergeWith(Row update); /** * A row deletion/tombstone. @@ -817,14 +798,14 @@ private static class ColumnDataReducer extends Reducer private DeletionTime activeDeletion; - private final ComplexColumnData.Builder complexBuilder; + private final BTreeComplexColumn.Builder complexBuilder; private final List>> complexCells; private final CellReducer cellReducer; public ColumnDataReducer(int size, boolean hasComplex) { this.versions = new ArrayList<>(size); - this.complexBuilder = hasComplex ? ComplexColumnData.builder() : null; + this.complexBuilder = hasComplex ? BTreeComplexColumn.builder() : null; this.complexCells = hasComplex ? 
new ArrayList<>(size) : null; this.cellReducer = new CellReducer(); } diff --git a/src/java/org/apache/cassandra/db/rows/Rows.java b/src/java/org/apache/cassandra/db/rows/Rows.java index 82cbaae304ad..d2ff03387ea2 100644 --- a/src/java/org/apache/cassandra/db/rows/Rows.java +++ b/src/java/org/apache/cassandra/db/rows/Rows.java @@ -256,14 +256,9 @@ public void onKeyChange() iter.next(); } - public static Row merge(Row existing, Row update) - { - return merge(existing, update, ColumnData.noOp); - } - /** - * Merges two rows. In addition to reconciling the cells in each row, the liveness info, and deletion times for - * the row and complex columns are also merged. + * Merges two rows into a new one. In addition to reconciling the cells in each row, the liveness info, and deletion + * times for the row and complex columns are also merged. *

* Note that this method assumes that the provided rows can meaningfully be reconciled together. That is, * that the rows share the same clustering value, and belong to the same partition. @@ -273,11 +268,75 @@ public static Row merge(Row existing, Row update) * * @return the row resulting from the merge. */ - public static Row merge(Row existing, Row update, ColumnData.PostReconciliationFunction onReconcile) + public static Row merge(Row existing, Row update) { - assert existing instanceof BTreeRow; - assert update instanceof BTreeRow; - return BTreeRow.merge((BTreeRow) existing, (BTreeRow) update, onReconcile); + if (existing.getClass() == update.getClass()) + return existing.mergeWith(update); + else + return mergeRowsGeneric(existing, update); + } + + private static Row mergeRowsGeneric(Row existing, Row update) + { + Row.Builder builder = BTreeRow.sortedBuilder(); + Clustering clustering = existing.clustering(); + builder.newRow(clustering); + + LivenessInfo existingInfo = existing.primaryKeyLivenessInfo(); + LivenessInfo updateInfo = update.primaryKeyLivenessInfo(); + LivenessInfo mergedInfo = existingInfo.supersedes(updateInfo) ? existingInfo : updateInfo; + + Row.Deletion rowDeletion = existing.deletion().supersedes(update.deletion()) ? existing.deletion() : update.deletion(); + + if (rowDeletion.deletes(mergedInfo)) + mergedInfo = LivenessInfo.EMPTY; + else if (rowDeletion.isShadowedBy(mergedInfo)) + rowDeletion = Row.Deletion.LIVE; + + builder.addPrimaryKeyLivenessInfo(mergedInfo); + builder.addRowDeletion(rowDeletion); + + DeletionTime deletion = rowDeletion.time(); + + Iterator a = existing.iterator(); + Iterator b = update.iterator(); + ColumnData nexta = a.hasNext() ? a.next() : null, nextb = b.hasNext() ? b.next() : null; + while (nexta != null | nextb != null) + { + int comparison = nexta == null ? 1 : nextb == null ? -1 : nexta.column.compareTo(nextb.column); + ColumnData cura = comparison <= 0 ? nexta : null; + ColumnData curb = comparison >= 0 ? 
nextb : null; + ColumnMetadata column = getColumnMetadata(cura, curb); + if (column.isSimple()) + { + Cell merged = Cells.reconcile((Cell) cura, (Cell) curb); + if (!deletion.deletes(merged)) + builder.addCell(merged); + } + else + { + ComplexColumnData existingData = (ComplexColumnData) cura; + ComplexColumnData updateData = (ComplexColumnData) curb; + + DeletionTime existingDt = existingData == null ? DeletionTime.LIVE : existingData.complexDeletion(); + DeletionTime updateDt = updateData == null ? DeletionTime.LIVE : updateData.complexDeletion(); + DeletionTime maxDt = existingDt.supersedes(updateDt) ? existingDt : updateDt; + if (maxDt.supersedes(deletion)) + builder.addComplexDeletion(column, maxDt); + else + maxDt = deletion; + + Iterator> existingCells = existingData == null ? null : existingData.iterator(); + Iterator> updateCells = updateData == null ? null : updateData.iterator(); + Cells.reconcileComplex(column, existingCells, updateCells, maxDt, builder); + } + + if (cura != null) + nexta = a.hasNext() ? a.next() : null; + if (curb != null) + nextb = b.hasNext() ? b.next() : null; + } + return builder.build(); } /** @@ -352,4 +411,22 @@ public static Row removeShadowedCells(Row existing, Row update, DeletionTime ran Row row = builder.build(); return row != null && !row.isEmpty() ? row : null; } + + /** + * Returns the {@code ColumnMetadata} to use for merging the columns. + * If the 2 column metadata are different the latest one will be returned. 
+ */ + private static ColumnMetadata getColumnMetadata(ColumnData cura, ColumnData curb) + { + if (cura == null) + return curb.column; + + if (curb == null) + return cura.column; + + if (ColumnMetadataVersionComparator.INSTANCE.compare(cura.column, curb.column) >= 0) + return cura.column; + + return curb.column; + } } diff --git a/src/java/org/apache/cassandra/db/rows/TrieBackedComplexColumn.java b/src/java/org/apache/cassandra/db/rows/TrieBackedComplexColumn.java new file mode 100644 index 000000000000..d012b0c648c5 --- /dev/null +++ b/src/java/org/apache/cassandra/db/rows/TrieBackedComplexColumn.java @@ -0,0 +1,304 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.db.rows; + +import java.util.Iterator; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Iterables; +import com.google.common.collect.Iterators; + +import org.apache.cassandra.db.DeletionPurger; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.Digest; +import org.apache.cassandra.db.tries.DeletionAwareTrie; +import org.apache.cassandra.db.tries.Direction; +import org.apache.cassandra.db.tries.Trie; +import org.apache.cassandra.db.tries.TrieEntriesIterator; +import org.apache.cassandra.db.tries.TrieEntriesWalker; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.utils.BiLongAccumulator; +import org.apache.cassandra.utils.LongAccumulator; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.memory.Cloner; + +/** + * The data for a complex column, that is its cells and potential complex deletion time. 
+ */ +public class TrieBackedComplexColumn extends ComplexColumnData +{ + private final DeletionAwareTrie data; + + TrieBackedComplexColumn(ColumnMetadata column, DeletionAwareTrie data) + { + super(column); + assert column.isComplex(); + this.data = data; + } + + @Override + public boolean hasCells() { + return data.contentOnlyTrie().filteredValuesIterator(Direction.FORWARD, CellData.class).hasNext(); + } + + @Override + public int cellsCount() + { + return Iterators.size(data.contentOnlyTrie().filteredValuesIterator(Direction.FORWARD, CellData.class)); + } + + @Override + public Cell getCell(CellPath path) + { + Object cell = data.contentOnlyTrie().get(TrieBackedRow.cellKey(-1, column, path)); + if (cell == null || cell instanceof Cell) + return (Cell) cell; + return ((CellData) cell).toCell(column, path); + } + + @Override + public Cell getCellByIndex(int idx) + { + var entry = Iterators.get(data.contentOnlyTrie().filteredEntryIterator(Direction.FORWARD, CellData.class), idx); // two-argument get: throws IndexOutOfBoundsException when idx is out of range, instead of a NullPointerException on the next line + return cellDataToCell(entry.getValue(), entry.getKey()); + } + + private Cell cellDataToCell(CellData value, byte[] keyBytes, int keyLength) + { + if (value instanceof Cell || value == null) + return (Cell) value; + return value.toCell(column, TrieBackedRow.cellPath(column, ByteSource.preencoded(keyBytes, 0, keyLength))); + } + + private Cell cellDataToCell(CellData value, ByteComparable.Preencoded key) + { + if (value instanceof Cell || value == null) + return (Cell) value; + return value.toCell(column, TrieBackedRow.cellPath(column, key.getPreencodedBytes())); + } + + @VisibleForTesting + public CellData getCellWithoutPath(CellPath path) + { + return (CellData) data.contentOnlyTrie().get(TrieBackedRow.cellKey(-1, column, path)); + } + + @Override + public DeletionTime complexDeletion() + { + return TrieTombstoneMarker.applicableDeletionOrLive(data, ByteComparable.EMPTY); + } + + class CellsWithPath extends TrieEntriesIterator.WithNullFiltering> + { + protected CellsWithPath(Trie 
trie, Direction direction) + { + super(trie, direction); + } + + @Override + protected Cell mapContent(Object content, byte[] bytes, int byteLength) + { + if (!(content instanceof CellData)) + return null; + + return cellDataToCell((CellData) content, bytes, byteLength); + } + } + + @Override + public Iterator> iterator() + { + return new CellsWithPath(data.contentOnlyTrie(), Direction.FORWARD); + } + + @Override + public Iterator> reverseIterator() + { + return new CellsWithPath(data.contentOnlyTrie(), Direction.REVERSE); + } + + @Override + public long accumulate(LongAccumulator> accumulator, long initialValue) + { + class Accumulator extends TrieEntriesWalker + { + long longValue = initialValue; + + @Override + protected void content(Object content, byte[] bytes, int byteLength) + { + if (!(content instanceof CellData)) + return; + + Cell c = cellDataToCell((CellData) content, bytes, byteLength); + longValue = accumulator.apply(c, longValue); + } + + @Override + public Accumulator complete() + { + return this; + } + } + return data.process(Direction.FORWARD, new Accumulator()).longValue; + } + + @Override + public long accumulate(BiLongAccumulator> accumulator, A arg, long initialValue) + { + class Accumulator extends TrieEntriesWalker + { + long longValue = initialValue; + + @Override + protected void content(Object content, byte[] bytes, int byteLength) + { + if (!(content instanceof CellData)) + return; + + Cell c = cellDataToCell((CellData) content, bytes, byteLength); + longValue = accumulator.apply(arg, c, longValue); + } + + @Override + public Accumulator complete() + { + return this; + } + } + return data.process(Direction.FORWARD, new Accumulator()).longValue; + } + + @Override + public int dataSize() + { + int size = complexDeletion().dataSize(); + for (Cell cell : this) + size += cell.dataSize(); + return size; + } + + @Override + public int liveDataSize(long nowInSec) + { + return complexDeletion().isLive() ? 
dataSize() : 0; + } + + @Override + public long unsharedHeapSizeExcludingData() + { + throw new AssertionError("Should be collected by TrieBackedRow"); + } + + @Override + public long unsharedHeapSize() + { + throw new AssertionError("Should be collected by TrieBackedRow"); + } + + @Override + public void validate() + { + throw new AssertionError("Should be done by TrieBackedRow"); + } + + @Override + public void digest(Digest digest) + { + throw new AssertionError("Should be collected by TrieBackedRow"); + } + + @Override + public boolean hasInvalidDeletions() + { + throw new AssertionError("Should be collected by TrieBackedRow"); + } + + @Override + public TrieBackedComplexColumn markCounterLocalToBeCleared() + { + throw new AssertionError("Should be done by TrieBackedRow"); + } + + @Override + public TrieBackedComplexColumn purge(DeletionPurger purger, long nowInSec) + { + throw new AssertionError("Should be done by TrieBackedRow"); + } + + @Override + public TrieBackedComplexColumn purgeDataOlderThan(long timestamp) + { + throw new AssertionError("Should be done by TrieBackedRow"); + } + + @Override + public ColumnData clone(Cloner cloner) + { + throw new AssertionError("Should be done by TrieBackedRow"); + } + + @Override + public TrieBackedComplexColumn updateAllTimestamp(long newTimestamp) + { + throw new AssertionError("Should be done by TrieBackedRow"); + } + + @Override + public long maxTimestamp() + { + throw new AssertionError("Should be collected by TrieBackedRow"); + } + + @Override + public long minTimestamp() + { + throw new AssertionError("Should be collected by TrieBackedRow"); + } + + @Override + public boolean equals(Object other) + { + if (this == other) + return true; + + if(!(other instanceof ComplexColumnData)) + return false; + + ComplexColumnData that = (ComplexColumnData)other; + return this.column().equals(that.column()) // NOTE(review): complexDeletion() is not compared here, unlike the b-tree equals this patch removes -- confirm this is intended + && Iterables.elementsEqual(this, that); + } + + @Override + public int hashCode() + { + throw new 
AssertionError("Should not be used"); + } + + @Override + public String toString() + { + return String.format("[%s=%s %s]", + column().name, + complexDeletion(), + Iterators.toString(iterator())); + } +} diff --git a/src/java/org/apache/cassandra/db/rows/TrieBackedRow.java b/src/java/org/apache/cassandra/db/rows/TrieBackedRow.java new file mode 100644 index 000000000000..cfe1b93b6932 --- /dev/null +++ b/src/java/org/apache/cassandra/db/rows/TrieBackedRow.java @@ -0,0 +1,1440 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.db.rows; + +import java.nio.ByteBuffer; +import java.util.AbstractCollection; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.BiConsumer; +import java.util.function.Consumer; +import java.util.function.Function; + +import com.google.common.base.Predicates; +import com.google.common.collect.Iterators; +import com.google.common.primitives.Ints; + +import org.agrona.collections.Object2IntHashMap; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.Columns; +import org.apache.cassandra.db.DeletionPurger; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.LivenessInfo; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.MultiCellCapableType; +import org.apache.cassandra.db.partitions.TrieBackedPartition; +import org.apache.cassandra.db.tries.DeletionAwareTrie; +import org.apache.cassandra.db.tries.Direction; +import org.apache.cassandra.db.tries.InMemoryDeletionAwareTrie; +import org.apache.cassandra.db.tries.RangeTrie; +import org.apache.cassandra.db.tries.Trie; +import org.apache.cassandra.db.tries.TrieEntriesIterator; +import org.apache.cassandra.db.tries.TrieSet; +import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; +import org.apache.cassandra.db.tries.TrieTailsIterator; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.DroppedColumn; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.BiLongAccumulator; +import org.apache.cassandra.utils.LongAccumulator; +import 
org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; +import org.apache.cassandra.utils.memory.Cloner; + +import static org.apache.cassandra.db.partitions.TrieBackedPartition.BYTE_COMPARABLE_VERSION; +import static org.apache.cassandra.db.partitions.TrieBackedPartition.mergeTombstoneRanges; +import static org.apache.cassandra.db.partitions.TrieBackedPartition.noConflictInData; +import static org.apache.cassandra.db.partitions.TrieBackedPartition.noExistingSelfDeletion; +import static org.apache.cassandra.db.partitions.TrieBackedPartition.noIncomingSelfDeletion; + +/// Immutable implementation of a [Row] object, where the data structure is represented by a trie. +/// Stores either [Cell], or [TrieCellData] (if the row comes from a memtable trie). +/// +/// Trie-backed rows use a mapping of column ids to numbers that is to be obtained from the table metadata. Currently, +/// we don't support mixing different column sets in trie rows, thus all rows in use in one object (partition, memtable) +/// must share the same column set. +public class TrieBackedRow extends AbstractRow +{ + public static final Object COMPLEX_COLUMN_MARKER = new Object() + { + @Override + public String toString() + { + return "COMPLEX_COLUMN_MARKER"; + } + }; + + static final DeletionAwareTrie EMPTY_ROW = DeletionAwareTrie.singleton(ByteComparable.EMPTY, + BYTE_COMPARABLE_VERSION, + LivenessInfo.EMPTY); + static final Object2IntHashMap EMPTY_COLUMN_IDS = makeColumnIdsMap(Columns.NONE); + + private static final long EMPTY_SIZE = ObjectSizes.measure(new TrieBackedRow(Columns.NONE, EMPTY_COLUMN_IDS, Clustering.EMPTY, LivenessInfo.EMPTY, Row.Deletion.LIVE, EMPTY_ROW)); + + private static final int COLUMN_NOT_PRESENT = -1; + + // A column with this ID cannot exist. 
Used to make sure we don't find anything when we are passed an unknown column definition. + public static final ByteComparable MISSING_COLUMN_KEY = encodeUnsignedInt(Long.MAX_VALUE); + + public static final int MAX_RECURSIVE_LENGTH = 128; + + /// The row's clustering key. + private final Clustering clustering; + + /// Mapping from column id to its numeric index, used to reach the column in the trie. + private final Object2IntHashMap columnIds; + /// List of columns in this row, also used as a map from column index to column id/definition. + private final Columns columns; + + /// Pre-computed column maps for each `Columns` instance. + private static final Map> columnsMapCache = new ConcurrentHashMap<>(); + + // We need to filter the tombstones of a row on every read (twice in fact: first to remove purgeable tombstones, and then after reconciliation to remove + // all tombstones since we don't return them to the client) as well as on compaction. But it's likely that many rows won't have any tombstone at all, so + // we want to speed up that case by not having to iterate/copy the row in this case. We could keep a single boolean telling us if we have tombstones, + // but that doesn't work for expiring columns. So instead we keep the deletion time for the first thing in the row to be deleted. This allows us at any given + // time to know if we have any deleted information or not. If we have any "true" tombstone (i.e. not an expiring cell), this value will be forced to + // Integer.MIN_VALUE, but if we don't and have expiring cells, this will be the time at which the first expiring cell expires. 
If we have no tombstones and + // no expiring cells, this will be Integer.MAX_VALUE; + private int minLocalDeletionTime; + boolean minLocalDeletionTimeSet = false; + + /// Data trie contains: + /// - LivenessInfo header at the root + /// - A Cell for each (simple or complex) cell of the row + /// - Cells may be expiring or even expired (not really expected for memtables but possible) + /// - Deletion branch with tombstones + private final DeletionAwareTrie data; + + /// Copy of the row's liveness info (also stored at the root of the trie). + private final LivenessInfo livenessInfo; + /// Copy of the row-level deletion time as [Row.Deletion] (also stored at the root of the deletion branch). + private final Deletion deletion; + + /// Create a trie-backed version of a row, using the column definitions of the given table metadata. + public static TrieBackedRow from(TableMetadata metadata, Row row) + { + Builder builder = builder(metadata, row.clustering()); + builder.addPrimaryKeyLivenessInfo(row.primaryKeyLivenessInfo()); + builder.addRowDeletion(row.deletion()); + for (ColumnData cd : row) + { + if (cd.column.isSimple()) + builder.addCell((Cell) cd); + else + { + var ccd = (ComplexColumnData) cd; + builder.addComplexDeletion(ccd.column, ccd.complexDeletion()); + for (Cell cell : ccd) + builder.addCell(cell); + } + } + return builder.build(); + } + + /// Returns false if the given object is a level marker with no meaning of its own. Used to drop unproductive + /// markers that can remain after deletions. 
+    public static boolean shouldPreserveContentWithoutChildren(Object o)
+    {
+        // Empty liveness, the complex-column marker and a bare ROW level marker are never preserved.
+        if (o == LivenessInfo.EMPTY || o == COMPLEX_COLUMN_MARKER || o == TrieTombstoneMarker.LevelMarker.ROW)
+            return false;
+        // Any other content that is not a tombstone marker is preserved.
+        if (!(o instanceof TrieTombstoneMarker))
+            return true;
+        TrieTombstoneMarker m = (TrieTombstoneMarker) o;
+        // Tombstone markers that do not carry the ROW level marker are preserved.
+        if (!m.hasLevelMarker(TrieTombstoneMarker.LevelMarker.ROW))
+            return true;
+        // A marker carrying the ROW level marker is preserved only when the deletions on its two sides differ.
+        return !Objects.equals(m.leftDeletion(), m.rightDeletion());
+    }
+
+    /// Static factory equivalent to the `(TableMetadata, Clustering, DeletionAwareTrie)` constructor.
+    public static TrieBackedRow create(TableMetadata tableMetadata, Clustering clustering, DeletionAwareTrie data)
+    {
+        return new TrieBackedRow(tableMetadata, clustering, data);
+    }
+
+    /// Creates a row over the table's columns, selected with
+    /// `columns(clustering == Clustering.STATIC_CLUSTERING)`.
+    TrieBackedRow(TableMetadata tableMetadata, Clustering clustering, DeletionAwareTrie data)
+    {
+        this(tableMetadata.regularAndStaticColumns().columns(clustering == Clustering.STATIC_CLUSTERING), clustering, data);
+    }
+
+    /// Creates a row over the given columns. The column-id map is taken from (or added to) `columnsMapCache`, and
+    /// the liveness info and row deletion are read from `data`.
+    TrieBackedRow(Columns columns, Clustering clustering, DeletionAwareTrie data)
+    {
+        this(columns,
+             columnsMapCache.computeIfAbsent(columns, TrieBackedRow::makeColumnIdsMap),
+             clustering,
+             getLivenessInfo(data),
+             getDeletion(data),
+             data);
+    }
+
+    /// Assigns every field as given, without deriving anything from `data`. Asserts that `data` is not null.
+    private TrieBackedRow(Columns columns,
+                          Object2IntHashMap columnIds,
+                          Clustering clustering,
+                          LivenessInfo livenessInfo,
+                          Deletion deletion,
+                          DeletionAwareTrie data)
+    {
+        this.deletion = deletion;
+        this.livenessInfo = livenessInfo;
+        assert data != null;
+        this.columns = columns;
+        this.columnIds = columnIds;
+        this.clustering = clustering;
+        this.data = data;
+    }
+
+    /// Maps each column name to its index in `columns`.
+    private static Object2IntHashMap makeColumnIdsMap(Columns columns)
+    {
+        // COLUMN_NOT_PRESENT is presumably the map's missing-value sentinel: lookups elsewhere in this class are
+        // compared against it. TODO confirm against Object2IntHashMap.
+        Object2IntHashMap columnIds = new Object2IntHashMap<>(COLUMN_NOT_PRESENT);
+        for (int i = 0; i < columns.size(); i++)
+            columnIds.put(columns.getSimple(i).name, i);
+        return columnIds;
+    }
+
+    /// Deletion trie for a ROW-kind deletion placed at the empty prefix.
+    private static RangeTrie rowDeletionTrie(DeletionTime deletion)
+    {
+        return deletionTrie(ByteComparable.EMPTY, deletion, TrieTombstoneMarker.Kind.ROW);
+    }
+
+    /// Builds a branch deletion trie under `prefix` for `deletion` with the given kind, then adds the row-level
+    /// markers through [#withDeletionRoot].
+    private static RangeTrie deletionTrie(ByteComparable prefix, DeletionTime deletion, TrieTombstoneMarker.Kind kind)
+    {
+        return withDeletionRoot(RangeTrie.branch(prefix,
+                                                 BYTE_COMPARABLE_VERSION,
+                                                 TrieTombstoneMarker.covering(deletion, kind)));
+    }
+
+    /// Creates a row with no columns and empty liveness whose trie contains only the given row deletion.
+    /// `deletion` must not be live (asserted).
+    public static TrieBackedRow emptyDeletedRow(Clustering clustering, DeletionTime deletion)
+    {
+        assert !deletion.isLive();
+        RangeTrie deletionTrie = rowDeletionTrie(deletion);
+        return new TrieBackedRow(Columns.NONE, EMPTY_COLUMN_IDS, clustering, LivenessInfo.EMPTY, Deletion.regular(deletion),
+                                 DeletionAwareTrie.deletionBranch(ByteComparable.EMPTY, BYTE_COMPARABLE_VERSION, deletionTrie));
+    }
+
+    /// Adjust the given deletion trie to include a row-level marker.
+    private static RangeTrie withDeletionRoot(RangeTrie trie)
+    {
+        // Range tries present separate content in the two directions. We need to add a marker in both.
+        // The two point tries below differ only in their boolean argument (true, then false).
+        return trie.mergeWith(RangeTrie.point(ByteComparable.EMPTY,
+                                              BYTE_COMPARABLE_VERSION,
+                                              true,
+                                              TrieTombstoneMarker.LevelMarker.ROW),
+                              TrieTombstoneMarker::mergeUpdate)
+                   .mergeWith(RangeTrie.point(ByteComparable.EMPTY,
+                                              BYTE_COMPARABLE_VERSION,
+                                              false,
+                                              TrieTombstoneMarker.LevelMarker.ROW),
+                              TrieTombstoneMarker::mergeUpdate);
+    }
+
+    /// Folds `accumulator` over every [ColumnData] produced by iterating this row, starting from `initialValue`.
+    @Override
+    public long accumulate(LongAccumulator accumulator, long initialValue)
+    {
+        // TODO: this isn't efficient at all
+        long v = initialValue;
+        for (ColumnData c : this)
+            v = accumulator.apply(c, v);
+        return v;
+    }
+
+    /// Same fold as above, passing `arg` as the first argument of every `accumulator` call.
+    @Override
+    public long accumulate(BiLongAccumulator accumulator, A arg, long initialValue)
+    {
+        // TODO: this isn't efficient at all
+        long v = initialValue;
+        for (ColumnData c : this)
+            v = accumulator.apply(arg, c, v);
+        return v;
+    }
+
+    /// Value consumer that folds liveness info, cell data and deletion markers into the single running `value`.
+    private static class Accumulator implements DeletionAwareTrie.ValueConsumer
+    {
+        final LongAccumulator> cellAccumulator;
+        final LongAccumulator livenessAccumulator;
+        final LongAccumulator markerAccumulator;
+        // Running result; starts at the initial value given to the constructor.
+        long value;
+
+        // Note the parameter order: the cell accumulator comes before the liveness accumulator.
+        Accumulator(long initialValue,
+                    LongAccumulator> cellAccumulator,
+                    LongAccumulator livenessAccumulator,
+                    LongAccumulator markerAccumulator)
+        {
+            this.cellAccumulator = cellAccumulator;
+            this.livenessAccumulator = livenessAccumulator;
+            this.markerAccumulator = markerAccumulator;
+            this.value = initialValue;
+        }
+
+        /// Dispatches live content by type. The complex-column marker is ignored; any other type is an error.
+        @Override
+        public void content(Object content)
+        {
+            if (content instanceof LivenessInfo)
+                value = livenessAccumulator.apply((LivenessInfo) content, value);
+            else if (content instanceof CellData)
+                value = cellAccumulator.apply((CellData) content, value);
+            else if (content != COMPLEX_COLUMN_MARKER)
+                throw new AssertionError("Unexpected content type: " + content);
+        }
+
+        /// Applies the marker accumulator to boundary markers only, using the state returned by
+        /// `applicableToPointForward()` when it is not null.
+        @Override
+        public void deletionMarker(TrieTombstoneMarker marker)
+        {
+            if (marker.isBoundary())
+            {
+                // We only apply the function to one side of the marker; the other has to be already seen as a
+                // succeeding side of a different marker.
+                TrieTombstoneMarker.Covering applicableState = marker.applicableToPointForward();
+                if (applicableState != null)
+                    value = markerAccumulator.apply(applicableState, value);
+            }
+        }
+    }
+
+    /// Accumulate a long value, using the given cell-level functions.
+    ///
+    /// Note: For efficiency, the cell accumulator is given cells without path. If the path is needed, use a different
+    /// `accumulate` method.
+    long accumulate(long initialValue,
+                    LongAccumulator livenessAccumulator,
+                    LongAccumulator> cellAccumulator,
+                    LongAccumulator markerAccumulator)
+    {
+        // Accumulator's constructor takes the cell accumulator before the liveness accumulator, hence the swap.
+        Accumulator accumulator = new Accumulator(initialValue, cellAccumulator, livenessAccumulator, markerAccumulator);
+        data.process(Direction.FORWARD, accumulator);
+        return accumulator.value;
+    }
+
+    /// Largest timestamp among the liveness info, cells and deletion markers visited in the trie.
+    /// Returns `Long.MIN_VALUE` if nothing is visited.
+    @Override
+    public long maxTimestamp()
+    {
+        return accumulate(Long.MIN_VALUE,
+                          (livenessInfo, maxTimestamp) -> Math.max(maxTimestamp, livenessInfo.timestamp()),
+                          (cell, maxTimestamp) -> Math.max(maxTimestamp, cell.timestamp()),
+                          (marker, maxTimestamp) -> Math.max(maxTimestamp, marker.markedForDeleteAt()));
+    }
+
+    /// Smallest timestamp among the liveness info, cells and deletion markers visited in the trie.
+    /// Returns `Long.MAX_VALUE` if nothing is visited.
+    @Override
+    public long minTimestamp()
+    {
+        return accumulate(Long.MAX_VALUE,
+                          (livenessInfo, minTimestamp) -> Math.min(minTimestamp, livenessInfo.timestamp()),
+                          (cell, minTimestamp) -> Math.min(minTimestamp, cell.timestamp()),
+                          (marker, minTimestamp) -> Math.min(minTimestamp, marker.markedForDeleteAt()));
+    }
+
+    @Override
+    public Clustering clustering()
+    {
+        return clustering;
+    }
+
+    /// Returns the liveness info captured when this row object was constructed (not re-read from the trie).
+    @Override
+    public LivenessInfo primaryKeyLivenessInfo()
+    {
+        return livenessInfo;
+    }
+
+    /// Reads the liveness info stored at the empty key of `trie`, or [LivenessInfo#EMPTY] when nothing is stored.
+    public static LivenessInfo getLivenessInfo(DeletionAwareTrie trie)
+    {
+        LivenessInfo info = (LivenessInfo) trie.get(ByteComparable.EMPTY);
+        return info != null ? info : LivenessInfo.EMPTY;
+    }
+
+    @Override
+    public boolean isEmpty()
+    {
+        // Empty has no live or deletion branch but may have an empty row marker.
+        return isEmpty(data);
+    }
+
+    /// Returns true when `data` is null, or when it has neither live content (see [#isEmptyAfterDeletion]) nor a
+    /// deletion.
+    public static boolean isEmpty(DeletionAwareTrie data)
+    {
+        if (data == null)
+            return true;
+
+        if (!isEmptyAfterDeletion(data))
+            return false;
+
+        if (data instanceof InMemoryDeletionAwareTrie)
+        {
+            // the row deletion marker will only be present if there is a deletion present
+            return data.applicableDeletion(ByteComparable.EMPTY) == null;
+        }
+        else
+        {
+            // The row deletion marker may remain even if the data is deleted/filtered out.
+            // Check for the existence of a deletion boundary
+            return !data.deletionBranchAtRoot().filteredValuesIterator(Direction.FORWARD, TrieTombstoneMarker.Boundary.class).hasNext();
+        }
+    }
+
+    @Override
+    public boolean isEmptyAfterDeletion()
+    {
+        return isEmptyAfterDeletion(data);
+    }
+
+    /// Returns true when `data` holds no non-empty liveness info and no cell. Unlike [#isEmpty], this method does
+    /// not check `data` for null.
+    public static boolean isEmptyAfterDeletion(DeletionAwareTrie data)
+    {
+        if (data instanceof InMemoryDeletionAwareTrie)
+        {
+            // the liveness marker will be dropped if there are no cells
+            return data.get(ByteComparable.EMPTY) == null;
+        }
+        else
+        {
+            // The liveness marker may remain even if the data is deleted/filtered out.
+            // Check for the existence of:
+            // - non-empty liveness
+            LivenessInfo info = (LivenessInfo) data.get(ByteComparable.EMPTY);
+            if (info != null && info != LivenessInfo.EMPTY)
+                return false;
+
+            // - a cell
+            return !data.contentOnlyTrie().filteredValuesIterator(Direction.FORWARD, CellData.class).hasNext();
+        }
+    }
+
+    /// Returns the row deletion captured when this row object was constructed (not re-read from the trie).
+    @Override
+    public Deletion deletion()
+    {
+        return deletion;
+    }
+
+    /// Reads the deletion applicable at the empty key of `trie` as a regular [Deletion], or [Deletion#LIVE] when
+    /// there is none.
+    static Deletion getDeletion(DeletionAwareTrie trie)
+    {
+        DeletionTime delTime = TrieTombstoneMarker.applicableDeletion(trie, ByteComparable.EMPTY);
+        if (delTime == null)
+            return Deletion.LIVE;
+        else
+            return Deletion.regular(delTime);
+    }
+
+    /// Encodes the index of `column` within `columns` (as given by `simpleIdx`) as a variable-length unsigned
+    /// integer.
+    public static ByteSource columnKey(Columns columns, ColumnMetadata column)
+    {
+        return ByteSource.variableLengthUnsignedInteger(columns.simpleIdx(column));
+    }
+
+    /// Encodes the first element of `path` with the column's cell-path type and wraps it with a terminator.
+    /// [#getCellPathType] asserts that the column is complex.
+    public static ByteSource cellPathKey(ColumnMetadata column, CellPath path, ByteComparable.Version version)
+    {
+        return ByteSource.withTerminator(ByteSource.TERMINATOR,
+                                         getCellPathType(column).asComparableBytes(path.get(0), version));
+    }
+
+    /// Return a cell key (i.e. path in the trie) for the given column and cell path.
+ private static ByteComparable cellKey(Object2IntHashMap columnIds, ColumnMetadata column, CellPath path) + { + int id = columnIds.getValue(column.name); + if (id == COLUMN_NOT_PRESENT) + return MISSING_COLUMN_KEY; // SAI can call cellKey for static columns on regular rows and vice versa + if (!column.isComplex()) + return encodeUnsignedInt(id); + else + return cellKey(id, column, path); + } + + private static ByteSource columnIdPrefix(int columnId) + { + if (columnId < 0) + return ByteSource.EMPTY; + else + return ByteSource.variableLengthUnsignedInteger(columnId); + } + + /// Return a cell key (i.e. path in the trie) for the given column index and cell path. + /// + /// Note: this method is also used by [TrieBackedComplexColumn] where the column index is in the path leading to the + /// complex column trie. To support this, a `columnId` of -1 is used to skip the column index. + static ByteComparable cellKey(int columnId, ColumnMetadata column, CellPath path) + { + if (path == CellPath.BOTTOM) + return v -> ByteSource.concat(columnIdPrefix(columnId), + ByteSource.oneByte(ByteSource.LT_NEXT_COMPONENT)); + else if (path == CellPath.TOP) + return v -> ByteSource.concat(columnIdPrefix(columnId), + ByteSource.oneByte(ByteSource.GT_NEXT_COMPONENT)); + else + return v -> ByteSource.concat(columnIdPrefix(columnId), + cellPathKey(column, path, v)); + // TODO: figure out a better way to do path slices and remove the leading path byte as +// return v -> ByteSource.concat(columnIdPrefix(columnId), +// ((MultiCellCapableType)column.type).nameComparator().asComparableBytes(path.get(0), v), +// ByteSource.oneByte(ByteSource.TERMINATOR)); + } + + private static AbstractType getCellPathType(ColumnMetadata column) + { + assert column.isComplex(); + return ((MultiCellCapableType) column.type).nameComparator(); + } + + /// Returns the column key, i.e. the column index path in the trie without the part corresponding to the cell path. 
+    private static ByteComparable columnKey(Object2IntHashMap columnIds, ColumnMetadata column)
+    {
+        int id = columnIds.getValue(column.name);
+        // Columns unknown to this row map to the shared MISSING_COLUMN_KEY.
+        if (id == COLUMN_NOT_PRESENT)
+            return MISSING_COLUMN_KEY;
+        return encodeUnsignedInt(id);
+    }
+
+    /// Convert the cell-path part of the trie path into a cell path to use in a [Cell].
+    static CellPath cellPath(ColumnMetadata column, ByteSource.Peekable src)
+    {
+        // The first byte is either one of the two sentinel bytes or the separator that introduces the component.
+        int next = src.next();
+        if (next == ByteSource.LT_NEXT_COMPONENT)
+            return CellPath.BOTTOM;
+        else if (next == ByteSource.GT_NEXT_COMPONENT)
+            return CellPath.TOP;
+
+        ByteSource.Peekable componentSource = ByteSourceInverse.nextComponentSource(src, next);
+        ByteBuffer path = getCellPathType(column).fromComparableBytes(componentSource, BYTE_COMPARABLE_VERSION);
+        return CellPath.create(path);
+        // TODO: figure out a better way to do path slices and remove the leading path byte as
+//        return CellPath.create(getCellPathType(column).fromComparableBytes(src, BYTE_COMPARABLE_VERSION));
+    }
+
+    /// Returns the cell of a simple column (asserted), or null when the trie has no content at its key.
+    @Override
+    public Cell getCell(ColumnMetadata c)
+    {
+        assert !c.isComplex();
+        return getCellInternal(c, null);
+    }
+
+    /// Returns the cell of a complex column (asserted) at the given path, or null when the trie has no content at
+    /// its key.
+    @Override
+    public Cell getCell(ColumnMetadata c, CellPath path)
+    {
+        assert c.isComplex();
+        return getCellInternal(c, path);
+    }
+
+    /// Looks up the content stored at the cell key. Content that already is a [Cell] (or null) is returned as is;
+    /// any other content is treated as [CellData] and converted with the supplied column and path.
+    private Cell getCellInternal(ColumnMetadata c, CellPath path)
+    {
+        Object o = data.get(cellKey(columnIds, c, path));
+        if (o == null || o instanceof Cell)
+            return (Cell) o;
+        CellData cellData = (CellData) o;
+        return cellData.toCell(c, path);
+    }
+
+    /// Returns a view over the sub-trie of the given complex column (asserted), or null when that sub-trie is
+    /// considered empty by [#isColumnDataTrieEmpty].
+    @Override
+    public ComplexColumnData getComplexColumnData(ColumnMetadata c)
+    {
+        assert c.isComplex();
+        DeletionAwareTrie tail = data.tailTrie(columnKey(columnIds, c));
+        if (!isColumnDataTrieEmpty(tail))
+            return new TrieBackedComplexColumn(c, tail);
+        else
+            return null;
+    }
+
+    /// Decides whether a column's tail trie holds nothing worth returning: it is null, or it has neither an
+    /// applicable deletion at its root nor any cell.
+    private static boolean isColumnDataTrieEmpty(DeletionAwareTrie tail)
+    {
+        if (tail == null)
+            return true;
+        // We may be left with only a COMPLEX_COLUMN_MARKER after some transformation.
+        if (tail instanceof InMemoryDeletionAwareTrie)
+            return false; // in-memory trie will drop the marker
+        if (TrieTombstoneMarker.applicableDeletion(tail, ByteComparable.EMPTY) != null)
+            return false;
+        // otherwise it's empty if it has no cells
+        return !tail.filteredValuesIterator(Direction.FORWARD, CellData.class).hasNext();
+    }
+
+    @Override
+    public ColumnData getColumnData(ColumnMetadata c)
+    {
+        return c.isComplex() ? getComplexColumnData(c) : getCell(c);
+    }
+
+    /// Returns a view of the columns that have data in this row. Nothing is cached: every `iterator()` call
+    /// re-iterates the row and every `size()` call recounts the columns.
+    @Override
+    public Collection columns()
+    {
+        return new AbstractCollection()
+        {
+            @Override public Iterator iterator()
+            {
+                return Iterators.transform(TrieBackedRow.this.iterator(), ColumnData::column);
+            }
+            @Override public int size()
+            {
+                return columnCount();
+            }
+        };
+    }
+
+    /// Combine data in the live and deletion branches to identify column roots.
+    private static Object combineDataAndDeletionForColumnIterator(Object content, TrieTombstoneMarker marker, Direction direction)
+    {
+        if (content instanceof CellData)
+            return content;
+        if (content == COMPLEX_COLUMN_MARKER)
+            return content;
+        // We may also have a complex column in the deletion trie, introducing a COLUMN-level deletion.
+        if (marker == null)
+            return null; // any other content in the live part of the trie is not a column root
+        TrieTombstoneMarker.Covering introducedDeletion = marker.succedingState(direction);
+        if (introducedDeletion != null && introducedDeletion.deletionKind() == TrieTombstoneMarker.Kind.COLUMN)
+            return COMPLEX_COLUMN_MARKER;
+        // The comment below describes the COMPLEX_COLUMN_MARKER return just above, not the statement that follows.
+        // This is a complex column deletion marker. Return it as a complex column, which will also result in skipping
+        // the return path marker.
+        return null;
+    }
+
+    /// Forward-direction binding of `combineDataAndDeletionForColumnIterator`, usable as a method reference.
+    private static Object combineDataAndDeletionForColumnIteratorForward(Object content, TrieTombstoneMarker marker)
+    {
+        return combineDataAndDeletionForColumnIterator(content, marker, Direction.FORWARD);
+    }
+
+    /// Reverse-direction binding of `combineDataAndDeletionForColumnIterator`, usable as a method reference.
+    private static Object combineDataAndDeletionForColumnIteratorReverse(Object content, TrieTombstoneMarker marker)
+    {
+        return combineDataAndDeletionForColumnIterator(content, marker, Direction.REVERSE);
+    }
+
+    /// Iterates the columns of a row trie, producing a [Cell] for simple columns and a [TrieBackedComplexColumn]
+    /// for complex ones.
+    static class ColumnDataIterator extends TrieTailsIterator.DeletionAware
+    {
+        private final Columns columns;
+
+        ColumnDataIterator(Columns columns, DeletionAwareTrie trie, Direction direction)
+        {
+            // Pick the combiner that matches the iteration direction.
+            super(trie,
+                  direction,
+                  direction.select(TrieBackedRow::combineDataAndDeletionForColumnIteratorForward,
+                                   TrieBackedRow::combineDataAndDeletionForColumnIteratorReverse),
+                  false);
+            this.columns = columns;
+        }
+
+        @Override
+        protected ColumnData mapContent(Object value, DeletionAwareTrie tailTrie, byte[] bytes, int byteLength)
+        {
+            // value is given by combineDataAndDeletionForColumnIterator above
+            if (value != COMPLEX_COLUMN_MARKER)
+                return cellFromCellData((CellData) value, bytes, byteLength, columns);
+
+            return new TrieBackedComplexColumn(columnMetadataFromPath(bytes, byteLength, columns), tailTrie);
+        }
+
+    }
+
+    /// Decodes the column index from the first `byteLength` bytes of `bytes` and returns that column from
+    /// `columns`. Asserts that the decoded index fits in an int.
+    public static ColumnMetadata columnMetadataFromPath(byte[] bytes, int byteLength, Columns columns)
+    {
+        long columnIndex = ByteSourceInverse.getVariableLengthUnsignedInteger(ByteSource.preencoded(bytes, 0, byteLength));
+        assert ((int) columnIndex) == columnIndex;
+        return columns.getSimple((int) columnIndex);
+    }
+
+    /// Converts stored cell data into a [Cell], using the trie path to recover the column and, for complex
+    /// columns, the cell path. Values that already are cells are returned unchanged.
+    public static Cell cellFromCellData(CellData value, byte[] bytes, int byteLength, Columns columns)
+    {
+        if (value instanceof Cell)
+            return (Cell) value;
+        ByteSource.Peekable pathBytes = ByteSource.preencoded(bytes, 0, byteLength);
+        // Reading the index consumes the column prefix; whatever remains in pathBytes is the cell-path part.
+        long columnIdx = ByteSourceInverse.getVariableLengthUnsignedInteger(pathBytes);
+        // NOTE(review): unlike columnMetadataFromPath, the narrowing to int is not asserted here.
+        ColumnMetadata column = columns.getSimple((int) columnIdx);
+        return value.toCell(column, column.isComplex() ? cellPath(column, pathBytes) : null);
+    }
+
+    /// Count the columns in the provided data trie (a row or a partition trie). This includes simple and complex
+    /// columns, including fully deleted ones.
+    public static int countColumns(DeletionAwareTrie data)
+    {
+        // Counts every value reported by the merged trie.
+        class Counter implements Trie.ValueConsumer
+        {
+            int count = 0;
+
+            @Override
+            public void content(Object content)
+            {
+                ++count;
+            }
+        }
+
+        Counter counter = new Counter();
+        data.mergedTrie(TrieBackedRow::combineDataAndDeletionForColumnIteratorForward)
+            .processSkippingBranches(Direction.FORWARD, counter);
+        return counter.count;
+    }
+
+    /// Number of columns in this row, recomputed from the trie on every call.
+    @Override
+    public int columnCount()
+    {
+        return countColumns(data);
+    }
+
+    @Override
+    public Iterator iterator()
+    {
+        return new ColumnDataIterator(columns, data, Direction.FORWARD);
+    }
+
+    /// Iterates all cells in the live (content-only) part of the trie, in forward order.
+    @Override
+    public Iterable> cells()
+    {
+        return () -> new CellsWithPath(data.contentOnlyTrie(), Direction.FORWARD);
+    }
+
+    /// Filters with no active deletion; see the four-argument overload.
+    @Override
+    public Row filter(ColumnFilter filter, TableMetadata droppedColumnsSource)
+    {
+        return filter(filter, DeletionTime.LIVE, false, droppedColumnsSource);
+    }
+
+    /// Applies dropped-column deletions, the column filter and the active deletion, in that order.
+    ///
+    /// Returns this row unchanged when none of the three applies, and null when the filtered data is empty.
+    @Override
+    public Row filter(ColumnFilter filter, DeletionTime activeDeletion, boolean setActiveDeletionToRow, TableMetadata droppedColumnsSource)
+    {
+        Map droppedColumns = droppedColumnsSource.droppedColumns;
+
+        boolean mayFilterColumns = !filter.fetchesAllColumns(isStatic()) || !filter.allFetchedColumnsAreQueried();
+        // When merging sstable data in Row.Merger#merge(), rowDeletion is removed if it doesn't supersede activeDeletion.
+        boolean mayHaveDeleted = !activeDeletion.isLive();
+        DeletionAwareTrie filteredData = data;
+        if (!mayFilterColumns && !mayHaveDeleted && droppedColumns.isEmpty())
+            return this;
+
+        if (!droppedColumns.isEmpty())
+        {
+            // Filter dropped columns by adding a deletion with the drop time, so that data before the drop time is not
+            // returned.
+            List> drops = new ArrayList<>();
+            for (ColumnMetadata c : columns)
+            {
+                DroppedColumn dropped = droppedColumns.get(c.name.bytes);
+                if (dropped != null)
+                {
+                    drops.add(RangeTrie.branch(columnKey(columnIds, c),
+                                               BYTE_COMPARABLE_VERSION,
+                                               TrieTombstoneMarker.covering(dropped.droppedTime, 0, TrieTombstoneMarker.Kind.COLUMN)));
+                }
+            }
+            if (!drops.isEmpty())
+                filteredData = filteredData.mappingMergeWithDeletion(RangeTrie.merge(drops, TrieTombstoneMarker::merge),
+                                                                     TrieBackedRow::deleteData,
+                                                                     TrieTombstoneMarker::dropShadowedUpdate,
+                                                                     true);
+        }
+
+        if (mayFilterColumns)
+        {
+            // TODO: Column filter may include cell-level filters for complex columns, in both fetched and queried
+            Columns queried = filter.queriedColumns().columns(isStatic());
+            BitSet queriedIds = getColumnIds(queried);
+
+            // Getting queried and fetchedButNotQueried separately and merging looks more efficient, but in general
+            // most of the time we'll either have fetched getting all columns or fetched equal to queried.
+            // Filtering by fetched first avoids one operation in the former case.
+            Columns fetched = filter.fetchedColumns().columns(isStatic());
+            BitSet fetchedIds = getColumnIds(fetched);
+            if (fetchedIds.cardinality() != columns.size())
+                filteredData = restrictToColumnSet(filteredData, fetchedIds);
+
+            // This aliases fetchedIds and mutates it in place; fetchedIds is not read again below.
+            BitSet fetchedButNotQueried = fetchedIds;
+            fetchedButNotQueried.andNot(queriedIds);
+
+            if (!fetchedButNotQueried.isEmpty())
+            {
+                // Keep the fetched-but-not-queried cells, but with their values skipped.
+                DeletionAwareTrie fetchedButNotQueriedData =
+                    filteredData.intersect(TrieSet.ranges(BYTE_COMPARABLE_VERSION, true, true, mapIdsToColumnKeys(fetchedButNotQueried)))
+                                .mapValues(TrieBackedRow::dropCellValue);
+
+                filteredData = filteredData.mergeWith(fetchedButNotQueriedData,
+                                                      (x, y) -> y, // fetchedButNotQueried overrides data cells
+                                                      TrieTombstoneMarker::mergeUpdate,
+                                                      noExistingSelfDeletion(),
+                                                      true);
+            }
+        }
+        // These stay as constructed unless an active deletion is applied below.
+        LivenessInfo filteredLivenessInfo = this.livenessInfo;
+        Deletion filteredDeletion = this.deletion;
+
+        if (mayHaveDeleted)
+        {
+            // Apply the given active deletion. If asked to set, add it as a deletion. If not, only drop content that
+            // it deletes.
+            // NOTE(review): unlike withRowDeletion, both branches build the deletion with RangeTrie.branch directly
+            // rather than through rowDeletionTrie, so withDeletionRoot's ROW level markers are not added here.
+            // Confirm this is intended.
+            if (setActiveDeletionToRow)
+                filteredData = filteredData.mergeWithDeletion(RangeTrie.branch(ByteComparable.EMPTY,
+                                                                               BYTE_COMPARABLE_VERSION,
+                                                                               TrieTombstoneMarker.covering(activeDeletion, TrieTombstoneMarker.Kind.ROW)),
+                                                              TrieBackedRow::deleteData,
+                                                              TrieTombstoneMarker::mergeUpdate,
+                                                              true);
+            else // we need mappingMerge to make sure that the resolver is called for all update markers so that we can drop them
+                filteredData = filteredData.mappingMergeWithDeletion(RangeTrie.branch(ByteComparable.EMPTY,
+                                                                                      BYTE_COMPARABLE_VERSION,
+                                                                                      TrieTombstoneMarker.covering(activeDeletion, TrieTombstoneMarker.Kind.ROW)),
+                                                                     TrieBackedRow::deleteData,
+                                                                     TrieTombstoneMarker::dropShadowedUpdate,
+                                                                     true);
+            // The deletion may have changed both, so re-read them from the resulting trie.
+            filteredLivenessInfo = getLivenessInfo(filteredData);
+            filteredDeletion = getDeletion(filteredData);
+        }
+
+        // TODO: Should we use `fetched` for `columns`? Note the ids cannot change.
+
+        if (isEmpty(filteredData))
+            return null;
+
+        return new TrieBackedRow(columns, columnIds, clustering, filteredLivenessInfo, filteredDeletion, filteredData);
+    }
+
+    /// Restricts `data` to the columns whose ids are set in `fetchedIds`, keeping the root liveness info and
+    /// re-adding the row deletion.
+    private static DeletionAwareTrie restrictToColumnSet(DeletionAwareTrie data, BitSet fetchedIds)
+    {
+        // Because we do not support column-level deletions for simple columns, we need to keep the row-level deletion
+        // at the root. The intersection below moves it down to the cell level.
+        TrieTombstoneMarker.Covering rowDeletion = TrieTombstoneMarker.applicableDeletion(data, ByteComparable.EMPTY);
+
+        // Restrict to the fetched columns with the liveness info.
+        if (!fetchedIds.isEmpty())
+            data = data.intersect(TrieSet.ranges(BYTE_COMPARABLE_VERSION, true, true, mapIdsToColumnKeys(fetchedIds)));
+        else
+            data = DeletionAwareTrie.singleton(ByteComparable.EMPTY, BYTE_COMPARABLE_VERSION, data.get(ByteComparable.EMPTY));
+
+        // Re-add the row deletion and the ascent-side Level.ROW marker, which we lose in the intersection above.
+ if (rowDeletion != null) + data = data.mergeWithDeletion(rowDeletionTrie(rowDeletion), + (x, y) -> y, // Row deletion is already applied + TrieTombstoneMarker::mergeUpdate, + true); + return data; + } + + private static Object deleteData(Object existing, TrieTombstoneMarker marker) + { + return deleteData(marker, existing); + } + + private static Object deleteData(TrieTombstoneMarker marker, Object existing) + { + DeletionTime deletion = marker.applicableToPointForward(); + if (deletion == null) + return existing; + if (existing == COMPLEX_COLUMN_MARKER) + return existing; + if (existing instanceof LivenessInfo) + { + if (deletion.deletes(((LivenessInfo) existing).timestamp())) + return LivenessInfo.EMPTY; + else + return existing; + } + if (existing instanceof CellData) + { + if (deletion.deletes((CellData) existing)) + return null; + else + return existing; + } + throw new AssertionError("Unknown content type: " + existing); + } + + private static Object dropCellValue(Object existing) + { + if (!(existing instanceof CellData)) + return existing; + return ((CellData) existing).withSkippedValue(); + } + + private static ByteComparable[] mapIdsToColumnKeys(BitSet fetchedIds) + { + ByteComparable[] keys = new ByteComparable[fetchedIds.cardinality() * 2]; + int keyPos = 0; + for (int i = fetchedIds.nextSetBit(0); i >= 0; i = fetchedIds.nextSetBit(i + 1)) + { + final int id = i; + ByteComparable columnKey = encodeUnsignedInt(id); + keys[keyPos++] = columnKey; // add twice for inclusive start and end + keys[keyPos++] = columnKey; + } + assert keyPos == keys.length; + return keys; + } + + private static ByteComparable encodeUnsignedInt(long id) + { + return v -> ByteSource.variableLengthUnsignedInteger(id); + } + + private BitSet getColumnIds(Columns fetched) + { + BitSet fetchedIds = new BitSet(); + for (ColumnMetadata c : fetched) + { + int idx = columnIds.getValue(c.name); + if (idx == COLUMN_NOT_PRESENT) + continue; + fetchedIds.set(idx); + } + return fetchedIds; + 
} + + @Override + public Row withOnlyQueriedData(ColumnFilter filter) + { + if (filter.allFetchedColumnsAreQueried()) + return this; + + // TODO: Column filter may include cell-level filters for complex columns + Columns queried = filter.queriedColumns().columns(isStatic()); + BitSet queriedIds = getColumnIds(queried); + if (queriedIds.cardinality() != columns.size()) + return new TrieBackedRow(columns, columnIds, clustering, livenessInfo, deletion, restrictToColumnSet(data, queriedIds)); + else + return this; + } + + @Override + public boolean hasComplexDeletion() + { + for (Iterator it = data.deletionBranchAtRoot().valueIterator(); it.hasNext();) + { + TrieTombstoneMarker marker = it.next(); + TrieTombstoneMarker.Covering introducedDeletion = marker.rightDeletion(); + if (introducedDeletion != null && introducedDeletion.deletionKind() == TrieTombstoneMarker.Kind.COLUMN) + return true; + } + return false; + } + + @Override + public Row markCounterLocalToBeCleared() + { + return transformAndFilter(x -> x, CellData::markCounterLocalToBeCleared); + } + + @Override + public boolean hasDeletion(long nowInSec) + { + return nowInSec >= getMinLocalDeletionTime(); + } + + @Override + public boolean hasInvalidDeletions() + { + return accumulate(0, + (liveness, v) -> (liveness.isExpiring() && (liveness.ttl() < 0 || liveness.localExpirationTime() < 0)) ? 1 : v, + (cell, v) -> cell.hasInvalidDeletions() ? 1 : v, + (marker, v) -> !marker.validate() ? 1 : v) + != 0; + } + + /// Return the backing trie for merging into a partition or memtable. + public DeletionAwareTrie trie() + { + return data; + } + + @Override + public Row updateAllTimestamp(long newTimestamp) + { + return transformAndFilter(liveness -> liveness.withUpdatedTimestamp(newTimestamp), + cell -> cell.updateAllTimestamp(newTimestamp), + dt -> dt.isLive() ? 
dt : DeletionTime.build(newTimestamp - 1, dt.localDeletionTime())); + } + + @Override + public Row withRowDeletion(DeletionTime newDeletion) + { + // Applies the deletion to the branch, removing any shadowed data (caller should ensure there isn't any, but + // we do this properly for safety). + if (newDeletion.isLive()) + return this; + + DeletionAwareTrie newData = data.mergeWithDeletion(rowDeletionTrie(newDeletion), + TrieBackedRow::deleteData, + TrieTombstoneMarker::mergeUpdate, + true); + return new TrieBackedRow(columns, columnIds, clustering, getLivenessInfo(newData), getDeletion(newData), newData); + } + + @Override + public Row purge(DeletionPurger purger, long nowInSec, boolean enforceStrictLiveness) + { + // TODO: evaluate need/performance effect + if (!hasDeletion(nowInSec)) + return this; + + if (enforceStrictLiveness) + { + // when enforceStrictLiveness is set, a row is considered dead when it's PK liveness info is not present + LivenessInfo primaryLiveness = primaryKeyLivenessInfo(); + primaryLiveness = purger.shouldPurge(primaryLiveness, nowInSec) ? LivenessInfo.EMPTY : primaryLiveness; + DeletionTime rowDeletion = TrieTombstoneMarker.applicableDeletion(data, ByteComparable.EMPTY); + rowDeletion = rowDeletion != null && !purger.shouldPurge(rowDeletion) ? rowDeletion : null; + if (primaryLiveness.isEmpty() && rowDeletion == null) + return null; + } + + return transformAndFilter(primaryKeyLivenessInfo -> purger.shouldPurge(primaryKeyLivenessInfo, nowInSec) ? LivenessInfo.EMPTY : primaryKeyLivenessInfo, + cell -> cell.purge(purger, nowInSec), + deletion -> purger.shouldPurge(deletion) ? 
null : deletion); + } + + public Row purgeDataOlderThan(long timestamp, boolean enforceStrictLiveness) + { + if (enforceStrictLiveness) + { + // when enforceStrictLiveness is set, a row is considered dead when it's PK liveness info is not present + DeletionTime rowDeletion = TrieTombstoneMarker.applicableDeletion(data, ByteComparable.EMPTY); + if (primaryKeyLivenessInfo().timestamp() < timestamp && rowDeletion == null || rowDeletion.markedForDeleteAt() < timestamp) + return null; + } + + return transformAndFilter(primaryKeyLivenessInfo -> primaryKeyLivenessInfo.timestamp() < timestamp ? LivenessInfo.EMPTY : primaryKeyLivenessInfo, + c -> c.purgeDataOlderThan(timestamp), + deletion -> deletion.markedForDeleteAt() < timestamp ? null : deletion); + } + + + @Override + public Row transformAndFilter(Function livenessInfoFunction, + CellTransformer cellFunction) + { + return new TrieBackedRow(columns, columnIds, clustering, livenessInfoFunction.apply(livenessInfo), deletion, data.mapValues( + (Object x) -> + { + if (x instanceof LivenessInfo) + { + return livenessInfoFunction.apply((LivenessInfo) x); + } + else if (x instanceof CellData) + { + //noinspection rawtypes + return cellFunction.apply((CellData) x); + } + else + return x; // complex column marker + })); + } + + Row transformAndFilter(Function livenessInfoFunction, + Function, CellData> cellFunction, + Function markerFunction) + { + DeletionAwareTrie mappedData = data.mapValuesAndDeletions( + (Object x) -> + { + if (x instanceof LivenessInfo) + { + return (livenessInfoFunction.apply((LivenessInfo) x)); + } + else if (x instanceof CellData) + { + return cellFunction.apply((CellData) x); + } + else + return x; // complex column marker + }, + t -> t.map(markerFunction)); + if (isEmpty(mappedData)) + return null; + + Deletion newDeletion = deletion; + if (!deletion.isLive()) + { + DeletionTime newDeletionTime = markerFunction.apply(deletion.time()); + if (newDeletionTime != deletion.time()) + newDeletion = 
newDeletionTime != null ? Deletion.regular(newDeletionTime) : Deletion.LIVE; + } + return new TrieBackedRow(columns, + columnIds, + clustering, + livenessInfoFunction.apply(livenessInfo), + newDeletion, + mappedData); + } + + @Override + public Row clone(Cloner cloner) + { + InMemoryDeletionAwareTrie newTrie = newTrie(); + try + { + newTrie.mutator(((ex, toClone) -> toClone instanceof CellData ? ((CellData) toClone).clone(cloner) : toClone), + mergeTombstoneRanges(), + noIncomingSelfDeletion(), + noExistingSelfDeletion(), + true, + Predicates.alwaysFalse()) + .apply(data); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + return new TrieBackedRow(columns, columnIds, cloner.clone(clustering), livenessInfo, deletion, newTrie); + } + + // TODO: Redo size collection to be more direct. + + @Override + public int dataSize() + { + int dataSize = clustering.dataSize() + + primaryKeyLivenessInfo().dataSize() + + deletion().dataSize(); + + return Ints.checkedCast(accumulate((cd, v) -> v + cd.dataSize(), dataSize)); + } + + @Override + public int liveDataSize(long nowInSec) + { + int dataSize = clustering.dataSize() + + primaryKeyLivenessInfo().dataSize() + + deletion().dataSize(); + + return Ints.checkedCast(accumulate((cd, v) -> v + cd.liveDataSize(nowInSec), dataSize)); + } + + @Override + public long unsharedHeapSize() + { + long heapSize = EMPTY_SIZE + clustering.unsharedHeapSizeExcludingData(); + if (data instanceof InMemoryDeletionAwareTrie) + heapSize += ((InMemoryDeletionAwareTrie) data).usedSizeOnHeap(); + + return accumulate(heapSize, + (liveness, v) -> v + liveness.unsharedHeapSize(), + (cell, v) -> v + cell.unsharedHeapSize(), + (marker, v) -> v + marker.unsharedHeapSize()); + } + + @Override + public long unsharedHeapSizeExcludingData() + { + long heapSize = EMPTY_SIZE + clustering.unsharedHeapSizeExcludingData(); + if (data instanceof InMemoryDeletionAwareTrie) + heapSize += ((InMemoryDeletionAwareTrie) data).usedSizeOnHeap(); 
+
+        // Same fold as unsharedHeapSize(), except that cells contribute unsharedHeapSizeExcludingData().
+        return accumulate(heapSize,
+                          (liveness, v) -> v + liveness.unsharedHeapSize(),
+                          (cell, v) -> v + cell.unsharedHeapSizeExcludingData(),
+                          (marker, v) -> v + marker.unsharedHeapSize());
+    }
+
+    /// Calls `function` with every [ColumnData] produced by iterating this row.
+    @Override
+    public void apply(Consumer function)
+    {
+        for (ColumnData cd : this)
+            function.accept(cd);
+    }
+
+    /// Calls `function` with `arg` and every [ColumnData] produced by iterating this row.
+    @Override
+    public void apply(BiConsumer function, A arg)
+    {
+        for (ColumnData cd : this)
+            function.accept(arg, cd);
+    }
+
+    /// Creates a builder over the table's regular and static columns and starts a row with the given clustering.
+    private static Builder builder(TableMetadata metadata, Clustering clustering)
+    {
+        Builder builder = new Builder(metadata.regularAndStaticColumns());
+        builder.newRow(clustering);
+        return builder;
+    }
+
+    /// Creates a builder over the given columns. No row is started; the caller must call `newRow` first.
+    public static Row.Builder builder(RegularAndStaticColumns regularAndStaticColumns)
+    {
+        return new Builder(regularAndStaticColumns);
+    }
+
+    /// Resolves live content that is present on both sides of a merge, dispatching on the type of `update`.
+    /// Liveness info is merged and cell data is reconciled. In the remaining case `existing` is asserted to be the
+    /// complex-column marker and is kept.
+    private static Object mergeData(Object existing, Object update)
+    {
+        if (update instanceof LivenessInfo)
+            return LivenessInfo.merge((LivenessInfo) existing, (LivenessInfo) update);
+        else if (update instanceof CellData)
+        {
+            CellData existingCell = (CellData) existing;
+            CellData updateCell = (CellData) update;
+            //noinspection unchecked
+            return Cells.reconcile(existingCell, updateCell);
+        }
+        else
+        {
+            assert existing == COMPLEX_COLUMN_MARKER;
+            return existing;
+        }
+    }
+
+    /// Creates an empty short-lived in-memory trie that uses `shouldPreserveContentWithoutChildren` as its
+    /// content-preservation predicate.
+    public static InMemoryDeletionAwareTrie newTrie()
+    {
+        return InMemoryDeletionAwareTrie.shortLived(BYTE_COMPARABLE_VERSION, TrieBackedRow::shouldPreserveContentWithoutChildren);
+    }
+
+    /// Merges `updateAsRow` into this row and returns the result as a new row backed by a fresh in-memory trie.
+    ///
+    /// Throws IllegalArgumentException when the update is not a [TrieBackedRow], or when it has columns that this
+    /// row does not.
+    public Row mergeWith(Row updateAsRow)
+    {
+        if (!(updateAsRow instanceof TrieBackedRow))
+            throw new IllegalArgumentException("Merging different row types.");
+        TrieBackedRow update = (TrieBackedRow) updateAsRow;
+        // The result reuses this row's columns and column ids, so the update may not introduce new columns.
+        if (!this.columns.containsAll(update.columns))
+            throw new IllegalArgumentException("Can't handle varying column lists.");
+
+        try
+        {
+            InMemoryDeletionAwareTrie mergedData = newTrie();
+            makeMutator(mergedData)
+                .apply(this.data.mergeWith(update.data,
+                                           TrieBackedRow::mergeData,
+                                           TrieTombstoneMarker::mergeUpdate,
TrieBackedRow::deleteData, + true + )); + return new TrieBackedRow(this.columns, + this.columnIds, + this.clustering, + getLivenessInfo(mergedData), + getDeletion(mergedData), + mergedData); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + + private static long minLocalDeletionTime(CellData cell) + { + return cell.isTombstone() ? Long.MIN_VALUE : cell.localDeletionTime(); + } + + private static long minLocalDeletionTime(LivenessInfo info) + { + return info.isExpiring() ? info.localExpirationTime() : Long.MAX_VALUE; + } + + private static long minLocalDeletionTime(DeletionTime dt) + { + return dt.isLive() ? Long.MAX_VALUE : Long.MIN_VALUE; + } + + public int getMinLocalDeletionTime() + { + if (!minLocalDeletionTimeSet) + { + long accumulated = accumulate(Integer.MAX_VALUE, + (livenessInfo, mldt) -> Math.min(mldt, minLocalDeletionTime(livenessInfo)), + (cell, mldt) -> Math.min(mldt, minLocalDeletionTime(cell)), + (marker, mldt) -> Math.min(mldt, minLocalDeletionTime(marker))); + minLocalDeletionTime = (int) accumulated; + minLocalDeletionTimeSet = true; + } + return minLocalDeletionTime; + } + + class CellsWithPath extends TrieEntriesIterator.WithNullFiltering> + { + protected CellsWithPath(Trie trie, Direction direction) + { + super(trie, direction); + } + + @Override + protected Cell mapContent(Object content, byte[] bytes, int byteLength) + { + if (!(content instanceof CellData)) + return null; + return cellFromCellData((CellData) content, bytes, byteLength, columns); + } + } + + static InMemoryDeletionAwareTrie.Mutator + makeMutator(InMemoryDeletionAwareTrie data) + { + return data.mutator(noConflictInData(), + TrieBackedPartition.mergeTombstoneRanges(), + TrieBackedRow::deleteData, + TrieBackedRow::deleteData, + true, + Predicates.alwaysFalse(), + Predicates.alwaysFalse()); + } + + public static class Builder implements Row.Builder + { + protected final RegularAndStaticColumns regularAndStaticColumns; + protected final 
Object2IntHashMap regularColumnIds; + protected final Object2IntHashMap staticColumnIds; + protected Clustering clustering; + protected Object2IntHashMap columnIds; + private InMemoryDeletionAwareTrie data; + private InMemoryDeletionAwareTrie.Mutator mutator; + + // For complex column at index i of 'columns', we store at complexDeletions[i] its complex deletion. + + protected Builder(RegularAndStaticColumns regularAndStaticColumns) + { + this.regularAndStaticColumns = regularAndStaticColumns; + regularColumnIds = columnsMapCache.computeIfAbsent(regularAndStaticColumns.regulars, TrieBackedRow::makeColumnIdsMap); + staticColumnIds = columnsMapCache.computeIfAbsent(regularAndStaticColumns.statics, TrieBackedRow::makeColumnIdsMap); + reset(); + } + + protected Builder(Builder builder) + { + this.regularAndStaticColumns = builder.regularAndStaticColumns; + this.regularColumnIds = builder.regularColumnIds; + this.staticColumnIds = builder.staticColumnIds; + this.clustering = builder.clustering; + this.columnIds = builder.columnIds; + reset(); + try + { + mutator.apply(builder.data); + } + catch (TrieSpaceExhaustedException e) + { + throw new RuntimeException(e); + } + } + + @Override + public Builder copy() + { + return new Builder(this); + } + + @Override + public boolean isSorted() + { + return true; + } + + @Override + public void newRow(Clustering clustering) + { + assert this.clustering == null; // Ensures we've properly called build() if we've use this builder before + this.clustering = clustering; + this.columnIds = clustering == Clustering.STATIC_CLUSTERING ? 
staticColumnIds : regularColumnIds; + } + + @Override + public Clustering clustering() + { + return clustering; + } + + protected void reset() + { + this.clustering = null; + data = newTrie(); + mutator = makeMutator(data); + } + + @Override + public void addPrimaryKeyLivenessInfo(LivenessInfo info) + { + DeletionTime rowDeletion = TrieTombstoneMarker.applicableDeletion(data, ByteComparable.EMPTY); + if (rowDeletion != null && rowDeletion.deletes(info)) + return; + + try + { + data.putRecursive(ByteComparable.EMPTY, (info), (x, y) -> y); + } + catch (TrieSpaceExhaustedException e) + { + throw new RuntimeException(e); + } + } + + @Override + public void addRowDeletion(Deletion deletion) + { + if (deletion.isLive()) + return; + + try + { + mutator.delete(rowDeletionTrie(deletion.time())); + } + catch (TrieSpaceExhaustedException e) + { + throw new RuntimeException(e); + } + } + + @Override + public void addCell(Cell cell) + { + assert cell.column().isStatic() == (clustering == Clustering.STATIC_CLUSTERING) : "Column is " + cell.column() + ", clustering = " + clustering; + CellPath path = cell.path(); + ByteComparable key = cellKey(columnIds, cell.column, path); + + // TODO: Use apply to take care of this? 
+ DeletionTime cellDeletion = TrieTombstoneMarker.applicableDeletion(data, key); + if (cellDeletion != null && cellDeletion.deletes(cell)) + return; + + try + { + if (path == null || path.dataSize() <= MAX_RECURSIVE_LENGTH) + data.putRecursive(key, cell, (x, y) -> Cells.reconcile((Cell) x, y)); + else // long path, avoid stack overflow by using the apply path + mutator.apply(DeletionAwareTrie.singleton(key, BYTE_COMPARABLE_VERSION, cell)); + + if (path != null) + data.putRecursive(columnKey(columnIds, cell.column), COMPLEX_COLUMN_MARKER, (x, y) -> y); + + if (data.get(ByteComparable.EMPTY) == null) + data.putRecursive(ByteComparable.EMPTY, LivenessInfo.EMPTY, (x, y) -> y); + } + catch (TrieSpaceExhaustedException e) + { + throw new RuntimeException(e); + } + } + + @Override + public void addComplexDeletion(ColumnMetadata column, DeletionTime deletion) + { + if (deletion.isLive()) + return; + + ByteComparable key = columnKey(columnIds, column); + try + { + mutator.delete(deletionTrie(key, deletion, TrieTombstoneMarker.Kind.COLUMN)); + } + catch (TrieSpaceExhaustedException e) + { + throw new RuntimeException(e); + } + } + + @Override + public TrieBackedRow build() + { + TrieBackedRow row = new TrieBackedRow(regularAndStaticColumns.columns(clustering == Clustering.STATIC_CLUSTERING), + columnIds, + clustering, + getLivenessInfo(data), + getDeletion(data), + data); + reset(); + return row; + } + } +} diff --git a/src/java/org/apache/cassandra/db/rows/TrieTombstoneMarker.java b/src/java/org/apache/cassandra/db/rows/TrieTombstoneMarker.java new file mode 100644 index 000000000000..f2740a67279a --- /dev/null +++ b/src/java/org/apache/cassandra/db/rows/TrieTombstoneMarker.java @@ -0,0 +1,920 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.rows; + +import java.util.Collection; +import java.util.Objects; +import java.util.function.Function; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import org.apache.cassandra.cache.IMeasurableMemory; +import org.apache.cassandra.db.ClusteringComparator; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.marshal.ByteArrayAccessor; +import org.apache.cassandra.db.tries.DeletionAwareTrie; +import org.apache.cassandra.db.tries.Direction; +import org.apache.cassandra.db.tries.RangeState; +import org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +/// A tombstone marker used in trie-backed structures. Unlike [RangeTombstoneMarker], this does not include a position +/// in the clustering order, only deletion times. +/// There are two kinds of trie markers: +/// - covering markers, which have a deletion time applicable to some position that is not a boundary in the deletions +/// branch, +/// - boundary markers, which switch from one deletion time (represented as a covering marker) to a different one +/// (either of which may be null). +/// +/// See [RangeState] for further explanation of the types of markers. +/// +/// To be able to easily identify lower-level boundaries inside a deletion branch, we also have "level markers" +/// meta-markers. 
    /// The deletion that is in effect to the right of the current position (i.e. succeeding in the forward direction).
    /// For normal (descent) positions this includes the position itself and its children.
    /// May be `null` when no deletion applies on that side (e.g. a [Boundary] whose right side is open).
    Covering rightDeletion();
    /// Static version of [#mergeWith], in the `(existing, update)` argument order so it can be passed as a method
    /// reference to trie merge operations. The incoming marker performs the merge: the call is
    /// `update.mergeWith(existing)`.
    ///
    /// `existing` may be absent: the [Covering], [Boundary] and [Point] implementations return the update
    /// unchanged when it is `null`.
    static TrieTombstoneMarker mergeUpdate(TrieTombstoneMarker existing, @Nonnull TrieTombstoneMarker update)
    {
        return update.mergeWith(existing);
    }
+ static TrieTombstoneMarker merge(Collection markers) + { + TrieTombstoneMarker acc = null; + for (TrieTombstoneMarker marker : markers) + { + if (acc == null) + acc = marker; + else + acc = acc.mergeWith(marker); + } + return acc; + } + + /// Apply an incoming marker and drop the parts of this marker that do not survive (i.e. supercede) the incoming + /// deletion. The result may be null, this, or a partial version of this. + @Nullable TrieTombstoneMarker dropShadowed(TrieTombstoneMarker deletion); + + /// Static version of [#dropShadowed]. + static @Nullable TrieTombstoneMarker dropShadowedUpdate(TrieTombstoneMarker existing, TrieTombstoneMarker deletion) + { + if (existing == null) + return null; + else + return existing.dropShadowed(deletion); + } + + /// Return a marker that has its timestamps adjusted to the given new value. If this is a boundary between two + /// different deletions, this will convert it to a covering state which is not reported or stored. + TrieTombstoneMarker withUpdatedTimestamp(long newTimestamp); + + /// Apply the given mapper to this marker. The mapper must convert equal deletion times to equal. + @Nullable TrieTombstoneMarker map(Function mapper); + + static Covering covering(DeletionTime deletionTime, Kind kind) + { + return new Covering(deletionTime, kind); + } + + static Point point(DeletionTime deletionTime, Kind kind) + { + return new Point(covering(deletionTime, kind), null); + } + + static Covering covering(long deletedAt, long localDeletionTime, Kind kind) + { + return new Covering(deletedAt, localDeletionTime, kind); + } + + static Point point(long deletedAt, long localDeletionTime, Kind kind) + { + return new Point(covering(deletedAt, localDeletionTime, kind), null); + } + + /// Returns `right` if `left` does not supersede it, `left` otherwise. 
+ static Covering combine(Covering left, Covering right) + { + if (left == null) + return right; + if (right == null) + return left; + if (left.supersedes(right)) + return left; + else + return right; + } + + /// Returns `value` if it survives (i.e. supersedes) `deletion`. + static Covering applyDeletion(Covering value, Covering deletion) + { + if (value == null) + return null; + if (deletion == null) + return value; + if (value.supersedes(deletion)) + return value; + else + return null; + } + + /// Make a marker with a suitable type for the given parameters. + static TrieTombstoneMarker make(Covering left, Covering right, LevelMarker levelMarkerIfPresent) + { + if (levelMarkerIfPresent == null) + { + if (left == right) // includes both being null + return left; + + if (left != null && left.equals(right)) + return left; + } + else if (left == null && right == null) + return levelMarkerIfPresent; + + return new Boundary(left, right, levelMarkerIfPresent); + } + + private static RangeTombstoneMarker makeRangeTombstoneMarker(@Nullable Covering leftDeletion, + @Nullable Covering rightDeletion, + ByteComparable clusteringPrefixAsByteComparable, + ByteComparable.Version byteComparableVersion, + ClusteringComparator comparator) + { + assert byteComparableVersion == ByteComparable.Version.OSS50; + if (leftDeletion == null || leftDeletion.deletionKind != Kind.RANGE) + { + if (rightDeletion == null || rightDeletion.deletionKind != Kind.RANGE) + return null; + else + return new RangeTombstoneBoundMarker(comparator.boundFromByteComparable(ByteArrayAccessor.instance, + clusteringPrefixAsByteComparable, + false), + rightDeletion); + } + + if (rightDeletion == null || rightDeletion.deletionKind != Kind.RANGE) + return new RangeTombstoneBoundMarker(comparator.boundFromByteComparable(ByteArrayAccessor.instance, + clusteringPrefixAsByteComparable, + true), + leftDeletion); + + return new RangeTombstoneBoundaryMarker(comparator.boundaryFromByteComparable(ByteArrayAccessor.instance, + 
    /// Get the deletion in the given deletion-aware trie that applies to the given key, returning DeletionTime.LIVE
    /// if none is found.
    ///
    /// "None" covers two cases: the trie reports no marker for the key, or the marker it reports has no deletion
    /// applicable to the exact point in the forward direction.
    static DeletionTime applicableDeletionOrLive(DeletionAwareTrie data, ByteComparable key)
    {
        TrieTombstoneMarker marker = data.applicableDeletion(key);
        if (marker != null)
        {
            // For point markers this is the point deletion, otherwise the right-hand side of the marker.
            Covering deletion = marker.applicableToPointForward();
            if (deletion != null)
                return deletion;
        }
        return DeletionTime.LIVE;
    }
Boundary) + return other.mergeWith(this); + if (other instanceof Point) + return other.mergeWith(this); + if (other instanceof LevelMarker) + return new Boundary(this, this, (LevelMarker) other); + + return combine(this, (Covering) other); + } + + @Override + public TrieTombstoneMarker dropShadowed(TrieTombstoneMarker deletion) + { + if (deletion == null) + return this; + + if (deletion instanceof Covering) + return applyDeletion(this, (Covering) deletion); + + // We ignore point deletions and level marker in the incoming deletion as we don't have any to remove. + TrieTombstoneMarker other = deletion; + Covering newLeft = applyDeletion(this, other.leftDeletion()); + Covering newRight = applyDeletion(this, other.rightDeletion()); + return make(newLeft, newRight, null); + } + + @Override + public Covering withUpdatedTimestamp(long newTimestamp) + { + return new Covering(newTimestamp, localDeletionTime(), deletionKind); + } + + @Override + public @Nullable Covering map(Function mapper) + { + DeletionTime mapped = mapper.apply(this); + if (mapped == this) + return this; + if (mapped == null || mapped.isLive()) + return null; + return new Covering(mapped, deletionKind); + } + + @Override + public boolean isBoundary() + { + return false; + } + + @Override + public Covering precedingState(Direction direction) + { + return this; + } + + @Override + public TrieTombstoneMarker restrict(boolean applicableBefore, boolean applicableAfter) + { + throw new AssertionError("Restrict is only applicable to boundary markers"); + } + + @Override + public TrieTombstoneMarker asBoundary(Direction direction) + { + return direction.isForward() ? new Boundary(null, this, null) : new Boundary(this, null, null); + } + + @Override + public long unsharedHeapSize() + { + // Note: HEAP_SIZE is used directly by Point and Boundary. Make sure to apply any changes there too. 
+ return HEAP_SIZE; + } + + @Override + public String toString() + { + return super.toString() + '[' + deletionKind + ']'; + } + + // inherits equals and hashcode + } + + static class Boundary implements TrieTombstoneMarker + { + // Every boundary contains one side of a deletion, and for simplicity we assume that any covering deletion we + // interrupt is already accounted for by its end boundaries, so with every new Boundary we add this object's + // size plus one half of a Covering. + static final long UNSHARED_HEAP_SIZE = + ObjectSizes.measure(new Boundary(new Covering(0, 0, null), null, null)) + + Covering.HEAP_SIZE / 2; + + final @Nullable Covering leftDeletion; + final @Nullable Covering rightDeletion; + final @Nullable LevelMarker levelMarkerIfPresent; + + private Boundary(@Nullable Covering left, @Nullable Covering right, LevelMarker levelMarkerIfPresent) + { + assert left != null || right != null; + assert left == null || !left.isLive(); + assert right == null || !right.isLive(); + this.leftDeletion = left; + this.rightDeletion = right; + this.levelMarkerIfPresent = levelMarkerIfPresent; + } + + @Override + public boolean hasLevelMarker(LevelMarker level) + { + return levelMarkerIfPresent == level; + } + + @Override + public RangeTombstoneMarker toRangeTombstoneMarker(ByteComparable clusteringPrefixAsByteComparable, + ByteComparable.Version byteComparableVersion, + ClusteringComparator comparator) + { + return makeRangeTombstoneMarker(leftDeletion, + rightDeletion, + clusteringPrefixAsByteComparable, + byteComparableVersion, + comparator); + } + + @Override + public TrieTombstoneMarker mergeWith(TrieTombstoneMarker existing) + { + if (existing == null) + return this; + + if (existing instanceof Point) + return existing.mergeWith(this); + + if (existing instanceof LevelMarker) + return this.hasLevelMarker((LevelMarker) existing) ? 
this : new Boundary(leftDeletion, rightDeletion, (LevelMarker) existing); + + assert existing.pointDeletion() == null : "Unexpected point deletion in " + existing; + Covering otherLeft = existing.leftDeletion(); + Covering newLeft = combine(leftDeletion, otherLeft); + Covering otherRight = existing.rightDeletion(); + Covering newRight = combine(rightDeletion, otherRight); + LevelMarker otherLevelMarker = (existing instanceof Boundary) ? ((Boundary) existing).levelMarkerIfPresent : null; + LevelMarker newLevelMarker = levelMarkerIfPresent != null ? levelMarkerIfPresent : otherLevelMarker; + + if (leftDeletion == newLeft && rightDeletion == newRight && levelMarkerIfPresent == newLevelMarker) + return this; + if (otherLeft == newLeft && otherRight == newRight && newLevelMarker == otherLevelMarker) + return existing; + return make(newLeft, newRight, newLevelMarker); + } + + @Override + public TrieTombstoneMarker dropShadowed(TrieTombstoneMarker deletion) + { + if (deletion == null) + return this; + + // We ignore point deletion in the incoming deletion as we don't have any to remove. + Covering newLeft = applyDeletion(leftDeletion, deletion.leftDeletion()); + Covering newRight = applyDeletion(rightDeletion, deletion.rightDeletion()); + if (leftDeletion == newLeft && rightDeletion == newRight) + return this; + return make(newLeft, newRight, levelMarkerIfPresent); + } + + @Override + public TrieTombstoneMarker withUpdatedTimestamp(long newTimestamp) + { + Covering newLeft = leftDeletion != null ? leftDeletion.withUpdatedTimestamp(newTimestamp) : null; + Covering newRight = rightDeletion != null ? rightDeletion.withUpdatedTimestamp(newTimestamp) : null; + if (Objects.equals(newLeft, newRight)) + return newLeft; + return new Boundary(newLeft, newRight, levelMarkerIfPresent); + } + + @Override + public @Nullable TrieTombstoneMarker map(Function mapper) + { + Covering newLeft = leftDeletion != null ? 
leftDeletion.map(mapper) : null; + Covering newRight = rightDeletion != null ? rightDeletion.map(mapper) : null; + if (Objects.equals(newLeft, newRight)) + return newLeft; + return new Boundary(newLeft, newRight, levelMarkerIfPresent); + } + + @Override + public boolean isBoundary() + { + return true; + } + + @Override + public Covering precedingState(Direction dir) + { + return dir.isForward() ? leftDeletion : rightDeletion; + } + + @Override + public TrieTombstoneMarker restrict(boolean applicableBefore, boolean applicableAfter) + { + if ((!applicableBefore || leftDeletion == null) && (!applicableAfter || rightDeletion == null)) + return levelMarkerIfPresent; + if (applicableBefore && applicableAfter) + return this; + return new Boundary(applicableBefore ? leftDeletion : null, + applicableAfter ? rightDeletion : null, + levelMarkerIfPresent); + } + + @Override + public TrieTombstoneMarker asBoundary(Direction direction) + { + throw new AssertionError("Already a boundary"); + } + + @Override + public Covering leftDeletion() + { + return leftDeletion; + } + + @Override + public Covering rightDeletion() + { + return rightDeletion; + } + + @Override + public String toString() + { + return (levelMarkerIfPresent != null ? levelMarkerIfPresent + " + " : "") + + (leftDeletion != null ? leftDeletion : "LIVE") + + " -> " + + (rightDeletion != null ? rightDeletion : "LIVE"); + } + + @Override + public long unsharedHeapSize() + { + return UNSHARED_HEAP_SIZE; + } + + @Override + public boolean equals(Object o) + { + if (!(o instanceof Boundary)) return false; + Boundary boundary = (Boundary) o; + return Objects.equals(leftDeletion, boundary.leftDeletion) && + Objects.equals(rightDeletion, boundary.rightDeletion) && + levelMarkerIfPresent == boundary.levelMarkerIfPresent; + } + + @Override + public int hashCode() + { + return Objects.hash(leftDeletion, rightDeletion, levelMarkerIfPresent); + } + } + + /// Point deletion. 
Marks a deletion at the lowest points of the represented data hierarchy where no further + /// complexity can exist below the marked point to improve efficiency compared to bracketing the point with + /// boundaries on both sides. + /// + /// The point deletion applies only to the exact position of the marker (i.e. if there is substructure, this + /// deletion will not be covering for the branch). `isBoundary` returns true even if the applicable covering + /// deletion does not change, because the point must be reported as content. + static class Point implements TrieTombstoneMarker + { + // Every point deletion introduces a new deletion time. If it interrupts an existing deletion, it will reuse + // the Covering object provided by its end bounds. Thus, the unshared size is this object + the size of + // one Covering. + // If the point is also a boundary, we will add half a Covering size (see Boundary). + static final long UNSHARED_HEAP_SIZE = ObjectSizes.measure(new Point(new Covering(0, 0, null), + null)) + + Covering.HEAP_SIZE; + + final @Nullable Covering leftDeletion; + final @Nullable Covering rightDeletion; + final Covering pointDeletion; + + public Point(Covering pointDeletion, @Nullable Covering coveringDeletion) + { + this(pointDeletion, coveringDeletion, coveringDeletion); + } + + public Point(Covering pointDeletion, @Nullable Covering leftDeletion, @Nullable Covering rightDeletion) + { + assert pointDeletion != null; + this.leftDeletion = leftDeletion; + this.rightDeletion = rightDeletion; + this.pointDeletion = pointDeletion; + } + + @Override + public Covering leftDeletion() + { + return leftDeletion; + } + + @Override + public Covering rightDeletion() + { + return rightDeletion; + } + + + @Override + public Covering applicableToPointForward() + { + return pointDeletion; + } + + @Override + public Covering applicableToPointReverse() + { + return pointDeletion; + } + + @Override + public RangeTombstoneMarker toRangeTombstoneMarker(ByteComparable 
clusteringPrefixAsByteComparable, + ByteComparable.Version byteComparableVersion, + ClusteringComparator comparator) + { + if (leftDeletion == rightDeletion) + return null; + + return TrieTombstoneMarker.makeRangeTombstoneMarker(leftDeletion, + rightDeletion, + clusteringPrefixAsByteComparable, + byteComparableVersion, + comparator); + } + + @Override + public TrieTombstoneMarker mergeWith(TrieTombstoneMarker existing) + { + if (existing == null) + return this; + + if (existing == LevelMarker.ROW) + throw new AssertionError("Point deletion on a row marker is invalid"); + + TrieTombstoneMarker existingMarker = existing; + Covering point; + Covering left = combine(leftDeletion, existingMarker.leftDeletion()); + Covering right = combine(rightDeletion, existingMarker.rightDeletion()); + + if (existing instanceof Point) + { + Point existingPoint = (Point) existing; + point = combine(pointDeletion, existingPoint.pointDeletion); + } + else if (existing instanceof Covering) + point = applyDeletion(pointDeletion, (Covering) existingMarker); + else + point = dropIfCoveredByBoth(pointDeletion, existingMarker.leftDeletion(), existingMarker.rightDeletion()); + + return updatedTo(point, left, right); + } + + @Override + public TrieTombstoneMarker dropShadowed(TrieTombstoneMarker deletion) + { + if (deletion == null) + return this; + + TrieTombstoneMarker deletionMarker = deletion; + Covering point; + Covering left = applyDeletion(leftDeletion, deletionMarker.leftDeletion()); + Covering right = applyDeletion(rightDeletion, deletionMarker.rightDeletion()); + + if (deletion instanceof Point) + { + Point deletionPoint = (Point) deletion; + point = applyDeletion(pointDeletion, deletionPoint.pointDeletion); + } + else if (deletion instanceof Covering) + point = applyDeletion(pointDeletion, (Covering) deletionMarker); + else + point = dropIfCoveredByBoth(pointDeletion, deletionMarker.leftDeletion(), deletionMarker.rightDeletion()); + + return updatedTo(point, left, right); + } + + 
@Override + public Covering pointDeletion() + { + return pointDeletion; + } + + /// Returns a copy of this point marker in which every deletion carries `newTimestamp`, while each deletion keeps + /// its own local deletion time and deletion kind. + /// Returns `null` when both `leftDeletion` and `rightDeletion` are set. + @Override + public TrieTombstoneMarker withUpdatedTimestamp(long newTimestamp) + { + if (leftDeletion != null && rightDeletion != null) + return null; // point is subsumed by range deletion, and the boundary turns to covering which is not reported + + // At most one side is non-null past this point, so each side must be rebuilt from its own fields only: + // when rightDeletion is set, leftDeletion is null and must not be dereferenced. + Covering left = leftDeletion != null ? new Covering(newTimestamp, leftDeletion.localDeletionTime(), leftDeletion.deletionKind) : null; + Covering right = rightDeletion != null ? new Covering(newTimestamp, rightDeletion.localDeletionTime(), rightDeletion.deletionKind) : null; + return new Point(new Covering(newTimestamp, pointDeletion.localDeletionTime(), pointDeletion.deletionKind), left, right); + } + + @Override + public @Nullable TrieTombstoneMarker map(Function mapper) + { + Covering point = pointDeletion.map(mapper); + Covering left = leftDeletion != null ? leftDeletion.map(mapper) : null; + Covering right = rightDeletion != null ? rightDeletion.map(mapper) : null; + point = dropIfCoveredByBoth(point, left, right); + return updatedTo(point, left, right); + } + + private Covering dropIfCoveredByBoth(Covering point, Covering left, Covering right) + { + return (left == null || right == null || point.supersedes(left) || point.supersedes(right)) + ? point + : null; + } + + private TrieTombstoneMarker updatedTo(Covering point, Covering left, Covering right) + { + if (point != null) + { + if (point == pointDeletion && left == leftDeletion && right == rightDeletion) + return this; + else + return new Point(point, left, right); + } + else + return make(left, right, null); + } + + @Override + public boolean isBoundary() + { + // Must be reported. + return true; + } + + @Override + public Covering precedingState(Direction direction) + { + return direction.select(leftDeletion, rightDeletion); + } + + @Override + public TrieTombstoneMarker restrict(boolean applicableBefore, boolean applicableAfter) + { + Covering left = applicableBefore ? 
leftDeletion : null; + Covering right = applicableAfter ? rightDeletion : null; + if (left == leftDeletion && right == rightDeletion) + return this; + + return new Point(pointDeletion, left, right); + } + + @Override + public TrieTombstoneMarker asBoundary(Direction direction) + { + throw new AssertionError("Cannot have a row clustering as slice bound."); + } + + @Override + public String toString() + { + if (leftDeletion == rightDeletion) + return pointDeletion + (leftDeletion != null ? "(under " + leftDeletion + ")" : ""); + else + return pointDeletion + " and " + + (leftDeletion != null ? leftDeletion : "LIVE") + " -> " + + (rightDeletion != null ? rightDeletion : "LIVE"); + + } + + @Override + public long unsharedHeapSize() + { + return UNSHARED_HEAP_SIZE + (leftDeletion != rightDeletion ? Covering.HEAP_SIZE / 2 : 0); + } + + @Override + public boolean equals(Object o) + { + if (!(o instanceof Point)) return false; + Point point = (Point) o; + return Objects.equals(leftDeletion, point.leftDeletion) && + Objects.equals(rightDeletion, point.rightDeletion) && + Objects.equals(pointDeletion, point.pointDeletion); + } + + @Override + public int hashCode() + { + return Objects.hash(leftDeletion, rightDeletion, pointDeletion); + } + } + + enum LevelMarker implements TrieTombstoneMarker + { + // We currently only need row level markers. 
+ ROW; + + @Override + public boolean hasLevelMarker(LevelMarker level) + { + return this == level; + } + + @Override + public RangeTombstoneMarker toRangeTombstoneMarker(ByteComparable clusteringPrefixAsByteComparable, + ByteComparable.Version byteComparableVersion, + ClusteringComparator comparator) + { + return null; + } + + @Override + public Covering leftDeletion() + { + return null; + } + + @Override + public Covering rightDeletion() + { + return null; + } + + @Override + public TrieTombstoneMarker mergeWith(TrieTombstoneMarker existing) + { + if (existing == null || existing == this) + return this; + else if (existing instanceof LevelMarker) + throw new AssertionError("Attempt to merge different level markers: " + this + " vs " + existing); + else + return existing.mergeWith(this); + } + + @Nullable + @Override + public TrieTombstoneMarker dropShadowed(TrieTombstoneMarker deletion) + { + return this; + } + + @Override + public TrieTombstoneMarker withUpdatedTimestamp(long newTimestamp) + { + return this; + } + + @Nullable + @Override + public TrieTombstoneMarker map(Function mapper) + { + return this; + } + + @Override + public long unsharedHeapSize() + { + return 0; + } + + @Override + public boolean isBoundary() + { + return true; // to return it in toContent + } + + @Override + public Covering precedingState(Direction direction) + { + return null; + } + + @Override + public TrieTombstoneMarker restrict(boolean applicableBefore, boolean applicableAfter) + { + // Markers must be retained regardless of set coverage. 
+ return this; + } + + @Override + public TrieTombstoneMarker asBoundary(Direction direction) + { + throw new AssertionError("Already a boundary"); + } + + @Override + public String toString() + { + return "Level " + super.toString(); + } + } +} diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIterator.java b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIterator.java index bbef11e8495a..a92fc0c57abc 100644 --- a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIterator.java +++ b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIterator.java @@ -17,7 +17,7 @@ */ package org.apache.cassandra.db.rows; -import org.apache.cassandra.db.*; +import org.apache.cassandra.db.DeletionTime; /** * An iterator over the rows of a given partition that also includes deletion informations. @@ -66,4 +66,13 @@ public default boolean isEmpty() && staticRow().isEmpty() && !hasNext(); } + + /// Ask the iterator to stop issuing tombstones because they are no longer useful to the consumer. + /// Usually done just before filtering out tombstones in [FilteredRows]. + /// If the iterator will not gain efficiency from dropping tombstones, it can reject this call by returning false. 
+ public default boolean stopIssuingTombstones() + { + // ignored in base class + return false; + } } diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java b/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java index 7cba7dcc906b..389200ff0394 100644 --- a/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java +++ b/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java @@ -679,7 +679,7 @@ private void readComplexColumn(ColumnMetadata column, DataInputPlus in, Serializ if (complexDeletion.localDeletionTime() < 0) { if (MessagingService.Version.supportsExtendedDeletionTime(helper.version)) - complexDeletion = DeletionTime.build(complexDeletion.markedForDeleteAt(), Cell.deletionTimeUnsignedIntegerToLong((int) complexDeletion.localDeletionTime())); + complexDeletion = DeletionTime.build(complexDeletion.markedForDeleteAt(), CellData.deletionTimeUnsignedIntegerToLong((int) complexDeletion.localDeletionTime())); else complexDeletion = DeletionTime.build(complexDeletion.markedForDeleteAt(), Cell.INVALID_DELETION_TIME); } diff --git a/src/java/org/apache/cassandra/db/transform/Filter.java b/src/java/org/apache/cassandra/db/transform/Filter.java index e927adf48bb6..43d96253e6b4 100644 --- a/src/java/org/apache/cassandra/db/transform/Filter.java +++ b/src/java/org/apache/cassandra/db/transform/Filter.java @@ -37,9 +37,10 @@ public Filter(long nowInSec, boolean enforceStrictLiveness) @Override protected RowIterator applyToPartition(BaseRowIterator iterator) { + ((UnfilteredRowIterator) iterator).stopIssuingTombstones(); return iterator instanceof UnfilteredRows ? 
new FilteredRows(this, (UnfilteredRows) iterator) - : new FilteredRows((UnfilteredRowIterator) iterator, this); + : new FilteredRows(((UnfilteredRowIterator) iterator), this); } @Override diff --git a/src/java/org/apache/cassandra/db/transform/FilteredRows.java b/src/java/org/apache/cassandra/db/transform/FilteredRows.java index dffb76cd00e1..68fde813f574 100644 --- a/src/java/org/apache/cassandra/db/transform/FilteredRows.java +++ b/src/java/org/apache/cassandra/db/transform/FilteredRows.java @@ -55,6 +55,7 @@ public boolean isEmpty() */ public static RowIterator filter(UnfilteredRowIterator iterator, long nowInSecs) { + iterator.stopIssuingTombstones(); return new Filter(nowInSecs, iterator.metadata().enforceStrictLiveness()).applyToPartition(iterator); } } diff --git a/src/java/org/apache/cassandra/db/transform/UnfilteredRows.java b/src/java/org/apache/cassandra/db/transform/UnfilteredRows.java index b8720fcdaa12..4ea01471f5bf 100644 --- a/src/java/org/apache/cassandra/db/transform/UnfilteredRows.java +++ b/src/java/org/apache/cassandra/db/transform/UnfilteredRows.java @@ -23,6 +23,7 @@ import org.apache.cassandra.db.DeletionTime; import org.apache.cassandra.db.RegularAndStaticColumns; import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.db.rows.Unfiltered; import org.apache.cassandra.db.rows.UnfilteredRowIterator; @@ -73,4 +74,16 @@ public boolean isEmpty() { return staticRow().isEmpty() && partitionLevelDeletion().isLive() && !hasNext(); } + + @Override + public boolean stopIssuingTombstones() + { + if (!input.stopIssuingTombstones()) + return false; + + // If we are stopping tombstones, we must check if any already prepared `next` is a tombstone and drop it if so. 
+ if (next != null && (next.isRangeTombstoneMarker() || ((Row) next).isEmptyAfterDeletion())) + next = null; + return true; + } } diff --git a/src/java/org/apache/cassandra/db/tries/BaseTrie.java b/src/java/org/apache/cassandra/db/tries/BaseTrie.java new file mode 100644 index 000000000000..32e868fb9715 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/BaseTrie.java @@ -0,0 +1,305 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.Iterator; +import java.util.Map; +import java.util.function.BiConsumer; +import java.util.function.Function; +import java.util.function.Predicate; + +import com.google.common.base.Predicates; + +import org.agrona.DirectBuffer; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +/// Base trie interface, providing various transformations of the trie, conversion +/// of its content to other formats (e.g. iterable of values), and several forms of processing. 
+/// +/// For any unimplemented data extraction operations one can build on the [TrieEntriesWalker] (for-each processing) +/// and [TrieEntriesIterator] (to iterator) base classes, which provide the necessary mechanisms to handle walking +/// the trie. +/// +/// See [Trie.md](./Trie.md) for further description of the trie representation model. +/// +/// @param <T> The content type of the trie. +/// @param <Q> The concrete subtype of the trie. +public interface BaseTrie, Q extends BaseTrie> extends CursorWalkable +{ + /// Adapter interface providing most of the methods of a [Cursor.Walker], so that the latter can be used + /// with [#process]. + /// This enables calls like + /// `trie.forEachValue(x -> System.out.println(x));` + /// to be mapped directly to a single call to [#process] without extra allocations. + interface ValueConsumer extends Cursor.Walker + { + @Override + default Void complete() + { + return null; + } + + @Override + default void resetPathLength(int newDepth) + { + // not tracking path + } + + @Override + default void addPathByte(int nextByte) + { + // not tracking path + } + + @Override + default void addPathBytes(DirectBuffer buffer, int pos, int count) + { + // not tracking path + } + } + + /// Call the given consumer on all content values in the trie in order. + default void forEachValue(ValueConsumer consumer) + { + forEachValue(Direction.FORWARD, consumer); + } + + /// Call the given consumer on all content values in the trie, walking it in the given direction. + default void forEachValue(Direction direction, ValueConsumer consumer) + { + process(direction, consumer); + } + + /// Call the given consumer on all (path, content) pairs with non-null content in the trie in order. + default void forEachEntry(BiConsumer consumer) + { + forEachEntry(Direction.FORWARD, consumer); + } + + /// Call the given consumer on all (path, content) pairs with non-null content in the trie in order. 
+ default void forEachEntry(Direction direction, BiConsumer consumer) + { + Cursor cursor = cursor(direction); + cursor.process(new TrieEntriesWalker.WithConsumer<>(consumer, cursor.byteComparableVersion())); + // Note: we can't do the ValueConsumer trick here, because the implementation requires state and cannot be + // implemented with default methods alone. + } + + /// Process the trie using the given [Cursor.Walker]. + default R process(Direction direction, Cursor.Walker walker) + { + return cursor(direction).process(walker); + } + + /// Process the trie using the given [ValueConsumer], skipping all branches below the top content-bearing node. + default void forEachValueSkippingBranches(Direction direction, ValueConsumer consumer) + { + processSkippingBranches(direction, consumer); + } + + /// Call the given consumer on all `(path, content)` pairs with non-null content in the trie in order, skipping all + /// branches below the top content-bearing node. + default void forEachEntrySkippingBranches(Direction direction, Predicate acceptancePredicate, BiConsumer consumer) + { + // TODO: test + Cursor cursor = cursor(direction); + cursor.processSkippingBranches(acceptancePredicate, new TrieEntriesWalker.WithConsumer<>(consumer, cursor.byteComparableVersion())); + // Note: we can't do the ValueConsumer trick here, because the implementation requires state and cannot be + // implemented with default methods alone. + } + + /// Call the given consumer on all `(path, content)` pairs with non-null content in the trie in order, skipping all + /// branches below the top content-bearing node. + default void forEachEntrySkippingBranches(Direction direction, BiConsumer consumer) + { + forEachEntrySkippingBranches(direction, Predicates.alwaysTrue(), consumer); + } + + /// Process the trie using the given [Cursor.Walker], skipping all branches below the top content-bearing node. 
+ default R processSkippingBranches(Direction direction, Cursor.Walker walker) + { + return processSkippingBranches(direction, Predicates.alwaysTrue(), walker); + } + + /// Process the trie using the given [Cursor.Walker], skipping all branches below the top content-bearing node. + default R processSkippingBranches(Direction direction, Predicate acceptancePredicate, Cursor.Walker walker) + { + return cursor(direction).processSkippingBranches(acceptancePredicate, walker); + } + + /// Map-like get by key: returns the cursor's content after descending along `key`, or `null` if the descent fails. + default T get(ByteComparable key) + { + Cursor cursor = cursor(Direction.FORWARD); + if (cursor.descendAlong(key.asComparableBytes(cursor.byteComparableVersion()))) + return cursor.content(); + else + return null; + } + + /// Construct a textual representation of the trie. + default String dump() + { + return dump(Object::toString); + } + + /// Construct a textual representation of the trie using the given content-to-string mapper. + default String dump(Function contentToString) + { + return process(Direction.FORWARD, new TrieDumper.Plain<>(contentToString)); + } + + /// Returns the ordered entry set of this trie's content as an iterable. + default Iterable> entrySet() + { + return this::entryIterator; + } + + /// Returns the entry set of this trie's content as an iterable, ordered according to the given direction. + default Iterable> entrySet(Direction direction) + { + return () -> entryIterator(direction); + } + + /// Returns the ordered entry set of this trie's content in an iterator. + default Iterator> entryIterator() + { + return entryIterator(Direction.FORWARD); + } + + /// Returns the entry set of this trie's content in an iterator, ordered according to the given direction. + default Iterator> entryIterator(Direction direction) + { + return new TrieEntriesIterator.AsEntries<>(cursor(direction)); + } + + /// Returns the ordered entry set of this trie's content in an iterable, filtered by the given type. 
+ default Iterable> filteredEntrySet(Class clazz) + { + return filteredEntrySet(Direction.FORWARD, clazz); + } + + /// Returns the ordered entry set of this trie's content in an iterable, filtered by the given type. + default Iterable> filteredEntrySet(Direction direction, Class clazz) + { + return () -> filteredEntryIterator(direction, clazz); + } + + /// Returns the ordered entry set of this trie's content in an iterator, filtered by the given type. + default Iterator> filteredEntryIterator(Direction direction, Class clazz) + { + return new TrieEntriesIterator.AsEntriesFilteredByType<>(cursor(direction), clazz); + } + + /// Returns the ordered set of values of this trie as an iterable. + default Iterable values() + { + return this::valueIterator; + } + + /// Returns the ordered set of values of this trie as an iterable. + default Iterable values(Direction direction) + { + return direction.isForward() ? this::valueIterator : this::reverseValueIterator; + } + + /// Returns the ordered set of values of this trie in an iterator. + default Iterator valueIterator() + { + return valueIterator(Direction.FORWARD); + } + + /// Returns the inversely ordered set of values of this trie in an iterator. + default Iterator reverseValueIterator() + { + return valueIterator(Direction.REVERSE); + } + + /// Returns the ordered set of values of this trie in an iterator. + default Iterator valueIterator(Direction direction) + { + return new TrieValuesIterator<>(cursor(direction)); + } + + /// Returns the ordered set of values of this trie in an iterable, filtered by the given type. + default Iterable filteredValues(Class clazz) + { + return filteredValues(Direction.FORWARD, clazz); + } + + /// Returns the ordered set of values of this trie in an iterable, filtered by the given type. 
+ default Iterable filteredValues(Direction direction, Class clazz) + { + return () -> filteredValuesIterator(direction, clazz); + } + + /// Returns the ordered set of values of this trie in an iterator, filtered by the given type. + default Iterator filteredValuesIterator(Direction direction, Class clazz) + { + return new TrieValuesIterator.FilteredByType<>(cursor(direction), clazz); + } + + /// Returns a view of the subtrie containing everything in this trie whose keys fall between the given boundaries, + /// inclusive of both bounds, any prefix of the bounds, as well as any descendant of the bounds (if one bound is a + /// prefix of the other, only all descendants of the longer bound). + /// + /// The view is live, i.e. any write to the source will be reflected in the subtrie. + /// + /// This method will not check its arguments for correctness. The resulting trie may throw an exception if the right + /// bound is smaller than the left. + /// + /// @param left the left bound for the returned subtrie. If `null`, the resulting subtrie is not left-bounded. + /// @param right the right bound for the returned subtrie. If `null`, the resulting subtrie is not right-bounded. + /// @return a view of the subtrie containing all the keys of this trie falling between `left` and `right`, + /// including both bounds, their prefixes and branches. + default Q subtrie(ByteComparable left, ByteComparable right) + { + return intersect(TrieSet.rangeInclusiveEnd(cursor(Direction.FORWARD).byteComparableVersion(), left, right)); + } + + /// Returns a view of this trie that is an intersection of its content with the given set. Note that intersections + /// return content for all positions listed by the set, including prefixes that are not actually contained, to be + /// able to preserve branch metadata. + /// + /// The view is live, i.e. any write to the source will be reflected in the intersection. 
+ Q intersect(TrieSet set); + + /// Returns a Trie that is a view of this one, where the given prefix is prepended before the root. + Q prefixedBy(ByteComparable prefix); + + /// Returns a trie that corresponds to the branch of this trie rooted at the given prefix. + /// + /// The result will include the same values as `subtrie(prefix, prefix)`, but the keys in the + /// resulting trie will not include the prefix. In other words, + /// ```tailTrie(prefix).prefixedBy(prefix) = subtrie(prefix, prefix)``` + /// (Note: This equivalence does not hold for content on the path leading to the branch, because the tail trie + /// has no way of presenting such content.) + Q tailTrie(ByteComparable prefix); + + /// Returns an entry set containing all tail tries constructed at the points that contain content of + /// the given type. + default Iterable> tailTries(Direction direction, Class clazz) + { + return tailTries(direction, clazz::isInstance); + } + + /// Returns an entry set containing all tail tries constructed at the points that contain content passing + /// the given predicate. + Iterable> tailTries(Direction direction, Predicate predicate); +} diff --git a/src/java/org/apache/cassandra/db/tries/BufferAccessor.java b/src/java/org/apache/cassandra/db/tries/BufferAccessor.java new file mode 100644 index 000000000000..11e3a7055eb2 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/BufferAccessor.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import org.agrona.concurrent.UnsafeBuffer; + +public interface BufferAccessor +{ + /// Get the buffer to use for reading or writing to a given cell. + UnsafeBuffer getBuffer(int cell); + + /// Get the offset to use for reading or writing to the given cell in the buffer returned by [#getBuffer]. + int inBufferOffset(int cell); +} diff --git a/src/java/org/apache/cassandra/db/tries/BufferManager.java b/src/java/org/apache/cassandra/db/tries/BufferManager.java new file mode 100644 index 000000000000..72cc358eb3f8 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/BufferManager.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import com.google.common.annotations.VisibleForTesting; + +import org.apache.cassandra.io.compress.BufferType; + +/// Buffer-managing component of in-memory tries. Deals with the allocation, access and +/// recycling of trie cells. +public interface BufferManager extends MemoryManager, BufferAccessor +{ + /// Allocate a cell to use for storing data. This uses the memory allocation strategy to reuse cells if any are + /// available, or to allocate new cells. Because some node types rely on cells being filled with 0 as initial state, + /// any cell we get through the allocator must also be cleaned. + int allocateCell() throws TrieSpaceExhaustedException; + + /// Creates a copy of a given cell and marks the original for recycling. Used when a mutation needs to force-copy + /// paths to ensure earlier states are still available for concurrent readers. + int copyCell(int cell) throws TrieSpaceExhaustedException; + + /// Prepare the given cell for recycling. The cell cannot be immediately recycled, + /// because read operations as well as the ongoing mutation may still need it. + void recycleCell(int cell); + + /// Returns true if the allocation threshold has been reached. To be called by the mutating thread (ideally, just + /// after the write completes). When this returns true, the user should switch to a new trie as soon as feasible. + /// + /// The trie expects up to 10% growth above this threshold. Any growth beyond that may be done inefficiently, and + /// the trie will fail altogether when the size grows beyond 2G - 256 bytes. + boolean reachedAllocatedSizeThreshold(); + + /// Returns the amount of memory in use in buffers, excluding any cells that are marked for recycling. + @VisibleForTesting + long usedBufferSpace(); + + /// Called to clean up all buffers when the trie is known to no longer be needed. 
+ void discardBuffers(); + + BufferType bufferType(); +} diff --git a/src/java/org/apache/cassandra/db/tries/BufferManagerMultibuf.java b/src/java/org/apache/cassandra/db/tries/BufferManagerMultibuf.java new file mode 100644 index 000000000000..c8fd1f2425da --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/BufferManagerMultibuf.java @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.nio.ByteBuffer; + +import com.google.common.annotations.VisibleForTesting; + +import org.agrona.concurrent.UnsafeBuffer; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.io.compress.BufferType; +import org.apache.cassandra.io.util.FileUtils; +import org.apache.cassandra.utils.concurrent.OpOrder; + +import static org.apache.cassandra.db.tries.InMemoryReadTrie.CELL_SIZE; +import static org.apache.cassandra.db.tries.InMemoryReadTrie.getBufferIdx; + +/// Multi-buffer implementation of a buffer manager, where multiple buffers of growing size are maintained and adding a +/// new cell when the space is exhausted is accomplished by adding a new buffer with twice the size of the last one. 
+/// This has some extra complexity compared to using a single buffer, but avoids having to copy buffers of up to 1GiB +/// of data to grow. +/// +/// EXPANDABLE DATA STORAGE +/// +/// The tries will need more and more space in buffers and content lists as they grow. Instead of using ArrayList-like +/// reallocation with copying, which may be prohibitively expensive for large buffers, we use a sequence of +/// buffers/content arrays that double in size on every expansion. +/// +/// For a given address `x` the index of the buffer can be found with the following calculation: +/// ```index_of_most_significant_set_bit(x / min_size + 1)``` +/// (relying on `sum (2^i) for i in [0, n-1] == 2^n - 1`) which can be performed quickly on modern hardware. +/// +/// Finding the offset within the buffer is then +/// ```x + min - (min << buffer_index)``` +/// +/// The allocated space starts at 256 bytes for the buffer and 16 entries for the content list. +/// +/// Note that a buffer is not allowed to split 32-byte cells (code assumes same buffer can be used for all bytes +/// inside the cell). +/// +/// This class can optionally recycle cells that are no longer in use. +public class BufferManagerMultibuf implements BufferManager +{ + static final int BUF_START_SHIFT = 8; + static final int BUF_START_SIZE = 1 << BUF_START_SHIFT; + + static + { + assert BUF_START_SIZE % CELL_SIZE == 0 : "Initial buffer size must fit a full cell."; + } + + /// Trie size limit. This is not enforced, but users must check from time to time that it is not exceeded (using + /// [#reachedAllocatedSizeThreshold()]) and start switching to a new trie if it is. + /// This must be done to avoid tries growing beyond their hard 2GB size limit (due to the 32-bit pointers). + @VisibleForTesting + static final int ALLOCATED_SIZE_THRESHOLD; + + static + { + // Default threshold + 10% == 2 GB. 
This should give the owner enough time to react to the + // {@link #reachedAllocatedSizeThreshold()} signal and switch this trie out before it fills up. + int limitInMB = CassandraRelevantProperties.MEMTABLE_TRIE_SIZE_LIMIT.getInt(2048 * 10 / 11); + if (limitInMB < 1 || limitInMB > 2047) + throw new AssertionError(CassandraRelevantProperties.MEMTABLE_TRIE_SIZE_LIMIT.getKey() + + " must be within 1 and 2047"); + ALLOCATED_SIZE_THRESHOLD = 1024 * 1024 * limitInMB; + } + + private int allocatedPos = 0; + + final BufferType bufferType; // on or off heap + final MemoryAllocationStrategy cellAllocator; + + final UnsafeBuffer[] buffers; + + /// Creates a new buffer manager with the given buffer type (on- or off-heap) and expected lifetime. + /// Short-lived managers will not recycle cells as it is simpler to throw the whole thing away at the end of its + /// lifecycle, while long-lived will track freed cells and will reuse them after the given opOrder indicates that + /// all operations that may be using them have finished. 
+ public BufferManagerMultibuf(BufferType bufferType, + InMemoryBaseTrie.ExpectedLifetime lifetime, + OpOrder opOrder) + { + this.buffers = new UnsafeBuffer[31 - BUF_START_SHIFT]; // last one is 1G for a total of ~2G bytes + this.bufferType = bufferType; + + switch (lifetime) + { + case SHORT: + cellAllocator = new MemoryAllocationStrategy.NoReuseStrategy(this::allocateNewCell); + break; + case LONG: + cellAllocator = new MemoryAllocationStrategy.OpOrderReuseStrategy(this::allocateNewCell, opOrder); + break; + default: + throw new AssertionError(); + } + + } + + @Override + public UnsafeBuffer getBuffer(int pos) + { + int leadBit = getBufferIdx(pos, BUF_START_SHIFT, BUF_START_SIZE); + return buffers[leadBit]; + } + + @Override + public int inBufferOffset(int pos) + { + int leadBit = getBufferIdx(pos, BUF_START_SHIFT, BUF_START_SIZE); + return InMemoryReadTrie.inBufferOffset(pos, leadBit, BUF_START_SIZE); + } + + + /// Allocate a new cell in the data buffers. This is called by the memory allocation strategy when it runs out of + /// free cells to reuse. + private int allocateNewCell() throws TrieSpaceExhaustedException + { + // Note: If this method is modified, please run InMemoryTrieTest.testOver1GSize to verify it acts correctly + // close to the 2G limit. + int v = allocatedPos; + if (inBufferOffset(v) == 0) + { + int leadBit = getBufferIdx(v, BUF_START_SHIFT, BUF_START_SIZE); + if (leadBit + BUF_START_SHIFT == 31) + throw new TrieSpaceExhaustedException(); + + ByteBuffer newBuffer = bufferType.allocate(BUF_START_SIZE << leadBit); + buffers[leadBit] = new UnsafeBuffer(newBuffer); + // Note: Since we are not moving existing data to a new buffer, we are okay with no happens-before enforcing + // writes. Any reader that sees a pointer in the new buffer may only do so after reading the volatile write + // that attached the new path. 
+ } + + allocatedPos += CELL_SIZE; + return v; + } + + @Override + public int allocateCell() throws TrieSpaceExhaustedException + { + int cell = cellAllocator.allocate(); + getBuffer(cell).setMemory(inBufferOffset(cell), CELL_SIZE, (byte) 0); + return cell; + } + + @Override + public void recycleCell(int cell) + { + cellAllocator.recycle(cell & -CELL_SIZE); + } + + @Override + public int copyCell(int cell) throws TrieSpaceExhaustedException + { + int copy = cellAllocator.allocate(); + getBuffer(copy).putBytes(inBufferOffset(copy), getBuffer(cell), inBufferOffset(cell & -CELL_SIZE), CELL_SIZE); + recycleCell(cell); + return copy | (cell & (CELL_SIZE - 1)); + } + + @Override + public void completeMutation() + { + cellAllocator.completeMutation(); + } + + @Override + public void abortMutation() + { + cellAllocator.abortMutation(); + } + + @Override + public boolean reachedAllocatedSizeThreshold() + { + return allocatedPos >= ALLOCATED_SIZE_THRESHOLD; + } + + /// For tests only! Advance the allocation pointer (and allocate space) to the given position to test behaviour + /// close to full. If the parameter is -1, consume all the space until the next request would throw an exception. + @VisibleForTesting + int advanceAllocatedPos(int wantedPos) throws TrieSpaceExhaustedException + { + if (wantedPos == -1) + { + if (cellAllocator instanceof MemoryAllocationStrategy.OpOrderReuseStrategy) + wantedPos = (int) (0x80000000L - BUF_START_SIZE - MemoryAllocationStrategy.REUSE_BLOCK_SIZE * 32); + else + wantedPos = (int) (0x80000000L - BUF_START_SIZE - 32); + } + + while (allocatedPos < wantedPos) + allocateCell(); + + if (cellAllocator instanceof MemoryAllocationStrategy.OpOrderReuseStrategy) + { + // grab all the cells that were just prepared + for (int i = 1; i < MemoryAllocationStrategy.REUSE_BLOCK_SIZE; ++i) + allocateCell(); + } + + return allocatedPos; + } + + /// For tests only! Returns the current allocation position. 
+ @VisibleForTesting + int getAllocatedPos() + { + return allocatedPos; + } + + @Override + public long usedSizeOffHeap() + { + return (bufferType == BufferType.ON_HEAP ? 0 : usedBufferSpace()); + } + + @Override + public long usedSizeOnHeap() + { + return (bufferType == BufferType.ON_HEAP ? usedBufferSpace() : 0) + + InMemoryBaseTrie.REFERENCE_ARRAY_ON_HEAP_SIZE * getBufferIdx(allocatedPos, BUF_START_SHIFT, BUF_START_SIZE); + } + + @Override + @VisibleForTesting + public long usedBufferSpace() + { + return allocatedPos - cellAllocator.indexCountInPipeline() * CELL_SIZE; + } + + @Override + public long unusedReservedOnHeapMemory() + { + long bufferOverhead = 0; + if (bufferType == BufferType.ON_HEAP) + { + int pos = this.allocatedPos; + UnsafeBuffer buffer = getBuffer(pos); + if (buffer != null) + bufferOverhead = buffer.capacity() - inBufferOffset(pos); + bufferOverhead += cellAllocator.indexCountInPipeline() * CELL_SIZE; + } + return bufferOverhead; + } + + @Override + public void discardBuffers() + { + if (bufferType == BufferType.ON_HEAP) + return; // no cleaning needed + + for (UnsafeBuffer b : buffers) + { + if (b != null) + FileUtils.clean(b.byteBuffer()); + } + } + + @Override + public BufferType bufferType() + { + return bufferType; + } +} diff --git a/src/java/org/apache/cassandra/db/tries/CollectionMergeCursor.java b/src/java/org/apache/cassandra/db/tries/CollectionMergeCursor.java new file mode 100644 index 000000000000..a22a2bbac404 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/CollectionMergeCursor.java @@ -0,0 +1,809 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.function.BiFunction; +import java.util.function.IntFunction; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +/// A merged view of multiple trie cursors. +/// +/// This is accomplished by walking the cursors in parallel; the merged cursor takes the position and features of the +/// smallest and advances with it; when multiple cursors are equal, all of them are advanced. The ordered view of the +/// cursors is maintained using a custom binary min-heap, built for efficiently reforming the heap when the top elements +/// are advanced. +/// +/// Crucial for the efficiency of this is the fact that when they are advanced like this, we can compare cursors' +/// positions by their `depth` descending and then `incomingTransition` ascending. +/// See [Trie.md](./Trie.md) for further details. +/// +/// The merge cursor is a variation of the idea of a merge iterator with one key observation: because we advance +/// the source iterators together, we can compare them just by depth and incoming transition. +/// +/// The most straightforward way to implement merging of iterators is to use a `PriorityQueue`, +/// `poll` it to find the next item to consume, then `add` the iterator back after advancing. 
+/// This is not very efficient as `poll` and `add` in all cases require at least +/// `log(size)` comparisons and swaps (usually more than `2*log(size)`) per consumed item, even +/// if the input is suitable for fast iteration. +/// +/// The implementation below makes use of the fact that replacing the top element in a binary heap can be +/// done much more efficiently than separately removing it and placing it back, especially in the cases where +/// the top iterator is to be used again very soon (e.g. when there are large sections of the output where +/// only a limited number of input iterators overlap, which is normally the case in many practically useful +/// situations, e.g. levelled compaction). +/// +/// The implementation builds and maintains a binary heap of sources (stored in an array), where we do not +/// add items after the initial construction. Instead we advance the smallest element (which is at the top +/// of the heap) and push it down to find its place for its new position. Should this source be exhausted, +/// we swap it with the last source in the heap and proceed by pushing that down in the heap. +/// +/// In the case where we have multiple sources with matching positions, the merging algorithm +/// must be able to merge all equal values. To achieve this `content` walks the heap to +/// find all equal cursors without advancing them, and separately `advance` advances +/// all equal sources and restores the heap structure. +/// +/// The latter is done equivalently to the process of initial construction of a min-heap using back-to-front +/// heapification as done in the classic heapsort algorithm. It only needs to heapify subheaps whose top item +/// is advanced (i.e. one whose position matches the current), and we can do that recursively from +/// bottom to top. Should a source be exhausted when advancing, it can be thrown away by swapping in the last +/// source in the heap (note: we must be careful to advance that source too if required). 
+/// +/// To make it easier to advance efficiently in single-sourced branches of tries, we extract the current smallest +/// cursor (the head) separately, and start any advance by comparing that to the heap's first. When the smallest +/// cursor remains the same (e.g. in branches coming from a single source) this makes it possible to advance with +/// just one comparison instead of two at the expense of increasing the number by one in the general case. +/// +/// Note: This is a simplification of the MergeIterator code from CASSANDRA-8915, without the leading ordered +/// section and equalParent flag since comparisons of cursor positions are cheap. +abstract class CollectionMergeCursor> implements Cursor +{ + final Trie.CollectionMergeResolver resolver; + + /// The smallest cursor, tracked separately to improve performance in single-source sections of the trie. + C head; + + /// Binary heap of the remaining cursors. The smallest element is at position 0. + /// Every element `i` is smaller than or equal to its two children, i.e. + /// ```heap[i] <= heap[i*2 + 1] && heap[i] <= heap[i*2 + 2]``` + final C[] heap; + + /// A list used to collect contents during [#content()] calls. + final List contents; + /// Whether content has already been collected for this position. + boolean contentCollected; + /// The collected content. + T collectedContent; + + CollectionMergeCursor(Trie.CollectionMergeResolver resolver, Direction direction, Collection inputs, IntFunction cursorArrayConstructor, BiFunction extractor) + { + this.resolver = resolver; + int count = inputs.size(); + // Get cursors for all inputs. Put one of them in head and the rest in the heap. 
+ heap = cursorArrayConstructor.apply(count - 1); + contents = new ArrayList<>(count); + int i = -1; + for (I src : inputs) + { + C cursor = extractor.apply(src, direction); + cursor.assertFresh(); + if (i >= 0) + heap[i] = cursor; + else + head = cursor; + ++i; + } + + // The cursors are all currently positioned on the root and thus in valid heap order. + } + + /// Interface for internal operations that can be applied to selected top elements of the heap. + interface HeapOp> + { + void apply(CollectionMergeCursor self, C cursor, int index); + + default boolean shouldContinueWithChild(C child, C head) + { + return equalCursor(child, head); + } + } + + /// Interface for non-interfering operations that can be applied to the source cursors. + interface SourceOp> extends HeapOp + { + void apply(C cursor); + + default void apply(CollectionMergeCursor self, C cursor, int index) + { + apply(cursor); + } + } + + /// Apply a non-interfering operation, i.e. one that does not change the cursor state, to the head and all inputs + /// in the heap that satisfy the [SourceOp#shouldContinueWithChild] condition (by default, being equal to the head). + /// For interfering operations like advancing the cursors, use [#advanceSelectedAndRestoreHeap(AdvancingHeapOp)]. + void applyToSelectedSources(SourceOp action) + { + action.apply(head); + applyToSelectedElementsInHeap(action, 0); + } + + /// Apply a non-interfering operation, i.e. one that does not change the cursor state, to the head and all inputs + /// in the heap. + void applyToAllSources(SourceOp action) + { + action.apply(head); + for (int i = 0; i < heap.length; i++) + action.apply(heap[i]); + } + + /// Interface for internal advancing operations that can be applied to the heap cursors. This interface provides + /// the code to restore the heap structure after advancing the cursors. 
+ interface AdvancingHeapOp> extends HeapOp + { + void apply(C cursor); + + default void apply(CollectionMergeCursor self, C cursor, int index) + { + // Apply the operation, which should advance the position of the element. + apply(cursor); + + // This method is called on the back path of the recursion. At this point the heaps at both children are + // advanced and well-formed. + // Place current node in its proper position. + self.heapifyDown(cursor, index); + // The heap rooted at index is now advanced and well-formed. + } + } + + + /// Advance the state of all inputs in the heap that satisfy the [HeapOp#shouldContinueWithChild] condition + /// (by default, being equal to the head) and restore the heap invariant. + /// + /// Note that this does not apply the operation to [#head]. + void advanceSelectedAndRestoreHeap(AdvancingHeapOp action) + { + applyToSelectedElementsInHeap(action, 0); + } + + /// Apply an operation to all elements on the heap that satisfy, recursively through the heap hierarchy, the + /// `shouldContinueWithChild` condition (being equal to the head by default). Descends recursively in the + /// heap structure to all selected children and applies the operation on the way back. + /// + /// This operation can be something that does not change the cursor state (see [#content]) or an operation + /// that advances the cursor to a new state, wrapped in an [AdvancingHeapOp] ([#advance] or + /// [#skipTo]). The latter interface takes care of pushing elements down in the heap after advancing + /// and restores the subheap state on return from each level of the recursion. + private void applyToSelectedElementsInHeap(HeapOp action, int index) + { + if (index >= heap.length) + return; + C item = heap[index]; + if (!action.shouldContinueWithChild(item, head)) + return; + + // If the children are at the same position, they also need advancing and their subheap + // invariant to be restored. 
+ applyToSelectedElementsInHeap(action, index * 2 + 1); + applyToSelectedElementsInHeap(action, index * 2 + 2); + + // Apply the action. This is done on the reverse direction to give the action a chance to form proper + // subheaps and combine them on processing the parent. + action.apply(this, item, index); + } + + /// Push the given state down in the heap from the given index until it finds its proper place among + /// the subheap rooted at that position. + private void heapifyDown(C item, int index) + { + while (true) + { + int next = index * 2 + 1; + if (next >= heap.length) + break; + // Select the smaller of the two children to push down to. + if (next + 1 < heap.length && greaterCursor(heap[next], heap[next + 1])) + ++next; + // If the child is greater or equal, the invariant has been restored. + if (!greaterCursor(item, heap[next])) + break; + heap[index] = heap[next]; + index = next; + } + heap[index] = item; + } + + /// Check if the head is greater than the top element in the heap, and if so, swap them and push down the new + /// top until its proper place. + /// + /// @param headPosition the position of the head cursor (as returned by e.g. advance). 
+ /// @return the new head element's position + private long maybeSwapHead(long headPosition) + { + long heap0Position = heap[0].encodedPosition(); + if (Cursor.compare(headPosition, heap0Position) <= 0) + return headPosition; // head is still smallest + + // otherwise we need to swap head and heap[0] + C newHeap0 = head; + head = heap[0]; + heapifyDown(newHeap0, 0); + return heap0Position; + } + + boolean branchHasMultipleSources() + { + return equalCursor(heap[0], head); + } + + boolean isExhausted() + { + return Cursor.isExhausted(head.encodedPosition()); + } + + @Override + public long advance() + { + contentCollected = false; + return doAdvance(); + } + + private long doAdvance() + { + advanceSelectedAndRestoreHeap(Cursor::advance); + return maybeSwapHead(head.advance()); + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + contentCollected = false; + // If the current position is present in just one cursor, we can safely descend multiple levels within + // its branch as none of the other tries has content for it. + if (branchHasMultipleSources()) + return doAdvance(); // More than one source at current position, do single-step advance. + + // If there are no children, i.e. the cursor ascends, we have to check if it's become larger than some + // other candidate. + return maybeSwapHead(head.advanceMultiple(receiver)); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + // We need to advance all cursors that stand before the requested position. + // If a child cursor does not need to advance as it is greater than the skip position, none of the ones + // below it in the heap hierarchy do as they can't have an earlier position. + class SkipTo implements AdvancingHeapOp + { + @Override + public boolean shouldContinueWithChild(C child, C head) + { + // When the requested position descends, the implicit prefix bytes are those of the head cursor, + // and thus we need to check against that if it is a match. 
+ if (equalCursor(child, head)) + return true; + // Otherwise we can compare the child's position against a cursor advanced as requested, and need + // to skip only if it would be before it. + long childPosition = child.encodedPosition(); + return Cursor.compare(childPosition, encodedSkipPosition) < 0; + } + + @Override + public void apply(C cursor) + { + cursor.skipTo(encodedSkipPosition); + } + } + + contentCollected = false; + applyToSelectedElementsInHeap(new SkipTo(), 0); + return maybeSwapHead(head.skipTo(encodedSkipPosition)); + } + + @Override + public long encodedPosition() + { + return head.encodedPosition(); + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + return head.byteComparableVersion(); + } + + T maybeCollectContent() + { + if (!contentCollected) + { + collectedContent = isExhausted() ? null : collectContent(); + contentCollected = true; + } + return collectedContent; + } + + T collectContent() + { + applyToSelectedSources(this::collectContent); + return resolveContent(); + } + + T resolveContent() + { + T toReturn; + switch (contents.size()) + { + case 0: + toReturn = null; + break; + case 1: + toReturn = contents.get(0); + break; + default: + toReturn = resolver.resolve(contents); + break; + } + contents.clear(); + return toReturn; + } + + void collectContent(C item) + { + T itemContent = getContent(item); + if (itemContent != null) + contents.add(itemContent); + } + + abstract T getContent(C item); + + /// Compare the positions of two cursors. One is before the other when + /// - its depth is greater, or + /// - its depth is equal, and the incoming transition is smaller. 
+ static boolean greaterCursor(Cursor c1, Cursor c2) + { + return Cursor.compare(c1.encodedPosition(), c2.encodedPosition()) > 0; + } + + static boolean equalCursor(Cursor c1, Cursor c2) + { + return Cursor.compare(c1.encodedPosition(), c2.encodedPosition()) == 0; + } + + static class Plain extends CollectionMergeCursor> implements Cursor + { + public Plain(Trie.CollectionMergeResolver resolver, Direction direction, Collection inputs, BiFunction> extractor) + { + super(resolver, direction, inputs, Cursor[]::new, extractor); + } + + @Override + public T content() + { + return maybeCollectContent(); + } + + @Override + T getContent(Cursor item) + { + return item.content(); + } + + @Override + public Cursor tailCursor(Direction dir) + { + if (!branchHasMultipleSources()) + return head.tailCursor(dir); + + List> inputs = new ArrayList<>(heap.length + 1); + applyToSelectedSources(inputs::add); + + return new Plain<>(resolver, dir, inputs, Cursor::tailCursor); + } + } + + static class Range> extends CollectionMergeCursor> implements RangeCursor + { + Range(Trie.CollectionMergeResolver resolver, + Direction direction, + Collection inputs, + BiFunction> extractor) + { + super(resolver, direction, inputs, RangeCursor[]::new, extractor); + } + + @Override + public S state() + { + return maybeCollectContent(); + } + + @Override + S collectContent() + { + // Unlike the parent method, we need to collect the state of all cursors on the heap + // (state for equal cursors, and preceding state for the ones that have moved ahead). + applyToAllSources(this::collectContent); + return resolveContent(); + } + + @Override + S getContent(RangeCursor item) + { + return equalCursor(item, head) ? 
item.state() : item.precedingState(); + } + + @Override + public RangeCursor tailCursor(Direction direction) + { + List> inputs = new ArrayList<>(heap.length + 1); + applyToAllSources(cursor -> + { + if (equalCursor(head, cursor)) + inputs.add(cursor.tailCursor(direction)); + else if (cursor.precedingState() != null) + inputs.add(cursor.precedingStateCursor(direction)); + }); + + if (inputs.size() == 1) + return inputs.get(0); + + return new Range<>(resolver, direction, inputs, (x, dir) -> x); + } + } + + /// Collection merge cursor for deletion-aware tries. + /// + /// This cursor efficiently merges multiple deletion-aware tries by walking their cursors in parallel + /// while properly handling both live data and deletion metadata. It extends the basic collection merge + /// functionality with deletion-aware semantics, including proper deletion application and branch management. + /// + /// The implementation maintains a separate merge cursor for deletion branches (`relevantDeletions`) and + /// coordinates between live data and deletions to ensure correct deletion application during iteration. + static class DeletionAware> + extends CollectionMergeCursor> implements DeletionAwareCursor + { + final BiFunction deleter; + final Trie.CollectionMergeResolver deletionResolver; + + /// Store to avoid calling direction() repeatedly for deletion branch handling. + Direction direction; + + /// Critical performance optimization flag. When true, guarantees that if one merge source + /// has a deletion branch at some position, the other source cannot have deletion branches + /// below or above that position. This allows us to skip walking the data trie to look for + /// lower-level deletion branches when merging. If the flag is false, we cannot know where + /// in the covered branch we may have a deletion, thus to be sure to find all we _must_ + /// walk the whole data subtrie. This can be terribly expensive. 
+ /// + /// If we can guarantee that deletions always come at the same points in each path (e.g. at + /// partition roots), we can use this optimization. + final boolean deletionsAtFixedPoints; + + RangeCursor relevantDeletions; + int deletionBranchDepth = -1; + + enum DeletionState + { + NONE, + MATCHING, + AHEAD + } + DeletionState relevantDeletionsState = DeletionState.NONE; + List> collectedDeletionBranches; + List> sourcesWithNoDeletionBranch; + + /// Creates a deletion-aware collection merge cursor with configurable deletion optimization. + /// + /// @param liveResolver resolver for merging live data content + /// @param deletionResolver resolver for merging deletion metadata + /// @param deleter function to apply deletions to live data + /// @param deletionsAtFixedPoints optimization flag for deletion handling + /// @param direction iteration direction (forward or reverse) + /// @param inputs collection of input sources to merge + /// @param extractor function to extract deletion-aware cursors from inputs + DeletionAware(Trie.CollectionMergeResolver liveResolver, + Trie.CollectionMergeResolver deletionResolver, + BiFunction deleter, + boolean deletionsAtFixedPoints, + Direction direction, + Collection inputs, + BiFunction> extractor) + { + super(liveResolver, + direction, + inputs, + DeletionAwareCursor[]::new, + extractor); + this.direction = direction(); + + // We will add deletion sources to the above as we find them. 
+ this.deletionResolver = deletionResolver; + this.deleter = deleter; + this.deletionsAtFixedPoints = deletionsAtFixedPoints; + // Initialize deletion merger as null - we'll create it lazily when needed + relevantDeletions = null; + collectedDeletionBranches = new ArrayList<>(heap.length + 1); + if (!deletionsAtFixedPoints) + sourcesWithNoDeletionBranch = new ArrayList<>(heap.length + 1); + else + sourcesWithNoDeletionBranch = null; + + processRelevantDeletions(this.encodedPosition()); + } + + @Override + public long advance() + { + return processRelevantDeletions(super.advance()); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + return processRelevantDeletions(super.skipTo(encodedSkipPosition)); + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + return (branchHasMultipleSources() || relevantDeletionsState == DeletionState.MATCHING) + ? advance() + : processRelevantDeletions(super.advanceMultiple(receiver)); + } + + /// Adjusts the deletion state based on the relative positions of deletion and content cursors. + /// This determines how deletions should be applied to live data at the current position. + void adjustDeletionState(long deletionPosition, long contentPosition) + { + if (Cursor.isExhausted(deletionPosition)) + relevantDeletionsState = DeletionState.NONE; + else if (Cursor.compare(deletionPosition, contentPosition) > 0) + relevantDeletionsState = DeletionState.AHEAD; + else + relevantDeletionsState = DeletionState.MATCHING; + } + + /// Manages deletion branches during cursor advancement. + /// This method coordinates between live data cursors and deletion cursors to ensure + /// proper deletion application at each position. + long processRelevantDeletions(long contentPosition) + { + if (deletionBranchDepth != -1) + { + if (Cursor.depth(contentPosition) > deletionBranchDepth) + { + // We are still in the branch where the current relevantDeletions apply. + // Advance them to match the current state. 
+ switch (relevantDeletionsState) + { + case MATCHING: + { + long deletionPosition = relevantDeletions.skipTo(contentPosition); + adjustDeletionState(deletionPosition, contentPosition); + break; + } + case AHEAD: + { + long deletionPosition = relevantDeletions.skipToWhenAhead(contentPosition); + adjustDeletionState(deletionPosition, contentPosition); + break; + } + // nothing to do for NONE (where relevantDeletions is exhausted, but we still haven't left its branch) + } + return contentPosition; + } + else + { + // ascended above the common deletions root, we need to track and report deletion branches again. + deletionBranchDepth = -1; + relevantDeletions = null; + relevantDeletionsState = DeletionState.NONE; + } + } + + // If the branch is single-source, its deletions cannot affect the merge as they can't delete its own data. + // (Note that covering deletions from other sources can still affect it though.) + // Otherwise we need to get the deletions from all sources to track and apply them. + if (branchHasMultipleSources()) + { + RangeCursor deletions = deletionsAtFixedPoints ? makeRelevantDeletionsFixedPoints() + : makeRelevantDeletionsNoFixedPoints(); + if (deletions != null) + { + deletionBranchDepth = Cursor.depth(contentPosition); + relevantDeletions = DepthAdjustedCursor.make(deletions, contentPosition); + relevantDeletionsState = DeletionState.MATCHING; + } + } + + return contentPosition; + } + + private RangeCursor makeRelevantDeletionsFixedPoints() + { + applyToSelectedSources(this::addDeletionTrieBranchFixedPoints); + if (collectedDeletionBranches.isEmpty()) + return null; + + return cursorForCollectedDeletionBranches(); + } + + /// Adds a deletion trie branch for the given cursor. This version applies to the fixed point mode, where + /// deletion branches must be presented at shared positions. 
+ void addDeletionTrieBranchFixedPoints(DeletionAwareCursor cursor) + { + RangeCursor deletionsBranch = cursor.deletionBranchCursor(direction); + if (deletionsBranch != null) + collectedDeletionBranches.add(deletionsBranch); + // Otherwise there is no need to track the subtrie. If there are deletions, they must be presented here. + } + + private RangeCursor makeRelevantDeletionsNoFixedPoints() + { + applyToSelectedSources(this::addDeletionTrieBranchNoFixedPoints); + if (collectedDeletionBranches.isEmpty()) + { + sourcesWithNoDeletionBranch.clear(); + return null; + } + + for (DeletionAwareCursor cursor : sourcesWithNoDeletionBranch) + collectedDeletionBranches.add(new DeletionsTrieCursor<>(cursor.tailCursor(direction))); + sourcesWithNoDeletionBranch.clear(); + + return cursorForCollectedDeletionBranches(); + } + + /// Adds a deletion trie branch for the given cursor. This means either the deletion branch that it presents, + /// or, in the case where we accept non-aligned deletions, any deletion branch that may be present in its + /// substructure. + void addDeletionTrieBranchNoFixedPoints(DeletionAwareCursor cursor) + { + RangeCursor deletionsBranch = cursor.deletionBranchCursor(direction); + if (deletionsBranch != null) + collectedDeletionBranches.add(deletionsBranch); + else + sourcesWithNoDeletionBranch.add(cursor); + } + + private RangeCursor cursorForCollectedDeletionBranches() + { + RangeCursor toReturn; + if (collectedDeletionBranches.size() == 1) + toReturn = collectedDeletionBranches.get(0); + else + toReturn = new Range(deletionResolver, direction, collectedDeletionBranches, (c, d) -> c); + collectedDeletionBranches.clear(); + return toReturn; + } + + /// Resolves content by applying deletions to live data. + /// This is the core method that implements deletion application during iteration. 
+ @Override + T resolveContent() + { + T content = super.resolveContent(); + if (content == null) + return null; + + D deletion; + switch (relevantDeletionsState) + { + case MATCHING: + deletion = relevantDeletions.state(); + break; + case AHEAD: + deletion = relevantDeletions.precedingState(); + break; + default: + deletion = null; + } + if (deletion == null) + return content; + return deleter.apply(deletion, content); + } + + /// Returns the deletion branch cursor for the current position. + @Override + public RangeCursor deletionBranchCursor(Direction dir) + { + // If we aren't tracking relevant deletions yet, it may be because we are in a single-source branch. + // If that is so, defer to that source's deletionBranchCursor. + if (deletionBranchDepth == -1) + return branchHasMultipleSources() ? null : head.deletionBranchCursor(dir); + + // Otherwise we are already tracking deletions. We only need to report them if they are introduced at this depth. + if (deletionBranchDepth == Cursor.depth(encodedPosition())) + { + assert relevantDeletionsState == DeletionState.MATCHING; + return relevantDeletions.tailCursor(dir); + } + + return null; + } + + @Override + public T content() + { + return maybeCollectContent(); + } + + /// + /// Gets content from a specific cursor (required by CollectionMergeCursor). + /// + @Override + T getContent(DeletionAwareCursor cursor) + { + return cursor.content(); + } + + @Override + public DeletionAwareCursor tailCursor(Direction dir) + { + if (deletionBranchDepth != -1 && Cursor.depth(encodedPosition()) > deletionBranchDepth) + { + // We are already inside the coverage of a deletion branch. In this case we don't report that branch, + // but we make sure we apply its deletions to the data we report. 
+ RangeCursor deletions = null; + switch (relevantDeletionsState) + { + case NONE: + break; + case MATCHING: + deletions = relevantDeletions.tailCursor(dir); + break; + case AHEAD: + deletions = relevantDeletions.precedingStateCursor(dir); + break; + } + + if (deletions != null) + { + // Because deletions branch is already active (and no new one can be introduced now), we treat the + // sources as plain tries. + Cursor source; + + if (!branchHasMultipleSources()) + source = head.tailCursor(dir); + else + { + List> inputs = new ArrayList<>(heap.length + 1); + applyToSelectedSources(inputs::add); + + source = new Plain<>(resolver, dir, inputs, DeletionAwareCursor::tailCursor); + } + + return new RangeApplyCursor.DeletionAwareDataBranch<>(deleter, deletions, source); + } + } + + if (!branchHasMultipleSources()) + return head.tailCursor(dir); + + List> inputs = new ArrayList<>(heap.length + 1); + applyToSelectedSources(inputs::add); + + return new DeletionAware<>(resolver, deletionResolver, deleter, deletionsAtFixedPoints, dir, inputs, DeletionAwareCursor::tailCursor); + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java b/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java deleted file mode 100644 index 04f732627f4a..000000000000 --- a/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java +++ /dev/null @@ -1,432 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.db.tries; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -import com.google.common.collect.Iterables; - -import org.apache.cassandra.utils.bytecomparable.ByteComparable; - -/** - * A merged view of multiple tries. - * - * This is accomplished by walking the cursors in parallel; the merged cursor takes the position and features of the - * smallest and advances with it; when multiple cursors are equal, all of them are advanced. The ordered view of the - * cursors is maintained using a custom binary min-heap, built for efficiently reforming the heap when the top elements - * are advanced (see {@link CollectionMergeCursor}). - * - * Crucial for the efficiency of this is the fact that when they are advanced like this, we can compare cursors' - * positions by their depth descending and then incomingTransition ascending. - * - * See Trie.md for further details. - */ -class CollectionMergeTrie extends Trie -{ - private final CollectionMergeResolver resolver; // only called on more than one input - protected final Collection> inputs; - - CollectionMergeTrie(Collection> inputs, CollectionMergeResolver resolver) - { - this.resolver = resolver; - this.inputs = inputs; - } - - @Override - protected Cursor cursor(Direction direction) - { - return new CollectionMergeCursor<>(resolver, direction, inputs); - } - - /** - * Compare the positions of two cursors. One is before the other when - * - its depth is greater, or - * - its depth is equal, and the incoming transition is smaller. 
- */ - static boolean greaterCursor(Direction direction, Cursor c1, Cursor c2) - { - int c1depth = c1.depth(); - int c2depth = c2.depth(); - if (c1depth != c2depth) - return c1depth < c2depth; - return direction.lt(c2.incomingTransition(), c1.incomingTransition()); - } - - static boolean equalCursor(Cursor c1, Cursor c2) - { - return c1.depth() == c2.depth() && c1.incomingTransition() == c2.incomingTransition(); - } - - /* - * The merge cursor is a variation of the idea of a merge iterator with one key observation: because we advance - * the source iterators together, we can compare them just by depth and incoming transition. - * - * The most straightforward way to implement merging of iterators is to use a {@code PriorityQueue}, - * {@code poll} it to find the next item to consume, then {@code add} the iterator back after advancing. - * This is not very efficient as {@code poll} and {@code add} in all cases require at least - * {@code log(size)} comparisons and swaps (usually more than {@code 2*log(size)}) per consumed item, even - * if the input is suitable for fast iteration. - * - * The implementation below makes use of the fact that replacing the top element in a binary heap can be - * done much more efficiently than separately removing it and placing it back, especially in the cases where - * the top iterator is to be used again very soon (e.g. when there are large sections of the output where - * only a limited number of input iterators overlap, which is normally the case in many practically useful - * situations, e.g. levelled compaction). - * - * The implementation builds and maintains a binary heap of sources (stored in an array), where we do not - * add items after the initial construction. Instead we advance the smallest element (which is at the top - * of the heap) and push it down to find its place for its new position. Should this source be exhausted, - * we swap it with the last source in the heap and proceed by pushing that down in the heap. 
- * - * In the case where we have multiple sources with matching positions, the merging algorithm - * must be able to merge all equal values. To achieve this {@code content} walks the heap to - * find all equal cursors without advancing them, and separately {@code advance} advances - * all equal sources and restores the heap structure. - * - * The latter is done equivalently to the process of initial construction of a min-heap using back-to-front - * heapification as done in the classic heapsort algorithm. It only needs to heapify subheaps whose top item - * is advanced (i.e. one whose position matches the current), and we can do that recursively from - * bottom to top. Should a source be exhausted when advancing, it can be thrown away by swapping in the last - * source in the heap (note: we must be careful to advance that source too if required). - * - * To make it easier to advance efficienty in single-sourced branches of tries, we extract the current smallest - * cursor (the head) separately, and start any advance with comparing that to the heap's first. When the smallest - * cursor remains the same (e.g. in branches coming from a single source) this makes it possible to advance with - * just one comparison instead of two at the expense of increasing the number by one in the general case. - * - * Note: This is a simplification of the MergeIterator code from CASSANDRA-8915, without the leading ordered - * section and equalParent flag since comparisons of cursor positions are cheap. - */ - static class CollectionMergeCursor implements Cursor - { - private final CollectionMergeResolver resolver; - private final Direction direction; - - /** - * The smallest cursor, tracked separately to improve performance in single-source sections of the trie. - */ - private Cursor head; - - /** - * Binary heap of the remaining cursors. The smallest element is at position 0. - * Every element i is smaller than or equal to its two children, i.e. 
- * heap[i] <= heap[i*2 + 1] && heap[i] <= heap[i*2 + 2] - */ - private final Cursor[] heap; - - /** - * A list used to collect contents during content() calls. - */ - private final List contents; - - public CollectionMergeCursor(CollectionMergeResolver resolver, Direction direction, Collection> inputs) - { - this.resolver = resolver; - this.direction = direction; - int count = inputs.size(); - // Get cursors for all inputs. Put one of them in head and the rest in the heap. - heap = new Cursor[count - 1]; - contents = new ArrayList<>(count); - int i = -1; - for (Trie trie : inputs) - { - Cursor cursor = trie.cursor(direction); - assert cursor.depth() == 0; - if (i >= 0) - heap[i] = cursor; - else - head = cursor; - ++i; - } - // The cursors are all currently positioned on the root and thus in valid heap order. - } - - /** - * Interface for internal operations that can be applied to selected top elements of the heap. - */ - interface HeapOp - { - void apply(CollectionMergeCursor self, Cursor cursor, int index); - - default boolean shouldContinueWithChild(Cursor child, Cursor head) - { - return equalCursor(child, head); - } - } - - /** - * Apply a non-interfering operation, i.e. one that does not change the cursor state, to all inputs in the heap - * that satisfy the {@link HeapOp#shouldContinueWithChild} condition (by default, being equal to the head). - * For interfering operations like advancing the cursors, use {@link #advanceSelectedAndRestoreHeap(AdvancingHeapOp)}. - */ - private void applyToSelectedInHeap(HeapOp action) - { - applyToSelectedElementsInHeap(action, 0); - } - - /** - * Interface for internal advancing operations that can be applied to the heap cursors. This interface provides - * the code to restore the heap structure after advancing the cursors. 
- */ - interface AdvancingHeapOp extends HeapOp - { - void apply(Cursor cursor); - - default void apply(CollectionMergeCursor self, Cursor cursor, int index) - { - // Apply the operation, which should advance the position of the element. - apply(cursor); - - // This method is called on the back path of the recursion. At this point the heaps at both children are - // advanced and well-formed. - // Place current node in its proper position. - self.heapifyDown(cursor, index); - // The heap rooted at index is now advanced and well-formed. - } - } - - - /** - * Advance the state of all inputs in the heap that satisfy the {@link HeapOp#shouldContinueWithChild} condition - * (by default, being equal to the head) and restore the heap invariant. - */ - private void advanceSelectedAndRestoreHeap(AdvancingHeapOp action) - { - applyToSelectedElementsInHeap(action, 0); - } - - /** - * Apply an operation to all elements on the heap that satisfy, recursively through the heap hierarchy, the - * {@code shouldContinueWithChild} condition (being equal to the head by default). Descends recursively in the - * heap structure to all selected children and applies the operation on the way back. - *

- * This operation can be something that does not change the cursor state (see {@link #content}) or an operation - * that advances the cursor to a new state, wrapped in a {@link AdvancingHeapOp} ({@link #advance} or - * {@link #skipTo}). The latter interface takes care of pushing elements down in the heap after advancing - * and restores the subheap state on return from each level of the recursion. - */ - private void applyToSelectedElementsInHeap(HeapOp action, int index) - { - if (index >= heap.length) - return; - Cursor item = heap[index]; - if (!action.shouldContinueWithChild(item, head)) - return; - - // If the children are at the same position, they also need advancing and their subheap - // invariant to be restored. - applyToSelectedElementsInHeap(action, index * 2 + 1); - applyToSelectedElementsInHeap(action, index * 2 + 2); - - // Apply the action. This is done on the reverse direction to give the action a chance to form proper - // subheaps and combine them on processing the parent. - action.apply(this, item, index); - } - - /** - * Push the given state down in the heap from the given index until it finds its proper place among - * the subheap rooted at that position. - */ - private void heapifyDown(Cursor item, int index) - { - while (true) - { - int next = index * 2 + 1; - if (next >= heap.length) - break; - // Select the smaller of the two children to push down to. - if (next + 1 < heap.length && greaterCursor(direction, heap[next], heap[next + 1])) - ++next; - // If the child is greater or equal, the invariant has been restored. - if (!greaterCursor(direction, item, heap[next])) - break; - heap[index] = heap[next]; - index = next; - } - heap[index] = item; - } - - /** - * Check if the head is greater than the top element in the heap, and if so, swap them and push down the new - * top until its proper place. - * @param headDepth the depth of the head cursor (as returned by e.g. advance). 
- * @return the new head element's depth - */ - private int maybeSwapHead(int headDepth) - { - int heap0Depth = heap[0].depth(); - if (headDepth > heap0Depth || - (headDepth == heap0Depth && direction.le(head.incomingTransition(), heap[0].incomingTransition()))) - return headDepth; // head is still smallest - - // otherwise we need to swap heap and heap[0] - Cursor newHeap0 = head; - head = heap[0]; - heapifyDown(newHeap0, 0); - return heap0Depth; - } - - boolean branchHasMultipleSources() - { - return equalCursor(heap[0], head); - } - - @Override - public int advance() - { - advanceSelectedAndRestoreHeap(Cursor::advance); - return maybeSwapHead(head.advance()); - } - - @Override - public int advanceMultiple(TransitionsReceiver receiver) - { - // If the current position is present in just one cursor, we can safely descend multiple levels within - // its branch as no one of the other tries has content for it. - if (branchHasMultipleSources()) - return advance(); // More than one source at current position, do single-step advance. - - // If there are no children, i.e. the cursor ascends, we have to check if it's become larger than some - // other candidate. - return maybeSwapHead(head.advanceMultiple(receiver)); - } - - @Override - public int skipTo(int skipDepth, int skipTransition) - { - // We need to advance all cursors that stand before the requested position. - // If a child cursor does not need to advance as it is greater than the skip position, neither of the ones - // below it in the heap hierarchy do as they can't have an earlier position. - class SkipTo implements AdvancingHeapOp - { - @Override - public boolean shouldContinueWithChild(Cursor child, Cursor head) - { - // When the requested position descends, the inplicit prefix bytes are those of the head cursor, - // and thus we need to check against that if it is a match. 
- if (equalCursor(child, head)) - return true; - // Otherwise we can compare the child's position against a cursor advanced as requested, and need - // to skip only if it would be before it. - int childDepth = child.depth(); - return childDepth > skipDepth || - childDepth == skipDepth && direction.lt(child.incomingTransition(), skipTransition); - } - - @Override - public void apply(Cursor cursor) - { - cursor.skipTo(skipDepth, skipTransition); - } - } - - applyToSelectedElementsInHeap(new SkipTo(), 0); - return maybeSwapHead(head.skipTo(skipDepth, skipTransition)); - } - - @Override - public int depth() - { - return head.depth(); - } - - @Override - public int incomingTransition() - { - return head.incomingTransition(); - } - - @Override - public Direction direction() - { - return direction; - } - - @Override - public ByteComparable.Version byteComparableVersion() - { - return head.byteComparableVersion(); - } - - @Override - public T content() - { - if (!branchHasMultipleSources()) - return head.content(); - - applyToSelectedInHeap(CollectionMergeCursor::collectContent); - collectContent(head, -1); - - T toReturn; - switch (contents.size()) - { - case 0: - toReturn = null; - break; - case 1: - toReturn = contents.get(0); - break; - default: - toReturn = resolver.resolve(contents); - break; - } - contents.clear(); - return toReturn; - } - - private void collectContent(Cursor item, int index) - { - T itemContent = item.content(); - if (itemContent != null) - contents.add(itemContent); - } - - @Override - public Trie tailTrie() - { - if (!branchHasMultipleSources()) - return head.tailTrie(); - - List> inputs = new ArrayList<>(heap.length); - inputs.add(head.tailTrie()); - applyToSelectedInHeap((self, cursor, index) -> inputs.add(cursor.tailTrie())); - - return new CollectionMergeTrie<>(inputs, resolver); - } - } - - /** - * Special instance for sources that are guaranteed distinct. 
The main difference is that we can form unordered - * value list by concatenating sources. - */ - static class Distinct extends CollectionMergeTrie - { - Distinct(Collection> inputs) - { - super(inputs, throwingResolver()); - } - - @Override - public Iterable valuesUnordered() - { - return Iterables.concat(Iterables.transform(inputs, Trie::valuesUnordered)); - } - } -} diff --git a/src/java/org/apache/cassandra/db/tries/ContentManager.java b/src/java/org/apache/cassandra/db/tries/ContentManager.java new file mode 100644 index 000000000000..c46cffa6b865 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/ContentManager.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +/// Content manager for in-memory tries. Deals with allocation, access and recycling of trie content, mapping objects +/// to and from leaf pointers in the trie. +public interface ContentManager extends MemoryManager +{ + /// Add a new content value. + /// + /// @param value The value to add. + /// @param contentAfterBranch Whether the content should be understood to reside after the branch, i.e. it is to be + /// returned on the ascent path of the cursor walk. 
+ /// @return A content id that can be used to reference the content. This id must be interpreted as a leaf by the + /// trie code (i.e. it must either be negative or a valid pointer to a content cell). + int addContent(T value, boolean contentAfterBranch) throws TrieSpaceExhaustedException; + + /// Change the content associated with a given content id. + /// + /// @param id Encoded content id, returned from a previous call to [#addContent] or [#setContent]. + /// @param value New content value to store. + /// @return The id to use for the modified content; an attempt will be made to make this the same as `id`, but not + /// all content managers will be able to freely modify the data for a given id. + /// Implementations must ensure that if the id changes, the previous id is released. + int setContent(int id, T value) throws TrieSpaceExhaustedException; + + /// Prepare the given content id for recycling. The id cannot be immediately recycled, + /// because read operations as well as the ongoing mutation may still need it. + void releaseContent(int id); + + /// Get the content for the given content pointer. + /// + /// @param id content pointer, returned by a previous call to [#addContent] or [#setContent]. + /// @return the current content value. + T getContent(int id); + + /// Returns false if the given contentId should be presented before the children of the branch in forward direction, + /// and true if it should be presented after them. + boolean shouldPresentAfterBranch(int contentId); + + /// This is called when content is left without children and is used to remove dangling metadata + /// or markers for branches (e.g. rows) that have become empty. + /// + /// If it returns false, the content will be dropped when its branch becomes empty. + boolean shouldPreserveWithoutChildren(int contentId); + + /// Make a textual representation of the id for debugging. 
+ String dumpContentId(int id); + + /// If the content manager uses trie cells, this must return the cell corresponding to the given id. If not, it + /// should return a negative value. + int cellUsedIfAny(int id); + + /// Release all recycled content references, including the ones waiting in still incomplete recycling lists. + /// This is a test method and can cause null pointer exceptions if used on a live trie. + /// + /// If similar functionality is required for non-test purposes, a version of this should be developed that only + /// releases references on barrier-complete lists. + void releaseReferencesUnsafe(); + + /// Returns the number of values in the trie + int valuesCount(); +} diff --git a/src/java/org/apache/cassandra/db/tries/ContentManagerBytes.java b/src/java/org/apache/cassandra/db/tries/ContentManagerBytes.java new file mode 100644 index 000000000000..5fa57b944e75 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/ContentManagerBytes.java @@ -0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import static org.apache.cassandra.db.tries.ContentSerializer.OFFSET_SPECIAL; +import static org.apache.cassandra.db.tries.InMemoryReadTrie.offset; + +/// Content manager used for storing data directly in trie cells. +/// +/// Relies on a [ContentSerializer] to perform encoding and decoding of the content and refers to the trie's +/// [BufferManager] to manage the cells used for storing the data. +/// +/// It also supports "special" values, encoded as negative integers, which are to be directly mapped to objects by the +/// serializer without taking up trie cells. +/// +/// Because the trie cells are limited in size (32 bytes), the user must use its own method of handling payloads that +/// don't fit (e.g. deferring to [ContentManagerPojo] to generate negative special ids for larger objects). +class ContentManagerBytes implements ContentManager +{ + private final ContentSerializer serializer; + private final BufferManager bufferManager; + private int valuesCount = 0; + + // Leaves have negative pointers. 
If we mask the sign bit and the low five offset bits (see MASK_ID_TO_CELL), we get the pointer to the cell that stores the content. + + static final int SIGN = 0x80000000; + static final int MASK_ID_TO_CELL = 0x7FFFFFE0; + + static + { + assert offset(specialToContent(0)) == OFFSET_SPECIAL + : "OFFSET_SPECIAL must be 0x1F"; + } + + public ContentManagerBytes(ContentSerializer serializer, BufferManager bufferManager) + { + this.serializer = serializer; + this.bufferManager = bufferManager; + } + + static final int contentToSpecial(int contentId) + { + return (~contentId) >> 5; + } + + static final int specialToContent(int specialId) + { + assert specialId >= 0; + // ~ rather than - to permit an id of 0, and it also sets the offset to OFFSET_SPECIAL + return ~(specialId << 5); + } + + @Override + public T getContent(int id) + { + int offset = offset(id); + if (offset == OFFSET_SPECIAL) + return serializer.special(contentToSpecial(id)); + int cell = id & MASK_ID_TO_CELL; + return serializer.deserialize(bufferManager.getBuffer(cell), bufferManager.inBufferOffset(cell), offset); + } + + @Override + public boolean shouldPresentAfterBranch(int id) + { + int offset = offset(id); + if (offset == OFFSET_SPECIAL) + return serializer.shouldPresentSpecialAfterBranch(contentToSpecial(id)); + return serializer.shouldPresentAfterBranch(offset); + } + + @Override + public boolean shouldPreserveWithoutChildren(int id) + { + int offset = offset(id); + if (offset == OFFSET_SPECIAL) + return serializer.shouldPreserveSpecialWithoutChildren(contentToSpecial(id)); + if (serializer.shouldPreserveWithoutChildren(offset)) + return true; + int cell = id & MASK_ID_TO_CELL; + return serializer.shouldPreserveWithoutChildren(bufferManager.getBuffer(cell), bufferManager.inBufferOffset(cell), offset); + } + + @Override + public int addContent(T value, boolean contentAfterBranch) throws TrieSpaceExhaustedException + { + ++valuesCount; + int idIfSpecial = serializer.idIfSpecial(value, contentAfterBranch); + if (idIfSpecial >= 0) + return specialToContent(idIfSpecial); // special value + + int cell = 
bufferManager.allocateCell(); + int offset = serializer.serialize(value, contentAfterBranch, bufferManager.getBuffer(cell), bufferManager.inBufferOffset(cell)); + return cell | offset | SIGN; + } + + @Override + public int setContent(int id, T value) throws TrieSpaceExhaustedException + { + int offset = offset(id); + // check if we are switching from a special + if (offset == OFFSET_SPECIAL) + { + int specialId = contentToSpecial(id); + serializer.releaseSpecial(specialId); + --valuesCount; // compensate for +1 in addContent + return addContent(value, serializer.shouldPresentSpecialAfterBranch(specialId)); + } + + // Check if we need to switch to a special + boolean afterBranch = serializer.shouldPresentAfterBranch(offset); + int special = serializer.idIfSpecial(value, afterBranch); + int cell = id & MASK_ID_TO_CELL; + if (special >= 0) + { + if (serializer.releaseNeeded(offset)) + serializer.release(bufferManager.getBuffer(cell), bufferManager.inBufferOffset(cell), offset); + bufferManager.recycleCell(cell); + return specialToContent(special); + } + + int newOffset = serializer.updateInPlace(bufferManager.getBuffer(cell), bufferManager.inBufferOffset(cell), offset, value); + return cell | SIGN | newOffset; + } + + @Override + public void releaseContent(int id) + { + --valuesCount; + int offset = offset(id); + if (offset == OFFSET_SPECIAL) + { + serializer.releaseSpecial(contentToSpecial(id)); + return; + } + + int cell = id & MASK_ID_TO_CELL; + if (serializer.releaseNeeded(offset)) + serializer.release(bufferManager.getBuffer(cell), bufferManager.inBufferOffset(cell), offset); + bufferManager.recycleCell(cell); + } + + @Override + public void completeMutation() + { + serializer.completeMutation(); + } + + @Override + public void abortMutation() + { + serializer.abortMutation(); + } + + @Override + public String dumpContentId(int id) + { + int offset = offset(id); + if (offset == OFFSET_SPECIAL) + return serializer.dumpSpecial(contentToSpecial(id)); + + int cell = 
id & MASK_ID_TO_CELL; + return serializer.dumpContent(bufferManager.getBuffer(cell), bufferManager.inBufferOffset(cell), offset); + } + + @Override + public int cellUsedIfAny(int id) + { + return offset(id) == OFFSET_SPECIAL ? -1 : (id & MASK_ID_TO_CELL); + } + + @Override + public long usedSizeOnHeap() + { + // serializer may store large blobs outside our buffers + return serializer.usedSizeOnHeap(); + } + + @Override + public long usedSizeOffHeap() + { + // serializer may store large blobs outside our buffers + return serializer.usedSizeOffHeap(); + } + + @Override + public long unusedReservedOnHeapMemory() + { + return serializer.unusedReservedOnHeapMemory(); + } + + @Override + public void releaseReferencesUnsafe() + { + serializer.releaseReferencesUnsafe(); + } + + @Override + public int valuesCount() + { + return valuesCount; + } +} diff --git a/src/java/org/apache/cassandra/db/tries/ContentManagerPojo.java b/src/java/org/apache/cassandra/db/tries/ContentManagerPojo.java new file mode 100644 index 000000000000..bfedcc75de3a --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/ContentManagerPojo.java @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.util.concurrent.atomic.AtomicReferenceArray; +import java.util.function.Predicate; + +import com.google.common.annotations.VisibleForTesting; + +import org.apache.cassandra.utils.concurrent.OpOrder; + +import static org.apache.cassandra.db.tries.InMemoryBaseTrie.REFERENCE_ARRAY_ON_HEAP_SIZE; +import static org.apache.cassandra.db.tries.InMemoryReadTrie.getBufferIdx; +import static org.apache.cassandra.db.tries.InMemoryReadTrie.inBufferOffset; +import static org.github.jamm.MemoryMeterStrategy.MEMORY_LAYOUT; + +/// Content manager storing data in lists of Java objects. Encoded objects are put in the maintained lists and are +/// mapped to negative content ids that encode the position in the list. This avoids taking up data cells for content, +/// but has to maintain the list of references. +/// +/// Like [BufferManagerMultibuf], we use multiple lists that grow in size and can optionally recycle indexes. +public class ContentManagerPojo implements ContentManager +{ + static final int CONTENT_FLAGS_SHIFT = 30; + static final int CONTENT_INDEX_MASK = (1 << CONTENT_FLAGS_SHIFT) - 1; + + static final int CONTENT_AFTER_BRANCH = 1 << CONTENT_FLAGS_SHIFT; + + static final int CONTENTS_START_SHIFT = 4; + static final int CONTENTS_START_SIZE = 1 << CONTENTS_START_SHIFT; + + private int reservedCount = 0; + private int valuesCount = 0; + final AtomicReferenceArray[] contentArrays; + final Predicate shouldPreserveWithoutChildren; + final MemoryAllocationStrategy objectAllocator; + + /// Creates a new content manager with the given expected lifetime. + /// Short-lived managers will not recycle content indexes as it is simpler to throw the whole thing away at the end of its + /// lifecycle, while long-lived will track freed indexes and will reuse them after the given opOrder indicates that + /// all operations that may be using them have finished. 
+ /// + /// @param shouldPreserveWithoutChildren Predicate used to check whether a given object should be preserved when + /// its branch becomes empty; if null, all content is preserved. See [ContentManager#shouldPreserveWithoutChildren]. + public ContentManagerPojo(Predicate shouldPreserveWithoutChildren, + InMemoryBaseTrie.ExpectedLifetime lifetime, + OpOrder opOrder) + { + this.contentArrays = new AtomicReferenceArray[29 - CONTENTS_START_SHIFT]; + this.shouldPreserveWithoutChildren = shouldPreserveWithoutChildren; + switch (lifetime) + { + case SHORT: + objectAllocator = new MemoryAllocationStrategy.NoReuseStrategy(this::allocateNewObject); + break; + case LONG: + objectAllocator = new MemoryAllocationStrategy.OpOrderReuseStrategy(this::allocateNewObject, opOrder); + break; + default: + throw new AssertionError(); + } + } + + @Override + public T getContent(int id) + { + int leadBit = getBufferIdx(id & CONTENT_INDEX_MASK, CONTENTS_START_SHIFT, CONTENTS_START_SIZE); + int ofs = inBufferOffset(id & CONTENT_INDEX_MASK, leadBit, CONTENTS_START_SIZE); + AtomicReferenceArray array = contentArrays[leadBit]; + return array.get(ofs); + } + + @Override + public boolean shouldPresentAfterBranch(int contentId) + { + return (contentId & CONTENT_AFTER_BRANCH) != 0; + } + + @Override + public boolean shouldPreserveWithoutChildren(int contentId) + { + if (shouldPreserveWithoutChildren == null) + return true; + + return shouldPreserveWithoutChildren.test(getContent(contentId)); + } + + @Override + public String dumpContentId(int id) + { + return "~" + (id & CONTENT_INDEX_MASK) + ((id & CONTENT_AFTER_BRANCH) != 0 ? "↑" : ""); + } + + @Override + public int cellUsedIfAny(int id) + { + return -1; + } + + /// Allocate a new position in the object array. Used by the memory allocation strategy to allocate a content spot + /// when it runs out of recycled positions. 
+ private int allocateNewObject() + { + int index = reservedCount++; + int leadBit = getBufferIdx(index, CONTENTS_START_SHIFT, CONTENTS_START_SIZE); + AtomicReferenceArray array = contentArrays[leadBit]; + if (array == null) + { + assert inBufferOffset(index, leadBit, CONTENTS_START_SIZE) == 0 : "Error in content arrays configuration."; + contentArrays[leadBit] = new AtomicReferenceArray<>(CONTENTS_START_SIZE << leadBit); + } + return index; + } + + + @Override + public int addContent(T value, boolean contentAfterBranch) throws TrieSpaceExhaustedException + { + ++valuesCount; + int index = objectAllocator.allocate(); + int leadBit = getBufferIdx(index, CONTENTS_START_SHIFT, CONTENTS_START_SIZE); + int ofs = inBufferOffset(index, leadBit, CONTENTS_START_SIZE); + AtomicReferenceArray array = contentArrays[leadBit]; + // no need for a volatile set here; at this point the item is not referenced + // by any node in the trie, and a volatile set will be made to reference it. + array.setPlain(ofs, value); + return formContentId(index, contentAfterBranch); + } + + private int formContentId(int index, boolean contentAfterBranch) + { + return index | (1 << 31) | (contentAfterBranch ? 
CONTENT_AFTER_BRANCH : 0); + } + + @Override + public int setContent(int id, T value) throws TrieSpaceExhaustedException // descendants may throw + { + int leadBit = getBufferIdx(id & CONTENT_INDEX_MASK, CONTENTS_START_SHIFT, CONTENTS_START_SIZE); + int ofs = inBufferOffset(id & CONTENT_INDEX_MASK, leadBit, CONTENTS_START_SIZE); + AtomicReferenceArray array = contentArrays[leadBit]; + array.set(ofs, value); + return id; + } + + @Override + public void releaseContent(int id) + { + --valuesCount; + objectAllocator.recycle(id & CONTENT_INDEX_MASK); + } + + @Override + public void completeMutation() + { + objectAllocator.completeMutation(); + } + + @Override + public void abortMutation() + { + objectAllocator.abortMutation(); + } + + @Override + public long usedSizeOffHeap() + { + return 0; + } + + @Override + public long usedSizeOnHeap() + { + return usedObjectSpace() + + REFERENCE_ARRAY_ON_HEAP_SIZE * getBufferIdx(reservedCount, CONTENTS_START_SHIFT, CONTENTS_START_SIZE); + } + + @VisibleForTesting + long usedObjectSpace() + { + return valuesCount() * MEMORY_LAYOUT.getReferenceSize(); + } + + @Override + @VisibleForTesting + public long unusedReservedOnHeapMemory() + { + long bufferOverhead = 0; + + int index = reservedCount; + int leadBit = getBufferIdx(index, CONTENTS_START_SHIFT, CONTENTS_START_SIZE); + int ofs = inBufferOffset(index, leadBit, CONTENTS_START_SIZE); + AtomicReferenceArray contentArray = contentArrays[leadBit]; + long contentOverhead = ((contentArray != null ? 
contentArray.length() : 0) - ofs); + contentOverhead += reservedCount - valuesCount; + contentOverhead *= MEMORY_LAYOUT.getReferenceSize(); + + return bufferOverhead + contentOverhead; + } + + @Override + @VisibleForTesting + public void releaseReferencesUnsafe() + { + try + { + for (int idx : objectAllocator.indexesInPipeline()) + setContent(formContentId(idx, false), null); + } + catch (TrieSpaceExhaustedException e) + { + throw new RuntimeException(e); + } + } + + @Override + public int valuesCount() + { + return valuesCount; + } +} diff --git a/src/java/org/apache/cassandra/db/tries/ContentMappingCursor.java b/src/java/org/apache/cassandra/db/tries/ContentMappingCursor.java new file mode 100644 index 000000000000..327d25c5d41b --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/ContentMappingCursor.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.function.Function; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +/// Cursor that sends all values through a specified content mapper. 
+abstract class ContentMappingCursor, V> implements Cursor +{ + /// Function applied to every non-null content of the source. + final Function mapper; + /// The wrapped cursor; all positioning calls are delegated to it. + final C source; + + ContentMappingCursor(Function mapper, C source) + { + this.mapper = mapper; + this.source = source; + } + + @Override + public long encodedPosition() + { + return source.encodedPosition(); + } + + @Override + public long advance() + { + return source.advance(); + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + return source.advanceMultiple(receiver); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + return source.skipTo(encodedSkipPosition); + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + return source.byteComparableVersion(); + } + + /// Returns the source content passed through the mapper, or null if the source has no content at this position. + @Override + public V content() + { + T content = source.content(); + if (content != null) + return mapper.apply(content); + else + return null; + } + + /// Mapping cursor over a plain [Cursor] source. Tail cursors are wrapped with the same mapper. + static class Plain extends ContentMappingCursor, V> implements Cursor + { + Plain(Function mapper, Cursor source) + { + super(mapper, source); + } + + @Override + public Cursor tailCursor(Direction direction) + { + return new Plain<>(mapper, source.tailCursor(direction)); + } + } + + /// Mapping cursor over a [RangeCursor] source. In addition to the content, `state()` and `precedingState()` + /// are passed through the mapper, with null states returned as null. + static class Range, V extends RangeState> extends ContentMappingCursor, V> implements RangeCursor + { + Range(Function mapper, RangeCursor source) + { + super(mapper, source); + } + + @Override + public V state() + { + S state = source.state(); + if (state == null) + return null; + return mapper.apply(state); + } + + @Override + public V precedingState() + { + S state = source.precedingState(); + if (state == null) + return null; + return mapper.apply(state); + } + + @Override + public RangeCursor tailCursor(Direction direction) + { + return new Range<>(mapper, source.tailCursor(direction)); + } + } + + /// Mapping cursor over a [DeletionAwareCursor] source that maps content with `mapper` and wraps non-null + /// deletion branch cursors in a [Range] that uses `deletionMapper`. + static class DeletionAware, V, E extends RangeState> + extends ContentMappingCursor, V> implements DeletionAwareCursor + { + final Function deletionMapper; + DeletionAware(Function mapper, Function deletionMapper, 
DeletionAwareCursor source) + { + super(mapper, source); + this.deletionMapper = deletionMapper; + } + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + RangeCursor branch = source.deletionBranchCursor(direction); + if (branch == null) + return null; + return new Range<>(deletionMapper, branch); + } + + @Override + public DeletionAwareCursor tailCursor(Direction direction) + { + return new DeletionAware<>(mapper, deletionMapper, source.tailCursor(direction)); + } + } + + + /// Mapping cursor over a [DeletionAwareCursor] source that maps content only. Deletion branch cursors are + /// returned from the source unchanged. + static class DeletionAwareDataOnly, V> + extends ContentMappingCursor, V> implements DeletionAwareCursor + { + DeletionAwareDataOnly(Function mapper, DeletionAwareCursor source) + { + super(mapper, source); + } + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + return source.deletionBranchCursor(direction); + } + + @Override + public DeletionAwareCursor tailCursor(Direction direction) + { + return new DeletionAwareDataOnly<>(mapper, source.tailCursor(direction)); + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/ContentSerializer.java b/src/java/org/apache/cassandra/db/tries/ContentSerializer.java new file mode 100644 index 000000000000..d9d15ae61ed5 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/ContentSerializer.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import com.google.common.annotations.VisibleForTesting; + +import org.agrona.concurrent.UnsafeBuffer; + +/// Object serialization used by [ContentManagerBytes]. Defines how the objects are stored in trie cells, when they +/// are mapped to special ids, and the special id mapping itself. +public interface ContentSerializer extends MemoryManager +{ + int OFFSET_SPECIAL = InMemoryTrie.offset(~0); + + /// Returns a negative special id if the given content should be stored as a special value, 0 or larger if the value + /// should be serialized. + int idIfSpecial(T content, boolean shouldPresentAfterBranch); + + /// Store the given content in the 32-byte cell at the given `inBufferPos` in `buffer`. + /// Returns the offsetBits to add to the id of the leaf. + int serialize(T content, boolean shouldPresentAfterBranch, UnsafeBuffer buffer, int inBufferPos) throws TrieSpaceExhaustedException; + + /// Returns the value associated with the given special id. + T special(int id); + + /// Load the content from the 32-byte cell at the given `inBufferPos` in `buffer`. + T deserialize(UnsafeBuffer buffer, int inBufferPos, int offsetBits); + + /// Update the value at the given `inBufferPos` in `buffer` if possible. + /// If the call successfully updates the value, it must return true. Otherwise, the value is stored in a different + /// cell/id and this one is released. NOTE(review): the method is declared to return `int`, so the "return true" + /// wording looks out of date; confirm what the returned value encodes in each of the two cases. 
+ int updateInPlace(UnsafeBuffer buffer, int inBufferPos, int offsetBits, T newContent) throws TrieSpaceExhaustedException; + + /// Prepare the given special id for recycling. + void releaseSpecial(int id); + + /// Should return true if this serializer needs to recycle external data for serialized content, + /// i.e. if [#release] must be called in addition to recycling the cell when the associated content is no longer in + /// use. + boolean releaseNeeded(int offsetBits); + + /// Prepare the external content in the given cell for recycling. Called only if [#releaseNeeded] returns true. + void release(UnsafeBuffer buffer, int inBufferPos, int offsetBits); + + /// See [ContentManager#shouldPreserveWithoutChildren]. + boolean shouldPreserveSpecialWithoutChildren(int id); + + /// See [ContentManager#shouldPreserveWithoutChildren]. + boolean shouldPreserveWithoutChildren(int offsetBits); + + /// See [ContentManager#shouldPreserveWithoutChildren]. + boolean shouldPreserveWithoutChildren(UnsafeBuffer buffer, int inBufferPos, int offsetBits); + + /// Whether the content with this special id should be presented before or after its branch. + boolean shouldPresentSpecialAfterBranch(int id); + + /// Whether the content in this cell should be presented before or after its branch. + /// + /// Because this is used separately from getting and working with the content, reading the cell content is not + /// acceptable from a performance point of view. This information needs to be part of the offset. + boolean shouldPresentAfterBranch(int offsetBits); + + /// Release all external references held. See [ContentManager#releaseReferencesUnsafe]. + @VisibleForTesting + void releaseReferencesUnsafe(); + + /// Make a string representation of the given id for debugging. + String dumpSpecial(int id); + + /// Make a string representation of the given cell for debugging. 
+ String dumpContent(UnsafeBuffer buffer, int inBufferPos, int offsetBits); +} diff --git a/src/java/org/apache/cassandra/db/tries/Cursor.java b/src/java/org/apache/cassandra/db/tries/Cursor.java new file mode 100644 index 000000000000..663ca7fe86f8 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/Cursor.java @@ -0,0 +1,593 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.function.Function; +import java.util.function.Predicate; + +import javax.annotation.Nullable; + +import org.agrona.DirectBuffer; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +/// A trie cursor. +/// +/// This is the internal representation of a trie, which enables efficient walks and basic operations (merge, +/// slice) on tries. +/// +/// The cursor represents the state of a walk over the nodes of a trie. It provides three main features: +/// - the current [#depth] or descend-depth in the trie; +/// - the [#incomingTransition], i.e. 
the byte that was used to reach the current point; +/// - the [#content] associated with the current node, +/// +/// and provides methods for advancing to the next position. This is enough information to extract all paths, and +/// also to easily compare cursors over different tries that are advanced together. Advancing is always done in +/// order; if one imagines the set of nodes in the trie with their associated paths, a cursor may only advance from a +/// node with a lexicographically smaller path to one with a bigger path. The [#advance] operation moves to the immediate +/// next; it is also possible to skip over some items to a specific position ahead ([#skipTo]). +/// +/// Moving to the immediate next position in the lexicographic order is accomplished by: +/// - if the current node has children, moving to its first child; +/// - otherwise, ascend the parent chain and return the next child of the closest parent that still has any. +/// +/// As long as the trie is not exhausted, advancing always takes one step down, from the current node, or from a node +/// on the parent chain. By comparing the new depth with the one before the advance, one can tell if the former was +/// the case (if `newDepth == oldDepth + 1`) and how many steps up we had to take (`oldDepth + 1 - newDepth`) if it +/// wasn't. When following a path down, the cursor will stop on all prefixes. +/// +/// When it is created the cursor is placed on the root node with `depth = 0`, `incomingTransition = 0`. +/// Since tries can have mappings for the empty path, `content()` can possibly be non-null. The cursor is exhausted when it +/// returns a position with depth -1 (the operations that advance a cursor return the encoded position, and +/// `encodedPosition()` will also report a depth of -1 if queried afterwards). It is not allowed for a cursor to start in exhausted state; once a cursor is +/// exhausted, calling any of the advance methods or `tailCursor` is an error. +/// +/// For example, the following trie: +///

+///  t
+///   r
+///    e
+///     e *
+///    i
+///     e *
+///     p *
+///  w
+///   i
+///    n  *
+/// 
+/// has nodes reachable with the paths +/// `"", t, tr, tre, tree*, tri, trie*, trip*, w, wi, win*` +/// and the cursor will list them with the following `(depth, incomingTransition)` pairs: +/// `(0, -1), (1, t), (2, r), (3, e), (4, e)*, (3, i), (4, e)*, (4, p)*, (1, w), (2, i), (3, n)*` +/// +/// Because we exhaust transitions on bigger depths before we go to the next transition on the smaller ones, when +/// cursors are advanced together, i.e. when we walk the combination of two or more cursors in order, where we would +/// want to advance the one that is earlier in the comparison order until it catches up, their positions can be easily +/// compared using only the `depth` and +/// `incomingTransition`: +/// - one that is higher in depth is before one that is lower; +/// - for equal depths, the one with smaller incomingTransition is first. +/// +/// If we consider walking the trie above in parallel with this: +///
+///  t
+///   r
+///    i
+///     c
+///      k *
+///  u
+///   p *
+/// 
+/// the combined iteration will proceed as follows:
+///  (0, -1)+  (0, -1)+          cursors equal, advance both
+///  (1, t)+   (1, t)+   t       cursors equal, advance both
+///  (2, r)+   (2, r)+   tr      cursors equal, advance both
+///  (3, e)+ < (3, i)    tre     cursors not equal, advance smaller (3 = 3, e < i)
+///  (4, e)+ < (3, i)    tree*   cursors not equal, advance smaller (4 > 3)
+///  (3, i)+   (3, i)+   tri     cursors equal, advance both
+///  (4, e)  > (4, c)+   tric    cursors not equal, advance smaller (4 = 4, e > c)
+///  (4, e)  > (5, k)+   trick*  cursors not equal, advance smaller (4 < 5)
+///  (4, e)+ < (1, u)    trie*   cursors not equal, advance smaller (4 > 1)
+///  (4, p)+ < (1, u)    trip*   cursors not equal, advance smaller (4 > 1)
+///  (1, w)  > (1, u)+   u       cursors not equal, advance smaller (1 = 1, w > u)
+///  (1, w)  > (2, p)+   up*     cursors not equal, advance smaller (1 < 2)
+///  (1, w)+ < (-1, -1)  w       cursors not equal, advance smaller (1 > -1)
+///  (2, i)+ < (-1, -1)  wi      cursors not equal, advance smaller (2 > -1)
+///  (3, n)+ < (-1, -1)  win*    cursors not equal, advance smaller (3 > -1)
+///  (-1, -1)  (-1, -1)          both exhausted
+///  
+/// +/// To improve performance, `depth` and `incomingTransition` are encoded together in a single long that is set up in +/// a way that lets us compare positions (in the sense of the demonstration above) by comparing the encoded position +/// as an integer. +/// +/// Cursors are created with a direction (forward or reverse), which specifies the order in which a node's children +/// are iterated (smaller first or larger first). Note that entries returned in reverse direction are in +/// lexicographic order for the inverted alphabet, which is not the same as being presented in reverse. For example, +/// a cursor for a trie containing "ab", "abc" and "cba", will visit the nodes in order "cba", "ab", "abc", i.e. +/// prefixes will still be reported before their descendants. +/// +/// Also see [Trie.md](./Trie.md) for further documentation. +interface Cursor +{ + int DEPTH_SHIFT = 32; + int TRANSITION_SHIFT = 20; + + /// 1 for reverse direction, 0 for forward. Used to xor transition bits for incomingTransition. + /// This takes part in comparisons but this does not matter positions are always compared with same direction. + /// This _must_ be 31, the code below takes advantage of this bit being the sign bit of (int) encodedPosition. + int DIRECTION_BIT = 31; + + /// An additional transition bit used to revisit positions on the way back after iterating the branch. + /// Used for sets and ranges to correctly define the range states for branch-inclusive ranges. + long ON_RETURN_PATH_BIT = 1L << 19; + + /// Mask of the transition bits including the direction. We apply xor with this value to form a position in the + /// reverse direction. 
+ long TRANSITION_MASK = 0x8FFL << TRANSITION_SHIFT; + + long ROOT_POSITION_FORWARD = encode(0, 0, Direction.FORWARD); + long ROOT_POSITION_REVERSE = encode(0, 0, Direction.REVERSE); + long ROOT_POSITION_DEPTH = ROOT_POSITION_FORWARD & 0xFFFFFFFF00000000L; + + long EXHAUSTED_POSITION_FORWARD = encode(-1, 0, Direction.FORWARD); + long EXHAUSTED_POSITION_REVERSE = encode(-1, 0, Direction.REVERSE); + long EXHAUSTED_POSITION_DEPTH = EXHAUSTED_POSITION_FORWARD & 0xFFFFFFFF00000000L; + + long DEPTH_ADJUSTMENT_ONE = -1L << DEPTH_SHIFT; + + static int depth(long encodedPosition) + { + return ~(int) (encodedPosition >> DEPTH_SHIFT); + } + + static boolean isExhausted(long encodedPosition) + { + // Depth of -1 translates to positive encoding. + // This must also be true for other positive values that may be the result of adjusting depths. + return encodedPosition >= 0; + } + + /// Construct a "depth correction" adjustment that can be added to or subtract from positions to adjust the depth + /// by the depth of the given encoded position. + /// + /// The value is such that + /// `depth(somePosition) + depth(initialPosition) = depth(somePosition + depthCorrectionValue(initialPosition))` + /// including + /// `depth(ROOT_POSITION + depthCorrectionValue(encodedPosition)) == depth(encodedPosition)` + static long depthCorrectionValue(long encodedPosition) + { + return ((long) -depth(encodedPosition)) << DEPTH_SHIFT; + } + + static int incomingTransition(long encodedPosition) + { + int transitionInt = (int) encodedPosition; + transitionInt ^= transitionInt >> DIRECTION_BIT; // flip the transition bits if the direction bit is 1 + return (transitionInt >> TRANSITION_SHIFT) & 0xFF; + } + + static Direction direction(long encodedPosition) + { + return Direction.values()[((int) encodedPosition >>> DIRECTION_BIT) & 1]; + } + + /// Returns true if this position is on the return/ascent path. 
Positions on the ascent path are used to present + /// content or boundaries after the children of a node have been seen, e.g. to correctly order prefix content after + /// children in reverse order, or to close a range covering a branch. + static boolean isOnReturnPath(long encodedPosition) + { + return (encodedPosition & ON_RETURN_PATH_BIT) != 0; + } + + static long compare(long encoded1, long encoded2) + { + // This can support depth of 2^31 - 1 without overflowing. + return encoded1 - encoded2; + } + + static long rootPosition(Direction direction) + { + return direction.select(ROOT_POSITION_FORWARD, ROOT_POSITION_REVERSE); + } + + /// Returns the ascent path position for the root, i.e. the last point before going exhausted, as used by open-ended + /// ranges and sets. + static long rootReturnPosition(long prevPosition) + { + return ROOT_POSITION_DEPTH | ((((long) ((int) prevPosition) >> DIRECTION_BIT)) & TRANSITION_MASK) | ON_RETURN_PATH_BIT; + } + + static long exhaustedPosition(Direction direction) + { + return direction.select(EXHAUSTED_POSITION_FORWARD, EXHAUSTED_POSITION_REVERSE); + } + + static long exhaustedPosition(long prevPosition) + { + return EXHAUSTED_POSITION_DEPTH | ((((long) ((int) prevPosition) >> DIRECTION_BIT)) & TRANSITION_MASK); + } + + static boolean isRootPosition(long encodedPosition) + { + return encodedPosition == ROOT_POSITION_FORWARD || encodedPosition == ROOT_POSITION_REVERSE; + } + + static long encode(int depth, int transition, Direction direction) + { + assert depth >= -1; + assert transition <= 0xFF && transition >= 0; + // The xor below flips transition bits and also sets the direction bit to 1 for REVERSE direction. + long transitionXored = (transition << TRANSITION_SHIFT) ^ direction.select(0, TRANSITION_MASK); + return ((long) ~depth << DEPTH_SHIFT) | transitionXored; + } + + /// Returns the position that descends one byte from the given, i.e. 
+ /// `positionForDescentWithByte(encodedPosition, incomingByte) == + /// encode(depth(encodedPosition) + 1, incomingByte, direction(encodedPosition))` + static long positionForDescentWithByte(long encodedPosition, int incomingByte) + { + assert !isOnReturnPath(encodedPosition) : "Can't descend from a return path position " + toString(encodedPosition); + long depthPart = (encodedPosition + DEPTH_ADJUSTMENT_ONE) & 0xFFFFFFFF00000000L; + long transitionXored = incomingByte ^ (((int) encodedPosition) >> DIRECTION_BIT); + return depthPart | ((transitionXored << TRANSITION_SHIFT) & TRANSITION_MASK); + } + + /// Returns a position that can be used to skip over the given branch. Note that this can only work when the + /// returned encoded position is a valid `skipTo` position for the current state. + static long positionForSkippingBranch(long encodedBranchPosition) + { + return encodedBranchPosition + (1L << TRANSITION_SHIFT); + } + + /// Returns true if the given `currPosition` as returned by `advance`, `advanceMultiple` or `skipTo` is the result + /// of ascending in the trie structure (i.e. if its depth is the same or smaller). + static boolean ascended(long currPosition, long prevPosition) + { + // Descending increases the depth, and thus results in a position that is "earlier" according to our comparison + // order. Any possible ascent position will be ordered higher than the current (it is either a lower depth or + // the same depth with a higher incoming transition). + return compare(currPosition, prevPosition) > 0; + } + + static String toString(long encodedPosition) + { + return String.format("depth %d incomingTransition %02x%s %s", + depth(encodedPosition), + incomingTransition(encodedPosition), + isOnReturnPath(encodedPosition) ? "↑" : " ", + direction(encodedPosition)); + } + + /// Returns the cursor's current position encoded as a long. This combines the depth and incoming transition as well + /// as other flags that are part of the state. 
Use the static methods above to compare, manipulate, decode or encode + /// cursor positions. + long encodedPosition(); + + /// @return the content associated with the current node. This may be non-null for any presented node, including + /// the root. + @Nullable + T content(); + + /// Returns the direction in which this cursor is progressing. + default Direction direction() + { + return direction(encodedPosition()); + } + + /// Returns the byte-comparable version that this trie uses. + ByteComparable.Version byteComparableVersion(); + + /// Advance one position to the node whose associated path is next lexicographically. + /// This can be either: + /// - descending one level to the first child of the current node, + /// - ascending to the closest parent that has remaining children, and then descending one level to its next + /// child. + /// + /// It is an error to call this after the trie has already been exhausted (i.e. when `depth() == -1`); + /// for performance reasons we won't always check this. + /// + /// @return encoded position after the advance (see [#encodedPosition()]). + long advance(); + + /// Advance, descending multiple levels if the cursor can do this for the current position without extra work + /// (e.g. when positioned on a chain node in a memtable trie). If the current node does not have children this + /// is exactly the same as advance(), otherwise it may take multiple steps down (but will not necessarily, even + /// if they exist). + /// + /// Note that if any positions are skipped, their content must be null. + /// + /// This is an optional optimization; the default implementation falls back to calling advance. + /// + /// It is an error to call this after the trie has already been exhausted (i.e. when `depth() == -1`); + /// for performance reasons we won't always check this. 
+ /// + /// @param receiver object that will receive all transitions taken except the last; + /// on ascend, or if only one step down was taken, it will not receive any + /// @return encoded position after the advance (see [#encodedPosition()]). + default long advanceMultiple(TransitionsReceiver receiver) + { + return advance(); + } + + /// Advance all the way to the next node with non-null content. + /// + /// It is an error to call this after the trie has already been exhausted (i.e. when `depth() == -1`); + /// for performance reasons we won't always check this. + /// + /// @param receiver object that will receive all taken transitions + /// @return the content, null if the trie is exhausted + default T advanceToContent(ResettingTransitionsReceiver receiver) + { + long prevPosition = encodedPosition(); + while (true) + { + long currPosition = advanceMultiple(receiver); + if (isExhausted(currPosition)) + return null; + if (receiver != null) + { + if (ascended(currPosition, prevPosition)) + { + int depth = depth(currPosition); + if (depth > 0) + { + receiver.resetPathLength(depth - 1); + receiver.addPathByte(incomingTransition(currPosition)); + } + else + { + receiver.resetPathLength(0); + } + } + else + receiver.addPathByte(incomingTransition(currPosition)); + + if (isOnReturnPath(currPosition)) + receiver.onReturnPath(); + } + T content = content(); + if (content != null) + return content; + prevPosition = currPosition; + } + } + + /// Advance to the specified depth and incoming transition or the first valid position that is after the specified + /// position. The inputs must be something that could be returned by a single call to [#advance] (i.e. + /// `depth` must be <= current depth + 1, and `incomingTransition` must be higher than what the + /// current state saw at the requested depth). + /// This method must also support a transition value of 0x100, which may be used to request ascent from the current + /// position. 
+ /// + /// @return encoded position after the skip; the new position will satisfy + /// `compare(returnedSkipPosition, encodedSkipPosition) >= 0`. + long skipTo(long encodedSkipPosition); + + /// A version of [#skipTo] which checks if the requested position is ahead of the cursor's current position and only + /// advances if it is. This can only be used if the [#skipTo] instruction is issued from a position that is behind + /// this cursor's (i.e. if the [#skipTo] request is to descend, it is assumed to descend from a position _before_ + /// this cursor's and will not be acted on). + /// + /// Used for parallel walks when one of the source cursors is known to be ahead of the current position. + default long skipToWhenAhead(long encodedSkipPosition) + { + long current = encodedPosition(); + if (compare(encodedSkipPosition, current) > 0) + return skipTo(encodedSkipPosition); + else + return current; + } + + /// Descend into the cursor with the given path. + /// + /// @return True if the descent is positioned at the end of the given path, false if the trie did not have a path + /// for it. In the latter case the cursor is positioned at the first node that follows the given key in iteration + /// order. + default boolean descendAlong(ByteSource bytes) + { + int next = bytes.next(); + long position = encodedPosition(); + while (next != ByteSource.END_OF_STREAM) + { + long nextPosition = positionForDescentWithByte(position, next); + if (compare(skipTo(nextPosition), nextPosition) != 0) + return false; + next = bytes.next(); + position = nextPosition; + } + return true; + } + + /// Returns a tail cursor, i.e. a cursor whose root is the current position. Walking a tail cursor will list all + /// descendants of the current position with depth adjusted by the current depth. + /// + /// It is an error to call `tailCursor` on an exhausted cursor. + /// + /// Descendants that override this class should return their specific cursor type. 
+ Cursor tailCursor(Direction direction); + + /// Used by [#advanceMultiple] to feed the transitions taken. + interface TransitionsReceiver + { + /// Add a single byte to the path. + void addPathByte(int nextByte); + /// Add the count bytes from position pos in the given buffer. + void addPathBytes(DirectBuffer buffer, int pos, int count); + + /// Called when the current position is on the return path. Sets and ranges use these positions to return + /// end-inclusive ranges. + default void onReturnPath() + { + // nothing by default + } + } + + /// Used by [#advanceToContent] to track the transitions and backtracking taken. + interface ResettingTransitionsReceiver extends TransitionsReceiver + { + /// Delete all bytes beyond the given length. + void resetPathLength(int newLength); + } + + /// A push interface for walking over a trie. Builds upon [TransitionsReceiver] to be given the bytes of the + /// path, and adds methods called on encountering content and completion. + /// See [TrieDumper] for an example of how this can be used, and [TrieEntriesWalker] as a base class + /// for other common usages. + interface Walker extends Cursor.ResettingTransitionsReceiver + { + /// Called when content is found. + void content(T content); + + /// Called at the completion of the walk. + R complete(); + } + + /// Process the trie using the given [Walker]. + /// This method must only be called on a freshly constructed cursor. + default R process(Cursor.Walker walker) + { + assertFresh(); + T content = content(); // handle content on the root node + if (content == null) + content = advanceToContent(walker); + + while (content != null) + { + walker.content(content); + content = advanceToContent(walker); + } + return walker.complete(); + } + + /// Process the trie using the given [Walker], skipping over branches where content is found. + /// In other words, it walks the top levels of the trie until it finds a content-bearing node. 
When it does, it + /// presents this content and continues with the next sibling of that node, ignoring all substructure below it. + /// This is useful, for example, when the user uses content/metadata to mark levels of internal hierarchy and wants + /// to visit only the top-level elements. + /// + /// This is similar to [Trie#tailTries], but able to access only the content instead of the full branch. + /// + /// This method should only be called on a freshly constructed cursor. + default R processSkippingBranches(Predicate acceptancePredicate, Cursor.Walker walker) + { + assertFresh(); + T content = content(); // handle content on the root node + if (content != null && acceptancePredicate.test(content)) + { + walker.content(content); + return walker.complete(); + } + content = advanceToContent(walker); + + while (content != null) + { + if (acceptancePredicate.test(content)) + { + walker.content(content); + // skip over the branch by requesting a position that is beyond it + long current = skipTo(positionForSkippingBranch(encodedPosition())); + if (isExhausted(current)) + break; + walker.resetPathLength(depth(current) - 1); + walker.addPathByte(incomingTransition(current)); + content = content(); + if (content == null) + content = advanceToContent(walker); + } + else + content = advanceToContent(walker); + } + return walker.complete(); + } + + class Empty implements Cursor + { + private final ByteComparable.Version byteComparableVersion; + long position; + + Empty(Direction direction, ByteComparable.Version byteComparableVersion) + { + assert byteComparableVersion != null; + this.byteComparableVersion = byteComparableVersion; + position = Cursor.rootPosition(direction); + } + + public long advance() + { + return position = exhaustedPosition(position); + } + + public long skipTo(long encodedSkipPosition) + { + return advance(); + } + + public ByteComparable.Version byteComparableVersion() + { + return byteComparableVersion; + } + + @Override + public Cursor
tailCursor(Direction direction) + { + assert position == Cursor.rootPosition(direction) : "tailTrie called on exhausted cursor"; + return new Empty<>(direction, byteComparableVersion); + } + + @Override + public long encodedPosition() + { + return position; + } + + @Override + public T content() + { + return null; + } + } + + /// Dump the current branch. To be used for debugging only. + @SuppressWarnings("unused") + private String dumpBranch() + { + return dumpBranch(Direction.FORWARD); + } + + /// Dump the current branch. To be used for debugging only. + @SuppressWarnings("unused") + private String dumpBranch(Direction direction) + { + return dumpBranch(direction, Object::toString); + } + + /// Dump the current branch. To be used for debugging only. + private String dumpBranch(Direction direction, Function toStringFunction) + { + TrieDumper dumper = new TrieDumper.Plain<>(toStringFunction); + tailCursor(direction).process(dumper); + return dumper.complete(); + } + + default void assertFresh() + { + assert depth(encodedPosition()) == 0 : "The provided cursor has already been advanced."; + } +} diff --git a/src/java/org/apache/cassandra/db/tries/CursorWalkable.java b/src/java/org/apache/cassandra/db/tries/CursorWalkable.java new file mode 100644 index 000000000000..ecf3d85d7c2d --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/CursorWalkable.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.db.tries; + +/// Package-private interface for trie implementations, defining a method of extracting the internal cursor +/// representation of the trie. +/// +/// @param The specific type of cursor a descendant uses. +interface CursorWalkable +{ + /// Returns a cursor that can be used to walk over the trie. The cursor will be positioned at the root of the trie + /// and prepared to walk it in the given direction. + C cursor(Direction direction); +} diff --git a/src/java/org/apache/cassandra/db/tries/DeletionAwareCursor.java b/src/java/org/apache/cassandra/db/tries/DeletionAwareCursor.java new file mode 100644 index 000000000000..682f4cf05ac3 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/DeletionAwareCursor.java @@ -0,0 +1,515 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.function.BiFunction; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +/// Cursor interface for deletion-aware tries that provides access to both live data and deletion branches. +/// +/// This cursor extends the basic [Cursor] interface to support the dual nature of deletion-aware tries, +/// where live data and deletion information coexist in a unified structure. The cursor walks the live +/// data portion of the trie while providing access to deletion branches through the +/// [#deletionBranchCursor] method. +/// +/// The cursor behaves like a standard trie cursor for live data, supporting all standard navigation and +/// content access operations inherited from [Cursor]. It can also be used as a plain trie cursor for +/// processing or iteration methods and classes, in which case only live data will be presented. +/// +/// At any position, this cursor can provide access to deletion branches through [#deletionBranchCursor], +/// which returns a [RangeCursor] covering deletion ranges rooted at the current position. The deletion +/// information is only reachable after taking and following deletion branches. When a consumer is interested +/// in the deletion information, it can be merged into the main tree using [LiveAndDeletionsMergeCursor] or +/// presented as the full range trie via [DeletionsTrieCursor]. +/// +/// Deletion-aware cursors must maintain strict structural invariants to ensure correctness and efficiency: +/// +/// **Non-Overlapping Deletion Branches**: No deletion branch can be covered by another deletion branch. +/// When a deletion branch exists at a given node, all descendants of that node must have null deletion +/// branches. 
This prevents nested deletion scopes and simplifies merge algorithms. +/// +/// **Well-Formed Deletion Branches**: Each deletion branch must be a properly constructed range trie: +/// - It cannot start or end with an active deletion (no open-ended ranges at boundaries). +/// - Every deletion opened by an entry must be closed by the next entry. +/// - Preceding state must be correctly reported for all positions. +/// +/// **Deletion Consistency**: There cannot be live entries in the trie that are deleted by deletion +/// branches in the same trie. This ensures that the trie represents a consistent view where deletions +/// have been properly applied. +/// +/// @param The content type for live data in the trie +/// @param The deletion marker type, must extend `RangeState` +public interface DeletionAwareCursor> extends Cursor +{ + /// Returns the deletion branch rooted at the current cursor position, if any. + /// + /// This method provides access to deletion information associated with the current position in the + /// trie and nodes below it. The deletion branch is represented as a [RangeCursor] that can cover + /// ranges of keys with deletion markers. It is presented as a tail cursor for the current position, + /// i.e. it starts with depth 0 and cannot extend beyond the current position. + /// + /// When this method returns a non-null deletion branch, the source cursor is not allowed to return another deletion + /// branch in the covered branch. In other words, for any given path in the trie there must be at most one node + /// where [#deletionBranchCursor] is non-null. + /// + /// @param direction The direction for traversing the deletion branch. + /// @return A range cursor for deletions at this position, or null if no deletion branch is defined at this level. 
+ RangeCursor deletionBranchCursor(Direction direction); + + @Override + DeletionAwareCursor tailCursor(Direction direction); + + + /// Process the trie using the given [DeletionAwareWalker], providing access to both live and + /// deletion branches. + default R process(DeletionAwareWalker walker) + { + assertFresh(); + long currentPosition = encodedPosition(); + + while (true) + { + T content = content(); // handle content at the current position (the root on the first pass) + if (content != null) + walker.content(content); + RangeCursor deletionBranch = deletionBranchCursor(direction()); + if (deletionBranch != null && walker.enterDeletionsBranch()) + { + processDeletionBranch(walker, deletionBranch); + walker.exitDeletionsBranch(); + } + + long prevPosition = currentPosition; + currentPosition = advanceMultiple(walker); + if (Cursor.isExhausted(currentPosition)) + break; + if (Cursor.ascended(currentPosition, prevPosition)) + walker.resetPathLength(Cursor.depth(currentPosition) - 1); + walker.addPathByte(Cursor.incomingTransition(currentPosition)); + } + + return walker.complete(); + } + + /// Process a deletion branch using the given walker. + private static void processDeletionBranch(DeletionAwareWalker walker, Cursor cursor) + { + cursor.assertFresh(); + D content = cursor.content(); // handle content on the root node + if (content == null) + content = cursor.advanceToContent(walker); + + while (content != null) + { + walker.deletionMarker(content); + content = cursor.advanceToContent(walker); + } + } + + /// Walker interface extended to also process deletion branches. + interface DeletionAwareWalker extends Walker + { + /// Called when a deletion branch is found. Return false to skip over it, or true to descend inside
If this returns true, this walker will go through the deletion branch, call [#deletionMarker] for + /// all content of the deletion branch and exit the branch by calling [#exitDeletionsBranch] when it is + /// exhausted, after which it will start the walk over the data branch of this node. + /// + /// Note that the depth given by [#resetPathLength] in the deletion branch will be relative to the root of the + /// deletion branch. See [TrieEntriesWalker.DeletionAware] for an example of handling this. + default boolean enterDeletionsBranch() + { + // do nothing by default + return true; + } + + /// Called for every deletion marker found in the deletion branch. + void deletionMarker(D marker); + + /// Called when the deletion branch is exited. + default void exitDeletionsBranch() + { + // do nothing by default + } + } + + /// A cursor merging the live data and deletion markers of a deletion-aware trie into a combined trie. + class LiveAndDeletionsMergeCursor, Z> + extends FlexibleMergeCursor.WithMappedContent, RangeCursor, Z> + { + LiveAndDeletionsMergeCursor(BiFunction resolver, DeletionAwareCursor c1) + { + super(resolver, c1); + postAdvance(encodedPosition()); + } + + LiveAndDeletionsMergeCursor(BiFunction resolver, DeletionAwareCursor c1, RangeCursor c2) + { + super(resolver, c1, c2); + postAdvance(encodedPosition()); + } + + @Override + long postAdvance(long encodedPosition) + { + if (state == State.C1_ONLY) + { + RangeCursor deletionsBranch = c1.deletionBranchCursor(direction()); + if (deletionsBranch != null) + addCursor(deletionsBranch); + } + return encodedPosition; + } + + @Override + public LiveAndDeletionsMergeCursor tailCursor(Direction direction) + { + switch (state) + { + case C1_ONLY: + return new LiveAndDeletionsMergeCursor<>(resolver, c1.tailCursor(direction)); + case AT_C2: + return new LiveAndDeletionsMergeCursor<>(resolver, new DeletionAwareCursor.Empty<>(direction, byteComparableVersion()), c2.tailCursor(direction)); + case AT_C1: + return new 
+ LiveAndDeletionsMergeCursor<>(resolver, c1.tailCursor(direction), c2.precedingStateCursor(direction)); + case AT_BOTH: + return new LiveAndDeletionsMergeCursor<>(resolver, c1.tailCursor(direction), c2.tailCursor(direction)); + default: + throw new AssertionError(); + } + } + + /// Returns an unmerged tail trie that includes the data and deletion branches applicable to the current + /// point. Used by [TrieTailsIterator.DeletionAware]. + /// + /// @param includeCoveringDeletion If false, any covering deletion will not be included in the tail deletion + /// branch, including the internal ranges where the covering deletion applies. + public DeletionAwareTrie deletionAwareTail(boolean includeCoveringDeletion) + { + if (Cursor.isOnReturnPath(encodedPosition())) + return null; + + switch (state) + { + case C1_ONLY: + return combineTails(c1, null); + case AT_C2: + return combineTails(null, includeCoveringDeletion ? c2 : dropCoveringDeletions(c2)); + case AT_C1: + return combineTails(c1, includeCoveringDeletion ? c2.precedingStateCursor(direction()) : null); + case AT_BOTH: + return combineTails(c1, includeCoveringDeletion ? c2 : dropCoveringDeletions(c2)); + default: + throw new AssertionError(); + } + } + } + + /// Returns a wrapped version of the given cursor that drops any deletion that applies to the current point as + /// a covering deletion. Used to prepare a cursor for taking its tail when `includeCoveringDeletion` is false. + static > RangeCursor dropCoveringDeletions(RangeCursor cursor) + { + D state = cursor.state(); + if (state == null) + return cursor; + // If a covering state applies, it must be the left side of the state.
+ D preceeding = state.precedingState(cursor.direction()); + if (preceeding == null) + return cursor; + return new ContentMappingCursor.Range<>(s -> dropDeletion(s, preceeding), cursor); + } + + private static > D dropDeletion(D state, D toDrop) + { + if (state.isBoundary()) + { + boolean dropLeft = toDrop.equals(state.precedingState(Direction.FORWARD)); + boolean dropRight = toDrop.equals(state.succedingState(Direction.FORWARD)); + if (!dropLeft && !dropRight) + return state; + return state.restrict(!dropLeft, !dropRight); + } + else + return state.equals(toDrop) ? null : state; + } + + /// Returns a tail trie formed by combining the tail tries of the positions of the given live and deletion cursors, + /// correcting for any of the arguments being null. + static > + DeletionAwareTrie combineTails(DeletionAwareCursor c, RangeCursor deletionBranch) + { + if (c == null && deletionBranch == null) + return null; + + // Create a trie cursor now to make sure changes to c or deletionBranch do not affect it. 
+ DeletionAwareCursor cursor = combineTailCursors(c, deletionBranch); + + return dir -> cursor.tailCursor(dir); + } + + private static > + DeletionAwareCursor combineTailCursors(DeletionAwareCursor c, RangeCursor deletionBranch) + { + if (c != null) + { + Direction direction = c.direction(); + if (deletionBranch != null) + return new PrefixedCursor.DeletionAwareSeparately<>(ByteComparable.EMPTY, + c.tailCursor(direction), + deletionBranch.tailCursor(direction)); + else + return c.tailCursor(direction); + } + else if (deletionBranch != null) + { + // fix the position of the deletion branch + Direction direction = deletionBranch.direction(); + deletionBranch = deletionBranch.tailCursor(direction); + return new SingletonCursor.DeletionBranch<>(direction, + ByteSource.EMPTY, + deletionBranch.byteComparableVersion(), + deletionBranch::tailCursor); + } + else + return null; + } + + /// A variant of [LiveAndDeletionsMergeCursor] that can be asked to stop issuing deletion markers. + class SwitchableLiveAndDeletionsMergeCursor, Z> + extends LiveAndDeletionsMergeCursor + implements DeletionAwareTrie.DeletionsStopControl + { + boolean stopIssuingDeletions; + + SwitchableLiveAndDeletionsMergeCursor(BiFunction resolver, DeletionAwareCursor c1) + { + super(resolver, c1); + this.stopIssuingDeletions = false; + } + + SwitchableLiveAndDeletionsMergeCursor(BiFunction resolver, DeletionAwareCursor c1, boolean stopIssuingDeletions) + { + super(resolver, c1); + this.stopIssuingDeletions = stopIssuingDeletions; + } + + SwitchableLiveAndDeletionsMergeCursor(BiFunction resolver, DeletionAwareCursor c1, RangeCursor c2) + { + super(resolver, c1, c2); + this.stopIssuingDeletions = false; + } + + public void stopIssuingDeletions(ResettingTransitionsReceiver receiver) + { + stopIssuingDeletions = true; + // drop any already open deletion branch + switch (state) + { + case AT_C2: + // we need to exit the deletion branch at the next advance + c2 = RangeCursor.empty(direction(), 
byteComparableVersion()); + break; + default: + state = State.C1_ONLY; + c2 = null; + break; + } + } + + @Override + long postAdvance(long encodedPosition) + { + if (stopIssuingDeletions) + return encodedPosition; + return super.postAdvance(encodedPosition); + } + + @Override + public SwitchableLiveAndDeletionsMergeCursor tailCursor(Direction direction) + { + switch (state) + { + case C1_ONLY: + return new SwitchableLiveAndDeletionsMergeCursor<>(resolver, c1.tailCursor(direction), stopIssuingDeletions); + case AT_C2: + // If stopIssuingDeletions was just set, c2 is empty thus we return an empty cursor as expected. + return new SwitchableLiveAndDeletionsMergeCursor<>(resolver, new DeletionAwareCursor.Empty<>(direction, byteComparableVersion()), c2.tailCursor(direction)); + // we can't reach any of the other states if stopIssuingDeletions is true + case AT_C1: + return new SwitchableLiveAndDeletionsMergeCursor<>(resolver, c1.tailCursor(direction), c2.precedingStateCursor(direction)); + case AT_BOTH: + return new SwitchableLiveAndDeletionsMergeCursor<>(resolver, c1.tailCursor(direction), c2.tailCursor(direction)); + default: + throw new AssertionError(); + } + } + } + + /// A cursor presenting the deletion markers of a deletion-aware trie. + /// + /// This cursor combines all deletion branches into a single trie. Because it is not known where a deletion branch + /// can be introduced, this cursor has to walk all nodes of the live trie that are not covered by a deletion branch, + /// returning (likely a lot of) unproductive branches where a deletion is not defined. + class DeletionsTrieCursor> + extends FlexibleMergeCursor, RangeCursor, D> implements RangeCursor + { + DeletionsTrieCursor(DeletionAwareCursor c1) + { + super(c1); + postAdvance(encodedPosition()); + } + + @Override + public D state() + { + return c2 != null ? c2.state() : null; + } + + @Override + public D precedingState() + { + return c2 != null ? 
c2.precedingState() : null; + } + + @Override + public D content() + { + return c2 != null ? c2.content() : null; + } + + @Override + long postAdvance(long encodedPosition) + { + switch (state) + { + case AT_C2: + // already in deletion branch + break; + case C1_ONLY: + RangeCursor deletionsBranch = c1.deletionBranchCursor(direction()); + if (deletionsBranch != null) + { + addCursor(deletionsBranch); + // deletion branches cannot be nested; skip past the current position in the main trie as we + // don't need to further track it inside this branch + c1.skipTo(Cursor.positionForSkippingBranch(encodedPosition)); + state = State.AT_C2; + } + break; + default: + throw new AssertionError("Deletion branch extends above its introduction"); + } + return encodedPosition; + } + + @Override + public RangeCursor tailCursor(Direction direction) + { + switch (state) + { + case AT_C2: + return c2.tailCursor(direction); + case C1_ONLY: + return new DeletionsTrieCursor<>(c1.tailCursor(direction)); + default: + throw new AssertionError("Deletion branch extends above its introduction"); + } + } + } + + class Empty> + extends Cursor.Empty implements DeletionAwareCursor + { + public Empty(Direction direction, ByteComparable.Version byteComparableVersion) + { + super(direction, byteComparableVersion); + } + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + return null; + } + + @Override + public DeletionAwareCursor tailCursor(Direction direction) + { + return new DeletionAwareCursor.Empty<>(direction, byteComparableVersion()); + } + } + + class Wrapping> implements DeletionAwareCursor + { + final Cursor source; + + public Wrapping(Cursor source) + { + this.source = source; + } + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + return null; + } + + @Override + public long encodedPosition() + { + return source.encodedPosition(); + } + + @Override + public T content() + { + return source.content(); + } + + @Override + public 
ByteComparable.Version byteComparableVersion() + { + return source.byteComparableVersion(); + } + + @Override + public long advance() + { + return source.advance(); + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + return source.advanceMultiple(receiver); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + return source.skipTo(encodedSkipPosition); + } + + @Override + public DeletionAwareCursor tailCursor(Direction direction) + { + return new Wrapping<>(source.tailCursor(direction)); + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/DeletionAwareMergeSource.java b/src/java/org/apache/cassandra/db/tries/DeletionAwareMergeSource.java new file mode 100644 index 000000000000..cc22b2716bf5 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/DeletionAwareMergeSource.java @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.function.BiFunction; + +import javax.annotation.Nullable; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +/// A cursor applying deletions to a deletion-aware cursor, where the deletions can be dynamically added. 
+/// Based on [RangeApplyCursor] and used by [MergeCursor.DeletionAware] to process each source with the deletions of the +/// other. The cursor will present the content of the data trie modified by any applicable/covering range of the +/// deletion trie, and will leave the deletion branches unmodified (allowing the merger to process them). +class DeletionAwareMergeSource, E extends RangeState> implements DeletionAwareCursor +{ + final BiFunction resolver; + final DeletionAwareCursor data; + @Nullable RangeCursor deletions; + long deletionsDepthCorrection; + + boolean atDeletions; + + DeletionAwareMergeSource(BiFunction resolver, DeletionAwareCursor data) + { + this(resolver, data, null); + } + + DeletionAwareMergeSource(BiFunction resolver, DeletionAwareCursor data, RangeCursor deletions) + { + this.resolver = resolver; + this.deletions = deletions; + this.data = data; + this.deletionsDepthCorrection = 0; + data.assertFresh(); + if (deletions != null) + deletions.assertFresh(); + atDeletions = deletions != null; + } + + @Override + public long encodedPosition() + { + return data.encodedPosition(); + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + assert deletions == null || deletions.byteComparableVersion() == data.byteComparableVersion() : + "Merging cursors with different byteComparableVersions: " + + deletions.byteComparableVersion() + " vs " + data.byteComparableVersion(); + return data.byteComparableVersion(); + } + + @Override + public long advance() + { + long newDataPosition = data.advance(); + + if (deletions == null) + return newDataPosition; + else if (atDeletions) // if both cursors were at the same position, always advance the deletions' cursor to catch up. + return skipDeletionsToDataPosition(newDataPosition); + else // otherwise skip deletions to the new data position only if it advances past the deletions' current position.
+ return maybeSkipDeletions(newDataPosition); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + long newDataPosition = data.skipTo(encodedSkipPosition); + + if (deletions == null) + return newDataPosition; + else if (atDeletions) // if both cursors were at the same position, always advance the deletions' cursor to catch up. + return skipDeletionsToDataPosition(newDataPosition); + else // otherwise skip deletions to the new data position only if it advances past the deletions' current position. + return maybeSkipDeletions(newDataPosition); + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + if (deletions == null) + return data.advanceMultiple(receiver); + + // While we are on a shared position, we must descend one byte at a time to maintain the cursor ordering. + if (atDeletions) + return skipDeletionsToDataPosition(data.advance()); + else // atData only + return maybeSkipDeletions(data.advanceMultiple(receiver)); + } + + long maybeSkipDeletions(long dataPosition) + { + long deletionsPosition = deletions.encodedPosition() + deletionsDepthCorrection; + + // If data position is at or before the deletions position, we are good. + long cmp = Cursor.compare(dataPosition, deletionsPosition); + if (cmp <= 0) + return setAtDeletionsAndReturnPosition(cmp == 0, dataPosition); + + // Deletions cursor is before data cursor. + return skipDeletionsToDataPosition(dataPosition); + } + + private long skipDeletionsToDataPosition(long dataPosition) + { + // Skip deletions cursor to the data position; if that is beyond the branch's root, no need to skip, just leave it. + long deletionsSkipPosition = dataPosition - deletionsDepthCorrection; + long deletionsPositionUncorrected = !Cursor.isExhausted(deletionsSkipPosition) + ? 
deletions.skipTo(deletionsSkipPosition) + : Cursor.exhaustedPosition(deletionsSkipPosition); + if (Cursor.isExhausted(deletionsPositionUncorrected)) + return leaveDeletionsBranch(dataPosition); + else + return setAtDeletionsAndReturnPosition(deletionsPositionUncorrected == deletionsSkipPosition, + dataPosition); + } + + private long leaveDeletionsBranch(long dataPosition) + { + deletions = null; + return setAtDeletionsAndReturnPosition(false, dataPosition); + } + + private long setAtDeletionsAndReturnPosition(boolean atDeletions, long position) + { + this.atDeletions = atDeletions; + return position; + } + + @Override + public T content() + { + T content = data.content(); + if (content == null) + return null; + if (deletions == null) + return content; + + E applicableDeletions = atDeletions ? deletions.content() : null; + if (applicableDeletions == null) + { + applicableDeletions = deletions.precedingState(); + if (applicableDeletions == null) + return content; + } + + return resolver.apply(applicableDeletions, content); + } + + @Override + public DeletionAwareMergeSource tailCursor(Direction direction) + { + if (atDeletions) + return new DeletionAwareMergeSource<>(resolver, data.tailCursor(direction), deletions.tailCursor(direction)); + else if (deletions != null) + return new DeletionAwareMergeSource<>(resolver, data.tailCursor(direction), deletions.precedingStateCursor(direction)); + else + return new DeletionAwareMergeSource<>(resolver, data.tailCursor(direction)); + } + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + // Return unchanged, to be handled by MergeCursor. 
+ return data.deletionBranchCursor(direction); + } + + public void addDeletions(RangeCursor deletions) + { + assert this.deletions == null; + deletions.assertFresh(); + this.deletions = deletions; + this.deletionsDepthCorrection = Cursor.depthCorrectionValue(data.encodedPosition()); + this.atDeletions = true; + } + + public boolean hasDeletions() + { + return deletions != null; + } + + public static , E extends RangeState> + DeletionAwareMergeSource empty(Direction direction, ByteComparable.Version version) + { + return new DeletionAwareMergeSource(null, new DeletionAwareCursor.Empty<>(direction, version)); + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/db/tries/DeletionAwareTrie.java b/src/java/org/apache/cassandra/db/tries/DeletionAwareTrie.java new file mode 100644 index 000000000000..d79c0c48e430 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/DeletionAwareTrie.java @@ -0,0 +1,760 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Predicate; + +import com.google.common.collect.ImmutableList; + +import org.agrona.DirectBuffer; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +/// Deletion-aware trie interface that combines live data and deletion information in a unified structure. +/// +/// This class implements the definitions of some simple deletion-aware tries (live singletons, deleted ranges), as +/// well as the main algebraic operations: +/// - intersecting a deletion-aware trie with a set/range, returning only the live paths and content covered by the +/// set, as well as any applicable range restricted to the bounds of the set; +/// - merging deletion-aware tries, applying the deletions of each set to the live data of the others and properly +/// combining the deletion branches of all sources. +/// +/// It also provides methods of processing and iterating over the live content of the trie, as well as means +/// of obtaining the full range deletion view of the trie and a combined data-with-deletions view. +/// +/// The structure of a deletion-aware trie presents the live data in its normal paths, and deleted ranges +/// in additional "deletion branches". The objective of this split is to be able to separately and efficiently +/// query the two: on one hand, to search for the closest live entry without having to walk paths leading to +/// deleted data, and on the other, to be able to find the covering deletions affecting any position in the +/// trie. With this design, both can be achieved in time proportional to the length of the key. +/// +/// For efficiency there can only be at most one deletion branch defined for any path in the trie. I.e. 
a deletion +/// branch cannot cover another deletion branch. To additionally improve merge performance we also support a mode +/// of operation where it is known that the depth at which a deletion can be introduced is determined in advance for +/// every path for all sources (in other words, that if one source defines a deletion branch at one point, none of the +/// other sources can define a deletion branch below it); this is the mode of operation intended for use in Cassandra +/// memtables and sstables, where deletion branches are defined at the root of each partition. +/// +/// It is also expected that a deletion-aware trie does not contain any live data that is deleted by its own deletion +/// branches. If such data exists, whether it is preserved after transformations is undefined. +/// +/// See [DeletionAwareCursor] for details on cursor operations and [InMemoryDeletionAwareTrie] for the +/// concrete in-memory implementation. +/// +/// @param The content type for live data in the trie +/// @param The deletion marker type, must extend [RangeState] for range operations +public interface DeletionAwareTrie> +extends BaseTrie, DeletionAwareTrie> +{ + /// Creates a singleton deletion-aware trie containing only live data at the specified key. + /// + /// This method creates a trie with a single entry mapping the given byte-comparable key to the provided + /// content. The resulting trie contains no deletion information and behaves similarly to a regular + /// [Trie#singleton], but is compatible with deletion-aware operations. 
+ /// + /// @param b The byte-comparable key for the content + /// @param byteComparableVersion The version to use for byte-comparable serialization + /// @param v The content to associate with the key + /// @return A deletion-aware trie containing the single key-value mapping + static > + DeletionAwareTrie singleton(ByteComparable b, ByteComparable.Version byteComparableVersion, T v) + { + return dir -> new SingletonCursor.DeletionAware<>(dir, b.asComparableBytes(byteComparableVersion), byteComparableVersion, v); + } + + /// Creates a deletion-aware trie containing a single deletion range. + /// + /// This method creates a trie that represents a deletion covering the range from `prefixInDataTrie`+`left` to + /// `prefixInDataTrie`+`right`. The deletion is presented as a deletion branch at the specified prefix, allowing + /// the user to take advantage of predefined deletion-branch positions. + /// + /// This version is inclusive on the left side and exclusive on the right. + /// + /// @param prefixInDataTrie The position in the data trie where this deletion branch is rooted + /// @param left The left boundary of the deletion range, inclusive + /// @param right The right boundary of the deletion range, exclusive + /// @param byteComparableVersion The version to use for byte-comparable serialization + /// @param deletion A _covering_ range state that defines the deletion information + /// @return A deletion-aware trie containing the deletion range + static > + DeletionAwareTrie deletedRange(ByteComparable prefixInDataTrie, ByteComparable left, ByteComparable right, ByteComparable.Version byteComparableVersion, D deletion) + { + return deletedRange(prefixInDataTrie, left, true, right, false, byteComparableVersion, deletion); + } + + /// Creates a deletion-aware trie containing a single deletion range. + /// + /// This method creates a trie that represents a deletion covering the range from `prefixInDataTrie`+`left` to + /// `prefixInDataTrie`+`right`. 
The deletion is presented as a deletion branch at the specified prefix, allowing + /// the user to take advantage of predefined deletion-branch positions. + /// + /// This version lets the caller choose, via `leftInclusive` and `rightInclusive`, whether each bound is included. + /// + /// @param prefixInDataTrie The position in the data trie where this deletion branch is rooted + /// @param left The left boundary of the deletion range + /// @param leftInclusive Whether the range should include the left bound and its descendants. + /// @param right The right boundary of the deletion range + /// @param rightInclusive Whether the range should include the right bound and its descendants. + /// @param byteComparableVersion The version to use for byte-comparable serialization + /// @param deletion A _covering_ range state that defines the deletion information + /// @return A deletion-aware trie containing the deletion range + static > + DeletionAwareTrie deletedRange(ByteComparable prefixInDataTrie, + ByteComparable left, boolean leftInclusive, + ByteComparable right, boolean rightInclusive, + ByteComparable.Version byteComparableVersion, + D deletion) + { + RangeTrie rangeTrie = RangeTrie.range(left, leftInclusive, right, rightInclusive, byteComparableVersion, deletion); + return deletionBranch(prefixInDataTrie, byteComparableVersion, rangeTrie); + } + + /// Creates a deletion-aware trie from an existing range trie representing deletions. + /// + /// This method allows for more complex deletion patterns by accepting a pre-constructed [RangeTrie] + /// that may contain multiple ranges, boundaries, and complex deletion states. This is useful for + /// advanced scenarios where simple range deletions are insufficient. 
+ /// + /// @param prefixInDataTrie The position in the data trie where this deletion branch is to be rooted + /// @param byteComparableVersion The version to use for byte-comparable serialization + /// @param rangeTrie A pre-constructed range trie representing the deletion pattern + /// @return A deletion-aware trie containing the deletion branch + static > + DeletionAwareTrie deletionBranch(ByteComparable prefixInDataTrie, ByteComparable.Version byteComparableVersion, RangeTrie rangeTrie) + { + return dir -> new SingletonCursor.DeletionBranch<>(dir, + prefixInDataTrie.asComparableBytes(byteComparableVersion), + byteComparableVersion, + rangeTrie); + } + + /// Wraps a plain data trie into a deletion-aware one. The resulting trie has no deletion branches and matches + /// `plainTrie` in all positions and content. + static > + DeletionAwareTrie wrap(Trie plainTrie) + { + return dir -> new DeletionAwareCursor.Wrapping<>(plainTrie.cursor(dir)); + } + + /// @inheritDoc + /// The returned deletion branches will be restricted to the bounds of the set; i.e. any ranges extending beyond + /// boundaries of the set will be cut to the confines of the set. + @Override + default DeletionAwareTrie intersect(TrieSet set) + { + return dir -> new IntersectionCursor.DeletionAware<>(cursor(dir), set.cursor(dir)); + } + + /// Specialized merge resolver for deletion-aware trie operations. + /// + /// This interface extends the basic [Trie.MergeResolver] to handle the additional complexity of + /// deletion-aware merging, including deletion marker resolution and deletion application logic. 
+ /// + /// During merge operations, this resolver handles three types of conflicts: + /// - **Live Data Conflicts**: Resolved using inherited [#resolve] method + /// - **Deletion Marker Conflicts**: Resolved using [#resolveMarkers] method + /// - **Deletion Application**: Applied using [#applyMarker] method + /// + /// Additionally, this also provides the [#deletionsAtFixedPoints] flag, which significantly improves merge + /// performance when the user can guarantee that deletion branches are only introduced at predefined positions. + interface MergeResolver> extends Trie.MergeResolver + { + /// Resolves conflicts between deletion markers from different sources. + /// + /// It is expected that this method will return the overriding deletion marker (e.g. the one with the higher + /// timestamp), or some combination of information from the two markers. + /// + /// @param left Deletion marker from the left source (order not guaranteed) + /// @param right Deletion marker from the right source (order not guaranteed) + /// @return The resolved deletion marker, or null if deletions cancel out + D resolveMarkers(D left, D right); + + /// Applies a deletion marker to live content, potentially removing or modifying it. + /// + /// This method defines how deletions affect live data during merge operations. The + /// implementation determines whether the content should be deleted, partially modified, + /// or left unchanged based on the deletion marker's properties. + /// + /// @param marker The deletion marker to apply + /// @param content The live content that may be affected by the deletion + /// @return The content after deletion application, or null if completely deleted + T applyMarker(D marker, T content); + + /// Indicates whether deletions occur at predetermined points in the trie structure. + /// + /// This is a critical performance optimization. 
When true, guarantees that if one merge source + /// has a deletion branch at some position, the other source cannot have deletion branches + /// below or above that position. This allows us to skip walking the data trie to look for + /// lower-level deletion branches when merging. If the flag is false, we cannot know where + /// in the covered branch we may have a deletion, thus to be sure to find them all we _must_ + /// walk the whole data subtrie. This can be terribly expensive. + boolean deletionsAtFixedPoints(); + } + + /// Constructs a view of the merge of this deletion-aware trie with another, applying deletions during the merge + /// process. The view is live, i.e. any write to any of the sources will be reflected in the merged view. + /// + /// This merge applies each source's deletions to the other source's live data, and merges deletion branches + /// to form a valid deletion-aware trie. + /// + /// The resolvers will only be called if both sources contain data for a given position, with arguments presented + /// in arbitrary order. + /// + /// @param other The other deletion-aware trie to merge with. + /// @param mergeResolver Resolver for live data conflicts between the two tries. + /// @param deletionResolver Resolver for deletion marker conflicts. See [MergeResolver#resolveMarkers]. + /// @param deleter Function to apply deletion markers to live content. See [MergeResolver#applyMarker]. + /// @param deletionsAtFixedPoints True if deletion branches are at predetermined positions. See [MergeResolver#deletionsAtFixedPoints]. 
+ /// @return A live view of the merged tries with deletions applied + default DeletionAwareTrie mergeWith(DeletionAwareTrie other, + Trie.MergeResolver mergeResolver, + Trie.MergeResolver deletionResolver, + BiFunction deleter, + boolean deletionsAtFixedPoints) + { + return dir -> new MergeCursor.DeletionAware<>(mergeResolver, + deletionResolver, + deleter, + cursor(dir), + other.cursor(dir), + deletionsAtFixedPoints); + } + + /// Constructs a view of the merge of this deletion-aware trie with another, applying deletions during the merge + /// process. The view is live, i.e. any write to any of the sources will be reflected in the merged view. + /// + /// This merge applies each source's deletions to the other source's live data, and merges deletion branches + /// to form a valid deletion-aware trie. + /// + /// The resolvers will only be called if both sources contain data for a given position, with arguments presented + /// in arbitrary order. + /// + /// @param other The other deletion-aware trie to merge with + /// @param mergeResolver Unified [MergeResolver] providing the merge logic + /// @return A live view of the merged tries with deletions applied + default DeletionAwareTrie mergeWith(DeletionAwareTrie other, MergeResolver mergeResolver) + { + return mergeWith(other, mergeResolver, mergeResolver::resolveMarkers, mergeResolver::applyMarker, mergeResolver.deletionsAtFixedPoints()); + } + + /// Constructs a view of the merge of this deletion-aware trie with a deletion. This has the same effect as merging + /// the trie with a deletion-aware trie containing only a deletion branch with the given data at its root. + /// + /// This merge applies the incoming deletions to this trie's live data, and merges them into this trie's deletion + /// branch, hoisting it to the root of the trie if necessary. + /// + /// The resolvers will only be called if both sources contain data for a given position, with arguments presented + /// in arbitrary order. 
+ /// + /// @param deletionTrie Range trie specifying the deletion to apply. + /// @param deletionResolver Resolver for deletion marker conflicts. See [MergeResolver#resolveMarkers]. + /// @param deleter Function to apply deletion markers to live content. See [MergeResolver#applyMarker]. + /// @param deletionsAtFixedPoints True if deletion branches are at predetermined positions. See [MergeResolver#deletionsAtFixedPoints]. + /// @return A live view of the merged tries with deletions applied + default DeletionAwareTrie mergeWithDeletion(RangeTrie deletionTrie, + BiFunction deleter, + Trie.MergeResolver deletionResolver, + boolean deletionsAtFixedPoints) + { + // TODO: Optimize/simplify + return mergeWith(deletionBranch(ByteComparable.EMPTY, + deletionTrie.cursor(Direction.FORWARD).byteComparableVersion(), + deletionTrie), + throwingResolver(), + deletionResolver, + deleter, + deletionsAtFixedPoints); + } + + /// Constructs a view of the merge of this deletion-aware trie with another, applying deletions during the merge + /// process and a transformation over all values. The view is live, i.e. any write to any of the sources will be + /// reflected in the merged view. + /// + /// This merge applies each source's deletions to the other source's live data, and merges deletion branches + /// to form a valid deletion-aware trie. + /// + /// The resolvers will be called for all content and deletion boundaries, with the other argument being null if a + /// value applies in only one source. + /// + /// @param other The other deletion-aware trie to merge with + /// @param mergeResolver Resolver for live data conflicts between the two tries. + /// @param deletionResolver Resolver for deletion marker conflicts. See [MergeResolver#resolveMarkers]. + /// @param deleter1 Function to apply deletion markers from the other source to live content in this. + /// See [MergeResolver#applyMarker]. 
+ /// @param deleter2 Function to apply deletion markers from this source to live content in the other. + /// See [MergeResolver#applyMarker]. + /// @return A live view of the merged tries with deletions applied and data transformed + default , R, Q extends RangeState> + DeletionAwareTrie mappingMergeWith(DeletionAwareTrie other, + BiFunction mergeResolver, + BiFunction deletionResolver, + BiFunction deleter1, + BiFunction deleter2, + boolean deletionsAtFixedPoints) + { + return dir -> new MergeCursor.DeletionAwareMapping<>(mergeResolver, + deletionResolver, + deleter1, + deleter2, + cursor(dir), + other.cursor(dir), + deletionsAtFixedPoints); + } + + /// Constructs a view of the merge of this deletion-aware trie with a deletion. This has the same effect as merging + /// the trie with a deletion-aware trie containing only a deletion branch with the given data at its root. + /// + /// This merge applies the incoming deletions to this trie's live data, and merges it into this trie's deletion + /// branch, hoisting it to the root of the trie if necessary. + /// + /// The resolvers will be called for all content and deletion boundaries, with the other argument being null if a + /// value applies in only one source. + /// + /// @param deletionTrie Range trie specifying the deletion to apply. + /// @param deletionResolver Resolver for deletion marker conflicts. See [MergeResolver#resolveMarkers]. + /// @param deleter Function to apply deletion markers to live content. See [MergeResolver#applyMarker]. + /// @param deletionsAtFixedPoints True if deletion branches are at predetermined positions. See [MergeResolver#deletionsAtFixedPoints]. 
+ /// @return A live view of the merged tries with deletions applied and data transformed + default , Q extends RangeState> + DeletionAwareTrie mappingMergeWithDeletion(RangeTrie deletionTrie, + BiFunction deleter, + BiFunction deletionResolver, + boolean deletionsAtFixedPoints) + { + return mappingMergeWith(deletionBranch(ByteComparable.EMPTY, + deletionTrie.cursor(Direction.FORWARD).byteComparableVersion(), + deletionTrie), + (x, y) -> x, // y is always null + deletionResolver, + deleter, + (x, y) -> { throw new AssertionError(); }, + deletionsAtFixedPoints); + } + + + /// See [MergeResolver] + interface CollectionMergeResolver> + extends MergeResolver, Trie.CollectionMergeResolver + { + /// Resolves conflicts between deletion markers from different sources. + /// + /// It is expected that this method will return the overriding deletion marker (e.g. the one with the higher + /// timestamp), or some combination of information from the two markers. + /// + /// @param markers A collection of all the markers that apply to a position + /// @return The resolved deletion marker, or null if deletions cancel out + D resolveMarkers(Collection markers); + + @Override + default D resolveMarkers(D c1, D c2) + { + return resolveMarkers(ImmutableList.of(c1, c2)); + } + } + + /// Constructs a view of the merge of multiple deletion-aware tries, applying deletions during the merge + /// process. The view is live, i.e. any write to any of the sources will be reflected in the merged view. + /// + /// This merge applies each source's deletions to the other sources' live data, and merges deletion branches + /// to form a valid deletion-aware trie. + /// + /// The resolvers will only be called if more than one source contains data for a given position, with arguments + /// presented in arbitrary order. 
+ /// + /// @param sources Collection of deletion-aware tries to merge (must not be empty) + /// @param mergeResolver Unified [CollectionMergeResolver] providing the merge logic + /// @return A live view of the merged tries with deletions applied + /// @throws AssertionError if sources collection is empty. + static > + DeletionAwareTrie merge(Collection> sources, + CollectionMergeResolver mergeResolver) + { + return merge(sources, + mergeResolver, + mergeResolver::resolveMarkers, + mergeResolver::applyMarker, + mergeResolver.deletionsAtFixedPoints()); + } + + + /// Constructs a view of the merge of multiple deletion-aware tries, applying deletions during the merge + /// process. The view is live, i.e. any write to any of the sources will be reflected in the merged view. + /// + /// This merge applies each source's deletions to the other sources' live data, and merges deletion branches + /// to form a valid deletion-aware trie. + /// + /// The resolvers will only be called if more than one source contains data for a given position, with arguments + /// presented in arbitrary order. + /// + /// @param sources Collection of deletion-aware tries to merge (must not be empty). + /// @param mergeResolver Resolver for live data conflicts across all sources. + /// @param deletionResolver Resolver for deletion marker conflicts across all sources. See [CollectionMergeResolver#resolveMarkers]. + /// @param deleter Function to apply deletion markers to live content. See [MergeResolver#applyMarker]. + /// @param deletionsAtFixedPoints Optimization flag for predictable deletion patterns. See [MergeResolver#deletionsAtFixedPoints]. + /// @return A live view of the merged tries with deletions applied. + /// @throws AssertionError if sources collection is empty. 
+ static > + DeletionAwareTrie merge(Collection> sources, + Trie.CollectionMergeResolver mergeResolver, + Trie.CollectionMergeResolver deletionResolver, + BiFunction deleter, + boolean deletionsAtFixedPoints) + { + switch (sources.size()) + { + case 0: + throw new AssertionError("Cannot merge empty collection of tries"); + case 1: + return sources.iterator().next(); + case 2: + { + Iterator> it = sources.iterator(); + DeletionAwareTrie t1 = it.next(); + DeletionAwareTrie t2 = it.next(); + return t1.mergeWith(t2, mergeResolver, deletionResolver, deleter, deletionsAtFixedPoints); + } + default: + return dir -> new CollectionMergeCursor.DeletionAware<>(mergeResolver, + deletionResolver, + deleter, + deletionsAtFixedPoints, + dir, + sources, + DeletionAwareTrie::cursor); + } + } + + static > DeletionAwareTrie mergeDistinct(List> tries) + { + return merge(tries, throwingResolver()); + } + + @SuppressWarnings("unchecked") + static > CollectionMergeResolver throwingResolver() + { + return THROWING_RESOLVER; + } + + @SuppressWarnings("rawtypes") + CollectionMergeResolver THROWING_RESOLVER = new CollectionMergeResolver() + { + @Override + public Object resolve(Collection contents) + { + throw new AssertionError("Distinct tries expected"); + } + + @Override + public Object applyMarker(RangeState marker, Object content) + { + throw new AssertionError("Distinct tries expected"); + } + + @Override + public RangeState resolveMarkers(Collection markers) + { + throw new AssertionError("Distinct tries expected"); + } + + @Override + public boolean deletionsAtFixedPoints() + { + return true; + } + }; + + /// Deletion-aware version of a simple consumer that must implement `content` for content in live branches and + /// `deletionMarker` for boundaries in deletion branches. 
+ interface ValueConsumer extends DeletionAwareCursor.DeletionAwareWalker + { + @Override + default Void complete() + { + return null; + } + + @Override + default void resetPathLength(int newDepth) + { + // not tracking path + } + + @Override + default void addPathByte(int nextByte) + { + // not tracking path + } + + @Override + default void addPathBytes(DirectBuffer buffer, int pos, int count) + { + // not tracking path + } + } + + @Override + default String dump(Function contentToString) + { + return dump(contentToString, Object::toString); + } + + default String dump(Function contentToString, + Function rangeToString) + { + return process(Direction.FORWARD, new TrieDumper.DeletionAware<>(contentToString, rangeToString)); + } + + /// Process the trie using the given [DeletionAwareCursor.DeletionAwareWalker]. + default R process(Direction direction, DeletionAwareCursor.DeletionAwareWalker walker) + { + return cursor(direction).process(walker); + } + + + /// Returns the state that applies to the given key. This is either the precise state at the given position, or + /// the range that covers it (i.e. the `precedingState` of the next marker). + default D applicableDeletion(ByteComparable key) + { + DeletionAwareCursor dac = cursor(Direction.FORWARD); + final ByteSource bytes = key.asComparableBytes(dac.byteComparableVersion()); + long currentPosition = dac.encodedPosition(); + RangeCursor rc; + while (true) + { + rc = dac.deletionBranchCursor(Direction.FORWARD); + if (rc != null) + break; + int next = bytes.next(); + if (next == ByteSource.END_OF_STREAM) + return null; // no deletion branch found + long nextPosition = Cursor.positionForDescentWithByte(currentPosition, next); + if (Cursor.compare(dac.skipTo(nextPosition), nextPosition) != 0) + return null; + currentPosition = nextPosition; + } + + if (rc.descendAlong(bytes)) + return rc.state(); + else + return rc.precedingState(); + } + + + /// Returns a view of the live content in this trie as a regular [Trie]. 
+ default Trie contentOnlyTrie() + { + return this::cursor; + } + + /// Returns a view of all deletion ranges in this trie as a single [RangeTrie]. + /// + /// Because a deletion branch can be introduced at any depth, the returned trie will present all paths in the data + /// trie that do not introduce a deletion branch. In particular, walks over tries that do not have any deletions + /// will have to follow the entire data trie. + /// + /// If it is known that the deletion branch can only be introduced at the root, one can use [#deletionBranchAtRoot] + /// as a more efficient version of this. + default RangeTrie deletionOnlyTrie() + { + // Note: We must walk the main trie to find deletion branch roots. This can be inefficient. + return dir -> new DeletionAwareCursor.DeletionsTrieCursor<>(cursor(dir)); + } + + /// Return a trie representing the deletion branch at the root of the trie, if there is one, or an empty trie. + /// This method does not consider any deletion branches introduced below the root and is an efficient alternative to + /// [#deletionOnlyTrie], to be used when it is known that the deletion branch must be at the root of the trie. + default RangeTrie deletionBranchAtRoot() + { + return dir -> { + DeletionAwareCursor cursor = cursor(dir); + RangeCursor deletionBranch = cursor.deletionBranchCursor(dir); + return deletionBranch != null ? deletionBranch : RangeCursor.empty(dir, cursor.byteComparableVersion()); + }; + } + + /// Returns a view of the combination of the live data and deletions in this trie as a regular [Trie], using + /// the provided mapping function to convert values to a common type. + default Trie mergedTrie(BiFunction resolver) + { + return dir -> new DeletionAwareCursor.LiveAndDeletionsMergeCursor<>(resolver, cursor(dir)); + } + + /// Interface used to ask a cursor to stop issuing deletions. Provided by the cursor implementing + /// [#mergedTrieSwitchable]. 
+ interface DeletionsStopControl + { + void stopIssuingDeletions(Cursor.ResettingTransitionsReceiver receiver); + } + + /// Returns a view of the combination of the live data and deletions in this trie as a regular [Trie], using + /// the provided mapping function to convert values to a common type. + /// + /// The only difference from [#mergedTrie] is that this cursor can be asked to stop visiting deletion branches + /// via the [DeletionsStopControl] interface. + default Trie mergedTrieSwitchable(BiFunction resolver) + { + return dir -> new DeletionAwareCursor.SwitchableLiveAndDeletionsMergeCursor<>(resolver, cursor(dir)); + } + + static > + DeletionAwareTrie empty(ByteComparable.Version byteComparableVersion) + { + return direction -> new DeletionAwareCursor.Empty<>(direction, byteComparableVersion); + } + + @Override + default DeletionAwareTrie prefixedBy(ByteComparable prefix) + { + return dir -> new PrefixedCursor.DeletionAware<>(prefix, cursor(dir)); + } + + /// A variation of [#prefixedBy] with the same effective result, but where content and deletion portions are + /// separately prefixed by `prefix`. + default DeletionAwareTrie prefixedBySeparately(ByteComparable prefix, boolean deletionsMustBeAtRoot) + { + return dir -> + { + DeletionAwareCursor cursor = cursor(dir); + if (deletionsMustBeAtRoot) + return new PrefixedCursor.DeletionAwareSeparately<>(prefix, cursor, cursor.deletionBranchCursor(dir)); + else + return new PrefixedCursor.DeletionAwareSeparately<>(prefix, cursor, new DeletionAwareCursor.DeletionsTrieCursor<>(cursor.tailCursor(dir))); + }; + } + + /// @inheritDoc + /// + /// Note: if the cursor is positioned below a deletion branch root and a deletion applies to the prefix, the tail + /// will include it as a deletion branch at the root of the returned tail trie. 
+ @Override + default DeletionAwareTrie tailTrie(ByteComparable prefix) + { + return tailTrie(prefix, true); + } + + /// Returns a trie that corresponds to the branch of this trie rooted at the given prefix. + /// + /// The result will include the same values as `subtrie(prefix, prefix)`, but the keys in the + /// resulting trie will not include the prefix. In other words, + /// ```tailTrie(prefix).prefixedBy(prefix) = subtrie(prefix, prefix)``` + /// (with `includeCoveringDeletions` and ignoring the depth of introduction of the deletion branch). + /// + /// When the tail falls below a deletion branch root and a deletion covers the whole tail branch, + /// this method will include that deletion to cover the root of the returned trie if `includeCoveringDeletions` is + /// true. If not, the covering deletion will be dropped, and the content of the returned trie will be modified to + /// remove the application of that covering deletion. That is, any contained deletion that switches from or to the + /// covering deletion will be changed to drop that side, in order to maintain a valid sequence of ranges. 
+ default DeletionAwareTrie tailTrie(ByteComparable prefix, boolean includeCoveringDeletions) + { + DeletionAwareCursor c = cursor(Direction.FORWARD); + ByteSource bytes = prefix.asComparableBytes(c.byteComparableVersion()); + while (true) + { + RangeCursor deletionBranch = c.deletionBranchCursor(Direction.FORWARD); + if (deletionBranch != null) + return tailTrieSeparately(ByteSource.duplicatable(bytes), c, deletionBranch, includeCoveringDeletions); + + int next = bytes.next(); + long position = c.encodedPosition(); + if (next == ByteSource.END_OF_STREAM) + return c::tailCursor; + long nextPosition = Cursor.positionForDescentWithByte(position, next); + if (Cursor.compare(c.skipTo(nextPosition), nextPosition) != 0) + return null; + } + } + + private static > DeletionAwareTrie + tailTrieSeparately(ByteSource.Duplicatable bytes, DeletionAwareCursor c, RangeCursor deletionBranch, boolean includeCoveringDeletions) + { + ByteSource.Duplicatable bytesDeletion = bytes.duplicate(); + if (!deletionBranch.descendAlong(bytesDeletion)) + deletionBranch = includeCoveringDeletions ? deletionBranch.precedingStateCursor(Direction.FORWARD) : null; + else if (!includeCoveringDeletions) + deletionBranch = DeletionAwareCursor.dropCoveringDeletions(deletionBranch); + + if (!c.descendAlong(bytes)) + c = null; + + return DeletionAwareCursor.combineTails(c, deletionBranch); + } + + /// @inheritDoc + /// + /// Note: if a tail is positioned below a deletion branch root and a deletion applies to the prefix, the tail + /// will include it as a deletion branch at the root of the returned tail trie. + @Override + default Iterable>> tailTries(Direction direction, Predicate predicate) + { + return tailTries(direction, predicate, true); + } + + /// Returns an entry set containing all tail tries constructed at the points that contain content passing + /// the given predicate. 
+ /// + /// When the tail falls below a deletion branch root and a deletion covers the whole tail branch, + /// this method will include that deletion to cover the root of the returned trie if `includeCoveringDeletions` is + /// true. If not, the covering deletion will be dropped, and the content of the returned trie will be modified to + /// remove the application of that covering deletion. That is, any contained deletion that switches from or to the + /// covering deletion will be changed to drop that side, in order to maintain a valid sequence of ranges. + default + Iterable>> + tailTries(Direction direction, Predicate predicate, boolean includeCoveringDeletions) + { + return () -> new TrieTailsIterator.AsEntriesDeletionAware<>(cursor(direction), predicate, includeCoveringDeletions); + } + + /// Returns a view of this trie where all live content is processed through the given mapping function. + default DeletionAwareTrie mapValues(Function mapper) + { + return dir -> new ContentMappingCursor.DeletionAwareDataOnly<>(mapper, cursor(dir)); + } + + /// Returns a view of this trie where all live content and deletions are processed through the given mapping + /// functions. + default > DeletionAwareTrie mapValuesAndDeletions(Function mapper, Function deletionMapper) + { + return dir -> new ContentMappingCursor.DeletionAware<>(mapper, deletionMapper, cursor(dir)); + } + + // The methods below form the non-public implementation, whose visibility is restricted to package-level. + // The warning suppression below is necessary because we cannot limit the visibility of an interface method. + // We need an interface to be able to implement trie methods by lambdas, which is heavily used above. + + /// Implement this method to provide the concrete trie implementation as the cursor that presents it, most easily + /// done via a lambda as in the methods above. 
+ //noinspection ClassEscapesDefinedScope + DeletionAwareCursor makeCursor(Direction direction); + + /// @inheritDoc This method's implementation uses [#makeCursor] to get the cursor and may apply additional cursor + /// checks for tests that run with verification enabled. + //noinspection ClassEscapesDefinedScope + @Override + default DeletionAwareCursor cursor(Direction direction) + { + return Trie.DEBUG ? new VerificationCursor.DeletionAware<>(makeCursor(direction)) + : makeCursor(direction); + } +} diff --git a/src/java/org/apache/cassandra/db/tries/DepthAdjustedCursor.java b/src/java/org/apache/cassandra/db/tries/DepthAdjustedCursor.java new file mode 100644 index 000000000000..762dcc717cf5 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/DepthAdjustedCursor.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +class DepthAdjustedCursor> implements Cursor +{ + final C source; + private long depthAdjustment; + private long matchingPositionAtRoot; + + DepthAdjustedCursor(C source, long matchingPositionAtRoot) + { + this.source = source; + setAttachmentPoint(matchingPositionAtRoot); + } + + void setAttachmentPoint(long matchingPositionAtRoot) + { + this.matchingPositionAtRoot = matchingPositionAtRoot; + this.depthAdjustment = Cursor.depthCorrectionValue(matchingPositionAtRoot); + } + + long toAdjustedDepth(long position) + { + if (Cursor.depth(position) > 0) + return position + depthAdjustment; + else if (Cursor.isExhausted(position)) + return position; + else + return matchingPositionAtRoot | (position & Cursor.ON_RETURN_PATH_BIT); + } + + long fromAdjustedDepth(long position) + { + // matchingPositionAtRoot | ON_RETURN_PATH_BIT should map to rootPosition | ON_RETURN_PATH_BIT + long adjusted = position - depthAdjustment; + if (Cursor.depth(adjusted) > 0) + return adjusted; + + // The only non-exhausted position that can be requested with this depth is the return path stop for the root. 
+ if (position == (matchingPositionAtRoot | Cursor.ON_RETURN_PATH_BIT)) + return Cursor.rootReturnPosition(adjusted); + else + return Cursor.exhaustedPosition(adjusted); + + } + + @Override + public long encodedPosition() + { + return toAdjustedDepth(source.encodedPosition()); + } + + @Override + public T content() + { + return source.content(); + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + return source.byteComparableVersion(); + } + + @Override + public long advance() + { + return toAdjustedDepth(source.advance()); + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + return toAdjustedDepth(source.advanceMultiple(receiver)); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + return toAdjustedDepth(source.skipTo(fromAdjustedDepth(encodedSkipPosition))); + } + + @Override + public boolean descendAlong(ByteSource bytes) + { + return source.descendAlong(bytes); + } + + @Override + public Cursor tailCursor(Direction direction) + { + return source.tailCursor(direction); + } + + static Cursor make(Cursor source, long matchingPositionAtRoot) + { + return Cursor.depth(matchingPositionAtRoot) == 0 ? source : new Plain<>(source, matchingPositionAtRoot); + } + + static > RangeCursor make(RangeCursor source, long matchingPositionAtRoot) + { + return Cursor.depth(matchingPositionAtRoot) == 0 ? 
source : new Range<>(source, matchingPositionAtRoot); + } + + static class Plain extends DepthAdjustedCursor> + { + public Plain(Cursor source, long matchingPositionAtRoot) + { + super(source, matchingPositionAtRoot); + } + } + + static class Range> extends DepthAdjustedCursor> implements RangeCursor + { + Range(RangeCursor source, long matchingPositionAtRoot) + { + super(source, matchingPositionAtRoot); + } + + @Override + public S state() + { + return source.state(); + } + + @Override + public S precedingState() + { + return source.precedingState(); + } + + @Override + public RangeCursor precedingStateCursor(Direction direction) + { + return source.precedingStateCursor(direction); + } + + @Override + public RangeCursor tailCursor(Direction direction) + { + return source.tailCursor(direction); + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/Direction.java b/src/java/org/apache/cassandra/db/tries/Direction.java index 29f8e2b97b79..a7f8ed6dcb2a 100644 --- a/src/java/org/apache/cassandra/db/tries/Direction.java +++ b/src/java/org/apache/cassandra/db/tries/Direction.java @@ -45,22 +45,17 @@ public boolean le(int a, int b) return a <= b; } - public int min(int a, int b) - { - return Math.min(a, b); - } - - public int max(int a, int b) + public T select(T forward, T reverse) { - return Math.max(a, b); + return forward; } - public T select(T forward, T reverse) + public int select(int forward, int reverse) { return forward; } - public int select(int forward, int reverse) + public long select(long forward, long reverse) { return forward; } @@ -92,22 +87,17 @@ public boolean le(int a, int b) return a >= b; } - public int min(int a, int b) - { - return Math.max(a, b); - } - - public int max(int a, int b) + public T select(T forward, T reverse) { - return Math.min(a, b); + return reverse; } - public T select(T forward, T reverse) + public int select(int forward, int reverse) { return reverse; } - public int select(int forward, int reverse) + public long 
select(long forward, long reverse) { return reverse; } @@ -145,10 +135,6 @@ public boolean ge(int a, int b) { return le(b, a); } - /** Returns the result of the operation corresponding to min(a, b) for the forward direction */ - public abstract int min(int a, int b); - /** Returns the result of the operation corresponding to max(a, b) for the forward direction */ - public abstract int max(int a, int b); /** * Use the first argument in forward direction and the second in reverse, i.e. isForward() ? forward : reverse. @@ -160,6 +146,11 @@ public boolean ge(int a, int b) */ public abstract int select(int forward, int reverse); + /** + * Use the first argument in forward direction and the second in reverse, i.e. isForward() ? forward : reverse. + */ + public abstract long select(long forward, long reverse); + /** * Helper to perform loops over possible values in the given direction. Returns whether the given index is still * within bounds when iterating. diff --git a/src/java/org/apache/cassandra/db/tries/FlexibleMergeCursor.java b/src/java/org/apache/cassandra/db/tries/FlexibleMergeCursor.java new file mode 100644 index 000000000000..5a41c4738821 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/FlexibleMergeCursor.java @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.function.BiFunction; +import javax.annotation.Nullable; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +abstract class FlexibleMergeCursor, D extends Cursor, T> implements Cursor +{ + final C c1; + @Nullable D c2; + long c2depthCorrection; + long currentPosition; + + enum State + { + AT_C1, + AT_C2, + AT_BOTH, + C1_ONLY, // c2 is null + } + State state; + + FlexibleMergeCursor(C c1) + { + c1.assertFresh(); + this.c1 = c1; + this.c2 = null; + state = State.C1_ONLY; + currentPosition = c1.encodedPosition(); + // We can't call postAdvance here because the class may not be completely initialized. + // The concrete class should do that instead + } + + FlexibleMergeCursor(C c1, D c2) + { + c1.assertFresh(); + c2.assertFresh(); + this.c1 = c1; + this.c2 = c2; + this.c2depthCorrection = 0; + state = c2 != null ? State.AT_BOTH : State.C1_ONLY; + currentPosition = c1.encodedPosition(); + // We can't call postAdvance here because the class may not be completely initialized. 
+ // The concrete class should do that instead + } + + public void addCursor(D c2) + { + assert state == State.C1_ONLY : "Attempting to add further cursors to a cursor that already has two sources"; + c2.assertFresh(); + this.c2 = c2; + this.c2depthCorrection = Cursor.depthCorrectionValue(currentPosition); + this.state = State.AT_BOTH; + } + + abstract long postAdvance(long depth); + + @Override + public long advance() + { + switch (state) + { + case C1_ONLY: + return inC1Only(c1.advance()); + case AT_C1: + return checkOrder(c1.advance(), c2.encodedPosition()); + case AT_C2: + return checkOrder(c1.encodedPosition(), c2.advance()); + case AT_BOTH: + return checkOrder(c1.advance(), c2.advance()); + default: + throw new AssertionError(); + } + } + + @Override + public long skipTo(long encodedSkipPosition) + { + if (state == State.C1_ONLY) + return inC1Only(c1.skipTo(encodedSkipPosition)); + + long c2encodedSkipPosition = encodedSkipPosition - c2depthCorrection; + // Handle request to exit c2 branch separately for simplicity + if (Cursor.isExhausted(c2encodedSkipPosition)) + { + switch (state) + { + case AT_C1: + case AT_BOTH: + return leaveC2(c1.skipTo(encodedSkipPosition)); + case AT_C2: + return leaveC2(c1.skipToWhenAhead(encodedSkipPosition)); + default: + throw new AssertionError(); + } + } + + switch (state) + { + case AT_C1: + return checkOrder(c1.skipTo(encodedSkipPosition), c2.skipToWhenAhead(c2encodedSkipPosition)); + case AT_C2: + return checkOrder(c1.skipToWhenAhead(encodedSkipPosition), c2.skipTo(c2encodedSkipPosition)); + case AT_BOTH: + return checkOrder(c1.skipTo(encodedSkipPosition), c2.skipTo(c2encodedSkipPosition)); + default: + throw new AssertionError(); + } + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + switch (state) + { + case C1_ONLY: + return inC1Only(c1.advanceMultiple(receiver)); + // If we are in a branch that's only covered by one of the sources, we can use its advanceMultiple as it is + // only 
different from advance if it takes multiple steps down, which does not change the order of the + // cursors. + // Since it might ascend, we still have to check the order after the call. + case AT_C1: + return checkOrder(c1.advanceMultiple(receiver), c2.encodedPosition()); + case AT_C2: + return checkOrder(c1.encodedPosition(), c2.advanceMultiple(receiver)); + // While we are on a shared position, we must descend one byte at a time to maintain the cursor ordering. + case AT_BOTH: + return checkOrder(c1.advance(), c2.advance()); + default: + throw new AssertionError(); + } + } + + private long inC1Only(long c1pos) + { + return postAdvance(currentPosition = c1pos); + } + + private long checkOrder(long c1pos, long c2posUncorrected) + { + if (Cursor.isExhausted(c2posUncorrected)) + return leaveC2(c1pos); + + long c2pos = c2posUncorrected + c2depthCorrection; + long cmp = Cursor.compare(c1pos, c2pos); + if (cmp < 0) + { + state = State.AT_C1; + return postAdvance(currentPosition = c1pos); + } + if (cmp > 0) + { + state = State.AT_C2; + return postAdvance(currentPosition = c2pos); + } + // c1pos == c2pos + state = State.AT_BOTH; + return postAdvance(currentPosition = c1pos); + } + + private long leaveC2(long c1pos) + { + state = State.C1_ONLY; + c2 = null; + return postAdvance(currentPosition = c1pos); + } + + @Override + public long encodedPosition() + { + return currentPosition; + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + return c1.byteComparableVersion(); + } + + static abstract class WithMappedContent, D extends Cursor, Z> extends FlexibleMergeCursor + { + final BiFunction resolver; + + WithMappedContent(BiFunction resolver, C c1) + { + super(c1); + this.resolver = resolver; + } + + WithMappedContent(BiFunction resolver, C c1, D c2) + { + super(c1, c2); + this.resolver = resolver; + } + + @Override + public Z content() + { + U mc = null; + T nc = null; + switch (state) + { + case C1_ONLY: + case AT_C1: + nc = c1.content(); + break; 
+ case AT_C2: + mc = c2.content(); + break; + case AT_BOTH: + mc = c2.content(); + nc = c1.content(); + break; + default: + throw new AssertionError(); + } + if (nc == null && mc == null) + return null; + return resolver.apply(nc, mc); + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/InMemoryBaseTrie.java b/src/java/org/apache/cassandra/db/tries/InMemoryBaseTrie.java new file mode 100644 index 000000000000..a08f7a962980 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/InMemoryBaseTrie.java @@ -0,0 +1,1765 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.db.tries; + +import java.util.Arrays; +import java.util.concurrent.atomic.AtomicReferenceArray; +import java.util.function.Predicate; +import javax.annotation.Nonnull; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; + +import org.apache.cassandra.io.compress.BufferType; +import org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.concurrent.OpOrder; + +/// Base class for mutable in-memory tries, providing the common infrastructure for plain, range and deletion-aware +/// in-memory tries. +public abstract class InMemoryBaseTrie extends InMemoryReadTrie +{ + // See the trie format description in InMemoryReadTrie. + + // constants for space calculations + static final long REFERENCE_ARRAY_ON_HEAP_SIZE = ObjectSizes.measureDeep(new AtomicReferenceArray<>(0)); + + public enum ExpectedLifetime + { + SHORT, LONG + } + + InMemoryBaseTrie(ByteComparable.Version byteComparableVersion, boolean presentContentOnDescentPath, BufferType bufferType, ExpectedLifetime lifetime, OpOrder opOrder) + { + this(byteComparableVersion, presentContentOnDescentPath, null, bufferType, lifetime, opOrder); + } + + InMemoryBaseTrie(ByteComparable.Version byteComparableVersion, boolean presentContentOnDescentPath, Predicate shouldPreserveWithoutChildren, BufferType bufferType, ExpectedLifetime lifetime, OpOrder opOrder) + { + this(byteComparableVersion, + presentContentOnDescentPath, + new BufferManagerMultibuf(bufferType, lifetime, opOrder), // last one is 1G for a total of ~2G bytes + new ContentManagerPojo<>(shouldPreserveWithoutChildren, lifetime, opOrder)); // takes at least 4 bytes to write pointer to one content -> 4 times smaller than buffers + } + + InMemoryBaseTrie(ByteComparable.Version byteComparableVersion, boolean presentContentOnDescentPath, 
BufferManager bufferManager, ContentManager contentManager) + { + super(byteComparableVersion, presentContentOnDescentPath, bufferManager, contentManager, NONE); + } + + // Buffer, content list and cell management + + private void putInt(int pos, int value) + { + getBuffer(pos).putInt(inBufferOffset(pos), value); + } + + protected void putIntVolatile(int pos, int value) + { + getBuffer(pos).putIntVolatile(inBufferOffset(pos), value); + } + + private void putShort(int pos, short value) + { + getBuffer(pos).putShort(inBufferOffset(pos), value); + } + + private void putShortVolatile(int pos, short value) + { + getBuffer(pos).putShortVolatile(inBufferOffset(pos), value); + } + + private void putByte(int pos, byte value) + { + getBuffer(pos).putByte(inBufferOffset(pos), value); + } + + private int allocateCell() throws TrieSpaceExhaustedException + { + return bufferManager.allocateCell(); + } + + protected void recycleCell(int cell) + { + bufferManager.recycleCell(cell); + } + + /// Creates a copy of a given cell and marks the original for recycling. Used when a mutation needs to force-copy + /// paths to ensure earlier states are still available for concurrent readers. + protected int copyCell(int cell) throws TrieSpaceExhaustedException + { + return bufferManager.copyCell(cell); + } + + /// Add a new content value. + /// + /// @return A content id that can be used to reference the content, a negative number where + /// `id & CONTENT_INDEX_MASK` encodes the position of the value in the content array. + protected int addContent(T value, boolean contentAfterBranch) throws TrieSpaceExhaustedException + { + if (value == null) + return NONE; + int id = contentManager.addContent(value, contentAfterBranch); + assert isLeaf(id); + return id; + } + + /// Change the content associated with a given content id. 
+ /// + /// @param id encoded content id, where `id & CONTENT_INDEX_MASK` is the position in the content array + /// @param value new content value to store + /// @return the id to use for the modified content; an attempt will be made to make this the same as id, but not + /// all content managers will be able to freely modify the data for a given id. + protected int setContent(int id, T value) throws TrieSpaceExhaustedException + { + return contentManager.setContent(id, value); + } + + protected void releaseContent(int id) + { + contentManager.releaseContent(id); + } + + protected boolean shouldPreserveWithoutChildren(int id) + { + return contentManager.shouldPreserveWithoutChildren(id); + } + + /// Called to clean up all buffers when the trie is known to no longer be needed. + public void discardBuffers() + { + bufferManager.discardBuffers(); + } + + private int copyIfOriginal(int node, int originalNode) throws TrieSpaceExhaustedException + { + return (node == originalNode) + ? copyCell(originalNode) + : node; + } + + private int getOrAllocate(int pointerAddress, int offsetWhenAllocating) throws TrieSpaceExhaustedException + { + int child = getIntVolatile(pointerAddress); + if (child != NONE) + return child; + + child = allocateCell() | offsetWhenAllocating; + // volatile writes not needed because this branch is not attached yet + putInt(pointerAddress, child); + return child; + } + + private int getCopyOrAllocate(int pointerAddress, int originalChild, int offsetWhenAllocating) throws TrieSpaceExhaustedException + { + int child = getIntVolatile(pointerAddress); + if (child == NONE) + { + child = allocateCell() | offsetWhenAllocating; + // volatile writes not needed because this branch is not attached yet + putInt(pointerAddress, child); + } + else if (child == originalChild) + { + child = copyCell(originalChild); + // volatile writes not needed because this branch is not attached yet + putInt(pointerAddress, child); + } + + return child; + } + + // Write methods 
+ + // Write visibility model: writes are not volatile, with the exception of the final write before a call returns + // the same value that was present before (e.g. content was updated in-place / existing node got a new child or had + // a child pointer updated); if the whole path including the root node changed, the root itself gets a volatile + // write. + // This final write is the point where any new cells created during the write become visible for readers for the + // first time, and such readers must pass through reading that pointer, which forces a happens-before relationship + // that extends to all values written by this thread before it. + + /// Attach a child to the given non-content node. This may be an update for an existing branch, or a new child for + /// the node. An update _is_ required (i.e. this is only called when the `newChild` pointer is not the same as the + /// existing value). + /// This method is called when the original node content must be preserved for concurrent readers (i.e. any cell to + /// be modified needs to be copied first.) + /// + /// @param node pointer to the node to update or copy + /// @param originalNode pointer to the node as it was before any updates in the current modification (i.e. apply + /// call) were started. In other words, the node that is currently reachable by readers if they + /// follow the same key, and which will become unreachable for new readers after this update + /// completes. Used to avoid copying again if already done -- if `node` is already != `originalNode` + /// (which is the case when a second or further child of a node is changed by an update), + /// then node is currently not reachable and can be safely modified or completely overwritten. 
+ /// @param trans transition to modify/add + /// @param newChild new child pointer + /// @return pointer to the updated node + protected int attachChildCopying(int node, int originalNode, int trans, int newChild) throws TrieSpaceExhaustedException + { + assert !isLeaf(node) : "attachChild cannot be used on content nodes."; + + switch (offset(node)) + { + case PREFIX_OFFSET: + assert false : "attachChild cannot be used on content nodes."; + case SPARSE_OFFSET: + // If the node is already copied (e.g. this is not the first child being modified), there's no need to copy + // it again. + return attachChildToSparseCopying(node, originalNode, trans, newChild); + case SPLIT_OFFSET: + // This call will copy the split node itself and any intermediate cells as necessary to make sure cells + // reachable from the original node are not modified. + return attachChildToSplitCopying(node, originalNode, trans, newChild); + default: + // chain nodes + return attachChildToChainCopying(node, trans, newChild); // always copies + } + } + + /// Attach a child to the given node. This may be an update for an existing branch, or a new child for the node. + /// An update _is_ required (i.e. this is only called when the newChild pointer is not the same as the existing value). 
+ /// + /// @param node pointer to the node to update or copy + /// @param trans transition to modify/add + /// @param newChild new child pointer + /// @return pointer to the updated node; same as node if update was in-place + protected int attachChild(int node, int trans, int newChild) throws TrieSpaceExhaustedException + { + assert !isLeaf(node) : "attachChild cannot be used on content nodes."; + + switch (offset(node)) + { + case PREFIX_OFFSET: + assert false : "attachChild cannot be used on content nodes."; + case SPARSE_OFFSET: + return attachChildToSparse(node, trans, newChild); + case SPLIT_OFFSET: + return attachChildToSplit(node, trans, newChild); + default: + return attachChildToChain(node, trans, newChild); + } + } + + /// Attach a child to the given split node. This may be an update for an existing branch, or a new child for the node. + private int attachChildToSplit(int node, int trans, int newChild) throws TrieSpaceExhaustedException + { + int midPos = splitCellPointerAddress(node, splitNodeMidIndex(trans), SPLIT_START_LEVEL_LIMIT); + int mid = getIntVolatile(midPos); + if (isNull(mid)) + { + if (isNull(newChild)) + return node; + mid = createEmptySplitNode(); + int tailPos = splitCellPointerAddress(mid, splitNodeTailIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); + int tail = createEmptySplitNode(); + int childPos = splitCellPointerAddress(tail, splitNodeChildIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); + putInt(childPos, newChild); + putInt(tailPos, tail); + putIntVolatile(midPos, mid); + return node; + } + + int tailPos = splitCellPointerAddress(mid, splitNodeTailIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); + int tail = getIntVolatile(tailPos); + if (isNull(tail)) + { + if (isNull(newChild)) + return node; + tail = createEmptySplitNode(); + int childPos = splitCellPointerAddress(tail, splitNodeChildIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); + putInt(childPos, newChild); + putIntVolatile(tailPos, tail); + return node; + } + + int childPos = splitCellPointerAddress(tail, 
splitNodeChildIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); + if (isNull(newChild)) + return removeSplitChildVolatile(node, midPos, mid, tailPos, tail, childPos); + else + { + putIntVolatile(childPos, newChild); + return node; // normal path, adding data + } + } + + /// Remove a split child, propagating the removal upward if this results in an empty split node cell. + /// This version of the method is called when the node being modified is reachable and may be concurrently accessed + /// by reads. + private int removeSplitChildVolatile(int node, int midPos, int mid, int tailPos, int tail, int childPos) + throws TrieSpaceExhaustedException + { + if (isNull(getIntVolatile(childPos))) + return node; + + // Because there may be concurrent accesses to this node that have saved the path we are removing as the next + // transition and expect it to be valid, we need to copy any cell where we set the value to NONE. + if (!isSplitBlockEmptyExcept(tail, SPLIT_OTHER_LEVEL_LIMIT, childPos)) + { + int newTail = copyCell(tail); + putInt(newTail + childPos - tail, NONE); + putIntVolatile(tailPos, newTail); + return node; + } + recycleCell(tail); + if (!isSplitBlockEmptyExcept(mid, SPLIT_OTHER_LEVEL_LIMIT, tailPos)) + { + int newMid = copyCell(mid); + putInt(newMid + tailPos - mid, NONE); + putIntVolatile(midPos, newMid); + return node; + } + recycleCell(mid); + if (!isSplitBlockEmptyExcept(node, SPLIT_START_LEVEL_LIMIT, midPos)) + { + int newNode = copyCell(node); + putInt(newNode + midPos - node, NONE); + return newNode; + } + recycleCell(node); + return NONE; + } + + boolean isSplitBlockEmptyExcept(int node, int limit, int deletedPos) + { + for (int i = 0; i < limit; ++i) + { + int pos = splitCellPointerAddress(node, i, limit); + if (pos != deletedPos && !isNull(getIntVolatile(pos))) + return false; + } + return true; + } + + /// Non-volatile version of `attachChildToSplit`. Used when the split node is not reachable yet (during the conversion + /// from sparse). 
+ private int attachChildToSplitNonVolatile(int node, int trans, int newChild) throws TrieSpaceExhaustedException + { + assert offset(node) == SPLIT_OFFSET : "Invalid split node in trie"; + int midPos = splitCellPointerAddress(node, splitNodeMidIndex(trans), SPLIT_START_LEVEL_LIMIT); + int mid = getOrAllocate(midPos, SPLIT_OFFSET); + assert offset(mid) == SPLIT_OFFSET : "Invalid split node in trie"; + int tailPos = splitCellPointerAddress(mid, splitNodeTailIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); + int tail = getOrAllocate(tailPos, SPLIT_OFFSET); + assert offset(tail) == SPLIT_OFFSET : "Invalid split node in trie"; + int childPos = splitCellPointerAddress(tail, splitNodeChildIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); + putInt(childPos, newChild); + if (isNull(newChild)) + return removeSplitPathNonVolatile(node, midPos, mid, tailPos, tail); + else + return node; // normal path, adding data + } + + /// Attach a child to the given split node, copying all modified content to enable atomic visibility + /// of modification. + /// This may be an update for an existing branch, or a new child for the node. + private int attachChildToSplitCopying(int node, int originalNode, int trans, int newChild) throws TrieSpaceExhaustedException + { + if (offset(originalNode) != SPLIT_OFFSET) // includes originalNode == NONE + return attachChildToSplitNonVolatile(node, trans, newChild); + + node = copyIfOriginal(node, originalNode); + assert offset(node) == SPLIT_OFFSET : "Invalid split node in trie"; + + int midPos = splitCellPointerAddress(node, splitNodeMidIndex(trans), SPLIT_START_LEVEL_LIMIT); + int midOriginal = originalNode != NONE ? getIntVolatile(midPos + originalNode - node) : NONE; + int mid = getCopyOrAllocate(midPos, midOriginal, SPLIT_OFFSET); + assert offset(mid) == SPLIT_OFFSET : "Invalid split node in trie"; + + int tailPos = splitCellPointerAddress(mid, splitNodeTailIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); + int tailOriginal = midOriginal != NONE ? 
getIntVolatile(tailPos + midOriginal - mid) : NONE; + int tail = getCopyOrAllocate(tailPos, tailOriginal, SPLIT_OFFSET); + assert offset(tail) == SPLIT_OFFSET : "Invalid split node in trie"; + + int childPos = splitCellPointerAddress(tail, splitNodeChildIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); + putInt(childPos, newChild); + if (isNull(newChild)) + return removeSplitPathNonVolatile(node, midPos, mid, tailPos, tail); + else + return node; + } + + /// Propagate the removal of a split child upward if it resulted in an empty split node cell, + /// assuming that the node being modified is not reachable and cannot be accessed concurrently. + private int removeSplitPathNonVolatile(int node, int midPos, int mid, int tailPos, int tail) + { + if (!isSplitBlockEmpty(tail, SPLIT_OTHER_LEVEL_LIMIT)) + return node; + recycleCell(tail); + putInt(tailPos, NONE); + if (!isSplitBlockEmpty(mid, SPLIT_OTHER_LEVEL_LIMIT)) + return node; + recycleCell(mid); + putInt(midPos, NONE); + if (!isSplitBlockEmpty(node, SPLIT_START_LEVEL_LIMIT)) + return node; + recycleCell(node); + return NONE; + } + + boolean isSplitBlockEmpty(int node, int limit) + { + for (int i = 0; i < limit; ++i) + if (!isNull(getSplitCellPointer(node, i, limit))) + return false; + return true; + } + + /// Attach a child to the given sparse node. This may be an update for an existing branch, or a new child for the node. 
+    private int attachChildToSparse(int node, int trans, int newChild) throws TrieSpaceExhaustedException
+    {
+        // In-place variant: readers may be walking this node concurrently (see the notes on the two access modes
+        // below), so every write that publishes new state is volatile. Returns the pointer the parent should hold:
+        // the same node if the change was applied in place, a replacement node otherwise.
+        int index;
+        int smallerCount = 0;
+        // First check if this is an update of an existing transition and modify in-place if so. While scanning,
+        // count the transitions that are smaller than the new one: that count is its position in the order word.
+        for (index = 0; index < SPARSE_CHILD_COUNT; ++index)
+        {
+            // Children are appended, so the first null pointer marks the end of the used slots.
+            if (isNull(getIntVolatile(node + SPARSE_CHILDREN_OFFSET + index * Integer.BYTES)))
+                break;
+            final int existing = getUnsignedByte(node + SPARSE_BYTES_OFFSET + index);
+            if (existing == trans)
+            {
+                if (isNull(newChild))
+                    return removeSparseChild(node, index);
+                putIntVolatile(node + SPARSE_CHILDREN_OFFSET + index * Integer.BYTES, newChild);
+                return node;
+            }
+            else if (existing < trans)
+                ++smallerCount;
+        }
+        int childCount = index;
+        // Removing a transition that is not present: nothing to do.
+        if (isNull(newChild))
+            return node;
+
+        if (childCount == SPARSE_CHILD_COUNT)
+        {
+            // Node is full. Switch to split
+            return upgradeSparseToSplit(node, trans, newChild);
+        }
+
+        // Add a new transition. They are not kept in order, so append it at the first free position.
+        putByte(node + SPARSE_BYTES_OFFSET + childCount, (byte) trans);
+
+        // Update order word.
+        int order = getUnsignedShortVolatile(node + SPARSE_ORDER_OFFSET);
+        int newOrder = insertInOrderWord(order, childCount, smallerCount);
+
+        // Sparse nodes have two access modes: via the order word, when listing transitions, or directly to characters
+        // and addresses.
+        // To support the former, we volatile write to the order word last, after everything is correctly set up.
+        // The latter does not touch the order word. To support that too, we volatile write the address, as the reader
+        // can't determine if the position is in use based on the character byte alone (00 is also a valid transition).
+        // Note that this means that reader must check the transition byte AFTER the address, to ensure they get the
+        // correct value (see getSparseChild).
+
+        // setting child enables reads to start seeing the new branch
+        putIntVolatile(node + SPARSE_CHILDREN_OFFSET + childCount * Integer.BYTES, newChild);
+
+        // some readers will decide whether to check the pointer based on the order word
+        // write that volatile to make sure they see the new change too
+        putShortVolatile(node + SPARSE_ORDER_OFFSET, (short) newOrder);
+        return node;
+    }
+
+    /// Remove a child of the given sparse node. To ensure the safety of concurrent operations, this is always done
+    /// as a copying operation as we can't safely shift entries in a sparse node.
+    ///
+    /// @param node  the sparse node to remove from; its cell is passed to `recycleCell`
+    /// @param index the slot (position in the node, not the transition byte) of the child to drop
+    /// @return a chain node leading to the surviving child when the node held exactly two children, otherwise a
+    ///         newly allocated sparse node holding the remaining children, laid out in order-word order
+    private int removeSparseChild(int node, int index) throws TrieSpaceExhaustedException
+    {
+        recycleCell(node);
+        int order = getUnsignedShortVolatile(node + SPARSE_ORDER_OFFSET);
+        // An order word of 6 (base-6 "10") is what a two-child node carries: dropping one child leaves a single
+        // transition, which is represented as a chain rather than a sparse node.
+        if (index <= 1 && order == 6)
+        {
+            int survivingIndex = index ^ 1;
+            return expandOrCreateChainNode(getUnsignedByte(node + SPARSE_BYTES_OFFSET + survivingIndex),
+                                           getIntVolatile(node + SPARSE_CHILDREN_OFFSET + survivingIndex * Integer.BYTES));
+        }
+
+        // Because we need the smallest child to not be the last (which can happen if we just remove entries), we will
+        // put the remaining data in order.
+        int newNode = allocateCell() | SPARSE_OFFSET;
+        int i = 0;
+        int newOrder = 0;
+        int mul = 1;
+        // Walk the order word from its lowest digit, copying every slot except the removed one into consecutive
+        // positions of the new node. Slot i then holds the i-th entry, so the new order word has digit i at place i.
+        while (order > 0)
+        {
+            int next = order % SPARSE_CHILD_COUNT;
+            order /= SPARSE_CHILD_COUNT;
+            if (next == index)
+                continue;
+            putInt(newNode + SPARSE_CHILDREN_OFFSET + i * Integer.BYTES, getIntVolatile(node + SPARSE_CHILDREN_OFFSET + next * Integer.BYTES));
+            // Transition characters occupy a single byte each: copy with putByte, as every other writer of this
+            // area does. A 4-byte putInt here would also overwrite the three bytes that follow the slot.
+            putByte(newNode + SPARSE_BYTES_OFFSET + i, (byte) getUnsignedByte(node + SPARSE_BYTES_OFFSET + next));
+            newOrder += i * mul;
+            mul *= SPARSE_CHILD_COUNT;
+            ++i;
+        }
+        // Plain writes throughout: the new node only becomes visible once the caller attaches the returned pointer.
+        putShort(newNode + SPARSE_ORDER_OFFSET, (short) newOrder);
+        return newNode;
+    }
+
+    /**
+     * Attach a child to the given sparse node. This may be an update for an existing branch, or a new child for the node.
+     * Resulting node is not reachable, no volatile set needed.
+     *
+     * @param node         the node to modify; either the original node or a copy made earlier in this mutation
+     * @param originalNode the pre-existing node, passed to {@code copyIfOriginal} before any write
+     * @param trans        the transition byte to set or remove
+     * @param newChild     the new child pointer; a null pointer requests removal of the transition
+     * @return the node pointer to attach in the parent
+     */
+    private int attachChildToSparseCopying(int node, int originalNode, int trans, int newChild) throws TrieSpaceExhaustedException
+    {
+        int index;
+        int smallerCount = 0;
+        // First check if this is an update of an existing transition. Unlike the in-place variant, the write goes
+        // to whatever copyIfOriginal returns (presumably a copy whenever node is still the original) and is a
+        // plain, non-volatile write.
+        for (index = 0; index < SPARSE_CHILD_COUNT; ++index)
+        {
+            if (isNull(getIntVolatile(node + SPARSE_CHILDREN_OFFSET + index * Integer.BYTES)))
+                break;
+            final int existing = getUnsignedByte(node + SPARSE_BYTES_OFFSET + index);
+            if (existing == trans)
+            {
+                if (isNull(newChild))
+                    return removeSparseChild(node, index);
+                node = copyIfOriginal(node, originalNode);
+                putInt(node + SPARSE_CHILDREN_OFFSET + index * Integer.BYTES, newChild);
+                return node;
+            }
+            else if (existing < trans)
+                ++smallerCount;
+        }
+        int childCount = index;
+
+        // Removing a transition that is not present: nothing to do.
+        if (isNull(newChild))
+            return node;
+
+        if (childCount == SPARSE_CHILD_COUNT)
+        {
+            // Node is full. Switch to split.
+            // Note that even if node != originalNode, we still have to recycle it as it was a temporary one that will
+            // no longer be attached.
+            return upgradeSparseToSplit(node, trans, newChild);
+        }
+
+        node = copyIfOriginal(node, originalNode);
+
+        // Add a new transition. They are not kept in order, so append it at the first free position.
+        putByte(node + SPARSE_BYTES_OFFSET + childCount, (byte) trans);
+
+        putInt(node + SPARSE_CHILDREN_OFFSET + childCount * Integer.BYTES, newChild);
+
+        // Update order word.
+        int order = getUnsignedShortVolatile(node + SPARSE_ORDER_OFFSET);
+        int newOrder = insertInOrderWord(order, childCount, smallerCount);
+        putShort(node + SPARSE_ORDER_OFFSET, (short) newOrder);
+
+        return node;
+    }
+
+    /// Replace a full sparse node by a newly created split node that holds all of its `SPARSE_CHILD_COUNT`
+    /// children plus the given new transition. The sparse node's cell is passed to `recycleCell`.
+    ///
+    /// @return the pointer to the new split node
+    private int upgradeSparseToSplit(int node, int trans, int newChild) throws TrieSpaceExhaustedException
+    {
+        int split = createEmptySplitNode();
+        for (int i = 0; i < SPARSE_CHILD_COUNT; ++i)
+        {
+            int t = getUnsignedByte(node + SPARSE_BYTES_OFFSET + i);
+            int p = getIntVolatile(node + SPARSE_CHILDREN_OFFSET + i * Integer.BYTES);
+            attachChildToSplitNonVolatile(split, t, p);
+        }
+        attachChildToSplitNonVolatile(split, trans, newChild);
+        recycleCell(node);
+        return split;
+    }
+
+    /// Insert the given newIndex in the base-6 encoded order word in the correct position with respect to the ordering.
+    /// The lowest digit of the order word is the slot of the smallest transition, so `smallerCount` digits are kept
+    /// below the inserted one.
+    ///
+    /// E.g. (order words written in base 6)
+    /// - `insertInOrderWord(120, 3, 0)` must return 1203 (decimal 48*6 + 3)
+    /// - `insertInOrderWord(120, 3, 1)` must return 1230 (decimal 8*36 + 3*6 + 0)
+    /// - `insertInOrderWord(120, 3, 2)` must return 1320 (decimal 1*216 + 3*36 + 12)
+    /// - `insertInOrderWord(120, 3, 3)` must return 3120 (decimal 3*216 + 48)
+    ///
+    /// @param order        the current order word
+    /// @param newIndex     the slot index to insert as a new digit
+    /// @param smallerCount the number of existing transitions smaller than the new one
+    /// @return the order word with the new digit inserted
+    private static int insertInOrderWord(int order, int newIndex, int smallerCount)
+    {
+        // r = 6^smallerCount, the place value at which the new digit goes
+        int r = 1;
+        for (int i = 0; i < smallerCount; ++i)
+            r *= 6;
+        int head = order / r;
+        int tail = order % r;
+        // insert newIndex after the ones we have passed (order % r) and before the remaining (order / r)
+        return tail + (head * 6 + newIndex) * r;
+    }
+
+    /// Attach a child to the given chain node. This may be an update for an existing branch with different target
+    /// address, or a second child for the node.
+    ///
+    /// This method always copies the node -- with the exception of updates that change the child of the last node in a
+    /// chain cell with matching transition byte (which this method is not used for, see [#attachChild]), modifications to
+    /// chain nodes cannot be done in place, either because we introduce a new transition byte and have to convert from
+    /// the single-transition chain type to sparse, or because we have to remap the child from the implicit node + 1 to
+    /// something else.
+    private int attachChildToChain(int node, int transitionByte, int newChild) throws TrieSpaceExhaustedException
+    {
+        final int currentByte = getUnsignedByte(node);
+        final boolean removing = isNull(newChild);
+
+        if (transitionByte != currentByte)
+        {
+            // Removing a transition the node does not have changes nothing. Adding one means we no longer have
+            // only one transition, so the node type has to change.
+            return removing ? node : convertChainToSparse(node, currentByte, newChild, transitionByte);
+        }
+
+        // Same transition byte: this is still a single path.
+        if (offset(node) != LAST_POINTER_OFFSET - 1)
+        {
+            // Not the last character in the chain cell, so the child cannot be changed in place.
+            // This will only be called if new child is different from old (see attachChild). We must always create
+            // something new. Since this is not the last character, we either still need this cell or it has already
+            // been released (a createSparseNode must have been called earlier), hence no recycling here.
+            // If the child is a chain, it can be expanded: it is a different value, so its branch must be new and
+            // nothing can already reside in the rest of its cell.
+            return removing ? NONE : expandOrCreateChainNode(transitionByte, newChild);
+        }
+
+        // Last character in the chain: the child pointer is stored right after it.
+        if (removing)
+        {
+            recycleCell(node);
+            return NONE;
+        }
+
+        putIntVolatile(node + 1, newChild);
+        return node;
+    }
+
+    /// Attach a child to the given chain node, when we are force-copying.
+    private int attachChildToChainCopying(int node, int transitionByte, int newChild) throws TrieSpaceExhaustedException
+    {
+        final int currentByte = getUnsignedByte(node);
+        final boolean removing = isNull(newChild);
+
+        if (transitionByte != currentByte)
+        {
+            // Nothing to remove for a transition that is not there. Otherwise the new transition is different, so
+            // we no longer have only one transition and the type must change.
+            return removing ? node : convertChainToSparse(node, currentByte, newChild, transitionByte);
+        }
+
+        // This is still a single path.
+        // Make sure we release the cell if it will no longer be referenced (if we update last reference, the whole
+        // path has to move as the other nodes in this chain can't be remapped).
+        if (offset(node) == LAST_POINTER_OFFSET - 1)
+            recycleCell(node);
+
+        return removing ? NONE : expandOrCreateChainNode(transitionByte, newChild);
+    }
+
+    /// Turn a chain node into a sparse node with two children: the chain's existing transition and the new one.
+    ///
+    /// @param node           the chain node being replaced
+    /// @param existingByte   the transition byte currently stored at `node`
+    /// @param newChild       the child for the new transition
+    /// @param transitionByte the new transition byte, different from `existingByte`
+    /// @return the pointer to the newly created sparse node
+    private int convertChainToSparse(int node, int existingByte, int newChild, int transitionByte)
+    throws TrieSpaceExhaustedException
+    {
+        final int followingPosition = node + 1;
+        final int existingChild;
+        if (offset(followingPosition) == LAST_POINTER_OFFSET)
+        {
+            // The existing child is given by the pointer stored after the character.
+            existingChild = getIntVolatile(followingPosition);
+            // This was a chain with just one transition which will no longer be referenced.
+            // The cell may contain other characters/nodes leading to this, which are also guaranteed to be
+            // unreferenced.
+            // However, these leading nodes may still be in the parent path and will be needed until the
+            // mutation completes.
+            recycleCell(node);
+        }
+        else
+        {
+            // The existing child is the next position in the same cell. The sparse node we will now create
+            // references this cell, so it can't be recycled.
+            existingChild = followingPosition;
+        }
+        return createSparseNode(existingByte, existingChild, transitionByte, newChild);
+    }
+
+    /// Whether a new transition can be attached by writing its byte immediately before the given child, i.e. the
+    /// child is a positive pointer with `newChild - 1` above `NONE`, and its in-cell offset is greater than
+    /// `CHAIN_MIN_OFFSET` and at most `CHAIN_MAX_OFFSET`.
+    private boolean isExpandableChain(int newChild)
+    {
+        final int childOffset = offset(newChild);
+        final boolean hasPrecedingPosition = newChild > 0 && newChild - 1 > NONE;
+        final boolean offsetInChainRange = childOffset > CHAIN_MIN_OFFSET && childOffset <= CHAIN_MAX_OFFSET;
+        return hasPrecedingPosition && offsetInChainRange;
+    }
+
+    /// Create a sparse node with two children.
+    private int createSparseNode(int byte1, int child1, int byte2, int child2) throws TrieSpaceExhaustedException
+    {
+        assert byte1 != byte2 : "Attempted to create a sparse node with two of the same transition";
+
+        // Store the smaller transition in slot 0, i.e. there's always something bigger than child 0 so 0 never is
+        // at the end of the order.
+        final boolean swapped = byte1 > byte2;
+        final int lowByte = swapped ? byte2 : byte1;
+        final int lowChild = swapped ? child2 : child1;
+        final int highByte = swapped ? byte1 : byte2;
+        final int highChild = swapped ? child1 : child2;
+
+        final int sparse = allocateCell() + SPARSE_OFFSET;
+        final int bytesBase = sparse + SPARSE_BYTES_OFFSET;
+        final int childrenBase = sparse + SPARSE_CHILDREN_OFFSET;
+        putByte(bytesBase, (byte) lowByte);
+        putByte(bytesBase + 1, (byte) highByte);
+        putInt(childrenBase, lowChild);
+        putInt(childrenBase + Integer.BYTES, highChild);
+        // Order word "10" in base 6: slot 0 first, then slot 1.
+        putShort(sparse + SPARSE_ORDER_OFFSET, (short) 6);
+        // Note: this does not need a volatile write as it is a new node, returning a new pointer, which needs to be
+        // put in an existing node or the root. That action ends in a happens-before enforcing write.
+        return sparse;
+    }
+
+    /// Creates a chain node holding only the given transition, with an explicit pointer to the given child.
+    /// Call-sites other than [#expandOrCreateChainNode] should not call this directly but go through
+    /// [#expandOrCreateChainNode], to avoid creating inefficient tries with under-utilized chain nodes.
+    private int createNewChainNode(int transitionByte, int newChild) throws TrieSpaceExhaustedException
+    {
+        // The character goes in the position just before the cell's pointer, which receives the child.
+        final int chainNode = allocateCell() + LAST_POINTER_OFFSET - 1;
+        final int pointerPosition = chainNode + 1;
+        putByte(chainNode, (byte) transitionByte);
+        putInt(pointerPosition, newChild);
+        // Note: this does not need a volatile write as it is a new node, returning a new pointer, which needs to be
+        // put in an existing node or the root. That action ends in a happens-before enforcing write.
+        return chainNode;
+    }
+
+    /// Like [#createNewChainNode], but if the new child is already a chain node and has room, expand
+    /// it instead of creating a brand new node.
+ protected int expandOrCreateChainNode(int transitionByte, int newChild) throws TrieSpaceExhaustedException + { + if (isExpandableChain(newChild)) + { + // attach as a new character in child node + int newNode = newChild - 1; + putByte(newNode, (byte) transitionByte); + return newNode; + } + + return createNewChainNode(transitionByte, newChild); + } + + private int createEmptySplitNode() throws TrieSpaceExhaustedException + { + return allocateCell() + SPLIT_OFFSET; + } + + protected int createContentNode(int contentId, int child, boolean isSafeChain) throws TrieSpaceExhaustedException + { + return createPrefixNode(contentId, NONE, child, isSafeChain); + } + + protected int createPrefixNode(int contentId, int alternateBranch, int child, boolean isSafeChain) throws TrieSpaceExhaustedException + { + assert !isLeaf(child) : "Prefix node cannot reference a leaf node."; + assert !isNull(child) || !isNull(alternateBranch) : "Prefix node can only have a null child if it includes an alternate branch."; + + int offset = offset(child); + int node; + if (offset == SPLIT_OFFSET || isSafeChain && offset > (PREFIX_FLAGS_OFFSET + PREFIX_OFFSET) && offset <= CHAIN_MAX_OFFSET) + { + // We can do an embedded prefix node + // Note: for chain nodes we have a risk that the node continues beyond the current point, in which case + // creating the embedded node may overwrite information that is still needed by concurrent readers or the + // mutation process itself. 
+ node = (child & -CELL_SIZE) | PREFIX_OFFSET; + putByte(node + PREFIX_FLAGS_OFFSET, (byte) offset); + } + else + { + // Full prefix node + node = allocateCell() + PREFIX_OFFSET; + putByte(node + PREFIX_FLAGS_OFFSET, (byte) 0xFF); + putInt(node + PREFIX_POINTER_OFFSET, child); + } + + putInt(node + PREFIX_CONTENT_OFFSET, contentId); + putInt(node + PREFIX_ALTERNATE_OFFSET, alternateBranch); + return node; + } + + private int updatePrefixNodeChild(int node, int child, boolean forcedCopy) throws TrieSpaceExhaustedException + { + assert offset(node) == PREFIX_OFFSET : "updatePrefix called on non-prefix node"; + assert !isNullOrLeaf(child) : "Prefix node cannot reference a childless node."; + + // We can only update in-place if we have a full prefix node + if (!isEmbeddedPrefixNode(node)) + { + if (!forcedCopy) + { + // This attaches the child branch and makes it reachable -- the write must be volatile. + putIntVolatile(node + PREFIX_POINTER_OFFSET, child); + } + else + { + node = copyCell(node); + putInt(node + PREFIX_POINTER_OFFSET, child); + } + return node; + } + else + { + // No need to recycle this cell because that is already done by the modification of the child + int contentId = getIntVolatile(node + PREFIX_CONTENT_OFFSET); + int alternateBranch = getIntVolatile(node + PREFIX_ALTERNATE_OFFSET); + return createPrefixNode(contentId, alternateBranch, child, true); + } + } + + protected boolean isEmbeddedPrefixNode(int node) + { + return getUnsignedByte(node + PREFIX_FLAGS_OFFSET) < CELL_SIZE; + } + + /// Copy the content from an existing node, if it has any, to a newly-prepared update for its child. + /// + /// @param existingPreContentNode pointer to the existing node before skipping over content nodes, i.e. 
this is + /// either the same as existingPostContentNode or a pointer to a prefix or leaf node + /// whose child is `existingPostContentNode` + /// @param existingPostContentNode pointer to the existing node being updated, after any content nodes have been + /// skipped and before any modification have been applied; always a non-content node + /// @param updatedPostContentNode is the updated node, i.e. the node to which all relevant modifications have been + /// applied; if the modifications were applied in-place, this will be the same as + /// `existingPostContentNode`, otherwise a completely different pointer; always a non- + /// content node + /// @param forcedCopy whether or not we need to preserve all pre-existing data for concurrent readers + /// @return a node which has the children of updatedPostContentNode combined with the content of + /// `existingPreContentNode` + private int preservePrefix(int existingPreContentNode, + int existingPostContentNode, + int updatedPostContentNode, + boolean forcedCopy) + throws TrieSpaceExhaustedException + { + if (existingPreContentNode == existingPostContentNode) + return updatedPostContentNode; // no content to preserve + + if (existingPostContentNode == updatedPostContentNode) + { + assert !forcedCopy; + return existingPreContentNode; // child didn't change, no update necessary + } + + // else we have existing prefix node, and we need to reference a new child + if (isLeaf(existingPreContentNode)) + { + return createContentNode(existingPreContentNode, updatedPostContentNode, true); + } + + assert offset(existingPreContentNode) == PREFIX_OFFSET : "Unexpected content in non-prefix and non-leaf node."; + if (updatedPostContentNode != NONE || getIntVolatile(existingPreContentNode + PREFIX_ALTERNATE_OFFSET) != NONE) + return updatePrefixNodeChild(existingPreContentNode, updatedPostContentNode, forcedCopy); + else + { + if (!isEmbeddedPrefixNode(existingPreContentNode)) + recycleCell(existingPreContentNode); + // otherwise 
cell is recycled with the post-prefix node + return getIntVolatile(existingPreContentNode + PREFIX_CONTENT_OFFSET); + } + } + + /// Represents the state for an [InMemoryTrie#apply] operation. Contains a stack of all nodes we descended through + /// and used to update the nodes with any new data during ascent. + /// + /// To make this as efficient and GC-friendly as possible, we use an integer array (instead of is an object stack) + /// and we reuse the same object. The latter is safe because memtable tries cannot be mutated in parallel by + /// multiple writers. + static class ApplyState + { + static final int STATE_SIZE = 5; + + int[] data = new int[16 * STATE_SIZE]; + int currentDepth = -1; + + /// Pointer to the existing node before skipping over content nodes, i.e. this is either the same as + /// existingPostContentNode or a pointer to a prefix or leaf node whose child is `existingPostContentNode`. + int existingFullNode() + { + return data[currentDepth * STATE_SIZE + 0]; + } + void setExistingFullNode(int value) + { + data[currentDepth * STATE_SIZE + 0] = value; + } + int existingFullNodeAtDepth(int stackDepth) + { + return data[stackDepth * STATE_SIZE + 0]; + } + + /// Pointer to the existing node being updated, after any content nodes have been skipped and before any + /// modification have been applied. Always a non-content node. + int existingPostContentNode() + { + return data[currentDepth * STATE_SIZE + 1]; + } + void setExistingPostContentNode(int value) + { + data[currentDepth * STATE_SIZE + 1] = value; + } + + /// The updated node, i.e. the node to which the relevant modifications are being applied. This will change as + /// children are processed and attached to the node. After all children have been processed, this will contain + /// the fully updated node (i.e. the union of `existingPostContentNode` and `mutationNode`) without any content, + /// which will be processed separately and, if necessary, attached ahead of this. 
If the modifications were + /// applied in-place, this will be the same as `existingPostContentNode`, otherwise a completely different + /// pointer. Always a non-content node. + int updatedPostContentNode() + { + return data[currentDepth * STATE_SIZE + 2]; + } + void setUpdatedPostContentNode(int value) + { + data[currentDepth * STATE_SIZE + 2] = value; + } + + /// The transition we took on the way down. + int transition() + { + return data[currentDepth * STATE_SIZE + 3]; + } + void setTransition(int transition) + { + data[currentDepth * STATE_SIZE + 3] = transition; + } + int transitionAtDepth(int stackDepth) + { + return data[stackDepth * STATE_SIZE + 3]; + } + int incomingTransition() + { + return transitionAtDepth(currentDepth - 1); + } + + /// The compiled content id. Needed because we can only access a cursor's content on the way down but we can't + /// attach it until we ascend from the node. + int descentPathContentId() + { + return data[currentDepth * STATE_SIZE + 4]; + } + void setDescentPathContentId(int value) + { + data[currentDepth * STATE_SIZE + 4] = value; + } + int descentPathContentIdAtDepth(int stackDepth) + { + return data[stackDepth * STATE_SIZE + 4]; + } + + protected final InMemoryBaseTrie trie; + + ApplyState(InMemoryBaseTrie trie) + { + this.trie = trie; + } + + ApplyState start() + { + return start(trie.root); + } + + ApplyState start(int root) + { + currentDepth = -1; + descendInto(root); + return this; + } + + /// Advance to the given depth and transition. Returns false if the depth signals mutation cursor is exhausted. + boolean advanceTo(int depth, int transition, int forcedCopyDepth) throws TrieSpaceExhaustedException + { + return advanceTo(depth, transition, forcedCopyDepth, 0); + } + /// Advance to the given depth and transition. Returns false if the depth signals mutation cursor is exhausted. 
+ boolean advanceTo(int depth, int transition, int forcedCopyDepth, int ascendLimit) throws TrieSpaceExhaustedException + { + while (currentDepth >= Math.max(ascendLimit + 1, depth)) + { + // There are no more children. Ascend to the parent state to continue walk. + attachAndMoveToParentState(forcedCopyDepth); + } + if (depth <= ascendLimit) + return false; + + // We have a transition, get child to descend into + descend(transition); + return true; + } + + /// Advance to an existing position in the trie or the given limit, whichever comes first. + /// If there is an existing position before this limit, the state will be positioned on it, and true will be + /// returned. If not, we will advance and descend into the given limit position, and return false. + /// + /// The `limitDepth`, `limitTransition` and `limitIsOnReturnPath` parameters specify the limit position. This + /// must be a valid non-exhausted position. + boolean advanceToNextExistingOr(int limitDepth, + int limitTransition, + boolean limitIsOnReturnPath, + int forcedCopyDepth, + int ascendLimit) + throws TrieSpaceExhaustedException + { + assert limitDepth >= ascendLimit; + while (true) + { + int currentTransition = transition(); + int nextTransition = trie.getNextTransition(updatedPostContentNode(), currentTransition + 1); + if (currentDepth + 1 == limitDepth && nextTransition >= limitTransition) + { + descend(limitTransition); + return false; + } + if (nextTransition <= 0xFF) + { + descend(nextTransition); + return true; + } + + if (limitIsOnReturnPath && currentDepth == limitDepth && + (limitDepth == ascendLimit || transitionAtDepth(currentDepth - 1) == limitTransition)) + return false; + + attachAndMoveToParentState(forcedCopyDepth); + } + } + + /// Advance to the next existing position in the trie. 
+ boolean advanceToNextExisting(int forcedCopyDepth, int ascendLimit) + throws TrieSpaceExhaustedException + { + while (true) + { + int currentTransition = transition(); + int nextTransition = trie.getNextTransition(updatedPostContentNode(), currentTransition + 1); + if (nextTransition <= 0xFF) + { + descend(nextTransition); + return true; + } + + if (currentDepth <= ascendLimit) + return false; + + attachAndMoveToParentState(forcedCopyDepth); + } + } + + /// Descend to a child node. Prepares a new entry in the stack for the node. + void descend(int transition) + { + setTransition(transition); + int existingFullNode = trie.getChild(updatedPostContentNode(), transition); + + descendInto(existingFullNode); + } + + private void descendInto(int existingFullNode) + { + ++currentDepth; + if (currentDepth * STATE_SIZE >= data.length) + data = Arrays.copyOf(data, currentDepth * STATE_SIZE * 2); + setExistingFullNode(existingFullNode); + + int existingContentId = NONE; + int existingPostContentNode; + if (isLeaf(existingFullNode)) + { + existingContentId = trie.shouldPresentAfterBranch(existingFullNode) ? 
NONE : existingFullNode; + existingPostContentNode = NONE; + } + else if (offset(existingFullNode) == PREFIX_OFFSET) + { + existingContentId = trie.getIntVolatile(existingFullNode + PREFIX_CONTENT_OFFSET); + existingPostContentNode = trie.followPrefixTransition(existingFullNode); + } + else + existingPostContentNode = existingFullNode; + + setExistingPostContentNode(existingPostContentNode); + setUpdatedPostContentNode(existingPostContentNode); + setDescentPathContentId(existingContentId); + setTransition(-1); + } + + T getDescentPathContent() + { + int contentId = descentPathContentId(); + if (contentId == NONE) + return null; + return trie.getContent(descentPathContentId()); + } + + void setDescentPathContent(T content, boolean forcedCopy) throws TrieSpaceExhaustedException + { + setDescentPathContentId(combineContent(descentPathContentId(), content, false, forcedCopy)); + } + + int combineContent(int existingContentId, T newContent, boolean contentAfterBranch, boolean forcedCopy) throws TrieSpaceExhaustedException + { + if (existingContentId == NONE) + { + assert (newContent != null); // combineContent cannot be called if new is the same as existing + return trie.addContent(newContent, contentAfterBranch); + } + else if (newContent == null) + { + trie.releaseContent(existingContentId); + return NONE; + } + else if (forcedCopy) + { + trie.releaseContent(existingContentId); + return trie.addContent(newContent, contentAfterBranch); + } + else + { + return trie.setContent(existingContentId, newContent); + } + } + + /// Attach a child to the current node. 
+ private void attachChild(int transition, int child, boolean forcedCopy) throws TrieSpaceExhaustedException + { + int updatedPostContentNode = updatedPostContentNode(); + if (isNull(updatedPostContentNode)) + setUpdatedPostContentNode(trie.expandOrCreateChainNode(transition, child)); + else if (forcedCopy) + setUpdatedPostContentNode(trie.attachChildCopying(updatedPostContentNode, + existingPostContentNode(), + transition, + child)); + else + setUpdatedPostContentNode(trie.attachChild(updatedPostContentNode, + transition, + child)); + } + + /// Apply the collected content to a node. If there is content to add, converts `NONE` to a leaf node, and adds + /// or updates a prefix for all others. + protected int applyContent(boolean forcedCopy) + throws TrieSpaceExhaustedException + { + // Note: the old content id itself is already released by setContent. Here we must release any standalone + // prefix nodes that may reference it. + int contentId = descentPathContentId(); + final int updatedPostContentNode = updatedPostContentNode(); + final int existingPreContentNode = existingFullNode(); + final int existingPostContentNode = existingPostContentNode(); + + // applyPrefixChange does not understand leaf nodes, handle upgrade from and to one explicitly. + if (isNull(updatedPostContentNode)) + { + // This node has no children. If the content is metadata that has no meaning if no children exist, + // remove it. + if (!isNull(contentId) && !trie.shouldPreserveWithoutChildren(contentId)) + { + trie.releaseContent(contentId); + contentId = NONE; + } + + if (existingPreContentNode != existingPostContentNode + && !isNullOrLeaf(existingPreContentNode) + && !trie.isEmbeddedPrefixNode(existingPreContentNode)) + trie.recycleCell(existingPreContentNode); + return contentId; // also fine for contentId == NONE + } + + if (isLeaf(existingPreContentNode)) + return contentId != NONE + ? 
trie.createContentNode(contentId, updatedPostContentNode, true) + : updatedPostContentNode; + + return applyPrefixChange(updatedPostContentNode, + existingPreContentNode, + existingPostContentNode, + contentId, + NONE, + forcedCopy); + } + + protected int applyPrefixChange(int updatedPostPrefixNode, + int existingPrePrefixNode, + int existingPostPrefixNode, + int contentId, + int alternateBranch, + boolean forcedCopy) + throws TrieSpaceExhaustedException + { + boolean prefixWasPresent = existingPrePrefixNode != existingPostPrefixNode; + boolean prefixWasEmbedded = prefixWasPresent && trie.isEmbeddedPrefixNode(existingPrePrefixNode); + if (contentId == NONE && alternateBranch == NONE) + { + if (prefixWasPresent && !prefixWasEmbedded) + trie.recycleCell(existingPrePrefixNode); + return updatedPostPrefixNode; + } + + boolean childChanged = updatedPostPrefixNode != existingPostPrefixNode; + boolean dataChanged = !prefixWasPresent || contentId != trie.getIntVolatile(existingPrePrefixNode + PREFIX_CONTENT_OFFSET) + || alternateBranch != trie.getIntVolatile(existingPrePrefixNode + PREFIX_ALTERNATE_OFFSET); + if (!childChanged && !dataChanged) + return existingPrePrefixNode; + + if (forcedCopy) + { + if (!childChanged && prefixWasEmbedded) + { + // If we directly create in this case, we will find embedding is possible and will overwrite the + // previous value. + // We could create a separate metadata node referencing the child, but in that case we'll + // use two nodes while one suffices. Instead, copy the child and embed the new metadata. 
+ updatedPostPrefixNode = trie.copyCell(existingPostPrefixNode); + } + else if (prefixWasPresent && !prefixWasEmbedded) + { + trie.recycleCell(existingPrePrefixNode); + // otherwise cell is already recycled by the recycling of the child + } + return trie.createPrefixNode(contentId, alternateBranch, updatedPostPrefixNode, isNull(existingPostPrefixNode)); + } + + // We can't update in-place if there was no preexisting prefix, or if the + // prefix was embedded and the target node must change. + if (!prefixWasPresent || prefixWasEmbedded && childChanged) + return trie.createPrefixNode(contentId, alternateBranch, updatedPostPrefixNode, isNull(existingPostPrefixNode)); + + // Otherwise modify in place + if (childChanged) // to use volatile write but also ensure we don't corrupt embedded nodes + trie.putIntVolatile(existingPrePrefixNode + PREFIX_POINTER_OFFSET, updatedPostPrefixNode); + if (dataChanged) + { + trie.putIntVolatile(existingPrePrefixNode + PREFIX_CONTENT_OFFSET, contentId); + trie.putIntVolatile(existingPrePrefixNode + PREFIX_ALTERNATE_OFFSET, alternateBranch); + } + return existingPrePrefixNode; + } + + /// After a node's children are processed, this is called to ascend from it. This means applying the collected + /// content to the compiled `updatedPostContentNode` and creating a mapping in the parent to it (or updating if + /// one already exists). + void attachAndMoveToParentState(int forcedCopyDepth) throws TrieSpaceExhaustedException + { + attachBranchAndMoveToParentState(applyContent(currentDepth >= forcedCopyDepth), forcedCopyDepth); + } + + void attachBranchAndMoveToParentState(int updatedFullNode, int forcedCopyDepth) throws TrieSpaceExhaustedException { + int existingFullNode = existingFullNode(); + --currentDepth; + assert currentDepth >= 0; + + if (updatedFullNode != existingFullNode) + attachChild(transition(), updatedFullNode, currentDepth >= forcedCopyDepth); + } + + /// Ascend and update the root at the end of processing. 
+ void attachAndUpdateRoot(int forcedCopyDepth) throws TrieSpaceExhaustedException + { + attachRoot(applyContent(0 >= forcedCopyDepth), forcedCopyDepth); + } + + void attachRoot(int updatedFullNode, int ignoredForcedCopyDepth) + { + int existingFullNode = existingFullNode(); + --currentDepth; + assert trie.root == existingFullNode : "Unexpected change to root. Concurrent trie modification?"; + if (updatedFullNode != existingFullNode) + { + // Only write to root if they are different (value doesn't change, but + // we don't want to invalidate the value in other cores' caches unnecessarily). + trie.root = updatedFullNode; + } + } + + void prepareToWalkBranchAgain() + { + setTransition(-1); + } + + public byte[] getBytes(int startDepth) + { + Preconditions.checkArgument(startDepth >= 0 && startDepth <= currentDepth); + int arrSize = currentDepth - startDepth; + byte[] data = new byte[arrSize]; + int pos = 0; + for (int i = startDepth; i < currentDepth; ++i) + { + int trans = transitionAtDepth(i); + data[pos++] = (byte) trans; + } + return data; + } + + public int getNearestAncestorDepthSatisfying(Predicate shouldStop) + { + return getNearestAncestorDepthSatisfying(shouldStop, currentDepth - 1); + } + + public int getNearestAncestorDepthSatisfying(Predicate shouldStop, int startDepth) + { + int i; + for (i = startDepth; i >= 0; --i) + { + int content = descentPathContentIdAtDepth(i); + if (!isNull(content) && shouldStop.test(trie.getContent(content))) + return i; + } + return -1; + } + + public ByteComparable.Version byteComparableVersion() + { + return trie.byteComparableVersion; + } + + public String toString() + { + if (data == null) + return "uninitialized"; + StringBuilder sb = new StringBuilder(); + sb.append('@'); + for (int i = 0; i < currentDepth; ++i) + sb.append(String.format("%02x", transitionAtDepth(i))); + + sb.append(" existingPostContentNode=").append(existingPostContentNode()); + sb.append(" updatedPostContentNode=").append(updatedPostContentNode()); + 
sb.append(" descentPathContentId=").append(descentPathContentId()); + return sb.toString(); + } + + public InMemoryBaseTrie trie() + { + return trie; + } + } + + + /// Somewhat similar to [Trie.MergeResolver], this encapsulates logic to be applied whenever new content is + /// being upserted into a [InMemoryBaseTrie]. Unlike [Trie.MergeResolver], [UpsertTransformer] will be + /// applied no matter if there's pre-existing content for that trie key/path or not. + /// + /// @param The content type for this [InMemoryBaseTrie]. + /// @param The type of the new content being applied to this [InMemoryBaseTrie]. + public interface UpsertTransformer + { + /// Called when there's content in the updating trie. + /// + /// @param existing Existing content for this key, or null if there isn't any. + /// @param update The update, always non-null. + /// @return The combined value to use. A value of null will delete the existing entry. + T apply(T existing, @Nonnull U update); + } + + /// Interface providing features of the mutating node during mutation done using [InMemoryTrie#apply]. + /// Effectively a subset of the [Cursor] interface which only permits operations that are safe to + /// perform before iterating the children of the mutation node to apply the branch mutation. + /// + /// This is mainly used as an argument to predicates that decide when to copy substructure when modifying tries, + /// which enables different kinds of atomicity and consistency guarantees. + /// + /// See the [InMemoryTrie] javadoc or [InMemoryTrieThreadedTest] for demonstration of the typical usages and what + /// they achieve. + public interface NodeFeatures + { + /// Whether or not the node has more than one descendant. If a checker needs mutations to be atomic, they can + /// return true when this becomes true. + boolean isBranching(); + + /// The metadata associated with the node. If readers need to see a consistent view (i.e. 
where older updates + /// cannot be missed if a new one is presented) below some specified point (e.g. within a partition), the checker + /// should return true when it identifies that point. + T content(); + } + + /// This class includes the common functionality of the various trie mutators. + /// + /// Stores the configured transformers and flags of the operations, and can be reused to apply + /// modifications with the same configuration multiple times. + /// + /// The mutator provides some methods that can be called by the given transformer to obtain information about the + /// state when merging in data. + protected static class Mutator, A extends ApplyState> implements NodeFeatures + { + final UpsertTransformer transformer; + final Predicate> needsForcedCopy; + final A state; + + C mutationCursor; + int forcedCopyDepth; + + Mutator(UpsertTransformer transformer, + Predicate> needsForcedCopy, + A state) + { + this.transformer = transformer; + this.needsForcedCopy = needsForcedCopy; + this.state = state; + } + + Mutator start(int root, C mutationCursor, int initialForcedCopyDepth) + { + mutationCursor.assertFresh(); + + this.mutationCursor = mutationCursor; + this.forcedCopyDepth = initialForcedCopyDepth; + this.state.start(root); + return this; + } + + Mutator start(C mutationCursor) + { + return start(state.trie.root, mutationCursor, Integer.MAX_VALUE); + } + + Mutator apply() throws TrieSpaceExhaustedException + { + int depth = state.currentDepth; + while (true) + { + if (depth < forcedCopyDepth) + forcedCopyDepth = needsForcedCopy.test(this) ? 
depth : Integer.MAX_VALUE; + + applyContent(mutationCursor.content()); + + long position = mutationCursor.advance(); + assert !Cursor.isOnReturnPath(position) : "Return path in forward direction can only be used in range tries."; + depth = Cursor.depth(position); + if (!state.advanceTo(depth, Cursor.incomingTransition(position), forcedCopyDepth)) + break; + assert state.currentDepth == depth : "Unexpected change to applyState. Concurrent trie modification?"; + } + return this; + } + + void applyContent(U content) throws TrieSpaceExhaustedException + { + if (content != null) + { + T existingContent = state.getDescentPathContent(); + T combinedContent = transformer.apply(existingContent, content); + if (combinedContent != existingContent) + state.setDescentPathContent(combinedContent, // can be null + state.currentDepth >= forcedCopyDepth); // this is called at the start of processing + } + } + + + void complete() throws TrieSpaceExhaustedException + { + assert state.currentDepth == 0 : "Unexpected change to applyState. Concurrent trie modification?"; + state.attachAndUpdateRoot(forcedCopyDepth); + } + + @Override + public boolean isBranching() + { + if (Cursor.isOnReturnPath(mutationCursor.encodedPosition())) + return false; + + // This is not very efficient, but we only currently use this option in tests. + // If it's needed for production use, isBranching should be implemented in the cursor interface. + Cursor dupe = mutationCursor.tailCursor(Direction.FORWARD); + long childPosition = dupe.advance(); + return !Cursor.isExhausted(childPosition) && + !Cursor.isExhausted(dupe.skipTo(Cursor.positionForSkippingBranch(childPosition))); + } + + @Override + public U content() + { + return mutationCursor.content(); + } + + /// Return the depth of the currently processed node. + /// + /// This method may be called by the upsert transformer to get information about the current state. 
+ public int currentDepth() + { + return state.currentDepth; + } + + /// Get the bytes of the path leading to this node. The returned array can be safely modified and/or stored. + /// + /// This method may be called by the upsert transformer to get information about the current state. + public byte[] getCurrentKeyBytes() + { + return getCurrentKeyBytes(0); + } + + /// Get the bytes of the path leading to this node from the given depth. + /// The returned array can be safely modified and/or stored. + /// + /// This method may be called by the upsert transformer to get information about the current state. + public byte[] getCurrentKeyBytes(int startDepth) + { + return state.getBytes(startDepth); + } + + /// Get the depth of the nearest ancestor that has content satisfying the given predicate. + /// + /// This method may be called by the upsert transformer to get information about the current state. + public int getNearestAncestorDepthSatisfying(Predicate shouldStop) + { + return state.getNearestAncestorDepthSatisfying(shouldStop); + } + + /// Get the key bytes to the nearest ancestor that has content satisfying the given predicate. + /// + /// This method may be called by the upsert transformer to get information about the current state. + public byte[] getCurrentKeyBytesToNearestAncestorSatisfying(Predicate shouldStop) + { + return state.getBytes(Math.max(0, getNearestAncestorDepthSatisfying(shouldStop))); + } + + public ByteComparable.Version byteComparableVersion() + { + return state.byteComparableVersion(); + } + } + + /// Map-like put method, using a fast recursive implementation through the key bytes. May run into stack overflow if + /// the trie becomes too deep. When the correct position in the trie has been reached, the value will be resolved + /// with the given function before being placed in the trie (even if there's no pre-existing content in this trie). + /// @param key The trie path/key for the given value. 
+ /// @param value The value being put in the memtable trie. Note that it can be of type different than the element + /// type for this memtable trie. It's up to the `transformer` to return the final value that will stay in + /// the memtable trie. + /// @param transformer A function applied to the potentially pre-existing value for the given key, and the new + /// value (of a potentially different type), returning the final value that will stay in the memtable trie. Applied + /// even if there's no pre-existing value in the memtable trie. + public void putRecursive(ByteComparable key, R value, final UpsertTransformer transformer) throws TrieSpaceExhaustedException + { + putRecursive(key, value, false, transformer); + } + + + /// Map-like put method, using a fast recursive implementation through the key bytes. May run into stack overflow if + /// the trie becomes too deep. When the correct position in the trie has been reached, the value will be resolved + /// with the given function before being placed in the trie (even if there's no pre-existing content in this trie). + /// @param key The trie path/key for the given value. + /// @param value The value being put in the memtable trie. Note that it can be of type different than the element + /// type for this memtable trie. It's up to the `transformer` to return the final value that will stay in + /// the memtable trie. + /// @param contentAfterBranch Whether the content should be placed after descendants in forward iteration order. + /// Setting this to true only makes sense with ordered or range tries (i.e. where [#presentContentOnDescentPath] is + /// false). + /// @param transformer A function applied to the potentially pre-existing value for the given key, and the new + /// value (of a potentially different type), returning the final value that will stay in the memtable trie. Applied + /// even if there's no pre-existing value in the memtable trie. 
+ public void putRecursive(ByteComparable key, R value, boolean contentAfterBranch, final UpsertTransformer transformer) throws TrieSpaceExhaustedException + { + assert !(contentAfterBranch && presentContentOnDescentPath) : "After branch placement only makes sense with range or ordered tries"; + try + { + int newRoot = putRecursive(root, key.asComparableBytes(byteComparableVersion), value, contentAfterBranch, transformer); + if (newRoot != root) + root = newRoot; + completeMutation(); + } + catch (Throwable t) + { + abortMutation(); + throw t; + } + } + + private int putRecursive(int node, ByteSource key, R value, boolean contentAfterBranch, final UpsertTransformer transformer) throws TrieSpaceExhaustedException + { + int transition = key.next(); + if (transition == ByteSource.END_OF_STREAM) + return applyContent(node, value, contentAfterBranch, transformer); + + int child = getChild(node, transition); + + int newChild = putRecursive(child, key, value, contentAfterBranch, transformer); + if (newChild == child) + return node; + + int skippedContent = followPrefixTransition(node); + int attachedChild = !isNull(skippedContent) + ? 
attachChild(skippedContent, transition, newChild) // Single path, no copying required + : expandOrCreateChainNode(transition, newChild); + + return preservePrefix(node, skippedContent, attachedChild, false); + } + + private int applyContent(int node, R value, boolean contentAfterBranch, UpsertTransformer transformer) throws TrieSpaceExhaustedException + { + if (isNull(node)) + return addContent(transformer.apply(null, value), contentAfterBranch); + + if (isLeaf(node)) + { + int contentId = node; + + if (contentAfterBranch == shouldPresentAfterBranch(node)) + { + T existingContent = getContent(contentId); + T newContent = transformer.apply(existingContent, value); + + if (newContent == existingContent) + return contentId; + if (newContent != null) + { + return setContent(contentId, newContent); + } + else + { + releaseContent(contentId); + return NONE; + } + } + else + { + T newContent = transformer.apply(null, value); + + // We already have content, but we also need to add content on the other side of the branch. + if (newContent == null) + return contentId; // we are not adding anything, leave existing node. + + // Convert this to prefix node to be able to store both. + return createPrefixNode(contentId, addContent(newContent, contentAfterBranch), NONE, false); + } + } + + if (offset(node) == PREFIX_OFFSET) + { + int contentOffset = contentAfterBranch ? PREFIX_ALTERNATE_OFFSET : PREFIX_CONTENT_OFFSET; + int contentId = getIntVolatile(node + contentOffset); + + assert isNullOrLeaf(contentId) : "Content after branch cannot be used toghether with alternate branch"; + T existingContent = isNull(contentId) ? 
null : getContent(contentId); + T newContent = transformer.apply(existingContent, value); + if (newContent == existingContent) + return node; + + if (newContent != null) + { + if (!isNull(contentId)) + { + int newId = setContent(contentId, newContent); + if (newId != contentId) + putIntVolatile(node + contentOffset, newId); + } + else + putIntVolatile(node + contentOffset, addContent(newContent, contentAfterBranch)); + return node; + } + else + { + releaseContent(contentId); + int otherContentOffset = contentAfterBranch ? PREFIX_CONTENT_OFFSET : PREFIX_ALTERNATE_OFFSET; + + if (!isNull(getIntVolatile(node + otherContentOffset))) + { + // keep the prefix node for the other content / alternate path + putIntVolatile(node + contentOffset, NONE); + return node; + } + + int b = getUnsignedByte(node + PREFIX_FLAGS_OFFSET); + if (b < CELL_SIZE) + { + // embedded prefix node + return node - PREFIX_OFFSET + b; + } + else + { + // separate prefix node. recycle it as it's no longer needed + recycleCell(node); + return getIntVolatile(node + PREFIX_POINTER_OFFSET); + } + } + } + + T newContent = transformer.apply(null, value); + if (newContent == null) + return node; + else + return createContentNode(addContent(newContent, contentAfterBranch), node, false); + } + + void completeMutation() + { + bufferManager.completeMutation(); + contentManager.completeMutation(); + } + + void abortMutation() + { + bufferManager.abortMutation(); + contentManager.abortMutation(); + } + + /// Returns true if the allocation threshold has been reached. To be called by the writing thread (ideally, just + /// after the write completes). When this returns true, the user should switch to a new trie as soon as feasible. + /// + /// The trie expects up to 10% growth above this threshold. Any growth beyond that may be done inefficiently, and + /// the trie will fail altogether when the size grows beyond 2G - 256 bytes.
+ public boolean reachedAllocatedSizeThreshold() + { + return bufferManager.reachedAllocatedSizeThreshold(); + } + + protected abstract long emptySizeOnHeap(); + + /// Returns the off heap size of the memtable trie itself, not counting any space taken by referenced content, or + /// any space that has been allocated but is not currently in use (e.g. recycled cells or preallocated buffer). + /// The latter means we are undercounting the actual usage, but the purpose of this reporting is to decide when + /// to flush out e.g. a memtable and if we include the unused space we would almost always end up flushing out + /// immediately after allocating a large buffer and not having a chance to use it. Counting only used space makes it + /// possible to flush out before making these large allocations. + public long usedSizeOffHeap() + { + return contentManager.usedSizeOffHeap() + bufferManager.usedSizeOffHeap(); + } + + /// Returns the on heap size of the memtable trie itself, not counting any space taken by referenced content, or + /// any space that has been allocated but is not currently in use (e.g. recycled cells or preallocated buffer). + /// The latter means we are undercounting the actual usage, but the purpose of this reporting is to decide when + /// to flush out e.g. a memtable and if we include the unused space we would almost always end up flushing out + /// immediately after allocating a large buffer and not having a chance to use it. Counting only used space makes it + /// possible to flush out before making these large allocations. + public long usedSizeOnHeap() + { + return emptySizeOnHeap() + + contentManager.usedSizeOnHeap() + + bufferManager.usedSizeOnHeap(); + } + + @VisibleForTesting + public long usedBufferSpace() + { + return bufferManager.usedBufferSpace(); + } + + /// Returns the amount of memory that has been allocated for various buffers but isn't currently in use. 
+ /// The total on-heap space used by the trie is `usedSizeOnHeap() + unusedReservedOnHeapMemory()`. + @VisibleForTesting + public long unusedReservedOnHeapMemory() + { + return bufferManager.unusedReservedOnHeapMemory() + contentManager.unusedReservedOnHeapMemory(); + } + + /// Release all recycled content references, including the ones waiting in still incomplete recycling lists. + /// This is a test method and can cause null pointer exceptions if used on a live trie. + /// + /// If similar functionality is required for non-test purposes, a version of this should be developed that only + /// releases references on barrier-complete lists. + @VisibleForTesting + public void releaseReferencesUnsafe() + { + contentManager.releaseReferencesUnsafe(); + } + + /// Returns the number of values in the trie + public int valuesCount() + { + return contentManager.valuesCount(); + } +} diff --git a/src/java/org/apache/cassandra/db/tries/InMemoryDeletionAwareTrie.java b/src/java/org/apache/cassandra/db/tries/InMemoryDeletionAwareTrie.java new file mode 100644 index 000000000000..27fcc01410b3 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/InMemoryDeletionAwareTrie.java @@ -0,0 +1,703 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Predicate; + +import org.apache.cassandra.io.compress.BufferType; +import org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.concurrent.OpOrder; + +/// In-memory implementation of deletion-aware tries with concurrent access support. +/// +/// This class provides a concrete implementation of [DeletionAwareTrie] that stores both live data +/// and deletion information in a unified in-memory structure. It extends [InMemoryBaseTrie] to +/// inherit the efficient cell-based memory management, concurrent access patterns, and performance +/// optimizations of the base trie implementation. +/// +/// This class stores deletion branches in the "alternate branch" field of prefix nodes. All the +/// machinery to support this is already provided by [InMemoryBaseTrie]. This class implements the +/// relevant cursor and mutation methods. +/// +/// See [InMemoryTrie] for information on the consistency model. +/// +/// @param The content type for live data stored in the trie +/// @param The deletion marker type, must extend [RangeState] for range operations +public class InMemoryDeletionAwareTrie> +extends InMemoryBaseTrie implements DeletionAwareTrie +{ + // constants for space calculations + private static final long EMPTY_SIZE_ON_HEAP; + private static final long EMPTY_SIZE_OFF_HEAP; + static + { + // Measuring the empty size of long-lived tries, because these are the ones for which we want to track size. 
+ InMemoryBaseTrie empty = new InMemoryDeletionAwareTrie<>(ByteComparable.Version.OSS50, null, BufferType.ON_HEAP, ExpectedLifetime.LONG, null); + EMPTY_SIZE_ON_HEAP = ObjectSizes.measureDeep(empty); + empty = new InMemoryDeletionAwareTrie<>(ByteComparable.Version.OSS50, null, BufferType.OFF_HEAP, ExpectedLifetime.LONG, null); + EMPTY_SIZE_OFF_HEAP = ObjectSizes.measureDeep(empty); + } + + InMemoryDeletionAwareTrie(ByteComparable.Version byteComparableVersion, Predicate shouldPreserveContentWithoutChildren, BufferType bufferType, ExpectedLifetime lifetime, OpOrder opOrder) + { + super(byteComparableVersion, true, shouldPreserveContentWithoutChildren, bufferType, lifetime, opOrder); + } + + InMemoryDeletionAwareTrie(ByteComparable.Version version, BufferManager bufferManager, ContentManager contentManager) + { + super(version, true, bufferManager, contentManager); + } + + public static > + InMemoryDeletionAwareTrie shortLived(ByteComparable.Version byteComparableVersion) + { + return new InMemoryDeletionAwareTrie<>(byteComparableVersion, null, BufferType.ON_HEAP, ExpectedLifetime.SHORT, null); + } + + /// Create a short-lived on-heap in-memory deletion-aware trie, where content that has no children and fails the + /// `shouldPreserveContentWithoutChildren` predicate is removed. + /// This is used to clean up dangling metadata that has no meaning when its branch is empty. 
+ public static > + InMemoryDeletionAwareTrie shortLived(ByteComparable.Version byteComparableVersion, Predicate shouldPreserveContentWithoutChildren) + { + return new InMemoryDeletionAwareTrie<>(byteComparableVersion, shouldPreserveContentWithoutChildren, BufferType.ON_HEAP, ExpectedLifetime.SHORT, null); + } + + public static > + InMemoryDeletionAwareTrie shortLived(ByteComparable.Version byteComparableVersion, BufferType bufferType) + { + return new InMemoryDeletionAwareTrie<>(byteComparableVersion, null, bufferType, ExpectedLifetime.SHORT, null); + } + + public static > + InMemoryDeletionAwareTrie longLived(ByteComparable.Version byteComparableVersion, OpOrder opOrder) + { + return longLived(byteComparableVersion, BufferType.OFF_HEAP, opOrder); + } + + public static > + InMemoryDeletionAwareTrie longLived(ByteComparable.Version byteComparableVersion, BufferType bufferType, OpOrder opOrder) + { + return new InMemoryDeletionAwareTrie<>(byteComparableVersion, null, bufferType, ExpectedLifetime.LONG, opOrder); + } + + /// Create a long-lived in-memory deletion-aware trie, where data is stored in trie cells using the given + /// `contentSerializer`. 
+ public static > + InMemoryDeletionAwareTrie longLived(ByteComparable.Version byteComparableVersion, BufferType bufferType, OpOrder opOrder, ContentSerializer contentSerializer) + { + BufferManagerMultibuf bufferManager = new BufferManagerMultibuf(bufferType, ExpectedLifetime.LONG, opOrder); + ContentManager contentManager = new ContentManagerBytes<>(contentSerializer, bufferManager); + return new InMemoryDeletionAwareTrie<>(byteComparableVersion, bufferManager, contentManager); + } + + static class DeletionAwareInMemoryCursor> + extends InMemoryCursor implements DeletionAwareCursor + { + DeletionAwareInMemoryCursor(InMemoryBaseTrie trie, Direction direction, int root) + { + super(trie, direction, root); + } + + @Override + public T content() + { + return content; + } + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + int alternateBranch = trie.getAlternateBranch(currentFullNode); + return ((InMemoryDeletionAwareTrie) trie).makeRangeCursor(direction, alternateBranch); + } + + @Override + public DeletionAwareCursor tailCursor(Direction direction) + { + return new DeletionAwareInMemoryCursor<>((InMemoryDeletionAwareTrie) trie, + direction, + currentFullNode); + } + } + + @SuppressWarnings({"unchecked", "rawtypes"}) + private RangeCursor makeRangeCursor(Direction direction, int alternateBranch) { + return isNull(alternateBranch) + ? null + : new InMemoryRangeTrie.InMemoryRangeCursor<>((InMemoryReadTrie) this, direction, alternateBranch); + } + + //noinspection ClassEscapesDefinedScope + @Override + public DeletionAwareInMemoryCursor makeCursor(Direction direction) + { + return new DeletionAwareInMemoryCursor<>(this, direction, root); + } + + protected long emptySizeOnHeap() + { + return bufferManager.bufferType() == BufferType.ON_HEAP ? 
EMPTY_SIZE_ON_HEAP : EMPTY_SIZE_OFF_HEAP; + } + + static class ApplyState> extends InMemoryBaseTrie.ApplyState + { + int alternateBranchToAttach = NONE; + + ApplyState(InMemoryDeletionAwareTrie trie) + { + super(trie); + } + + ApplyState start() + { + return start(trie.root); + } + + ApplyState start(int root) + { + return (ApplyState) super.start(root); + } + + public int alternateBranch() + { + return trie.getAlternateBranch(existingFullNode()); + } + + @Override + protected int applyContent(boolean forcedCopy) throws TrieSpaceExhaustedException + { + if (alternateBranchToAttach != NONE) + { + int alternateBranch = alternateBranchToAttach; + alternateBranchToAttach = NONE; + return applyContentWithAlternateBranch(alternateBranch, forcedCopy); + } + else + return super.applyContent(forcedCopy); + } + + /// Apply the collected content and alternate branch to a node, when it is known that the node contains an + /// alternate branch. This will create or update a prefix node to reflect the new alternate branch pointer. + int applyContentWithAlternateBranch(int alternateBranch, boolean forcedCopy) throws TrieSpaceExhaustedException + { + int contentId = descentPathContentId(); + final int updatedPostContentNode = updatedPostContentNode(); + final int existingPreContentNode = existingFullNode(); + final int existingPostContentNode = existingPostContentNode(); + if (isNull(updatedPostContentNode) && !isNull(contentId) && !trie.shouldPreserveWithoutChildren(contentId)) + { + trie.releaseContent(contentId); + contentId = NONE; + } + + // applyPrefixChange does not understand leaf nodes, handle upgrade from one explicitly. + if (isLeaf(existingPreContentNode)) + return trie.createPrefixNode(contentId, alternateBranch, updatedPostContentNode, true); + + return applyPrefixChange(updatedPostContentNode, + existingPreContentNode, + existingPostContentNode, + contentId, + alternateBranch, + forcedCopy); + } + } + + /// Reused storage for the state of application of mutations. 
This stores the backtracking path, including changes + /// already applied (e.g. new version of a node that is not yet linked to the current trie) and some that are yet + /// to be applied (e.g. updated content). + /// + /// This state is used for the data part of the trie (i.e. excluding deletion branches). This includes machinery to + /// attach deletion branches to nodes in the data trie. + /// + /// Because in-memory tries are single-writer, we can reuse a single state array for all updates. The updates are + /// serialized and thus no other thread can corrupt this state (note that this is not the factor enforcing the + /// single writer policy, and since we are already bound to it there is no cost involved in reusing this state array). + final private ApplyState applyState = new ApplyState<>(this); + + /// Reused storage for the state of application of deletions (i.e. merging deletion branches into this trie). + /// Mutations switch to using this state object (leaving the [#applyState] to store the state leading up to the + /// deletion branch) whenever we switch to using [InMemoryRangeTrie] methods to apply deletions. + /// + /// This treats this trie's data buffers as a range trie and uses an unchecked cast to treat the deletion branches as + /// containing only deletion states of type `D`. + @SuppressWarnings("unchecked") + final InMemoryRangeTrie.ApplyState deletionState = new InMemoryRangeTrie.ApplyState<>((InMemoryBaseTrie) this); + + /// Deletion-aware trie mutator, binding the trie with a merge configuration (i.e. transformers and predicates). + /// Can be used to apply multiple modifications to the trie using [#apply(DeletionAwareTrie)].
+ public class Mutator> + extends InMemoryBaseTrie.Mutator, ApplyState> + { + final BiFunction insertedDeleter; + final boolean deletionsAtFixedPoints; + final InMemoryRangeTrie.MutatorStatic deletionMutator; + final InMemoryTrie.RangeMutator deleter; + + /// See [InMemoryDeletionAwareTrie#mutator(UpsertTransformer, UpsertTransformer, UpsertTransformer, BiFunction, boolean, Predicate, Predicate)] + /// for the meaning of the parameters. + Mutator(UpsertTransformer dataTransformer, + UpsertTransformer deletionTransformer, + UpsertTransformer existingDeleter, + BiFunction insertedDeleter, + Predicate> needsForcedCopyInData, + Predicate> needsForcedCopyInDeletionBranch, + boolean deletionsAtFixedPoints) + { + super(dataTransformer, needsForcedCopyInData, applyState); + this.deletionMutator = new InMemoryRangeTrie.MutatorStatic<>(deletionState, + deletionTransformer, + needsForcedCopyInDeletionBranch); + this.deleter = new InMemoryTrie.RangeMutator<>(applyState, existingDeleter, needsForcedCopyInDeletionBranch); + this.insertedDeleter = insertedDeleter; + this.deletionsAtFixedPoints = deletionsAtFixedPoints; + } + + Mutator start(DeletionAwareCursor mutationCursor) + { + super.start(new DeletionAwareMergeSource<>(insertedDeleter, mutationCursor)); + return this; + } + + @Override + Mutator apply() throws TrieSpaceExhaustedException + { + int depth = state.currentDepth; + while (true) + { + if (depth < forcedCopyDepth) + forcedCopyDepth = needsForcedCopy.test(this) ? depth : Integer.MAX_VALUE; + + // Content must be applied before descending into the branch to make sure we call the transformers + // in the right order. 
+ applyContent(mutationCursor.content()); + + int existingAlternateBranch = state.alternateBranch(); + RangeCursor incomingAlternateBranch = mutationCursor.deletionBranchCursor(Direction.FORWARD); + long position; + if (incomingAlternateBranch != null || existingAlternateBranch != NONE) + { + int updatedAlternateBranch = existingAlternateBranch; + RangeCursor ourDeletionBranch; + if (!deletionsAtFixedPoints && existingAlternateBranch == NONE && state.existingFullNode() != NONE) + { + // Move any covered deletion branches up to this depth so that we can correctly merge the + // incoming deletions. + updatedAlternateBranch = hoistOurDeletionBranches(); + } + ourDeletionBranch = ((InMemoryDeletionAwareTrie) state.trie()).makeRangeCursor(Direction.FORWARD, updatedAlternateBranch); + + if (!deletionsAtFixedPoints && incomingAlternateBranch == null) + { + // The incoming cursor has no deletions here, but it may have some below this point. + // Switch to deletion branch to transform them to be rooted here. + // (Note: this will cause a lot of processing of unproductive branches.) + incomingAlternateBranch = new DeletionAwareCursor.DeletionsTrieCursor<>(mutationCursor.tailCursor(Direction.FORWARD)); + } + + if (incomingAlternateBranch != null) + { + // Duplicate cursor as we need it for both deletion and data branches. + RangeCursor deletionBranch = incomingAlternateBranch.tailCursor(Direction.FORWARD); + + // Delete data that is covered by the new deletions. + applyDeletions(incomingAlternateBranch); + + // Merge the deletions into our deletion branch. + updatedAlternateBranch = mergeDeletionBranch(updatedAlternateBranch, deletionBranch); + } + + // Continue processing to also insert the incoming data at this branch. + // Note that this will also apply the incoming content to this node and advance the mutation cursor + // to the position after this branch. + applyDataUnderDeletion(ourDeletionBranch); + + // Ascend and apply alternate branch. 
+ state.alternateBranchToAttach = updatedAlternateBranch; + if (state.currentDepth == 0) + break; // to be attached to root by complete() + state.attachAndMoveToParentState(forcedCopyDepth); + position = mutationCursor.encodedPosition(); + } + else + position = mutationCursor.advance(); + + assert !Cursor.isOnReturnPath(position) : "Return path in forward direction can only be used in range tries."; + depth = Cursor.depth(position); + if (!state.advanceTo(depth, Cursor.incomingTransition(position), forcedCopyDepth)) + break; + assert state.currentDepth == depth : "Unexpected change to applyState. Concurrent trie modification?"; + } + return this; + } + + private void applyDataUnderDeletion(RangeCursor ourDeletionBranch) throws TrieSpaceExhaustedException + { + // Add our deletion to DeletionAwareMergeSource to apply them to incoming data. + if (ourDeletionBranch != null) + mutationCursor.addDeletions(ourDeletionBranch); + int initialDepth = state.currentDepth; + + // The first forcedCopyDepth and applyContent are already called. + long position = mutationCursor.advance(); + int depth = Cursor.depth(position); + assert !Cursor.isOnReturnPath(position) : "Return path in forward direction can only be used in range tries."; + + // Below is the same as the main loop in `apply`, slightly rearranged and ignoring deletion branches. + while (state.advanceTo(depth, Cursor.incomingTransition(position), forcedCopyDepth, initialDepth)) + { + assert state.currentDepth == depth : "Unexpected change to applyState. Concurrent trie modification?"; + + if (depth < forcedCopyDepth) + forcedCopyDepth = needsForcedCopy.test(this) ? depth : Integer.MAX_VALUE; + + applyContent(mutationCursor.content()); + position = mutationCursor.advance(); + depth = Cursor.depth(position); + } + assert state.currentDepth == initialDepth; + } + + private void applyDeletions(RangeCursor incomingAlternateBranch) throws TrieSpaceExhaustedException + { + // Apply the deletion branch to our data. 
+ deleter.continueFromCurrentState(incomingAlternateBranch, forcedCopyDepth).apply(); + + // Make sure the data pass that follows walks the updated branch. + state.prepareToWalkBranchAgain(); + } + + private int mergeDeletionBranch(int existingAlternateBranch, RangeCursor deletionBranch) throws TrieSpaceExhaustedException + { + // If forced copying is in force, apply it to the deletion branch too. + int deletionForcedCopyDepth = forcedCopyDepth <= state.currentDepth ? 0 : Integer.MAX_VALUE; + deletionMutator.start(existingAlternateBranch, deletionBranch, deletionForcedCopyDepth).apply(); + + return deletionMutator.completeBranch(); + } + + private int hoistOurDeletionBranches() throws TrieSpaceExhaustedException + { + // Walk all of our data branch and build new branches corresponding to it. When we reach a deletion + // branch, link it. If a branch is walked without finding a deletion branch, the returned NONEs should + // propagate up. + // We need to walk both the deletion-aware/data trie, as well as the deletion branch being built, so that + // the existing deletion branch mappings can be removed. + InMemoryRangeTrie.ApplyState deletionState = deletionMutator.state; + deletionState.start(NONE); + int initialDepth = state.currentDepth; + + int depth = state.currentDepth; + while (true) + { + if (depth < forcedCopyDepth) + forcedCopyDepth = needsForcedCopy.test(this) ? depth : Integer.MAX_VALUE; + + int existingAlternateBranch = state.alternateBranch(); + if (existingAlternateBranch != NONE) + { + deletionState.attachBranchAndMoveToParentState(existingAlternateBranch, forcedCopyDepth); + // Drop the existing alternate branch from the main state and ascend. + // The normal applyContent() method uses alternate branch value of NONE. 
+ state.attachAndMoveToParentState(forcedCopyDepth); + } + + if (!state.advanceToNextExisting(forcedCopyDepth, initialDepth)) + break; + depth = state.currentDepth; + deletionState.advanceTo(depth - initialDepth, state.incomingTransition(), forcedCopyDepth - initialDepth); + } + if (deletionState.currentDepth > 0) + deletionState.advanceTo(-1, -1, forcedCopyDepth - initialDepth); + + // Make sure the walks over the data branch that follow use the updated branch. + state.prepareToWalkBranchAgain(); + return deletionState.applyContent(forcedCopyDepth >= initialDepth); + } + + /// Modify this trie to apply the mutation given in the form of a trie. Any content in the mutation will be resolved + /// with the given function before being placed in this trie (even if there's no pre-existing content in this trie). + /// All of the deletions in the given mutation trie will be applied, removing any content and trie paths that become + /// empty as a result of the deletions and releasing any of the trie cells that they occupied. The deletion branches + /// of the trie will be combined with the incoming deletions. + /// + /// @param mutation the mutation to be applied, given in the form of a trie. Note that its content can be of type + /// different than the element type for this memtable trie. + /// @see DeletionAwareTrie.MergeResolver#deletionsAtFixedPoints + public + void apply(DeletionAwareTrie mutation) + throws TrieSpaceExhaustedException + { + // TODO: track hasDeletions and do plain Trie merges if neither this nor mutation has deletions. + try + { + start(mutation.cursor(Direction.FORWARD)).apply().complete(); + completeMutation(); + } + catch (Throwable t) + { + abortMutation(); + throw t; + } + } + + /// Modify this trie to apply the given deletion branch. This has the same effect as applying a deletion-aware + /// trie that contains only the given range trie as a deletion branch at its root. 
+ /// All deletions in the given mutation trie will be applied, removing any content and trie paths that become + /// empty as a result of the deletions and releasing any of the trie cells that they occupied. The deletion + /// branches of the trie will be combined with the incoming deletions, hoisting the deletion branch to the root + /// of the trie if necessary. + /// + /// @param mutation the mutation to be applied, given in the form of a trie. Note that its content can be of type + /// different than the element type for this memtable trie. + public + void delete(RangeTrie mutation) + throws TrieSpaceExhaustedException + { + DeletionAwareCursor mutationCursor = new SingletonCursor.DeletionBranch<>(Direction.FORWARD, + ByteSource.EMPTY, + byteComparableVersion, + mutation); + + try + { + start(mutationCursor).apply().complete(); + completeMutation(); + } + catch (Throwable t) + { + abortMutation(); + throw t; + } + } + + /// Get the bytes of the key in the deletion branch. This will not include the portion of the trie path leading + /// to the deletion branch. + /// + /// This method may be called by `deletionTransformer` to get information about the current state. + public byte[] getDeletionBranchKeyBytes() + { + return deletionMutator.getCurrentKeyBytes(); + } + + /// Get the bytes of the key in the deletion branch from the given depth, which must be obtained using + /// [#getDeletionBranchDepth()]. The returned array can be safely modified and/or stored. + /// + /// This method may be called by `deletionTransformer` to get information about the current state. + public byte[] getDeletionBranchKeyBytes(int startDepth) + { + return deletionMutator.getCurrentKeyBytes(startDepth); + } + + /// Return the depth of the currently processed node in the deletion branch. + /// + /// This method may be called by the upsert transformer to get information about the current state. 
+ public int getDeletionBranchDepth() + { + return deletionMutator.currentDepth(); + } + } + + /// Creates a trie mutator that can be used to apply multiple modifications to the trie. + /// + /// @param dataTransformer a function applied to the potentially pre-existing value for the given key, and the new + /// value. Applied even if there's no pre-existing value in the memtable trie. The transformer can return null + /// if the entry should not be added or preserved. + /// @param deletionTransformer a function applied to combine overlapping deletions into a consistent view. Called + /// even if there is no pre-existing deletion to convert the marker type. The transformer can return null if + /// deletions cancel out or should not be preserved. + /// **Note: for code simplicity this transformer is provided only the path to the root of the deletion branch.** + /// @param existingDeleter a function used to apply a deletion marker to potentially delete live data. This is + /// only called if there is both content and deletion at a given covered point. It should return null if the entry + /// is to be deleted. + /// @param insertedDeleter a function used to filter incoming entries that are covered by existing deletions + /// in this trie, called only if both an entry and a deletion apply to a given point. This function is not provided + /// with a path to the modified data. + /// @param deletionsAtFixedPoints True if deletion branches are at predetermined positions. See + /// [DeletionAwareTrie.MergeResolver#deletionsAtFixedPoints]. + /// @param needsForcedCopyInData a predicate which decides when to fully copy a branch to provide atomicity + /// guarantees to concurrent readers, applied in data branches. See [NodeFeatures] for details. + /// @param needsForcedCopyInDeletions a predicate which decides when to fully copy a branch to provide atomicity + /// guarantees to concurrent readers, applied in deletion branches. See [NodeFeatures] for details. 
+ public > + Mutator mutator(final UpsertTransformer dataTransformer, + final UpsertTransformer deletionTransformer, + final UpsertTransformer existingDeleter, + final BiFunction insertedDeleter, + boolean deletionsAtFixedPoints, + Predicate> needsForcedCopyInData, + Predicate> needsForcedCopyInDeletions) + { + return new Mutator<>(dataTransformer, + deletionTransformer, + existingDeleter, + insertedDeleter, + needsForcedCopyInData, + needsForcedCopyInDeletions, + deletionsAtFixedPoints); + } + + + /// Creates a trie mutator that can be used to apply multiple modifications to the trie, using a single forced-copy + /// predicate for both data and deletion branches. + /// Any content in an applied mutation will be resolved with the given function before being placed in this trie + /// (even if there's no pre-existing content in this trie). All of the deletions in an applied mutation trie will be + /// applied, removing any content and trie paths that become empty as a result of the deletions and releasing any of + /// the trie cells that they occupied. The deletion branches of the trie will be combined with the incoming deletions. + /// + /// @param dataTransformer a function applied to the potentially pre-existing value for the given key, and the new + /// value. Applied even if there's no pre-existing value in the memtable trie. The transformer can return null + /// if the entry should not be added or preserved. + /// @param deletionTransformer a function applied to combine overlapping deletions into a consistent view. Called + /// even if there is no pre-existing deletion to convert the marker type. The transformer can return null if + /// deletions cancel out or should not be preserved. + /// **Note: for code simplicity this transformer is provided only the path to the root of the deletion branch.** + /// @param existingDeleter a function used to apply a deletion marker to potentially delete live data. This is + /// only called if there is both content and deletion at a given covered point. It should return null if the entry + /// is to be deleted.
+ /// @param insertedDeleter a function used to filter incoming entries that are covered by existing deletions + /// in this trie, called only if both an entry and a deletion apply to a given point. This function is not provided + /// with a path to the modified data. + /// @param deletionsAtFixedPoints True if deletion branches are at predetermined positions. See + /// [DeletionAwareTrie.MergeResolver#deletionsAtFixedPoints]. + /// @param needsForcedCopy a predicate which decides when to fully copy a branch to provide atomicity + /// guarantees to concurrent readers, applied in both data and deletion branches. See [NodeFeatures] for details. + @SuppressWarnings({"rawtypes", "unchecked"}) + public > + Mutator mutator(final UpsertTransformer dataTransformer, + final UpsertTransformer deletionTransformer, + final UpsertTransformer existingDeleter, + final BiFunction insertedDeleter, + boolean deletionsAtFixedPoints, + Predicate> needsForcedCopy) + { + return mutator(dataTransformer, + deletionTransformer, + existingDeleter, + insertedDeleter, + deletionsAtFixedPoints, + (Predicate) needsForcedCopy, + (Predicate) needsForcedCopy); + } + + /// Modify this trie to apply the mutation given in the form of a trie. Any content in the mutation will be resolved + /// with the given function before being placed in this trie (even if there's no pre-existing content in this trie). + /// All of the deletions in the given mutation trie will be applied, removing any content and trie paths that become + /// empty as a result of the deletions and releasing any of the trie cells that they occupied. The deletion branches + /// of the trie will be combined with the incoming deletions. + /// + /// @param mutation the mutation to be applied, given in the form of a trie. Note that its content can be of type + /// different than the element type for this memtable trie. 
+ /// @param dataTransformer a function applied to the potentially pre-existing value for the given key, and the new + /// value. Applied even if there's no pre-existing value in the memtable trie. The transformer can return null + /// if the entry should not be added or preserved. + /// @param deletionTransformer a function applied to combine overlapping deletions into a consistent view. Called + /// even if there is no pre-existing deletion to convert the marker type. The transformer can return null if + /// deletions cancel out or should not be preserved. + /// **Note: for code simplicity this transformer is provided only the path to the root of the deletion branch.** + /// @param existingDeleter a function used to apply a deletion marker to potentially delete live data.
This is + /// only called if there is both content and deletion at a given covered point. It should return null if the entry + /// is to be deleted. + /// @param insertedDeleter a function used to filter incoming entries that are covered by existing deletions + /// in this trie, called only if both an entry and a deletion apply to a given point. This function is not provided + /// with a path to the modified data. + /// @param deletionsAtFixedPoints True if deletion branches are at predetermined positions. See + /// [DeletionAwareTrie.MergeResolver#deletionsAtFixedPoints]. + /// @param needsForcedCopy a predicate which decides when to fully copy a branch to provide atomicity + /// guarantees to concurrent readers, applied in both data and deletion branches. See [NodeFeatures] for details. + public > + void apply(DeletionAwareTrie mutation, + final UpsertTransformer dataTransformer, + final UpsertTransformer deletionTransformer, + final UpsertTransformer existingDeleter, + final BiFunction insertedDeleter, + boolean deletionsAtFixedPoints, + Predicate> needsForcedCopy) + throws TrieSpaceExhaustedException + { + // TODO: track hasDeletions and do plain Trie merges if neither this nor mutation has deletions. + mutator(dataTransformer, + deletionTransformer, + existingDeleter, + insertedDeleter, + deletionsAtFixedPoints, + needsForcedCopy) + .apply(mutation); + } + + private class DumpCursor extends InMemoryReadTrie.DumpCursor> implements DeletionAwareCursor + { + DumpCursor(DeletionAwareInMemoryCursor source, Function contentToString) + { + super(source, contentToString); + } + + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + return source.deletionBranchCursor(direction); + } + + @Override + public DumpCursor tailCursor(Direction direction) + { + // `DumpCursor` is only created by the dump method to be used immediately, and that use should never call + // `tailCursor`. 
+ throw new AssertionError(); + } + } + + public String dump(Function contentToString) + { + return dump(contentToString, Object::toString); + } + + /// Override of dump to provide more detailed printout that includes the type of each node in the trie. + /// We do this via a wrapping cursor that returns a content string for the type of node for every node we return. + public String dump(Function contentToString, Function rangeToString) + { + return new DumpCursor(makeCursor(Direction.FORWARD), contentToString).process(new TrieDumper.DeletionAware<>(Function.identity(), rangeToString)); + } + + /// Dump the branch rooted at the given node. To be used for debugging only. + @SuppressWarnings("unused") + private String dumpBranch(int branchRoot) + { + return new DumpCursor(new DeletionAwareInMemoryCursor<>(this, Direction.FORWARD, branchRoot), Object::toString) + .process(new TrieDumper.DeletionAware<>(Function.identity(), Object::toString)); + } +} diff --git a/src/java/org/apache/cassandra/db/tries/InMemoryRangeTrie.java b/src/java/org/apache/cassandra/db/tries/InMemoryRangeTrie.java new file mode 100644 index 000000000000..db15e30134a1 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/InMemoryRangeTrie.java @@ -0,0 +1,850 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.function.Predicate; + +import org.apache.cassandra.io.compress.BufferType; +import org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.concurrent.OpOrder; + +public class InMemoryRangeTrie> extends InMemoryBaseTrie implements RangeTrie +{ + // constants for space calculations + private static final long EMPTY_SIZE_ON_HEAP; + private static final long EMPTY_SIZE_OFF_HEAP; + static + { + // Measuring the empty size of long-lived tries, because these are the ones for which we want to track size. + InMemoryBaseTrie empty = new InMemoryRangeTrie<>(ByteComparable.Version.OSS50, BufferType.ON_HEAP, ExpectedLifetime.LONG, null); + EMPTY_SIZE_ON_HEAP = ObjectSizes.measureDeep(empty); + empty = new InMemoryRangeTrie<>(ByteComparable.Version.OSS50, BufferType.OFF_HEAP, ExpectedLifetime.LONG, null); + EMPTY_SIZE_OFF_HEAP = ObjectSizes.measureDeep(empty); + } + + InMemoryRangeTrie(ByteComparable.Version byteComparableVersion, BufferType bufferType, ExpectedLifetime lifetime, OpOrder opOrder) + { + super(byteComparableVersion, false, bufferType, lifetime, opOrder); + } + + public static > InMemoryRangeTrie shortLived(ByteComparable.Version byteComparableVersion) + { + return new InMemoryRangeTrie<>(byteComparableVersion, BufferType.ON_HEAP, ExpectedLifetime.SHORT, null); + } + + public static > InMemoryRangeTrie shortLived(ByteComparable.Version byteComparableVersion, BufferType bufferType) + { + return new InMemoryRangeTrie<>(byteComparableVersion, bufferType, ExpectedLifetime.SHORT, null); + } + + public static > InMemoryRangeTrie longLived(ByteComparable.Version byteComparableVersion, OpOrder opOrder) + { + return longLived(byteComparableVersion, BufferType.OFF_HEAP, opOrder); + } + + public static > InMemoryRangeTrie 
longLived(ByteComparable.Version byteComparableVersion, BufferType bufferType, OpOrder opOrder) + { + return new InMemoryRangeTrie<>(byteComparableVersion, bufferType, ExpectedLifetime.LONG, opOrder); + } + + public InMemoryRangeCursor makeCursor(Direction direction) + { + return new InMemoryRangeCursor<>(this, direction, root); + } + + protected long emptySizeOnHeap() + { + return bufferManager.bufferType() == BufferType.ON_HEAP ? EMPTY_SIZE_ON_HEAP : EMPTY_SIZE_OFF_HEAP; + } + + static class InMemoryRangeCursor> extends InMemoryCursor implements RangeCursor + { + boolean activeIsSet; + S activeRange; // only non-null if activeIsSet + S prevContent; // can only be non-null if activeIsSet + + InMemoryRangeCursor(InMemoryReadTrie trie, Direction direction, int root) + { + // Range tries must always preserve the order of boundaries with respect to descendants. + super(trie, direction, root, false); + activeIsSet = true; + activeRange = null; + prevContent = null; + updateActiveAndReturn(encodedPosition()); + } + + @Override + public long advance() + { + return updateActiveAndReturn(super.advance()); + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + return updateActiveAndReturn(super.advanceMultiple(receiver)); + } + + /// @inheritDoc + /// Range tries may have two content values. Handle this possibility here. + @Override + S processPrefix(int node, int depth, int transition) + { + S content1 = processPrefixEntry(node, depth, transition, PREFIX_CONTENT_OFFSET); + S content2 = processPrefixEntry(node, depth, transition, PREFIX_ALTERNATE_OFFSET); + assert (content1 == null) || (content2 == null) : "Prefix node with incompatible content pair " + content1 + " and " + content2; + // It's not okay to have two backtracks either, but this is not trivial to check. + return content1 == null ? 
content2 : content1; + } + + @Override + public long skipTo(long encodedSkipPosition) + { + activeIsSet = false; // since we are skipping, we have no idea where we will end up + activeRange = null; + prevContent = null; + return updateActiveAndReturn(super.skipTo(encodedSkipPosition)); + } + + @Override + public S state() + { + if (!activeIsSet) + setActiveState(); + return activeRange; + } + + long updateActiveAndReturn(long position) + { + if (!Cursor.isExhausted(position)) + { + // Always check if we are seeing new content; if we do, that's an easy state update. + S content = content(); + if (content != null) + { + activeRange = content; + prevContent = content; + activeIsSet = true; + } + else if (prevContent != null) + { + // If the previous state was exact, its right side is what we now have. + activeRange = prevContent.succedingState(direction); + prevContent = null; + assert activeIsSet; + } + // otherwise the active state is either not set or still valid. + } + else + { + // exhausted + activeIsSet = true; + activeRange = null; + prevContent = null; + } + return position; + } + + private void setActiveState() + { + assert content() == null; + S nearestContent = getNearestContent(direction); + // Note: the nearest content may change between the time we fetch it and when we reach that node, e.g. + // if someone deletes aa-cd where there existed an abc-acd deletion, and we fetched the latter while at "a". + // This, though, should only be possible if the preceding state of the nearest content is null. + activeRange = nearestContent != null ? nearestContent.precedingState(direction) : null; + prevContent = null; + activeIsSet = true; + } + + private S getNearestContent(Direction direction) + { + // Descend into the children of the node until content is found, making sure to ignore return-path content + // for non-leaf nodes (because there must be a leaf that is before them in iteration order). 
+ + // This should yield the same result as walking a copy of this cursor in the given direction until the first + // content, i.e. + // new InMemoryRangeCursor<>(trie, direction, currentFullNode).advanceToContent(null); + int node = currentFullNode; + if (isNull(node)) + return null; + while (true) + { + assert !isNull(node); + if (isLeaf(node)) + { + // We have reached the bottom of the branch. Report regardless of direction as it's the closest + // content either way. + return trie.getContent(node); + } + + if (offset(node) == PREFIX_OFFSET) + { + int contentId = trie.getIntVolatile(node + direction.select(PREFIX_CONTENT_OFFSET, PREFIX_ALTERNATE_OFFSET)); + if (isLeaf(contentId)) + return trie.getContent(contentId); + + node = trie.followPrefixTransition(node); + assert !isNull(node); + } + + node = trie.getFirstChild(node, direction); + } + } + + @Override + public InMemoryRangeCursor tailCursor(Direction direction) + { + // Deletion ranges active at entry and exit must be presented by the tail at its root. To do this, get + // the closest content in both forward and reverse direction and adjust the content that the tail reports + // for them. 
+ if (content == null) + setActiveState(); // prepare and store activeRange if it is needed + + S rootDescentContent = getTailRootContent(this.direction, content, activeIsSet, activeRange); + S rootAscentContent = getTailRootContent(this.direction.opposite(), getAscentPathContent(), false, null); + if (this.direction != direction) + { + S swap = rootDescentContent; + rootDescentContent = rootAscentContent; + rootAscentContent = swap; + } + + if (rootAscentContent == null && rootDescentContent == null) + return new InMemoryRangeCursor<>(trie, direction, currentNode); + else + return new InMemoryRangeBranchCursor<>(trie, direction, currentNode, rootDescentContent, rootAscentContent); + } + + /// Returns the content held by the top backtrack entry when that entry was recorded at `depth - 1` and refers to a + /// leaf; returns null when there is no backtrack entry or either condition does not hold. + S getAscentPathContent() + { + if (backtrackDepth <= 0) + return null; + if (depth(backtrackDepth - 1) != depth - 1) + return null; + int contentId = node(backtrackDepth - 1); + if (!isLeaf(contentId)) + return null; + assert shouldPresentOnTheReturnPath(contentId); + return trie.getContent(contentId); + } + + S getTailRootContent(Direction direction, S contentAtRoot, boolean activeRangeKnown, S activeRange) + { + if (contentAtRoot != null) + return contentAtRoot.restrict(!direction.isForward(), direction.isForward()); + if (!activeRangeKnown) + activeRange = getNearestContent(direction); + if (activeRange == null) + return null; + activeRange = activeRange.precedingState(direction); + if (activeRange == null) + return null; + return activeRange.asBoundary(direction); + } + } + + /// Modified range cursor returning the given content at the root's descent and ascent positions.
+ static class InMemoryRangeBranchCursor> extends InMemoryRangeCursor + { + final S rootAscentContent; + + InMemoryRangeBranchCursor(InMemoryReadTrie trie, Direction direction, int root, S rootDescentContent, S rootAscentContent) + { + super(trie, direction, root); + content = rootDescentContent; + this.rootAscentContent = rootAscentContent; + if (rootAscentContent != null) + addBacktrack(NONE, 0, -1); + updateActiveAndReturn(encodedPosition()); + } + + @Override + long advanceToNextChild(int node, int data) + { + if (isNull(node)) + return presentAscentPathContent(); + else + return super.advanceToNextChild(node, data); + } + + @Override + long advanceToNextChildWithTarget(int node, int data, int transition) + { + if (isNull(node)) + return direction.le(transition, data) ? presentAscentPathContent() + : NOT_FOUND; + else + return super.advanceToNextChildWithTarget(node, data, transition); + } + + long presentAscentPathContent() + { + return setNodeState(Cursor.encode(++depth, 0, direction) | ON_RETURN_PATH_BIT, + rootAscentContent, + NONE, + NONE); + } + + @Override + S getAscentPathContent() + { + if (backtrackDepth == 0) + return null; + if (depth(backtrackDepth - 1) != depth - 1) + return null; + int contentId = node(backtrackDepth - 1); + if (!isNullOrLeaf(contentId)) + return null; + if (isNull(contentId)) + return rootAscentContent; + assert shouldPresentOnTheReturnPath(contentId); + return trie.getContent(contentId); + } + } + + /// Reused storage for the state of application of mutations. This stores the backtracking path, including changes + /// already applied (e.g. new version of a node that is not yet linked to the current trie) and some that are yet + /// to be applied (e.g. updated content). + /// + /// Because in-memory tries are single-writer, we can reuse a single state array for all updates. 
The updates are + /// serialized and thus no other thread can corrupt this state (note that this is not the factor enforcing the + /// single writer policy, and since we are already bound to it there is no cost involved in reusing this state array). + final private ApplyState applyState = new ApplyState<>(this); + + /// Outcome of an attempt to descend in the existing trie (see `ApplyState#tryDescendInExisting`). + enum AdvanceResult + { + DESCENDED, + NEEDS_ASCENT, + AT_LIMIT + } + + static class ApplyState> extends InMemoryBaseTrie.ApplyState + { + ApplyState(InMemoryBaseTrie trie) + { + super(trie); + } + + ApplyState start() + { + return start(trie.root); + } + + ApplyState start(int root) + { + return (ApplyState) super.start(root); + } + + /// Starting at the given node, repeatedly follow the child returned by `trie.getNextChild(node, 0)` until a node + /// with descent-path content is found and return that content. If a node without descent-path content has no + /// such child, its ascent-path content is returned instead (asserted to be present). + private S getFirstChildContent(int node) + { + while (true) + { + int contentId = getDescentPathContentId(node); + if (contentId != NONE) + return trie.getContent(contentId); + + int next = trie.getNextChild(node, 0); + + if (next == NONE) + { + int returnPathContent = getAscentPathContentId(node); + assert returnPathContent != NONE; + return trie.getContent(returnPathContent); + } + node = next; + } + } + + /// Get the nearest content following the current position. This is used to establish the range that applies to + /// the current position after we have followed a mutation path which is done by skipping over our boundaries. + private S getNearestContent(boolean onReturnPath) + { + // 1. If not on the return path, and the node we are positioned on exists, descend until we find content. + // If we can't descend any further, there must be return-side content there. We are done. + int fullNode = existingFullNode(); + if (fullNode != NONE && !onReturnPath) + return getFirstChildContent(fullNode); + + // 2. If the node we are positioned on did not exist, or we are looking for return-path data, ascend until + // we find a node that exists.
+ int stackPos = currentDepth - 1; + int node = NONE; + + while (stackPos >= 0) + { + node = existingFullNodeAtDepth(stackPos); + if (node != NONE) + break; + --stackPos; + } + + if (node == NONE) + return null; + + while (true) + { + // 3. If that node has a child with a transition index greater than the one we took to descend, descend + // into that child and perform 1. + int child = trie.getNextChild(node, transitionAtDepth(stackPos) + 1); + if (child != NONE) + return getFirstChildContent(child); + // 4. If not, check return path content -- return if present. + int returnPathId = getAscentPathContentId(node); + if (returnPathId != NONE) + return trie.getContent(returnPathId); + // 5. Otherwise, go up one level and back to 3. + if (--stackPos < 0) + return null; + node = existingFullNodeAtDepth(stackPos); + } + } + + boolean advanceToMutationPosition(int depth, int transition, boolean isOnReturnPath, int forcedCopyDepth) + throws TrieSpaceExhaustedException + { + while (currentDepth >= Math.max(depth, 1)) + { + if (isOnReturnPath && depth == currentDepth && transition == transitionAtDepth(currentDepth - 1)) + return true; + + // There are no more children. Ascend to the parent state to continue walk. + attachAndMoveToParentState(forcedCopyDepth); + } + + if (depth <= 0) // Either exhausted or the root's return path position. 
+ return (isOnReturnPath && depth == 0); + + // We have a transition, get child to descend into + descend(transition); + return true; + } + + AdvanceResult tryDescendInExisting(int limitDepth, int limitTransition, boolean limitOnReturnPath) + { + int currentTransition = transition(); + + int nextTransition = trie.getNextTransition(updatedPostContentNode(), currentTransition + 1); + if (currentDepth + 1 == limitDepth && (nextTransition > limitTransition || (nextTransition == limitTransition && !limitOnReturnPath))) + { + descend(limitTransition); + return AdvanceResult.AT_LIMIT; + } + if (nextTransition <= 0xFF) + { + descend(nextTransition); + return AdvanceResult.DESCENDED; + } + + // With range tries we need to be able to ascend on the return path without going over the node. + if (limitOnReturnPath && currentDepth == limitDepth && (limitDepth == 0 || transitionAtDepth(currentDepth - 1) == limitTransition)) + return AdvanceResult.AT_LIMIT; + + return AdvanceResult.NEEDS_ASCENT; + } + + int getDescentPathContentId(int fullNode) + { + if (isNull(fullNode)) + return NONE; + if (isLeaf(fullNode)) + return !trie.shouldPresentAfterBranch(fullNode) ? fullNode : NONE; + if (offset(fullNode) == PREFIX_OFFSET) + return trie().getIntVolatile(fullNode + PREFIX_CONTENT_OFFSET); + + return NONE; + } + + int getAscentPathContentId(int fullNode) + { + if (isNull(fullNode)) + return NONE; + if (isLeaf(fullNode)) + return trie.shouldPresentAfterBranch(fullNode) ? 
fullNode : NONE; + if (offset(fullNode) == PREFIX_OFFSET) + return trie().getIntVolatile(fullNode + PREFIX_ALTERNATE_OFFSET); + + return NONE; + } + + int getAscentPathContentId() + { + return getAscentPathContentId(existingFullNode()); + } + + @Override + protected int applyContent(boolean forcedCopy) throws TrieSpaceExhaustedException + { + int ascentPathContentId = getAscentPathContentId(); + return applyAscentPathContent(ascentPathContentId, forcedCopy); + } + + /// After a node's children are processed, this is called to ascend from it. This means applying the collected + /// content to the compiled `updatedPostContentNode` and creating a mapping in the parent to it (or updating if + /// one already exists). + void attachAndMoveToParentStateWithAscentPathContent(int ascentPathContentId, int forcedCopyDepth) throws TrieSpaceExhaustedException + { + attachBranchAndMoveToParentState(applyAscentPathContent(ascentPathContentId, currentDepth >= forcedCopyDepth), + forcedCopyDepth); + } + + @Override + void attachBranchAndMoveToParentState(int updatedFullNode, int forcedCopyDepth) throws TrieSpaceExhaustedException + { + if (currentDepth > 0) + super.attachBranchAndMoveToParentState(updatedFullNode, forcedCopyDepth); + else + { + // we need to update the root -- leave that job to complete() and abuse existingFullNode to tell it + // what value to use + setExistingFullNode(updatedFullNode); + --currentDepth; + } + } + + protected int applyAscentPathContent(int ascentPathContentId, boolean forcedCopy) throws TrieSpaceExhaustedException + { + if (ascentPathContentId == NONE) + return super.applyContent(forcedCopy); + + int descentPathContentId = descentPathContentId(); + final int updatedPostContentNode = updatedPostContentNode(); + if (isNull(updatedPostContentNode)) + { + if (!isNull(ascentPathContentId) && !trie.shouldPreserveWithoutChildren(ascentPathContentId)) + { + trie.releaseContent(ascentPathContentId); + return super.applyContent(forcedCopy); + } + if 
(!isNull(descentPathContentId) && !trie.shouldPreserveWithoutChildren(descentPathContentId)) + { + trie.releaseContent(descentPathContentId); + descentPathContentId = NONE; + } + } + + final int existingPreContentNode = existingFullNode(); + final int existingPostContentNode = existingPostContentNode(); + + if (isNull(descentPathContentId) && isNull(updatedPostContentNode)) + { + // return path content only with no child -- we can use a leaf to store it + if (existingPreContentNode != existingPostContentNode + && !isNullOrLeaf(existingPreContentNode) + && !trie.isEmbeddedPrefixNode(existingPreContentNode)) + trie.recycleCell(existingPreContentNode); + return ascentPathContentId; + } + + // If we only had a descent-path entry before, upgrade to prefix node + if (isLeaf(existingPreContentNode)) + return trie.createPrefixNode(descentPathContentId, ascentPathContentId, updatedPostContentNode, true); + + return applyPrefixChange(updatedPostContentNode, + existingPreContentNode, + existingPostContentNode, + descentPathContentId, + ascentPathContentId, + forcedCopy); + } + + void attachPreparedRoot(int updatedFullNode) + { + if (updatedFullNode != trie.root) + { + // Only write to root if they are different (value doesn't change, but + // we don't want to invalidate the value in other cores' caches unnecessarily). + trie.root = updatedFullNode; + } + } + + } + + /// Range trie mutation functionality. Provides functionality used both by range trie and deletion-aware mutators. + static class MutatorStatic, U extends RangeState> + extends InMemoryBaseTrie.Mutator, ApplyState> + { + MutatorStatic(ApplyState applyState, + UpsertTransformer transformer, + Predicate> needsForcedCopy) + { + super(transformer, needsForcedCopy, applyState); + } + + @Override + MutatorStatic apply() throws TrieSpaceExhaustedException + { + applyRanges(); + assert state.currentDepth == 0 || state.currentDepth == -1 : "Unexpected change to applyState. 
Concurrent trie modification?"; + return this; + } + + @Override + void complete() throws TrieSpaceExhaustedException + { + if (state.currentDepth == 0) + super.complete(); + else if (state.currentDepth == -1) // root already prepared because of return-path update to the root node + state.attachPreparedRoot(state.existingFullNodeAtDepth(0)); + else + throw new AssertionError("Unexpected depth value " + state.currentDepth); + } + + int completeBranch() throws TrieSpaceExhaustedException + { + if (state.currentDepth == 0) + return state.applyContent(state.currentDepth >= forcedCopyDepth); + else if (state.currentDepth == -1) // root already prepared because of return-path update to the root node + return state.existingFullNodeAtDepth(0); + else + throw new AssertionError("Unexpected depth value " + state.currentDepth); + } + + void applyContent(S existingState, U mutationState) throws TrieSpaceExhaustedException + { + S combined = transformer.apply(existingState, mutationState); + S existing = existingState; + if (combined != null && !combined.isBoundary()) + combined = null; + if (existing != null && !existing.isBoundary()) + existing = null; + if (combined != existing) + state.setDescentPathContent(combined, // can be null + state.currentDepth >= forcedCopyDepth); // this is called at the start of processing + } + + void applyRanges() throws TrieSpaceExhaustedException + { + // While activeDeletion is not set, follow the mutation trie. + // When a deletion is found, get existing covering state, combine and apply/store. + // Get rightSideAsCovering and walk the full existing trie to apply, advancing mutation cursor in parallel + // until we see an end boundary in mutation trie. + // Repeat until mutation trie is exhausted. 
+ int depth = state.currentDepth; + long position = mutationCursor.encodedPosition(); + assert !Cursor.isOnReturnPath(position) : "Cursor cannot start with position on return path."; + while (true) + { + if (depth < forcedCopyDepth) + forcedCopyDepth = needsForcedCopy.test(this) ? depth : Integer.MAX_VALUE; + + U content = mutationCursor.content(); + if (content != null) + { + S existingCoveringState = getExistingCoveringState(Cursor.isOnReturnPath(position)); + applyDeletionRange(rightSideAsCovering(existingCoveringState), position); + } + + position = mutationCursor.advance(); + depth = Cursor.depth(position); + // Advance according to the mutation cursor. This will apply point content and complete anything already modified. + if (!state.advanceToMutationPosition(depth, Cursor.incomingTransition(position), Cursor.isOnReturnPath(position), forcedCopyDepth)) + break; + assert depth == state.currentDepth : "Unexpected change to applyState. Concurrent trie modification?"; + } + } + + private void ascendWithUpdatedReturnPathContent(int existingContentId, S existingState, U content, int depth) throws TrieSpaceExhaustedException + { + S combined = transformer.apply(existingState, content); + S existing = existingState; + if (combined != null && !combined.isBoundary()) + combined = null; + if (existing != null && !existing.isBoundary()) + existing = null; + int combinedId = combined != existing + ? 
state.combineContent(existingContentId, + combined, + true, + forcedCopyDepth >= depth) + : existingContentId; + state.attachAndMoveToParentStateWithAscentPathContent(combinedId, forcedCopyDepth); + } + + void applyDeletionRange(S existingCoveringState, long position) + throws TrieSpaceExhaustedException + { + AdvanceResult advance = AdvanceResult.AT_LIMIT; + int limitDepth = Cursor.depth(position); + int limitTransition = Cursor.incomingTransition(position); + boolean limitOnReturnPath = Cursor.isOnReturnPath(position); + U mutationCoveringState = null; + + // We are walking both tries in parallel. + while (true) + { + // We need to force-copy every node we touch while applying ranges to ensure consistent ranges. + forcedCopyDepth = Math.min(forcedCopyDepth, state.currentDepth); + + switch (advance) + { + case AT_LIMIT: + { + // We are following the mutation cursor. Check it for content to apply, and then advance it. + U mutationContent = mutationCursor.content(); + + int existingContentId = limitOnReturnPath ? state.getAscentPathContentId() : state.descentPathContentId(); + S existingContent = InMemoryReadTrie.isNull(existingContentId) ? 
null : state.trie.getContent(existingContentId); + + if (existingContent != null || mutationContent != null) + { + if (existingContent == null) + existingContent = existingCoveringState; + if (mutationContent == null) + mutationContent = mutationCoveringState; + + if (limitOnReturnPath) + ascendWithUpdatedReturnPathContent(existingContentId, existingContent, mutationContent, limitDepth); + else + applyContent(existingContent, mutationContent); + + mutationCoveringState = mutationContent.succedingState(Direction.FORWARD); + existingCoveringState = rightSideAsCovering(existingContent); + if (mutationCoveringState == null) + return; // mutation deletion range was closed, we can continue normal mutation cursor iteration + } + + position = mutationCursor.advance(); + limitDepth = Cursor.depth(position); + limitTransition = Cursor.incomingTransition(position); + limitOnReturnPath = Cursor.isOnReturnPath(position); + assert limitDepth >= 0 : "Unbounded range in mutation trie, state " + mutationCoveringState + " active when exhausted."; + break; + } + case DESCENDED: + { + // We have descended in the existing trie. Apply the mutation's deletion to any content and + // continue. + S existingContent = state.getDescentPathContent(); + if (existingContent != null) + { + applyContent(existingContent, mutationCoveringState); + existingCoveringState = existingContent.succedingState(Direction.FORWARD); + } + break; + } + case NEEDS_ASCENT: + { + // There are no more children in the existing trie, and we need to ascend. Do so, but first + // check if there is ascent path content that needs to be updated. 
+ int existingContentId = state.getAscentPathContentId(); + if (existingContentId != NONE) + { + S existingContent = state.trie.getContent(existingContentId); + existingCoveringState = existingContent.succedingState(Direction.FORWARD); + ascendWithUpdatedReturnPathContent(existingContentId, + existingContent, + mutationCoveringState, + forcedCopyDepth); + } + else + state.attachAndMoveToParentState(forcedCopyDepth); + break; + } + default: + throw new AssertionError(); + } + + advance = state.tryDescendInExisting(limitDepth, limitTransition, limitOnReturnPath); + } + } + + S getExistingCoveringState(boolean onReturnPath) + { + S existingCoveringState = state.getNearestContent(onReturnPath); + if (existingCoveringState != null) + return existingCoveringState.precedingState(Direction.FORWARD); + + return null; + } + + static > S rightSideAsCovering(S rangeState) + { + if (rangeState == null) + return null; + return rangeState.succedingState(Direction.FORWARD); + } + } + + /// Range trie mutator, binding the trie with merge configuration (i.e. transformer and predicates). + /// Can be used to apply multiple modifications to the trie using [#apply(RangeTrie)]. + public class Mutator> extends MutatorStatic + { + /// See [InMemoryTrie#mutator(UpsertTransformer, Predicate)] for the meaning of the + /// parameters. + Mutator(UpsertTransformer transformer, Predicate> needsForcedCopy) + { + super(applyState, transformer, needsForcedCopy); + } + + /// Modify this trie to apply the mutation given in the form of a trie. Any content in the mutation will be resolved + /// with the given function before being placed in this trie (even if there's no pre-existing content in this trie). + /// @param mutation the mutation to be applied, given in the form of a trie. Note that its content can be of type + /// different than the element type for this memtable trie. 
+ public void apply(RangeTrie mutation) throws TrieSpaceExhaustedException + { + try + { + start(mutation.cursor(Direction.FORWARD)).apply().complete(); + completeMutation(); + } + catch (Throwable t) + { + abortMutation(); + throw t; + } + } + } + + + /// Creates a trie mutator that can be used to apply multiple modifications to the trie. + /// + /// @param transformer a function applied to the potentially pre-existing value for the given key, and the new + /// value. Applied even if there's no pre-existing value in the memtable trie. + /// @param needsForcedCopy a predicate which decides when to fully copy a branch to provide atomicity guarantees to + /// concurrent readers. See NodeFeatures for details. + public > Mutator mutator(final UpsertTransformer transformer, + Predicate> needsForcedCopy) + { + return new Mutator<>(transformer, needsForcedCopy); + } + + + /// Modify this trie to apply the mutation given in the form of a range trie. Any content in the mutation will be + /// resolved with the given function before being placed in this trie (even if there's no pre-existing content in + /// this trie). For any range that the new mutation introduces, the transformer function will be applied to all + /// existing content that falls in the range; this may result in the deletion of existing boundaries or their + /// modification. + /// @param mutation the mutation to be applied, given in the form of a range trie. Note that its content can be of + /// type different than the element type for this memtable trie. + /// @param transformer a function applied to the potentially pre-existing value for the given key, and the new + /// value, as well as to pre-existing content that falls under a range in the mutation. + /// @param needsForcedCopy a predicate which decides when to fully copy a branch to provide atomicity guarantees to + /// concurrent readers. Note that this only applies to separate ranges in the mutation. 
Whenever a mutation range is + /// applied, the covered content is copied to ensure that consumer cannot see unclosed ranges due to intermediate + /// state. See [NodeFeatures] for more details. + public > void apply(RangeTrie mutation, + final UpsertTransformer transformer, + Predicate> needsForcedCopy) + throws TrieSpaceExhaustedException + { + mutator(transformer, needsForcedCopy).apply(mutation); + } +} diff --git a/src/java/org/apache/cassandra/db/tries/InMemoryReadTrie.java b/src/java/org/apache/cassandra/db/tries/InMemoryReadTrie.java index ecddfd9544ef..4a0959661e91 100644 --- a/src/java/org/apache/cassandra/db/tries/InMemoryReadTrie.java +++ b/src/java/org/apache/cassandra/db/tries/InMemoryReadTrie.java @@ -18,19 +18,16 @@ package org.apache.cassandra.db.tries; import java.util.Arrays; -import java.util.concurrent.atomic.AtomicReferenceArray; import java.util.function.Function; import org.agrona.concurrent.UnsafeBuffer; -import org.apache.cassandra.utils.bytecomparable.ByteSource; import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; -/** - * In-memory trie built for fast modification and reads executing concurrently with writes from a single mutator thread. - * - * This class provides the read-only functionality, expanded in {@link InMemoryTrie} to writes. - */ -public class InMemoryReadTrie extends Trie +/// In-memory trie built for fast modification and reads executing concurrently with writes from a single mutator thread. +/// +/// This class provides the read-only functionality, expanded in [InMemoryTrie] to writes. +public abstract class InMemoryReadTrie { /* TRIE FORMAT AND NODE TYPES @@ -55,10 +52,9 @@ public class InMemoryReadTrie extends Trie not pointing at the beginning of cells, and we call 'pointer offset' the offset of the node pointer to the cell it points into. 
The value of a 'node pointer' is used to decide what kind of node is pointed: - - If the pointer is negative, we have a leaf node. Since a leaf has no children, we need no data outside of its - content to represent it, and that content is stored in a 'content list', not in the nodes buffer. The content - of a particular leaf node is located at the ~pointer position in the content list (~ instead of - so that -1 can - correspond to position 0). + - If the pointer is negative, we have a leaf node. Since a leaf has no children, we need no data other than its + content to represent it, and that content is mapped to this id by the content manager, which may store a list + of content values corresponding to these ids. - If the 'pointer offset' is smaller than 28, we have a chain node with one transition. The transition character is the byte at the position pointed in the 'node buffer', and the child is pointed by: @@ -101,15 +97,17 @@ allows iteration over the order word (which divides said word by 6 each step) to One split node may need up to 1 + 4 + 4*8 cells (1184 bytes) to store all its children. - If the pointer offset is 31, we have a prefix node. These are two types: - -- Embedded prefix nodes occupy the free bytes in a chain or split node. The byte at offset 4 has the offset + -- Embedded prefix nodes occupy the free bytes in a chain or split node. The byte at offset 8 has the offset within the 32-byte cell for the augmented node. - -- Full prefix nodes have 0xFF at offset 4 and a pointer at 28, pointing to the augmented node. - Both types contain an index for content at offset 0. The augmented node cannot be a leaf or NONE -- in the former - case the leaf itself contains the content index, in the latter we use a leaf instead. + -- Full prefix nodes have 0xFF at offset 8 and a pointer at 28, pointing to the augmented node. 
+ Both types contain a leaf pointer for content at offset 0, specifying the content associated with the augmented + node, and a secondary pointer at offset 4. The secondary pointer's usage depends on the exact type of trie -- + it can be return path content (in range tries) or alternate branch pointer (in deletion-aware tries). + The augmented node cannot be a leaf, because in that case we can either drop the prefix (if there's no secondary + pointer) or pull the content pointer to it; the augmented node can be NONE only if the secondary pointer is + non-null (otherwise we can use a leaf instead of prefix). The term "node" when applied to these is a bit of a misnomer as they are not presented as separate nodes during - traversals. Instead, they augment a node, changing only its content. Internally we create a Node object for the - augmented node and wrap a PrefixNode around it, which changes the `content()` method and routes all other - calls to the augmented node's methods. + traversals. Instead, they augment a node, changing only its content/alternate branch. When building a trie we first allocate the content, then create a chain node leading to it. While we only have single transitions leading to a chain node, we can expand that node (attaching a character and using pointer - 1) @@ -118,7 +116,7 @@ single transitions leading to a chain node, we can expand that node (attaching a child, we switch to split. Cells can be reused once they are no longer used and cannot be in the state of a concurrently running reader. See - MemoryAllocationStrategy for details. + MemoryManager for details. For further descriptions and examples of the mechanics of the trie, see InMemoryTrie.md. */ @@ -168,94 +166,78 @@ Cell offsets used to identify node types (by comparing them to the node 'pointer static final int SPARSE_ORDER_OFFSET = SPARSE_CHILD_COUNT * 5 - SPARSE_OFFSET; // 0 // Offset of the flag byte in a prefix node. In shared cells, this contains the offset of the next node. 
- static final int PREFIX_FLAGS_OFFSET = 4 - PREFIX_OFFSET; + static final int PREFIX_FLAGS_OFFSET = 8 - PREFIX_OFFSET; // Offset of the content id static final int PREFIX_CONTENT_OFFSET = 0 - PREFIX_OFFSET; + // Offset of the alternate branch pointer + static final int PREFIX_ALTERNATE_OFFSET = 4 - PREFIX_OFFSET; // Offset of the next pointer in a non-shared prefix node static final int PREFIX_POINTER_OFFSET = LAST_POINTER_OFFSET - PREFIX_OFFSET; - /** - * Value used as null for node pointers. - * No node can use this address (we enforce this by not allowing chain nodes to grow to position 0). - * Do not change this as the code relies there being a NONE placed in all bytes of the cell that are not set. - */ + /// Value used as null for node pointers. + /// No node can use this address (we enforce this by not allowing chain nodes to grow to position 0). + /// Do not change this as the code relies on there being a `NONE` placed in all bytes of the cell that are not set. static final int NONE = 0; volatile int root; - /* - EXPANDABLE DATA STORAGE - - The tries will need more and more space in buffers and content lists as they grow. Instead of using ArrayList-like - reallocation with copying, which may be prohibitively expensive for large buffers, we use a sequence of - buffers/content arrays that double in size on every expansion. - - For a given address x the index of the buffer can be found with the following calculation: - index_of_most_significant_set_bit(x / min_size + 1) - (relying on sum (2^i) for i in [0, n-1] == 2^n - 1) which can be performed quickly on modern hardware. - - Finding the offset within the buffer is then - x + min - (min << buffer_index) - - The allocated space starts 256 bytes for the buffer and 16 entries for the content list. - - Note that a buffer is not allowed to split 32-byte cells (code assumes same buffer can be used for all bytes - inside the cell). 
- */ - - static final int BUF_START_SHIFT = 8; - static final int BUF_START_SIZE = 1 << BUF_START_SHIFT; - - static final int CONTENTS_START_SHIFT = 4; - static final int CONTENTS_START_SIZE = 1 << CONTENTS_START_SHIFT; - - static - { - assert BUF_START_SIZE % CELL_SIZE == 0 : "Initial buffer size must fit a full cell."; - } - - final UnsafeBuffer[] buffers; - final AtomicReferenceArray[] contentArrays; final ByteComparable.Version byteComparableVersion; - - InMemoryReadTrie(ByteComparable.Version byteComparableVersion, UnsafeBuffer[] buffers, AtomicReferenceArray[] contentArrays, int root) + final BufferManager bufferManager; + final ContentManager contentManager; + + /// If true, the content always is presented on the descent path of any walk (useful for metadata-carrying tries). + /// If false, its position before/after the path will be tracked on insertion and the content will be appropriately + /// returned on walks (useful for range and ordered tries). + final boolean presentContentOnDescentPath; + + InMemoryReadTrie(ByteComparable.Version byteComparableVersion, + boolean presentContentOnDescentPath, + BufferManager bufferManager, + ContentManager contentManager, + int root) { this.byteComparableVersion = byteComparableVersion; - this.buffers = buffers; - this.contentArrays = contentArrays; + this.presentContentOnDescentPath = presentContentOnDescentPath; + this.contentManager = contentManager; + this.bufferManager = bufferManager; this.root = root; } /* Buffer, content list and cell management */ - int getBufferIdx(int pos, int minBufferShift, int minBufferSize) + static int getBufferIdx(int pos, int minBufferShift, int minBufferSize) { return 31 - minBufferShift - Integer.numberOfLeadingZeros(pos + minBufferSize); } - int inBufferOffset(int pos, int bufferIndex, int minBufferSize) + static int inBufferOffset(int pos, int bufferIndex, int minBufferSize) { return pos + minBufferSize - (minBufferSize << bufferIndex); } UnsafeBuffer getBuffer(int pos) { - int 
leadBit = getBufferIdx(pos, BUF_START_SHIFT, BUF_START_SIZE); - return buffers[leadBit]; + return bufferManager.getBuffer(pos); } int inBufferOffset(int pos) { - int leadBit = getBufferIdx(pos, BUF_START_SHIFT, BUF_START_SIZE); - return inBufferOffset(pos, leadBit, BUF_START_SIZE); + return bufferManager.inBufferOffset(pos); } + T getContent(int id) + { + return contentManager.getContent(id); + } - /** - * Pointer offset for a node pointer. - */ - int offset(int pos) + boolean shouldPresentAfterBranch(int contentId) + { + return contentManager.shouldPresentAfterBranch(contentId); + } + + /// Pointer offset for a node pointer. + static int offset(int pos) { return pos & (CELL_SIZE - 1); } @@ -270,66 +252,46 @@ final int getUnsignedShortVolatile(int pos) return getBuffer(pos).getShortVolatile(inBufferOffset(pos)) & 0xFFFF; } - /** - * Following a pointer must be done using a volatile read to enforce happens-before between reading the node we - * advance to and the preparation of that node that finishes in a volatile write of the pointer that makes it - * visible. - */ + /// Following a pointer must be done using a volatile read to enforce happens-before between reading the node we + /// advance to and the preparation of that node that finishes in a volatile write of the pointer that makes it + /// visible. final int getIntVolatile(int pos) { return getBuffer(pos).getIntVolatile(inBufferOffset(pos)); } - /** - * Get the content for the given content pointer. - * - * @param id content pointer, encoded as ~index where index is the position in the content array. - * @return the current content value. 
- */ - T getContent(int id) - { - int leadBit = getBufferIdx(~id, CONTENTS_START_SHIFT, CONTENTS_START_SIZE); - int ofs = inBufferOffset(~id, leadBit, CONTENTS_START_SIZE); - AtomicReferenceArray array = contentArrays[leadBit]; - return array.get(ofs); - } - /* Reading node content */ - boolean isNull(int node) + static boolean isNull(int node) { return node == NONE; } - boolean isLeaf(int node) + static boolean isLeaf(int node) { return node < NONE; } - boolean isNullOrLeaf(int node) + static boolean isNullOrLeaf(int node) { return node <= NONE; } - /** - * Returns the number of transitions in a chain cell entered with the given pointer. - */ - private int chainCellLength(int node) + /// Returns the number of transitions in a chain cell entered with the given pointer. + static int chainCellLength(int node) { return LAST_POINTER_OFFSET - offset(node); } - /** - * Get a node's child for the given transition character - */ + /// Get a node's child for the given transition character int getChild(int node, int trans) { if (isNullOrLeaf(node)) return NONE; - node = followContentTransition(node); + node = followPrefixTransition(node); switch (offset(node)) { @@ -348,36 +310,87 @@ int getChild(int node, int trans) } } - protected int followContentTransition(int node) + int getFirstChild(int node, Direction direction) + { + assert !isNullOrLeaf(node); + switch (offset(node)) + { + case PREFIX_OFFSET: + throw new AssertionError(); + case SPARSE_OFFSET: + return getSparseFirstChild(node, direction); + case SPLIT_OFFSET: + return getSplitFirstChild(node, direction); + default: + // directly jump over all bytes of the chain + return getChildOfChainNode(node); + } + } + + /// Returns the first transition byte present in the node that is equal to or greater than the given target transition. 
+ int getNextTransition(int node, int trans) + { + if (isNullOrLeaf(node)) + return Integer.MAX_VALUE; + + node = followPrefixTransition(node); + + if (isNullOrLeaf(node)) + return Integer.MAX_VALUE; + + switch (offset(node)) + { + case SPARSE_OFFSET: + return getSparseNextTransition(node, trans); + case SPLIT_OFFSET: + return getSplitNextTransition(node, trans); + default: + return getChainNextTransition(node, trans); + } + } + + int getNextChild(int node, int targetTransition) + { + int nextTransition = getNextTransition(node, targetTransition); + if (nextTransition <= 0xFF) + return getChild(node, nextTransition); + else + return NONE; + } + + protected int followPrefixTransition(int node) { if (isNullOrLeaf(node)) return NONE; if (offset(node) == PREFIX_OFFSET) - { - int b = getUnsignedByte(node + PREFIX_FLAGS_OFFSET); - if (b < CELL_SIZE) - node = node - PREFIX_OFFSET + b; - else - node = getIntVolatile(node + PREFIX_POINTER_OFFSET); + node = getChildOfPrefixNode(node); - assert node >= 0 && offset(node) != PREFIX_OFFSET; - } return node; } - /** - * Advance as long as the cell pointed to by the given pointer will let you. - *

- * This is the same as getChild(node, first), except for chain nodes where it would walk the fill chain as long as - * the input source matches. - */ + private int getChildOfPrefixNode(int node) + { + int b = getUnsignedByte(node + PREFIX_FLAGS_OFFSET); + if (b < CELL_SIZE) + node = node - PREFIX_OFFSET + b; + else + node = getIntVolatile(node + PREFIX_POINTER_OFFSET); + + assert node >= 0 && offset(node) != PREFIX_OFFSET; + return node; + } + + /// Advance as long as the cell pointed to by the given pointer will let you. + /// + /// This is the same as `getChild(node, first)`, except for chain nodes where it would walk the fill chain as long + /// as the input source matches. int advance(int node, int first, ByteSource rest) { if (isNullOrLeaf(node)) return NONE; - node = followContentTransition(node); + node = followPrefixTransition(node); switch (offset(node)) { @@ -401,9 +414,7 @@ int advance(int node, int first, ByteSource rest) } } - /** - * Get the child for the given transition character, knowing that the node is sparse - */ + /// Get the child for the given transition character, knowing that the node is sparse int getSparseChild(int node, int trans) { for (int i = 0; i < SPARSE_CHILD_COUNT; ++i) @@ -423,39 +434,149 @@ int getSparseChild(int node, int trans) return NONE; } - /** - * Given a transition, returns the corresponding index (within the node cell) of the pointer to the mid cell of - * a split node. - */ - int splitNodeMidIndex(int trans) + int getSparseNextTransition(int node, int targetTransition) + { + UnsafeBuffer chunk = getBuffer(node); + int inChunkNode = inBufferOffset(node); + int data = chunk.getShortVolatile(inChunkNode + SPARSE_ORDER_OFFSET) & 0xFFFF; + int index; + int transition; + do + { + // Peel off the next index. 
+ index = data % SPARSE_CHILD_COUNT; + data = data / SPARSE_CHILD_COUNT; + transition = chunk.getByte(inChunkNode + SPARSE_BYTES_OFFSET + index) & 0xFF; + } + while (transition < targetTransition && data != 0); + + if (transition < targetTransition) + return Integer.MAX_VALUE; + else + return transition; + } + + int getSparseFirstChild(int node, Direction direction) + { + UnsafeBuffer chunk = getBuffer(node); + int inChunkNode = inBufferOffset(node); + int data = chunk.getShortVolatile(inChunkNode + SPARSE_ORDER_OFFSET) & 0xFFFF; + + if (direction.isForward()) + data %= SPARSE_CHILD_COUNT; + else + while (data >= SPARSE_CHILD_COUNT) + data /= SPARSE_CHILD_COUNT; + + return chunk.getIntVolatile(inChunkNode + SPARSE_CHILDREN_OFFSET + data * Integer.BYTES); + } + + int getChainNextTransition(int node, int targetTransition) + { + int transition = getUnsignedByte(node); + if (transition < targetTransition) + return Integer.MAX_VALUE; + else + return transition; + } + + int getChildOfChainNode(int node) + { + return getIntVolatile((node & -CELL_SIZE) + (CHAIN_MAX_OFFSET + 1)); + } + + int getSplitNextTransition(int node, int targetTransition) + { + if (targetTransition < 0) + targetTransition = 0; + int midIndex = splitNodeMidIndex(targetTransition); + int tailIndex = splitNodeTailIndex(targetTransition); + int childIndex = splitNodeChildIndex(targetTransition); + while (midIndex < SPLIT_START_LEVEL_LIMIT) + { + int mid = getSplitCellPointer(node, midIndex, SPLIT_START_LEVEL_LIMIT); + if (!isNull(mid)) + { + while (tailIndex < SPLIT_OTHER_LEVEL_LIMIT) + { + int tail = getSplitCellPointer(mid, tailIndex, SPLIT_OTHER_LEVEL_LIMIT); + if (!isNull(tail)) + { + while (childIndex < SPLIT_OTHER_LEVEL_LIMIT) + { + int child = getSplitCellPointer(tail, childIndex, SPLIT_OTHER_LEVEL_LIMIT); + if (!isNull(child)) + return childIndex | (tailIndex << 3) | (midIndex << 6); + ++childIndex; + } + } + childIndex = 0; + ++tailIndex; + } + } + tailIndex = 0; + childIndex = 0; + ++midIndex; 
+ } + return Integer.MAX_VALUE; + } + + int getSplitFirstChild(int node, Direction direction) + { + for (int midIndex = direction.select(0, SPLIT_START_LEVEL_LIMIT - 1); + direction.inLoop(midIndex, 0, SPLIT_START_LEVEL_LIMIT - 1); + midIndex += direction.increase) + { + int mid = getSplitCellPointer(node, midIndex, SPLIT_START_LEVEL_LIMIT); + if (!isNull(mid)) + { + for (int tailIndex = direction.select(0, SPLIT_OTHER_LEVEL_LIMIT - 1); + direction.inLoop(tailIndex, 0, SPLIT_OTHER_LEVEL_LIMIT - 1); + tailIndex += direction.increase) + { + int tail = getSplitCellPointer(mid, tailIndex, SPLIT_OTHER_LEVEL_LIMIT); + if (!isNull(tail)) + { + for (int childIndex = direction.select(0, SPLIT_OTHER_LEVEL_LIMIT - 1); + direction.inLoop(childIndex, 0, SPLIT_OTHER_LEVEL_LIMIT - 1); + childIndex += direction.increase) + { + int child = getSplitCellPointer(tail, childIndex, SPLIT_OTHER_LEVEL_LIMIT); + if (!isNull(child)) + return child; + } + } + } + } + } + throw new AssertionError("Empty split node"); + } + + /// Given a transition, returns the corresponding index (within the node cell) of the pointer to the mid cell of + /// a split node. + static int splitNodeMidIndex(int trans) { // first 2 bits of the 2-3-3 split return (trans >> 6) & 0x3; } - /** - * Given a transition, returns the corresponding index (within the mid cell) of the pointer to the tail cell of - * a split node. - */ - int splitNodeTailIndex(int trans) + /// Given a transition, returns the corresponding index (within the mid cell) of the pointer to the tail cell of + /// a split node. + static int splitNodeTailIndex(int trans) { // second 3 bits of the 2-3-3 split return (trans >> 3) & 0x7; } - /** - * Given a transition, returns the corresponding index (within the tail cell) of the pointer to the child of - * a split node. - */ - int splitNodeChildIndex(int trans) + /// Given a transition, returns the corresponding index (within the tail cell) of the pointer to the child of + /// a split node. 
+ static int splitNodeChildIndex(int trans) { // third 3 bits of the 2-3-3 split return trans & 0x7; } - /** - * Get the child for the given transition character, knowing that the node is split - */ + /// Get the child for the given transition character, knowing that the node is split int getSplitChild(int node, int trans) { int mid = getSplitCellPointer(node, splitNodeMidIndex(trans), SPLIT_START_LEVEL_LIMIT); @@ -468,9 +589,7 @@ int getSplitChild(int node, int trans) return getSplitCellPointer(tail, splitNodeChildIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); } - /** - * Get the content for a given node - */ + /// Get the content for a given node T getNodeContent(int node) { if (isLeaf(node)) @@ -485,6 +604,15 @@ T getNodeContent(int node) : null; } + int getAlternateBranch(int node) + { + if (isNullOrLeaf(node)) + return NONE; + if (offset(node) != PREFIX_OFFSET) + return NONE; + return getIntVolatile(node + PREFIX_ALTERNATE_OFFSET); + } + int splitCellPointerAddress(int node, int childIndex, int subLevelLimit) { return node - SPLIT_OFFSET + (8 - subLevelLimit + childIndex) * 4; @@ -495,18 +623,22 @@ int getSplitCellPointer(int node, int childIndex, int subLevelLimit) return getIntVolatile(splitCellPointerAddress(node, childIndex, subLevelLimit)); } - /** - * Backtracking state for a cursor. - * - * To avoid allocations and pointer-chasing, the backtracking data is stored in a simple int array with - * BACKTRACK_INTS_PER_ENTRY ints for each level. - */ + /// Backtracking state for a cursor. + /// + /// To avoid allocations and pointer-chasing, the backtracking data is stored in a simple int array with + /// `BACKTRACK_INTS_PER_ENTRY` ints for each level. 
private static class CursorBacktrackingState { static final int BACKTRACK_INTS_PER_ENTRY = 3; static final int BACKTRACK_INITIAL_SIZE = 16; - private int[] backtrack = new int[BACKTRACK_INITIAL_SIZE * BACKTRACK_INTS_PER_ENTRY]; - int backtrackDepth = 0; + private int[] backtrack; + int backtrackDepth; + + CursorBacktrackingState() + { + backtrack = new int[BACKTRACK_INITIAL_SIZE * BACKTRACK_INTS_PER_ENTRY]; + backtrackDepth = 0; + } void addBacktrack(int node, int data, int depth) { @@ -534,38 +666,57 @@ int depth(int backtrackDepth) } } - /* - * Cursor implementation. - * - * InMemoryTrie cursors maintain their backtracking state in CursorBacktrackingState where they store - * information about the node to backtrack to and the transitions still left to take or attempt. - * - * This information is different for the different types of node: - * - for leaf and chain no backtracking is saved (because we know there are no further transitions) - * - for sparse we store the remainder of the order word - * - for split we store one entry per sub-level of the 2-3-3 split - * - * When the cursor is asked to advance it first checks the current node for children, and if there aren't any - * (i.e. it is positioned on a leaf node), it goes one level up the backtracking chain, where we are guaranteed to - * have a remaining child to advance to. When there's nothing to backtrack to, the trie is exhausted. - */ - class InMemoryCursor extends CursorBacktrackingState implements Cursor + /// Cursor implementation. + /// + /// `InMemoryTrie` cursors maintain their backtracking state in [CursorBacktrackingState] where they store + /// information about the node to backtrack to and the transitions still left to take or attempt. 
+ /// + /// This information is different for the different types of node: + /// - for leaf and chain no backtracking is saved (because we know there are no further transitions) + /// - for sparse we store the remainder of the order word + /// - for split we store one entry per sub-level of the 2-3-3 split + /// + /// When the cursor is asked to advance it first checks the current node for children, and if there aren't any + /// (i.e. it is positioned on a leaf node), it goes one level up the backtracking chain, where we are guaranteed to + /// have a remaining child to advance to. When there's nothing to backtrack to, the trie is exhausted. + static class InMemoryCursor extends CursorBacktrackingState implements Cursor { - private int currentNode; - private int currentFullNode; - private int incomingTransition; - private T content; - private final Direction direction; - int depth = -1; - - InMemoryCursor(Direction direction) + static final long NOT_FOUND = Cursor.EXHAUSTED_POSITION_FORWARD; + + final InMemoryReadTrie trie; + int currentNode; + int currentFullNode; + private long currentPosition; + protected int depth; + protected T content; + final Direction direction; + final boolean presentContentOnDescentPath; + + InMemoryCursor(InMemoryReadTrie trie, Direction direction, int root) { + this(trie, direction, root, trie.presentContentOnDescentPath); + } + + /// A version of the constructor which allows the `presentContentOnDescentPath` flag to be overridden. + /// This is needed for [DeletionAwareTrie] which uses the same structure for both metadata-carrying data trie + /// where content must always be before branch and range tries where the position of content must be preserved. 
+ InMemoryCursor(InMemoryReadTrie trie, Direction direction, int root, boolean presentContentOnDescentPath) + { + this.trie = trie; this.direction = direction; - descendInto(root, -1); + this.presentContentOnDescentPath = presentContentOnDescentPath; + depth = 0; + currentPosition = Cursor.rootPosition(direction); + setCurrentNodeAndApplyPrefixes(root, 0, 0, true); } @Override - public int advance() + public long advance() + { + return doAdvance(); + } + + long doAdvance() { if (isNullOrLeaf(currentNode)) return backtrack(); @@ -574,15 +725,15 @@ public int advance() } @Override - public int advanceMultiple(TransitionsReceiver receiver) + public long advanceMultiple(TransitionsReceiver receiver) { int node = currentNode; if (!isChainNode(node)) - return advance(); + return doAdvance(); // Jump directly to the chain's child. - UnsafeBuffer buffer = getBuffer(node); - int inBufferNode = inBufferOffset(node); + UnsafeBuffer buffer = trie.getBuffer(node); + int inBufferNode = trie.inBufferOffset(node); int bytesJumped = chainCellLength(node) - 1; // leave the last byte for incomingTransition if (receiver != null && bytesJumped > 0) receiver.addPathBytes(buffer, inBufferNode, bytesJumped); @@ -598,18 +749,28 @@ public int advanceMultiple(TransitionsReceiver receiver) } @Override - public int skipTo(int skipDepth, int skipTransition) + public long skipTo(long encodedSkipPosition) { + int skipDepth = Cursor.depth(encodedSkipPosition); + int skipTransition = Cursor.incomingTransition(encodedSkipPosition); + boolean onReturnPath = Cursor.isOnReturnPath(encodedSkipPosition); if (skipDepth > depth) { // Descent requested. Jump to the given child transition or greater, and backtrack if there's no such. 
assert skipDepth == depth + 1; - int advancedDepth = advanceToChildWithTarget(currentNode, skipTransition); - if (advancedDepth < 0) + long advancedPosition = advanceToChildWithTarget(currentNode, skipTransition); + if (advancedPosition == NOT_FOUND) return backtrack(); - assert advancedDepth == skipDepth; - return advancedDepth; + if (onReturnPath && Cursor.compare(advancedPosition, encodedSkipPosition) < 0) + { + // Requested return path but we seeked to the forward. If there's an entry matching the request, + // it will be the immediate backtrack. If not, we still need to backtrack to find the next. + return backtrack(); + } + + assert Cursor.depth(advancedPosition) == skipDepth; + return advancedPosition; } // Backtrack until we reach the requested depth. Note that we may have more than one entry for a given @@ -623,18 +784,25 @@ public int skipTo(int skipDepth, int skipTransition) if (depth == skipDepth - 1) { - int advancedDepth = advanceToNextChildWithTarget(node(backtrackDepth), data(backtrackDepth), skipTransition); - if (advancedDepth >= 0) - return advancedDepth; + long advancedPosition = advanceToNextChildWithTarget(node(backtrackDepth), data(backtrackDepth), skipTransition); + if (advancedPosition != NOT_FOUND) + { + if (!onReturnPath || Cursor.compare(advancedPosition, encodedSkipPosition) >= 0) + return advancedPosition; + // We found an exact match that is not on the return path. A return path backtrack may have just + // been inserted. The next iteration of the loop should find it. + } + // Note that we can't directly do backtrack() because there may be further options to check on the + // same depth. 
} } return exhausted(); } @Override - public int depth() + public long encodedPosition() { - return depth; + return currentPosition; } @Override @@ -643,42 +811,30 @@ public T content() return content; } - @Override - public int incomingTransition() - { - return incomingTransition; - } - - @Override - public Direction direction() - { - return direction; - } - @Override public ByteComparable.Version byteComparableVersion() { - return byteComparableVersion; + return trie.byteComparableVersion; } @Override - public Trie tailTrie() + public Cursor tailCursor(Direction dir) { - assert depth >= 0 : "tailTrie called on exhausted cursor"; - return new InMemoryReadTrie<>(byteComparableVersion, buffers, contentArrays, currentFullNode); + assert !Cursor.isExhausted(currentPosition) : "tailCursor called on exhausted cursor"; + return new InMemoryCursor<>(trie, dir, currentFullNode, presentContentOnDescentPath); } - private int exhausted() + long exhausted() { - depth = -1; - incomingTransition = -1; + currentPosition = Cursor.exhaustedPosition(direction); currentFullNode = NONE; currentNode = NONE; content = null; - return -1; + depth = -1; + return currentPosition; } - private int backtrack() + private long backtrack() { if (--backtrackDepth < 0) return exhausted(); @@ -687,7 +843,7 @@ private int backtrack() return advanceToNextChild(node(backtrackDepth), data(backtrackDepth)); } - private int advanceToFirstChild(int node) + private long advanceToFirstChild(int node) { assert (!isNullOrLeaf(node)); @@ -702,10 +858,10 @@ private int advanceToFirstChild(int node) } } - private int advanceToChildWithTarget(int node, int skipTransition) + private long advanceToChildWithTarget(int node, int skipTransition) { if (isNullOrLeaf(node)) - return -1; + return NOT_FOUND; switch (offset(node)) { @@ -718,9 +874,11 @@ private int advanceToChildWithTarget(int node, int skipTransition) } } - private int advanceToNextChild(int node, int data) + long advanceToNextChild(int node, int data) { - 
assert (!isNullOrLeaf(node)); + assert (!isNull(node)); + if (isNullOrLeaf(node)) + return descendInto(node, data); switch (offset(node)) { @@ -733,9 +891,12 @@ private int advanceToNextChild(int node, int data) } } - private int advanceToNextChildWithTarget(int node, int data, int transition) + long advanceToNextChildWithTarget(int node, int data, int transition) { - assert (!isNullOrLeaf(node)); + assert (!isNull(node)); + if (isLeaf(node)) + return direction.le(transition, data) ? descendInto(node, data) + : NOT_FOUND; switch (offset(node)) { @@ -748,19 +909,17 @@ private int advanceToNextChildWithTarget(int node, int data, int transition) } } - /** - * Descend into the sub-levels of a split node. Advances to the first child and creates backtracking entries - * for the following ones. We use the bits of trans (lowest non-zero ones) to identify which sub-level an - * entry refers to. - * - * @param node The node or cell id, must have offset SPLIT_OFFSET. - * @param limit The transition limit for the current sub-level (4 for the start, 8 for the others). - * @param collected The transition bits collected from the parent chain (e.g. 0x40 after following 1 on the top - * sub-level). - * @param shift This level's bit shift (6 for start, 3 for mid and 0 for tail). - * @return the depth reached after descending. - */ - int descendInSplitSublevel(int node, int limit, int collected, int shift) + /// Descend into the sub-levels of a split node. Advances to the first child and creates backtracking entries + /// for the following ones. We use the bits of trans (lowest non-zero ones) to identify which sub-level an + /// entry refers to. + /// + /// @param node The node or cell id, must have offset `SPLIT_OFFSET`. + /// @param limit The transition limit for the current sub-level (4 for the start, 8 for the others). + /// @param collected The transition bits collected from the parent chain (e.g. 0x40 after following 1 on the top + /// sub-level). 
+ /// @param shift This level's bit shift (6 for start, 3 for mid and 0 for tail). + /// @return the encoded position reached after descending. + long descendInSplitSublevel(int node, int limit, int collected, int shift) { while (true) { @@ -772,7 +931,7 @@ int descendInSplitSublevel(int node, int limit, int collected, int shift) direction.inLoop(childIndex, 0, limit - 1); childIndex += direction.increase) { - child = getSplitCellPointer(node, childIndex, limit); + child = trie.getSplitCellPointer(node, childIndex, limit); if (!isNull(child)) break; } @@ -795,15 +954,13 @@ int descendInSplitSublevel(int node, int limit, int collected, int shift) } } - /** - * As above, but also makes sure that the descend selects a value at least as big as the given - * {@code minTransition}. - */ - private int descendInSplitSublevelWithTarget(int node, int limit, int collected, int shift, int minTransition) + /// As above, but also makes sure that the descent selects a value at least as big as the given + /// `minTransition`. + private long descendInSplitSublevelWithTarget(int node, int limit, int collected, int shift, int minTransition) { minTransition -= collected; if (minTransition >= limit << shift || minTransition < 0) - return -1; + return NOT_FOUND; while (true) { @@ -816,13 +973,13 @@ private int descendInSplitSublevelWithTarget(int node, int limit, int collected, direction.inLoop(childIndex, 0, limit - 1); childIndex += direction.increase) { - child = getSplitCellPointer(node, childIndex, limit); + child = trie.getSplitCellPointer(node, childIndex, limit); if (!isNull(child)) break; isExact = false; } if (!isExact && (childIndex == limit || childIndex == -1)) - return -1; + return NOT_FOUND; // look for any more valid transitions and add backtracking if found maybeAddSplitBacktrack(node, childIndex, limit, collected, shift); @@ -846,10 +1003,8 @@ private int descendInSplitSublevelWithTarget(int node, int limit, int collected, } } - /** - * Backtrack to a split sub-level. 
The level is identified by the lowest non-0 bits in data. - */ - int nextValidSplitTransition(int node, int data) + /// Backtrack to a split sub-level. The level is identified by the lowest non-0 bits in data. + long nextValidSplitTransition(int node, int data) { // Note: This is equivalent to return advanceToSplitTransition(node, data, data) but quicker. assert data >= 0 && data <= 0xFF; @@ -861,7 +1016,7 @@ int nextValidSplitTransition(int node, int data) SPLIT_OTHER_LEVEL_LIMIT, data & -(1 << (SPLIT_LEVEL_SHIFT * 1)), SPLIT_LEVEL_SHIFT * 0); - int child = getSplitCellPointer(node, childIndex, SPLIT_OTHER_LEVEL_LIMIT); + int child = trie.getSplitCellPointer(node, childIndex, SPLIT_OTHER_LEVEL_LIMIT); return descendInto(child, data); } int tailIndex = splitNodeTailIndex(data); @@ -872,7 +1027,7 @@ int nextValidSplitTransition(int node, int data) SPLIT_OTHER_LEVEL_LIMIT, data & -(1 << (SPLIT_LEVEL_SHIFT * 2)), SPLIT_LEVEL_SHIFT * 1); - int tail = getSplitCellPointer(node, tailIndex, SPLIT_OTHER_LEVEL_LIMIT); + int tail = trie.getSplitCellPointer(node, tailIndex, SPLIT_OTHER_LEVEL_LIMIT); return descendInSplitSublevel(tail, SPLIT_OTHER_LEVEL_LIMIT, data & -(1 << SPLIT_LEVEL_SHIFT * 1), @@ -885,18 +1040,16 @@ int nextValidSplitTransition(int node, int data) SPLIT_START_LEVEL_LIMIT, 0, SPLIT_LEVEL_SHIFT * 2); - int mid = getSplitCellPointer(node, midIndex, SPLIT_START_LEVEL_LIMIT); + int mid = trie.getSplitCellPointer(node, midIndex, SPLIT_START_LEVEL_LIMIT); return descendInSplitSublevel(mid, SPLIT_OTHER_LEVEL_LIMIT, data & -(1 << SPLIT_LEVEL_SHIFT * 2), SPLIT_LEVEL_SHIFT * 1); } - /** - * Backtrack to a split sub-level and advance to given transition if it fits within the sublevel. - * The level is identified by the lowest non-0 bits in data as above. - */ - private int advanceToSplitTransition(int node, int data, int skipTransition) + /// Backtrack to a split sub-level and advance to given transition if it fits within the sublevel. 
+ /// The level is identified by the lowest non-0 bits in data as above. + private long advanceToSplitTransition(int node, int data, int skipTransition) { assert data >= 0 && data <= 0xFF; if (direction.lt(skipTransition, data)) @@ -924,9 +1077,7 @@ private int advanceToSplitTransition(int node, int data, int skipTransition) return descendInSplitSublevelWithTarget(node, sublevelLimit, data & sublevelMask, sublevelShift, skipTransition); } - /** - * Look for any further non-null transitions on this sub-level and, if found, add a backtracking entry. - */ + /// Look for any further non-null transitions on this sub-level and, if found, add a backtracking entry. private void maybeAddSplitBacktrack(int node, int startAfter, int limit, int collected, int shift) { int nextChildIndex; @@ -934,7 +1085,7 @@ private void maybeAddSplitBacktrack(int node, int startAfter, int limit, int col direction.inLoop(nextChildIndex, 0, limit - 1); nextChildIndex += direction.increase) { - if (!isNull(getSplitCellPointer(node, nextChildIndex, limit))) + if (!isNull(trie.getSplitCellPointer(node, nextChildIndex, limit))) break; } if (direction.inLoop(nextChildIndex, 0, limit - 1)) @@ -950,14 +1101,14 @@ private void maybeAddSplitBacktrack(int node, int startAfter, int limit, int col } - private int nextValidSparseTransition(int node, int data) + private long nextValidSparseTransition(int node, int data) { // Peel off the next index. int index = data % SPARSE_CHILD_COUNT; data = data / SPARSE_CHILD_COUNT; - UnsafeBuffer buffer = getBuffer(node); - int inBufferNode = inBufferOffset(node); + UnsafeBuffer buffer = trie.getBuffer(node); + int inBufferNode = trie.inBufferOffset(node); // If there are remaining transitions, add backtracking entry. if (data != exhaustedOrderWord()) @@ -969,13 +1120,11 @@ private int nextValidSparseTransition(int node, int data) return descendInto(child, transition); } - /** - * Prepare the sparse node order word for iteration. 
For forward iteration, this means just reading it. - * For reverse, we also invert the data so that the peeling code above still works. - */ + /// Prepare the sparse node order word for iteration. For forward iteration, this means just reading it. + /// For reverse, we also invert the data so that the peeling code above still works. int prepareOrderWord(int node) { - int fwdState = getUnsignedShortVolatile(node + SPARSE_ORDER_OFFSET); + int fwdState = trie.getUnsignedShortVolatile(node + SPARSE_ORDER_OFFSET); if (direction.isForward()) return fwdState; else @@ -1008,18 +1157,16 @@ int prepareOrderWord(int node) } } - /** - * Returns the state which marks the exhaustion of the order word. - */ + /// Returns the state which marks the exhaustion of the order word. int exhaustedOrderWord() { return direction.select(0, 1); } - private int advanceToSparseTransition(int node, int data, int skipTransition) + private long advanceToSparseTransition(int node, int data, int skipTransition) { - UnsafeBuffer buffer = getBuffer(node); - int inBufferNode = inBufferOffset(node); + UnsafeBuffer buffer = trie.getBuffer(node); + int inBufferNode = trie.inBufferOffset(node); int index; int transition; do @@ -1031,7 +1178,7 @@ private int advanceToSparseTransition(int node, int data, int skipTransition) } while (direction.lt(transition, skipTransition) && data != exhaustedOrderWord()); if (direction.lt(transition, skipTransition)) - return -1; + return NOT_FOUND; // If there are remaining transitions, add backtracking entry. if (data != exhaustedOrderWord()) @@ -1042,11 +1189,11 @@ private int advanceToSparseTransition(int node, int data, int skipTransition) return descendInto(child, transition); } - private int getChainTransition(int node) + private long getChainTransition(int node) { // No backtracking needed. 
- UnsafeBuffer buffer = getBuffer(node); - int inBufferNode = inBufferOffset(node); + UnsafeBuffer buffer = trie.getBuffer(node); + int inBufferNode = trie.inBufferOffset(node); int transition = buffer.getByte(inBufferNode) & 0xFF; int next = node + 1; if (offset(next) <= CHAIN_MAX_OFFSET) @@ -1055,14 +1202,14 @@ private int getChainTransition(int node) return descendInto(buffer.getIntVolatile(inBufferNode + 1), transition); } - private int advanceToChainTransition(int node, int skipTransition) + private long advanceToChainTransition(int node, int skipTransition) { // No backtracking needed. - UnsafeBuffer buffer = getBuffer(node); - int inBufferNode = inBufferOffset(node); + UnsafeBuffer buffer = trie.getBuffer(node); + int inBufferNode = trie.inBufferOffset(node); int transition = buffer.getByte(inBufferNode) & 0xFF; if (direction.gt(skipTransition, transition)) - return -1; + return NOT_FOUND; int next = node + 1; if (offset(next) <= CHAIN_MAX_OFFSET) @@ -1071,46 +1218,113 @@ private int advanceToChainTransition(int node, int skipTransition) return descendInto(buffer.getIntVolatile(inBufferNode + 1), transition); } - int descendInto(int child, int transition) + + void setCurrentNodeAndApplyPrefixes(int node, int depth, int transition, boolean isInitialState) { - ++depth; - incomingTransition = transition; - content = getNodeContent(child); - currentFullNode = child; - currentNode = followContentTransition(child); - return depth; + currentFullNode = node; + if (isLeaf(node)) + { + if (shouldPresentOnTheReturnPath(node)) + { + if (isInitialState) + { + // We are just starting, we need to present the forward-direction root first. + addBacktrack(node, transition, depth - 1); + content = null; + } + else + { + // There's no reason to delay going to the position of the content. 
+ currentPosition |= ON_RETURN_PATH_BIT; + content = trie.getContent(node); + } + } + else + content = trie.getContent(node); + + currentNode = NONE; + } + else if (offset(node) == PREFIX_OFFSET) + { + content = processPrefix(node, depth, transition); + currentNode = trie.getChildOfPrefixNode(node); + } + else + { + content = null; + currentNode = node; + } + } + + /// Get the content from a prefix node and/or put a backtracking entry for return path data. + /// Overridden by range tries, where both descent and ascent path content can be present. + /// + /// @return the descent path content, if present + T processPrefix(int node, int depth, int transition) + { + return processPrefixEntry(node, depth, transition, PREFIX_CONTENT_OFFSET); + } + + T processPrefixEntry(int node, int depth, int transition, int contentPointerOffset) + { + int child = trie.getIntVolatile(node + contentPointerOffset); + if (isNull(child)) + return null; + + assert isLeaf(child); + if (!shouldPresentOnTheReturnPath(child)) + return trie.getContent(child); + + // this content needs to be presented on the return path + addBacktrack(child, transition, depth - 1); + return null; } - int descendIntoChain(int child, int transition) + protected boolean shouldPresentOnTheReturnPath(int node) + { + // Tries where content in prefix nodes is treated as metadata relevant to the branch should present it + // before children in both directions. + if (presentContentOnDescentPath) + return false; + // Otherwise obey the before/after position that was given when the data was added (range and ordered tries). 
+ else + return trie.shouldPresentAfterBranch(node) == direction.isForward(); + } + + long descendInto(int child, int transition) { ++depth; - incomingTransition = transition; - content = null; - currentFullNode = child; - currentNode = child; - return depth; + currentPosition = Cursor.encode(depth, transition, direction); + setCurrentNodeAndApplyPrefixes(child, depth, transition, false); + return currentPosition; } - } - private boolean isChainNode(int node) - { - return !isNullOrLeaf(node) && offset(node) <= CHAIN_MAX_OFFSET; + long descendIntoChain(int child, int transition) + { + return setNodeState(Cursor.encode(++depth, transition, direction), null, child, child); + } + + long setNodeState(long nextPosition, T nodeContent, int fullNode, int node) + { + currentPosition = nextPosition; + content = nodeContent; + currentFullNode = fullNode; + currentNode = node; + return nextPosition; + } } - public InMemoryCursor cursor(Direction direction) + static boolean isChainNode(int node) { - return new InMemoryCursor(direction); + return !isNullOrLeaf(node) && offset(node) <= CHAIN_MAX_OFFSET; } /* Direct read methods */ - /** - * Get the content mapped by the specified key. - * Fast implementation using integer node addresses. - */ - @Override + /// Get the content mapped by the specified key. + /// Fast implementation using integer node addresses. public T get(ByteComparable path) { int n = root; @@ -1137,144 +1351,168 @@ public ByteComparable.Version byteComparableVersion() return byteComparableVersion; } - /** - * Override of dump to provide more detailed printout that includes the type of each node in the trie. - * We do this via a wrapping cursor that returns a content string for the type of node for every node we return. 
- */ - @Override - public String dump(Function contentToString) + abstract InMemoryCursor makeCursor(Direction direction); + + /// Dump cursor, augmented to show the type of node + class DumpCursor> implements Cursor { - InMemoryCursor source = cursor(Direction.FORWARD); - class TypedNodesCursor implements Cursor + final C source; + private final Function contentToString; + + DumpCursor(C source, Function contentToString) { - @Override - public int advance() - { - return source.advance(); - } + this.source = source; + this.contentToString = contentToString; + } + + @Override + public long advance() + { + return source.advance(); + } + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + return source.advanceMultiple(receiver); + } - @Override - public int advanceMultiple(TransitionsReceiver receiver) - { - return source.advanceMultiple(receiver); - } + @Override + public long skipTo(long encodedSkipPosition) + { + return source.skipTo(encodedSkipPosition); + } - @Override - public int skipTo(int skipDepth, int skipTransition) - { - return source.skipTo(skipDepth, skipTransition); - } + @Override + public long encodedPosition() + { + return source.encodedPosition(); + } - @Override - public int depth() - { - return source.depth(); - } + @Override + public ByteComparable.Version byteComparableVersion() + { + return source.byteComparableVersion(); + } - @Override - public int incomingTransition() - { - return source.incomingTransition(); - } + @Override + public DumpCursor tailCursor(Direction direction) + { + throw new AssertionError(); + } - @Override - public Direction direction() + @Override + public String content() + { + String type = null; + int node = source.currentNode; + if (!isNullOrLeaf(node)) { - return source.direction(); + switch (offset(node)) + { + case SPARSE_OFFSET: + type = String.format("[SPARSE@%x]", node); + break; + case SPLIT_OFFSET: + type = String.format("[SPLIT@%x]", node); + break; + case PREFIX_OFFSET: + throw new 
AssertionError("Unexpected prefix as cursor currentNode."); + default: + type = String.format("[CHAIN@%x]", node); + break; + } } - @Override - public ByteComparable.Version byteComparableVersion() + T content = source.content(); + if (content != null) { - return source.byteComparableVersion(); + if (type != null) + return contentToString.apply(content) + " -> " + type; + else + return contentToString.apply(content); } + else + return type; + } + } - @Override - public Trie tailTrie() - { - throw new AssertionError(); - } + /// Override of dump to provide more detailed printout that includes the type of each node in the trie. + /// We do this via a wrapping cursor that returns a content string for the type of node for every node we return. + public String dump(Function contentToString) + { + return new DumpCursor<>(makeCursor(Direction.FORWARD), contentToString).process(new TrieDumper.Plain<>(Function.identity())); + } - @Override - public String content() + private void dumpSplitNode(int node, int level, StringBuilder builder) + { + int limit = level == 0 ? 
SPLIT_START_LEVEL_LIMIT : SPLIT_OTHER_LEVEL_LIMIT; + for (int i = 0; i < limit; ++i) + { + int child = getIntVolatile(node - (limit - 1 - i) * 4); + if (child != NONE) { - String type = null; - int node = source.currentNode; - if (!isNullOrLeaf(node)) - { - switch (offset(node)) - { - case SPARSE_OFFSET: - type = "[SPARSE]"; - break; - case SPLIT_OFFSET: - type = "[SPLIT]"; - break; - case PREFIX_OFFSET: - throw new AssertionError("Unexpected prefix as cursor currentNode."); - default: - type = "[CHAIN]"; - break; - } - } - T content = source.content(); - if (content != null) - { - if (type != null) - return contentToString.apply(content) + " -> " + type; - else - return contentToString.apply(content); - } - else - return type; + builder.append('\n'); + for (int ind = 0; ind < level; ++ind) + builder.append(" "); + builder.append(Integer.toBinaryString(i | 8).substring(1)) // or and substring implement %03b + .append(" -> "); + builder.append(dumpChild(child)); + if (level < 2) + dumpSplitNode(child, level + 1, builder); } } - return process(new TrieDumper<>(Function.identity()), new TypedNodesCursor()); } - /** - * For use in debugging, dump info about the given node. - */ - @SuppressWarnings("unused") - String dumpNode(int node) + String dumpChild(int node) + { + if (isNullOrLeaf(node)) + return dumpLeafOrNull(node); + else + return Integer.toString(node, 16); + } + + private String dumpLeafOrNull(int node) { if (isNull(node)) return "NONE"; - else if (isLeaf(node)) - return "~" + (~node); + String contentAsText = contentManager.dumpContentId(node); + int cell = contentManager.cellUsedIfAny(node); + return cell < 0 ? contentAsText : "[@" + Integer.toString(cell, 16) + "] " + contentAsText; + } + + /// For use in debugging, dump info about the given node. 
+ @SuppressWarnings("unused") + String dumpNode(int node) + { + if (isNullOrLeaf(node)) + return dumpLeafOrNull(node); else { StringBuilder builder = new StringBuilder(); - builder.append(node + " "); + builder.append(Integer.toString(node, 16)).append(' '); switch (offset(node)) { case SPARSE_OFFSET: { - builder.append("Sparse: "); + builder.append("Sparse (Order ") + .append(Integer.toString(getUnsignedShortVolatile(node + SPARSE_ORDER_OFFSET), 6)) + .append("):\n"); for (int i = 0; i < SPARSE_CHILD_COUNT; ++i) { int child = getIntVolatile(node + SPARSE_CHILDREN_OFFSET + i * 4); if (child != NONE) builder.append(String.format("%02x", getUnsignedByte(node + SPARSE_BYTES_OFFSET + i))) .append(" -> ") - .append(child) + .append(dumpChild(child)) .append('\n'); } break; } case SPLIT_OFFSET: { - builder.append("Split: "); - for (int i = 0; i < SPLIT_START_LEVEL_LIMIT; ++i) - { - int child = getIntVolatile(node - (SPLIT_START_LEVEL_LIMIT - 1 - i) * 4); - if (child != NONE) - builder.append(Integer.toBinaryString(i)) - .append(" -> ") - .append(child) - .append('\n'); - } + builder.append("Split:"); + dumpSplitNode(node, 0, builder); break; } case PREFIX_OFFSET: @@ -1282,19 +1520,22 @@ else if (isLeaf(node)) builder.append("Prefix: "); int flags = getUnsignedByte(node + PREFIX_FLAGS_OFFSET); final int content = getIntVolatile(node + PREFIX_CONTENT_OFFSET); - builder.append(content < 0 ? 
"~" + (~content) : "" + content); - int child = followContentTransition(node); + final int alternate = getIntVolatile(node + PREFIX_ALTERNATE_OFFSET); + builder.append(dumpChild(content)); + if (alternate != NONE) + builder.append(" alt: ").append(dumpChild(alternate)); + int child = followPrefixTransition(node); builder.append(" -> ") - .append(child); + .append(dumpChild(child)); break; } default: { - builder.append("Chain: "); + builder.append("Chain:\n"); for (int i = 0; i < chainCellLength(node); ++i) builder.append(String.format("%02x", getUnsignedByte(node + i))); builder.append(" -> ") - .append(getIntVolatile(node + chainCellLength(node))); + .append(dumpChild(getIntVolatile(node + chainCellLength(node)))); break; } } @@ -1302,3 +1543,4 @@ else if (isLeaf(node)) } } } + diff --git a/src/java/org/apache/cassandra/db/tries/InMemoryTrie.java b/src/java/org/apache/cassandra/db/tries/InMemoryTrie.java index 7b7ac064418e..7aef28daa077 100644 --- a/src/java/org/apache/cassandra/db/tries/InMemoryTrie.java +++ b/src/java/org/apache/cassandra/db/tries/InMemoryTrie.java @@ -17,1541 +17,455 @@ */ package org.apache.cassandra.db.tries; -import java.nio.ByteBuffer; -import java.util.Arrays; -import java.util.concurrent.atomic.AtomicReferenceArray; - -import javax.annotation.Nonnull; +import java.util.function.Predicate; -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; import com.google.common.base.Predicates; -import java.util.function.Predicate; - -import org.agrona.concurrent.UnsafeBuffer; -import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.io.compress.BufferType; -import org.apache.cassandra.io.util.FileUtils; -import org.apache.cassandra.utils.bytecomparable.ByteSource; -import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; import 
org.apache.cassandra.utils.concurrent.OpOrder; -import static org.github.jamm.MemoryMeterStrategy.MEMORY_LAYOUT; - -/** - * In-memory trie built for fast modification and reads executing concurrently with writes from a single mutator thread. - *

- * The main method for performing writes is {@link #apply(Trie, UpsertTransformer, Predicate)} which takes a trie as - * an argument and merges it into the current trie using the methods supplied by the given {@link UpsertTransformer}, - * force copying anything below the points where the third argument returns true. - *

- * The predicate can be used to implement several forms of atomicity and consistency guarantees: - * - *

  • if the predicate is {@code nf -> false}, neither atomicity nor sequential consistency is guaranteed - readers - * can see any mixture of old and modified content - *
  • if the predicate is {@code nf -> true}, full sequential consistency will be provided, i.e. if a reader sees any - * part of a modification, it will see all of it, and all the results of all previous modifications - *
  • if the predicate is {@code nf -> nf.isBranching()} the write will be atomic, i.e. either none or all of the - * content of the merged trie will be visible by concurrent readers, but not sequentially consistent, i.e. there - * may be writes that are not visible to a reader even when they precede writes that are visible. - *
  • if the predicate is {@code nf -> (nf.content())} the write will be consistent below the identified - * point (used e.g. by Memtable to ensure partition-level consistency) - * - *

    - * Additionally, the class provides several simpler write methods for efficiency and convenience: - * - *

  • {@link #putRecursive(ByteComparable, Object, UpsertTransformer)} inserts a single value using a recursive walk. - * It cannot provide consistency (single-path writes are always atomic). This is more efficient as it stores the - * walk state in the stack rather than on the heap but can cause a {@code StackOverflowException}. - *
  • {@link #putSingleton(ByteComparable, Object, UpsertTransformer)} is a non-recursive version of the above, using - * the {@code apply} machinery. - *
  • {@link #putSingleton(ByteComparable, Object, UpsertTransformer, boolean)} uses the fourth argument to choose - * between the two methods above, where some external property can be used to decide if the keys are short enough - * to permit recursive execution. - * - *

    - * Because it uses 32-bit pointers in byte buffers, this trie has a fixed size limit of 2GB. - */ -public class InMemoryTrie extends InMemoryReadTrie +/// In-memory trie built for fast modification and reads executing concurrently with writes from a single mutator thread. +/// +/// The main method for performing writes is [#apply(Trie,UpsertTransformer,Predicate)] which takes a trie as +/// an argument and merges it into the current trie using the methods supplied by the given [UpsertTransformer], +/// force copying anything below the points where the third argument returns true. +/// +/// +/// The predicate can be used to implement several forms of atomicity and consistency guarantees: +/// - if the predicate is `nf -> false`, neither atomicity nor sequential consistency is guaranteed - readers +/// can see any mixture of old and modified content +/// - if the predicate is `nf -> true`, full sequential consistency will be provided, i.e. if a reader sees any +/// part of a modification, it will see all of it, and all the results of all previous modifications +/// - if the predicate is `nf -> nf.isBranching()` the write will be atomic, i.e. either none or all of the +/// content of the merged trie will be visible by concurrent readers, but not sequentially consistent, i.e. there +/// may be writes that are not visible to a reader even when they precede writes that are visible. +/// - if the predicate is `nf -> (nf.content())` the write will be consistent below the identified +/// point (used e.g. by Memtable to ensure partition-level consistency) +/// +/// +/// Additionally, the class provides several simpler write methods for efficiency and convenience: +/// - [#putRecursive(ByteComparable,Object,UpsertTransformer)] inserts a single value using a recursive walk. +/// It cannot provide consistency (single-path writes are always atomic). 
This is more efficient as it stores the +/// walk state in the stack rather than on the heap but can cause a `StackOverflowError`. +/// - [#putSingleton(ByteComparable,Object,UpsertTransformer)] is a non-recursive version of the above, using +/// the `apply` machinery. +/// - [#putSingleton(ByteComparable,Object,UpsertTransformer,boolean)] uses the fourth argument to choose +/// between the two methods above, where some external property can be used to decide if the keys are short enough +/// to permit recursive execution. +/// +/// Because it uses 32-bit pointers in byte buffers, this trie has a fixed size limit of 2GB. +public class InMemoryTrie extends InMemoryBaseTrie implements Trie { - // See the trie format description in InMemoryReadTrie. - - /** - * Trie size limit. This is not enforced, but users must check from time to time that it is not exceeded (using - * {@link #reachedAllocatedSizeThreshold()}) and start switching to a new trie if it is. - * This must be done to avoid tries growing beyond their hard 2GB size limit (due to the 32-bit pointers). - */ - @VisibleForTesting - static final int ALLOCATED_SIZE_THRESHOLD; - static - { - // Default threshold + 10% == 2 GB. This should give the owner enough time to react to the - // {@link #reachedAllocatedSizeThreshold()} signal and switch this trie out before it fills up. 
- int limitInMB = CassandraRelevantProperties.MEMTABLE_TRIE_SIZE_LIMIT.getInt(2048 * 10 / 11); - if (limitInMB < 1 || limitInMB > 2047) - throw new AssertionError(CassandraRelevantProperties.MEMTABLE_TRIE_SIZE_LIMIT.getKey() + - " must be within 1 and 2047"); - ALLOCATED_SIZE_THRESHOLD = 1024 * 1024 * limitInMB; - } - - private int allocatedPos = 0; - private int contentCount = 0; - - final BufferType bufferType; // on or off heap - final MemoryAllocationStrategy cellAllocator; - final MemoryAllocationStrategy objectAllocator; - - // constants for space calculations private static final long EMPTY_SIZE_ON_HEAP; private static final long EMPTY_SIZE_OFF_HEAP; - private static final long REFERENCE_ARRAY_ON_HEAP_SIZE = ObjectSizes.measureDeep(new AtomicReferenceArray<>(0)); - static { // Measuring the empty size of long-lived tries, because these are the ones for which we want to track size. - InMemoryTrie empty = new InMemoryTrie<>(ByteComparable.Version.OSS50, BufferType.ON_HEAP, ExpectedLifetime.LONG, null); + InMemoryBaseTrie empty = new InMemoryTrie<>(ByteComparable.Version.OSS50, BufferType.ON_HEAP, ExpectedLifetime.LONG, null, true); EMPTY_SIZE_ON_HEAP = ObjectSizes.measureDeep(empty); - empty = new InMemoryTrie<>(ByteComparable.Version.OSS50, BufferType.OFF_HEAP, ExpectedLifetime.LONG, null); + empty = new InMemoryTrie<>(ByteComparable.Version.OSS50, BufferType.OFF_HEAP, ExpectedLifetime.LONG, null, true); EMPTY_SIZE_OFF_HEAP = ObjectSizes.measureDeep(empty); } - enum ExpectedLifetime + InMemoryTrie(ByteComparable.Version byteComparableVersion, BufferType bufferType, ExpectedLifetime lifetime, OpOrder opOrder, boolean presentContentOnDescentPath) { - SHORT, LONG + super(byteComparableVersion, presentContentOnDescentPath, bufferType, lifetime, opOrder); } - InMemoryTrie(ByteComparable.Version byteComparableVersion, BufferType bufferType, ExpectedLifetime lifetime, OpOrder opOrder) + InMemoryTrie(ByteComparable.Version byteComparableVersion, + BufferType 
bufferType, + ExpectedLifetime lifetime, + OpOrder opOrder, + boolean presentContentOnDescentPath, + Predicate shouldPreserveWithoutChildren) { - super(byteComparableVersion, - new UnsafeBuffer[31 - BUF_START_SHIFT], // last one is 1G for a total of ~2G bytes - new AtomicReferenceArray[29 - CONTENTS_START_SHIFT], // takes at least 4 bytes to write pointer to one content -> 4 times smaller than buffers - NONE); - this.bufferType = bufferType; + super(byteComparableVersion, presentContentOnDescentPath, shouldPreserveWithoutChildren, bufferType, lifetime, opOrder); + } - switch (lifetime) - { - case SHORT: - cellAllocator = new MemoryAllocationStrategy.NoReuseStrategy(new MemoryAllocationStrategy.Allocator() - { - @Override - public int allocate() throws TrieSpaceExhaustedException - { - return allocateNewCell(); - } - }); - objectAllocator = new MemoryAllocationStrategy.NoReuseStrategy(new MemoryAllocationStrategy.Allocator() - { - @Override - public int allocate() - { - return allocateNewObject(); - } - }); - break; - case LONG: - cellAllocator = new MemoryAllocationStrategy.OpOrderReuseStrategy(new MemoryAllocationStrategy.Allocator() - { - @Override - public int allocate() throws TrieSpaceExhaustedException - { - return allocateNewCell(); - } - }, opOrder); - objectAllocator = new MemoryAllocationStrategy.OpOrderReuseStrategy(new MemoryAllocationStrategy.Allocator() - { - @Override - public int allocate() - { - return allocateNewObject(); - } - }, opOrder); - break; - default: - throw new AssertionError(); - } + InMemoryTrie(ByteComparable.Version byteComparableVersion, boolean presentContentOnDescentPath, BufferManager bufferManager, ContentManager contentManager) + { + super(byteComparableVersion, presentContentOnDescentPath, bufferManager, contentManager); } + /// Short-lived tries do not try to recycle and reuse cells and content slots of the trie that are no longer in use + /// and are expected to be recycled as a whole after the user is done with them. 
public static InMemoryTrie shortLived(ByteComparable.Version byteComparableVersion) { - return new InMemoryTrie<>(byteComparableVersion, BufferType.ON_HEAP, ExpectedLifetime.SHORT, null); + return shortLived(byteComparableVersion, BufferType.ON_HEAP); } + /// Short-lived tries do not try to recycle and reuse cells and content slots of the trie that are no longer in use + /// and are expected to be recycled as a whole after the user is done with them. public static InMemoryTrie shortLived(ByteComparable.Version byteComparableVersion, BufferType bufferType) { - return new InMemoryTrie<>(byteComparableVersion, bufferType, ExpectedLifetime.SHORT, null); + return new InMemoryTrie<>(byteComparableVersion, bufferType, ExpectedLifetime.SHORT, null, true); } + /// Long-lived tries are expected to stay around for a long time and will try to minimize the space wasted to data + /// or structure that is no longer referenced. To do this they need a signal that lets them know if all readers + /// started before a given point in time have completed work, given by the `opOrder` parameter. public static InMemoryTrie longLived(ByteComparable.Version byteComparableVersion, OpOrder opOrder) { return longLived(byteComparableVersion, BufferType.OFF_HEAP, opOrder); } + /// Long-lived tries are expected to stay around for a long time and will try to minimize the space wasted to data + /// or structure that is no longer referenced. To do this they need a signal that lets them know if all readers + /// started before a given point in time have completed work, given by the `opOrder` parameter. 
public static InMemoryTrie longLived(ByteComparable.Version byteComparableVersion, BufferType bufferType, OpOrder opOrder) { - return new InMemoryTrie<>(byteComparableVersion, bufferType, ExpectedLifetime.LONG, opOrder); - } - - - // Buffer, content list and cell management - - private void putInt(int pos, int value) - { - getBuffer(pos).putInt(inBufferOffset(pos), value); - } - - private void putIntVolatile(int pos, int value) - { - getBuffer(pos).putIntVolatile(inBufferOffset(pos), value); - } - - private void putShort(int pos, short value) - { - getBuffer(pos).putShort(inBufferOffset(pos), value); - } - - private void putShortVolatile(int pos, short value) - { - getBuffer(pos).putShort(inBufferOffset(pos), value); - } - - private void putByte(int pos, byte value) - { - getBuffer(pos).putByte(inBufferOffset(pos), value); - } - - /** - * Allocate a new cell in the data buffers. This is called by the memory allocation strategy when it runs out of - * free cells to reuse. - */ - private int allocateNewCell() throws TrieSpaceExhaustedException - { - // Note: If this method is modified, please run InMemoryTrieTest.testOver1GSize to verify it acts correctly - // close to the 2G limit. - int v = allocatedPos; - if (inBufferOffset(v) == 0) - { - int leadBit = getBufferIdx(v, BUF_START_SHIFT, BUF_START_SIZE); - if (leadBit + BUF_START_SHIFT == 31) - throw new TrieSpaceExhaustedException(); - - ByteBuffer newBuffer = bufferType.allocate(BUF_START_SIZE << leadBit); - buffers[leadBit] = new UnsafeBuffer(newBuffer); - // Note: Since we are not moving existing data to a new buffer, we are okay with no happens-before enforcing - // writes. Any reader that sees a pointer in the new buffer may only do so after reading the volatile write - // that attached the new path. - } - - allocatedPos += CELL_SIZE; - return v; - } - - /** - * Allocate a cell to use for storing data. 
This uses the memory allocation strategy to reuse cells if any are - * available, or to allocate new cells using {@link #allocateNewCell}. Because some node types rely on cells being - * filled with 0 as initial state, any cell we get through the allocator must also be cleaned. - */ - private int allocateCell() throws TrieSpaceExhaustedException - { - int cell = cellAllocator.allocate(); - getBuffer(cell).setMemory(inBufferOffset(cell), CELL_SIZE, (byte) 0); - return cell; - } - - private void recycleCell(int cell) - { - cellAllocator.recycle(cell & -CELL_SIZE); + return new InMemoryTrie<>(byteComparableVersion, bufferType, ExpectedLifetime.LONG, opOrder, true); } - /** - * Creates a copy of a given cell and marks the original for recycling. Used when a mutation needs to force-copy - * paths to ensure earlier states are still available for concurrent readers. - */ - private int copyCell(int cell) throws TrieSpaceExhaustedException + /// Long-lived tries are expected to stay around for a long time and will try to minimize the space wasted to data + /// or structure that is no longer referenced. To do this they need a signal that lets them know if all readers + /// started before a given point in time have completed work, given by the `opOrder` parameter. 
+ public static InMemoryTrie longLived(ByteComparable.Version byteComparableVersion, BufferType bufferType, OpOrder opOrder, ContentSerializer contentSerializer) { - int copy = cellAllocator.allocate(); - getBuffer(copy).putBytes(inBufferOffset(copy), getBuffer(cell), inBufferOffset(cell & -CELL_SIZE), CELL_SIZE); - recycleCell(cell); - return copy | (cell & (CELL_SIZE - 1)); + BufferManagerMultibuf bufferManager = new BufferManagerMultibuf(bufferType, ExpectedLifetime.LONG, opOrder); + ContentManager contentManager = new ContentManagerBytes<>(contentSerializer, bufferManager); + return new InMemoryTrie<>(byteComparableVersion, true, bufferManager, contentManager); } - /** - * Allocate a new position in the object array. Used by the memory allocation strategy to allocate a content spot - * when it runs out of recycled positions. - */ - private int allocateNewObject() + /// Creates a short-lived "ordered" in-memory trie, i.e. where reverse iteration presents content on the ascent + /// path so that it can be correctly lexicographically ordered with any keys for which it is a prefix. + /// + /// Short-lived tries do not try to recycle and reuse cells and content slots of the trie that are no longer in use + /// and are expected to be recycled as a whole after the user is done with them. + public static InMemoryTrie shortLivedOrdered(ByteComparable.Version byteComparableVersion) { - int index = contentCount++; - int leadBit = getBufferIdx(index, CONTENTS_START_SHIFT, CONTENTS_START_SIZE); - AtomicReferenceArray array = contentArrays[leadBit]; - if (array == null) - { - assert inBufferOffset(index, leadBit, CONTENTS_START_SIZE) == 0 : "Error in content arrays configuration."; - contentArrays[leadBit] = new AtomicReferenceArray<>(CONTENTS_START_SIZE << leadBit); - } - return index; + return shortLivedOrdered(byteComparableVersion, BufferType.ON_HEAP); } - - /** - * Add a new content value. 
- * - * @return A content id that can be used to reference the content, encoded as ~index where index is the - * position of the value in the content array. - */ - private int addContent(@Nonnull T value) throws TrieSpaceExhaustedException + /// Creates a short-lived "ordered" in-memory trie, i.e. where reverse iteration presents content on the ascent + /// path so that it can be correctly lexicographically ordered with any keys for which it is a prefix. + /// + /// Short-lived tries do not try to recycle and reuse cells and content slots of the trie that are no longer in use + /// and are expected to be recycled as a whole after the user is done with them. + public static InMemoryTrie shortLivedOrdered(ByteComparable.Version byteComparableVersion, BufferType bufferType) { - Preconditions.checkNotNull(value, "Content value cannot be null"); - int index = objectAllocator.allocate(); - int leadBit = getBufferIdx(index, CONTENTS_START_SHIFT, CONTENTS_START_SIZE); - int ofs = inBufferOffset(index, leadBit, CONTENTS_START_SIZE); - AtomicReferenceArray array = contentArrays[leadBit]; - // no need for a volatile set here; at this point the item is not referenced - // by any node in the trie, and a volatile set will be made to reference it. - array.setPlain(ofs, value); - return ~index; + return new InMemoryTrie<>(byteComparableVersion, bufferType, ExpectedLifetime.SHORT, null, false); } - /** - * Change the content associated with a given content id. - * - * @param id content id, encoded as ~index where index is the position in the content array - * @param value new content value to store - */ - private void setContent(int id, T value) + /// Creates a long-lived "ordered" in-memory trie, i.e. where reverse iteration presents content on the ascent + /// path so that it can be correctly lexicographically ordered with any keys for which it is a prefix. 
+ /// + /// Long-lived tries are expected to stay around for a long time and will try to minimize the space wasted to data + /// or structure that is no longer referenced. To do this they need a signal that lets them know if all readers + /// started before a given point in time have completed work, given by the `opOrder` parameter. + public static InMemoryTrie longLivedOrdered(ByteComparable.Version byteComparableVersion, OpOrder opOrder) { - int leadBit = getBufferIdx(~id, CONTENTS_START_SHIFT, CONTENTS_START_SIZE); - int ofs = inBufferOffset(~id, leadBit, CONTENTS_START_SIZE); - AtomicReferenceArray array = contentArrays[leadBit]; - array.set(ofs, value); + return longLivedOrdered(byteComparableVersion, BufferType.OFF_HEAP, opOrder); } - private void releaseContent(int id) + /// Creates a long-lived "ordered" in-memory trie, i.e. where reverse iteration presents content on the ascent + /// path so that it can be correctly lexicographically ordered with any keys for which it is a prefix. + /// + /// Long-lived tries are expected to stay around for a long time and will try to minimize the space wasted to data + /// or structure that is no longer referenced. To do this they need a signal that lets them know if all readers + /// started before a given point in time have completed work, given by the `opOrder` parameter. + public static InMemoryTrie longLivedOrdered(ByteComparable.Version byteComparableVersion, BufferType bufferType, OpOrder opOrder) { - objectAllocator.recycle(~id); + return new InMemoryTrie<>(byteComparableVersion, bufferType, ExpectedLifetime.LONG, opOrder, false); } - /** - * Called to clean up all buffers when the trie is known to no longer be needed. 
- */ - public void discardBuffers() + public InMemoryCursor makeCursor(Direction direction) { - if (bufferType == BufferType.ON_HEAP) - return; // no cleaning needed - - for (UnsafeBuffer b : buffers) - { - if (b != null) - FileUtils.clean(b.byteBuffer()); - } - } - - private int copyIfOriginal(int node, int originalNode) throws TrieSpaceExhaustedException - { - return (node == originalNode) - ? copyCell(originalNode) - : node; - } - - private int getOrAllocate(int pointerAddress, int offsetWhenAllocating) throws TrieSpaceExhaustedException - { - int child = getIntVolatile(pointerAddress); - if (child != NONE) - return child; - - child = allocateCell() | offsetWhenAllocating; - // volatile writes not needed because this branch is not attached yet - putInt(pointerAddress, child); - return child; - } - - private int getCopyOrAllocate(int pointerAddress, int originalChild, int offsetWhenAllocating) throws TrieSpaceExhaustedException - { - int child = getIntVolatile(pointerAddress); - if (child == originalChild) - { - if (originalChild == NONE) - child = allocateCell() | offsetWhenAllocating; - else - child = copyCell(originalChild); - - // volatile writes not needed because this branch is not attached yet - putInt(pointerAddress, child); - } - - return child; + return new InMemoryCursor<>(this, direction, root); } - // Write methods - - // Write visibility model: writes are not volatile, with the exception of the final write before a call returns - // the same value that was present before (e.g. content was updated in-place / existing node got a new child or had - // a child pointer updated); if the whole path including the root node changed, the root itself gets a volatile - // write. 
- // This final write is the point where any new cells created during the write become visible for readers for the - // first time, and such readers must pass through reading that pointer, which forces a happens-before relationship - // that extends to all values written by this thread before it. - - /** - * Attach a child to the given non-content node. This may be an update for an existing branch, or a new child for - * the node. An update _is_ required (i.e. this is only called when the newChild pointer is not the same as the - * existing value). - * This method is called when the original node content must be preserved for concurrent readers (i.e. any cell to - * be modified needs to be copied first.) - * - * @param node pointer to the node to update or copy - * @param originalNode pointer to the node as it was before any updates in the current modification (i.e. apply - * call) were started. In other words, the node that is currently reachable by readers if they - * follow the same key, and which will become unreachable for new readers after this update - * completes. Used to avoid copying again if already done -- if node is already != originalNode - * (which is the case when a second or further child of a node is changed by an update), - * then node is currently not reachable and can be safely modified or completely overwritten. - * @param trans transition to modify/add - * @param newChild new child pointer - * @return pointer to the updated node - */ - private int attachChildCopying(int node, int originalNode, int trans, int newChild) throws TrieSpaceExhaustedException + protected long emptySizeOnHeap() { - assert !isLeaf(node) : "attachChild cannot be used on content nodes."; - - switch (offset(node)) - { - case PREFIX_OFFSET: - assert false : "attachChild cannot be used on content nodes."; - case SPARSE_OFFSET: - // If the node is already copied (e.g. this is not the first child being modified), there's no need to copy - // it again. 
- return attachChildToSparseCopying(node, originalNode, trans, newChild); - case SPLIT_OFFSET: - // This call will copy the split node itself and any intermediate cells as necessary to make sure cells - // reachable from the original node are not modified. - return attachChildToSplitCopying(node, originalNode, trans, newChild); - default: - // chain nodes - return attachChildToChainCopying(node, originalNode, trans, newChild); // always copies - } + return bufferManager.bufferType() == BufferType.ON_HEAP ? EMPTY_SIZE_ON_HEAP : EMPTY_SIZE_OFF_HEAP; } - /** - * Attach a child to the given node. This may be an update for an existing branch, or a new child for the node. - * An update _is_ required (i.e. this is only called when the newChild pointer is not the same as the existing value). - * - * @param node pointer to the node to update or copy - * @param trans transition to modify/add - * @param newChild new child pointer - * @return pointer to the updated node; same as node if update was in-place - */ - private int attachChild(int node, int trans, int newChild) throws TrieSpaceExhaustedException - { - assert !isLeaf(node) : "attachChild cannot be used on content nodes."; + /// Reused storage for the state of application of mutations. This stores the backtracking path, including changes + /// already applied (e.g. new version of a node that is not yet linked to the current trie) and some that are yet + /// to be applied (e.g. updated content). + /// + /// Because in-memory tries are single-writer, we can reuse a single state array for all updates. The updates are + /// serialized and thus no other thread can corrupt this state (note that this is not the factor enforcing the + /// single writer policy, and since we are already bound to it there is no cost involved in reusing this state array). 
+ final private ApplyState applyState = new ApplyState<>(this); - switch (offset(node)) - { - case PREFIX_OFFSET: - assert false : "attachChild cannot be used on content nodes."; - case SPARSE_OFFSET: - return attachChildToSparse(node, trans, newChild); - case SPLIT_OFFSET: - return attachChildToSplit(node, trans, newChild); - default: - return attachChildToChain(node, trans, newChild); - } - } - - /** - * Attach a child to the given split node. This may be an update for an existing branch, or a new child for the node. - */ - private int attachChildToSplit(int node, int trans, int newChild) throws TrieSpaceExhaustedException + /// Mutator for `InMemoryTrie`. Combines the target trie with the merge options (i.e. the transformers and + /// predicates) and can be used repeatedly to apply modifications to the trie using [#apply(Trie)]. + public class Mutator extends InMemoryBaseTrie.Mutator, ApplyState> { - int midPos = splitCellPointerAddress(node, splitNodeMidIndex(trans), SPLIT_START_LEVEL_LIMIT); - int mid = getIntVolatile(midPos); - if (isNull(mid)) + /// See [InMemoryTrie#mutator(UpsertTransformer, Predicate)] for the meaning of the + /// parameters. 
+ Mutator(UpsertTransformer transformer, Predicate> needsForcedCopy) { - mid = createEmptySplitNode(); - int tailPos = splitCellPointerAddress(mid, splitNodeTailIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); - int tail = createEmptySplitNode(); - int childPos = splitCellPointerAddress(tail, splitNodeChildIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); - putInt(childPos, newChild); - putInt(tailPos, tail); - putIntVolatile(midPos, mid); - return node; + super(transformer, needsForcedCopy, applyState); } - int tailPos = splitCellPointerAddress(mid, splitNodeTailIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); - int tail = getIntVolatile(tailPos); - if (isNull(tail)) - { - tail = createEmptySplitNode(); - int childPos = splitCellPointerAddress(tail, splitNodeChildIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); - putInt(childPos, newChild); - putIntVolatile(tailPos, tail); - return node; - } - - int childPos = splitCellPointerAddress(tail, splitNodeChildIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); - putIntVolatile(childPos, newChild); - return node; - } - - /** - * Non-volatile version of attachChildToSplit. Used when the split node is not reachable yet (during the conversion - * from sparse). 
- */ - private int attachChildToSplitNonVolatile(int node, int trans, int newChild) throws TrieSpaceExhaustedException - { - assert offset(node) == SPLIT_OFFSET : "Invalid split node in trie"; - int midPos = splitCellPointerAddress(node, splitNodeMidIndex(trans), SPLIT_START_LEVEL_LIMIT); - int mid = getOrAllocate(midPos, SPLIT_OFFSET); - assert offset(mid) == SPLIT_OFFSET : "Invalid split node in trie"; - int tailPos = splitCellPointerAddress(mid, splitNodeTailIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); - int tail = getOrAllocate(tailPos, SPLIT_OFFSET); - assert offset(tail) == SPLIT_OFFSET : "Invalid split node in trie"; - int childPos = splitCellPointerAddress(tail, splitNodeChildIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); - putInt(childPos, newChild); - return node; - } - - /** - * Attach a child to the given split node, copying all modified content to enable atomic visibility - * of modification. - * This may be an update for an existing branch, or a new child for the node. - */ - private int attachChildToSplitCopying(int node, int originalNode, int trans, int newChild) throws TrieSpaceExhaustedException - { - if (offset(originalNode) != SPLIT_OFFSET) // includes originalNode == NONE - return attachChildToSplitNonVolatile(node, trans, newChild); - - node = copyIfOriginal(node, originalNode); - assert offset(node) == SPLIT_OFFSET : "Invalid split node in trie"; - - int midPos = splitCellPointerAddress(0, splitNodeMidIndex(trans), SPLIT_START_LEVEL_LIMIT); - int midOriginal = originalNode != NONE ? getIntVolatile(midPos + originalNode) : NONE; - int mid = getCopyOrAllocate(node + midPos, midOriginal, SPLIT_OFFSET); - assert offset(mid) == SPLIT_OFFSET : "Invalid split node in trie"; - - int tailPos = splitCellPointerAddress(0, splitNodeTailIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); - int tailOriginal = midOriginal != NONE ? 
getIntVolatile(tailPos + midOriginal) : NONE; - int tail = getCopyOrAllocate(mid + tailPos, tailOriginal, SPLIT_OFFSET); - assert offset(tail) == SPLIT_OFFSET : "Invalid split node in trie"; - - int childPos = splitCellPointerAddress(tail, splitNodeChildIndex(trans), SPLIT_OTHER_LEVEL_LIMIT); - putInt(childPos, newChild); - return node; - } - - /** - * Attach a child to the given sparse node. This may be an update for an existing branch, or a new child for the node. - */ - private int attachChildToSparse(int node, int trans, int newChild) throws TrieSpaceExhaustedException - { - int index; - int smallerCount = 0; - // first check if this is an update and modify in-place if so - for (index = 0; index < SPARSE_CHILD_COUNT; ++index) + /// Modify this trie to apply the mutation given in the form of a trie. Any content in the mutation will be resolved + /// with the given function before being placed in this trie (even if there's no pre-existing content in this trie). + /// @param mutation the mutation to be applied, given in the form of a trie. Note that its content can be of type + /// different than the element type for this memtable trie. + public void apply(Trie mutation) + throws TrieSpaceExhaustedException { - if (isNull(getIntVolatile(node + SPARSE_CHILDREN_OFFSET + index * 4))) - break; - final int existing = getUnsignedByte(node + SPARSE_BYTES_OFFSET + index); - if (existing == trans) + try { - putIntVolatile(node + SPARSE_CHILDREN_OFFSET + index * 4, newChild); - return node; + start(mutation.cursor(Direction.FORWARD)).apply().complete(); + completeMutation(); } - else if (existing < trans) - ++smallerCount; - } - int childCount = index; - - if (childCount == SPARSE_CHILD_COUNT) - { - // Node is full. Switch to split - return upgradeSparseToSplit(node, trans, newChild); - } - - // Add a new transition. They are not kept in order, so append it at the first free position. 
- putByte(node + SPARSE_BYTES_OFFSET + childCount, (byte) trans); - - // Update order word. - int order = getUnsignedShortVolatile(node + SPARSE_ORDER_OFFSET); - int newOrder = insertInOrderWord(order, childCount, smallerCount); - - // Sparse nodes have two access modes: via the order word, when listing transitions, or directly to characters - // and addresses. - // To support the former, we volatile write to the order word last, and everything is correctly set up. - // The latter does not touch the order word. To support that too, we volatile write the address, as the reader - // can't determine if the position is in use based on the character byte alone (00 is also a valid transition). - // Note that this means that reader must check the transition byte AFTER the address, to ensure they get the - // correct value (see getSparseChild). - - // setting child enables reads to start seeing the new branch - putIntVolatile(node + SPARSE_CHILDREN_OFFSET + childCount * 4, newChild); - - // some readers will decide whether to check the pointer based on the order word - // write that volatile to make sure they see the new change too - putShortVolatile(node + SPARSE_ORDER_OFFSET, (short) newOrder); - return node; - } - - /** - * Attach a child to the given sparse node. This may be an update for an existing branch, or a new child for the node. - * Resulting node is not reachable, no volatile set needed. 
- */ - private int attachChildToSparseCopying(int node, int originalNode, int trans, int newChild) throws TrieSpaceExhaustedException - { - int index; - int smallerCount = 0; - // first check if this is an update and modify in-place if so - for (index = 0; index < SPARSE_CHILD_COUNT; ++index) - { - if (isNull(getIntVolatile(node + SPARSE_CHILDREN_OFFSET + index * 4))) - break; - final int existing = getUnsignedByte(node + SPARSE_BYTES_OFFSET + index); - if (existing == trans) + catch (Throwable t) { - node = copyIfOriginal(node, originalNode); - putInt(node + SPARSE_CHILDREN_OFFSET + index * 4, newChild); - return node; + abortMutation(); + throw t; } - else if (existing < trans) - ++smallerCount; - } - int childCount = index; - - if (childCount == SPARSE_CHILD_COUNT) - { - // Node is full. Switch to split. - // Note that even if node != originalNode, we still have to recycle it as it was a temporary one that will - // no longer be attached. - return upgradeSparseToSplit(node, trans, newChild); - } - - node = copyIfOriginal(node, originalNode); - - // Add a new transition. They are not kept in order, so append it at the first free position. - putByte(node + SPARSE_BYTES_OFFSET + childCount, (byte) trans); - - putInt(node + SPARSE_CHILDREN_OFFSET + childCount * 4, newChild); - - // Update order word. 
- int order = getUnsignedShortVolatile(node + SPARSE_ORDER_OFFSET); - int newOrder = insertInOrderWord(order, childCount, smallerCount); - putShort(node + SPARSE_ORDER_OFFSET, (short) newOrder); - - return node; - } - - private int upgradeSparseToSplit(int node, int trans, int newChild) throws TrieSpaceExhaustedException - { - int split = createEmptySplitNode(); - for (int i = 0; i < SPARSE_CHILD_COUNT; ++i) - { - int t = getUnsignedByte(node + SPARSE_BYTES_OFFSET + i); - int p = getIntVolatile(node + SPARSE_CHILDREN_OFFSET + i * 4); - attachChildToSplitNonVolatile(split, t, p); } - attachChildToSplitNonVolatile(split, trans, newChild); - recycleCell(node); - return split; - } - /** - * Insert the given newIndex in the base-6 encoded order word in the correct position with respect to the ordering. - *

    - * E.g. - * - insertOrderWord(120, 3, 0) must return 1203 (decimal 48*6 + 3) - * - insertOrderWord(120, 3, 1, ptr) must return 1230 (decimal 8*36 + 3*6 + 0) - * - insertOrderWord(120, 3, 2, ptr) must return 1320 (decimal 1*216 + 3*36 + 12) - * - insertOrderWord(120, 3, 3, ptr) must return 3120 (decimal 3*216 + 48) - */ - private static int insertInOrderWord(int order, int newIndex, int smallerCount) - { - int r = 1; - for (int i = 0; i < smallerCount; ++i) - r *= 6; - int head = order / r; - int tail = order % r; - // insert newIndex after the ones we have passed (order % r) and before the remaining (order / r) - return tail + (head * 6 + newIndex) * r; - } - - /** - * Attach a child to the given chain node. This may be an update for an existing branch with different target - * address, or a second child for the node. - * This method always copies the node -- with the exception of updates that change the child of the last node in a - * chain cell with matching transition byte (which this method is not used for, see attachChild), modifications to - * chain nodes cannot be done in place, either because we introduce a new transition byte and have to convert from - * the single-transition chain type to sparse, or because we have to remap the child from the implicit node + 1 to - * something else. - */ - private int attachChildToChain(int node, int transitionByte, int newChild) throws TrieSpaceExhaustedException - { - int existingByte = getUnsignedByte(node); - if (transitionByte == existingByte) + /// Map-like put method, using the apply machinery above which cannot run into stack overflow. When the correct + /// position in the trie has been reached, the value will be resolved with the given function before being placed in + /// the trie (even if there's no pre-existing content in this trie). + /// @param key the trie path/key for the given value. + /// @param value the value being put in the memtable trie. 
Note that it can be of type different than the element + /// type for this memtable trie. It's up to the `transformer` to return the final value that will stay in + /// the memtable trie. + public void putSingleton(ByteComparable key, U value) throws TrieSpaceExhaustedException { - // This is still a single path. Update child if possible (only if this is the last character in the chain). - if (offset(node) == LAST_POINTER_OFFSET - 1) - { - putIntVolatile(node + 1, newChild); - return node; - } - else - { - // This will only be called if new child is different from old, and the update is not on the final child - // where we can change it in place (see attachChild). We must always create something new. - // Note that since this is not the last character, we either still need this cell or we have already - // released it (a createSparseNode must have been called earlier). - // If the child is a chain, we can expand it (since it's a different value, its branch must be new and - // nothing can already reside in the rest of the cell). - return expandOrCreateChainNode(transitionByte, newChild); - } + apply(Trie.singleton(key, byteComparableVersion, value)); } - - // The new transition is different, so we no longer have only one transition. Change type. - return convertChainToSparse(node, existingByte, newChild, transitionByte); } - /** - * Attach a child to the given chain node, when we are force-copying. - */ - private int attachChildToChainCopying(int node, int originalNode, int transitionByte, int newChild) - throws TrieSpaceExhaustedException + /// Creates a trie mutator that can be used to apply multiple modifications to the trie. + /// + /// @param transformer a function applied to the potentially pre-existing value for the given key, and the new + /// value. Applied even if there's no pre-existing value in the memtable trie. 
+ /// @param needsForcedCopy a predicate which decides when to fully copy a branch to provide atomicity guarantees to + /// concurrent readers. See NodeFeatures for details. + public Mutator mutator(UpsertTransformer transformer, + Predicate> needsForcedCopy) { - int existingByte = getUnsignedByte(node); - if (transitionByte == existingByte) - { - // This is still a single path. - // Make sure we release the cell if it will no longer be referenced (if we update last reference, the whole - // path has to move as the other nodes in this chain can't be remapped). - if (offset(node) == LAST_POINTER_OFFSET - 1) - { - assert node == originalNode; // if we have already created a node, the character can't match what - // it was created with - - recycleCell(node); - } - - return expandOrCreateChainNode(transitionByte, newChild); - } - else - { - // The new transition is different, so we no longer have only one transition. Change type. - return convertChainToSparse(node, existingByte, newChild, transitionByte); - } + return new Mutator<>(transformer, needsForcedCopy); } - private int convertChainToSparse(int node, int existingByte, int newChild, int transitionByte) + /// Modify this trie to apply the mutation given in the form of a trie. Any content in the mutation will be resolved + /// with the given function before being placed in this trie (even if there's no pre-existing content in this trie). + /// @param mutation the mutation to be applied, given in the form of a trie. Note that its content can be of type + /// different than the element type for this memtable trie. + /// @param transformer a function applied to the potentially pre-existing value for the given key, and the new + /// value. Applied even if there's no pre-existing value in the memtable trie. + /// @param needsForcedCopy a predicate which decides when to fully copy a branch to provide atomicity guarantees to + /// concurrent readers. See NodeFeatures for details. 
+ public void apply(Trie mutation, + final UpsertTransformer transformer, + final Predicate> needsForcedCopy) throws TrieSpaceExhaustedException { - int existingChild = node + 1; - if (offset(existingChild) == LAST_POINTER_OFFSET) - { - existingChild = getIntVolatile(existingChild); - // This was a chain with just one transition which will no longer be referenced. - // The cell may contain other characters/nodes leading to this, which are also guaranteed to be - // unreferenced. - // However, these leading nodes may still be in the parent path and will be needed until the - // mutation completes. - recycleCell(node); - } - // Otherwise the sparse node we will now create references this cell, so it can't be recycled. - return createSparseNode(existingByte, existingChild, transitionByte, newChild); - } - - private boolean isExpandableChain(int newChild) - { - int newOffset = offset(newChild); - return newChild > 0 && newChild - 1 > NONE && newOffset > CHAIN_MIN_OFFSET && newOffset <= CHAIN_MAX_OFFSET; - } - - /** - * Create a sparse node with two children. - */ - private int createSparseNode(int byte1, int child1, int byte2, int child2) throws TrieSpaceExhaustedException - { - assert byte1 != byte2 : "Attempted to create a sparse node with two of the same transition"; - if (byte1 > byte2) - { - // swap them so the smaller is byte1, i.e. 
there's always something bigger than child 0 so 0 never is - // at the end of the order - int t = byte1; byte1 = byte2; byte2 = t; - t = child1; child1 = child2; child2 = t; - } - - int node = allocateCell() + SPARSE_OFFSET; - putByte(node + SPARSE_BYTES_OFFSET + 0, (byte) byte1); - putByte(node + SPARSE_BYTES_OFFSET + 1, (byte) byte2); - putInt(node + SPARSE_CHILDREN_OFFSET + 0 * 4, child1); - putInt(node + SPARSE_CHILDREN_OFFSET + 1 * 4, child2); - putShort(node + SPARSE_ORDER_OFFSET, (short) (1 * 6 + 0)); - // Note: this does not need a volatile write as it is a new node, returning a new pointer, which needs to be - // put in an existing node or the root. That action ends in a happens-before enforcing write. - return node; - } - - /** - * Creates a chain node with the single provided transition (pointing to the provided child). - * Note that to avoid creating inefficient tries with under-utilized chain nodes, this should only be called from - * {@link #expandOrCreateChainNode} and other call-sites should call {@link #expandOrCreateChainNode}. - */ - private int createNewChainNode(int transitionByte, int newChild) throws TrieSpaceExhaustedException - { - int newNode = allocateCell() + LAST_POINTER_OFFSET - 1; - putByte(newNode, (byte) transitionByte); - putInt(newNode + 1, newChild); - // Note: this does not need a volatile write as it is a new node, returning a new pointer, which needs to be - // put in an existing node or the root. That action ends in a happens-before enforcing write. - return newNode; - } - - /** Like {@link #createNewChainNode}, but if the new child is already a chain node and has room, expand - * it instead of creating a brand new node. 
*/ - private int expandOrCreateChainNode(int transitionByte, int newChild) throws TrieSpaceExhaustedException - { - if (isExpandableChain(newChild)) - { - // attach as a new character in child node - int newNode = newChild - 1; - putByte(newNode, (byte) transitionByte); - return newNode; - } - - return createNewChainNode(transitionByte, newChild); - } - - private int createEmptySplitNode() throws TrieSpaceExhaustedException - { - return allocateCell() + SPLIT_OFFSET; - } - - private int createPrefixNode(int contentId, int child, boolean isSafeChain) throws TrieSpaceExhaustedException - { - assert !isNullOrLeaf(child) : "Prefix node cannot reference a childless node."; - - int offset = offset(child); - int node; - if (offset == SPLIT_OFFSET || isSafeChain && offset > (PREFIX_FLAGS_OFFSET + PREFIX_OFFSET) && offset <= CHAIN_MAX_OFFSET) - { - // We can do an embedded prefix node - // Note: for chain nodes we have a risk that the node continues beyond the current point, in which case - // creating the embedded node may overwrite information that is still needed by concurrent readers or the - // mutation process itself. - node = (child & -CELL_SIZE) | PREFIX_OFFSET; - putByte(node + PREFIX_FLAGS_OFFSET, (byte) offset); - } - else - { - // Full prefix node - node = allocateCell() + PREFIX_OFFSET; - putByte(node + PREFIX_FLAGS_OFFSET, (byte) 0xFF); - putInt(node + PREFIX_POINTER_OFFSET, child); - } - - putInt(node + PREFIX_CONTENT_OFFSET, contentId); - return node; - } - - private int updatePrefixNodeChild(int node, int child, boolean forcedCopy) throws TrieSpaceExhaustedException - { - assert offset(node) == PREFIX_OFFSET : "updatePrefix called on non-prefix node"; - assert !isNullOrLeaf(child) : "Prefix node cannot reference a childless node."; - - // We can only update in-place if we have a full prefix node - if (!isEmbeddedPrefixNode(node)) - { - if (!forcedCopy) - { - // This attaches the child branch and makes it reachable -- the write must be volatile. 
- putIntVolatile(node + PREFIX_POINTER_OFFSET, child); - return node; - } - else - { - node = copyCell(node); - putInt(node + PREFIX_POINTER_OFFSET, child); - return node; - } - } - else - { - // No need to recycle this cell because that is already done by the modification of the child - int contentId = getIntVolatile(node + PREFIX_CONTENT_OFFSET); - return createPrefixNode(contentId, child, true); - } + mutator(transformer, needsForcedCopy).apply(mutation); } - private boolean isEmbeddedPrefixNode(int node) + /// Trie mutator that accepts a range trie (e.g. deletions) to apply over data in this trie using [#apply(RangeTrie)]. + public static class RangeMutator> + extends InMemoryBaseTrie.Mutator, ApplyState> { - return getUnsignedByte(node + PREFIX_FLAGS_OFFSET) < CELL_SIZE; - } + int initialDepth; - /** - * Copy the content from an existing node, if it has any, to a newly-prepared update for its child. - * - * @param existingPreContentNode pointer to the existing node before skipping over content nodes, i.e. this is - * either the same as existingPostContentNode or a pointer to a prefix or leaf node - * whose child is existingPostContentNode - * @param existingPostContentNode pointer to the existing node being updated, after any content nodes have been - * skipped and before any modification have been applied; always a non-content node - * @param updatedPostContentNode is the updated node, i.e. 
the node to which all relevant modifications have been - * applied; if the modifications were applied in-place, this will be the same as - * existingPostContentNode, otherwise a completely different pointer; always a non- - * content node - * @param forcedCopy whether or not we need to preserve all pre-existing data for concurrent readers - * @return a node which has the children of updatedPostContentNode combined with the content of - * existingPreContentNode - */ - private int preserveContent(int existingPreContentNode, - int existingPostContentNode, - int updatedPostContentNode, - boolean forcedCopy) - throws TrieSpaceExhaustedException - { - if (existingPreContentNode == existingPostContentNode) - return updatedPostContentNode; // no content to preserve - - if (existingPostContentNode == updatedPostContentNode) + /// See [InMemoryTrie#mutator(UpsertTransformer, Predicate)] for the meaning of + /// the parameters. + RangeMutator(ApplyState state, + UpsertTransformer transformer, + Predicate> needsForcedCopy) { - assert !forcedCopy; - return existingPreContentNode; // child didn't change, no update necessary + super(transformer, needsForcedCopy, state); } - // else we have existing prefix node, and we need to reference a new child - if (isLeaf(existingPreContentNode)) + RangeMutator start(int root, RangeCursor mutationCursor, int initialForcedCopyDepth) { - return createPrefixNode(existingPreContentNode, updatedPostContentNode, true); - } - - assert offset(existingPreContentNode) == PREFIX_OFFSET : "Unexpected content in non-prefix and non-leaf node."; - return updatePrefixNodeChild(existingPreContentNode, updatedPostContentNode, forcedCopy); - } - - private final ApplyState applyState = new ApplyState(); - - /** - * Represents the state for an {@link #apply} operation. Contains a stack of all nodes we descended through - * and used to update the nodes with any new data during ascent. - *

    - * To make this as efficient and GC-friendly as possible, we use an integer array (instead of is an object stack) - * and we reuse the same object. The latter is safe because memtable tries cannot be mutated in parallel by multiple - * writers. - */ - private class ApplyState implements KeyProducer - { - int[] data = new int[16 * 5]; - int currentDepth = -1; - - /** - * Pointer to the existing node before skipping over content nodes, i.e. this is either the same as - * existingPostContentNode or a pointer to a prefix or leaf node whose child is existingPostContentNode. - */ - int existingPreContentNode() - { - return data[currentDepth * 5 + 0]; - } - void setExistingPreContentNode(int value) - { - data[currentDepth * 5 + 0] = value; - } - - /** - * Pointer to the existing node being updated, after any content nodes have been skipped and before any - * modification have been applied. Always a non-content node. - */ - int existingPostContentNode() - { - return data[currentDepth * 5 + 1]; - } - void setExistingPostContentNode(int value) - { - data[currentDepth * 5 + 1] = value; - } - - /** - * The updated node, i.e. the node to which the relevant modifications are being applied. This will change as - * children are processed and attached to the node. After all children have been processed, this will contain - * the fully updated node (i.e. the union of existingPostContentNode and mutationNode) without any content, - * which will be processed separately and, if necessary, attached ahead of this. If the modifications were - * applied in-place, this will be the same as existingPostContentNode, otherwise a completely different - * pointer. Always a non-content node. - */ - int updatedPostContentNode() - { - return data[currentDepth * 5 + 2]; - } - void setUpdatedPostContentNode(int value) - { - data[currentDepth * 5 + 2] = value; - } - - /** - * The transition we took on the way down. 
- */ - int transition() - { - return data[currentDepth * 5 + 3]; - } - void setTransition(int transition) - { - data[currentDepth * 5 + 3] = transition; - } - int transitionAtDepth(int stackDepth) - { - return data[stackDepth * 5 + 3]; - } - - /** - * The compiled content id. Needed because we can only access a cursor's content on the way down but we can't - * attach it until we ascend from the node. - */ - int contentId() - { - return data[currentDepth * 5 + 4]; - } - void setContentId(int value) - { - data[currentDepth * 5 + 4] = value; - } - int contentIdAtDepth(int stackDepth) - { - return data[stackDepth * 5 + 4]; - } - - ApplyState start() - { - int existingFullNode = root; - currentDepth = 0; - - descendInto(existingFullNode); + initialDepth = 0; + super.start(root, mutationCursor, initialForcedCopyDepth); return this; } - /** - * Returns true if the depth signals mutation cursor is exhausted. - */ - boolean advanceTo(int depth, int transition, int forcedCopyDepth) throws TrieSpaceExhaustedException - { - while (currentDepth > Math.max(0, depth - 1)) - { - // There are no more children. Ascend to the parent state to continue walk. - attachAndMoveToParentState(forcedCopyDepth); - } - if (depth == -1) - return true; - - // We have a transition, get child to descend into - descend(transition); - return false; - } - - /** - * Descend to a child node. Prepares a new entry in the stack for the node. - */ - void descend(int transition) + /// A variation of `start` which starts the operation at some point in the trie rather than the root. Used for + /// processing deletion branches in deletion-aware tries. 
+ RangeMutator continueFromCurrentState(RangeCursor mutationCursor, int initialForcedCopyDepth) { - setTransition(transition); - int existingPreContentNode = getChild(existingPreContentNode(), transition); - ++currentDepth; - descendInto(existingPreContentNode); + mutationCursor.assertFresh(); + this.mutationCursor = mutationCursor; + this.initialDepth = state.currentDepth; + this.forcedCopyDepth = initialForcedCopyDepth; + return this; } - private void descendInto(int existingPreContentNode) + @Override + RangeMutator apply() throws TrieSpaceExhaustedException { - if (currentDepth * 5 >= data.length) - data = Arrays.copyOf(data, currentDepth * 5 * 2); - setExistingPreContentNode(existingPreContentNode); - - int existingContentId = NONE; - int existingPostContentNode; - if (isLeaf(existingPreContentNode)) - { - existingContentId = existingPreContentNode; - existingPostContentNode = NONE; - } - else if (offset(existingPreContentNode) == PREFIX_OFFSET) + int depth = state.currentDepth; + long position = mutationCursor.encodedPosition(); + assert !Cursor.isOnReturnPath(position) : "Cursor cannot start with position on return path."; + while (true) { - existingContentId = getIntVolatile(existingPreContentNode + PREFIX_CONTENT_OFFSET); - existingPostContentNode = followContentTransition(existingPreContentNode); - } - else - existingPostContentNode = existingPreContentNode; - setExistingPostContentNode(existingPostContentNode); - setUpdatedPostContentNode(existingPostContentNode); - setContentId(existingContentId); - } - - T getContent() - { - int contentId = contentId(); - if (contentId == NONE) - return null; - return InMemoryTrie.this.getContent(contentId()); - } + if (depth < forcedCopyDepth) + forcedCopyDepth = needsForcedCopy.test(this) ? 
depth : Integer.MAX_VALUE; - void setContent(T content, boolean forcedCopy) throws TrieSpaceExhaustedException - { - int contentId = contentId(); - if (contentId == NONE) - { + S content = mutationCursor.content(); if (content != null) - setContentId(InMemoryTrie.this.addContent(content)); - } - else if (content == null) - { - releaseContent(contentId); - setContentId(NONE); - // At this point we are not deleting branches on the way up, just making sure we don't hold on to - // references to content. - } - else if (content == InMemoryTrie.this.getContent(contentId)) - { - // no changes, nothing to do - } - else if (forcedCopy) - { - releaseContent(contentId); - setContentId(InMemoryTrie.this.addContent(content)); - } - else - { - InMemoryTrie.this.setContent(contentId, content); - } - } - - /** - * Attach a child to the current node. - */ - private void attachChild(int transition, int child, boolean forcedCopy) throws TrieSpaceExhaustedException - { - int updatedPostContentNode = updatedPostContentNode(); - if (isNull(updatedPostContentNode)) - setUpdatedPostContentNode(expandOrCreateChainNode(transition, child)); - else if (forcedCopy) - setUpdatedPostContentNode(attachChildCopying(updatedPostContentNode, - existingPostContentNode(), - transition, - child)); - else - setUpdatedPostContentNode(InMemoryTrie.this.attachChild(updatedPostContentNode, - transition, - child)); - } - - /** - * Apply the collected content to a node. Converts NONE to a leaf node, and adds or updates a prefix for all - * others. - */ - private int applyContent(boolean forcedCopy) throws TrieSpaceExhaustedException - { - // Note: the old content id itself is already released by setContent. Here we must release any standalone - // prefix nodes that may reference it. 
- int contentId = contentId(); - final int updatedPostContentNode = updatedPostContentNode(); - final int existingPreContentNode = existingPreContentNode(); - final int existingPostContentNode = existingPostContentNode(); + applyDeletionRange(position); - // applyPrefixChange does not understand leaf nodes, handle upgrade from and to one explicitly. - if (isNull(updatedPostContentNode)) - { - if (existingPreContentNode != existingPostContentNode - && !isNullOrLeaf(existingPreContentNode) - && !isEmbeddedPrefixNode(existingPreContentNode)) - recycleCell(existingPreContentNode); - return contentId; // also fine for contentId == NONE + position = mutationCursor.advance(); + depth = Cursor.depth(position) + initialDepth; + // Descend but do not modify anything yet. If the position is on the return path, we can still follow + // it, `applyDeletionRange` will take care to not apply it to content or descendants. + if (!state.advanceTo(depth, Cursor.incomingTransition(position), forcedCopyDepth, initialDepth)) + break; + assert depth == state.currentDepth : "Unexpected change to applyState. Concurrent trie modification?"; } - if (isLeaf(existingPreContentNode)) - return contentId != NONE - ? createPrefixNode(contentId, updatedPostContentNode, true) - : updatedPostContentNode; - - return applyPrefixChange(updatedPostContentNode, - existingPreContentNode, - existingPostContentNode, - contentId, - forcedCopy); + assert state.currentDepth == initialDepth; + return this; } - private int applyPrefixChange(int updatedPostPrefixNode, - int existingPrePrefixNode, - int existingPostPrefixNode, - int prefixData, - boolean forcedCopy) - throws TrieSpaceExhaustedException + /// Walk all existing content covered under a deletion, applying the covering deletion state to it. This method + /// does not return a value: it finishes when the mutation cursor is at a position with no succeeding (covering) + /// state, and the caller (`apply`) then continues advancing the mutation cursor from there.
+ void applyDeletionRange(long position) throws TrieSpaceExhaustedException { - boolean prefixWasPresent = existingPrePrefixNode != existingPostPrefixNode; - boolean prefixWasEmbedded = prefixWasPresent && isEmbeddedPrefixNode(existingPrePrefixNode); - if (prefixData == NONE) - { - if (prefixWasPresent && !prefixWasEmbedded) - recycleCell(existingPrePrefixNode); - return updatedPostPrefixNode; - } - - boolean childChanged = updatedPostPrefixNode != existingPostPrefixNode; - boolean dataChanged = !prefixWasPresent || prefixData != getIntVolatile(existingPrePrefixNode + PREFIX_CONTENT_OFFSET); - if (!childChanged && !dataChanged) - return existingPrePrefixNode; - - if (forcedCopy) + S mutationCoveringState = null; + boolean atMutation = true; + int depth = Cursor.depth(position) + initialDepth; + int transition = Cursor.incomingTransition(position); + boolean onReturnPath = Cursor.isOnReturnPath(position); + // We are walking both tries in parallel. + while (true) { - if (!childChanged && prefixWasEmbedded) + if (atMutation) { - // If we directly create in this case, we will find embedding is possible and will overwrite the - // previous value. - // We could create a separate metadata node referencing the child, but in that case we'll - // use two nodes while one suffices. Instead, copy the child and embed the new metadata. - updatedPostPrefixNode = copyCell(existingPostPrefixNode); - } - else if (prefixWasPresent && !prefixWasEmbedded) - { - recycleCell(existingPrePrefixNode); - // otherwise cell is already recycled by the recycling of the child - } - return createPrefixNode(prefixData, updatedPostPrefixNode, isNull(existingPostPrefixNode)); - } + if (state.currentDepth < forcedCopyDepth) + forcedCopyDepth = needsForcedCopy.test(this) ? state.currentDepth : Integer.MAX_VALUE; - // We can't update in-place if there was no preexisting prefix, or if the - // prefix was embedded and the target node must change. 
- if (!prefixWasPresent || prefixWasEmbedded && childChanged) - return createPrefixNode(prefixData, updatedPostPrefixNode, isNull(existingPostPrefixNode)); + S mutationContent = mutationCursor.content(); - // Otherwise modify in place - if (childChanged) // to use volatile write but also ensure we don't corrupt embedded nodes - putIntVolatile(existingPrePrefixNode + PREFIX_POINTER_OFFSET, updatedPostPrefixNode); - if (dataChanged) - putIntVolatile(existingPrePrefixNode + PREFIX_CONTENT_OFFSET, prefixData); - return existingPrePrefixNode; - } + if (mutationContent != null) + { + if (!onReturnPath) + applyContent(mutationContent); + mutationCoveringState = mutationContent.succedingState(Direction.FORWARD); + } + else if (!onReturnPath) + applyContent(mutationCoveringState); - /** - * After a node's children are processed, this is called to ascend from it. This means applying the collected - * content to the compiled updatedPostContentNode and creating a mapping in the parent to it (or updating if - * one already exists). - */ - void attachAndMoveToParentState(int forcedCopyDepth) throws TrieSpaceExhaustedException - { - int updatedFullNode = applyContent(currentDepth >= forcedCopyDepth); - int existingFullNode = existingPreContentNode(); - --currentDepth; + if (mutationCoveringState == null) + return; - if (updatedFullNode != existingFullNode) - attachChild(transition(), updatedFullNode, currentDepth >= forcedCopyDepth); - } + position = mutationCursor.advance(); + depth = Cursor.depth(position) + initialDepth; + transition = Cursor.incomingTransition(position); + onReturnPath = Cursor.isOnReturnPath(position); + } + else + applyContent(mutationCoveringState); - /** - * Ascend and update the root at the end of processing. 
- */ - void attachRoot(int forcedCopyDepth) throws TrieSpaceExhaustedException - { - int updatedPreContentNode = applyContent(0 >= forcedCopyDepth); - int existingPreContentNode = existingPreContentNode(); - assert root == existingPreContentNode : "Unexpected change to root. Concurrent trie modification?"; - if (updatedPreContentNode != existingPreContentNode) - { - // Only write to root if they are different (value doesn't change, but - // we don't want to invalidate the value in other cores' caches unnecessarily). - root = updatedPreContentNode; + atMutation = !state.advanceToNextExistingOr(depth, transition, onReturnPath, forcedCopyDepth, initialDepth); } } - public byte[] getBytes() + void applyContent(S content) throws TrieSpaceExhaustedException { - int arrSize = currentDepth; - byte[] data = new byte[arrSize]; - int pos = 0; - for (int i = 0; i < currentDepth; ++i) + T existingContent = state.getDescentPathContent(); + if (existingContent != null) { - int trans = transitionAtDepth(i); - data[pos++] = (byte) trans; + T combinedContent = transformer.apply(existingContent, content); + if (combinedContent != existingContent) + state.setDescentPathContent(combinedContent, // can be null + state.currentDepth >= forcedCopyDepth); // this is called at the start of processing } - return data; } - public byte[] getBytes(Predicate shouldStop) + + /// Apply the given range trie to this in-memory trie. Any existing content that falls under the ranges of the given + /// trie will be modified by applying the transformer. This is usually used to delete covered content (by returning + /// null from the transformer). + /// @param rangeTrie the ranges to be applied, given in the form of a range trie. 
+ public void apply(RangeTrie rangeTrie) throws TrieSpaceExhaustedException { - if (currentDepth == 0) - return new byte[0]; + apply(rangeTrie.cursor(Direction.FORWARD)); + } - int arrSize = 1; - int i; - for (i = currentDepth - 1; i > 0; --i) + void apply(RangeCursor cursor) throws TrieSpaceExhaustedException + { + try { - int content = contentIdAtDepth(i); - if (!isNull(content) && shouldStop.test(InMemoryTrie.this.getContent(content))) - break; - ++arrSize; + start(cursor).apply().complete(); + state.trie.completeMutation(); } - assert i > 0 || arrSize == currentDepth; // if the loop covers the whole stack, the array must cover the full depth - - byte[] data = new byte[arrSize]; - int pos = 0; - for (; i < currentDepth; ++i) + catch (Throwable t) { - int trans = transitionAtDepth(i); - data[pos++] = (byte) trans; + state.trie.abortMutation(); + throw t; } - return data; - } - - public ByteComparable.Version byteComparableVersion() - { - return byteComparableVersion; } } - public interface KeyProducer + /// A variation of range mutator to apply sets as deletions of data in the trie. + public static class SetMutator extends RangeMutator { - /** - * Get the bytes of the path leading to this node. - */ - byte[] getBytes(); - - /** - * Get the bytes of the path leading to this node from the closest ancestor whose content, after any new inserts - * have been applied, satisfies the given predicate. - * Note that the predicate is not called for the current position, because its content is not yet prepared when - * the method is being called. - */ - byte[] getBytes(Predicate shouldStop); - - ByteComparable.Version byteComparableVersion(); - } - - /** - * Somewhat similar to {@link Trie.MergeResolver}, this encapsulates logic to be applied whenever new content is - * being upserted into a {@link InMemoryTrie}. Unlike {@link Trie.MergeResolver}, {@link UpsertTransformer} will be - * applied no matter if there's pre-existing content for that trie key/path or not. 
- * - * @param The content type for this {@link InMemoryTrie}. - * @param The type of the new content being applied to this {@link InMemoryTrie}. - */ - public interface UpsertTransformerWithKeyProducer - { - /** - * Called when there's content in the updating trie. - * - * @param existing Existing content for this key, or null if there isn't any. - * @param update The update, always non-null. - * @param keyState An interface that can be used to retrieve the path of the value being updated. - * @return The combined value to use. Cannot be null. - */ - @Nonnull T apply(T existing, @Nonnull U update, @Nonnull KeyProducer keyState); - } - - /** - * Somewhat similar to {@link Trie.MergeResolver}, this encapsulates logic to be applied whenever new content is - * being upserted into a {@link InMemoryTrie}. Unlike {@link Trie.MergeResolver}, {@link UpsertTransformer} will be - * applied no matter if there's pre-existing content for that trie key/path or not. - *

    - * A version of the above that does not use a {@link KeyProducer}. - * - * @param The content type for this {@link InMemoryTrie}. - * @param The type of the new content being applied to this {@link InMemoryTrie}. - */ - public interface UpsertTransformer extends UpsertTransformerWithKeyProducer - { - /** - * Called when there's content in the updating trie. - * - * @param existing Existing content for this key, or null if there isn't any. - * @param update The update, always non-null. - * @return The combined value to use. Cannot be null. - */ - @Nonnull T apply(T existing, @Nonnull U update); - - /** - * Version of the above that also provides the path of a value being updated. - * - * @param existing Existing content for this key, or null if there isn't any. - * @param update The update, always non-null. - * @param keyState An interface that can be used to retrieve the path of the value being updated. - * @return The combined value to use. Cannot be null. - */ - default @Nonnull T apply(T existing, @Nonnull U update, @Nonnull KeyProducer keyState) + SetMutator(ApplyState state, Predicate> needsForcedCopy) { - return apply(existing, update); + super(state, SetMutator::deleteEntry, needsForcedCopy); } - } - /** - * Interface providing features of the mutating node during mutation done using {@link #apply}. - * Effectively a subset of the {@link Trie.Cursor} interface which only permits operations that are safe to - * perform before iterating the children of the mutation node to apply the branch mutation. - * - * This is mainly used as an argument to predicates that decide when to copy substructure when modifying tries, - * which enables different kinds of atomicity and consistency guarantees. - * - * See the InMemoryTrie javadoc or InMemoryTrieThreadedTest for demonstration of the typical usages and what they - * achieve. - */ - public interface NodeFeatures - { - /** - * Whether or not the node has more than one descendant. 
If a checker needs mutations to be atomic, they can - * return true when this becomes true. - */ - boolean isBranching(); - - /** - * The metadata associated with the node. If readers need to see a consistent view (i.e. where older updates - * cannot be missed if a new one is presented) below some specified point (e.g. within a partition), the checker - * should return true when it identifies that point. - */ - T content(); - } - - private static class Mutation implements NodeFeatures - { - final UpsertTransformerWithKeyProducer transformer; - final Predicate> needsForcedCopy; - final Cursor mutationCursor; - final InMemoryTrie.ApplyState state; - int forcedCopyDepth; - - Mutation(UpsertTransformerWithKeyProducer transformer, - Predicate> needsForcedCopy, - Cursor mutationCursor, - InMemoryTrie.ApplyState state) + void apply(TrieSet set) throws TrieSpaceExhaustedException { - assert mutationCursor.depth() == 0 : "Unexpected non-fresh cursor."; - assert state.currentDepth == 0 : "Unexpected change to applyState. Concurrent trie modification?"; - this.transformer = transformer; - this.needsForcedCopy = needsForcedCopy; - this.mutationCursor = mutationCursor; - this.state = state; + apply(set.cursor(Direction.FORWARD)); } - void apply() throws TrieSpaceExhaustedException + private static T deleteEntry(T entry, TrieSetCursor.RangeState state) { - int depth = state.currentDepth; - while (true) - { - if (depth <= forcedCopyDepth) - forcedCopyDepth = needsForcedCopy.test(this) ? depth : Integer.MAX_VALUE; - - applyContent(); - - depth = mutationCursor.advance(); - if (state.advanceTo(depth, mutationCursor.incomingTransition(), forcedCopyDepth)) - break; - assert state.currentDepth == depth : "Unexpected change to applyState. Concurrent trie modification?"; - } + return state.applicableAfter ? 
null : entry; } - void applyContent() throws TrieSpaceExhaustedException - { - U content = mutationCursor.content(); - if (content != null) - { - T existingContent = state.getContent(); - T combinedContent = transformer.apply(existingContent, content, state); - if (combinedContent == null) - throw new AssertionError("Transformer " + transformer + " returned null content for " - + existingContent + ", " + content); - state.setContent(combinedContent, - state.currentDepth >= forcedCopyDepth); // this is called at the start of processing - } - } - - - void complete() throws TrieSpaceExhaustedException - { - assert state.currentDepth == 0 : "Unexpected change to applyState. Concurrent trie modification?"; - state.attachRoot(forcedCopyDepth); - } - - @Override - public boolean isBranching() - { - // This is not very efficient, but we only currently use this option in tests. - // If it's needed for production use, isBranching should be implemented in the cursor interface. - Cursor dupe = mutationCursor.tailTrie().cursor(Direction.FORWARD); - int childDepth = dupe.advance(); - return childDepth > 0 && - dupe.skipTo(childDepth, dupe.incomingTransition() + 1) == childDepth; - } + } - @Override - public U content() - { - return mutationCursor.content(); - } + /// Creates a range mutator that can be used to apply multiple modifications/deletions to the trie. + /// + /// @param transformer a function applied to the potentially pre-existing value for the given key, and the new + /// value. Applied even if there's no pre-existing value in the memtable trie. + /// @param needsForcedCopy a predicate which decides when to fully copy a branch to provide atomicity guarantees to + /// concurrent readers. See NodeFeatures for details. + public > RangeMutator rangeMutator(UpsertTransformer transformer, + Predicate> needsForcedCopy) + { + return new RangeMutator<>(applyState, transformer, needsForcedCopy); } - /** - * Modify this trie to apply the mutation given in the form of a trie. 
Any content in the mutation will be resolved - * with the given function before being placed in this trie (even if there's no pre-existing content in this trie). - * @param mutation the mutation to be applied, given in the form of a trie. Note that its content can be of type - * different than the element type for this memtable trie. - * @param transformer a function applied to the potentially pre-existing value for the given key, and the new - * value. Applied even if there's no pre-existing value in the memtable trie. - * @param needsForcedCopy a predicate which decides when to fully copy a branch to provide atomicity guarantees to - * concurrent readers. See NodeFeatures for details. - */ - public void apply(Trie mutation, - final UpsertTransformerWithKeyProducer transformer, - final Predicate> needsForcedCopy) - throws TrieSpaceExhaustedException + /// Creates a set mutator that can be used to apply multiple deletions to the trie. + public SetMutator deleter() { - try - { - Mutation m = new Mutation<>(transformer, - needsForcedCopy, - mutation.cursor(Direction.FORWARD), - applyState.start()); - m.apply(); - m.complete(); - completeMutation(); - } - catch (Throwable t) - { - abortMutation(); - throw t; - } + return new SetMutator<>(applyState, NodeFeatures::isBranching); } - /** - * Modify this trie to apply the mutation given in the form of a trie. Any content in the mutation will be resolved - * with the given function before being placed in this trie (even if there's no pre-existing content in this trie). - * @param mutation the mutation to be applied, given in the form of a trie. Note that its content can be of type - * different than the element type for this memtable trie. - * @param transformer a function applied to the potentially pre-existing value for the given key, and the new - * value. Applied even if there's no pre-existing value in the memtable trie. 
- * @param needsForcedCopy a predicate which decides when to fully copy a branch to provide atomicity guarantees to - * concurrent readers. See NodeFeatures for details. - */ - public void apply(Trie mutation, - final UpsertTransformer transformer, - final Predicate> needsForcedCopy) - throws TrieSpaceExhaustedException + /// Delete all entries covered under the specified TrieSet + public void delete(TrieSet set) throws TrieSpaceExhaustedException { - apply(mutation, (UpsertTransformerWithKeyProducer) transformer, needsForcedCopy); + deleter().apply(set); } - /** - * Map-like put method, using the apply machinery above which cannot run into stack overflow. When the correct - * position in the trie has been reached, the value will be resolved with the given function before being placed in - * the trie (even if there's no pre-existing content in this trie). - * @param key the trie path/key for the given value. - * @param value the value being put in the memtable trie. Note that it can be of type different than the element - * type for this memtable trie. It's up to the {@code transformer} to return the final value that will stay in - * the memtable trie. - * @param transformer a function applied to the potentially pre-existing value for the given key, and the new - * value (of a potentially different type), returning the final value that will stay in the memtable trie. Applied - * even if there's no pre-existing value in the memtable trie. - */ + /// Map-like put method, using the apply machinery above which cannot run into stack overflow. When the correct + /// position in the trie has been reached, the value will be resolved with the given function before being placed in + /// the trie (even if there's no pre-existing content in this trie). + /// @param key the trie path/key for the given value. + /// @param value the value being put in the memtable trie. Note that it can be of type different than the element + /// type for this memtable trie. 
It's up to the `transformer` to return the final value that will stay in + /// the memtable trie. + /// @param transformer a function applied to the potentially pre-existing value for the given key, and the new + /// value (of a potentially different type), returning the final value that will stay in the memtable trie. Applied + /// even if there's no pre-existing value in the memtable trie. public void putSingleton(ByteComparable key, R value, UpsertTransformer transformer) throws TrieSpaceExhaustedException { - apply(Trie.singleton(key, byteComparableVersion, value), transformer, Predicates.alwaysFalse()); + mutator(transformer, Predicates.alwaysFalse()).apply(Trie.singleton(key, byteComparableVersion, value)); } - /** - * A version of putSingleton which uses recursive put if the last argument is true. - */ + /// A version of putSingleton which uses recursive put if the last argument is true. public void putSingleton(ByteComparable key, R value, UpsertTransformer transformer, @@ -1562,208 +476,4 @@ public void putSingleton(ByteComparable key, else putSingleton(key, value, transformer); } - - /** - * Map-like put method, using a fast recursive implementation through the key bytes. May run into stack overflow if - * the trie becomes too deep. When the correct position in the trie has been reached, the value will be resolved - * with the given function before being placed in the trie (even if there's no pre-existing content in this trie). - * @param key the trie path/key for the given value. - * @param value the value being put in the memtable trie. Note that it can be of type different than the element - * type for this memtable trie. It's up to the {@code transformer} to return the final value that will stay in - * the memtable trie. - * @param transformer a function applied to the potentially pre-existing value for the given key, and the new - * value (of a potentially different type), returning the final value that will stay in the memtable trie. 
Applied - * even if there's no pre-existing value in the memtable trie. - */ - public void putRecursive(ByteComparable key, R value, final UpsertTransformer transformer) throws TrieSpaceExhaustedException - { - try - { - int newRoot = putRecursive(root, key.asComparableBytes(byteComparableVersion), value, transformer); - if (newRoot != root) - root = newRoot; - completeMutation(); - } - catch (Throwable t) - { - abortMutation(); - throw t; - } - } - - private int putRecursive(int node, ByteSource key, R value, final UpsertTransformer transformer) throws TrieSpaceExhaustedException - { - int transition = key.next(); - if (transition == ByteSource.END_OF_STREAM) - return applyContent(node, value, transformer); - - int child = getChild(node, transition); - - int newChild = putRecursive(child, key, value, transformer); - if (newChild == child) - return node; - - int skippedContent = followContentTransition(node); - int attachedChild = !isNull(skippedContent) - ? attachChild(skippedContent, transition, newChild) // Single path, no copying required - : expandOrCreateChainNode(transition, newChild); - - return preserveContent(node, skippedContent, attachedChild, false); - } - - private int applyContent(int node, R value, UpsertTransformer transformer) throws TrieSpaceExhaustedException - { - if (isNull(node)) - return addContent(transformer.apply(null, value)); - - if (isLeaf(node)) - { - int contentId = node; - setContent(contentId, transformer.apply(getContent(contentId), value)); - return node; - } - - if (offset(node) == PREFIX_OFFSET) - { - int contentId = getIntVolatile(node + PREFIX_CONTENT_OFFSET); - setContent(contentId, transformer.apply(getContent(contentId), value)); - return node; - } - else - return createPrefixNode(addContent(transformer.apply(null, value)), node, false); - } - - private void completeMutation() - { - cellAllocator.completeMutation(); - objectAllocator.completeMutation(); - } - - private void abortMutation() - { - 
cellAllocator.abortMutation(); - objectAllocator.abortMutation(); - } - - /** - * Returns true if the allocation threshold has been reached. To be called by the the writing thread (ideally, just - * after the write completes). When this returns true, the user should switch to a new trie as soon as feasible. - *

    - * The trie expects up to 10% growth above this threshold. Any growth beyond that may be done inefficiently, and - * the trie will fail altogether when the size grows beyond 2G - 256 bytes. - */ - public boolean reachedAllocatedSizeThreshold() - { - return allocatedPos >= ALLOCATED_SIZE_THRESHOLD; - } - - /** - * For tests only! Advance the allocation pointer (and allocate space) by this much to test behaviour close to - * full. - */ - @VisibleForTesting - int advanceAllocatedPos(int wantedPos) throws TrieSpaceExhaustedException - { - while (allocatedPos < wantedPos) - allocateCell(); - return allocatedPos; - } - - /** - * For tests only! Returns the current allocation position. - */ - @VisibleForTesting - int getAllocatedPos() - { - return allocatedPos; - } - - /** - * Returns the off heap size of the memtable trie itself, not counting any space taken by referenced content, or - * any space that has been allocated but is not currently in use (e.g. recycled cells or preallocated buffer). - * The latter means we are undercounting the actual usage, but the purpose of this reporting is to decide when - * to flush out e.g. a memtable and if we include the unused space we would almost always end up flushing out - * immediately after allocating a large buffer and not having a chance to use it. Counting only used space makes it - * possible to flush out before making these large allocations. - */ - public long usedSizeOffHeap() - { - return bufferType == BufferType.ON_HEAP ? 0 : usedBufferSpace(); - } - - /** - * Returns the on heap size of the memtable trie itself, not counting any space taken by referenced content, or - * any space that has been allocated but is not currently in use (e.g. recycled cells or preallocated buffer). - * The latter means we are undercounting the actual usage, but the purpose of this reporting is to decide when - * to flush out e.g. 
a memtable and if we include the unused space we would almost always end up flushing out - * immediately after allocating a large buffer and not having a chance to use it. Counting only used space makes it - * possible to flush out before making these large allocations. - */ - public long usedSizeOnHeap() - { - return usedObjectSpace() + - REFERENCE_ARRAY_ON_HEAP_SIZE * getBufferIdx(contentCount, CONTENTS_START_SHIFT, CONTENTS_START_SIZE) + - (bufferType == BufferType.ON_HEAP ? usedBufferSpace() + EMPTY_SIZE_ON_HEAP : EMPTY_SIZE_OFF_HEAP) + - REFERENCE_ARRAY_ON_HEAP_SIZE * getBufferIdx(allocatedPos, BUF_START_SHIFT, BUF_START_SIZE); - } - - private long usedBufferSpace() - { - return allocatedPos - cellAllocator.indexCountInPipeline() * CELL_SIZE; - } - - private long usedObjectSpace() - { - return (contentCount - objectAllocator.indexCountInPipeline()) * MEMORY_LAYOUT.getReferenceSize(); - } - - /** - * Returns the amount of memory that has been allocated for various buffers but isn't currently in use. - * The total on-heap space used by the trie is {@code usedSizeOnHeap() + unusedReservedOnHeapMemory()}. - */ - @VisibleForTesting - public long unusedReservedOnHeapMemory() - { - int bufferOverhead = 0; - if (bufferType == BufferType.ON_HEAP) - { - int pos = this.allocatedPos; - UnsafeBuffer buffer = getBuffer(pos); - if (buffer != null) - bufferOverhead = buffer.capacity() - inBufferOffset(pos); - bufferOverhead += cellAllocator.indexCountInPipeline() * CELL_SIZE; - } - - int index = contentCount; - int leadBit = getBufferIdx(index, CONTENTS_START_SHIFT, CONTENTS_START_SIZE); - int ofs = inBufferOffset(index, leadBit, CONTENTS_START_SIZE); - AtomicReferenceArray contentArray = contentArrays[leadBit]; - int contentOverhead = ((contentArray != null ? 
contentArray.length() : 0) - ofs); - contentOverhead += objectAllocator.indexCountInPipeline(); - contentOverhead *= MEMORY_LAYOUT.getReferenceSize(); - - return bufferOverhead + contentOverhead; - } - - /** - * Release all recycled content references, including the ones waiting in still incomplete recycling lists. - * This is a test method and can cause null pointer exceptions if used on a live trie. - *

    - * If similar functionality is required for non-test purposes, a version of this should be developed that only - * releases references on barrier-complete lists. - */ - @VisibleForTesting - public void releaseReferencesUnsafe() - { - for (int idx : objectAllocator.indexesInPipeline()) - setContent(~idx, null); - } - - /** - * Returns the number of values in the trie - */ - public int valuesCount() - { - return contentCount; - } } diff --git a/src/java/org/apache/cassandra/db/tries/InMemoryTrie.md b/src/java/org/apache/cassandra/db/tries/InMemoryTrie.md index 1952d864e056..ce3a2c5e70d9 100644 --- a/src/java/org/apache/cassandra/db/tries/InMemoryTrie.md +++ b/src/java/org/apache/cassandra/db/tries/InMemoryTrie.md @@ -22,9 +22,10 @@ The `InMemoryTrie` is one of the main components of the trie infrastructure, a m modification and reads executing concurrently with writes from a single mutator thread. The main features of its implementation are: -- full support of the `Trie` interface +- full support of the `Trie` hierarchy of interfaces - using nodes of several different types for efficiency - support for content on any node, including intermediate (prefix) +- support for alternate branch pointers - support for writes from a single mutator thread concurrent with multiple readers - various consistency and atomicity guarantees for readers - memory management, off-heap or on-heap @@ -34,14 +35,52 @@ The main features of its implementation are: ## Memory layout One of the main design drivers of the memtable trie is the desire to avoid on-heap storage and Java object management. -The trie thus implements its own memory management for the structure of the trie (content is, at this time, still given -as Java objects in a content array). The structure resides in one `UnsafeBuffer` (which can be on or off heap as -desired) and is broken up in 32-byte "cells", which are the unit of allocation, update and reuse. 
+The trie thus implements its own memory management for the structure of the trie. The structure resides in one +`UnsafeBuffer` (which can be on or off heap as desired) and is broken up in 32-byte "cells", which are the unit of +allocation, update and reuse. Like all tries, `InMemoryTrie` is built from nodes and has a root pointer. The nodes reside in cells, but there is no 1:1 correspondence between nodes and cells - some node types pack multiple in one cell, while other types require multiple cells. +### Content storage + +In-memory tries support two approaches for storing content/payload data: + +#### `ContentManagerPojo` - Object array storage + +This default approach stores content as Java objects in a separate content array. Leaf nodes reference values by +storing the array index as a negative pointer value (where masking away the sign and flags gives the index in the +array). This approach is simple but keeps content on-heap as Java objects, which can lead to garbage collection +pressure for large tries. + +#### `ContentManagerBytes` - Direct buffer storage + +The alternative stores content directly in the trie's buffer cells, using the same 32-byte cells that store the +trie structure. This eliminates the need for a separate content array and allows content to be stored off-heap +alongside the trie structure. + +`ContentManagerBytes` relies on a `ContentSerializer` interface to handle encoding and decoding of content. The +serializer defines: +- How to serialize content into a 32-byte cell, returning `offsetBits`, a 5-bit offset which is combined with + the cell base to form the leaf pointer/id. +- How to deserialize content from a 32-byte cell and the pointer's `offsetBits` +- Which values should be treated as "special" (encoded without using cells, using `offsetBits == 0x1F`) + +The `offsetBits` are to be used to help determine the type of content, and they also may be used to store e.g. +length and flags that could otherwise take up a byte in the cell. 
+ +**Special values**: Some content types appear frequently and carry no additional data (e.g. markers, empty values). +These can be encoded as special values and mapped directly to singleton objects without allocating trie cells. The +`ContentSerializer` determines which values qualify as special via the `idIfSpecial()` method. + +**Large values**: When content doesn't fit in 32 bytes, the serializer can use its own external storage mechanism +(e.g. slab buffers) and store only a handle/pointer in the trie cell. The serializer is responsible for managing +this external storage and releasing it when content is removed. + +This approach can be used to drastically reduce or completely eliminate the per-item on-heap presence of a trie, +improving overall garbage collection efficiency. + ### Pointers and node types A "pointer" is an integer that points to a node in the trie buffer. A pointer specifies the location of the node @@ -91,11 +130,11 @@ to the type of node, in this case the bits also define the length of the chain & The simplest chain node has one transition leading to one child and is laid out like this: -offset|content|example ----|---|--- -00 - 1A|unused| -1B |character|41 A -1C - 1F|child pointer|FFFFFFFF +| offset | content | example | +|---------|---------------|----------| +| 00 - 1A | unused | | +| 1B | character | 41 A | +| 1C - 1F | child pointer | FFFFFFFF | where the pointer points to the `1B` line in the cell. @@ -105,11 +144,11 @@ pointer `0x13B` point to a node with one child with transition `0x41` `A` to a l Another chain cell, which points to this one, can be added in the same cell by placing a character at offset `1A`. This new node is effectively laid out as -offset|content|example ----|---|--- -00 - 19|unused| -1A |character|48 H -1B - 1F|unused| +| offset | content | example | +|---------|-----------|---------| +| 00 - 19 | unused | | +| 1A | character | 48 H | +| 1B - 1F | unused | | where the pointer points to line `1A`. 
This node has one transition, and the child pointer is implicit as the node's pointer plus one. @@ -120,13 +159,13 @@ Example: The cell `xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx43484 pointer `0x139` point to a node with one child with transition `0x43` `C` to a node with one child with transition `0x48` `H` to a node with one child with transition `0x41` `A` to a leaf node with content `contentArray[0]`. -offset|content|example ----|---|--- -00 - 18|unused| -19 |character|43 C -1A |character|48 H -1B |character|41 A -1C - 1F|child pointer|FFFFFFFF +| offset | content | example | +|----------|---------------|----------| +| 00 - 18 | unused | | +| 19 | character | 43 C | +| 1A | character | 48 H | +| 1B | character | 41 A | +| 1C - 1F | child pointer | FFFFFFFF | In this example `0x13A` and `0x13B` are also valid pointers to the respective chains and could be referenced from other @@ -144,21 +183,21 @@ Note: offset `0x00` also specifies a chain node, but the pointer 0 is a special Sparse nodes are used when a node has at least two children, and all pointers and transition characters can fit in one cell, which limits the maximum number of children to 6. Their layout is: -offset|content| ----|---| -00 - 03|child pointer 0| -04 - 07|child pointer 1| -08 - 0B|child pointer 2| -0C - 0F|child pointer 3| -10 - 13|child pointer 4| -14 - 17|child pointer 5| -18 |character 0| -19 |character 1| -1A |character 2| -1B |character 3| -1C |character 4| -1D |character 5| -1E - 1F|order word| +| offset | content | +|---------|-----------------| +| 00 - 03 | child pointer 0 | +| 04 - 07 | child pointer 1 | +| 08 - 0B | child pointer 2 | +| 0C - 0F | child pointer 3 | +| 10 - 13 | child pointer 4 | +| 14 - 17 | child pointer 5 | +| 18 | character 0 | +| 19 | character 1 | +| 1A | character 2 | +| 1B | character 3 | +| 1C | character 4 | +| 1D | character 5 | +| 1E - 1F | order word | where the pointer points to the line `1E` (i.e. 
the type identifier for a sparse node is `0x1E`). @@ -179,15 +218,15 @@ To do this, the mutating thread will have to convert the chain node into a spars (e.g. `0x240`-`0x25F`) and filling in the sparse node `00000238 0000013A 00000000 00000000 00000000 00000000 41430000 00000006` with pointer `0x25E`: -offset|content|example ----|---|--- -00 - 03|child pointer 0| 00000238 -04 - 07|child pointer 1| 0000013A -08 - 17|unused| -18 |character 0| 41 A -19 |character 1| 43 C -1A - 1D|unused| -1E - 1F|order word, always 10| 0006 = 10 (base 6) +| offset | content | example | +|---------|-----------------------|--------------------| +| 00 - 03 | child pointer 0 | 00000238 | +| 04 - 07 | child pointer 1 | 0000013A | +| 08 - 17 | unused | | +| 18 | character 0 | 41 A | +| 19 | character 1 | 43 C | +| 1A - 1D | unused | | +| 1E - 1F | order word, always 10 | 0006 = 10 (base 6) | This is the smallest kind of sparse node, with just two children. Two-children sparse nodes always put their two children in order (we can do this as this does not happen in response to an addition of a new child to @@ -200,17 +239,17 @@ least significant digit of the order word, 0. The second child is specified by t Suppose we then need to add a new child, using character `0x35` `5` and child `0x33B`. The node will change to `00000238 0000013A 0000033B 00000000 00000000 00000000 41433500 00000026` and the pointer to it stays the same. 
-offset|content|example ----|---|--- -00 - 03|child pointer 0| 00000238 -04 - 07|child pointer 1| 0000013A -08 - 0B|child pointer 2| 0000033B -0C - 17|unused| -18 |character 0| 41 A -19 |character 1| 43 C -1A |character 2| 35 5 -1B - 1D|unused| -1E - 1F|order word| 0026 = 102 (base 6) +| offset | content | example | +|---------|-----------------|---------------------| +| 00 - 03 | child pointer 0 | 00000238 | +| 04 - 07 | child pointer 1 | 0000013A | +| 08 - 0B | child pointer 2 | 0000033B | +| 0C - 17 | unused | | +| 18 | character 0 | 41 A | +| 19 | character 1 | 43 C | +| 1A | character 2 | 35 5 | +| 1B - 1D | unused | | +| 1E - 1F | order word | 0026 = 102 (base 6) | This node has three (the number of digits in the order word) children. The first child is at the position specified by the least significant digit of the order word, 2. The second child is specified by the second least significant digit, @@ -223,21 +262,21 @@ cannot miscount the number of children. The addition of children can continue until we have 6, for example `00000238 0000013A 0000033B 0000035C 0000037A 0000041B 41433542 50338129` (pointer `0x25E`) for -offset|content|example ----|---|--- -00 - 03|child pointer 0| 00000238 -04 - 07|child pointer 1| 0000013A -08 - 0B|child pointer 2| 0000033B -0C - 0F|child pointer 3| 0000035C -10 - 13|child pointer 4| 0000037A -14 - 17|child pointer 5| 0000041B -18 |character 0| 41 A -19 |character 1| 43 C -1A |character 2| 35 5 -1B |character 3| 42 B -1C |character 4| 50 P -1D |character 5| 33 3 -1E - 1F|order word| 8129 = 413025 (base 6) +| offset | content | example | +|----------|-----------------|------------------------| +| 00 - 03 | child pointer 0 | 00000238 | +| 04 - 07 | child pointer 1 | 0000013A | +| 08 - 0B | child pointer 2 | 0000033B | +| 0C - 0F | child pointer 3 | 0000035C | +| 10 - 13 | child pointer 4 | 0000037A | +| 14 - 17 | child pointer 5 | 0000041B | +| 18 | character 0 | 41 A | +| 19 | character 1 | 43 C | +| 1A | character 2 | 35 5 | 
+| 1B | character 3 | 42 B | +| 1C | character 4 | 50 P | +| 1D | character 5 | 33 3 | +| 1E - 1F | order word | 8129 = 413025 (base 6) | Beyond 6 children, a node needs to be converted to split. @@ -249,39 +288,39 @@ method we chose is to construct a "mini-trie" with 2-3-3 bit transitions. A split node is identified by the `0x1C` offset. The starting cell of a split node has this layout: -offset|content| ----|---| -00 - 0F|unused| -10 - 13|mid-cell for leading 00| -14 - 17|mid-cell for leading 01| -18 - 1B|mid-cell for leading 10| -1C - 1F|mid-cell for leading 11| +| offset | content | +|---------|-------------------------| +| 00 - 0F | unused | +| 10 - 13 | mid-cell for leading 00 | +| 14 - 17 | mid-cell for leading 01 | +| 18 - 1B | mid-cell for leading 10 | +| 1C - 1F | mid-cell for leading 11 | (pointers to this node point to the `1C` line) and where each mid-cell contains: -offset|content| ----|---| -00 - 03|end-cell for middle 000| -04 - 07|end-cell for middle 001| -08 - 0B|end-cell for middle 010| -0C - 0F|end-cell for middle 011| -10 - 13|end-cell for middle 100| -14 - 17|end-cell for middle 101| -18 - 1B|end-cell for middle 110| -1C - 1F|end-cell for middle 111| +| offset | content | +|---------|-------------------------| +| 00 - 03 | end-cell for middle 000 | +| 04 - 07 | end-cell for middle 001 | +| 08 - 0B | end-cell for middle 010 | +| 0C - 0F | end-cell for middle 011 | +| 10 - 13 | end-cell for middle 100 | +| 14 - 17 | end-cell for middle 101 | +| 18 - 1B | end-cell for middle 110 | +| 1C - 1F | end-cell for middle 111 | and end-cell: -offset|content| ----|---| -00 - 03|pointer to child for ending 000| -04 - 07|pointer to child for ending 001| -08 - 0B|pointer to child for ending 010| -0C - 0F|pointer to child for ending 011| -10 - 13|pointer to child for ending 100| -14 - 17|pointer to child for ending 101| -18 - 1B|pointer to child for ending 110| -1C - 1F|pointer to child for ending 111| +| offset | content | 
+|---------|---------------------------------| +| 00 - 03 | pointer to child for ending 000 | +| 04 - 07 | pointer to child for ending 001 | +| 08 - 0B | pointer to child for ending 010 | +| 0C - 0F | pointer to child for ending 011 | +| 10 - 13 | pointer to child for ending 100 | +| 14 - 17 | pointer to child for ending 101 | +| 18 - 1B | pointer to child for ending 110 | +| 1C - 1F | pointer to child for ending 111 | In any of the cell or pointer positions we can have `NONE`, meaning that such a child (or cell of children) does not exist. At minimum, a split node occupies 3 cells (one leading, one mid and one end), and at maximum — @@ -297,78 +336,78 @@ section. This will generate the following structure: Leading cell (e.g. `0x500`-`0x51F` with pointer `0x51C`) -offset|content|example ----|---|--- -00 - 0F|unused| -10 - 13|mid-cell for leading 00|0000053C -14 - 17|mid-cell for leading 01|0000057C -18 - 1B|mid-cell for leading 10|00000000 NONE -1C - 1F|mid-cell for leading 11|00000000 NONE +| offset | content | example | +|---------|-------------------------|---------------| +| 00 - 0F | unused | | +| 10 - 13 | mid-cell for leading 00 | 0000053C | +| 14 - 17 | mid-cell for leading 01 | 0000057C | +| 18 - 1B | mid-cell for leading 10 | 00000000 NONE | +| 1C - 1F | mid-cell for leading 11 | 00000000 NONE | Mid cell `00` at `0x520`-`0x53F`: -offset|content|example ----|---|--- -00 - 03|end-cell for middle 000|00000000 NONE -04 - 07|end-cell for middle 001|00000000 NONE -08 - 0B|end-cell for middle 010|00000000 NONE -0C - 0F|end-cell for middle 011|00000000 NONE -10 - 13|end-cell for middle 100|00000000 NONE -14 - 17|end-cell for middle 101|00000000 NONE -18 - 1B|end-cell for middle 110|0000055C -1C - 1F|end-cell for middle 111|00000000 NONE +| offset | content | example | +|---------|-------------------------|---------------| +| 00 - 03 | end-cell for middle 000 | 00000000 NONE | +| 04 - 07 | end-cell for middle 001 | 00000000 NONE | +| 08 - 0B | end-cell for middle 
010 | 00000000 NONE | +| 0C - 0F | end-cell for middle 011 | 00000000 NONE | +| 10 - 13 | end-cell for middle 100 | 00000000 NONE | +| 14 - 17 | end-cell for middle 101 | 00000000 NONE | +| 18 - 1B | end-cell for middle 110 | 0000055C | +| 1C - 1F | end-cell for middle 111 | 00000000 NONE | End cell `00 110` at `0x540`-`0x55F`: -offset|content|example ----|---|--- -00 - 03|pointer to child for ending 000|00000000 NONE -04 - 07|pointer to child for ending 001|00000000 NONE -08 - 0B|pointer to child for ending 010|00000000 NONE -0C - 0F|pointer to child for ending 011|0000041B -10 - 13|pointer to child for ending 100|00000000 NONE -14 - 17|pointer to child for ending 101|0000033B -18 - 1B|pointer to child for ending 110|00000000 NONE -1C - 1F|pointer to child for ending 111|00000000 NONE +| offset | content | example | +|---------|---------------------------------|---------------| +| 00 - 03 | pointer to child for ending 000 | 00000000 NONE | +| 04 - 07 | pointer to child for ending 001 | 00000000 NONE | +| 08 - 0B | pointer to child for ending 010 | 00000000 NONE | +| 0C - 0F | pointer to child for ending 011 | 0000041B | +| 10 - 13 | pointer to child for ending 100 | 00000000 NONE | +| 14 - 17 | pointer to child for ending 101 | 0000033B | +| 18 - 1B | pointer to child for ending 110 | 00000000 NONE | +| 1C - 1F | pointer to child for ending 111 | 00000000 NONE | Mid cell `01` at `0x560`-`0x57F`: -offset|content|example ----|---|--- -00 - 03|end-cell for middle 000|0000059C -04 - 07|end-cell for middle 001|00000000 NONE -08 - 0B|end-cell for middle 010|000005BC -0C - 0F|end-cell for middle 011|00000000 NONE -10 - 13|end-cell for middle 100|00000000 NONE -14 - 17|end-cell for middle 101|00000000 NONE -18 - 1B|end-cell for middle 110|00000000 NONE -1C - 1F|end-cell for middle 111|00000000 NONE +| offset | content | example | +|---------|-------------------------|---------------| +| 00 - 03 | end-cell for middle 000 | 0000059C | +| 04 - 07 | end-cell for middle 001 | 
00000000 NONE | +| 08 - 0B | end-cell for middle 010 | 000005BC | +| 0C - 0F | end-cell for middle 011 | 00000000 NONE | +| 10 - 13 | end-cell for middle 100 | 00000000 NONE | +| 14 - 17 | end-cell for middle 101 | 00000000 NONE | +| 18 - 1B | end-cell for middle 110 | 00000000 NONE | +| 1C - 1F | end-cell for middle 111 | 00000000 NONE | End cell `01 000` at `0x580`-`0x59F`: -offset|content|example ----|---|--- -00 - 03|pointer to child for ending 000|00000000 NONE -04 - 07|pointer to child for ending 001|00000238 -08 - 0B|pointer to child for ending 010|0000035C -0C - 0F|pointer to child for ending 011|0000013A -10 - 13|pointer to child for ending 100|00000000 NONE -14 - 17|pointer to child for ending 101|00000000 NONE -18 - 1B|pointer to child for ending 110|00000000 NONE -1C - 1F|pointer to child for ending 111|00000000 NONE +| offset | content | example | +|---------|---------------------------------|---------------| +| 00 - 03 | pointer to child for ending 000 | 00000000 NONE | +| 04 - 07 | pointer to child for ending 001 | 00000238 | +| 08 - 0B | pointer to child for ending 010 | 0000035C | +| 0C - 0F | pointer to child for ending 011 | 0000013A | +| 10 - 13 | pointer to child for ending 100 | 00000000 NONE | +| 14 - 17 | pointer to child for ending 101 | 00000000 NONE | +| 18 - 1B | pointer to child for ending 110 | 00000000 NONE | +| 1C - 1F | pointer to child for ending 111 | 00000000 NONE | End cell `01 010` at `0x5A0`-`0x5BF`: -offset|content|example ----|---|--- -00 - 03|pointer to child for ending 000|0000037A -04 - 07|pointer to child for ending 001|00000455 -08 - 0B|pointer to child for ending 010|00000000 NONE -0C - 0F|pointer to child for ending 011|00000000 NONE -10 - 13|pointer to child for ending 100|00000000 NONE -14 - 17|pointer to child for ending 101|00000000 NONE -18 - 1B|pointer to child for ending 110|00000000 NONE -1C - 1F|pointer to child for ending 111|00000000 NONE +| offset | content | example | 
+|---------|---------------------------------|---------------| +| 00 - 03 | pointer to child for ending 000 | 0000037A | +| 04 - 07 | pointer to child for ending 001 | 00000455 | +| 08 - 0B | pointer to child for ending 010 | 00000000 NONE | +| 0C - 0F | pointer to child for ending 011 | 00000000 NONE | +| 10 - 13 | pointer to child for ending 100 | 00000000 NONE | +| 14 - 17 | pointer to child for ending 101 | 00000000 NONE | +| 18 - 1B | pointer to child for ending 110 | 00000000 NONE | +| 1C - 1F | pointer to child for ending 111 | 00000000 NONE | To find a child in this structure, we follow the transitions along the bits of the mini-trie. For example, for `0x42` `B` = `0b01000010` we start at `0x51C`, take the `01` pointer to `0x57C`, then the `000` pointer to `0x59C` and finally @@ -377,14 +416,15 @@ reachable with pointers, they only make sense as substructure of the split node. ![graph](InMemoryTrie.md.g3.svg) -#### Content `Prefix` +#### `Prefix` nodes Prefix nodes are not nodes in themselves, but they add information to the node they lead to. Specifically, they -encode an index in the content array, and a pointer to the node to which this content is attached. In anything other -than the content, they are equivalent to the linked node — i.e. a prefix node pointer has the same children as -the node it links to (another way to see this is as a content-carrying node is one that has an _ε_ transition to the -linked node and no other features except added content). We do not allow more than one prefix to a node (i.e. prefix -can't point to another prefix), and the child of a prefix node cannot be a leaf. +encode an index in the content array, a pointer to any alternate branch, and a pointer to the node to which this +additional information is attached. In anything other than the content/alternate, they are equivalent to the linked node +— i.e. 
a prefix node pointer has the same children as the node it links to (another way to see this is as a +content-carrying node that has an _ε_ transition to the linked node and no other features except added content and/or +alternate branch). We do not allow more than one prefix to a node (i.e. prefix can't point to another prefix), and the +child of a prefix node cannot be a leaf. There are two types of prefixes: - standalone, which has a full 32-bit pointer to the linked node @@ -393,42 +433,52 @@ of the linked node Standalone prefixes have this layout: -offset|content|example ----|---|--- -00 - 03|content index|00000001 -04|standalone flag, 0xFF|FF -05 - 1B|unused| -1C - 1F|linked node pointer|0000025E +| offset | content | example | +|---------|--------------------------|---------------| +| 00 - 03 | content pointer | FFFFFFFE ~1 | +| 04 - 07 | alternate branch pointer | 00000000 NONE | +| 08 | standalone flag, 0xFF | FF | +| 09 - 1B | unused | | +| 1C - 1F | linked node pointer | 0000025E | and pointer offset `0x1F`. The sample values above will be the ones used to link a prefix node to our `Sparse` example, where a prefix cannot be embedded as all the bytes of the cell are in use. If we want to attach the same prefix to the `Split` example, we will place this -offset|content|example ----|---|--- -00 - 03|content index|00000001 -04|embedded offset within cell|1C -05 - 1F|unused| +| offset | content | example | +|---------|-----------------------------|---------------| +| 00 - 03 | content pointer | FFFFFFFE ~1 | +| 04 - 07 | alternate branch pointer | 00000000 NONE | +| 08 | embedded offset within cell | 1C | +| 09 - 1F | unused | | _inside_ the leading split cell, with pointer `0x1F`. Since this is an embedded node, the augmented one resides within the same cell, and thus we need only 5 bits to encode the pointer (the other 27 are the same as the prefix's). 
-The combined content of the cell at `0x500-0x51F` will then be `00000001 1C000000 00000000 00000000 00000520 00000560 +The combined content of the cell at `0x500-0x51F` will then be `FFFFFFFE 00000000 1C000000 00000000 00000520 00000560 00000000 00000000`: -offset|content|example ----|---|--- -00 - 03|content index|00000001 -04|embedded offset within cell|1C -05 - 0F|unused| -10 - 13|mid-cell for leading 00|00000520 -14 - 17|mid-cell for leading 01|00000560 -18 - 1B|mid-cell for leading 10|00000000 NONE -1C - 1F|mid-cell for leading 11|00000000 NONE +| offset | content | example | +|---------|-----------------------------|---------------| +| 00 - 03 | content pointer | FFFFFFFE ~1 | +| 04 - 07 | alternate branch pointer | 00000000 NONE | +| 08 | embedded offset within cell | 1C | +| 09 - 0F | unused | | +| 10 - 13 | mid-cell for leading 00 | 00000520 | +| 14 - 17 | mid-cell for leading 01 | 00000560 | +| 18 - 1B | mid-cell for leading 10 | 00000000 NONE | +| 1C - 1F | mid-cell for leading 11 | 00000000 NONE | Both `0x51C` and `0x51F` are valid pointers in this cell. The former refers to the plain split node, the latter to its content-augmented version. The only difference between the two is the result of a call to `content()`. +Note that for code simplicity we store content indexes as pointers, i.e. with the same value as the leaf node for the +given content index. In the example above, `contentArray[1]` is encoded as `~1` i.e. `0xFFFFFFFE`. + +Alternate branch pointers can store a link to another branch of the trie. They are used by deletion-aware tries to store +deletion branches and are ignored by other types of tries. Another possible application of these would be to implement +non-deterministic tries. 
+ ![graph](InMemoryTrie.md.g4.svg) @@ -443,7 +493,8 @@ interface implemented by `InMemoryTrie` (see `Trie.md` for a description of curs ![graph](InMemoryTrie.md.wc1.svg) -(Edges in black show the trie's structure, and the ones in light blue the path the cursor walk takes.) +(Edges in black show the trie's structure, and the ones in light blue the path the +cursor walk takes.) ### Cursors over `InMemoryTrie` @@ -561,11 +612,11 @@ and we can put in the new value by performing a volatile write. For example, updating `N -> 0x39C` is accomplished by making the volatile write: -offset|content|before|after ----|---|---|--- -00-1A|irrelevant|| -1B|character|N|N -1C-1F|pointer|0000031E|_**0000039C**_ +| offset |content|before|after| +|--------|---|---|---| + | 00-1A |irrelevant||| + | 1B |character|N|N| + | 1C-1F |pointer|0000031E|_**0000039C**_| (Here and below normal writes are in bold and volatile writes in bold italic.) @@ -584,17 +635,17 @@ where the pointer to the old child is written, and we can update it by doing a v For example, updating `C -> 0x51E` in a sparse node can be: -offset|content|before|after ----|---|---|--- -00 - 03|child pointer 0| 00000238|00000238 -04 - 07|child pointer 1| 0000013A|_**0000051E**_ -08 - 0B|child pointer 2| 0000033B|0000033B -0C - 17|unused| -18 |character 0| 41 A|41 A -19 |character 1| 43 C|43 C -1A |character 2| 35 5|35 5 -1B - 1D|unused| -1E - 1F|order word| 0026 = 102 (base 6) +| offset |content|before|after| +|---------|---|---|---| + | 00 - 03 |child pointer 0| 00000238|00000238| + | 04 - 07 |child pointer 1| 0000013A|_**0000051E**_| + | 08 - 0B |child pointer 2| 0000033B|0000033B| + | 0C - 17 |unused|| + | 18 |character 0| 41 A|41 A| + | 19 |character 1| 43 C|43 C| + | 1A |character 2| 35 5|35 5| + | 1B - 1D |unused|| + | 1E - 1F |order word| 0026 = 102 (base 6)| #### Adding a new child to `Split` @@ -614,29 +665,29 @@ In any of these cases, readers have to pass through the volatile update to reach For example, to add `x -> 
0x71A` (`x` is `0x78` or `0b01111000`) to the split node example needs a new end cell for `01 111` (for example at `0x720-0x73F`) (these writes can be non-volatile): -offset|content|before|after ----|---|---|--- -00 - 03|pointer to child for ending 000|n/a|**0000071A** -04 - 07|pointer to child for ending 001|n/a|**00000000** NONE -08 - 0B|pointer to child for ending 010|n/a|**00000000** NONE -0C - 0F|pointer to child for ending 011|n/a|**00000000** NONE -10 - 13|pointer to child for ending 100|n/a|**00000000** NONE -14 - 17|pointer to child for ending 101|n/a|**00000000** NONE -18 - 1B|pointer to child for ending 110|n/a|**00000000** NONE -1C - 1F|pointer to child for ending 111|n/a|**00000000** NONE +| offset | content | before | after | +|---------|---------------------------------|--------|-------------------| +| 00 - 03 | pointer to child for ending 000 | n/a | **0000071A** | +| 04 - 07 | pointer to child for ending 001 | n/a | **00000000** NONE | +| 08 - 0B | pointer to child for ending 010 | n/a | **00000000** NONE | +| 0C - 0F | pointer to child for ending 011 | n/a | **00000000** NONE | +| 10 - 13 | pointer to child for ending 100 | n/a | **00000000** NONE | +| 14 - 17 | pointer to child for ending 101 | n/a | **00000000** NONE | +| 18 - 1B | pointer to child for ending 110 | n/a | **00000000** NONE | +| 1C - 1F | pointer to child for ending 111 | n/a | **00000000** NONE | and this volatile write to the mid cell `0x520`: -offset|content|before|after ----|---|---|--- -00 - 03|end-cell for middle 000|00000000 NONE|00000000 NONE -04 - 07|end-cell for middle 001|00000000 NONE|00000000 NONE -08 - 0B|end-cell for middle 010|00000000 NONE|00000000 NONE -0C - 0F|end-cell for middle 011|00000000 NONE|00000000 NONE -10 - 13|end-cell for middle 100|00000000 NONE|00000000 NONE -14 - 17|end-cell for middle 101|00000000 NONE|00000000 NONE -18 - 1B|end-cell for middle 110|0000055C|0000055C -1C - 1F|end-cell for middle 111|00000000 NONE|_**0000073C**_ +| offset | content 
| before | after | +|---------|-------------------------|---------------|----------------| +| 00 - 03 | end-cell for middle 000 | 00000000 NONE | 00000000 NONE | +| 04 - 07 | end-cell for middle 001 | 00000000 NONE | 00000000 NONE | +| 08 - 0B | end-cell for middle 010 | 00000000 NONE | 00000000 NONE | +| 0C - 0F | end-cell for middle 011 | 00000000 NONE | 00000000 NONE | +| 10 - 13 | end-cell for middle 100 | 00000000 NONE | 00000000 NONE | +| 14 - 17 | end-cell for middle 101 | 00000000 NONE | 00000000 NONE | +| 18 - 1B | end-cell for middle 110 | 0000055C | 0000055C | +| 1C - 1F | end-cell for middle 111 | 00000000 NONE | _**0000073C**_ | The start cell, and the other mid and end cells remain unchanged. @@ -664,19 +715,19 @@ and stop searching when they find a `NONE` pointer. For example, adding `x -> 0x71A` to the sparse example above is done by: -offset|content|before|after ----|---|---|--- -00 - 03|child pointer 0| 00000238|00000238 -04 - 07|child pointer 1| 0000051E|0000051E -08 - 0B|child pointer 2| 0000033B|0000033B -0C - 0F|child pointer 3|any|_**0000071A**_ -10 - 17|unused|NONE|NONE -18 |character 0| 41 A|41 A -19 |character 1| 43 C|43 C -1A |character 2| 35 5|35 5 -1B |character 3| any |**78** x -1C - 1D|unused|00 00|00 00 -1E - 1F|order word|0026 = 102 (base 6)|_**02AE**_ = 3102 (base 6) +| offset | content | before | after | +|---------|-----------------|---------------------|----------------------------| +| 00 - 03 | child pointer 0 | 00000238 | 00000238 | +| 04 - 07 | child pointer 1 | 0000051E | 0000051E | +| 08 - 0B | child pointer 2 | 0000033B | 0000033B | +| 0C - 0F | child pointer 3 | any | _**0000071A**_ | +| 10 - 17 | unused | NONE | NONE | +| 18 | character 0 | 41 A | 41 A | +| 19 | character 1 | 43 C | 43 C | +| 1A | character 2 | 35 5 | 35 5 | +| 1B | character 3 | any | **78** x | +| 1C - 1D | unused | 00 00 | 00 00 | +| 1E - 1F | order word | 0026 = 102 (base 6) | _**02AE**_ = 3102 (base 6) | where we first write the character, then 
volatile write the pointer, and finally the order word. @@ -804,6 +855,29 @@ Ascending back to add the child `~3`, we add a child to `NONE` and get `updatedP the existing content, we create the embedded prefix node `updatedPreContentNode = 0x0BF` with `contentIndex = 1` and pass that on to the recursion. +### Deletion + +Deletion of data in `InMemoryTrie`s is achieved by returning `null` for the value that needs to be put in a position +with existing content (both `apply` and `putSingleton` take an `UpsertTransformer` that is applied to the combination +of existing and update value; this transformer can choose to return `null`). + +This automatically results in `NONE` value for the content id. On the way up the recursive application chain, we +recognize `NONE` for the child pointer and apply this as removal of the child. Depending on the type of node, this may +be achieved by dropping the node (`Chain`), by putting the `NONE` value as the child pointer, or by duplicating a node +to switch its type or remove a child. This may in turn result in an empty node, which returns `NONE` as the child +pointer, continuing the removal upwards in the recursive chain. + +The example below shows the deletion of "tractor" from one of the modified tries above. + +![graph](InMemoryTrie.md.d1.svg) + +The associated value is set to `null`, resulting in `NONE` being returned from the recursive application, which removes +the only child of node `0x01B`, resulting in an empty node i.e. `NONE`. This propagates up until the sparse node +`0x0DE`, where one child remains after the removal. Since sparse nodes cannot have only one child, this node is freed, +and a chain node is created for the remaining `v` transition -- this child node is placed within the child chain cell, +which has room for further transitions. The pointer to this chain node is passed back up the recursive application +chain, and the parent node is updated to point to it. 
+ ### Memory management and cell reuse As mentioned in the beginning, in order to avoid long garbage collection pauses due to large long-lasting content in @@ -893,3 +967,51 @@ and should be discarded; because the strategy works with blocks, it will actuall happen often, but any users of tries that expect them to live indefinitely (unlike memtables which are flushed regularly; an example would be the chunk cache map when/if we switch it to `InMemoryTrie`) must ensure that exceptions cannot happen during mutation, otherwise waste can slowly accumulate to bring the node down. + +### Range tries + +Range tries differ from plain ones in being able to present preceding state for any position in the trie, and the fact +that range tries need to be able to present boundaries both on the descent and ascent path for a node. In-memory +range tries do not store the preceding state information, but instead construct it during cursor iteration. + +When a range trie is stored in an in-memory trie, it stores only content values. The range cursors created keep track of +the currently active covering state (which is equal to the succeeding side of any visited boundary during advance) and +report it as `precedingState`. This information, however, is no longer valid when a `skipTo` operation is performed, as +it may skip over arbitrarily many boundaries and end up in a covered range. If `precedingState` is requested after such +a skip, the cursor needs to obtain the applicable state. This is done by descending into the current branch (in +iteration order) until the closest boundary is found, and using its preceding side. For this to work, all in-memory trie +branches must terminate in a boundary state with content, which is something that in-memory tries do maintain (see +below). + +For every boundary the range trie needs to recognize whether it falls to the left or right of the branch in order to +know whether to present it on the descent or ascent path of the iteration.
This is done by reserving a bit in the +content encoding that specifies the modifier. If something needs to be presented on the return path, its pointer is put +in the backtracking state to be returned after the branch is exhausted instead of being reported immediately. Some nodes +will contain both left- and right-side boundaries, in which case we use a prefix node where both the content and +alternate fields are set. + +Because any trie application may affect covered state, even if it is a singleton, all insertions into a range trie +are done using the `apply` method. The application itself is more elaborate than the case of simple data tries: +when `apply` is called with a range trie argument, the in-memory trie has to walk all existing positions that fall under +ranges of the trie and apply the active state to them. Additionally, it must track any active existing range to combine +it with incoming content. + +Because the incoming content is often expected to be a (newer) deletion, the resolver is expected to often return null +for combined content. This triggers removal of nodes and paths up the relevant branch (which may also result in changing +the type of a node e.g. from sparse to chain), which in turn guarantees that we remove branches that do not terminate in +non-null content. + +### Deletion-aware tries + +Deletion-aware tries are tries that contain parallel deletion branch range tries at some of their nodes. In in-memory +deletion-aware tries, the parallel branches are stored using the alternate branch pointers of prefix nodes. + +When a deletion-aware trie needs to be applied to an in-memory trie, it proceeds as normal while a deletion branch is +not seen in either source or target. When a deletion branch is found, the procedure must: +- check if the other source has a deletion branch at the same point, and if it doesn't (and `deletionsAtFixedPoints` is + not set), hoist its descendant deletion branches to this position.
+- merge the two deletion branches into the new deletion branch (using range trie merge logic). +- apply the source deletion branch to the data trie to apply any incoming deletions. +- merge the source data branch, with the target's deletion branch applied to it (to make sure any preexisting deletions + are also taken into account for incoming data if that happens to be older), into the data trie (using plain trie merge + logic, i.e. no longer checking for deletion branches). diff --git a/src/java/org/apache/cassandra/db/tries/InMemoryTrie.md.d1.svg b/src/java/org/apache/cassandra/db/tries/InMemoryTrie.md.d1.svg new file mode 100644 index 000000000000..e3c892bf4f98 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/InMemoryTrie.md.d1.svg @@ -0,0 +1,480 @@ + + + + + + + + G + + + + root + + Chain + 0x9A + + + + start + + start/end + + + + + root->start + + + 0x09A + + + + t + + 0x09B + + + + root->t + + + t + + + + root->t + + + t + + + + start->root + + + + + + tra + + Sparse + 0x0DE + + + + trav + + Chain + 0x0B8 + + + + tra->trav + + + v + + + + trac + + 0x019 + + + + tra->trac + + + c + + + + tra->trac + + + c + + + + tra2 + + Chain + 0x0B7 + + + + + trave + + 0x0B9 + + + + trav->trave + + + e + + + + tree + + contentArray[1] + + + + + traver + + 0x0BA + + + + trave->traver + + + r + + + + travers + + 0x0BB + + + + traver->travers + + + s + + + + traverse + + contentArray[3] + + + + travers->traverse + + + e + + + + trie + + contentArray[2] + + + + + tre + + Chain + 0x03B + + + + tre->tree + + + e + + + + tri + + Chain + 0x05B + + + + + tri->trie + + + e + + + + tr + + Sparse + 0x07E + + + + tr->tra + + + a + + + + tr->tra + + + a + + + + tr->tre + + + e + + + + tr->tri + + + i + + + + tr->t + + + 0x07E + + + + tr->tra2 + + + a + + + + t->root + + + 0x09B + + + + t->tr + + + r + + + + t->tr + + + r + + + + tractor + + contentArray[0] + + + + tracto + + 0x01B + + + + tractor->tracto + + + NONE + + + + tracto->tractor + + + r + + + + tracto->tractor + + + 
r + + + + tract + + 0x01A + + + + tracto->tract + + + NONE + + + + tract->tracto + + + o + + + + tract->tracto + + + o + + + + tract->trac + + + NONE + + + + + trac->tract + + + t + + + + trac->tract + + + t + + + + trac->tra2 + + + NONE + + + + tra2->trav + + + v + + + + + tra2->tr + + + 0x0B7 + + + \ No newline at end of file diff --git a/src/java/org/apache/cassandra/db/tries/InMemoryTrie.md.p1.svg b/src/java/org/apache/cassandra/db/tries/InMemoryTrie.md.p1.svg index c89c085820bf..fb31e622b819 100644 --- a/src/java/org/apache/cassandra/db/tries/InMemoryTrie.md.p1.svg +++ b/src/java/org/apache/cassandra/db/tries/InMemoryTrie.md.p1.svg @@ -55,15 +55,17 @@ digraph G { tr -> tri [label = " i"] tri -> trie [label = " e"] - { + subgraph newNodes { + edge [color = "blue"; fontcolor="blue"; arrowhead="vee"] + node [color = "blue"; fontcolor="blue"] ranksep = 0.1 tree2 [label = "Chain\n0x0BB"] tree2p [label = "Prefix\n0x0BF\ncontentArray[1]"] + tre -> tree2p [label = " e"] tree2p -> tree2 [label = " ε"] + tree2 -> trees [label = " s"] } - tre -> tree2p [label = " e"] - tree2 -> trees [label = " s"] {rank=same tra -> tre -> tri [style=invis]} {rank=same trac -> tree -> tree2p -> trie [style=invis]} @@ -98,308 +100,307 @@ digraph G { } --> - - + + G - + root - - Chain - 0x9A + + Chain + 0x9A start - - start/end + + start/end - root->start - - - 0x09A + root->start + + + 0x09A t - - 0x09B + + 0x09B - root->t - - - t + root->t + + + t - root->t - - - t + root->t + + + t - start->root - - + start->root + + tractor - - contentArray[0] + + contentArray[0] tracto - - 0x01B + + 0x01B - tracto->tractor - - - r + tracto->tractor + + + r tract - - 0x01A + + 0x01A - tract->tracto - - - o + tract->tracto + + + o trac - - 0x019 + + 0x019 - trac->tract - - - t + trac->tract + + + t tree - - contentArray[1] + + contentArray[1] tra - - Chain - 0x018 + + Chain + 0x018 - tra->trac - - - c + tra->trac + + + c tre - - Chain - 0x03B + + Chain + 0x03B tree2p - - Prefix - 0x0BF - contentArray[1] 
+ + Prefix + 0x0BF + contentArray[1] trees - - contentArray[3] + + contentArray[3] - tree->trees - - - s + tree->trees + + + s - tre->tree - - - e + tre->tree + + + e - tre->tree - - - e + tre->tree + + + e tri - - Chain - 0x05B + + Chain + 0x05B tr - - Sparse - 0x07E + + Sparse + 0x07E - tre->tr - - - 0x03B + tre->tr + + + 0x03B - - tre->tree2p - - - e + + tre->tree2p + + + e trie - - contentArray[2] + + contentArray[2] - tri->trie - - - e + tri->trie + + + e - tr->tra - - - a + tr->tra + + + a - tr->tre - - - e + tr->tre + + + e - tr->tre - - - e + tr->tre + + + e - tr->tri - - - i + tr->tri + + + i - tr->t - - - 0x07E + tr->t + + + 0x07E - t->root - - - 0x09B + t->root + + + 0x09B - t->tr - - - r + t->tr + + + r - t->tr - - - r + t->tr + + + r tree2 - - Chain - 0x0BB + + Chain + 0x0BB - tree2->tree2p - - - 0x0BB + tree2->tree2p + + + 0x0BB - tree2->trees - - - s + tree2->trees + + + s - tree2p->tre - - - 0x0BF + tree2p->tre + + + 0x0BF - - tree2p->tree2 - - - ε + + tree2p->tree2 + + + ε - trees->tree2 - - - ~3 + trees->tree2 + + + ~3 diff --git a/src/java/org/apache/cassandra/db/tries/IntersectionCursor.java b/src/java/org/apache/cassandra/db/tries/IntersectionCursor.java new file mode 100644 index 000000000000..0b9b8b9c4efd --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/IntersectionCursor.java @@ -0,0 +1,298 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +/// The implementation of the intersection of a trie with a set. Intersections normally return all content that is +/// present on any trie position that the set lists, regardless if the specific position falls inside the set -- this +/// is done to make sure that metadata relevant to the selection is preserved. +/// +/// For ordered tries where we may want the intersection to return only content that falls strictly within the bounds +/// of the trie, use [Slice]. +abstract class IntersectionCursor> implements Cursor +{ + enum State + { + /// Source and set cursors are at the same position. + MATCHING, + /// The set cursor is ahead; the current position, as well as any before the set cursor's are inside the set. + SET_AHEAD + } + + final C source; + final TrieSetCursor set; + State state; + + IntersectionCursor(C source, TrieSetCursor set) + { + this.source = source; + this.set = set; + setInitialState(); + } + + @Override + public long encodedPosition() + { + return source.encodedPosition(); + } + + @Override + public long advance() + { + if (state == State.SET_AHEAD) + return advanceInCoveredBranch(set.encodedPosition(), source.advance()); + + return advanceWhenMatching(); + } + + @Override + public long advanceMultiple(Cursor.TransitionsReceiver receiver) + { + // We can only apply advanceMultiple if we are fully inside a covered branch. 
+ if (state == State.SET_AHEAD) + return advanceInCoveredBranch(set.encodedPosition(), source.advanceMultiple(receiver)); + + return advanceWhenMatching(); + } + + private long advanceWhenMatching() + { + // The set is assumed sparser, so we advance that first. + long setPosition = set.advance(); + if (set.precedingIncluded()) + return advanceInCoveredBranch(setPosition, source.advance()); + else + return advanceSourceToIntersection(setPosition); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + if (state == State.SET_AHEAD) + return advanceInCoveredBranch(set.encodedPosition(), source.skipTo(encodedSkipPosition)); + + long setPosition = set.skipTo(encodedSkipPosition); + if (set.precedingIncluded()) + return advanceInCoveredBranch(setPosition, source.skipTo(encodedSkipPosition)); + else + return advanceSourceToIntersection(setPosition); + } + + private long advanceInCoveredBranch(long setPosition, long sourcePosition) + { + // Check if the advanced source is still in the covered area. + long cmp = Cursor.compare(sourcePosition, setPosition); + if (cmp < 0) // source is strictly before set position + return coveredAreaWithSetAhead(sourcePosition); + if (Cursor.isExhausted(sourcePosition)) + return exhausted(sourcePosition); + + if (cmp == 0) + return matchingPosition(sourcePosition); + + // Source moved beyond the set position. Advance the set too. + setPosition = set.skipTo(sourcePosition); + if (Cursor.compare(setPosition, sourcePosition) == 0) + return matchingPosition(sourcePosition); + + // At this point set is ahead. Check content to see if we are in a covered branch. + // If not, we need to skip the source as well and repeat the process. + if (set.precedingIncluded()) + return coveredAreaWithSetAhead(sourcePosition); + else + return advanceSourceToIntersection(setPosition); + } + + private long advanceSourceToIntersection(long setPosition) + { + while (true) + { + // Set is ahead of source, but outside the covered area. 
Skip source to the set's position. + long sourcePosition = source.skipTo(setPosition); + if (Cursor.isExhausted(sourcePosition)) + return exhausted(sourcePosition); + if (Cursor.compare(setPosition, sourcePosition) == 0) + return matchingPosition(sourcePosition); + + // Source is now ahead of the set. + setPosition = set.skipTo(sourcePosition); + if (Cursor.compare(setPosition, sourcePosition) == 0) + return matchingPosition(sourcePosition); + + // At this point set is ahead. Check content to see if we are in a covered branch. + if (set.precedingIncluded()) + return coveredAreaWithSetAhead(sourcePosition); + } + } + + private long coveredAreaWithSetAhead(long encodedPosition) + { + state = State.SET_AHEAD; + return encodedPosition; + } + + long matchingPosition(long encodedPosition) + { + state = State.MATCHING; + return encodedPosition; + } + + void setInitialState() + { + matchingPosition(encodedPosition()); + } + + private long exhausted(long position) + { + state = State.MATCHING; + return position; + } + + @Override + public T content() + { + return source.content(); + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + return source.byteComparableVersion(); + } + + /// A variation of the intersection cursor that only returns content when it falls strictly inside the boundaries + /// of the set. + abstract static class Slice> extends IntersectionCursor + { + Slice(C source, TrieSetCursor set) + { + super(source, set); + } + + @Override + public T content() + { + switch (state) + { + case SET_AHEAD: + return source.content(); + case MATCHING: + // Slice bounds fall on the same positions as ordered content. The right side of the state, + // regardless of the direction of iteration, determines coverage for the specific position. + return set.state().applicableAfter ? source.content() : null; + default: + throw new AssertionError(); + } + } + } + + /// Intersection cursor for [Trie]. 
+ static class Plain extends IntersectionCursor> + { + public Plain(Cursor source, TrieSetCursor set) + { + super(source, set); + } + + @Override + public Cursor tailCursor(Direction direction) + { + switch (state) + { + case MATCHING: + return new Plain<>(source.tailCursor(direction), set.tailCursor(direction)); + case SET_AHEAD: + return source.tailCursor(direction); + default: + throw new AssertionError(); + } + } + } + + /// Slice cursor for [Trie]. + static class PlainSlice extends Slice> + { + public PlainSlice(Cursor source, TrieSetCursor set) + { + super(source, set); + } + + @Override + public Cursor tailCursor(Direction direction) + { + switch (state) + { + case MATCHING: + return new PlainSlice<>(source.tailCursor(direction), set.tailCursor(direction)); + case SET_AHEAD: + return source.tailCursor(direction); + default: + throw new AssertionError(); + } + } + } + + static class DeletionAware> + extends IntersectionCursor> + implements DeletionAwareCursor + { + RangeCursor applicableDeletionBranch; + + public DeletionAware(DeletionAwareCursor source, TrieSetCursor set) + { + super(source, set); + applicableDeletionBranch = null; + } + + @Override + public DeletionAwareCursor tailCursor(Direction direction) + { + switch (state) + { + case MATCHING: + return new DeletionAware<>(source.tailCursor(direction), set.tailCursor(direction)); + case SET_AHEAD: + return source.tailCursor(direction); + default: + throw new AssertionError(); + } + } + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + RangeCursor deletions = source.deletionBranchCursor(direction); + if (deletions == null) + return null; + + switch (state) + { + case SET_AHEAD: + // Since the deletion branch cannot extend outside this branch, it is fully covered by the set. 
+ return deletions; + case MATCHING: + return new RangeIntersectionCursor<>(deletions, + set.tailCursor(direction)); + default: + throw new AssertionError(); + } + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/MemoryAllocationStrategy.java b/src/java/org/apache/cassandra/db/tries/MemoryAllocationStrategy.java index c34942097437..a2aef97fcb65 100644 --- a/src/java/org/apache/cassandra/db/tries/MemoryAllocationStrategy.java +++ b/src/java/org/apache/cassandra/db/tries/MemoryAllocationStrategy.java @@ -23,56 +23,42 @@ import org.agrona.collections.IntArrayList; import org.apache.cassandra.utils.concurrent.OpOrder; -/** - * Allocation strategy for buffers and arrays for InMemoryTrie's. Controls how space is allocated and reused. - */ +/// Allocation strategy for buffers and arrays for `InMemoryTrie`s. Controls how space is allocated and reused. public interface MemoryAllocationStrategy { - /** - * Get a free index. This is either a new index, allocated via the passed index producer functions, or one that - * has been previously recycled. - */ + /// Get a free index. This is either a new index, allocated via the passed index producer functions, or one that + /// has been previously recycled. int allocate() throws TrieSpaceExhaustedException; - /** - * Marks the given index for recycling. - * - * When the index is actually reused depends on the recycling strategy. In any case it cannot be before the current - * mutation is complete (because it may still be walking cells that have been moved), and any concurrent readers - * that have started before this cell has become unreachable must also have completed. - */ + /// Marks the given index for recycling. + /// + /// When the index is actually reused depends on the recycling strategy. 
In any case it cannot be before the current + /// mutation is complete (because it may still be walking cells that have been moved), and any concurrent readers + /// that have started before this cell has become unreachable must also have completed. void recycle(int index); - /** - * To be called when a mutation completes. No new readers must be able to see recycled content at the time of this - * call (the paths for reaching them must have been overwritten via a volatile write; additionally, if the buffer - * has grown, the root variable (which is stored outside the buffer) must have accepted a volatile write). - * No recycled indexes can be made available for reuse before this is called, and before any readers started before - * this call have completed. - */ + /// To be called when a mutation completes. No new readers must be able to see recycled content at the time of this + /// call (the paths for reaching them must have been overwritten via a volatile write; additionally, if the buffer + /// has grown, the root variable (which is stored outside the buffer) must have accepted a volatile write). + /// No recycled indexes can be made available for reuse before this is called, and before any readers started before + /// this call have completed. void completeMutation(); - /** - * Called when a mutation is aborted because of an exception. This means that the indexes that were marked for - * recycling are still going to be in use (unless this is called a later separate completeMutation call may release - * and reuse them, causing corruption). - * - * Aborted mutations are not normal, and at this time we are not trying to ensure that a trie will behave at its - * best if an abort has taken place (i.e. it may take more space, be slower etc.), but it should still operate - * correctly. - */ + /// Called when a mutation is aborted because of an exception. 
This means that the indexes that were marked for + /// recycling are still going to be in use (unless this is called, a later separate `completeMutation` call may + /// release and reuse them, causing corruption). + /// + /// Aborted mutations are not normal, and at this time we are not trying to ensure that a trie will behave at its + /// best if an abort has taken place (i.e. it may take more space, be slower etc.), but it should still operate + /// correctly. void abortMutation(); - /** - * Returns the number of indexes that have been claimed by the allocation strategy but are not currently in use - * (either because they are in various stages of recycling, or have yet to see first use). - */ + /// Returns the number of indexes that have been claimed by the allocation strategy but are not currently in use + /// (either because they are in various stages of recycling, or have yet to see first use). long indexCountInPipeline(); - /** - * Constructs a list of all the indexes that are in the recycling pipeline. - * Used to test available and unreachable indexes are the same thing. - */ + /// Constructs a list of all the indexes that are in the recycling pipeline. + /// Used to test available and unreachable indexes are the same thing. @VisibleForTesting IntArrayList indexesInPipeline(); @@ -87,9 +73,7 @@ default void allocate(int[] indexList) throws TrieSpaceExhaustedException } } - /** - * Strategy for small short-lived tries, usually on-heap. This strategy does not reuse any indexes. - */ + /// Strategy for small short-lived tries, usually on-heap. This strategy does not reuse any indexes. class NoReuseStrategy implements MemoryAllocationStrategy { final Allocator allocator; @@ -133,69 +117,59 @@ public IntArrayList indexesInPipeline() } } - /** - * Reuse strategy for large, long-lived tries. 
Recycles indexes when it knows that the mutation recycling - * them has completed, and all reads started no later than this completion have also completed (signalled by an - * OpOrder which the strategy assumes all readers subscribe to). - * - * The OpOrder recycling strategy holds queues of indexes available for recycling. The queues ar organized in blocks - * of REUSE_BLOCK_SIZE entries. The blocks move through the following stages: - * - Being filled with newly released indexes. In this stage they are at the head of the "justReleased" list. When - * a block becomes full, a new block is created and attached to the head of the list. - * - Full, but the mutation that released one or more of the mutations in them has not yet completed. In this stage - * they are attached to the "justReleased" list as the second or further block. When a mutationComplete is - * received, all such blocks get issued a common OpOrder.Barrier and are attached to "awaitingBarrierTail" (which - * is the tail of the "free" list). - * - Awaiting a barrier. In this stage they are in the "free" list after its head, closer to its - * "awaitingBarrierTail", identified by the fact that their barrier has not yet expired. Note that the blocks are - * put in the order in which their barriers are issued, thus if a block has an active barrier, all blocks that - * follow it in the list also do. - * - Ready for use. In this stage they are still in the "free" list after its head, but their barrier has now - * expired. All the indexes in such blocks can now be reused, and will be when the head of the list is exhausted. - * - Active free block at the head of the "free" list. This block is the one new allocations are served from. When - * it is exhausted, we check if the next block's barrier has expired. If so, the "free" pointer moves to it. - * If not, there's nothing to reuse as any blocks in the list still have an active barrier, thus we grab some new - * memory and refill the block. 
- * - If a mutation is aborted by an error, we throw away all indexes in the "justReleased" list. This is done so - * that none of the indexes that were marked for release, but whose parent chain may have remained in place, - * making them reachable, are reused and corrupt the trie. This will leak some indexes (from earlier mutations in - * the block and/or ones whose parents have already been moved), but we prefer not to pay the cost of identifying - * the exact indexes that need to remain or be recycled. - * We assume that exceptions while mutating are not normal and should not happen, and thus a temporary leak (e.g. - * until the memtable is switched) is acceptable. Should this change (e.g. if a trie is used for the full lifetime - * of the process or longer and exceptions are expected as part of its function), we can implement a reachability - * walk to identify orphaned indexes and call it with some frequency after one or more exceptions have occured. - */ + /// Reuse strategy for large, long-lived tries. Recycles indexes when it knows that the mutation recycling + /// them has completed, and all reads started no later than this completion have also completed (signalled by an + /// `OpOrder` which the strategy assumes all readers subscribe to). + /// + /// The `OpOrder` recycling strategy holds queues of indexes available for recycling. The queues are organized in + /// blocks of [#REUSE_BLOCK_SIZE] entries. The blocks move through the following stages: + /// - Being filled with newly released indexes. In this stage they are at the head of the [#justReleased] list. When + /// a block becomes full, a new block is created and attached to the head of the list. + /// - Full, but the mutation that released one or more of the indexes in them has not yet completed. In this stage + /// they are attached to the [#justReleased] list as the second or further block.
When a `completeMutation` is + /// received, all such blocks get issued a common `OpOrder.Barrier` and are attached to [#awaitingBarrierTail] + /// (which is the tail of the [#free] list). + /// - Awaiting a barrier. In this stage they are in the [#free] list after its head, closer to its + /// [#awaitingBarrierTail], identified by the fact that their barrier has not yet expired. Note that the blocks are + /// put in the order in which their barriers are issued, thus if a block has an active barrier, all blocks that + /// follow it in the list also do. + /// - Ready for use. In this stage they are still in the [#free] list after its head, but their barrier has now + /// expired. All the indexes in such blocks can now be reused, and will be when the head of the list is exhausted. + /// - Active free block at the head of the [#free] list. This block is the one new allocations are served from. When + /// it is exhausted, we check if the next block's barrier has expired. If so, the [#free] pointer moves to it. + /// If not, there's nothing to reuse as any blocks in the list still have an active barrier, thus we grab some new + /// memory and refill the block. + /// - If a mutation is aborted by an error, we throw away all indexes in the [#justReleased] list. This is done so + /// that none of the indexes that were marked for release, but whose parent chain may have remained in place, + /// making them reachable, are reused and corrupt the trie. This will leak some indexes (from earlier mutations in + /// the block and/or ones whose parents have already been moved), but we prefer not to pay the cost of identifying + /// the exact indexes that need to remain or be recycled. + /// We assume that exceptions while mutating are not normal and should not happen, and thus a temporary leak (e.g. + /// until the memtable is switched) is acceptable. Should this change (e.g.
if a trie is used for the full lifetime + /// of the process or longer and exceptions are expected as part of its function), we can implement a reachability + /// walk to identify orphaned indexes and call it with some frequency after one or more exceptions have occurred. static class OpOrderReuseStrategy implements MemoryAllocationStrategy { - /** - * Cells list holding indexes that are just recycled. When full, new one is allocated and linked. - * - * On mutationComplete, any full (in justReleased.nextList) lists get issued a barrier and are moved to - * awaitingBarrierTail. - */ + /// Cells list holding indexes that are just recycled. When full, new one is allocated and linked. + /// + /// On `completeMutation`, any full (in `justReleased.nextList`) lists get issued a barrier and are moved to + /// [#awaitingBarrierTail]. IndexBlockList justReleased; - /** - * Tail of the "free and awaiting barrier" queue. This is reachable by following the links from free. - * - * Full lists are attached to this tail when their barrier is issued. - * Lists are consumed from the head when free becomes empty if the list at the head has an expired barrier. - */ + /// Tail of the "free and awaiting barrier" queue. This is reachable by following the links from [#free]. + /// + /// Full lists are attached to this tail when their barrier is issued. + /// Lists are consumed from the head when [#free] becomes empty if the list at the head has an expired barrier. IndexBlockList awaitingBarrierTail; - /** - * Current free list, head of the "free and awaiting barrier" queue. Allocations are served from here. - * - * Starts full, and when it is exhausted we check the barrier at the next linked block. - * If expired, update free to point to it (consuming one block from the queue). - * If not, re-fill the block by allocating a new set of REUSE_BLOCK_SIZE indexes. - */ + /// Current free list, head of the "free and awaiting barrier" queue. Allocations are served from here.
+ /// + /// Starts full, and when it is exhausted we check the barrier at the next linked block. + /// If expired, update [#free] to point to it (consuming one block from the queue). + /// If not, re-fill the block by allocating a new set of [#REUSE_BLOCK_SIZE] indexes. IndexBlockList free; - /** - * Called to allocate a new block of indexes to distribute. - */ + /// Called to allocate a new block of indexes to distribute. final Allocator allocator; final OpOrder opOrder; diff --git a/src/java/org/apache/cassandra/db/tries/MemoryManager.java b/src/java/org/apache/cassandra/db/tries/MemoryManager.java new file mode 100644 index 000000000000..0ed2cb53f096 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/MemoryManager.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import com.google.common.annotations.VisibleForTesting; + +/// Base class for the buffer and content managers of in-memory tries. +public interface MemoryManager +{ + /// To be called when a mutation completes. 
No new readers must be able to see recycled content at the time of this + /// call (the paths for reaching them must have been overwritten via a volatile write; additionally, if the buffer + /// has grown, the root variable (which is stored outside the buffer) must have accepted a volatile write). + /// No recycled indexes can be made available for reuse before this is called, and before any readers started before + /// this call have completed. + void completeMutation(); + + /// Called when a mutation is aborted because of an exception. This means that the indexes that were marked for + /// recycling are still going to be in use (unless this is called, a later separate `completeMutation` call may + /// release and reuse them, causing corruption). + /// + /// Aborted mutations are not normal, and at this time we are not trying to ensure that a trie will behave at its + /// best if an abort has taken place (i.e. it may take more space, be slower etc.), but it should still operate + /// correctly. + void abortMutation(); + + /// Returns the off heap size of the memtable trie itself, not counting any space taken by referenced content, or + /// any space that has been allocated but is not currently in use (e.g. recycled cells or preallocated buffer). + /// The latter means we are undercounting the actual usage, but the purpose of this reporting is to decide when + /// to flush out e.g. a memtable and if we include the unused space we would almost always end up flushing out + /// immediately after allocating a large buffer and not having a chance to use it. Counting only used space makes it + /// possible to flush out before making these large allocations. + long usedSizeOffHeap(); + + /// Returns the on heap size of the memtable trie itself, not counting any space taken by referenced content, or + /// any space that has been allocated but is not currently in use (e.g. recycled cells or preallocated buffer). 
+ /// The latter means we are undercounting the actual usage, but the purpose of this reporting is to decide when + /// to flush out e.g. a memtable and if we include the unused space we would almost always end up flushing out + /// immediately after allocating a large buffer and not having a chance to use it. Counting only used space makes it + /// possible to flush out before making these large allocations. + long usedSizeOnHeap(); + + /// Returns the amount of memory that has been allocated for various buffers but isn't currently in use. + /// The total on-heap space used by the trie is `usedSizeOnHeap() + unusedReservedOnHeapMemory()`. + @VisibleForTesting + long unusedReservedOnHeapMemory(); +} diff --git a/src/java/org/apache/cassandra/db/tries/MergeCursor.java b/src/java/org/apache/cassandra/db/tries/MergeCursor.java new file mode 100644 index 000000000000..e231b8ba0390 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/MergeCursor.java @@ -0,0 +1,619 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.function.BiFunction; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +/// A merged view of two trie cursors. 
+/// +/// This is accomplished by walking the two cursors in parallel; the merged cursor takes the position and features of the +/// smaller and advances with it; when the two cursors are equal, both are advanced. +/// +/// Crucial for the efficiency of this is the fact that when they are advanced like this, we can compare cursors' +/// positions by their `depth` descending and then `incomingTransition` ascending. +/// See [Trie.md](./Trie.md) for further details. +abstract class MergeCursor, U, D extends Cursor, R> implements Cursor +{ + final C c1; + final D c2; + + boolean atC1; + boolean atC2; + + MergeCursor(C c1, D c2) + { + c1.assertFresh(); + c2.assertFresh(); + this.c1 = c1; + this.c2 = c2; + atC1 = atC2 = true; + } + + @Override + public long advance() + { + return checkOrder(atC1 ? c1.advance() : c1.encodedPosition(), + atC2 ? c2.advance() : c2.encodedPosition()); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + return checkOrder(atC1 ? c1.skipTo(encodedSkipPosition) : c1.skipToWhenAhead(encodedSkipPosition), + atC2 ? c2.skipTo(encodedSkipPosition) : c2.skipToWhenAhead(encodedSkipPosition)); + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + // While we are on a shared position, we must descend one byte at a time to maintain the cursor ordering. + if (atC1 && atC2) + return checkOrder(c1.advance(), c2.advance()); + + // If we are in a branch that's only covered by one of the sources, we can use its advanceMultiple as it is + // only different from advance if it takes multiple steps down, which does not change the order of the + // cursors. + // Since it might ascend, we still have to check the order after the call. 
+ if (atC1) + return checkOrder(c1.advanceMultiple(receiver), c2.encodedPosition()); + else // atC2 + return checkOrder(c1.encodedPosition(), c2.advanceMultiple(receiver)); + } + + long checkOrder(long c1pos, long c2pos) + { + long cmp = Cursor.compare(c1pos, c2pos); + atC1 = cmp <= 0; + atC2 = cmp >= 0; + return atC1 ? c1pos : c2pos; + } + + @Override + public long encodedPosition() + { + return atC1 ? c1.encodedPosition() : c2.encodedPosition(); + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + assert c1.byteComparableVersion() == c2.byteComparableVersion() : + "Merging cursors with different byteComparableVersions: " + + c1.byteComparableVersion() + " vs " + c2.byteComparableVersion(); + return c1.byteComparableVersion(); + } + + /// Merge implementation for [Trie] + static class Plain extends MergeCursor, T, Cursor, T> + { + private final Trie.MergeResolver resolver; + + Plain(Trie.MergeResolver resolver, Cursor c1, Cursor c2) + { + super(c1, c2); + this.resolver = resolver; + } + + @Override + public T content() + { + T mc = atC2 ? c2.content() : null; + T nc = atC1 ? c1.content() : null; + if (mc == null) + return nc; + else if (nc == null) + return mc; + else + return resolver.resolve(nc, mc); + } + + @Override + public Cursor tailCursor(Direction direction) + { + if (atC1 && atC2) + return new Plain<>(resolver, c1.tailCursor(direction), c2.tailCursor(direction)); + else if (atC1) + return c1.tailCursor(direction); + else if (atC2) + return c2.tailCursor(direction); + else + throw new AssertionError(); + } + } + + /// Mapping version of the merge for [Trie], admitting different cursor types and applying a transformation over all + /// content. Unlike the non-mapping version, this has to wrap tail cursors with only one source to be able to apply + /// the transformation. 
+ static class PlainMapping extends MergeCursor, U, Cursor, R> + { + private final BiFunction resolver; + + PlainMapping(BiFunction resolver, Cursor c1, Cursor c2) + { + super(c1, c2); + this.resolver = resolver; + } + + @Override + public R content() + { + U mc = atC2 ? c2.content() : null; + T nc = atC1 ? c1.content() : null; + if (mc == null && nc == null) + return null; + return resolver.apply(nc, mc); + } + + @Override + public Cursor tailCursor(Direction direction) + { + return new PlainMapping<>(resolver, + atC1 ? c1.tailCursor(direction) : new Cursor.Empty<>(direction, c1.byteComparableVersion()), + atC2 ? c2.tailCursor(direction) : new Cursor.Empty<>(direction, c2.byteComparableVersion())); + } + } + + + /// Base class for range merges (mapping and non-mapping). + static abstract class RangeBase, C extends RangeCursor, T extends RangeState, D extends RangeCursor, U extends RangeState> + extends MergeCursor implements RangeCursor + { + private U state; + boolean stateCollected; + + RangeBase(C c1, D c2) + { + super(c1, c2); + } + + abstract U collectState(); + + @Override + public U state() + { + if (!stateCollected) + { + state = collectState(); + stateCollected = true; + } + return state; + } + + @Override + public long advance() + { + stateCollected = false; + return super.advance(); + } + + @Override + public long skipTo(long encodedSkipTransition) + { + stateCollected = false; + return super.skipTo(encodedSkipTransition); + } + + @Override + public long advanceMultiple(Cursor.TransitionsReceiver receiver) + { + stateCollected = false; + return super.advanceMultiple(receiver); + } + } + + + /// Merge implementation for [RangeTrie] + static class Range> extends RangeBase, S, RangeCursor, S> + { + private final Trie.MergeResolver resolver; + + Range(Trie.MergeResolver resolver, RangeCursor c1, RangeCursor c2) + { + super(c1, c2); + this.resolver = resolver; + } + + @Override + public S collectState() + { + S state1 = atC1 ? 
c1.state() : c1.precedingState(); + S state2 = atC2 ? c2.state() : c2.precedingState(); + if (state1 == null) + return state2; + else if (state2 == null) + return state1; + else + return resolver.resolve(state1, state2); + } + + @Override + public RangeCursor tailCursor(Direction direction) + { + if (atC1 && atC2) + return new Range<>(resolver, c1.tailCursor(direction), c2.tailCursor(direction)); + else if (atC1) + return makeMerge(resolver, c1.tailCursor(direction), c2.precedingStateCursor(direction)); + else if (atC2) + return makeMerge(resolver, c1.precedingStateCursor(direction), c2.tailCursor(direction)); + else + throw new AssertionError(); + } + + private static > RangeCursor makeMerge(Trie.MergeResolver resolver, RangeCursor c1, RangeCursor c2) + { + if (c1 == null) + return c2; + if (c2 == null) + return c1; + return new Range<>(resolver, c1, c2); + } + } + + /// Mapping version of the merge for [RangeTrie], admitting different cursor types and applying a transformation + /// over all states. Unlike the non-mapping version, this has to wrap tail cursors with only one source to be able + /// to apply the transformation. + static class RangeMapping, T extends RangeState, R extends RangeState> + extends RangeBase, T, RangeCursor, R> + { + private final BiFunction resolver; + + RangeMapping(BiFunction resolver, RangeCursor c1, RangeCursor c2) + { + super(c1, c2); + this.resolver = resolver; + } + + @Override + public R collectState() + { + S state1 = atC1 ? c1.state() : c1.precedingState(); + T state2 = atC2 ? c2.state() : c2.precedingState(); + return (state1 == null && state2 == null) ? null : resolver.apply(state1, state2); + } + + @Override + public RangeMapping tailCursor(Direction direction) + { + return makeMerge(resolver, + atC1 ? c1.tailCursor(direction) : c1.precedingStateCursor(direction), + atC2 ? 
c2.tailCursor(direction) : c2.precedingStateCursor(direction)); + } + + private static , T extends RangeState, R extends RangeState> + RangeMapping makeMerge(BiFunction resolver, RangeCursor c1, RangeCursor c2) + { + if (c1 == null) + c1 = RangeCursor.empty(c2.direction(), c2.byteComparableVersion()); + if (c2 == null) + c2 = RangeCursor.empty(c1.direction(), c1.byteComparableVersion()); + return new RangeMapping<>(resolver, c1, c2); + } + } + + /// Deletion-aware merge cursor that efficiently merges two deletion-aware tries. + /// This cursor handles the complex task of merging both live data and deletion metadata + /// from two deletion-aware sources. It supports an important optimization via the + /// `deletionsAtFixedPoints` flag. + static abstract class DeletionAwareBase, S, E extends RangeState, R, Q extends RangeState> + extends MergeCursor, S, DeletionAwareMergeSource, R> implements DeletionAwareCursor + { + /// Tracks the depth at which deletion branches were introduced to avoid redundant processing. + /// Set to -1 when no deletion branches are active. + int deletionBranchDepth = -1; + + /// @see DeletionAwareTrie.MergeResolver#deletionsAtFixedPoints + final boolean deletionsAtFixedPoints; + + /// Creates a deletion-aware merge cursor with configurable deletion optimization. 
+ /// + /// @param c1 first deletion-aware cursor + /// @param c2 second deletion-aware cursor + /// @param deletionsAtFixedPoints See [DeletionAwareTrie.MergeResolver#deletionsAtFixedPoints] + DeletionAwareBase(DeletionAwareMergeSource c1, + DeletionAwareMergeSource c2, + boolean deletionsAtFixedPoints) + { + super(c1, c2); + this.deletionsAtFixedPoints = deletionsAtFixedPoints; + // descendants must call maybeAddDeletionsBranch(c1.encodedPosition) + } + + @Override + public long advance() + { + return maybeAddDeletionsBranch(super.advance()); + } + + @Override + public long skipTo(long encodedSkipTransition) + { + return maybeAddDeletionsBranch(super.skipTo(encodedSkipTransition)); + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + return maybeAddDeletionsBranch(super.advanceMultiple(receiver)); + } + + long maybeAddDeletionsBranch(long encodedPosition) + { + if (Cursor.depth(encodedPosition) <= deletionBranchDepth) // ascending above common deletions root + { + deletionBranchDepth = -1; + assert !c1.hasDeletions(); + assert !c2.hasDeletions(); + } + + if (atC1 && atC2 && // otherwise even if there is deletion, the other cursor is ahead of it and can't be affected + (!deletionsAtFixedPoints || deletionBranchDepth == -1)) // if we already found one, don't check the other source for branches below it + { + maybeAddDeletionsBranch(c1, c2); + maybeAddDeletionsBranch(c2, c1); + } + return encodedPosition; + } + + /// Attempts to add deletion branches from one source to another. + /// This method implements the core deletion merging logic. When `deletionsAtFixedPoints` + /// is true, it can skip expensive operations because we know deletion branches are + /// mutually exclusive between sources. 
+ /// + /// @param tgt target merge source that may receive deletions + /// @param src source merge source that may provide deletions + static , S, E extends RangeState> + void maybeAddDeletionsBranch(DeletionAwareMergeSource tgt, + DeletionAwareMergeSource src) + { + // If tgt already has deletions applied, no need to add more (we cannot have a deletion branch covering + // another deletion branch). + if (tgt.hasDeletions()) + return; + + RangeCursor deletionsBranch = src.deletionBranchCursor(src.direction()); + if (deletionsBranch != null) + tgt.addDeletions(deletionsBranch); // apply all src deletions to tgt + } + + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + int depth = Cursor.depth(encodedPosition()); + if (deletionBranchDepth != -1 && depth > deletionBranchDepth) + return null; // already covered by a deletion branch, if there is any here it will be reflected in that + + // if one of the two cursors is ahead, it can't affect this deletion branch + if (!atC1) + return maybeSetDeletionsDepth(makeRangeCursor(null, c2.deletionBranchCursor(direction)), depth); + if (!atC2) + return maybeSetDeletionsDepth(makeRangeCursor(c1.deletionBranchCursor(direction), null), depth); + + // We are positioned at a common branch. If one has a deletion branch, we must combine it with the + // deletion-tree branch of the other to make sure that we merge any higher-depth deletion branch with it. + RangeCursor b1 = c1.deletionBranchCursor(direction); + RangeCursor b2 = c2.deletionBranchCursor(direction); + if (b1 == null && b2 == null) + return null; + + deletionBranchDepth = depth; + + // OPTIMIZATION: When deletionsAtFixedPoints=true, we know that both sources would + // have deletions at the same depth, i.e. if one source has a deletion + // branch at this position, the other cannot have any deletion branches below this + // point. We can thus avoid reproducing the data trie in the deletion branch. 
+ if (deletionsAtFixedPoints) + { + // With the optimization, we can directly return the existing deletion branch + // without needing to create expensive DeletionsTrieCursor instances + return makeRangeCursor(b1, b2); + } + else + { + // Safe path: create DeletionsTrieCursor for missing deletion branches + // This ensures we capture any deletion branches that might exist deeper + // in the trie structure, but is expensive for large tries because we have + // to list the whole data trie (minus content). + if (b1 == null) + b1 = new DeletionAwareCursor.DeletionsTrieCursor<>(c1.data.tailCursor(direction)); + if (b2 == null) + b2 = new DeletionAwareCursor.DeletionsTrieCursor<>(c2.data.tailCursor(direction)); + + return makeRangeCursor(b1, b2); + } + } + + abstract RangeCursor makeRangeCursor(RangeCursor c1, RangeCursor c2); + + private > RangeCursor maybeSetDeletionsDepth(RangeCursor deletionBranchCursor, int depth) + { + if (deletionBranchCursor != null) + deletionBranchDepth = depth; + return deletionBranchCursor; + } + } + + /// Merge cursor for [DeletionAwareTrie]. + /// + /// See the base class [DeletionAwareBase] for the intricacies of the implementation. + static class DeletionAware> + extends DeletionAwareBase + { + final Trie.MergeResolver mergeResolver; + final Trie.MergeResolver deletionResolver; + + /// Creates a deletion-aware merge cursor with configurable deletion optimization. 
+ /// + /// @param mergeResolver resolver for merging live data content + /// @param deletionResolver resolver for merging deletion metadata + /// @param deleter function to apply deletions to live data + /// @param c1 first deletion-aware cursor + /// @param c2 second deletion-aware cursor + /// @param deletionsAtFixedPoints See [DeletionAwareTrie.MergeResolver#deletionsAtFixedPoints] + DeletionAware(Trie.MergeResolver mergeResolver, + Trie.MergeResolver deletionResolver, + BiFunction deleter, + DeletionAwareCursor c1, + DeletionAwareCursor c2, + boolean deletionsAtFixedPoints) + { + this(mergeResolver, deletionResolver, + new DeletionAwareMergeSource<>(deleter, c1), + new DeletionAwareMergeSource<>(deleter, c2), + deletionsAtFixedPoints); + // We will add deletion sources to the above as we find them. + maybeAddDeletionsBranch(this.c1.encodedPosition()); + } + + DeletionAware(Trie.MergeResolver mergeResolver, + Trie.MergeResolver deletionResolver, + DeletionAwareMergeSource c1, + DeletionAwareMergeSource c2, + boolean deletionsAtFixedPoints) + { + super(c1, c2, deletionsAtFixedPoints); + this.mergeResolver = mergeResolver; + this.deletionResolver = deletionResolver; + } + + @Override + public T content() + { + T mc = atC2 ? c2.content() : null; + T nc = atC1 ? c1.content() : null; + if (mc == null) + return nc; + else if (nc == null) + return mc; + else + return mergeResolver.resolve(nc, mc); + } + + @Override + RangeCursor makeRangeCursor(RangeCursor c1, RangeCursor c2) + { + return (c1 != null) ? (c2 != null) ? new Range<>(deletionResolver, c1, c2) + : c1 + : (c2 != null) ? 
c2 + : null; + } + + @Override + public DeletionAwareCursor tailCursor(Direction direction) + { + if (atC1 && atC2) + return new DeletionAware<>(mergeResolver, + deletionResolver, + c1.tailCursor(direction), + c2.tailCursor(direction), + deletionsAtFixedPoints); + else if (atC1) + return c1.tailCursor(direction); + else if (atC2) + return c2.tailCursor(direction); + else + throw new AssertionError(); + } + } + + + /// Mapping version of the merge for [DeletionAwareTrie], admitting different cursor types and applying a + /// transformation over all states. Unlike the non-mapping version, this has to wrap tail cursors with only one + /// source to be able to apply the transformation. + /// + /// See the base class [DeletionAwareBase] for the intricacies of the implementation. + static class DeletionAwareMapping, S, E extends RangeState, R, Q extends RangeState> + extends DeletionAwareBase + { + final BiFunction mergeResolver; + final BiFunction deletionResolver; + + /// Creates a deletion-aware merge cursor with configurable deletion optimization. + /// + /// @param mergeResolver resolver for merging live data content + /// @param deletionResolver resolver for merging deletion metadata + /// @param deleter1 function to apply deletions to live data in c1 + /// @param deleter2 function to apply deletions to live data in c2 + /// @param c1 first deletion-aware cursor + /// @param c2 second deletion-aware cursor + /// @param deletionsAtFixedPoints See [DeletionAwareTrie.MergeResolver#deletionsAtFixedPoints] + DeletionAwareMapping(BiFunction mergeResolver, + BiFunction deletionResolver, + BiFunction deleter1, + BiFunction deleter2, + DeletionAwareCursor c1, + DeletionAwareCursor c2, + boolean deletionsAtFixedPoints) + { + this(mergeResolver, + deletionResolver, + new DeletionAwareMergeSource<>(deleter1, c1), + new DeletionAwareMergeSource<>(deleter2, c2), + deletionsAtFixedPoints); + // We will add deletion sources to the above as we find them. 
+ maybeAddDeletionsBranch(this.c1.encodedPosition()); + } + + DeletionAwareMapping(BiFunction mergeResolver, + BiFunction deletionResolver, + DeletionAwareMergeSource c1, + DeletionAwareMergeSource c2, + boolean deletionsAtFixedPoints) + { + super(c1, c2, deletionsAtFixedPoints); + this.mergeResolver = mergeResolver; + this.deletionResolver = deletionResolver; + } + + @Override + public R content() + { + S mc = atC2 ? c2.content() : null; + T nc = atC1 ? c1.content() : null; + if (mc == null && nc == null) + return null; + else + return mergeResolver.apply(nc, mc); + } + + @Override + RangeCursor makeRangeCursor(RangeCursor c1, RangeCursor c2) + { + if (c1 == null && c2 == null) + return null; + if (c1 == null) + c1 = RangeCursor.empty(c2.direction(), byteComparableVersion()); + else if (c2 == null) + c2 = RangeCursor.empty(c1.direction(), byteComparableVersion()); + return new RangeMapping<>(deletionResolver, c1, c2); + } + + @Override + public DeletionAwareCursor tailCursor(Direction direction) + { + return new DeletionAwareMapping<>(mergeResolver, + deletionResolver, + atC1 ? c1.tailCursor(direction) : DeletionAwareMergeSource.empty(direction, byteComparableVersion()), + atC2 ? c2.tailCursor(direction) : DeletionAwareMergeSource.empty(direction, byteComparableVersion()), + deletionsAtFixedPoints); + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/MergeTrie.java b/src/java/org/apache/cassandra/db/tries/MergeTrie.java deleted file mode 100644 index ffdfee4267e8..000000000000 --- a/src/java/org/apache/cassandra/db/tries/MergeTrie.java +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.db.tries; - -import com.google.common.collect.Iterables; - -import org.apache.cassandra.utils.bytecomparable.ByteComparable; - -/** - * A merged view of two tries. - * - * This is accomplished by walking the two cursors in parallel; the merged cursor takes the position and features of the - * smaller and advances with it; when the two cursors are equal, both are advanced. - * - * Crucial for the efficiency of this is the fact that when they are advanced like this, we can compare cursors' - * positions by their depth descending and then incomingTransition ascending. - * - * See Trie.md for further details. 
- */ -class MergeTrie extends Trie -{ - private final MergeResolver resolver; - protected final Trie t1; - protected final Trie t2; - - MergeTrie(MergeResolver resolver, Trie t1, Trie t2) - { - this.resolver = resolver; - this.t1 = t1; - this.t2 = t2; - } - - @Override - protected Cursor cursor(Direction direction) - { - return new MergeCursor<>(resolver, direction, t1, t2); - } - - static class MergeCursor implements Cursor - { - private final MergeResolver resolver; - private final Direction direction; - private final Cursor c1; - private final Cursor c2; - - boolean atC1; - boolean atC2; - - MergeCursor(MergeResolver resolver, Direction direction, Trie t1, Trie t2) - { - this.resolver = resolver; - this.direction = direction; - this.c1 = t1.cursor(direction); - this.c2 = t2.cursor(direction); - assert c1.depth() == 0; - assert c2.depth() == 0; - atC1 = atC2 = true; - } - - @Override - public int advance() - { - return checkOrder(atC1 ? c1.advance() : c1.depth(), - atC2 ? c2.advance() : c2.depth()); - } - - @Override - public int skipTo(int skipDepth, int skipTransition) - { - int c1depth = c1.depth(); - int c2depth = c2.depth(); - assert skipDepth <= c1depth + 1 || skipDepth <= c2depth + 1; - if (atC1 || skipDepth < c1depth || skipDepth == c1depth && direction.gt(skipTransition, c1.incomingTransition())) - c1depth = c1.skipTo(skipDepth, skipTransition); - if (atC2 || skipDepth < c2depth || skipDepth == c2depth && direction.gt(skipTransition, c2.incomingTransition())) - c2depth = c2.skipTo(skipDepth, skipTransition); - - return checkOrder(c1depth, c2depth); - } - - @Override - public int advanceMultiple(TransitionsReceiver receiver) - { - // While we are on a shared position, we must descend one byte at a time to maintain the cursor ordering. 
- if (atC1 && atC2) - return checkOrder(c1.advance(), c2.advance()); - - // If we are in a branch that's only covered by one of the sources, we can use its advanceMultiple as it is - // only different from advance if it takes multiple steps down, which does not change the order of the - // cursors. - // Since it might ascend, we still have to check the order after the call. - if (atC1) - return checkOrder(c1.advanceMultiple(receiver), c2.depth()); - else // atC2 - return checkOrder(c1.depth(), c2.advanceMultiple(receiver)); - } - - private int checkOrder(int c1depth, int c2depth) - { - if (c1depth > c2depth) - { - atC1 = true; - atC2 = false; - return c1depth; - } - if (c1depth < c2depth) - { - atC1 = false; - atC2 = true; - return c2depth; - } - // c1depth == c2depth - int c1trans = c1.incomingTransition(); - int c2trans = c2.incomingTransition(); - atC1 = direction.le(c1trans, c2trans); - atC2 = direction.le(c2trans, c1trans); - assert atC1 | atC2; - return c1depth; - } - - @Override - public int depth() - { - return atC1 ? c1.depth() : c2.depth(); - } - - @Override - public int incomingTransition() - { - return atC1 ? c1.incomingTransition() : c2.incomingTransition(); - } - - @Override - public Direction direction() - { - return direction; - } - - @Override - public ByteComparable.Version byteComparableVersion() - { - assert c1.byteComparableVersion() == c2.byteComparableVersion() : - "Merging cursors with different byteComparableVersions: " + - c1.byteComparableVersion() + " vs " + c2.byteComparableVersion(); - return c1.byteComparableVersion(); - } - - public T content() - { - T mc = atC2 ? c2.content() : null; - T nc = atC1 ? 
c1.content() : null; - if (mc == null) - return nc; - else if (nc == null) - return mc; - else - return resolver.resolve(nc, mc); - } - - @Override - public Trie tailTrie() - { - if (atC1 && atC2) - return new MergeTrie<>(resolver, c1.tailTrie(), c2.tailTrie()); - else if (atC1) - return c1.tailTrie(); - else if (atC2) - return c2.tailTrie(); - else - throw new AssertionError(); - } - } - - /** - * Special instance for sources that are guaranteed (by the caller) distinct. The main difference is that we can - * form unordered value list by concatenating sources. - */ - static class Distinct extends MergeTrie - { - Distinct(Trie input1, Trie input2) - { - super(throwingResolver(), input1, input2); - } - - @Override - public Iterable valuesUnordered() - { - return Iterables.concat(t1.valuesUnordered(), t2.valuesUnordered()); - } - } -} diff --git a/src/java/org/apache/cassandra/db/tries/PrefixedCursor.java b/src/java/org/apache/cassandra/db/tries/PrefixedCursor.java new file mode 100644 index 000000000000..bf54001bc34c --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/PrefixedCursor.java @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +/// Prefixed cursor. Prepends the given prefix to all keys of the supplied cursor. +abstract class PrefixedCursor> extends DepthAdjustedCursor +{ + ByteSource prefixBytes; + int nextPrefixByte; + long currentPosition; + + PrefixedCursor(ByteComparable prefix, C tail) + { + this(prefix.asComparableBytes(tail.byteComparableVersion()), tail); + } + + PrefixedCursor(ByteSource prefix, C tail) + { + this(prefix.next(), prefix, tail); + } + + PrefixedCursor(int firstPrefixByte, ByteSource prefix, C tail) + { + super(tail, 0); + prefixBytes = prefix; + nextPrefixByte = firstPrefixByte; + tail.assertFresh(); + setPositionAndCheckPrefixDone(tail.encodedPosition()); // initial position with the correct direction + } + + long completeAdvanceInTail(long position) + { + return currentPosition = position; + } + + boolean prefixDone() + { + return nextPrefixByte == ByteSource.END_OF_STREAM; + } + + @Override + public long encodedPosition() + { + return currentPosition; + } + + @Override + public long advance() + { + if (prefixDone()) + return completeAdvanceInTail(super.advance()); + + long nextPosition = Cursor.positionForDescentWithByte(currentPosition, nextPrefixByte); + nextPrefixByte = prefixBytes.next(); + return setPositionAndCheckPrefixDone(nextPosition); + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + if (prefixDone()) + return completeAdvanceInTail(super.advanceMultiple(receiver)); + + long pos = currentPosition; + int incomingTransition = nextPrefixByte; + nextPrefixByte = prefixBytes.next(); + + while (nextPrefixByte != ByteSource.END_OF_STREAM) + { + receiver.addPathByte(incomingTransition); + pos += DEPTH_ADJUSTMENT_ONE; + incomingTransition = nextPrefixByte; + nextPrefixByte = prefixBytes.next(); + } + // Note: It's tempting to do an advance in the tail 
too, but its root may contain content or other features + // that we can't skip over. + + return setPositionAndCheckPrefixDone(Cursor.positionForDescentWithByte(pos, incomingTransition)); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + if (prefixDone()) + return completeAdvanceInTail(super.skipTo(encodedSkipPosition)); + + long nextPosition = Cursor.positionForDescentWithByte(currentPosition, nextPrefixByte); + if (Cursor.compare(encodedSkipPosition, nextPosition) > 0) + return exhausted(); + assert Cursor.depth(encodedSkipPosition) == Cursor.depth(nextPosition) + : "Invalid advance request to " + Cursor.toString(encodedSkipPosition) + + " to cursor at " + Cursor.toString(currentPosition); + nextPrefixByte = prefixBytes.next(); + return setPositionAndCheckPrefixDone(nextPosition); + } + + private long setPositionAndCheckPrefixDone(long position) + { + if (nextPrefixByte == ByteSource.END_OF_STREAM) + setAttachmentPoint(position); + + currentPosition = position; + return position; + } + + private long exhausted() + { + currentPosition = Cursor.exhaustedPosition(currentPosition); + nextPrefixByte = 0; // make sure prefixDone is not engaged (we could return content or tail if it is) + return currentPosition; + } + + @Override + public T content() + { + return prefixDone() ?
source.content() : null; + } + + ByteSource.Duplicatable duplicateSource() + { + if (!(prefixBytes instanceof ByteSource.Duplicatable)) + prefixBytes = ByteSource.duplicatable(prefixBytes); + ByteSource.Duplicatable duplicatableSource = (ByteSource.Duplicatable) prefixBytes; + return duplicatableSource.duplicate(); + } + + static class Plain extends PrefixedCursor> implements Cursor + { + Plain(ByteComparable prefix, Cursor tail) + { + super(prefix, tail); + } + + Plain(int firstPrefixByte, ByteSource prefix, Cursor source) + { + super(firstPrefixByte, prefix, source); + } + + @Override + public Cursor tailCursor(Direction direction) + { + assert !Cursor.isExhausted(currentPosition) : "tailTrie called on exhausted cursor"; + + if (prefixDone()) + return source.tailCursor(direction); + else + return new PrefixedCursor.Plain<>(nextPrefixByte, duplicateSource(), source.tailCursor(direction)); + } + } + + static class Range> extends PrefixedCursor> implements RangeCursor + { + Range(ByteComparable prefix, RangeCursor tail) + { + super(prefix, tail); + } + + Range(int firstPrefixByte, ByteSource prefix, RangeCursor source) + { + super(firstPrefixByte, prefix, source); + } + + @Override + public S state() + { + if (prefixDone()) + return source.state(); + return null; + } + + @Override + public S precedingState() + { + if (prefixDone()) + return source.precedingState(); + return null; + } + + @Override + public RangeCursor tailCursor(Direction direction) + { + assert !Cursor.isExhausted(currentPosition) : "tailTrie called on exhausted cursor"; + + if (prefixDone()) + return source.tailCursor(direction); + else + return new PrefixedCursor.Range<>(nextPrefixByte, duplicateSource(), source.tailCursor(direction)); + } + } + + static class DeletionAware> + extends PrefixedCursor> implements DeletionAwareCursor + { + DeletionAware(ByteComparable prefix, DeletionAwareCursor tail) + { + super(prefix, tail); + } + + DeletionAware(int firstPrefixByte, ByteSource prefix, 
DeletionAwareCursor tail) + { + super(firstPrefixByte, prefix, tail); + } + + DeletionAware(DeletionAware copyFrom, Direction direction) + { + this(copyFrom.nextPrefixByte, copyFrom.duplicateSource(), copyFrom.source.tailCursor(direction)); + } + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + return prefixDone() ? source.deletionBranchCursor(direction) : null; + } + + @Override + public DeletionAwareCursor tailCursor(Direction direction) + { + assert !Cursor.isExhausted(currentPosition) : "tailTrie called on exhausted cursor"; + + if (prefixDone()) + return source.tailCursor(direction); + else + { + return new DeletionAware<>(this, direction); + } + } + } + + + static class DeletionAwareSeparately> + extends DeletionAware + { + final RangeCursor deletionBranch; + + DeletionAwareSeparately(ByteComparable prefix, DeletionAwareCursor contentBranch, RangeCursor deletionBranch) + { + super(prefix, contentBranch); + this.deletionBranch = deletionBranch != null ? new PrefixedCursor.Range<>(prefix, deletionBranch) : null; + } + + DeletionAwareSeparately(DeletionAwareSeparately copyFrom, Direction direction) + { + super(copyFrom, direction); + this.deletionBranch = copyFrom.deletionBranch; // no need to take tailCursor as we do that when we return it + } + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + return Cursor.isRootPosition(encodedPosition()) && deletionBranch != null + ? 
deletionBranch.tailCursor(direction) + : null; + } + + @Override + public DeletionAwareCursor tailCursor(Direction direction) + { + if (Cursor.isRootPosition(encodedPosition())) + return new DeletionAwareSeparately<>(this, direction); + else + return super.tailCursor(direction); + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/PrefixedTrie.java b/src/java/org/apache/cassandra/db/tries/PrefixedTrie.java deleted file mode 100644 index cf5f9dd63513..000000000000 --- a/src/java/org/apache/cassandra/db/tries/PrefixedTrie.java +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.cassandra.db.tries; - -import org.apache.cassandra.utils.bytecomparable.ByteComparable; -import org.apache.cassandra.utils.bytecomparable.ByteSource; - -/** - * Prefixed trie. Represents the content of the given trie with the prefix prepended to all keys. 
- */ -public class PrefixedTrie extends Trie -{ - final ByteComparable prefix; - final Trie trie; - - public PrefixedTrie(ByteComparable prefix, Trie trie) - { - this.prefix = prefix; - this.trie = trie; - } - - @Override - protected Trie.Cursor cursor(Direction direction) - { - Trie.Cursor sourceCursor = trie.cursor(direction); - return new Cursor<>(prefix.asComparableBytes(sourceCursor.byteComparableVersion()), sourceCursor); - } - - private static class Cursor implements Trie.Cursor - { - final Trie.Cursor tail; - ByteSource prefixBytes; - int nextPrefixByte; - int incomingTransition; - int depthOfPrefix; - - Cursor(ByteSource prefix, Trie.Cursor tail) - { - this.tail = tail; - prefixBytes = prefix; - incomingTransition = -1; - nextPrefixByte = prefixBytes.next(); - depthOfPrefix = 0; - } - - int completeAdvanceInTail(int depthInTail) - { - if (depthInTail < 0) - return exhausted(); - - incomingTransition = tail.incomingTransition(); - return depthInTail + depthOfPrefix; - } - - boolean prefixDone() - { - return nextPrefixByte == ByteSource.END_OF_STREAM; - } - - @Override - public int depth() - { - if (prefixDone()) - return tail.depth() + depthOfPrefix; - else - return depthOfPrefix; - } - - @Override - public int incomingTransition() - { - return incomingTransition; - } - - @Override - public int advance() - { - if (prefixDone()) - return completeAdvanceInTail(tail.advance()); - - ++depthOfPrefix; - incomingTransition = nextPrefixByte; - nextPrefixByte = prefixBytes.next(); - return depthOfPrefix; - } - - @Override - public int advanceMultiple(Trie.TransitionsReceiver receiver) - { - if (prefixDone()) - return completeAdvanceInTail(tail.advanceMultiple(receiver)); - - while (!prefixDone()) - { - receiver.addPathByte(incomingTransition); - ++depthOfPrefix; - incomingTransition = nextPrefixByte; - nextPrefixByte = prefixBytes.next(); - } - return depthOfPrefix; - } - - @Override - public int skipTo(int skipDepth, int skipTransition) - { - // regardless if we 
exhausted prefix, if caller asks for depth <= prefix depth, we're done. - if (skipDepth <= depthOfPrefix) - return exhausted(); - if (prefixDone()) - return completeAdvanceInTail(tail.skipTo(skipDepth - depthOfPrefix, skipTransition)); - assert skipDepth == depthOfPrefix + 1 : "Invalid advance request to depth " + skipDepth + " to cursor at depth " + depthOfPrefix; - if (tail.direction().gt(skipTransition, nextPrefixByte)) - return exhausted(); - return advance(); - } - - private int exhausted() - { - incomingTransition = -1; - depthOfPrefix = -1; - nextPrefixByte = 0; // to make prefixDone() false so incomingTransition/depth/content are -1/-1/null - return depthOfPrefix; - } - - public Direction direction() - { - return tail.direction(); - } - - public ByteComparable.Version byteComparableVersion() - { - return tail.byteComparableVersion(); - } - - @Override - public T content() - { - return prefixDone() ? tail.content() : null; - } - - @Override - public Trie tailTrie() - { - if (prefixDone()) - return tail.tailTrie(); - else - { - assert depthOfPrefix >= 0 : "tailTrie called on exhausted cursor"; - if (!(prefixBytes instanceof ByteSource.Duplicatable)) - prefixBytes = ByteSource.duplicatable(prefixBytes); - ByteSource.Duplicatable duplicatableSource = (ByteSource.Duplicatable) prefixBytes; - - return new PrefixedTrie<>(v -> duplicatableSource.duplicate(), tail.tailTrie()); - } - } - } -} diff --git a/src/java/org/apache/cassandra/db/tries/RangeApplyCursor.java b/src/java/org/apache/cassandra/db/tries/RangeApplyCursor.java new file mode 100644 index 000000000000..ea7612b7d7dd --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/RangeApplyCursor.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.function.BiFunction; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +/// A cursor applying a range to a data cursor. The cursor will present the content of the data trie modified by any +/// applicable/covering range of the range trie. +/// +/// This is very similar to a normal merge cursor but, because it only presents content from the data trie, it does not +/// need to walk the range trie unless it matches positions from the data cursor and thus skips the range cursor +/// whenever the data one ends up ahead. 
+class RangeApplyCursor> implements Cursor +{ + final BiFunction resolver; + final RangeCursor range; + final Cursor data; + + boolean atRange; + + RangeApplyCursor(BiFunction resolver, RangeCursor range, Cursor data) + { + this.resolver = resolver; + this.range = range; + this.data = data; + assert Cursor.compare(data.encodedPosition(), range.encodedPosition()) == 0; + atRange = true; + } + + @Override + public long encodedPosition() + { + return data.encodedPosition(); + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + assert range.byteComparableVersion() == data.byteComparableVersion() : + "Merging cursors with different byteComparableVersions: " + + range.byteComparableVersion() + " vs " + data.byteComparableVersion(); + return range.byteComparableVersion(); + } + + @Override + public long advance() + { + long dataPosition = data.advance(); + if (atRange) + return skipRangeToDataPosition(dataPosition); + else + return maybeSkipRange(dataPosition); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + long dataPosition = data.skipTo(encodedSkipPosition); + if (atRange) // if both cursors were at the same position, always advance the range cursor to catch up. + return skipRangeToDataPosition(dataPosition); + else // otherwise skip range to the new data position only if it advances past the range's current position. + return maybeSkipRange(dataPosition); + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + // While we are on a shared position, we must descend one byte at a time to maintain the cursor ordering. + if (atRange) + return skipRangeToDataPosition(data.advance()); + else // atData only + return maybeSkipRange(data.advanceMultiple(receiver)); + } + + long maybeSkipRange(long dataPosition) + { + long rangePosition = range.encodedPosition(); + long cmp = Cursor.compare(dataPosition, rangePosition); + // If data position is at or before the range position, we are good. 
+ if (cmp <= 0) + return setAtRangeAndReturnPosition(cmp == 0, dataPosition); + + // Range cursor is before data cursor. Skip it ahead so that we are positioned on data. + return skipRangeToDataPosition(dataPosition); + } + + private long skipRangeToDataPosition(long dataPosition) + { + long rangePosition = range.skipTo(dataPosition); + return setAtRangeAndReturnPosition(rangePosition == dataPosition, + dataPosition); + } + + private long setAtRangeAndReturnPosition(boolean atRange, long dataPosition) + { + this.atRange = atRange; + return dataPosition; + } + + @Override + public T content() + { + T content = data.content(); + if (content == null) + return null; + + S applicableRange = atRange ? range.content() : null; + + if (applicableRange == null) + { + if (Cursor.isExhausted(range.encodedPosition())) + return content; + + applicableRange = range.precedingState(); + if (applicableRange == null) + return content; + } + + return resolver.apply(applicableRange, content); + } + + @Override + public Cursor tailCursor(Direction direction) + { + if (atRange) + return new RangeApplyCursor<>(resolver, range.tailCursor(direction), data.tailCursor(direction)); + else + { + RangeCursor r = range.precedingStateCursor(direction); + return r == null ? 
data.tailCursor(direction) : new RangeApplyCursor<>(resolver, r, data.tailCursor(direction)); + } + } + + static class DeletionAwareDataBranch> extends RangeApplyCursor implements DeletionAwareCursor + { + DeletionAwareDataBranch(BiFunction resolver, RangeCursor range, Cursor data) + { + super(resolver, range, data); + } + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + return null; + } + + @Override + public DeletionAwareCursor tailCursor(Direction direction) + { + if (atRange) + { + return new DeletionAwareDataBranch<>(resolver, range.tailCursor(direction), data.tailCursor(direction)); + } + else + { + RangeCursor r = range.precedingStateCursor(direction); + if (r == null) + r = RangeCursor.empty(direction, byteComparableVersion()); + return new DeletionAwareDataBranch<>(resolver, r, data.tailCursor(direction)); + + } + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/RangeCursor.java b/src/java/org/apache/cassandra/db/tries/RangeCursor.java new file mode 100644 index 000000000000..72590eb86255 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/RangeCursor.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +/// The cursor implementation of [RangeTrie]. +/// +/// The main difference between normal and range cursors is the addition of a [#precedingState] method, which returns a +/// range for prefixes of start or end positions and is used to determine whether a position that has been skipped to +/// falls inside one of the trie's ranges. +/// +/// As an example, consider the following range trie: +/// +/// ``` +/// a -> +/// b -> start of del1 (left: null, right: del1) +/// c -> +/// d -> switch from del1 to del2 (left: del1, right: del2) +/// g -> +/// h -> end,del2 (left: del2, right: null) +/// ``` +/// +/// If we advance through this trie, it is easy to keep track of the covering deletion, as we walk through the +/// boundaries before entering the range. However, if we skip into the trie, we will not see the boundaries. +/// Imagine a cursor starts at the root and attempts to skip to "b". We need to be able to notice that "b" is covered by +/// the "ab-cd" range with deletion del1. This is achieved by using [#precedingState]. In this case skipping to "b" +/// (in forward direction) will position the cursor on "c"; because the positions to the left of "c" are covered by +/// del1, the [#precedingState] the cursor reports must be a covering state corresponding to `del1`. +/// +/// If, on the other hand, we perform a skip in the reverse direction that reaches the same state, the cursor should +/// report `null` as its state (e.g. performing a `skipTo` from the root with character "d" will land in "c" whose state +/// correctly determines that there is no covering deletion for "d"). +/// +/// For further details, see the range trie section in [Trie.md](./Trie.md). +interface RangeCursor> extends Cursor +{ + /// Returns a range that covers positions before this in iteration order, including this position if `content()` is + /// null. 
This is the range that is active at (i.e. covers) a position that was skipped to, when the range trie + /// jumps past the requested position or does not have content. + /// + /// The returned value must be a non-boundary state (i.e. `precedingState().isBoundary()` must always be `false`) + /// and must return itself for its `precedingState` in both directions. + default S precedingState() + { + final S state = state(); + if (state == null) + return null; + return state.precedingState(direction()); + } + + /// The range state at the current position. This is either a reportable marker (if the cursor is positioned at a + /// range boundary), or the covering state that applies to this position and the ones preceding it (up to the + /// closest range marker preceding the current position). This carries information for both [#content] and + /// [#precedingState] and is used to reduce the amount of work done to obtain both values. + /// + /// More precisely, `state` can be defined as + /// `state() :== content() != null ? content() : precedingState()`, + /// but it is often easier to implement `state` and let the other two be derived from it by the default + /// implementations. + /// + /// This can be null when no range is active before the current position. + S state(); + + /// Content is only returned for boundary positions. + /// Note that if `content()` is non-null, `precedingState()` does not apply to this exact position. + @Override + default S content() + { + final S state = state(); + if (state == null) + return null; + return state.isBoundary() ? state : null; + } + + @Override + RangeCursor tailCursor(Direction direction); + + /// Corresponding method to tailCursor above applicable when this cursor is ahead. + /// Returns a full-range cursor returning [#precedingState()], or `null` if [#precedingState()] is `null`.
+ default RangeCursor precedingStateCursor(Direction direction) + { + S precedingState = precedingState(); + if (precedingState == null) + return null; + + return new FromSet<>(RangesCursor.full(direction, byteComparableVersion()), precedingState); + } + + class Empty> extends Cursor.Empty implements RangeCursor + { + final S coveringState; + + public Empty(S coveringState, ByteComparable.Version version, Direction direction) + { + super(direction, version); + this.coveringState = coveringState; + } + + @Override + public S state() + { + return coveringState; + } + + @Override + public S content() + { + return null; + } + + @Override + public RangeCursor tailCursor(Direction direction) + { + return new RangeCursor.Empty<>(coveringState, byteComparableVersion(), direction); + } + } + + static > RangeCursor empty(Direction direction, ByteComparable.Version version) + { + return new Empty(null, version, direction); + } + + class FromSet> implements RangeCursor + { + final TrieSetCursor source; + final S marker; + + FromSet(TrieSetCursor source, S marker) + { + this.source = source; + this.marker = marker; + } + + @Override + public S state() + { + return source.state().applyToCoveringState(marker); + } + + @Override + public long encodedPosition() + { + return source.encodedPosition(); + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + return source.byteComparableVersion(); + } + + @Override + public long advance() + { + return source.advance(); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + return source.skipTo(encodedSkipPosition); + } + + @Override + public RangeCursor tailCursor(Direction direction) + { + return new FromSet<>(source.tailCursor(direction), marker); + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/RangeIntersectionCursor.java b/src/java/org/apache/cassandra/db/tries/RangeIntersectionCursor.java new file mode 100644 index 000000000000..00ff2733a20f --- /dev/null +++ 
b/src/java/org/apache/cassandra/db/tries/RangeIntersectionCursor.java @@ -0,0 +1,300 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +class RangeIntersectionCursor> implements RangeCursor +{ + enum State + { + MATCHING, + SET_AHEAD, + SOURCE_AHEAD + } + + final RangeCursor src; + final TrieSetCursor set; + long currentPosition; + S currentState; + State state; + + public RangeIntersectionCursor(RangeCursor src, TrieSetCursor set) + { + this.set = set; + this.src = src; + assert Cursor.compare(src.encodedPosition(), set.encodedPosition()) == 0; + matchingPosition(set.encodedPosition()); + } + + @Override + public long encodedPosition() + { + return currentPosition; + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + return set.byteComparableVersion(); + } + + @Override + public S state() + { + return currentState; + } + + @Override + public long advance() + { + switch(state) + { + case MATCHING: + { + long lposition = set.advance(); + if (set.precedingIncluded()) + return advanceWithSetAhead(src.advance()); + else + return advanceSourceToIntersection(lposition); + } + case 
SET_AHEAD: + return advanceWithSetAhead(src.advance()); + case SOURCE_AHEAD: + return advanceWithSourceAhead(set.advance()); + default: + throw new AssertionError(); + } + } + + @Override + public long skipTo(long skipPosition) + { + switch(state) + { + case MATCHING: + return skipBoth(skipPosition); + case SET_AHEAD: + { + // if the cursor ahead is at the skip point or beyond, we can advance the other cursor to the skip point + long setPosition = set.encodedPosition(); + if (Cursor.compare(skipPosition, setPosition) <= 0) + return advanceWithSetAhead(src.skipTo(skipPosition)); + // otherwise the skip point is beyond the cursor that is ahead, so both cursors must be skipped + return skipBoth(skipPosition); + } + case SOURCE_AHEAD: + { + // if the cursor ahead is at the skip point or beyond, we can advance the other cursor to the skip point + long sourcePosition = src.encodedPosition(); + if (Cursor.compare(skipPosition, sourcePosition) <= 0) + return advanceWithSourceAhead(set.skipTo(skipPosition)); + // otherwise the skip point is beyond the cursor that is ahead, so both cursors must be skipped + return skipBoth(skipPosition); + } + default: + throw new AssertionError(); + } + } + + private long skipBoth(long skipPosition) + { + long lposition = set.skipTo(skipPosition); + if (set.precedingIncluded()) + return advanceWithSetAhead(src.skipTo(skipPosition)); + else + return advanceSourceToIntersection(lposition); + } + + @Override + public long advanceMultiple(Cursor.TransitionsReceiver receiver) + { + switch(state) + { + case MATCHING: + { + // Cannot do multi-advance when cursors are at the same position. Applying advance().
+ long lposition = set.advance(); + if (set.precedingIncluded()) + return advanceWithSetAhead(src.advance()); + else + return advanceSourceToIntersection(lposition); + } + case SET_AHEAD: + return advanceWithSetAhead(src.advanceMultiple(receiver)); + case SOURCE_AHEAD: + return advanceWithSourceAhead(set.advanceMultiple(receiver)); + default: + throw new AssertionError(); + } + } + + private long advanceWithSetAhead(long sourcePosition) + { + long setPosition = set.encodedPosition(); + long cmp = Cursor.compare(sourcePosition, setPosition); + if (cmp < 0) + return coveredAreaWithSetAhead(sourcePosition); + if (cmp == 0) + return matchingPosition(sourcePosition); + + // Advancing cursor moved beyond the ahead cursor. Check if roles have reversed. + if (src.precedingState() != null) + return coveredAreaWithSourceAhead(setPosition); + else + return advanceSetToIntersection(sourcePosition); + } + + private long advanceWithSourceAhead(long setPosition) + { + long sourcePosition = src.encodedPosition(); + long cmp = Cursor.compare(setPosition, sourcePosition); + if (cmp < 0) + return coveredAreaWithSourceAhead(setPosition); + if (cmp == 0) + return matchingPosition(setPosition); + + // Advancing cursor moved beyond the ahead cursor. Check if roles have reversed. + if (set.precedingIncluded()) + return coveredAreaWithSetAhead(sourcePosition); + else + return advanceSourceToIntersection(setPosition); + } + + private long advanceSourceToIntersection(long setPosition) + { + while (true) + { + // Set is ahead of source, but outside the covered area. Skip source to set's position. + long sourcePosition = src.skipTo(setPosition); + if (Cursor.compare(sourcePosition, setPosition) == 0) + return matchingPosition(setPosition); + if (src.precedingState() != null) + return coveredAreaWithSourceAhead(setPosition); + + // Source is ahead of set, but outside the covered area. Skip set to source's position. 
+ setPosition = set.skipTo(sourcePosition); + if (Cursor.compare(setPosition, sourcePosition) == 0) + return matchingPosition(sourcePosition); + if (set.precedingIncluded()) + return coveredAreaWithSetAhead(sourcePosition); + } + } + + private long advanceSetToIntersection(long sourcePosition) + { + while (true) + { + // Source is ahead of set, but outside the covered area. Skip set to source's position. + long setPosition = set.skipTo(sourcePosition); + if (Cursor.compare(setPosition, sourcePosition) == 0) + return matchingPosition(sourcePosition); + if (set.precedingIncluded()) + return coveredAreaWithSetAhead(sourcePosition); + + // Set is ahead of source, but outside the covered area. Skip source to set's position. + sourcePosition = src.skipTo(setPosition); + if (Cursor.compare(setPosition, sourcePosition) == 0) + return matchingPosition(setPosition); + if (src.precedingState() != null) + return coveredAreaWithSourceAhead(setPosition); + } + } + + private long coveredAreaWithSetAhead(long position) + { + return setState(State.SET_AHEAD, position, src.state()); + } + + private long coveredAreaWithSourceAhead(long position) + { + return setState(State.SOURCE_AHEAD, position, restrict(src.precedingState(), set.state())); + } + + private long matchingPosition(long position) + { + return setState(State.MATCHING, position, restrict(src.state(), set.state())); + } + + private S restrict(S srcState, TrieSetCursor.RangeState setState) + { + if (srcState == null) + return null; + if (srcState.isBoundary()) + return srcState.restrict(setState.applicableBefore, setState.applicableAfter); + + return setState.applyToCoveringState(srcState); + } + + private long setState(State state, long position, S cursorState) + { + this.state = state; + this.currentPosition = position; + this.currentState = cursorState; + return position; + } + + @Override + public RangeCursor tailCursor(Direction direction) + { + switch (state) + { + case MATCHING: + return new 
RangeIntersectionCursor<>(src.tailCursor(direction), set.tailCursor(direction)); + case SET_AHEAD: + return src.tailCursor(direction); + case SOURCE_AHEAD: + return new RangeIntersectionCursor<>(src.precedingStateCursor(direction), set.tailCursor(direction)); + default: + throw new AssertionError(); + } + } + + static class TrieSet extends RangeIntersectionCursor implements TrieSetCursor + { + public TrieSet(TrieSetCursor src, TrieSetCursor set) + { + super(src, set); + } + + @Override + public RangeState state() + { + RangeState s = super.state(); + return s != null ? s : RangeState.NOT_CONTAINED; + } + + @Override + public TrieSetCursor tailCursor(Direction direction) + { + TrieSetCursor source = (TrieSetCursor) src; + switch (state) + { + case MATCHING: + return new TrieSet(source.tailCursor(direction), set.tailCursor(direction)); + case SET_AHEAD: + return source.tailCursor(direction); + case SOURCE_AHEAD: + return set.tailCursor(direction); + default: + throw new AssertionError(); + } + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/RangeState.java b/src/java/org/apache/cassandra/db/tries/RangeState.java new file mode 100644 index 000000000000..1ba5b7384a08 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/RangeState.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +/// A range state interface used for range tries. +/// +/// This interface combines two logical concepts: +/// - A range marker / boundary point, which is a point in the trie that either starts or ends a range, or switches +/// between ranges. Such markers are the content of a range trie and by themselves are sufficient to define and +/// recreate the trie. +/// +/// Markers return `true` for [#isBoundary()] and usually return different values for [#precedingState] in the two +/// directions (which can be `null` if no range applies). It is also possible for a marker to specify a point of +/// coverage, in which the preceding state is the same in both directions. +/// +/// - A covering range state, which describes the range that applies to an iteration position which is inside a covered +/// range. These are necessary to be able to efficiently jump inside range tries, for example when constructing the +/// intersection between a range trie and a set trie to answer a query. When a cursor skips to a position after a +/// point requested in a [Cursor#skipTo] call, the preceding state that the cursor returns is a covering state, +/// which describes the range that applies to (i.e. covers) the position the user requested. +/// +/// Covering states return `false` for [#isBoundary()] and must return themselves for [#precedingState] in both +/// directions. +/// +/// Using this combination instead of separate concepts simplifies and improves the performance of the implementation. +public interface RangeState> +{ + /// True if this is a boundary point. Boundary points are reported by `content()` and usually apply a different + /// state before and after the point (i.e. `precedingState(FORWARD) != precedingState(REVERSE)`). 
+ boolean isBoundary(); + + /// Returns the state that applies to the positions preceding this marker in the given iteration order, if any. + /// The iteration order given must match the iteration order of the cursor, as some states are generated on the fly. + /// + /// This must always be a covering state (i.e. [#isBoundary()] must be `false` and the forward and reverse + /// preceding states are equal to itself). + S precedingState(Direction direction); + + /// Returns the state that applies to the positions succeeding this marker in the given iteration order, if any. + /// The iteration order given must match the iteration order of the cursor, as some states are generated on the fly. + /// + /// This must always be a covering state (i.e. [#isBoundary()] must be `false` and the forward and reverse + /// preceding states are equal to itself). + S succedingState(Direction direction); + + /// Assuming this is a boundary, returns an intersected version of this state, which may drop parts of a marker that + /// are not covered by the intersecting range. + S restrict(boolean applicableBefore, boolean applicableAfter); + + /// Assuming this is a covering state, promote it to a boundary active in the specified direction. + S asBoundary(Direction direction); +} diff --git a/src/java/org/apache/cassandra/db/tries/RangeTrie.java b/src/java/org/apache/cassandra/db/tries/RangeTrie.java new file mode 100644 index 000000000000..f42b1c8403a4 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/RangeTrie.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.db.tries; + +import java.util.Collection; +import java.util.Iterator; +import java.util.Map; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Predicate; + +import com.google.common.base.Preconditions; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +/// Range trie. This type of trie represents information associated with ranges of keys. Its primary application is to +/// support deletions/tombstones. +/// +/// Range tries use [RangeState]s for their content, but they also report a [RangeCursor#state] for any prefix point, +/// so that skips inside the trie can figure out if the new position is covered by some of the trie's ranges. +/// +/// Note that some methods of this class accept a value mapper or resolver. It is the user's responsibility to ensure +/// that the application of these mappers preserves the properties of the range trie. +/// +/// See [RangeCursor] and [Trie.md](./Trie.md) for further details on the implementation of range tries. +public interface RangeTrie> extends BaseTrie, RangeTrie> +{ + /// Returns a range trie covering a branch. + /// + /// This performs the same process as intersecting a covered range by a set, converting the passed marker to the + /// proper state depending on the set's coverage and boundaries. To this end, the passed marker must be a covering + /// state (i.e. 
it must not be a boundary, and must have the same forward and reverse `precedingState`). + static > RangeTrie branch(ByteComparable key, ByteComparable.Version byteComparableVersion, S v) + { + return range(key, true, key, true, byteComparableVersion, v); + } + + /// Returns a range trie covering a single range with the given inclusivity flags for the end points and their + /// descendants. + /// + /// This performs the same process as intersecting a covered range by a set, converting the passed marker to the + /// proper state depending on the set's coverage and boundaries. To this end, the passed marker must be a covering + /// state (i.e. it must not be a boundary, and must have the same forward and reverse `precedingState`). + static > RangeTrie range(ByteComparable left, boolean leftInclusive, + ByteComparable right, boolean rightInclusive, + ByteComparable.Version byteComparableVersion, + S v) + { + return fromSet(TrieSet.range(byteComparableVersion, left, leftInclusive, right, rightInclusive), v); + } + + /// Returns a range trie covering the given set. This performs the same process as intersecting a covered + /// range by a set, converting the passed marker to the proper state depending on the set's coverage and boundaries. + /// To this end, the passed marker must be a covering state (i.e. it must not be boundary, and must return itself + /// as the forward and reverse `precedingState`). + static > RangeTrie fromSet(TrieSet set, S v) + { + Preconditions.checkArgument(!v.isBoundary()); + Preconditions.checkArgument(v.precedingState(Direction.FORWARD) == v); + Preconditions.checkArgument(v.succedingState(Direction.FORWARD) == v); + return dir -> new RangeCursor.FromSet<>(set.cursor(dir), v); + } + + /// Returns a singleton trie mapping the given byte path to a marker. + /// + /// Note: Ranges are meant to use boundaries that are distinct from data and thus a singleton range would list + /// only a boundary and always be empty in terms of covered content. 
This method is useful in cases where we want + /// to place other data in range tries (e.g. in tests), or if we want to help a force copy predicate decide when to + /// engage (with `beforeBranch = true`). + /// + /// @param beforeBranch Whether the marker should be listed before the descendant branch or after it. + static > RangeTrie point(ByteComparable key, ByteComparable.Version byteComparableVersion, boolean beforeBranch, S v) + { + Preconditions.checkArgument(v.isBoundary()); // make sure marker is returned for content() + Preconditions.checkArgument(v.precedingState(Direction.FORWARD) == null); + Preconditions.checkArgument(v.succedingState(Direction.FORWARD) == null); + return dir -> new SingletonOrderedCursor.Range<>(dir, + key.asPeekableBytes(byteComparableVersion), + byteComparableVersion, + dir.isForward() != beforeBranch, + v); + } + + /// Returns the state that applies to the given key. This is either the precise state at the given position, or + /// the range that covers it (i.e. the `precedingState` of the next marker). + default S applicableRange(ByteComparable key) + { + RangeCursor cursor = cursor(Direction.FORWARD); + final ByteSource bytes = key.asComparableBytes(cursor.byteComparableVersion()); + if (cursor.descendAlong(bytes)) + return cursor.state(); + else + return cursor.precedingState(); + } + + @Override + default RangeTrie intersect(TrieSet set) + { + return dir -> new RangeIntersectionCursor<>(cursor(dir), set.cursor(dir)); + } + + /// Constructs a view of the merge of this trie with the given one. The view is live, i.e. any write to any of the + /// sources will be reflected in the merged view. + /// + /// The resolver will be used to resolve the state of the resulting cursor positions whenever a non-null state + /// applies in both sources; this includes the case where one source is ahead but has a non-null preceding state. + /// (The resolver will not be called when only one source has a non-null applicable state.)
+ default RangeTrie mergeWith(RangeTrie other, Trie.MergeResolver resolver) + { + return dir -> new MergeCursor.Range<>(resolver, cursor(dir), other.cursor(dir)); + } + + /// Constructs a view of the merge of this trie with the given one, applying a transformation over all values. + /// The view is live, i.e. any write to any of the sources will be reflected in the merged view. + /// + /// The resolver will be called for any non-null applicable state in either of the two sources to transform it to the + /// output type, and one of its arguments will be null if the other source has no applicable state. + default , Q extends RangeState> + RangeTrie mappingMergeWith(RangeTrie other, + BiFunction resolver) + { + return dir -> new MergeCursor.RangeMapping<>(resolver, + cursor(dir), + other.cursor(dir)); + } + + /// Constructs a view of the merge of multiple tries. The view is live, i.e. any write to any of the + /// sources will be reflected in the merged view. + /// + /// If there is content for a given key in more than one source, the resolver will be called to obtain the + /// combination. It will also be called if one source has a boundary, while a covering range applies in at least one + /// other source. The resolver will not be called if there's content from only one source and no covering range + /// applies in any of the others. + static > RangeTrie merge(Collection> sources, Trie.CollectionMergeResolver resolver) + { + switch (sources.size()) + { + case 0: + throw new AssertionError(); + case 1: + return sources.iterator().next(); + case 2: + { + Iterator> it = sources.iterator(); + RangeTrie t1 = it.next(); + RangeTrie t2 = it.next(); + return t1.mergeWith(t2, resolver); + } + default: + return dir -> new CollectionMergeCursor.Range<>(resolver, dir, sources, RangeTrie::cursor); + } + } + + /// Applies these ranges to a given data trie.
The meaning of the application is defined by the given mapper: + /// whenever the trie's content falls under a range, the mapper is called to return the content that should be + /// presented. + /// + /// This operation will only list positions that are present in the source trie. This means, on one hand, + /// that it is not possible to add new content from the range trie, only to augment (usually delete) existing. + /// On the other, that the size of the range trie does not affect the size of the output or the complexity of + /// processing it. + default Trie applyTo(Trie source, BiFunction mapper) + { + return dir -> new RangeApplyCursor<>(mapper, cursor(dir), source.cursor(dir)); + } + + @Override + default RangeTrie prefixedBy(ByteComparable prefix) + { + return dir -> new PrefixedCursor.Range<>(prefix, cursor(dir)); + } + + @Override + default RangeTrie tailTrie(ByteComparable prefix) + { + RangeCursor c = cursor(Direction.FORWARD); + if (c.descendAlong(prefix.asComparableBytes(c.byteComparableVersion()))) + return c::tailCursor; + else if (c.precedingState() != null) + return c::precedingStateCursor; + else + return null; + } + + @Override + default Iterable>> tailTries(Direction direction, Predicate predicate) + { + return () -> new TrieTailsIterator.AsEntriesRange<>(cursor(direction), predicate); + } + + /// Returns a view of this trie where all content is processed through the given mapping function. + default > RangeTrie mapValues(Function mapper) + { + return dir -> new ContentMappingCursor.Range<>(mapper, cursor(dir)); + } + + // The methods below form the non-public implementation, whose visibility is restricted to package-level. + // The warning suppression below is necessary because we cannot limit the visibility of an interface method. + // We need an interface to be able to implement trie methods by lambdas, which is heavily used above. 
+ + /// Implement this method to provide the concrete trie implementation as the cursor that presents it, most easily + /// done via a lambda as in the methods above. + //noinspection ClassEscapesDefinedScope + RangeCursor makeCursor(Direction direction); + + /// @inheritDoc This method's implementation uses [#makeCursor] to get the cursor and may apply additional cursor + /// checks for tests that run with verification enabled. + //noinspection ClassEscapesDefinedScope + @Override + default RangeCursor cursor(Direction direction) + { + return Trie.DEBUG ? new VerificationCursor.Range<>(makeCursor(direction)) + : makeCursor(direction); + } +} diff --git a/src/java/org/apache/cassandra/db/tries/RangesCursor.java b/src/java/org/apache/cassandra/db/tries/RangesCursor.java new file mode 100644 index 000000000000..005b3ccdfb43 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/RangesCursor.java @@ -0,0 +1,393 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +/// A cursor for a [TrieSet] that represents a set of ranges. 
+/// +/// A range is the span between two keys, which can be taken to be inclusive or exclusive of the endpoints and all +/// their descendants. For example, the range `[abc; afg]` includes "abc", "af", "afg", as well as any string starting +/// with "abc", "ad", "afg". The range `[abc; abc]` is the branch of "abc", i.e. the set containing all strings +/// starting with "abc". The range `(abc; afg)` includes "abd", "ac", "afe" and all strings starting with these prefixes, +/// but not anything starting with "abc" or "afg". +/// +/// If one of the two bounds is a prefix, the included part of the descendants of the prefix is restricted to only +/// those that fall between the bounds, e.g. `[a; abc]` includes "a", "ab" as well as all strings starting with "aa", +/// "abb" or "abc", but not those starting with "abd". The range `(abc; a]` contains all strings starting with "abd", +/// "ac" among others, but not "abc" or any string starting with "abc". +/// +/// The inclusivity is implemented by using positions before or after the branch in iteration order, which we +/// denote e.g. "abc<" as the point before the point "abc" and its branch, and "abc>" for the point after the branch of +/// "abc". In other words, these positions are such that `abc< < X < abc>` for any string X that starts with "abc". +/// When walking a trie using a cursor in the forward direction, the "<" positions are reported on the descent path, +/// while the ">" positions are reported on the ascent path, and vice versa in reverse. +/// +/// As a corollary of this order, ranges like `(a; abc]` or `(a; a)` are invalid, because the right side (resp. "abc>" +/// and "a<") orders smaller than the left ("a>"). +/// +/// The ranges are specified by passing a sequence of boundaries, where each even boundary is the opening boundary, and +/// the odd ones are the closing boundary for a range.
The class also takes an argument for the inclusivity of open +/// and close boundaries which applies to all boundaries of the type. +/// +/// The boundaries given must be in order and cannot overlap, but it is possible to repeat a boundary when they are the +/// same thing or cover a branch because of inclusivity, e.g. +/// - `[a, a] == branch a`: an even number of repeats starting at an even position with inclusivity on both sides +/// specifies a branch (i.e. the set of all descendants of that key) +/// - `[a, b), [b, c) == [a, c)`: an even number of repeats is ignored when the inclusivity is complementary +/// (e.g. inclusive-start-exclusive-end) +/// - `[a, a), [a, b) == [a, b)`: an odd number of repeats is the same as a single copy for complementary inclusivity +/// - `(a, b), (b, c) == (a, c) - branch b`: an even number of repeats which starts at an odd position specifies an +/// excluded branch when both sides are exclusive. +/// +/// If the first boundary is null, the range is open on the left, and if the last non-null boundary is on an even +/// position (i.e. if the array has an even length and its last boundary is null, or if that last boundary is cut off), +/// the range is open on the right: +/// - `[null, a]` covers all keys smaller than `a` +/// - `[a, null]` or `[a]` covers all keys greater than `a` +class RangesCursor implements TrieSetCursor +{ + private final ByteComparable.Version byteComparableVersion; + + /// The current index in the boundaries array. This is the input that will be advanced on the next [#advance] call. + /// When the current boundary is exhausted, it will advance. When it reaches [#endIdx], the cursor is + /// exhausted. + private int currentIdx; + /// The index at which the ranges end. The end or start of the array, adjusted to remove null boundaries. + private final int endIdx; + /// The next position for all boundaries. + long[] nextPositions; + + /// Byte sources producing the rest of the bytes of the boundaries.
+ ByteSource.Peekable[] sources; + /// The current position (reported to the user). This is usually obtained from `nextPositions[currentIdx]` before + /// the current keys are advanced. + long currentPosition; + /// Current range state, returned by [#state]. + RangeState currentState; + + static final int STARTS_AFTER = 1; + static final int ENDS_AFTER = 2; + + /// Bit mask specifying the positioning of left and right bounds in relation to the covered branch as a combination + /// of [#STARTS_AFTER] and [#ENDS_AFTER] bits. This determines inclusivity: the left sides are inclusive if the + /// `STARTS_AFTER` is not set (i.e. the boundary is to the left/before the key), and the right sides are inclusive + /// if `ENDS_AFTER` is set (i.e. the boundary is to the right/after the key). + final int endsAfterMask; + + public static RangesCursor full(Direction direction, ByteComparable.Version byteComparableVersion) + { + return create(direction, byteComparableVersion, ENDS_AFTER, null, null); // both bounds null: open on the left and on the right + } + + public static RangesCursor create(Direction direction, ByteComparable.Version byteComparableVersion, boolean startsInclusive, boolean endsInclusive, ByteComparable... boundaries) + { + return create(direction, byteComparableVersion, (startsInclusive ? 0 : STARTS_AFTER) | (endsInclusive ? ENDS_AFTER : 0), boundaries); + } + + public static RangesCursor create(Direction direction, ByteComparable.Version byteComparableVersion, int endsAfterMask, ByteComparable... boundaries) + { + long rootPosition = Cursor.rootPosition(direction); + + int length = boundaries.length; + + int arrayLength = (length + 1) & ~1; // boundary count rounded up to an even number + long[] nextPositions = new long[arrayLength]; + + ByteSource.Peekable[] sources = new ByteSource.Peekable[arrayLength]; + for (int i = 0; i < arrayLength; ++i) + { + ByteComparable boundary = i < length ? 
boundaries[i] : null; + int destIndex = direction.select(i, arrayLength - i - 1); + if (boundary != null) + { + sources[destIndex] = boundary.asPeekableBytes(byteComparableVersion); + nextPositions[destIndex] = maybeOnReturnPath(sources, endsAfterMask, direction, rootPosition, destIndex); + } + else + { + // Unspecified bounds are the same as empty string, inclusive. + assert destIndex == 0 || destIndex == arrayLength - 1; + sources[destIndex] = ByteSource.Peekable.EMPTY; + nextPositions[destIndex] = maybeOnReturnPath(sources, ENDS_AFTER, direction, rootPosition, destIndex); + } + } + + // If we have a set that is empty because the first start position is after the root branch, shortcut this to + // plain empty set to avoid reporting NOT_CONTAINED on the return path. + if (arrayLength > 0 && nextPositions[0] == (rootPosition | ON_RETURN_PATH_BIT)) + { + return new RangesCursor(byteComparableVersion, + 0, + null, null, + 0, 0, + rootPosition, + RangeState.NOT_CONTAINED); + } + + RangesCursor cursor = new RangesCursor(byteComparableVersion, + endsAfterMask, + nextPositions, sources, + 0, arrayLength, + rootPosition, + RangeState.NOT_CONTAINED); + cursor.advanceBoundariesAndSelectState(rootPosition); // consume boundaries that end at the root and set the root state + return cursor; + } + + private RangesCursor(ByteComparable.Version byteComparableVersion, + int endsAfterMask, + long[] nextPositions, + ByteSource.Peekable[] sources, + int startIdx, + int endIdxExclusive, + long currentPosition, + RangeState currentState) + { + this.byteComparableVersion = byteComparableVersion; + this.nextPositions = nextPositions; + this.sources = sources; + this.currentIdx = startIdx; + this.endIdx = endIdxExclusive; + this.currentPosition = currentPosition; + this.currentState = currentState; + this.endsAfterMask = endsAfterMask; + } + + @Override + public long encodedPosition() + { + return currentPosition; + } + + @Override + public RangeState state() + { + return currentState; + } + 
+ @Override + public ByteComparable.Version 
byteComparableVersion() + { + return byteComparableVersion; + } + + @Override + public long advance() + { + if (currentIdx >= endIdx) + return exhausted(); + + return advanceBoundariesAndSelectState(nextPositions[currentIdx]); + } + + private long advanceBoundariesAndSelectState(long nextPosition) + { + // Note: currentIdx may be outside of range when this is called (e.g. for an empty set/tail), which is why we + // take the next position as an argument rather than get it from nextPositions[currentIdx]. + + // In reverse direction before and after are swapped. + Direction direction = Cursor.direction(nextPosition); + int containedSelection = 0; + + // Even left key index means not contained before. + if ((currentIdx & 1) != 0) + containedSelection |= direction.select(RangeState.APPLICABLE_BEFORE, RangeState.APPLICABLE_AFTER); + + // We need to advance all the keys that match the selected next position to prepare them for the next iteration. + for (int advancingIdx = currentIdx; advancingIdx < this.endIdx; ++advancingIdx) + { + long cmp = Cursor.compare(nextPosition, nextPositions[advancingIdx]); + assert cmp <= 0 : "Invalid order of range boundaries"; + if (cmp < 0) // This key is beyond our position, we are done. + break; + + int nextByte = sources[advancingIdx].next(); + if (nextByte == ByteSource.END_OF_STREAM) + { + // If a key (or more than one) ends here, advance currentIdx. Its new value will determine whether the + // succeeding side is included in the set. + assert currentIdx == advancingIdx : "Invalid order of range boundaries"; + ++currentIdx; + } + else + nextPositions[advancingIdx] = maybeOnReturnPath(Cursor.positionForDescentWithByte(nextPosition, nextByte), + advancingIdx, + direction); + } + + // Even left key index after consuming the cursors that end here means not contained after.
+ if ((currentIdx & 1) != 0) + containedSelection |= direction.select(RangeState.APPLICABLE_AFTER, RangeState.APPLICABLE_BEFORE); + + currentState = RangeState.values()[containedSelection]; + currentPosition = nextPosition; + return nextPosition; + } + + private long maybeOnReturnPath(long nextPosition, int index, Direction direction) + { + return maybeOnReturnPath(sources, endsAfterMask, direction, nextPosition, index); + } + + /// Adjusts the position for keys that end after the current byte, in order to put the boundaries at the right + /// place with respect to descendant branches. + /// + /// This means, for example, placing inclusive right boundaries on the return path for forward iteration. + private static long maybeOnReturnPath(ByteSource.Peekable[] sources, + int endsAfterMask, + Direction direction, + long nextPosition, + int index) + { + // Fast path when the inclusivity options ask for no return path positions. + if (endsAfterMask == direction.select(0, STARTS_AFTER | ENDS_AFTER)) + return nextPosition; + + if (sources[index].peek() != ByteSource.END_OF_STREAM) // more bytes follow, so the boundary does not end at this position + return nextPosition; + + int bitInMask = index & 1; + if (!direction.isForward()) // Ends and starts are swapped when going in reverse + bitInMask ^= 1; + boolean placeAfter = (endsAfterMask & (1 << bitInMask)) != 0; + if (placeAfter == direction.isForward()) // Return path is to the left when going in reverse + return nextPosition | ON_RETURN_PATH_BIT; + else + return nextPosition; + } + + // Note: Sets don't need `advanceMultiple` because they are meant to apply as a restriction on other tries, + // and the combined walks necessary to implement such restrictions can only proceed one step at a time. + // Once the restriction identifies that a branch is covered by the set, it can use the trie's `advanceMultiple` + // method.
+ + @Override + public long skipTo(long encodedSkipPosition) + { + // Since individual keys are singletons, if the skip position is beyond the prepared next position for a key, + // we are done with that key. So drop all such keys and advance with the first that is at or beyond the + // requested position. + while (currentIdx < endIdx && Cursor.compare(nextPositions[currentIdx], encodedSkipPosition) < 0) + currentIdx++; + return advance(); + } + + private long exhausted() + { + currentIdx = endIdx; + currentState = RangeState.NOT_CONTAINED; + currentPosition = Cursor.exhaustedPosition(currentPosition); + return currentPosition; + } + + @Override + public TrieSetCursor tailCursor(Direction direction) + { + return tailCopyOf(this, direction); + } + + ByteSource.Duplicatable duplicateSource(int index) + { + ByteSource.Peekable src = sources[index]; + if (src == null) + return null; + + if (!(src instanceof ByteSource.Duplicatable)) + sources[index] = src = ByteSource.duplicatable(src); // keep the duplicatable wrapper in place of the original source + + ByteSource.Duplicatable duplicatableSource = (ByteSource.Duplicatable) src; + return duplicatableSource.duplicate(); + } + + private static RangesCursor tailCopyOf(RangesCursor copyFrom, Direction newDirection) + { + assert !Cursor.isOnReturnPath(copyFrom.currentPosition) + : "Cannot take tail of a position " + Cursor.toString(copyFrom.currentPosition) + " on the return path."; + boolean directionMatches = newDirection == copyFrom.direction(); + + // Calculate the span of boundaries that are still active for the tail, not including any matching return path + // (the latter has the same effect as the set being open-ended at this tail). + int startInclusive = copyFrom.currentIdx; + int endExclusive = startInclusive; + while (endExclusive < copyFrom.endIdx && + Cursor.compare(copyFrom.nextPositions[endExclusive], + copyFrom.currentPosition | ON_RETURN_PATH_BIT) < 0) + ++endExclusive; + + // We can only drop an even number of boundaries on either side.
Expand the indexes to make them even. + int arrayStart = startInclusive & ~1; + int arrayEnd = ((endExclusive + 1) & ~1); + // Note: if endExclusive == startInclusive, arrayEnd - arrayStart is 0 if branch is not included, 2 if included + // (i.e. startInclusive is odd). + + final long depthDiff = Cursor.depthCorrectionValue(copyFrom.currentPosition); + ByteSource.Peekable[] sources = new ByteSource.Peekable[arrayEnd - arrayStart]; + final long[] nextPositions = new long[arrayEnd - arrayStart]; + + int newStartIdx; + + // Duplicate all selected boundaries, adjust depths and reverse the order if the direction doesn't match. + if (directionMatches) + { + for (int i = startInclusive; i < endExclusive; ++i) + { + sources[i - arrayStart] = copyFrom.duplicateSource(i); + nextPositions[i - arrayStart] = copyFrom.nextPositions[i] - depthDiff; + } + newStartIdx = startInclusive - arrayStart; + } + else + { + for (int i = startInclusive; i < endExclusive; ++i) + { + int destIndex = arrayEnd - 1 - i; + sources[destIndex] = copyFrom.duplicateSource(i); + nextPositions[destIndex] = (copyFrom.nextPositions[i] - depthDiff) ^ TRANSITION_MASK; + if (sources[destIndex].peek() == ByteSource.END_OF_STREAM) + nextPositions[destIndex] ^= ON_RETURN_PATH_BIT; // NOTE(review): presumably descent and return path swap roles in the opposite direction - confirm + } + newStartIdx = arrayEnd - endExclusive; + } + + // Determine the state the root needs to present. + boolean startIsContained = (newStartIdx & 1) != 0; + RangeState rootState = startIsContained ? newDirection.select(RangeState.START, RangeState.END) + : RangeState.NOT_CONTAINED; + long rootPosition = Cursor.rootPosition(newDirection); + + // Add an onReturnPath root position for open-ended sets.
+ int last = nextPositions.length - 1; + if (last > 0 && sources[last] == null) + { + sources[last] = ByteSource.EMPTY; + nextPositions[last] = rootPosition | ON_RETURN_PATH_BIT; + } + + return new RangesCursor(copyFrom.byteComparableVersion, + copyFrom.endsAfterMask, + nextPositions, + sources, + newStartIdx, + nextPositions.length, + rootPosition, + rootState); + } +} diff --git a/src/java/org/apache/cassandra/db/tries/SingletonCursor.java b/src/java/org/apache/cassandra/db/tries/SingletonCursor.java new file mode 100644 index 000000000000..f7641b0f6858 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/SingletonCursor.java @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +/// Trie cursor for a singleton trie, mapping a given key to a value.
+class SingletonCursor implements Cursor +{ + ByteSource src; + final ByteComparable.Version byteComparableVersion; + final T value; + protected long currentPosition; + protected long nextPosition; + + + public SingletonCursor(Direction direction, + ByteSource src, + ByteComparable.Version byteComparableVersion, + T value) + { + this.src = src; + this.byteComparableVersion = byteComparableVersion; + this.value = value; + this.currentPosition = Cursor.rootPosition(direction); + prepareNextPosition(currentPosition); + } + + /// Constructor for tail tries. + /// + /// Note: the positions given may have different direction from the direction of the constructed tail. + SingletonCursor(Direction direction, + long currentPosition, + long nextPosition, + ByteSource src, + ByteComparable.Version byteComparableVersion, + T value) + { + assert !Cursor.isOnReturnPath(currentPosition) : "tailCursor cannot be called on return path positions"; + this.src = src; + this.byteComparableVersion = byteComparableVersion; + this.value = value; + this.currentPosition = Cursor.rootPosition(direction); + if (!Cursor.isExhausted(nextPosition)) + this.nextPosition = nextPosition - Cursor.depthCorrectionValue(currentPosition); // NOTE(review): presumably rebases the depth relative to the new root - confirm + else + this.nextPosition = nextPosition; // an exhausted next position is carried over as is + + if (direction != Cursor.direction(currentPosition)) + this.nextPosition ^= Cursor.TRANSITION_MASK; + } + + void prepareNextPosition(long currentPosition) + { + int nextTransition = src.next(); + + if (nextTransition != ByteSource.END_OF_STREAM) + nextPosition = Cursor.positionForDescentWithByte(currentPosition, nextTransition); + else + nextPosition = Cursor.exhaustedPosition(currentPosition); // no more bytes in the key + } + + @Override + public long advance() + { + currentPosition = nextPosition; + if (!Cursor.isExhausted(nextPosition)) // not exhausted yet: read ahead to prepare the following position + prepareNextPosition(currentPosition); + return currentPosition; + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + if (Cursor.isExhausted(nextPosition)) + return currentPosition = 
nextPosition; + + int nextTransition = Cursor.incomingTransition(nextPosition); + int current = nextTransition; + long pos = currentPosition; + int next = src.next(); + while (next != ByteSource.END_OF_STREAM) + { + if (receiver != null) + receiver.addPathByte(current); + current = next; + next = src.next(); + pos += DEPTH_ADJUSTMENT_ONE; + } + currentPosition = Cursor.positionForDescentWithByte(pos, current); + nextPosition = Cursor.exhaustedPosition(currentPosition); + return currentPosition; + } + + @Override + public long skipTo(long encodedSkipPosition) + { + if (Cursor.compare(encodedSkipPosition, nextPosition) <= 0) + return advance(); + else + return currentPosition = Cursor.exhaustedPosition(currentPosition); + } + + protected boolean atEnd() + { + return Cursor.isExhausted(nextPosition) && !Cursor.isExhausted(currentPosition); + } + + @Override + public T content() + { + return atEnd() ? value : null; + } + + @Override + public long encodedPosition() + { + return currentPosition; + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + return byteComparableVersion; + } + + @Override + public SingletonCursor tailCursor(Direction dir) + { + return new SingletonCursor<>(dir, + currentPosition, nextPosition, + duplicateSource(), + byteComparableVersion, + value); + } + + ByteSource.Duplicatable duplicateSource() + { + if (!(src instanceof ByteSource.Duplicatable)) + src = ByteSource.duplicatable(src); + ByteSource.Duplicatable duplicatableSource = (ByteSource.Duplicatable) src; + return duplicatableSource.duplicate(); + } + + static class DeletionAware> + extends SingletonCursor implements DeletionAwareCursor + { + DeletionAware(Direction direction, + ByteSource src, + ByteComparable.Version byteComparableVersion, + T value) + { + super(direction, src, byteComparableVersion, value); + } + + DeletionAware(Direction direction, + long currentPosition, + long nextPosition, + ByteSource src, + ByteComparable.Version byteComparableVersion, + 
T value) + { + super(direction, currentPosition, nextPosition, src, byteComparableVersion, value); + } + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + return null; + } + + @Override + public DeletionAware tailCursor(Direction dir) + { + return new DeletionAware<>(dir, currentPosition, nextPosition, duplicateSource(), byteComparableVersion, value); + } + } + + static class DeletionBranch> + extends SingletonCursor implements DeletionAwareCursor + { + RangeTrie deletionBranch; + + DeletionBranch(Direction direction, ByteSource src, ByteComparable.Version byteComparableVersion, RangeTrie deletionBranch) + { + super(direction, src, byteComparableVersion, null); + this.deletionBranch = deletionBranch; + } + + DeletionBranch(Direction direction, + long currentPosition, + long nextPosition, + ByteSource src, + ByteComparable.Version byteComparableVersion, + RangeTrie deletionBranch) + { + super(direction, currentPosition, nextPosition, src, byteComparableVersion, null); + this.deletionBranch = deletionBranch; + } + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + return atEnd() ? deletionBranch.cursor(direction) : null; + } + + @Override + public DeletionBranch tailCursor(Direction dir) + { + return new DeletionBranch<>(dir, currentPosition, nextPosition, duplicateSource(), byteComparableVersion, deletionBranch); + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/SingletonOrderedCursor.java b/src/java/org/apache/cassandra/db/tries/SingletonOrderedCursor.java new file mode 100644 index 000000000000..28b36ec341c1 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/SingletonOrderedCursor.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +/// Trie cursor for a singleton trie, mapping a given key to a value. This version places the content before or after +/// the branch, which is useful for range boundaries and ordered tries (see [Trie#singletonOrdered]). +class SingletonOrderedCursor extends SingletonCursor +{ + final boolean presentOnReturnPath; + + + public SingletonOrderedCursor(Direction direction, + ByteSource.Peekable src, + ByteComparable.Version byteComparableVersion, + boolean presentOnReturnPath, + T value) + { + super(direction, src, byteComparableVersion, value); + this.presentOnReturnPath = presentOnReturnPath; + adjustNextPosition(src); // the overridden prepareNextPosition already ran inside super(...), but before presentOnReturnPath was assigned, so adjust again + } + + /// Constructor for tail tries. + /// + /// Note: the positions given may have different direction from the direction of the constructed tail. + SingletonOrderedCursor(Direction direction, + long currentPosition, + long nextPosition, + ByteSource.Peekable src, + ByteComparable.Version byteComparableVersion, + boolean presentOnReturnPath, + T value) + { + super(direction, currentPosition, nextPosition, src, byteComparableVersion, value); + // We need to do some additional steps if the directions differ.
+ if (direction != Cursor.direction(currentPosition)) + { + if (!Cursor.isExhausted(nextPosition) && src.peek() == ByteSource.END_OF_STREAM) + this.nextPosition ^= ON_RETURN_PATH_BIT; + this.presentOnReturnPath = !presentOnReturnPath; + } + else + this.presentOnReturnPath = presentOnReturnPath; + } + + @Override + void prepareNextPosition(long currentPosition) + { + super.prepareNextPosition(currentPosition); + adjustNextPosition(((ByteSource.Peekable) src)); + } + + private void adjustNextPosition(ByteSource.Peekable src) + { + if (presentOnReturnPath) + { + if (Cursor.isExhausted(nextPosition)) + { + if (Cursor.isRootPosition(currentPosition)) + nextPosition = currentPosition | ON_RETURN_PATH_BIT; + } + else if (src.peek() == ByteSource.END_OF_STREAM) + nextPosition |= ON_RETURN_PATH_BIT; + } + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + if (Cursor.depth(nextPosition) <= 0) // exhausted or root return path + return advance(); + + super.advanceMultiple(receiver); + if (presentOnReturnPath) + currentPosition |= ON_RETURN_PATH_BIT; // report the end of the key on the return path + return currentPosition; + } + + @Override + public SingletonOrderedCursor tailCursor(Direction dir) + { + return new SingletonOrderedCursor<>(dir, + currentPosition, nextPosition, + duplicateSource(), + byteComparableVersion, + presentOnReturnPath, + value); + } + + static class Range> extends SingletonOrderedCursor implements RangeCursor + { + public Range(Direction direction, + ByteSource.Peekable src, + ByteComparable.Version byteComparableVersion, + boolean presentOnReturnPath, + S value) + { + super(direction, src, byteComparableVersion, presentOnReturnPath, value); + } + + public Range(Direction direction, + long currentPosition, + long nextPosition, + ByteSource.Peekable src, + ByteComparable.Version byteComparableVersion, + boolean presentOnReturnPath, + S value) + { + super(direction, currentPosition, nextPosition, src, byteComparableVersion, presentOnReturnPath, value); + } + + @Override + 
public S precedingState() + { + return null; + } + + @Override + public S state() + { + return content(); + } + + @Override + public Range tailCursor(Direction dir) + { + return new Range<>(dir, currentPosition, nextPosition, duplicateSource(), byteComparableVersion, presentOnReturnPath, value); + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/SingletonTrie.java b/src/java/org/apache/cassandra/db/tries/SingletonTrie.java deleted file mode 100644 index e3eb62783ea1..000000000000 --- a/src/java/org/apache/cassandra/db/tries/SingletonTrie.java +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.db.tries; - -import org.apache.cassandra.utils.bytecomparable.ByteComparable; -import org.apache.cassandra.utils.bytecomparable.ByteSource; - -/** - * Singleton trie, mapping the given key to value.
- */ -class SingletonTrie extends Trie -{ - private final ByteComparable key; - private final ByteComparable.Version byteComparableVersion; - private final T value; - - SingletonTrie(ByteComparable key, ByteComparable.Version byteComparableVersion, T value) - { - this.byteComparableVersion = byteComparableVersion; - this.key = key; - this.value = value; - } - - public Cursor cursor(Direction direction) - { - return new Cursor(direction); - } - - class Cursor implements Trie.Cursor - { - private final Direction direction; - private ByteSource src = key.asComparableBytes(byteComparableVersion); - private int currentDepth = 0; - private int currentTransition = -1; - private int nextTransition = src.next(); - - public Cursor(Direction direction) - { - this.direction = direction; - } - - @Override - public int advance() - { - currentTransition = nextTransition; - if (currentTransition != ByteSource.END_OF_STREAM) - { - nextTransition = src.next(); - return ++currentDepth; - } - else - { - return currentDepth = -1; - } - } - - @Override - public int advanceMultiple(TransitionsReceiver receiver) - { - if (nextTransition == ByteSource.END_OF_STREAM) - return currentDepth = -1; - int current = nextTransition; - int depth = currentDepth; - int next = src.next(); - while (next != ByteSource.END_OF_STREAM) - { - if (receiver != null) - receiver.addPathByte(current); - current = next; - next = src.next(); - ++depth; - } - currentTransition = current; - nextTransition = next; - return currentDepth = ++depth; - } - - @Override - public int skipTo(int skipDepth, int skipTransition) - { - if (skipDepth <= currentDepth) - { - assert skipDepth < currentDepth || direction.gt(skipTransition, currentTransition); - return currentDepth = -1; // no alternatives - } - if (direction.gt(skipTransition, nextTransition)) - return currentDepth = -1; // request is skipping over our path - - return advance(); - } - - @Override - public int depth() - { - return currentDepth; - } - - @Override - 
public T content() - { - return nextTransition == ByteSource.END_OF_STREAM ? value : null; - } - - @Override - public int incomingTransition() - { - return currentTransition; - } - - @Override - public Direction direction() - { - return direction; - } - - @Override - public ByteComparable.Version byteComparableVersion() - { - return byteComparableVersion; - } - - @Override - public Trie tailTrie() - { - if (!(src instanceof ByteSource.Duplicatable)) - src = ByteSource.duplicatable(src); - ByteSource.Duplicatable duplicatableSource = (ByteSource.Duplicatable) src; - - return new SingletonTrie(v -> duplicatableSource.duplicate(), byteComparableVersion, value); - } - } -} diff --git a/src/java/org/apache/cassandra/db/tries/SlicedTrie.java b/src/java/org/apache/cassandra/db/tries/SlicedTrie.java deleted file mode 100644 index c14f0adde620..000000000000 --- a/src/java/org/apache/cassandra/db/tries/SlicedTrie.java +++ /dev/null @@ -1,458 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.cassandra.db.tries; - -import org.apache.cassandra.utils.bytecomparable.ByteComparable; -import org.apache.cassandra.utils.bytecomparable.ByteSource; - -/** - * Represents a sliced view of a trie, i.e. 
the content within the given pair of bounds. - * - * Applied by advancing three tries in parallel: the left bound, the source and the right bound. While the source - * bound is smallest, we don't issue any content and skip over any children. As soon as the left bound becomes strictly - * smaller, we stop processing it (as it's a singleton trie it will remain smaller until it's exhausted) and start - * issuing the nodes and content from the source. As soon as the right bound becomes strictly smaller, we finish the - * walk. - * - * We don't explicitly construct tries for the two bounds; tracking the current depth (= prefix length) and transition - * as characters are requested from the key is sufficient as it is a trie with just a single descent path. Because we - * need the next character to tell if it's been exhausted, we keep these one position ahead. The source is always - * advanced, thus this gives us the thing to compare it against after the advance. - * - * We also track the current state to make some decisions a little simpler. - * - * See Trie.md for further details. - */ -public class SlicedTrie extends Trie -{ - private final Trie source; - - /** Left-side boundary. The characters of this are requested as we descend along the left-side boundary. */ - private final ByteComparable left; - - /** Right-side boundary. The characters of this are requested as we descend along the right-side boundary. 
*/ - private final ByteComparable right; - - private final boolean includeLeft; - private final boolean includeRight; - - public SlicedTrie(Trie source, ByteComparable left, boolean includeLeft, ByteComparable right, boolean includeRight) - { - this.source = source; - this.left = left; - this.right = right; - this.includeLeft = includeLeft; - this.includeRight = includeRight; - } - - static ByteSource openAndMaybeAdd0(ByteComparable key, ByteComparable.Version byteComparableVersion, boolean shouldAdd0) - { - if (key == null) - return null; - ByteSource src = key.asComparableBytes(byteComparableVersion); - if (shouldAdd0) - return ByteSource.append(src, 0); - else - return src; - } - - @Override - protected Cursor cursor(Direction direction) - { - Cursor sourceCursor = source.cursor(direction); - // The cursor is left-inclusive and right-exclusive by default. If we need to change the inclusiveness, adjust - // the bound to the next possible value by adding a 00 byte at the end. - ByteSource leftSource = openAndMaybeAdd0(left, sourceCursor.byteComparableVersion(), !includeLeft); - ByteSource rightSource = openAndMaybeAdd0(right, sourceCursor.byteComparableVersion(), includeRight); - - // Empty left bound is the same as having no left bound, adjust for that. - int leftNext = -1; - if (leftSource != null) - { - leftNext = leftSource.next(); - if (leftNext == ByteSource.END_OF_STREAM) - leftSource = null; - } - - // Empty right bound means the result can only be empty. Make things easier for the cursor by handling this. 
- int rightNext = -1; - if (rightSource != null) - { - rightNext = rightSource.next(); - if (rightNext == ByteSource.END_OF_STREAM) - { - assert leftSource == null : "Invalid range " + sliceString(); - return new Trie.EmptyCursor<>(direction, sourceCursor.byteComparableVersion()); - } - } - - return new SlicedCursor<>(sourceCursor, - leftSource, - leftNext, - rightSource, - rightNext, - direction); - } - - String sliceString() - { - ByteComparable.Version version = source.cursor(Direction.FORWARD).byteComparableVersion(); - return String.format("%s%s;%s%s", - includeLeft ? "[" : "(", - left.byteComparableAsString(version), - right.byteComparableAsString(version), - includeRight ? "]" : ")"); - } - - private enum State - { - /** - * The cursor is at the initial phase while it is walking prefixes of both bounds. - * Content is not to be reported. - */ - COMMON_PREFIX, - /** - * The cursor is positioned on some prefix of the start bound, strictly before any prefix of the end bound in - * iteration order. - * Content should only be reported in the reverse direction (as these prefixes are prefixes of the right bound - * and included in the slice). - */ - START_PREFIX, - /** - * The cursor is positioned inside the range, i.e. strictly between any prefixes of the start and end bounds. - * All content should be reported. - */ - INSIDE, - /** - * The cursor is positioned on some prefix of the end bound, strictly after any prefix of the start bound. - * Content should only be reported in the forward direction. - */ - END_PREFIX, - /** The cursor is positioned beyond the end bound. Exhaustion (depth -1) has been reported. 
*/ - EXHAUSTED; - } - - private static class SlicedCursor implements Cursor - { - private ByteSource start; - private ByteSource end; - private final Cursor source; - private final Direction direction; - - State state; - int startNext; - int startNextDepth; - int endNext; - int endNextDepth; - - public SlicedCursor(Cursor source, - ByteSource leftSource, - int leftNext, - ByteSource rightSource, - int rightNext, - Direction direction) - { - this.source = source; - this.direction = direction; - start = direction.select(leftSource, rightSource); - end = direction.select(rightSource, leftSource); - startNext = direction.select(leftNext, rightNext); - endNext = direction.select(rightNext, leftNext); - startNextDepth = start != null ? 1 : 0; - endNextDepth = end != null ? 1 : 0; - state = start != null - ? end != null - ? State.COMMON_PREFIX - : State.START_PREFIX - : end != null - ? State.END_PREFIX - : State.INSIDE; - } - - @Override - public int advance() - { - int newDepth; - int transition; - - switch (state) - { - case COMMON_PREFIX: - case START_PREFIX: - // Skip any transitions before the start bound - newDepth = source.skipTo(startNextDepth, startNext); - transition = source.incomingTransition(); - return checkBothBounds(newDepth, transition); - case INSIDE: - case END_PREFIX: - newDepth = source.advance(); - transition = source.incomingTransition(); - return checkEndBound(newDepth, transition); - default: - throw new AssertionError(); - } - } - - private int markDone() - { - state = State.EXHAUSTED; - return -1; - } - - int checkBothBounds(int newDepth, int transition) - { - // Check if we are still following the start bound - if (newDepth == startNextDepth && transition == startNext) - { - assert startNext != ByteSource.END_OF_STREAM; - startNext = start.next(); - ++startNextDepth; - State currState = state; - // In the forward direction the exact match for the left bound and all descendant states are - // included in the set. 
- // In the reverse direction we will instead use the -1 as target transition and thus ascend on - // the next advance (skipping the exact right bound and all its descendants). - if (startNext == ByteSource.END_OF_STREAM && direction.isForward()) - state = State.INSIDE; // checkEndBound may adjust this to END_PREFIX - if (currState == State.START_PREFIX) - return newDepth; // there is no need to check the end bound as we descended along a - // strictly earlier path - } - else // otherwise we are beyond the start bound - state = State.INSIDE; // checkEndBound may adjust this to END_PREFIX - - return checkEndBound(newDepth, transition); - } - - private int checkEndBound(int newDepth, int transition) - { - // Cursor positions compare by depth descending and transition ascending. - if (newDepth > endNextDepth) - return newDepth; // happy and quick path in the interior of the slice - // (state == State.INSIDE can be asserted here (we skip it for efficiency)) - if (newDepth < endNextDepth) - return markDone(); - // newDepth == endDepth - if (direction.lt(transition, endNext)) - { - adjustStateStrictlyBeforeEnd(); - return newDepth; - } - if (direction.lt(endNext, transition)) - return markDone(); - - // Following end bound - endNext = end.next(); - ++endNextDepth; - if (endNext == ByteSource.END_OF_STREAM) - { - // At the exact end bound. - if (direction.isForward()) - { - // In forward direction the right bound is not included in the slice. - return markDone(); - } - else - { - // In reverse, the left bound and all its descendants are included, thus we use the -1 as limiting - // transition. We can also see the bound as strictly ahead of our current position as the current - // branch should be fully included. 
- adjustStateStrictlyBeforeEnd(); - } - } - else - adjustStateAtEndPrefix(); - return newDepth; - } - - private void adjustStateAtEndPrefix() - { - switch (state) - { - case INSIDE: - state = State.END_PREFIX; - break; - } - } - - private void adjustStateStrictlyBeforeEnd() - { - switch (state) - { - case COMMON_PREFIX: - state = State.START_PREFIX; - break; - case END_PREFIX: - state = State.INSIDE; - break; - } - } - - @Override - public int advanceMultiple(TransitionsReceiver receiver) - { - switch (state) - { - case COMMON_PREFIX: - case START_PREFIX: - case END_PREFIX: - return advance(); // descend only one level to be able to compare cursors correctly - case INSIDE: - int depth = source.depth(); - int newDepth = source.advanceMultiple(receiver); - if (newDepth > depth) - return newDepth; // successfully descended - // we ascended, check if we are still within boundaries - return checkEndBound(newDepth, source.incomingTransition()); - default: - throw new AssertionError(); - } - } - - @Override - public int skipTo(int skipDepth, int skipTransition) - { - // if skipping beyond end, we are done - if (skipDepth < endNextDepth || skipDepth == endNextDepth && direction.gt(skipTransition, endNext)) - return markDone(); - // if skipping before start, adjust request to skip to start - if (skipDepth == startNextDepth && direction.lt(skipTransition, startNext)) - skipTransition = startNext; - - switch (state) - { - case START_PREFIX: - case COMMON_PREFIX: - return checkBothBounds(source.skipTo(skipDepth, skipTransition), source.incomingTransition()); - case INSIDE: - case END_PREFIX: - return checkEndBound(source.skipTo(skipDepth, skipTransition), source.incomingTransition()); - default: - throw new AssertionError("Cursor already exhaused."); - } - } - - @Override - public int depth() - { - return state == State.EXHAUSTED ? 
-1 : source.depth(); - } - - @Override - public int incomingTransition() - { - return source.incomingTransition(); - } - - @Override - public Direction direction() - { - return direction; - } - - @Override - public ByteComparable.Version byteComparableVersion() - { - return source.byteComparableVersion(); - } - - @Override - public T content() - { - switch (state) - { - case INSIDE: - return source.content(); - // Additionally, prefixes of the right bound (which are not prefixes of the left) need to be reported: - case START_PREFIX: - // start prefixes in reverse direction (but making sure we don't report the exact match); - return !direction.isForward() && startNext != ByteSource.END_OF_STREAM ? source.content() : null; - case END_PREFIX: - // end prefixes in forward direction. - return direction.isForward() ? source.content() : null; - default: - return null; - } - } - - @Override - public Trie tailTrie() - { - final Trie sourceTail = source.tailTrie(); - switch (state) - { - case INSIDE: - return sourceTail; - case COMMON_PREFIX: - return makeTrie(sourceTail, duplicatableStart(), startNext, duplicatableEnd(), endNext, direction); - case START_PREFIX: - return makeTrie(sourceTail, duplicatableStart(), startNext, null, -1, direction); - case END_PREFIX: - return makeTrie(sourceTail, null, -1, duplicatableEnd(), endNext, direction); - default: - throw new UnsupportedOperationException("tailTrie on a slice boundary"); - } - } - - private ByteSource.Duplicatable duplicatableStart() - { - if (start == null || start instanceof ByteSource.Duplicatable) - return (ByteSource.Duplicatable) start; - ByteSource.Duplicatable duplicatable = ByteSource.duplicatable(start); - start = duplicatable; - return duplicatable; - } - - private ByteSource.Duplicatable duplicatableEnd() - { - if (end == null || end instanceof ByteSource.Duplicatable) - return (ByteSource.Duplicatable) end; - ByteSource.Duplicatable duplicatable = ByteSource.duplicatable(end); - end = duplicatable; - 
return duplicatable; - } - - - private static Trie makeTrie(Trie source, - ByteSource.Duplicatable startSource, - int startNext, - ByteSource.Duplicatable endSource, - int endNext, - Direction direction) - { - ByteSource.Duplicatable leftSource = direction.select(startSource, endSource); - ByteSource.Duplicatable rightSource = direction.select(endSource, startSource); - int leftNext = direction.select(startNext, endNext); - int rightNext = direction.select(endNext, startNext); - return new Trie() - { - @Override - protected Cursor cursor(Direction direction) - { - return new SlicedCursor<>(source.cursor(direction), - leftSource != null ? leftSource.duplicate() : null, - leftNext, - rightSource != null ? rightSource.duplicate() : null, - rightNext, - direction); - } - }; - } - } -} diff --git a/src/java/org/apache/cassandra/db/tries/Trie.java b/src/java/org/apache/cassandra/db/tries/Trie.java index 90006e52525b..4c5ccdcb1336 100644 --- a/src/java/org/apache/cassandra/db/tries/Trie.java +++ b/src/java/org/apache/cassandra/db/tries/Trie.java @@ -17,690 +17,173 @@ */ package org.apache.cassandra.db.tries; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; + import java.util.Collection; import java.util.Iterator; import java.util.Map; -import java.util.function.BiConsumer; -import java.util.function.Consumer; +import java.util.function.BiFunction; import java.util.function.Function; +import java.util.function.Predicate; -import com.google.common.collect.ImmutableList; - -import org.agrona.DirectBuffer; +import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.bytecomparable.ByteSource; -/** - * Base class for tries. - *

    - * Normal users of tries will only use the public methods, which provide various transformations of the trie, conversion - * of its content to other formats (e.g. iterable of values), and several forms of processing. - *

    - * For any unimplemented data extraction operations one can build on the {@link TrieEntriesWalker} (for-each processing) - * and {@link TrieEntriesIterator} (to iterator) base classes, which provide the necessary mechanisms to handle walking - * the trie. - *

    - * The internal representation of tries using this interface is defined in the {@link Cursor} interface. - *

    - * Cursors are a method of presenting the internal structure of a trie without representing nodes as objects, which is - * still useful for performing the basic operations on tries (iteration, slicing/intersection and merging). A cursor - * will list the nodes of a trie in order, together with information about the path that was taken to reach them. - *

    - * To begin traversal over a trie, one must retrieve a cursor by calling {@link #cursor}. Because cursors are - * stateful, the traversal must always proceed from one thread. Should concurrent reads be required, separate calls to - * {@link #cursor} must be made. Any modification that has completed before the construction of a cursor must be - * visible, but any later concurrent modifications may be presented fully, partially or not at all; this also means that - * if multiple are made, the cursor may see any part of any subset of them. - *

    - * Note: This model only supports depth-first traversals. We do not currently have a need for breadth-first walks. - *

    - * See Trie.md for further description of the trie representation model. - * - * @param The content type of the trie. - */ -public abstract class Trie +/// Basic deterministic trie interface. +/// +/// Normal users of tries will only use the public methods of [BaseTrie] and this interface, which provide various +/// transformations of the trie, conversion of its content to other formats (e.g. iterable of values), and several +/// forms of processing. +/// +/// For any unimplemented data extraction operations one can build on the [TrieEntriesWalker] (for-each processing) +/// and [TrieEntriesIterator] (to iterator) base classes, which provide the necessary mechanisms to handle walking +/// the trie. +/// +/// The internal representation of tries using this interface is defined in the [Cursor] interface, accessed via the +/// [CursorWalkable] interface's [#cursor] method. [#cursor]/[#makeCursor] is the only method of the interface without +/// an implementation; to define a trie, one needs to implement it. We usually do this by returning a lambda, and Java's +/// single abstract method functionality takes care of adding the trie plumbing around it. +/// +/// Cursors are a method of presenting the internal structure of a trie without representing nodes as objects, which is +/// still useful for performing the basic operations on tries (iteration, slicing/intersection and merging). A cursor +/// will list the nodes of a trie in order, together with information about the path that was taken to reach them. +/// +/// To begin traversal over a trie, one must retrieve a cursor by calling [#cursor]. Because cursors are +/// stateful, the traversal must always proceed from one thread. Should concurrent reads be required, separate calls to +/// [#cursor] must be made. 
Any modification that has completed before the construction of a cursor must be +/// visible, but any later concurrent modifications may be presented fully, partially or not at all; this also means that +/// if multiple modifications are made, the cursor may see any part of any subset of them. +/// +/// Note: This model only supports depth-first traversals. We do not currently have a need for breadth-first walks. +/// +/// See [Trie.md](./Trie.md) for further description of the trie representation model. +/// +/// @param <T> The content type of the trie. +public interface Trie extends BaseTrie, Trie> { - /** - * A trie cursor. - *

    - * This is the internal representation of the trie, which enables efficient walks and basic operations (merge, - * slice) on tries. - *

    - * The cursor represents the state of a walk over the nodes of trie. It provides three main features:

      - *
    • the current {@code depth} or descend-depth in the trie;
    • - *
    • the {@code incomingTransition}, i.e. the byte that was used to reach the current point;
    • - *
    • the {@code content} associated with the current node,
    • - *
    - * and provides methods for advancing to the next position. This is enough information to extract all paths, and - * also to easily compare cursors over different tries that are advanced together. Advancing is always done in - * order; if one imagines the set of nodes in the trie with their associated paths, a cursor may only advance from a - * node with a lexicographically smaller path to one with bigger. The {@code advance} operation moves to the immediate - * next, it is also possible to skip over some items e.g. all children of the current node ({@code skipChildren}). - *

    - * Moving to the immediate next position in the lexicographic order is accomplished by:

      - *
    • if the current node has children, moving to its first child;
    • - *
    • otherwise, ascend the parent chain and return the next child of the closest parent that still has any.
    • - *
    - * As long as the trie is not exhausted, advancing always takes one step down, from the current node, or from a node - * on the parent chain. By comparing the new depth (which {@code advance} also returns) with the one before the advance, - * one can tell if the former was the case (if {@code newDepth == oldDepth + 1}) and how many steps up we had to take - * ({@code oldDepth + 1 - newDepth}). When following a path down, the cursor will stop on all prefixes. - *

    - * When it is created the cursor is placed on the root node with {@code depth() = 0}, {@code incomingTransition() = -1}. - * Since tries can have mappings for empty, content() can possibly be non-null. The cursor is exhausted when it - * returns a depth of -1 (the operations that advance a cursor return the depth, and {@code depth()} will also - * return -1 if queried afterwards). It is not allowed for a cursor to start in exhausted state; once a cursor is - * exhausted, calling any of the advance methods or {@code tailTrie} is an error. - *

    - * For example, the following trie:
    - *

    -     *  t
    -     *   r
    -     *    e
    -     *     e *
    -     *    i
    -     *     e *
    -     *     p *
    -     *  w
    -     *   i
    -     *    n  *
    -     * 
    - * has nodes reachable with the paths
    - *   "", t, tr, tre, tree*, tri, trie*, trip*, w, wi, win*
    - * and the cursor will list them with the following {@code (depth, incomingTransition)} pairs:
    - *   (0, -1), (1, t), (2, r), (3, e), (4, e)*, (3, i), (4, e)*, (4, p)*, (1, w), (2, i), (3, n)* - *

    - * Because we exhaust transitions on bigger depths before we go the next transition on the smaller ones, when - * cursors are advanced together their positions can be easily compared using only the {@code depth} and - * {@code incomingTransition}:

      - *
    • one that is higher in depth is before one that is lower;
    • - *
    • for equal depths, the one with smaller incomingTransition is first.
    • - *
    - * If we consider walking the trie above in parallel with this:
    - *
    -     *  t
    -     *   r
    -     *    i
    -     *     c
    -     *      k *
    -     *  u
    -     *   p *
    -     * 
    - * the combined iteration will proceed as follows:
    -     *  (0, -1)+  (0, -1)+          cursors equal, advance both
    -     *  (1, t)+   (1, t)+   t       cursors equal, advance both
    -     *  (2, r)+   (2, r)+   tr      cursors equal, advance both
    -     *  (3, e)+ < (3, i)    tre     cursors not equal, advance smaller (3 = 3, e < i)
    -     *  (4, e)+ < (3, i)    tree*   cursors not equal, advance smaller (4 > 3)
    -     *  (3, i)+   (3, i)+   tri     cursors equal, advance both
    -     *  (4, e)  > (4, c)+   tric    cursors not equal, advance smaller (4 = 4, e > c)
    -     *  (4, e)  > (5, k)+   trick*  cursors not equal, advance smaller (4 < 5)
    -     *  (4, e)+ < (1, u)    trie*   cursors not equal, advance smaller (4 > 1)
    -     *  (4, p)+ < (1, u)    trip*   cursors not equal, advance smaller (4 > 1)
    -     *  (1, w)  > (1, u)+   u       cursors not equal, advance smaller (1 = 1, w > u)
    -     *  (1, w)  > (2, p)+   up*     cursors not equal, advance smaller (1 < 2)
    -     *  (1, w)+ < (-1, -1)  w       cursors not equal, advance smaller (1 > -1)
    -     *  (2, i)+ < (-1, -1)  wi      cursors not equal, advance smaller (2 > -1)
    -     *  (3, n)+ < (-1, -1)  win*    cursors not equal, advance smaller (3 > -1)
    -     *  (-1, -1)  (-1, -1)          both exhasted
    -     *  
    - *

    - * Cursors are created with a direction (forward or reverse), which specifies the order in which a node's children - * are iterated (smaller first or larger first). Note that entries returned in reverse direction are in - * lexicographic order for the inverted alphabet, which is not the same as being presented in reverse. For example, - * a cursor for a trie containing "ab", "abc" and "cba", will visit the nodes in order "cba", "ab", "abc", i.e. - * prefixes will still be reported before their descendants. - */ - protected interface Cursor - { - - /** - * @return the current descend-depth; 0, if the cursor has just been created and is positioned on the root, - * and -1, if the trie has been exhausted. - */ - int depth(); - - /** - * @return the last transition taken; if positioned on the root, return -1 - */ - int incomingTransition(); - - /** - * @return the content associated with the current node. This may be non-null for any presented node, including - * the root. - */ - T content(); - - /** - * Returns the direction in which this cursor is progressing. - */ - Direction direction(); - - /** - * Returns the byte-comparable version that this trie uses. - */ - ByteComparable.Version byteComparableVersion(); - - /** - * Advance one position to the node whose associated path is next lexicographically. - * This can be either:

      - *
    • descending one level to the first child of the current node, - *
    • ascending to the closest parent that has remaining children, and then descending one level to its next - * child. - *
    - * It is an error to call this after the trie has already been exhausted (i.e. when depth() == -1); - * for performance reasons we won't always check this. - * - * @return depth (can be prev+1 or <=prev), -1 means that the trie is exhausted - */ - int advance(); - - /** - * Advance, descending multiple levels if the cursor can do this for the current position without extra work - * (e.g. when positioned on a chain node in a memtable trie). If the current node does not have children this - * is exactly the same as advance(), otherwise it may take multiple steps down (but will not necessarily, even - * if they exist). - *

    - * Note that if any positions are skipped, their content must be null. - *

    - * This is an optional optimization; the default implementation falls back to calling advance. - *

    - * It is an error to call this after the trie has already been exhausted (i.e. when depth() == -1); - * for performance reasons we won't always check this. - * - * @param receiver object that will receive all transitions taken except the last; - * on ascend, or if only one step down was taken, it will not receive any - * @return the new depth, -1 if the trie is exhausted - */ - default int advanceMultiple(TransitionsReceiver receiver) - { - return advance(); - } - - /** - * Advance all the way to the next node with non-null content. - *

    - * It is an error to call this after the trie has already been exhausted (i.e. when depth() == -1); - * for performance reasons we won't always check this. - * - * @param receiver object that will receive all taken transitions - * @return the content, null if the trie is exhausted - */ - default T advanceToContent(ResettingTransitionsReceiver receiver) - { - int prevDepth = depth(); - while (true) - { - int currDepth = advanceMultiple(receiver); - if (currDepth <= 0) - return null; - if (receiver != null) - { - if (currDepth <= prevDepth) - receiver.resetPathLength(currDepth - 1); - receiver.addPathByte(incomingTransition()); - } - T content = content(); - if (content != null) - return content; - prevDepth = currDepth; - } - } - - /** - * Advance to the specified depth and incoming transition or the first valid position that is after the specified - * position. The inputs must be something that could be returned by a single call to {@link #advance} (i.e. - * {@code depth} must be <= current depth + 1, and {@code incomingTransition} must be higher than what the - * current state saw at the requested depth. - * - * @return the new depth, always <= previous depth; -1 if the trie is exhausted - */ - int skipTo(int skipDepth, int skipTransition); - - /** - * Descend into the cursor with the given path. - * - * @return True if the descent is positioned at the end of the given path, false if the trie did not have a path - * for it. In the latter case the cursor is positioned at the first node that follows the given key in iteration - * order. - */ - default boolean descendAlong(ByteSource bytes) - { - int next = bytes.next(); - int depth = depth(); - while (next != ByteSource.END_OF_STREAM) - { - if (skipTo(++depth, next) != depth || incomingTransition() != next) - return false; - next = bytes.next(); - } - return true; - } - - /** - * Returns a tail trie, i.e. a trie whose root is the current position. 
Walking a tail trie will list all - * descendants of the current position with depth adjusted by the current depth. - *

    - * It is an error to call tailTrie on an exhausted cursor. - */ - Trie tailTrie(); - } - - protected abstract Cursor cursor(Direction direction); - - /** - * Used by {@link Cursor#advanceMultiple} to feed the transitions taken. - */ - protected interface TransitionsReceiver - { - /** Add a single byte to the path. */ - void addPathByte(int nextByte); - /** Add the count bytes from position pos in the given buffer. */ - void addPathBytes(DirectBuffer buffer, int pos, int count); - } - - /** - * Used by {@link Cursor#advanceToContent} to track the transitions and backtracking taken. - */ - protected interface ResettingTransitionsReceiver extends TransitionsReceiver - { - /** Delete all bytes beyond the given length. */ - void resetPathLength(int newLength); - } - - /** - * A push interface for walking over the trie. Builds upon TransitionsReceiver to be given the bytes of the - * path, and adds methods called on encountering content and completion. - * See {@link TrieDumper} for an example of how this can be used, and {@link TrieEntriesWalker} as a base class - * for other common usages. - */ - protected interface Walker extends ResettingTransitionsReceiver - { - /** Called when content is found. */ - void content(T content); - - /** Called at the completion of the walk. */ - R complete(); - } - - /** - * Adapter interface providing the methods a {@link Walker} to a {@link Consumer}, so that the latter can be used - * with {@link #process}. - * - * This enables calls like - * trie.forEachEntry(x -> System.out.println(x)); - * to be mapped directly to a single call to {@link #process} without extra allocations. 
- */ - public interface ValueConsumer extends Consumer, Walker - { - @Override - default void content(T content) - { - accept(content); - } - - @Override - default Void complete() - { - return null; - } - - @Override - default void resetPathLength(int newDepth) - { - // not tracking path - } - - @Override - default void addPathByte(int nextByte) - { - // not tracking path - } - - @Override - default void addPathBytes(DirectBuffer buffer, int pos, int count) - { - // not tracking path - } - } - - /** - * Call the given consumer on all content values in the trie in order. - */ - public void forEachValue(ValueConsumer consumer) - { - process(consumer, Direction.FORWARD); - } - - /** - * Call the given consumer on all (path, content) pairs with non-null content in the trie in order. - */ - public void forEachEntry(BiConsumer consumer) - { - forEachEntry(Direction.FORWARD, consumer); - } - - /** - * Call the given consumer on all (path, content) pairs with non-null content in the trie in order. - */ - public void forEachEntry(Direction direction, BiConsumer consumer) - { - Cursor cursor = cursor(direction); - process(new TrieEntriesWalker.WithConsumer(consumer, cursor.byteComparableVersion()), cursor); - // Note: we can't do the ValueConsumer trick here, because the implementation requires state and cannot be - // implemented with default methods alone. - } - - /** - * Process the trie using the given Walker. 
- */ - public R process(Walker walker, Direction direction) - { - return process(walker, cursor(direction)); - } - - static R process(Walker walker, Cursor cursor) - { - assert cursor.depth() == 0 : "The provided cursor has already been advanced."; - T content = cursor.content(); // handle content on the root node - if (content == null) - content = cursor.advanceToContent(walker); - - while (content != null) - { - walker.content(content); - content = cursor.advanceToContent(walker); - } - return walker.complete(); - } - - - /** - * Process the trie using the given ValueConsumer, skipping all branches below the top content-bearing node. - */ - public Void forEachValueSkippingBranches(Direction direction, ValueConsumer consumer) - { - return processSkippingBranches(consumer, cursor(direction)); - } - - /** - * Call the given consumer on all (path, content) pairs with non-null content in the trie in order, skipping all - * branches below the top content-bearing node. - */ - public void forEachEntrySkippingBranches(Direction direction, BiConsumer consumer) - { - Cursor cursor = cursor(direction); - processSkippingBranches(new TrieEntriesWalker.WithConsumer(consumer, cursor.byteComparableVersion()), cursor); - // Note: we can't do the ValueConsumer trick here, because the implementation requires state and cannot be - // implemented with default methods alone. - } - - /** - * Process the trie using the given Walker, skipping all branches below the top content-bearing node. 
- */ - public R processSkippingBranches(Walker walker, Direction direction) - { - return processSkippingBranches(walker, cursor(direction)); - } - - static R processSkippingBranches(Walker walker, Cursor cursor) - { - assert cursor.depth() == 0 : "The provided cursor has already been advanced."; - T content = cursor.content(); // handle content on the root node - if (content != null) - { - walker.content(content); - return walker.complete(); - } - content = cursor.advanceToContent(walker); - - while (content != null) - { - walker.content(content); - if (cursor.skipTo(cursor.depth(), cursor.incomingTransition() + cursor.direction().increase) < 0) - break; - walker.resetPathLength(cursor.depth() - 1); - walker.addPathByte(cursor.incomingTransition()); - content = cursor.content(); - if (content == null) - content = cursor.advanceToContent(walker); - } - return walker.complete(); - } - - /** - * Map-like get by key. - */ - public T get(ByteComparable key) - { - Cursor cursor = cursor(Direction.FORWARD); - if (cursor.descendAlong(key.asComparableBytes(cursor.byteComparableVersion()))) - return cursor.content(); - else - return null; - } - - /** - * Constuct a textual representation of the trie. - */ - public String dump() - { - return dump(Object::toString); - } - - /** - * Constuct a textual representation of the trie using the given content-to-string mapper. - */ - public String dump(Function contentToString) - { - return process(new TrieDumper<>(contentToString), Direction.FORWARD); - } - - /** - * Returns a singleton trie mapping the given byte path to content. - */ - public static Trie singleton(ByteComparable b, ByteComparable.Version byteComparableVersion, T v) - { - return new SingletonTrie<>(b, byteComparableVersion, v); - } - - /** - * Returns a view of the subtrie containing everything in this trie whose keys fall between the given boundaries. - * The view is live, i.e. any write to the source will be reflected in the subtrie. 
- * - * This method will not check its arguments for correctness. The resulting trie may be empty or throw an exception - * if the right bound is smaller than the left. - * - * @param left the left bound for the returned subtrie. If {@code null}, the resulting subtrie is not left-bounded. - * @param includeLeft whether {@code left} is an inclusive bound of not. - * @param right the right bound for the returned subtrie. If {@code null}, the resulting subtrie is not right-bounded. - * @param includeRight whether {@code right} is an inclusive bound of not. - * @return a view of the subtrie containing all the keys of this trie falling between {@code left} (inclusively if - * {@code includeLeft}) and {@code right} (inclusively if {@code includeRight}). - */ - public Trie subtrie(ByteComparable left, boolean includeLeft, ByteComparable right, boolean includeRight) - { - if (left == null && right == null) - return this; - return new SlicedTrie<>(this, left, includeLeft, right, includeRight); - } - - /** - * Returns a view of the subtrie containing everything in this trie whose keys fall between the given boundaries. - * The view is live, i.e. any write to the source will be reflected in the subtrie. - * - * @param left the left bound for the returned subtrie, inclusive. If {@code null}, the resulting subtrie is not - * left-bounded. - * @param right the right bound for the returned subtrie, exclusive. If {@code null}, the resulting subtrie is not - * right-bounded. - * @return a view of the subtrie containing all the keys of this trie falling between {@code left} inclusively and - * {@code right} exclusively. - */ - public Trie subtrie(ByteComparable left, ByteComparable right) - { - return subtrie(left, true, right, false); - } - - /** - * Returns the ordered entry set of this trie's content as an iterable. - */ - public Iterable> entrySet() - { - return this::entryIterator; - } - - /** - * Returns the ordered entry set of this trie's content as an iterable. 
- */ - public Iterable> entrySet(Direction direction) - { - return () -> entryIterator(direction); - } - - /** - * Returns the ordered entry set of this trie's content in an iterator. - */ - public Iterator> entryIterator() - { - return entryIterator(Direction.FORWARD); - } - - /** - * Returns the ordered entry set of this trie's content in an iterator. - */ - public Iterator> entryIterator(Direction direction) - { - return new TrieEntriesIterator.AsEntries<>(cursor(direction)); - } - - /** - * Returns the ordered entry set of this trie's content in an iterable, filtered by the given type. - */ - public Iterable> filteredEntrySet(Class clazz) - { - return filteredEntrySet(Direction.FORWARD, clazz); - } - - /** - * Returns the ordered entry set of this trie's content in an iterable, filtered by the given type. - */ - public Iterable> filteredEntrySet(Direction direction, Class clazz) - { - return () -> filteredEntryIterator(direction, clazz); - } - - /** - * Returns the ordered entry set of this trie's content in an iterator, filtered by the given type. - */ - public Iterator> filteredEntryIterator(Direction direction, Class clazz) - { - return new TrieEntriesIterator.AsEntriesFilteredByType<>(cursor(direction), clazz); - } - - /** - * Returns the ordered set of values of this trie as an iterable. - */ - public Iterable values() - { - return this::valueIterator; - } - - /** - * Returns the ordered set of values of this trie as an iterable. - */ - public Iterable values(Direction direction) - { - return direction.isForward() ? this::valueIterator : this::reverseValueIterator; - } - - /** - * Returns the ordered set of values of this trie in an iterator. - */ - public Iterator valueIterator() - { - return valueIterator(Direction.FORWARD); - } - - /** - * Returns the inversely ordered set of values of this trie in an iterator. 
- */ - public Iterator reverseValueIterator() - { - return valueIterator(Direction.REVERSE); - } - - /** - * Returns the ordered set of values of this trie in an iterator. - */ - public Iterator valueIterator(Direction direction) - { - return new TrieValuesIterator<>(cursor(direction)); - } - - /** - * Returns the ordered set of values of this trie in an iterable, filtered by the given type. - */ - public Iterable filteredValues(Class clazz) - { - return filteredValues(Direction.FORWARD, clazz); - } - - /** - * Returns the ordered set of values of this trie in an iterable, filtered by the given type. - */ - public Iterable filteredValues(Direction direction, Class clazz) - { - return () -> filteredValuesIterator(direction, clazz); + boolean DEBUG = CassandraRelevantProperties.TRIE_DEBUG.getBoolean(); + + /// Returns a singleton trie mapping the given byte path to content. + static Trie singleton(ByteComparable b, ByteComparable.Version byteComparableVersion, T v) + { + return dir -> new SingletonCursor<>(dir, + b.asComparableBytes(byteComparableVersion), + byteComparableVersion, + v); + } + + /// Returns a singleton trie mapping the given byte path to content. + /// This singleton is ordered, which means that the content will be presented in lexicographic order in both + /// directions, i.e. before any content from descendants in the forward direction, and after any content from + /// descendents in the reverse. + static Trie singletonOrdered(ByteComparable b, ByteComparable.Version byteComparableVersion, T v) + { + return dir -> new SingletonOrderedCursor<>(dir, + b.asPeekableBytes(byteComparableVersion), + byteComparableVersion, + !dir.isForward(), + v); + } + + @Override + default Trie intersect(TrieSet set) + { + return dir -> new IntersectionCursor.Plain<>(cursor(dir), set.cursor(dir)); + } + + + /// Returns a view of this trie that is an intersection of its content with the given set. 
Unlike the normal + /// `intersect`, this version will only present content that is within the boundaries of the set, hiding content + /// that is present at prefixes. + /// + /// This method is most useful for ordered tries (i.e. tries where prefix content is presented on the descent path + /// in forward iteration, but on the ascent path in reverse so that it follows all descendants), where it can be + /// used to list the content that is within the set (see also [#slice] below). + /// + /// The view is live, i.e. any write to the source will be reflected in the intersection. + default Trie intersectSlicing(TrieSet set) + { + return dir -> new IntersectionCursor.PlainSlice<>(cursor(dir), set.cursor(dir)); + } + + /// A version of subtrie that lists content falling between two bounds in lexicographic order. Unlike `subtrie`, + /// prefixes and descendants of the boundaries are reported only if they fall in the span (i.e. are prefixes of the + /// right bound or descendants of the left bound). + /// + /// Note that for this to work correctly in reverse, the trie must be "ordered", i.e. present content on the ascent + /// path for the reverse iteration direction (e.g. [InMemoryTrie#shortLivedOrdered]). + /// + /// For example, `slice(2020, false, 4040, true)` excludes `20`, `2020` and `404040` but includes `202020`, `40` and + /// `4040` among others. + default Trie slice(ByteComparable left, boolean inclusiveLeft, ByteComparable right, boolean inclusiveRight) + { + return dir -> { + Cursor cursor = cursor(dir); + /// For lexicographic order the "after" position (used for exclusive start / inclusive end) needs to be + /// after the specific node but before its children. We do this by always using inclusive-start-exclusive-end + /// trie sets, but we form "after" positions by adding a 00 byte to the key. 
+ return new IntersectionCursor.PlainSlice<>(cursor, + RangesCursor.create(dir, + cursor.byteComparableVersion(), + true, + false, + maybeAdd0(left, !inclusiveLeft), + maybeAdd0(right, inclusiveRight))); + }; } - /** - * Returns the ordered set of values of this trie in an iterator, filtered by the given type. - */ - public Iterator filteredValuesIterator(Direction direction, Class clazz) + private static ByteComparable maybeAdd0(ByteComparable byteComparable, boolean shouldAdd) { - return new TrieValuesIterator.FilteredByType<>(cursor(direction), clazz); + return byteComparable != null && shouldAdd ? v -> ByteSource.append(byteComparable.asComparableBytes(v), 0) : byteComparable; } - /** - * Returns the values in any order. For some tries this is much faster than the ordered iterable. - */ - public Iterable valuesUnordered() + /// Returns the values in any order. For some tries this is much faster than the ordered iterable. + default Iterable valuesUnordered() { return values(); } - /** - * Resolver of content of merged nodes, used for two-source merges (i.e. mergeWith). - */ - public interface MergeResolver + /// Resolver of content of merged nodes, used for two-source merges (i.e. mergeWith). + interface MergeResolver { // Note: No guarantees about argument order. // E.g. during t1.mergeWith(t2, resolver), resolver may be called with t1 or t2's items as first argument. T resolve(T b1, T b2); } - /** - * Constructs a view of the merge of this trie with the given one. The view is live, i.e. any write to any of the - * sources will be reflected in the merged view. - * - * If there is content for a given key in both sources, the resolver will be called to obtain the combination. - * (The resolver will not be called if there's content from only one source.) - */ - public Trie mergeWith(Trie other, MergeResolver resolver) + /// Constructs a view of the merge of this trie with the given one. The view is live, i.e. 
any write to any of the + /// sources will be reflected in the merged view. + /// + /// If there is content for a given key in both sources, the resolver will be called to obtain the combination. + /// (The resolver will not be called if there's content from only one source.) + default Trie mergeWith(Trie other, MergeResolver resolver) + { + return dir -> new MergeCursor.Plain<>(resolver, this.cursor(dir), other.cursor(dir)); + } + + /// Constructs a view of the merge of this trie with the given one, applying a transformation over all values. + /// The view is live, i.e. any write to any of the sources will be reflected in the merged view. + /// + /// The resolver will be called for all content in any of the two source to transform it to the output type, + /// and one of its arguments will be null if the other source has no matching content. + default + Trie mappingMergeWith(Trie other, BiFunction resolver) { - return new MergeTrie<>(resolver, this, other); + return dir -> new MergeCursor.PlainMapping<>(resolver, + cursor(dir), + other.cursor(dir)); } - /** - * Resolver of content of merged nodes. - * - * The resolver's methods are only called if more than one of the merged nodes contain content, and the - * order in which the arguments are given is not defined. Only present non-null values will be included in the - * collection passed to the resolving methods. - * - * Can also be used as a two-source resolver. - */ - public interface CollectionMergeResolver extends MergeResolver + /// Resolver of content of merged nodes. + /// + /// The resolver's methods are only called if more than one of the merged nodes contain content, and the + /// order in which the arguments are given is not defined. Only present non-null values will be included in the + /// collection passed to the resolving methods. + /// + /// Can also be used as a two-source resolver. 
+ interface CollectionMergeResolver extends MergeResolver { T resolve(Collection contents); @@ -711,7 +194,33 @@ default T resolve(T c1, T c2) } } - private static final CollectionMergeResolver THROWING_RESOLVER = new CollectionMergeResolver() + /// Constructs a view of the merge of multiple tries. The view is live, i.e. any write to any of the + /// sources will be reflected in the merged view. + /// + /// If there is content for a given key in more than one sources, the resolver will be called to obtain the + /// combination. (The resolver will not be called if there's content from only one source.) + static Trie merge(Collection> sources, CollectionMergeResolver resolver) + { + switch (sources.size()) + { + case 0: + throw new AssertionError(); + case 1: + return sources.iterator().next(); + case 2: + { + Iterator> it = sources.iterator(); + Trie t1 = it.next(); + Trie t2 = it.next(); + return t1.mergeWith(t2, resolver); + } + default: + return dir -> new CollectionMergeCursor.Plain<>(resolver, dir, sources, Trie::cursor); + } + } + + /// Not to be used directly, call [#throwingResolver()] instead. + CollectionMergeResolver THROWING_RESOLVER = new CollectionMergeResolver<>() { @Override public Object resolve(Collection contents) @@ -725,50 +234,41 @@ private AssertionError error() } }; - /** - * Returns a resolver that throws whenever more than one of the merged nodes contains content. - * Can be used to merge tries that are known to have distinct content paths. - */ + /// Returns a resolver that throws whenever more than one of the merged nodes contains content. + /// Can be used to merge tries that are known to have distinct content paths. @SuppressWarnings("unchecked") - public static CollectionMergeResolver throwingResolver() + static CollectionMergeResolver throwingResolver() { return (CollectionMergeResolver) THROWING_RESOLVER; } - /** - * Constructs a view of the merge of multiple tries. The view is live, i.e. 
any write to any of the - * sources will be reflected in the merged view. - * - * If there is content for a given key in more than one sources, the resolver will be called to obtain the - * combination. (The resolver will not be called if there's content from only one source.) - */ - public static Trie merge(Collection> sources, CollectionMergeResolver resolver) + /// Constructs a view of the merge of two tries, where each source must have distinct keys. The view is live, i.e. + /// any write to any of the sources will be reflected in the merged view. + /// + /// If there is content for a given key in more than one sources, the merge will throw an assertion error. + static Trie mergeDistinct(Trie t1, Trie t2) { - switch (sources.size()) + return new Trie<>() { - case 0: - throw new AssertionError(); - case 1: - return sources.iterator().next(); - case 2: - { - Iterator> it = sources.iterator(); - Trie t1 = it.next(); - Trie t2 = it.next(); - return t1.mergeWith(t2, resolver); - } - default: - return new CollectionMergeTrie<>(sources, resolver); - } + @Override + public Cursor makeCursor(Direction direction) + { + return new MergeCursor.Plain<>(throwingResolver(), t1.cursor(direction), t2.cursor(direction)); + } + + @Override + public Iterable valuesUnordered() + { + return Iterables.concat(t1.valuesUnordered(), t2.valuesUnordered()); + } + }; } - /** - * Constructs a view of the merge of multiple tries, where each source must have distinct keys. The view is live, - * i.e. any write to any of the sources will be reflected in the merged view. - * - * If there is content for a given key in more than one sources, the merge will throw an assertion error. - */ - public static Trie mergeDistinct(Collection> sources) + /// Constructs a view of the merge of multiple tries, where each source must have distinct keys. The view is live, + /// i.e. any write to any of the sources will be reflected in the merged view. 
+ /// + /// If there is content for a given key in more than one sources, the merge will throw an assertion error. + static Trie mergeDistinct(Collection> sources) { switch (sources.size()) { @@ -781,114 +281,75 @@ public static Trie mergeDistinct(Collection> sources) Iterator> it = sources.iterator(); Trie t1 = it.next(); Trie t2 = it.next(); - return new MergeTrie.Distinct<>(t1, t2); + return mergeDistinct(t1, t2); } default: - return new CollectionMergeTrie.Distinct<>(sources); - } - } + return new Trie<>() + { + @Override + public Cursor makeCursor(Direction direction) + { + return new CollectionMergeCursor.Plain<>(Trie.throwingResolver(), direction, sources, Trie::cursor); + } - /** - * Returns a Trie that is a view of this one, where the given prefix is prepended before the root. - */ - public Trie prefixedBy(ByteComparable prefix) - { - return new PrefixedTrie(prefix, this); + @Override + public Iterable valuesUnordered() + { + return Iterables.concat(Iterables.transform(sources, Trie::valuesUnordered)); + } + }; + } } - /** - * Returns an entry set containing all tail tree constructed at the points that contain content of - * the given type. - */ - public Iterable>> tailTries(Direction direction, Class clazz) + @Override + default Trie prefixedBy(ByteComparable prefix) { - return () -> new TrieTailsIterator.AsEntries<>(cursor(direction), clazz); + return dir -> new PrefixedCursor.Plain<>(prefix, cursor(dir)); } - /** - * Returns a trie that corresponds to the branch of this trie rooted at the given prefix. - *

    - * The result will include the same values as {@code subtrie(prefix, nextBranch(prefix))}, but the keys in the - * resulting trie will not include the prefix. In other words, - * {@code tailTrie(prefix).prefixedBy(prefix) = subtrie(prefix, nextBranch(prefix))} - * where nextBranch stands for the key adjusted by adding one at the last position. - */ - public Trie tailTrie(ByteComparable prefix) + @Override + default Trie tailTrie(ByteComparable prefix) { Cursor c = cursor(Direction.FORWARD); if (c.descendAlong(prefix.asComparableBytes(c.byteComparableVersion()))) - return c.tailTrie(); + return c::tailCursor; else return null; } - public static Trie empty(ByteComparable.Version byteComparableVersion) + @Override + default Iterable>> tailTries(Direction direction, Predicate predicate) { - return new Trie() - { - public Cursor cursor(Direction dir) - { - return new EmptyCursor<>(dir, byteComparableVersion); - } - }; + return () -> new TrieTailsIterator.AsEntries<>(cursor(direction), predicate); } - static class EmptyCursor implements Cursor + /// Returns a view of this trie where all content is processed through the given mapping function. 
+ default Trie mapValues(Function mapper) { - private final Direction direction; - private final ByteComparable.Version byteComparableVersion; - int depth; - - public EmptyCursor(Direction direction, ByteComparable.Version byteComparableVersion) - { - this.direction = direction; - this.byteComparableVersion = byteComparableVersion; - depth = 0; - } - - public int advance() - { - return depth = -1; - } - - public int skipTo(int skipDepth, int skipTransition) - { - return depth = -1; - } - - public ByteComparable.Version byteComparableVersion() - { - if (byteComparableVersion != null) - return byteComparableVersion; - throw new AssertionError(); - } - - @Override - public Trie tailTrie() - { - assert depth == 0 : "tailTrie called on exhausted cursor"; - return empty(byteComparableVersion); - } + return dir -> new ContentMappingCursor.Plain<>(mapper, cursor(dir)); + } - public int depth() - { - return depth; - } + static Trie empty(ByteComparable.Version byteComparableVersion) + { + return dir -> new Cursor.Empty<>(dir, byteComparableVersion); + } - public T content() - { - return null; - } + // The methods below form the non-public implementation, whose visibility is restricted to package-level. + // The warning suppression below is necessary because we cannot limit the visibility of an interface method. + // We need an interface to be able to implement trie methods by lambdas, which is heavily used above. - @Override - public Direction direction() - { - return direction; - } + /// Implement this method to provide the concrete trie implementation as the cursor that presents it, most easily + /// done via a lambda as in the methods above. + //noinspection ClassEscapesDefinedScope + Cursor makeCursor(Direction direction); - public int incomingTransition() - { - return -1; - } + /// @inheritDoc This method's implementation uses [#makeCursor] to get the cursor and may apply additional cursor + /// checks for tests that run with verification enabled. 
+ //noinspection ClassEscapesDefinedScope + @Override + default Cursor cursor(Direction direction) + { + return DEBUG ? new VerificationCursor.Plain<>(makeCursor(direction)) + : makeCursor(direction); } } diff --git a/src/java/org/apache/cassandra/db/tries/Trie.md b/src/java/org/apache/cassandra/db/tries/Trie.md index a482d7dc4a80..f1fce3c191ae 100644 --- a/src/java/org/apache/cassandra/db/tries/Trie.md +++ b/src/java/org/apache/cassandra/db/tries/Trie.md @@ -139,16 +139,16 @@ and provides methods for advancing to the next position. This is enough informa also to easily compare cursors over different tries that are advanced together. Advancing is always done in order; if one imagines the set of nodes in the trie with their associated paths, a cursor may only advance from a node with a lexicographically smaller path to one with bigger. The `advance` operation moves to the immediate -next, it is also possible to skip over some items e.g. all children of the current node (`skipChildren`). +next, it is also possible to skip over some items to a specific position ahead (`skipTo`). Moving to the immediate next position in the lexicographic order is accomplished by: - if the current node has children, moving to its first child; - otherwise, ascend the parent chain and return the next child of the closest parent that still has any. As long as the trie is not exhausted, advancing always takes one step down, from the current node, or from a node -on the parent chain. By comparing the new depth (which `advance` also returns) with the one before the advance, -one can tell if the former was the case (if `newDepth == oldDepth + 1`) and how many steps up we had to take -(`oldDepth + 1 - newDepth`). When following a path down, the cursor will stop on all prefixes. +on the parent chain. 
By comparing the new depth with the one before the advance, one can tell if the former was +the case (if `newDepth == oldDepth + 1`) or how many steps up we had to take (`oldDepth + 1 - newDepth`) if it +wasn't. When following a path down, the cursor will stop on all prefixes. In addition to the single-step `advance` method, the cursor also provides an `advanceMultiple` method for descending multiple steps down when this is known to be efficient. If it is not feasible to descend (e.g. because there are no @@ -203,9 +203,88 @@ have changed, and 3 is true because it has a false premise. The same argument holds when `b` is the smaller cursor to be advanced. +### `encodedPosition` for efficient advance and position comparison + +The cursor definitions above were given with separate `depth` and `incomingTransition` methods, but in practice we +combine the features of the current position into a single long integer that encodes the combination of the two. This +saves some method calls which often tend to be megamorphic, and also makes some checks more efficient. + +The encoded state is prepared in such a way that it is trivially easy to compare the position of two cursors in parallel +walks: if the encoded state of one cursor is smaller than the encoded state of another, then it is positioned before it. +Recall that for a cursor to be positioned before another means either that its depth is higher than the other's, or that +depths match, and the incoming character is smaller. We can ensure that we can trivially compare by composing a long that +has the inverse of the depth as its most significant bits, and the incoming character in some of its less-significant +ones. + +More precisely, in bits 32 to 63 of the `encodedPosition` long we store `-depth`, and in bits 20 to 27 – +`incomingTransition`. Some of the other bits have meanings that are described in the paragraphs below, and others +are reserved for future use (e.g.
we could set a bit to signify that the node at the +current position may have content to drastically reduce the number of `content` calls a consumer needs to do). +The `Cursor` class implements methods to compose and decompose encoded states, as well as to perform common checks +(e.g. whether an advance descended into a child position) and to prepare certain positions for skipping (e.g. over +the current branch). + +### Reverse iteration + +Tries and trie cursors support reverse iteration. Because cursors have to stop on prefixes before visiting longer +sequences, the cursor walks described above performed in reverse have to differ from the reverse lexicographic order +because they will visit prefixes first (e.g. a trie representing the list "a", "ab", "c" will be walked as "c", "a", +"ab" in the reverse direction which is not the same as the reversed list). More precisely, reversed cursor walks as +described above will present data in lexicographic order of the inverted alphabet. + +This difference is immaterial if the data in the trie is guaranteed to be prefix-free (e.g. when it is given by +Cassandra's byte-comparable type translation) and makes it possible to store metadata that describes features of the +descendant branch at any point in the trie and have that metadata presented correctly during reverse walks. + +To make it easier to manipulate `encodedPosition` for reverse iteration, we use the 31st bit in the encoded state to +distinguish between forward and reverse iteration, and flip the bits of `incomingTransition` to make it possible to use +the same encoded state comparison for both iteration directions (because higher `incomingTransition` values should be +ordered before lower ones in reverse walks). + +### Stopping on the ascent path + +In some scenarios we may need reversed iteration that correctly presents reverse lexicographic order (e.g. SAI stores +terms in tries directly and is disadvantaged if the terms are not ordered correctly in reverse). 
For this reason, as +well as to support some needs of the set and range functionality that will be described later, the cursor interfaces +support stopping on a node on the ascent (i.e. return) path to present content. We call tries that present prefix +content on the return path in reverse "ordered tries". + +For example, the "a", "ab" and "c" example above will be iterated in forward direction as +``` +a -> value for "a" + b -> value for "ab" +c -> value for "c" +``` +with normal reverse iteration +``` +c -> value for "c" +a -> value for "a" + b -> value for "ab" +``` + +In an ordered trie, the reverse iteration changes to +``` +c -> value for "c" +a -> + b -> value for "ab" +a↑ -> value for "a" +``` + +The ascent path position (marked by the upwards arrow ↑ above) is one that is at the same depth as the matching descent +position, has the same incoming character and compares greater than it. This has the effect of being immediately after all +children of the node in cursor iteration order. In practical terms, we implement it by using bit 19 in the +`encodedPosition`. It is set to 0 for the descent path and 1 for the ascent path; unlike transition bits this is not +flipped for reverse iteration to ensure that it is visited after the descent path entry. + +Return path positions are valid positions for jumps, and will be stopped on if a `skipTo` call finds one to be closest +to the requested position. They cannot have children. + +Their usages will be further detailed in the [sections on sets](#trie-sets); there are also +[alternative approaches we considered during development](#return-stop-alternatives). + ## Merging two tries -Two tries can be merged using `Trie.mergeWith`, which is implemented using the class `MergeTrie`. The implementation +Two tries can be merged using `Trie.mergeWith`, which is implemented using the class `MergeCursor`.
The implementation is a straightforward application of the parallel walking scheme above, where the merged cursor presents the depth and incoming transition of the currently smaller cursor, and advances by advancing the smaller cursor, or both if they are equal. @@ -218,7 +297,7 @@ the other through "bb" — condition 2. is violated, the latter will have hi ## Merging an arbitrary number of tries -Merging is extended to an arbitrary number of sources in `CollectionMergeTrie`, used through the static `Trie.merge`. +Merging is extended to an arbitrary number of sources in `CollectionMergeCursor`, used through the static `Trie.merge`. The process is a generalization of the above, implemented using the min-heap solution from `MergeIterator` applied to cursors. @@ -229,34 +308,720 @@ descendants) at the expense of possibly adding one additional comparison in the As above, when we know that the head element is not equal to the heap top (i.e. it's necessarily smaller) we can use its `advanceMultiple` safely. -## Slicing tries +# Trie sets + +The simplest way to implement a set in the trie paradigm is to define +an infinite trie that returns `true` for all positions that are covered by the set. Such a set is very easy to define +and apply, but unfortunately is not at all efficient because an intersection must necessarily walk the set cursor for +every covered position, which introduces a lot of overhead and makes it impossible to apply efficiency improvements such +as `advanceMultiple`. + +Instead, our trie sets (defined in `TrieSet/TrieSetCursor`) implement sets of ranges of keys by listing the boundaries of +each range and their prefixes. This makes it possible to identify fully contained regions of the set and proceed inside +such regions without touching the set cursor. + +Trie set cursors specify a `state` at any position they list. This state includes information about the inclusion of +trie branches before and after the listed position in iteration order. 
When we are applying a set to a trie (i.e. +intersecting the trie with it), we would walk the two cursors in parallel. If the set moves ahead, we use the state to +determine whether the position of the trie cursor is covered by the set. Similarly, when a `skipTo` is performed on the +set, the same state flags can tell us if the set covers the position we attempted to skip to, when the set cursor does +not have an exact match and skips over the requested position. + +To support all forms of inclusivity and prefixes in the definition of the sets, set states can be presented both on the +descent and the ascent path of the walk. The preceding side of a descent path state applies to positions before the +node and its branch, while the succeeding side applies to the current node and positions greater than it in iteration +order, starting with its first children; similarly the preceding side of an ascent path state applies to preceding +positions including the last children of the current node as well as any ascent path content for the current node, and +the succeeding side applies to the positions that follow the node and branch. + + +## Trie set content + +Trie sets list the boundary points for the represented ranges. For example, the range `[abc, ade)` will be represented +by the trie +``` +a -> + b -> + c -> START + d -> + e -> END +``` +where `START` is a state marking a left boundary, and `END` marks a right boundary. To be able to easily say that e.g. +`aa` is not covered by the set, but `ac` is, especially if we jump to these positions using a `skipTo` call, nodes on +any prefix path inside a covered range must also present a state that marks the node as contained on both sides. For +code simplicity sets also produce a state for prefixes that are fully outside the set. 
+ +The full state trie for the above example when walked in the forward direction is +``` +a -> NOT_CONTAINED + b -> NOT_CONTAINED + c -> START + d -> CONTAINED + e -> END +``` +The "contained" states are not reported by `content()`, but they are used to determine the inclusion of preceding +positions in the set. For example, when a cursor follows a `skipTo` instruction to jump to "aa" and lands on "ab", the +preceding side of `NOT_CONTAINED` tells it that the position the caller tried to skip to is not inside the set; on the +other hand, jumping to "ac" would land in "ad", whose `CONTAINED` state's preceding side tells us that the queried +position is covered by the set. + +A set state only needs to specify a boolean for its two sides that specify whether the relevant positions are included +in the set. There are, thus, four possible set states: +- `NOT_CONTAINED` whose left and right side are both false, +- `CONTAINED` with true on both sides, +- `START` has false on the left and true on the right side, +- `END` is true on the left and false on the right side. + +The left and right side of a state are presented, respectively, as preceding and succeeding when the cursor iterates in +the forward direction. In reverse, the right side is preceding and the left is the succeeding side. Note that when we +iterate a set in the reverse direction, the representation will differ in the states that it will return for some +prefixes of the boundaries, and in the presentation of boundaries. The example above is iterated as: +``` +a -> NOT_CONTAINED + d -> NOT_CONTAINED + e↑ -> END + b -> CONTAINED + c↑ -> START +``` +Note that for the content-bearing nodes (i.e. the boundaries "abc" and "ade"), the `state` is the same, but it is now +presented on the return path to state that the boundary applies after the specific position in iteration order. 
For the +prefixes "ad" and "ab" we must return different states in the two directions to ensure, e.g., that skipping to "ac" +(which ends up in "ad" in the forward direction and "ab" in the reverse) correctly interprets the position to be covered +by the set. + +The example above is bounded by positions to the left of a given node, which are returned on the descent path in forward +direction, and on the ascent path in the reverse. In addition to this, our trie sets support positions to the right of a +node, to be able to specify sets that cover branches of the trie. + +For example, the branch set `[abc, abc]` is represented in forward direction by the trie +``` +a -> NOT_CONTAINED + b -> NOT_CONTAINED + c -> START + c↑ -> END +``` +and +``` +a -> NOT_CONTAINED + b -> NOT_CONTAINED + c -> END + c↑ -> START +``` +in the reverse. To give this in a shorter form, ignoring the prefix states which can be inferred, this set is +represented by the trie +``` +a -> + b -> + c< -> START + c> -> END +``` +where the `<` modifier in `c<` stands for the state that is returned on the descent path in forward direction, and on +the ascent path in the reverse, and vice versa for `>` in `c>`. + +A few more complex examples: +- `[abd, adc] + (ade, afg)`: + ``` + a -> + b -> + d< -> START + d -> + c> -> END + e> -> START + f -> + g< -> END + ``` +- `[a, abc] + [ade, a]` (in other words `[a, a] - (abc, ade)`): + ``` + a< -> START + b -> + c> -> END + d -> + e< -> START + a> -> END + ``` + +## Converting ranges to trie sets + +The main usage of a trie set is to return subtries bounded by one or more key ranges. We achieve this as the +intersection of a trie with a trie set that represents the ranges. 
The ranges are constructed by taking an array of +ordered boundaries, walking them in parallel, using the encoded position of the leftmost active key as the cursor's position, and presenting states as +follows: +- If the leftmost key has an odd index in the array, we are positioned to the right of a start boundary and thus must + present a state that is true on the left side. Otherwise, the state must be false on the left. +- We set the right side to be the same as the left initially, and flip it for every key that ends at the current + position. In other words, if one or an odd number of keys end here, we note that this is a boundary that changes the + set containment for the positions to the right (note: two copies of the same boundary cancel out).
+ We prepare for the next advance by getting the next byte from all keys that match the current position, which is only + key 0. +- On the next `advance` call, we advance our encoded position to that of the key at the left index 0: (depth 3, + character `c`). The left index is still 0, but when we prepare for the next advance we see that this key has no more + bytes and thus we must advance the left index to 1 and flip the right side of the state, resulting in `START`. +- On the next `advance` call, we advance our encoded position to that of the key at the left index 1: (depth 2, + character `d`). The left index is 1, thus the left side of the returned state must be true. No key ends here, thus + the right side is left the same, resulting in `CONTAINED`. + We prepare for the next advance by getting the next byte from all keys that match the current position, keys 1 and 2. +- On the next `advance` call, we advance our encoded position to that of the key at the left index 1: (depth 3, + character `c`). The left index is still odd, thus the left side is contained. + When we get the next byte of the only key at the matching position, we note that it is exhausted, increase the left + index to 2 and flip the right side of the state, resulting in `END`. +- On the next `advance` call, we advance our encoded position to that of the key at the left index 2: (depth 3, + character `e`). The left index is even, thus the left side is excluded. + This key doesn't have any further bytes either, thus we advance the left index to 3 and flip the right side, resulting + in `START`. +- On the next `advance` call, we advance our encoded position to that of the key at the left index 3: (depth 2, + character `f`). The left index is odd, thus our state's left side must be true. + We advance the matching keys (only key 3 remains), and since it does not end the state's right side is the same as + the left, resuling in `CONTAINED`. 
+- On the next `advance` call, we advance our encoded position to that of the key at the left index 3: (depth 3, + character `g`). The state's left side must be true for the odd left index. + The key has no further bytes, thus the left index is advanced to 4 and the right side is flipped to return `END`. +- On the next `advance` call the left index is beyond the end of the key array, thus we advance to the exhausted + position (depth -1, character `\0`). Its state does not need to be defined as `state` and `content` cannot be called + on exhausted cursors. + +If we need to perform a `skipTo` operation, we do so by advancing the left index over the keys whose prepared position +is before the target. We then apply the same decisions using the new left index to prepare the resulting position and +state. + +To handle inclusive end and exclusive start positions, we need to change the encoded position when a key returns its +last byte, just before it is exhausted, to be a position on the return path of the iteration. This is sufficient to +properly define a position on the right of the key. Note that this means that sets such as `[ab, cd, cd, ef]` are +invalid if all positions are understood to be inclusive, because the first right boundary `cd` inclusive falls after +the second left boundary `cd` inclusive in iteration order (the latter is on the descent path while the former is on the +ascent path of the same position). It also means that we can define contained sets such as `[a, ab, ax, a]` where the +`a` branch is covered except the `(ab, ax)` subset. + +Reverse iteration is performed the same way on the reversed key array, expanded to even length by adding an inclusive +empty key if necessary. + +## Intersecting a trie with a trie set + +Set intersection is performed by walking the source and set with a parallel walk. 
If the set advances beyond the +position of the trie, we check the state of the set to see if the position is covered by the set (done by +`TrieSetCursor.precedingIncluded`). If it is, we can present all content in the trie until it catches up with the set +position, and we can also apply `advanceMultiple` as a direct call on the trie. If the position is not covered by the +set, we perform a `skipTo` call to the current position of the set. This may move beyond the current position of the +set, so we must skip the set to the new position, and then repeat the above steps. + +If at any point both trie and set are at the same position, we can report that position and advance both trie and set +on the next `advance` or `skipTo` call. In this case `advanceMultiple` cannot be used and must act as `advance`. + +### Presenting content at prefixes, subtrie and slice intersections + +The intersection of a trie with a set results in the restriction of the trie data to the coverage of the set. One +important question for this intersection is what content should be returned by the intersected trie. + +As mentioned previously, we very often want to be able to see metadata like level markers on the descent path of the +iteration regardless of the direction, to be able to recognize features that apply to the branch. For this to work +correctly, we also want metadata to be presented on prefixes that are not strictly contained in the set. For example, +if "a" contains metadata related to the full contained branch, we would need that metadata to be presented when we +intersect the trie containing "a" with e.g. the set `[abc, ade)`. To facilitate this, the normal trie intersection will +return the content of all nodes visited by the walk, i.e. for all prefixes of content in the queried set. Additionally, +inclusive ends and exclusive starts apply to the whole branch rooted at the given boundary position (implemented by the +`>` modifiers for the boundaries as shown above). 
+ +On the other hand, to support ordered tries and queries, which should only return content that falls between the given +lexicographical boundaries, we also provide a "slice" intersection. This type of intersection only returns content that +is strictly inside the set. For the `[abc, ade)` example, this includes e.g. "abc", "ac", "acc", "ad", "adc" but +excludes "a" and "ab". + +Slice intersections only make sense if the trie they are applied to is ordered (i.e. if the content is returned on the +ascent path in the reverse direction) and need to also use a different form of preparation of inclusive ends and +exclusive starts: slices should not contain any children of an inclusive end (because they follow the boundary in +lexicographic order) and should not skip over the children of an exclusive start. This is achieved by presenting +exclusive start / inclusive end positions as the path with a zero byte appended at the end, always using +`<` modifiers. + +For example, the trie set representing the slice `[abc, adc] + (ade, afg)` is: +``` +a -> + b -> + c< -> START + d -> + c -> + \0< -> END + e -> + \0< -> START + f -> + g< -> END +``` +Note that when such a set is walked in the reverse direction, all boundaries are presented on the return path of the +iteration. + + +## Set algebra + +A variation of the above can also be applied to sets, giving us set intersections. + +We can also perform negation of a set by inverting the returned states, as well as adding or removing boundaries at the +root on both the descent and ascent path. For example, the inverse of the range `[abc, ade]` is the set +`[null, abc) + (ade, null]`, which is represented as +``` +< -> START +a -> + b -> + c< -> END + d -> + e> -> START +> -> END +``` + +Using De Morgan's law, the negation also lets us perform set union. + +# Range tries + +A range trie is a generalization of the trie set, where the covered ranges can come with further information. 
This is +achieved by turning each side of a state from a boolean signifying containment to an object that carries additional +information. + +Nodes fully contained inside a trie range must return states that have the same left and right side. To simplify coding, +we name these kinds of states "covering" and use them as the information-bearing object that a state must return. + +In other words, states' `precedingState` and `succeedingState` methods return a state. On "boundary" states, where the +carried data changes (or a new range starts/stops), the two sides are a different covering state or null. Covering +states, on the other hand, specify fully contained regions and return themselves for both the preceding and succeeding +side. + +In their simplest, a range trie is one that returns `content` for the boundary positions of the ranges, and also +implements a `precedingState` method that returns the range state that applies to positions before the cursor's. For +a little better efficiency most of the time we combine these two into the `state` method that returns the content, if +the position is a boundary, or the preceding state otherwise. This suffices to implement the required operations, +including: +- Intersecting a range trie with a trie set, which generates boundaries that match the closer of the range trie's or + the set's. +- Combining two range tries in a union, where the applicable covering state is applied to every content position + given to the merge resolver. +- Inserting ranges into an in-memory range trie, applying new ranges to existing content as well as existing ranges to + new content to have the same result as the union above. +- The above also form the basis of the application of range tries to data, e.g. applying deletions as range tries to + content tries. + +For the examples below, consider range states that specify deletion times. 
For example, a range trie could be used to +describe a deletion with timestamp 555 that applies to the range `[abc, adc]` as +``` +a -> + b -> + c< -> start(555) + d -> + c> -> end(555) +``` + +This dump only lists the content of the range trie. This information is sufficient to track the deletion state if we +are advancing through the cursor in either direction without skipping, but isn't sufficient to know what deletion state +applies if the user needs to `skipTo` positions inside the trie. For example, if a cursor is positioned on "a" and the +user performs `skipTo(2, c)` to advance to "ac", this range cursor does not list this position but must still be able to +present the fact that the requested position is covered by the deletion 555. This is achieved by the `precedingState` +reported by the cursor. + +If we also include the preceding state by reporting all `state` values, the trie will look like this in the forward +direction: +``` +a -> + b -> + c -> start(555) + d -> covering(555) + c↑ -> end(555) +``` +and like this in the reverse: +``` +a -> + d -> + c -> end(555) + b -> covering(555) + c↑ -> start(555) +``` +This ensures that when we skip to any position between the two bounds, the position where the cursor ends up (either +"ad" or "adc" in forward direction, "ab" or "abc" in the reverse) has 555 as its preceding deletion state. Note that any +content must be the same in both directions, but preceding state applies to preceding positions in iteration order and +thus will be different in the two directions. + +The range state used in this representation will be such that `start(dt)` has a `null` state on the left (i.e. returned +by `precedingState(FORWARD)`) and has `covering(dt)` on the right (`precedingState(REVERSE)`), `end(dt)` has +`covering(dt)` on the left and `null` on the right. `covering(dt)` is a covering state that returns itself for the +preceding state in both directions. 
To support touching ranges, we also need a `switch(ldt, rdt)` state that has +`covering(ldt)` on the left and `covering(rdt)` on the right. + +## Slice / set intersection of range tries + +Intersection of range tries is performed by the same process as normal trie set intersection, augmented by information +about the covering states of every position. If positions are completely covered by the set, we report the range +cursor's `state/precedingState/content` unmodified. If the position falls on a prefix or a boundary of the set, we throw +away (using the `restrict` method) parts that do not fall inside the set. The latter may also happen if the position +is not one present in the range trie, but covered by a range (i.e. where `skipTo` went beyond the set cursor's position +and the range cursor's `precedingState` returned covering state): in this case we may apply the covering state's +`asBoundary` method to promote it to a boundary where the set forms substructure in the covered range. + +Imagine that we want to intersect the range trie above with the range `[aaa, acc]`, which looks like this as in the +forward direction: +``` +a -> NOT_CONTAINED + a -> NOT_CONTAINED + a -> START + c -> CONTAINED + c↑ -> END +``` + +The intersection cursor will first visit the root and the position "a", where in both cases it will find `null` range +cursor state, resulting in an `null` state for the intersection. The next position "aa" is present in the set, but not +in the range, thus the `skipTo` operation on the range advances to "ab", whose `precedingState` is null. This means that +there is nothing to intersect in the "aa" branch and anything before the range cursor's position, thus we continue by +skipping the set cursor to "ab". This positions it at "ac", whose state is `CONTAINED` and thus its preceding side +is `true`. This means that we must report all branches of the range cursor that we see until we advance to or beyond the +set's position. 
The intersection cursor is positioned at the range cursor's "ab" position. It does not have any `state` +for it, so the intersection cursor reports `null` state as well. + +On the next advance we descend to "abc" (which by virtue of descending is known to fall before the set cursor's +position) and report the range cursor's `start(555)` state unchanged, resulting also in the same `content` and `null` +as `precedingState` (because `start(dt)` has `null` on its left (preceding in forward direction) side). + +The next advance takes the range cursor to "ad", which is beyond the current set cursor position. We check the range +cursor's `precedingState` and find that it is `covering(555)`. Since at this point we have a preceding state, we need to +walk the set branch and use it to augment and report the active covering state. The intersection cursor remains at the +set cursor's "ac" position, and must report the active `covering(555)` augmented by the set cursor's `CONTAINED` state. +This means that we can report the range state unchanged at this position, and thus `covering(555)` is reported as the +state and `null` as the `content` (because `covering(dt)` is not a boundary state). + +On the next advance, the intersection cursor follows the earlier of the two cursors, which is the set cursor. This +advances it to "acc↑", which is a boundary of the set with state `END`. The active covering state is still +`covering(555)`; augmenting it with `END` turns it into the boundary `end(555)`, which is reported in `state` as +well as `content` (because `start(dt)` is a boundary state). `precedingState` reports the left side of this boundary, +which is still `covering(555)`. + +The next advance takes the set to the exhausted position, which completes the intersection. 
+ +The resulting forward walk of the trie looks as expected: +``` +a -> + b -> + c -> start(555) + c -> covering(555) + c↑ -> end(555) +``` + +## Union of range tries + +The merge process is similar (with a second range trie instead of a set), but we walk all branches of both tries and +combine their states. There are two differences from the normal trie merge process: +- We apply the merge resolver to states instead of content. This includes both content and preceding state, which is + necessary to be able to report the correct state for the merged trie. +- When one of the range cursors is ahead, we pass its `precedingState` as an argument to the merge resolver to modify + all reported states. + +As an example, consider once again the `[abc, adc]` range with deletion 555, merged with the following trie for the +`[aaa, acc]` range with deletion 666: +``` +a -> + a -> + a -> start(666) + c -> covering(666) + c↑ -> end(666) +``` + +The merge cursor will first proceed along "aaa" where the first source (advancing to "ab") does not have any +`precedingState`, and thus the merge reports "null" for "", "a" and "aa", and the `start(666)` state for "aaa" +unchanged. On the next advance this source moves beyond the other cursor's "ab" position. The merge thus follows the +second source, but the first has a `precedingState` of `covering(666)`, which must be reflected in the reported states. +The second cursor has no `state` for "ab", thus the merge reports `covering(666)` as the state for "ab". + +The next advance takes the second source to "abc", with `start(555)` state. The merge resolver is called with +`start(555)` and `covering(666)` as arguments. Typically, the resolvers we use drop smaller deletion timestamps, so +this returns `covering(666)` unchanged. + +The next advance takes the second source to "ad", which is beyond the current position of the first source. 
The merge +cursor switches to following the first source, positioned at "ac", with `covering(666)` as the `state`, but +it must also reflect the second sources `covering(555)` preceding state. The resolver is called with these two +arguments and once again returns the bigger deletion timestamp, `covering(666)`. + +The next advance takes the first source and the iteration cursor to "acc↑", where this source has the `end(666)` +boundary as state. The merge resolver is called with `end(666)` and `covering(555)`. This time the covering state does +not override the boundary, thus the resolver must create a state that reflects the end of the current range, as +well as the fact that we continue with the other covering state. It must thus return the boundary state +`switch(666, 555)` which the intersection cursor reports. + +The next advance takes the first source to the exhausted position. The merge thus reports all paths and state from the +other cursor unchanged until it is exhausted as well, i.e. `covering(555)` for "ad" and `end(555)` for "adc". + +The final resulting trie looks like this: +``` +a -> + a -> + a -> start(666) + b -> covering(666) + c -> covering(666) + c -> covering(666) + c -> switch(666, 555) + d -> covering(555) + c -> end(555) +``` +Note that the "abc" path adds no information. We don't, however, know this before descending into that branch, thus we +can't easily remove it. This could be done using a special `cleanup` operation over a trie which must buffer descents +until effective content is found, which is best done as a separate transformation rather than as part of the merge. + +## Relation to trie sets + +`TrieSetCursor` is a subclass of `RangeCursor`, and the trie set is a special case of a range trie that simply adds +boolean versions of the `precedingState` methods. 
+ +# Deletion-Aware Tries + +Deletion-aware tries are designed to store live data together with ranges of deletions (aka tombstones) in a single +structure, and be able to apply operations over them that properly restrict deletion ranges on intersections and apply +the deletions of one source to the live content of others in merges. + +Our deletion-aware tries implement this by allowing nodes in the trie to offer a "deletions branch" which specifies +and encloses the deletion ranges applicable to the branch rooted at that node. This can be provided at any level of the +trie, but only once for any given path (i.e. there cannot be a deletion branch under another deletion branch). In many +practical usecases the depth at which this deletion path is introduced will also be predetermined for any given path; +merges implement an option that exploits this property to avoid some inefficiencies. + +It is also forbidden for live branches to contain data that is deleted by the trie's own deletion branches (aka +shadowed data). + +Perhaps the easiest way to describe the approach is to discuss its alternatives and the reasons we preferred the +structure and features of the option we went with. + +### Why not mix deletion ranges with live data? + +In this approach we store deletions as ranges, and live data as point ranges in the single structure. They are ordered +together and, to facilitate an efficient `precedingState`, points need to specify the applicable deletions before and +after the point. This approach is an evolution of Cassandra's `UnfilteredRowIterator` that mixes rows and tombstone +markers. 
+ +The example below represents a trie that contains a deletion from `aaa` to `acc` with timestamp 666, and a live point at +`abb` with timestamp 700 in this fashion: +``` +a -> + a -> + a -> start(666) + b -> covering(666) + b -> data(value, 700) + switch(666, 666) + c -> covering(666) + c -> end(666) +``` + +Having the point also declare the state before and after makes it easy to obtain the covering deletion e.g. for `aba`, +`abc` or `ab`. This is a very acceptable amount of overhead that isn't a problem for the approach. + +The greatest strength of this approach is that it makes it very easy to perform merges because all the necessary +information is present at any position that the merging cursor visits. + +The reason to avoid this approach is that we often want to find only the live data between a given range of keys, or the +closest live entry after a given key. In an approach like this we can have many thousands of deletion markers that +precede the live entries, and to find it we have to filter these deletions out. + +In fact, we have found this situation to occur often in many practical applications of Cassandra. Solving this problem +is one of the main reasons to implement the `Trie` machinery. + +This problem could be worked around by storing metadata at parent nodes whose branches don't contain live data; we went +with a more flexible approach. + +### Why not store live data and deletions separately? + +In the other extreme, we can have two separate tries. For the example above, it could look like this: +``` +LIVE +b -> + b -> data(value, 700) +DELETIONS +a -> + a -> + a -> start(666) + c -> covering(666) + c -> end(666) +``` + +To perform a merge, we have to apply the DELETIONS trie of each source to the other's LIVE trie. 
In other words, a merge +can be implemented as +``` +merge(a, b).LIVE = merge(apply(b.DELETIONS, a.LIVE), apply(a.DELETIONS, b.LIVE)) +merge(a, b).DELETIONS = merge(a.DELETIONS, b.DELETIONS) +``` +or (which makes better sense when multiple sources are merged): +``` +d = merge(a.DELETIONS, b.DELETIONS) +merge(a, b).LIVE = apply(d, merge(a.LIVE, b.LIVE)) +merge(a, b).DELETIONS = d +``` + +This can create extra complexity when multiple merge operations are applied on top of one another, but if we select all +sources in advance and merge them with a single collection merge the method's performance is good. + +This solves the issue above: because we query live and deletions separately, we can efficiently get the first live item +after a point. We can also get the preceding state of a deletion without storing extra information at live points. + +The approach we ultimately took is an evolution of this to avoid a couple of performance weaknesses. On one hand, it is +a little inefficient to walk the same path in two separate tries, and it would be helpful if we can do this only once +for at least some part of the key. On the other, there is a concurrency chokepoint at the root of this structure, because +whenever a deletion actually finds live data to remove in an in-memory trie, to achieve atomicity of the operation we +need to prepare and swap snapshots for the two full tries, which can also waste work and limits caching efficiency. + +In Cassandra we use partitions as the unit of consistency, and also the limit that range deletions are not allowed to +cross. It is natural, then, to split the live and deletion branches at the partition level rather than at the trie root. + +### Why not allow shadowed data, i.e. data deleted by the same trie's deletion branches? + +One way to avoid the concurrency issue above is to leave the live data in place and apply the deletion trie on every +query. 
This does ease the atomicity problem, and in addition makes the merge process simpler as we can independently +merge data and deletion branches. + +However, we pay a cost on live data read that is not insignificant, and the amount of space and effort we must spend +to deal with the retained data items can quickly compound unless we apply garbage collection at some points. We prefer +to do that garbage collection as early as possible, by not introducing the garbage in the first place. + +There is a potential application of relaxing this for intermediate states of transformation, e.g. by letting a merge +delay the application of the deletions until the end of a chain of transformations. This is an internal implementation +detail that would not change the requirements for the user. + +### Why not allow nested deletion branches? + +If it makes sense to permit deletion branches, then we could have them at multiple levels, reducing the amount of path +duplication in the trie. + +For example, using a deletion branch we can represent the example above as +``` +a -> + *** start deletion branch + a -> + a -> start(666) + c -> covering(666) + c -> end(666) + *** end deletion branch + b -> + b -> data(value, 700) +``` + +and if we then delete `aba-abc` with timestamp 777, represented as +``` +a -> + b -> + *** start deletion branch + a -> start(777) + c -> end(777) + *** end deletion branch +``` +we could merge it into the in-memory trie as +``` +a -> + *** start deletion branch + a -> + a -> start(666) + c -> covering(666) + c -> end(666) + *** end deletion branch + b -> + *** start deletion branch + a -> start(777) + c -> end(777) + *** end deletion branch +``` + +This can work well for point queries and has some simplicity advantages for merging, but introduces a +complication tracking the state when we want to walk over a range and apply deletion branches to data. The problem is +that we don't easily know what deletion state applies e.g. 
when we advance from `abc` in the trie above; we either have +to keep a stack of applicable ranges, or store the deletion to return to in the nested deletion branch, which would +cancel out the simplicity advantages. + +### Why predetermined deletion levels (`deletionsAtFixedPoints`) are important + +The case above (nested deletion branches) is something that can naturally occur in merges, including as shown in the +example. If we don't do anything special, this merge would create a nesting of branches. + +We fix this problem by applying "hoisting" during merges, i.e. by bringing other sources' covered deletion branches to +the highest level that one source defines it. For the example above, this means that when the merged cursor encounters +the in-memory deletion branch at `a`, it has to hoist the mutation's deletion to be rooted at `a` rather than `ab`. + +In other words, the mutation is changed to effectively become +``` +a -> + *** start deletion branch + b -> + a -> start(777) + c -> end(777) + *** end deletion branch +``` + +which can then be correctly combined into +``` +a -> + *** start deletion branch + a -> + a -> start(666) + b -> covering(666) + a -> switch(666, 777) + c -> switch(777, 666) + c -> covering(666) + c -> end(666) + *** end deletion branch +``` + +The hoisting process can be very inefficient. The reason for this is that we do not know where in the source trie a +deletion branch is defined, and to bring them all to a certain level we must walk the whole live branch. If e.g. this +in-memory trie never had a deletion before, this could mean walking all the live data in the trie, potentially millions +of nodes. + +Provided that the result of this hoisting becomes a new deletion branch, which would be the case for in-memory tries, +one can say that the amortized cost is still O(1) because once we hoist a branch we never have to walk that branch +again. 
The delay of doing that pass could still cause problems; more importantly, in merge views we may have to do that +multiple times, especially on nested merges. + +To avoid this issue, deletion-aware merges accept a flag called `deletionsAtFixedPoints`. When this flag is true, the +merge expects that all sources can only define deletion branches at matching points. If this is guaranteed, we do not +need to do any hoisting, because a covered deletion branch cannot exist. We expect most practical uses of this class to +perform all merges with this flag set to true. + +This means preparing deletions so that they always share the same point of introduction of the deletion branch. For the +example above it means preparing the deletion in the hoisted form. In Cassandra, for example, this can be guaranteed +by wiring the deletion branches to always be on the partition level. + +# Appendices -Slicing, implemented in `SlicedTrie` and used via `Trie.subtrie`, can also be seen as a variation of the parallel -walk. In this case we walk the source as well as singletons of the two bounds. +## Return stop alternatives -More precisely, while the source cursor is smaller than the left bound, we don't produce any output. That is, we -keep advancing in a loop, but to avoid walking subtries unnecessarily, we use `skipChildren` instead of `advance`. -As we saw above, a smaller cursor that descends remains smaller, thus there is no point to do so when we are -ahead of the left bound. When the source matches a node from the left bound, we descend both and pass the -state to the consumer. As soon as the source becomes known greater than the left bound, we can stop processing -the latter and pass any state we see to the consumer. +We considered two alternatives to the ascent/return path stops that we use in our cursors. The two are conceptually +simpler, but come with higher implementation costs. 
-Throughout this we also process the right bound cursor and we stop the iteration (by returning `depth = -1`) -as soon as the source becomes larger than the right bound. +The first alternative is to use trailing "type" modifier bits for the positions the cursor stops at: +- "<" (00) for positions to the left of content and branch (left-inclusive or right-exclusive boundaries) +- "=" (01) for ordered content of a node +- "↓" (10) for any children of the branch, and also metadata that should be reported for the branch +- ">" (11) for positions to the right of content and branch (left-exclusive or right-inclusive boundaries) -`SlicedTrie` does not use singleton tries and cursors over them but opts to implement them directly, using an -implicit representation using a pair of `depth` and `incomingTransition` for each bound. +This is conceptually simpler because these modifiers do not change when we iterate the trie cursor in reverse, but are +simply walked in the reverse order (which is achieved by flipping the modifier bits together with the transition bits). +The difference with the "return path" approach that we chose is that the latter needs these to be bundled in a different +way depending on whether we are iterating in the forward or reverse direction: +- on forward iteration, <, = and ↓ must be bundled on the descent path, and > on the ascent; +- on reverse iteration, > and ↓ must be bundled on the descent path, and = and < on the ascent. -In slices we can also use `advanceMultiple` when we are certain to be strictly inside the slice, i.e. beyond the -left bound and before the right bound. As above, descending to any depth in this case is safe as the -result will remain smaller than the right bound. +The main reason we could not use the modifier option is the fact that we need tail tries, and that we usually recognize +the need to form a tail trie by some content given for the ↓ metadata modifier.
As cursors (especially after +transformations like unions) will have advanced beyond the < boundary and = content modifier, but we need to present +the boundaries and/or content in the returned tail trie, this would have added enormous complications to the tail trie +functionality. -## Reverse iteration +The ascent path approach always bundles metadata with the descent path content, meaning that we are correctly positioned +to take a tail trie when we recognize the need for one. -Tries and trie cursors support reverse iteration. Reverse trie iteration presents data in lexicographic order -using the inverted alphabet. This is not always the same as the reverse order of the data returned in the forward -direction; the latter is only guaranteed if the entries in the trie can contain no prefixes (i.e. the representation -is prefix-free like the byte-ordered type translations). +The other alternative we considered involves presenting virtual content-bearing child nodes for the boundaries and +ordered content using out-of-range transition values. For example, we could use the transitions `-2` for +left-inclusive/right-exclusive boundaries, `-1` for ordered content and `256` for left-exclusive/right-inclusive +boundaries, to lead us to implicit positions that contain the relevant attached value. By expanding the encoded position +part for the transition by two leading bits we can easily accommodate this without changing any of the transformation +implementations. -This difference is imposed by the cursor interfaces which necessarily have to present parent nodes before their -children and do not preserve or present any state on ascent. +This approach does not have a problem with tail tries, but would be a little less performant because of the additional +descent that would be required for all boundaries of range/trie sets and content in ordered tries.
\ No newline at end of file diff --git a/src/java/org/apache/cassandra/db/tries/TrieDumper.java b/src/java/org/apache/cassandra/db/tries/TrieDumper.java index 9dfb2c190d6d..d1e5332baff2 100644 --- a/src/java/org/apache/cassandra/db/tries/TrieDumper.java +++ b/src/java/org/apache/cassandra/db/tries/TrieDumper.java @@ -21,23 +21,20 @@ import org.agrona.DirectBuffer; -/** - * Simple utility class for dumping the structure of a trie to string. - */ -class TrieDumper implements Trie.Walker +/// Simple utility class for dumping the structure of a trie to string. +abstract class TrieDumper implements Cursor.Walker { - private final StringBuilder b; - private final Function contentToString; + protected final StringBuilder b; int needsIndent = -1; int currentLength = 0; + int depthAdjustment = 0; - public TrieDumper(Function contentToString) + TrieDumper() { - this.contentToString = contentToString; this.b = new StringBuilder(); } - private void endLineAndSetIndent(int newIndent) + protected void endLineAndSetIndent(int newIndent) { needsIndent = newIndent; } @@ -45,11 +42,11 @@ private void endLineAndSetIndent(int newIndent) @Override public void resetPathLength(int newLength) { - currentLength = newLength; - endLineAndSetIndent(newLength); + currentLength = newLength + depthAdjustment; + endLineAndSetIndent(currentLength); } - private void maybeIndent() + protected void maybeIndent() { if (needsIndent >= 0) { @@ -78,11 +75,10 @@ public void addPathBytes(DirectBuffer buffer, int pos, int count) } @Override - public void content(T content) + public void onReturnPath() { - b.append(" -> "); - b.append(contentToString.apply(content)); - endLineAndSetIndent(currentLength); + maybeIndent(); + b.append('↑'); } @Override @@ -90,4 +86,64 @@ public String complete() { return b.toString(); } + + static class Plain extends TrieDumper + { + protected final Function contentToString; + + public Plain(Function contentToString) + { + super(); + this.contentToString = contentToString; + } 
+ + @Override + public void content(T content) + { + b.append(" -> "); + b.append(contentToString.apply(content)); + endLineAndSetIndent(currentLength); + } + } + + static class DeletionAware> extends Plain + implements DeletionAwareCursor.DeletionAwareWalker + { + final Function rangeToString; + + public DeletionAware(Function contentToString, + Function rangeToString) + { + super(contentToString); + this.rangeToString = rangeToString; + } + + @Override + public void deletionMarker(D content) + { + b.append(" -> "); + b.append(rangeToString.apply(content)); + endLineAndSetIndent(currentLength); + } + + @Override + public boolean enterDeletionsBranch() + { + maybeIndent(); + b.append("*** Start deletion branch"); + endLineAndSetIndent(currentLength); + depthAdjustment = currentLength; + return true; + } + + @Override + public void exitDeletionsBranch() + { + endLineAndSetIndent(depthAdjustment); + maybeIndent(); + b.append("*** End deletion branch"); + resetPathLength(0); + depthAdjustment = 0; + } + } } diff --git a/src/java/org/apache/cassandra/db/tries/TrieDumperWithPath.java b/src/java/org/apache/cassandra/db/tries/TrieDumperWithPath.java new file mode 100644 index 000000000000..20ed629b4598 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/TrieDumperWithPath.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.db.tries; + +import org.agrona.DirectBuffer; + +/// Simple utility class for dumping the structure of a trie to string. +/// +/// This version is in the form of abstract classes so that the implementation of the conversion to string can access +/// the key bytes and store additional information. +public abstract class TrieDumperWithPath extends TriePathReconstructor implements Cursor.Walker +{ + protected final StringBuilder b; + int needsIndent = -1; + int currentLength = 0; + int depthAdjustment = 0; + + TrieDumperWithPath() + { + this.b = new StringBuilder(); + } + + protected void endLineAndSetIndent(int newIndent) + { + needsIndent = newIndent; + } + + @Override + public void resetPathLength(int newLength) + { + currentLength = newLength + depthAdjustment; + super.resetPathLength(currentLength); + endLineAndSetIndent(currentLength); + } + + protected void maybeIndent() + { + if (needsIndent >= 0) + { + b.append('\n'); + for (int i = 0; i < needsIndent; ++i) + b.append(" "); + needsIndent = -1; + } + } + + @Override + public void addPathByte(int nextByte) + { + super.addPathByte(nextByte); + maybeIndent(); + ++currentLength; + b.append(String.format("%02x", nextByte)); + } + + @Override + public void addPathBytes(DirectBuffer buffer, int pos, int count) + { + super.addPathBytes(buffer, pos, count); + maybeIndent(); + for (int i = 0; i < count; ++i) + b.append(String.format("%02x", buffer.getByte(pos + i) & 0xFF)); + currentLength += count; + } + + @Override + public void onReturnPath() + { + super.onReturnPath(); 
+ maybeIndent(); + b.append('↑'); + } + + @Override + public String complete() + { + return b.toString(); + } + + public static abstract class Plain extends TrieDumperWithPath + { + /// Convert the given content to string. This method can make use of [#keyBytes] and [#keyPos]. + public abstract String contentToString(T content); + + @Override + public void content(T content) + { + b.append(" -> "); + b.append(contentToString(content)); + endLineAndSetIndent(currentLength); + } + } + + public static abstract class DeletionAware> extends Plain + implements DeletionAwareCursor.DeletionAwareWalker + { + /// Convert the given deletion marker to string. This method can make use of [#keyBytes] and [#keyPos]. + public abstract String deletionToString(D deletionMarker); + + @Override + public void deletionMarker(D content) + { + b.append(" -> "); + b.append(deletionToString(content)); + endLineAndSetIndent(currentLength); + } + + @Override + public boolean enterDeletionsBranch() + { + maybeIndent(); + b.append("*** Start deletion branch"); + endLineAndSetIndent(currentLength); + depthAdjustment = currentLength; + return true; + } + + @Override + public void exitDeletionsBranch() + { + endLineAndSetIndent(depthAdjustment); + maybeIndent(); + b.append("*** End deletion branch"); + resetPathLength(0); + depthAdjustment = 0; + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java b/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java index 99e3f764244d..bca2dda30cc2 100644 --- a/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java +++ b/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java @@ -26,13 +26,14 @@ import org.apache.cassandra.utils.bytecomparable.ByteComparable; -/** - * Convertor of trie entries to iterator where each entry is passed through {@link #mapContent} (to be implemented by - * descendants). 
- */ +/// Convertor of trie entries to iterator where each entry is passed through [#mapContent] (to be implemented by +/// descendants). +/// +/// If [#mapContent] returns null, this version of the class will pass on that null upstream. If this is not the desired +/// behaviour, see [WithNullFiltering]. public abstract class TrieEntriesIterator extends TriePathReconstructor implements Iterator { - private final Trie.Cursor cursor; + protected final Cursor cursor; private final Predicate predicate; T next; boolean gotNext; @@ -42,11 +43,11 @@ protected TrieEntriesIterator(Trie trie, Direction direction, Predicate pr this(trie.cursor(direction), predicate); } - TrieEntriesIterator(Trie.Cursor cursor, Predicate predicate) + TrieEntriesIterator(Cursor cursor, Predicate predicate) { this.cursor = cursor; this.predicate = predicate; - assert cursor.depth() == 0; + cursor.assertFresh(); next = cursor.content(); gotNext = next != null && predicate.test(next); } @@ -81,6 +82,14 @@ ByteComparable.Version byteComparableVersion() return cursor.byteComparableVersion(); } + Direction direction() + { + return cursor.direction(); + } + + /// To be implemented by descendants to map the content value and path to the required entry. If callers need to + /// save the path, they must copy the `bytes` array, which will be overwritten when the iteration continues. + /// If this method returns null, the null will be passed on as an entry in the iteration. 
protected abstract V mapContent(T content, byte[] bytes, int byteLength); /** @@ -88,7 +97,7 @@ ByteComparable.Version byteComparableVersion() */ static class AsEntries extends TrieEntriesIterator> { - public AsEntries(Trie.Cursor cursor) + public AsEntries(Cursor cursor) { super(cursor, Predicates.alwaysTrue()); } @@ -105,7 +114,7 @@ protected Map.Entry mapContent(T content, byte[] b */ static class AsEntriesFilteredByType extends TrieEntriesIterator> { - public AsEntriesFilteredByType(Trie.Cursor cursor, Class clazz) + public AsEntriesFilteredByType(Cursor cursor, Class clazz) { super(cursor, clazz::isInstance); } @@ -122,4 +131,76 @@ static java.util.Map.Entry toEntry(ByteCompara { return new AbstractMap.SimpleImmutableEntry<>(toByteComparable(version, bytes, byteLength), content); } + + /// Convertor of trie entries to iterator where each entry is passed through [#mapContent] (to be implemented by + /// descendants). This is the same as [TrieEntriesIterator], but instead of accepting a predicate to filter out entries, + /// it skips over ones where [#mapContent] returns null. 
+ public static abstract class WithNullFiltering extends TriePathReconstructor implements Iterator + { + protected final Cursor cursor; + V next; + boolean gotNext; + + protected WithNullFiltering(BaseTrie trie, Direction direction) + { + this(trie.cursor(direction)); + } + + WithNullFiltering(Cursor cursor) + { + this.cursor = cursor; + cursor.assertFresh(); + T nextContent = cursor.content(); + if (nextContent != null) + { + next = mapContent(nextContent, keyBytes, keyPos); + gotNext = next != null; + } + else + gotNext = false; + } + + public boolean hasNext() + { + while (!gotNext) + { + T nextContent = cursor.advanceToContent(this); + if (nextContent != null) + { + next = mapContent(nextContent, keyBytes, keyPos); + gotNext = next != null; + } + else + gotNext = true; + } + + return next != null; + } + + public V next() + { + if (!hasNext()) + throw new IllegalStateException("next without hasNext"); + + return consumeNext(); + } + + protected V consumeNext() + { + gotNext = false; + V v = next; + next = null; + return v; + } + + protected V peekNextIfAvailable() + { + return next; // null if not prepared + } + + /// To be implemented by descendants to map the content value and path to the required entry. If callers need to + /// save the path, they must copy the `bytes` array, which will be overwritten when the iteration continues. + /// If this method returns null, the iteration will skip over the current position. 
+ protected abstract V mapContent(T content, byte[] bytes, int byteLength); + } } diff --git a/src/java/org/apache/cassandra/db/tries/TrieEntriesWalker.java b/src/java/org/apache/cassandra/db/tries/TrieEntriesWalker.java index 362fe8f112b7..feda00adee8b 100644 --- a/src/java/org/apache/cassandra/db/tries/TrieEntriesWalker.java +++ b/src/java/org/apache/cassandra/db/tries/TrieEntriesWalker.java @@ -21,11 +21,9 @@ import org.apache.cassandra.utils.bytecomparable.ByteComparable; -/** - * Walker of trie entries, used with Trie.process to walk all content in order and provide the path through which values - * are reached. - */ -public abstract class TrieEntriesWalker extends TriePathReconstructor implements Trie.Walker +/// Walker of trie entries, used with [Trie#process] to walk all content in order and provide the path through which values +/// are reached. +public abstract class TrieEntriesWalker extends TriePathReconstructor implements Cursor.Walker { @Override public void content(T content) @@ -35,15 +33,13 @@ public void content(T content) protected abstract void content(T content, byte[] bytes, int byteLength); - /** - * Iterator representing the content of the trie a sequence of (path, content) pairs. - */ + /// Iterator representing the content of the trie as a sequence of (path, content) pairs.
static class WithConsumer extends TrieEntriesWalker { - private final BiConsumer consumer; + private final BiConsumer consumer; private final ByteComparable.Version byteComparableVersion; - public WithConsumer(BiConsumer consumer, ByteComparable.Version byteComparableVersion) + public WithConsumer(BiConsumer consumer, ByteComparable.Version byteComparableVersion) { this.consumer = consumer; this.byteComparableVersion = byteComparableVersion; @@ -61,4 +57,40 @@ public Void complete() return null; } } + + /// Deletion-aware entries walker, listing both deletion (calling [#deletionMarker(RangeState, byte\[\], int)]) + /// and live (calling [#content(Object, byte\[\], int)]) content in the trie. + public static abstract class DeletionAware, V> + extends TrieEntriesWalker + implements DeletionAwareCursor.DeletionAwareWalker + { + int depthAdjustment = 0; + + @Override + public void deletionMarker(D marker) + { + deletionMarker(marker, keyBytes, keyPos); + } + + protected abstract void deletionMarker(D marker, byte[] bytes, int byteLength); + + @Override + public boolean enterDeletionsBranch() + { + depthAdjustment = keyPos; + return true; + } + + @Override + public void exitDeletionsBranch() + { + depthAdjustment = 0; + } + + @Override + public void resetPathLength(int newLength) + { + super.resetPathLength(newLength + depthAdjustment); + } + } } diff --git a/src/java/org/apache/cassandra/db/tries/TriePathReconstructor.java b/src/java/org/apache/cassandra/db/tries/TriePathReconstructor.java index c59d126fe272..be1dc975d327 100644 --- a/src/java/org/apache/cassandra/db/tries/TriePathReconstructor.java +++ b/src/java/org/apache/cassandra/db/tries/TriePathReconstructor.java @@ -23,7 +23,7 @@ import org.agrona.DirectBuffer; import org.apache.cassandra.utils.bytecomparable.ByteComparable; -public class TriePathReconstructor implements Trie.ResettingTransitionsReceiver +public class TriePathReconstructor implements Cursor.ResettingTransitionsReceiver { protected byte[] 
keyBytes = new byte[32]; protected int keyPos = 0; diff --git a/src/java/org/apache/cassandra/db/tries/TrieSet.java b/src/java/org/apache/cassandra/db/tries/TrieSet.java new file mode 100644 index 000000000000..d3c408837edc --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/TrieSet.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +/// A trie that defines an infinite set of `ByteComparable`s. Trie sets represent sets of ranges of coverage by listing +/// the boundaries between them, and providing a way to identify the "covering state" for positions that are being +/// skipped to. +/// +/// Trie sets can be constructed from ranges using [#branch], [#range] and [#ranges], and can be manipulated via +/// the set algebra methods [#union], [#intersection] and [#negation]. Sets are usually used to select a subset of a +/// trie using its `intersect` method (see [Trie#intersect], [RangeTrie#intersect], [DeletionAwareTrie#intersect]), +/// which preserves metadata in prefixes that are not part of the set (e.g. 
content for "a", "ab", "abc", "abe" when +/// listing the branch `[abcde;abefg]`). Additionally, plain tries also provide the [Trie#intersectSlicing] method which +/// only preserves content strictly inside the set (i.e. only "abe" out of the prefix examples above) and can be used to +/// obtain a lexicographically bounded slice of an ordered trie. +public interface TrieSet extends CursorWalkable +{ + /// The set covering the branch rooted at the given key. + static TrieSet branch(ByteComparable.Version version, ByteComparable b) + { + return rangeInclusiveEnd(version, b, b); + } + + /// The set between two keys, inclusive of the left key and the branch rooted at the right bound. + static TrieSet rangeInclusiveEnd(ByteComparable.Version version, ByteComparable left, ByteComparable right) + { + return range(version, left, true, right, true); + } + + /// The set between two keys, inclusive of the left key and excluding the right bound or any of its descendants. + static TrieSet rangeExclusiveEnd(ByteComparable.Version version, ByteComparable left, ByteComparable right) + { + return range(version, left, true, right, false); + } + + /// The set between two keys with the given inclusivity. Note that inclusivity here applies to the branch rooted at + /// the respective key. For example, `(abc; cde]` does not contain "cdf", "abc" or "abcd" and includes "abd", "cde", + /// and "cdez". + static TrieSet range(ByteComparable.Version version, ByteComparable left, boolean leftInclusive, ByteComparable right, boolean rightInclusive) + { + return ranges(version, leftInclusive, rightInclusive, left, right); + } + + /// The set between the given pairs of boundaries. This is the same as the union of the range sets produced by + /// each pair in the `boundaries` array, done in a single step. 
The inclusivity parameters apply to the bound at the + /// respective side in every pair, `leftInclusive` to all left bounds (every even position in the array), and + /// `rightInclusive` to all right ones (every odd position in the array). + /// + /// The keys in the array must be given in order, taking into account the inclusivity parameter (where e.g. + /// right-inclusive bound "a" is greater than "ab"). + /// + /// Also see [RangesCursor] for further information. + static TrieSet ranges(ByteComparable.Version version, boolean leftInclusive, boolean rightInclusive, ByteComparable... boundaries) + { + return dir -> RangesCursor.create(dir, version, leftInclusive, rightInclusive, boundaries); + } + + /// The set between the given pairs of boundaries, start-inclusive and end-exclusive. This is the same as the union + /// of the range sets produced by each pair in the `boundaries` array, done in a single step. + /// + /// The keys in the array must be given in order. If the same key appears twice it has no effect on the result. + /// + /// Also see [RangesCursor] for further information. + static TrieSet ranges(ByteComparable.Version version, ByteComparable... boundaries) + { + return ranges(version, true, false, boundaries); + } + + static TrieSet empty(ByteComparable.Version byteComparableVersion) + { + return ranges(byteComparableVersion, true, false); + } + + static TrieSet full(ByteComparable.Version byteComparableVersion) + { + return ranges(byteComparableVersion, true, false, null, null); + } + + /// Returns true if the given key is strictly contained in this set, i.e. it falls inside a covered range or branch. + /// This excludes prefixes of set boundaries. + default boolean strictlyContains(ByteComparable key) + { + return contains(key) == ContainsResult.CONTAINED; + } + + enum ContainsResult + { + CONTAINED, + PREFIX, + NOT_CONTAINED + } + + /// Returns whether the given key is contained in this set. 
Returns CONTAINED if it falls inside a covered range or + /// branch, PREFIX if it is a prefix of a set boundary, and NOT_CONTAINED if it is not contained in the set at all. + default ContainsResult contains(ByteComparable key) + { + TrieSetCursor cursor = cursor(Direction.FORWARD); + final ByteSource bytes = key.asComparableBytes(cursor.byteComparableVersion()); + int next = bytes.next(); + while (next != ByteSource.END_OF_STREAM) + { + long skipPosition = Cursor.positionForDescentWithByte(cursor.encodedPosition(), next); + if (Cursor.compare(cursor.skipTo(skipPosition), skipPosition) != 0) + return cursor.state().precedingIncluded(Direction.FORWARD) ? ContainsResult.CONTAINED + : ContainsResult.NOT_CONTAINED; + + next = bytes.next(); + } + return cursor.state().succeedingIncluded(Direction.FORWARD) ? ContainsResult.CONTAINED + : ContainsResult.PREFIX; + } + + default TrieSet union(TrieSet other) + { + // This method is currently only used for tests. Implemented by De Morgan's law (`A u B = ~(~A x ~B)`). + // It could be done more efficiently if we have an intersection variation that flips the state values + // internally. + return dir -> new RangeIntersectionCursor.TrieSet(cursor(dir).negated(), + other.cursor(dir).negated()) + .negated(); + } + + default TrieSet intersection(TrieSet other) + { + // This method is currently only used for tests. Should we need it for (performance-sensitive) production uses, + // we should switch to a more direct set-specific intersection implementation. + return dir -> new RangeIntersectionCursor.TrieSet(cursor(dir), other.cursor(dir)); + } + + /// Represents the set inverse of the given set. + /// E.g. the inverse of the set `[a, b]` is the set `union([null, a), (b, null])`. + default TrieSet negation() + { + return dir -> cursor(dir).negated(); + } + + /// Construct a textual representation of the trie.
+ default String dump() + { + return cursor(Direction.FORWARD).process(new TrieDumper.Plain<>(Object::toString)); + } + + // The methods below form the non-public implementation, whose visibility is restricted to package-level. + // The warning suppression below is necessary because we cannot limit the visibility of an interface method. + // We need an interface to be able to implement trie methods by lambdas, which is heavily used above. + + /// Implement this method to provide the concrete trie implementation as the cursor that presents it, most easily + /// done via a lambda as in the methods above. + //noinspection ClassEscapesDefinedScope + TrieSetCursor makeCursor(Direction direction); + + /// @inheritDoc This method's implementation uses [#makeCursor] to get the cursor and may apply additional cursor + /// checks for tests that run with verification enabled. + //noinspection ClassEscapesDefinedScope + @Override + default TrieSetCursor cursor(Direction direction) + { + return Trie.DEBUG ? new VerificationCursor.TrieSet(makeCursor(direction)) + : makeCursor(direction); + } +} diff --git a/src/java/org/apache/cassandra/db/tries/TrieSetCursor.java b/src/java/org/apache/cassandra/db/tries/TrieSetCursor.java new file mode 100644 index 000000000000..fc97295e8057 --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/TrieSetCursor.java @@ -0,0 +1,337 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +/// The implementation of a [TrieSet]. +/// +/// In addition to the functionality of normal trie cursors, set cursors also produce a [#state] that describes the +/// coverage of trie sections to the left and right of the cursor position. This is necessary to be able to identify +/// coverage after a [#skipTo] operation, where the set cursor jumps to a position beyond the requested one. +interface TrieSetCursor extends RangeCursor +{ + /// This type describes the state at a given cursor position. It describes the coverage of the positions before and + /// after the current in forward iteration order and whether the node is a set boundary that starts or ends an + /// included range. + enum RangeState implements org.apache.cassandra.db.tries.RangeState + { + // Note: the states must be ordered so that + // `values()[applicableBefore * APPLICABLE_BEFORE + applicableAfter * APPLICABLE_AFTER]` + // produces a state with the requested flags + + /// The cursor is at a prefix of some start boundary, and the branches before it as well as the current point + /// are not included in the set. + NOT_CONTAINED(false, false), + /// The cursor is positioned at an end boundary. Branches to the left of this are covered by the set. + /// The current position and any position to the right in lexicographic order (including descendants of the + /// current position) until the next boundary are excluded. 
+ END(true, false), + /// The cursor is positioned at a start boundary. The current position as well as any position to the right + /// in lexicographic order (including descendants of the current position) up to the next boundary are covered + /// by the set. Branches to the left of this position are excluded. + START(false, true), + /// The cursor is positioned inside a covered range. + CONTAINED(true, true); + + public static final int APPLICABLE_BEFORE = 1 << 0; + public static final int APPLICABLE_AFTER = 1 << 1; + + /// Whether the set contains the positions before the cursor's in forward iteration order. + final boolean applicableBefore; + /// Whether the set contains the positions after the cursor's in iteration order, starting with the specific + /// node and the children of the current node. + final boolean applicableAfter; + + RangeState(boolean applicableBefore, boolean applicableAfter) + { + this.applicableBefore = applicableBefore; + this.applicableAfter = applicableAfter; + } + + /// Whether the positions preceding the current in iteration order are included in the set. + public boolean precedingIncluded(Direction direction) + { + return direction.select(applicableBefore, applicableAfter); + } + + /// Whether the positions following the current in iteration order are included in the set. + public boolean succeedingIncluded(Direction direction) + { + return direction.select(applicableAfter, applicableBefore); + } + + /// Whether the current position is a range boundary. + public boolean isBoundary() + { + return applicableBefore != applicableAfter; + } + + public RangeState toContent() + { + return isBoundary() ? this : null; + } + + /// Return an "intersection" state for the combination of two states, i.e. the ranges covered by both states. + public RangeState intersect(RangeState other) + { + return values()[ordinal() & other.ordinal()]; + } + + /// Return a "union" state for the combination of two states, i.e. 
the ranges covered by at least one of the states. + public RangeState union(RangeState other) + { + return values()[ordinal() | other.ordinal()]; + } + + /// Return the negated state, i.e. the state that corresponds to flipped areas of coverage to the left + /// and right, thus also exchanging start and end boundaries. See [Negated] for more details. + public RangeState negation() + { + return values()[ordinal() ^ (APPLICABLE_BEFORE | APPLICABLE_AFTER)]; + } + + public static RangeState fromProperties(boolean applicableBefore, boolean applicableAfter) + { + return values()[(applicableBefore ? APPLICABLE_BEFORE : 0) | + (applicableAfter ? APPLICABLE_AFTER : 0)]; + } + + public static RangeState fromPropertiesAsRangeState(boolean applicableBefore, boolean applicableAfter) + { + int index = (applicableBefore ? APPLICABLE_BEFORE : 0) | + (applicableAfter ? APPLICABLE_AFTER : 0); + if (index == 0) + return null; + return values()[index]; + } + + // Implementations of methods from the general RangeState interface (used to treat sets as range tries) + + @Override + public RangeState precedingState(Direction direction) + { + return precedingIncluded(direction) ? CONTAINED : null; + } + + @Override + public RangeState succedingState(Direction direction) + { + return succeedingIncluded(direction) ? CONTAINED : null; + } + + // Note: this method is not used by the set code, only by the RangeCursor interpretation of it. It will return + // null instead of NOT_CONTAINED as expected for Range trie states. + @Override + public RangeState restrict(boolean applicableBefore, boolean applicableAfter) + { + return fromPropertiesAsRangeState(this.applicableBefore && applicableBefore, + this.applicableAfter && applicableAfter); + } + + // Note: this method is not used by the set code, only by the RangeCursor interpretation of it. It will return + // null instead of NOT_CONTAINED as expected for Range trie states. 
+ @Override + public RangeState asBoundary(Direction direction) + { + final boolean isForward = direction.isForward(); + return fromPropertiesAsRangeState(this.applicableBefore && !isForward, + this.applicableAfter && isForward); + } + + + public > + S applyToCoveringState(S srcState) + { + switch (this) + { + case START: + return srcState.asBoundary(Direction.FORWARD); + case END: + return srcState.asBoundary(Direction.REVERSE); + case CONTAINED: + return srcState; + case NOT_CONTAINED: + return null; + default: + throw new AssertionError(); + } + } + } + + /// The range state of the trie cursor at this point. + RangeState state(); + + /// Returns whether the set includes the positions before the current in iteration order, but after any earlier + /// position of this cursor, including any position requested by a [#skipTo] call, where this cursor advanced beyond + /// that position. + default boolean precedingIncluded() + { + return state().precedingIncluded(direction()); + } + + @Override + default RangeState content() + { + return state().toContent(); + } + + @Override + TrieSetCursor tailCursor(Direction direction); + + @Override + default TrieSetCursor precedingStateCursor(Direction direction) + { + if (precedingIncluded()) // preceding in the direction of this cursor + return RangesCursor.full(direction, byteComparableVersion()); + else + return null; + } + + /// Returns a negated version of this cursor, covering the complement of the key space. + default TrieSetCursor negated() + { + return new Negated(this); + } + + /// Negation of trie set cursors. + /// + /// Achieved by simply inverting the [#state()] values, but it must also correct the root state, including by + /// adding or dropping a return path to the root state. 
+ class Negated implements TrieSetCursor + { + final TrieSetCursor source; + + enum Overriding + { + NONE, ROOT, ROOT_RETURN, EXHAUSTED + } + Overriding overriding; + + Negated(TrieSetCursor source) + { + this.source = source; + overriding = Overriding.ROOT; + } + + @Override + public long encodedPosition() + { + long encodedPosition = source.encodedPosition(); + switch (overriding) + { + case ROOT_RETURN: + return Cursor.rootReturnPosition(encodedPosition); + case EXHAUSTED: + return Cursor.exhaustedPosition(encodedPosition); + case ROOT: + case NONE: + default: + return encodedPosition; + } + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + return source.byteComparableVersion(); + } + + @Override + public RangeState state() + { + switch (overriding) + { + case ROOT: + if (!source.state().succeedingIncluded(direction())) + return direction().select(RangeState.START, RangeState.END); + else + return RangeState.NOT_CONTAINED; + case ROOT_RETURN: + return direction().select(RangeState.END, RangeState.START); + case EXHAUSTED: + return RangeState.NOT_CONTAINED; + case NONE: + default: + return source.state().negation(); + } + } + + long checkOverride(long encodedPosition) + { + int depth = Cursor.depth(encodedPosition); + if (depth > 0) + { + overriding = Overriding.NONE; + return encodedPosition; + } + else if (depth == 0) + { + // If we are ascending to the root on the return path, it is done to close an active deletion which + // we no longer have. Go directly to exhausted. + assert Cursor.isOnReturnPath(encodedPosition); + overriding = Overriding.EXHAUSTED; + return encodedPosition(); + } + else // depth < 0 + { + // If we went directly to exhausted, we have an active deletion. Insert a root position on the return + // path to close it. 
+ assert Cursor.isExhausted(encodedPosition); + overriding = Overriding.ROOT_RETURN; + return encodedPosition(); + } + } + + @Override + public long advance() + { + switch (overriding) + { + case ROOT_RETURN: + overriding = Overriding.EXHAUSTED; + return encodedPosition(); + default: + return checkOverride(source.advance()); + } + } + + @Override + public long skipTo(long encodedSkipPosition) + { + if (Cursor.isExhausted(encodedSkipPosition) || overriding == Overriding.ROOT_RETURN) + { + overriding = Overriding.EXHAUSTED; + return encodedPosition(); + } + else + return checkOverride(source.skipTo(encodedSkipPosition)); + } + + // Sets don't implement advanceMultiple as they are only meant to limit data tries. + + @Override + public TrieSetCursor tailCursor(Direction direction) + { + assert !Cursor.isOnReturnPath(encodedPosition()) : "tailCursor called on the return path"; + assert !Cursor.isExhausted(encodedPosition()) : "tailCursor on exhausted cursor"; + return new Negated(source.tailCursor(direction)); + } + } +} diff --git a/src/java/org/apache/cassandra/db/tries/TrieTailsIterator.java b/src/java/org/apache/cassandra/db/tries/TrieTailsIterator.java index e15ce6548206..0e10791f84ee 100644 --- a/src/java/org/apache/cassandra/db/tries/TrieTailsIterator.java +++ b/src/java/org/apache/cassandra/db/tries/TrieTailsIterator.java @@ -20,92 +20,246 @@ import java.util.AbstractMap; import java.util.Iterator; import java.util.Map; +import java.util.function.BiFunction; import java.util.function.Predicate; +import com.google.common.base.Predicates; + import org.apache.cassandra.utils.bytecomparable.ByteComparable; -/** - * Iterator of trie entries that constructs tail tries for the content-bearing branches that satisfy the given predicate - * and skips over the returned branches. 
- */ -public abstract class TrieTailsIterator extends TriePathReconstructor implements Iterator +/// Iterator of trie entries that constructs tail tries for the content-bearing branches that satisfy the given predicate +/// and skips over the returned branches. When a branch that satisfies the predicate is found, the iterator calls +/// [#getContent] which in turn refers to a trie-specific flavour of `mapContent`; if the latter returns null, the null +/// will not be returned by the iterator, but the branch will still be skipped. +public abstract class TrieTailsIterator> extends TriePathReconstructor implements Iterator { - final Trie.Cursor cursor; - private final Predicate predicate; - private T next; + final C cursor; + private final Predicate predicate; + private V next; private boolean gotNext; + private boolean started; - protected TrieTailsIterator(Trie trie, Direction direction, Predicate predicate) - { - this.cursor = trie.cursor(direction); - this.predicate = predicate; - assert cursor.depth() == 0; - } - - TrieTailsIterator(Trie.Cursor cursor, Predicate predicate) + TrieTailsIterator(C cursor, Predicate predicate) { this.cursor = cursor; this.predicate = predicate; - assert cursor.depth() == 0; + this.started = false; + cursor.assertFresh(); } public boolean hasNext() { - if (!gotNext) + while (!gotNext) { - int depth = cursor.depth(); - if (depth > 0) + if (started) { // if we are not just starting, we have returned a branch and must skip over it - depth = cursor.skipTo(depth, cursor.incomingTransition() + cursor.direction().increase); - if (depth < 0) - return false; - resetPathLength(depth - 1); - addPathByte(cursor.incomingTransition()); + long pos = cursor.skipTo(Cursor.positionForSkippingBranch(cursor.encodedPosition())); + if (Cursor.isExhausted(pos)) + return done(); + int depth = Cursor.depth(pos); + if (depth > 0) + { + resetPathLength(depth - 1); + addPathByte(Cursor.incomingTransition(pos)); + } + else + resetPathLength(0); } + else + started 
= true; - next = cursor.content(); - if (next != null) - gotNext = predicate.test(next); + boolean gotNextContent = false; + T nextContent = cursor.content(); + if (nextContent != null) + gotNextContent = predicate.test(nextContent); - while (!gotNext) + while (!gotNextContent) { - next = cursor.advanceToContent(this); - if (next != null) - gotNext = predicate.test(next); + nextContent = cursor.advanceToContent(this); + if (nextContent != null) + gotNextContent = predicate.test(nextContent); else - gotNext = true; + return done(); } + + next = getContent(nextContent); + gotNext = next != null; } return next != null; } + private boolean done() + { + gotNext = true; + next = null; + return false; + } + public V next() { gotNext = false; - T v = next; + V v = next; next = null; - return mapContent(v, cursor.tailTrie(), keyBytes, keyPos); + return v; + } + + void skipPreparedNextIf(Predicate shouldSkipPreparedNext) + { + if (gotNext && next != null && shouldSkipPreparedNext.test(next)) + { + gotNext = false; + next = null; + } } + protected abstract V getContent(T v); + ByteComparable.Version byteComparableVersion() { return cursor.byteComparableVersion(); } - protected abstract V mapContent(T value, Trie tailTrie, byte[] bytes, int byteLength); + public static abstract class Plain extends TrieTailsIterator> + { + Plain(Cursor cursor, Predicate predicate) + { + super(cursor, predicate); + } + + /// Public constructor accepting a Trie and creating a cursor from it + public Plain(Trie trie, Predicate predicate) + { + this(trie.cursor(Direction.FORWARD), predicate); + } - /** - * Iterator representing the selected content of the trie a sequence of {@code (path, tail)} pairs, where - * {@code tail} is the branch of the trie rooted at the selected content node (reachable by following - * {@code path}). The tail trie will have the selected content at its root. 
- */ - static class AsEntries extends TrieTailsIterator>> + /// Public constructor accepting a Trie, Direction, and creating a cursor from it + public Plain(Trie trie, Direction direction, Predicate predicate) + { + this(trie.cursor(direction), predicate); + } + + @Override + protected V getContent(T v) + { + // Fix the location of the tail trie source. + Cursor tailCursor = cursor.tailCursor(cursor.direction()); + return mapContent(v, tailCursor::tailCursor, keyBytes, keyPos); + } + + protected abstract V mapContent(T value, Trie tailTrie, byte[] bytes, int byteLength); + } + + public static abstract class Range, V> extends TrieTailsIterator> { - public AsEntries(Trie.Cursor cursor, Class clazz) + Range(RangeCursor cursor, Predicate predicate) + { + super(cursor, predicate); + } + + /// Public constructor accepting a RangeTrie and creating a cursor from it + public Range(RangeTrie trie, Predicate predicate) + { + this(trie.cursor(Direction.FORWARD), predicate); + } + + /// Public constructor accepting a RangeTrie, Direction, and creating a cursor from it + public Range(RangeTrie trie, Direction direction, Predicate predicate) { - super(cursor, clazz::isInstance); + this(trie.cursor(direction), predicate); + } + + @Override + protected V getContent(S v) + { + // Fix the location of the tail trie source. + RangeCursor tailCursor = cursor.tailCursor(cursor.direction()); + return mapContent(v, tailCursor::tailCursor, keyBytes, keyPos); + } + + protected abstract V mapContent(S value, RangeTrie tailTrie, byte[] bytes, int byteLength); + } + + /// Deletion-aware tails iterator that only walks the live data trie and ignores covering deletion branches. + /// To be used in cases where it is known that deletion branches can only start at or below the selected tail + /// positions. 
+ public static abstract class DeletionAwareWithoutCoveringDeletions, V> + extends TrieTailsIterator> + { + DeletionAwareWithoutCoveringDeletions(DeletionAwareCursor cursor, Predicate predicate) + { + super(cursor, predicate); + } + + /// Public constructor accepting a DeletionAwareTrie, Direction, and creating a cursor from it + public DeletionAwareWithoutCoveringDeletions(DeletionAwareTrie trie, Direction direction, Predicate predicate) + { + this(trie.cursor(direction), predicate); + } + + @Override + protected V getContent(T v) + { + // Fix the location of the tail trie source. + DeletionAwareCursor tailCursor = cursor.tailCursor(cursor.direction()); + return mapContent(v, tailCursor::tailCursor, keyBytes, keyPos); + } + + protected abstract V mapContent(T value, DeletionAwareTrie tailTrie, byte[] bytes, int byteLength); + } + + /// General deletion-aware tail trie iterator. Deletion branches are followed, covering deletions are applied to the + /// reported branches, and deletion branch data may be used to select a tail trie. + /// + /// Instead of a predicate, this version takes a mapping function that is given both live and deletion data, and + /// will select a branch when it returns a non-null value. Said value is then passed to [#mapContent]. + /// + /// When `includeCoveringDeletions` is true, this iterator will include deletions that are introduced above the + /// requested points as deletion branches at the roots of the returned tail tries. If it is false, covering + /// deletions will be filtered out from the returned tail tries. + /// + /// Also offers [#stopIssuingDeletions], which allows it to cease reporting data coming from deletion branches. 
+ public static abstract class DeletionAware, V, Q> + extends TrieTailsIterator> + { + final boolean includeCoveringDeletions; + + DeletionAware(DeletionAwareCursor cursor, BiFunction merger, boolean includeCoveringDeletions) + { + super(new DeletionAwareCursor.SwitchableLiveAndDeletionsMergeCursor<>(merger, cursor), Predicates.alwaysTrue()); + this.includeCoveringDeletions = includeCoveringDeletions; + } + + /// Public constructor accepting a DeletionAwareTrie, Direction, and creating a cursor from it + public DeletionAware(DeletionAwareTrie trie, Direction direction, BiFunction merger, boolean includeCoveringDeletions) + { + this(trie.cursor(direction), merger, includeCoveringDeletions); + } + + @Override + protected Q getContent(V v) + { + return mapContent(v, cursor.deletionAwareTail(includeCoveringDeletions), keyBytes, keyPos); + } + + public void stopIssuingDeletions(Predicate shouldSkipPreparedNext) + { + skipPreparedNextIf(shouldSkipPreparedNext); + cursor.stopIssuingDeletions(this); + } + + protected abstract Q mapContent(V value, DeletionAwareTrie tailTrie, byte[] bytes, int byteLength); + } + + /// Iterator representing the selected content of the trie as a sequence of `(path, tail)` pairs, where + /// `tail` is the branch of the trie rooted at the selected content node (reachable by following + /// `path`). The tail trie will have the selected content at its root. + static class AsEntries extends Plain>> + { + public AsEntries(Cursor cursor, Predicate predicate) + { + super(cursor, predicate); } @Override @@ -115,4 +269,51 @@ protected Map.Entry> mapContent(T value, Trie return new AbstractMap.SimpleImmutableEntry<>(key, tailTrie); } } + + /// Iterator representing the selected content of the trie as a sequence of `(path, tail)` pairs, where + /// `tail` is the branch of the trie rooted at the selected content node (reachable by following + /// `path`). The tail trie will have the selected content at its root.
+ static class AsEntriesRange> + extends Range>> + { + public AsEntriesRange(RangeCursor cursor, Predicate predicate) + { + super(cursor, predicate); + } + + @Override + protected Map.Entry> mapContent(S value, RangeTrie tailTrie, byte[] bytes, int byteLength) + { + ByteComparable.Preencoded key = toByteComparable(byteComparableVersion(), bytes, byteLength); + return new AbstractMap.SimpleImmutableEntry<>(key, tailTrie); + } + } + + /// Iterator representing the selected content of the trie as a sequence of `(path, tail)` pairs, where + /// `tail` is the branch of the trie rooted at the selected content node (reachable by following + /// `path`). The tail trie will have the selected content at its root. + /// + /// When `includeCoveringDeletions` is true, this iterator will include deletions that are introduced above the + /// requested points as deletion branches at the roots of the returned tail tries. If it is false, covering + /// deletions will be filtered out from the returned tail tries. + static class AsEntriesDeletionAware> + extends DeletionAware>> + { + public AsEntriesDeletionAware(DeletionAwareCursor cursor, + Predicate predicate, + boolean includeCoveringDeletions) + { + super(cursor, (t, d) -> predicate.test(t) ? t : null, includeCoveringDeletions); + } + + @Override + protected Map.Entry> mapContent(T value, + DeletionAwareTrie tailTrie, + byte[] bytes, + int byteLength) + { + ByteComparable.Preencoded key = toByteComparable(byteComparableVersion(), bytes, byteLength); + return new AbstractMap.SimpleImmutableEntry<>(key, tailTrie); + } + } } diff --git a/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java b/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java index 0a99c3ff0b99..b56fd324357e 100644 --- a/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java +++ b/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java @@ -19,19 +19,17 @@ import java.util.Iterator; -/** - * Ordered iterator of trie content.
- */ +/// Ordered iterator of trie content. class TrieValuesIterator implements Iterator { - private final Trie.Cursor cursor; + private final Cursor cursor; T next; boolean gotNext; - protected TrieValuesIterator(Trie.Cursor cursor) + protected TrieValuesIterator(Cursor cursor) { this.cursor = cursor; - assert cursor.depth() == 0; + cursor.assertFresh(); next = cursor.content(); gotNext = next != null; } @@ -60,16 +58,16 @@ public T next() static class FilteredByType implements Iterator { - private final Trie.Cursor cursor; + private final Cursor cursor; T next; boolean gotNext; Class clazz; - FilteredByType(Trie.Cursor cursor, Class clazz) + FilteredByType(Cursor cursor, Class clazz) { this.cursor = cursor; this.clazz = clazz; - assert cursor.depth() == 0; + cursor.assertFresh(); next = cursor.content(); gotNext = next != null && clazz.isInstance(next); } diff --git a/src/java/org/apache/cassandra/db/tries/VerificationCursor.java b/src/java/org/apache/cassandra/db/tries/VerificationCursor.java new file mode 100644 index 000000000000..c4fc85426b5a --- /dev/null +++ b/src/java/org/apache/cassandra/db/tries/VerificationCursor.java @@ -0,0 +1,592 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.util.Arrays; +import java.util.Objects; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; + +import org.agrona.DirectBuffer; +import org.apache.cassandra.utils.Hex; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +interface VerificationCursor +{ + /// Returns a directed version of the incoming transition, including the return path bit and an overflow bit so that + /// a 0x100 value (which can be the result of applying [Cursor#positionForSkippingBranch]) can be correctly returned. + @VisibleForTesting + static int undecodedTransition(long encodedPosition) + { + return (int) (encodedPosition >> (Cursor.TRANSITION_SHIFT - 1)) & 0x3FF; + } + + /// Verifies: + /// - `advance` does advance, `depth <= prevDepth + 1` and transition is higher than previous at the same depth + /// (this requires path tracking) + /// - `skipTo` is not called with earlier or equal position (including lower levels) + /// - `maybeSkipTo` is not called with earlier position that can't be identified with depth/incomingTransition only + /// (i.e.
seeks to lower depth with an incoming transition that is lower than the previous at that depth) + /// - exhausted state matches `Cursor.exhaustedPosition(direction)` + /// - start state matches `Cursor.rootPosition(direction)` + class Plain> implements Cursor, Cursor.TransitionsReceiver + { + static + { + try + { + assert false; + throw new IllegalStateException("Assertions need to be turned on for verification cursors."); + } + catch (AssertionError e) + { + // correct path + } + } + + final Direction direction; + final C source; + long returnedPosition; + byte[] path; + + Cursor.TransitionsReceiver chainedReceiver = null; + boolean advanceMultipleCalledReceiver; + + Plain(C cursor) + { + this.direction = cursor.direction(); + this.source = cursor; + this.returnedPosition = Cursor.rootPosition(direction); + this.path = new byte[16]; + long reportedPosition = source.encodedPosition(); + assert Cursor.direction(reportedPosition) == direction : + String.format("Invalid direction bit %d in root position %s (%016x)\n%s", + (reportedPosition >> DIRECTION_BIT) & 1, + Cursor.toString(reportedPosition), + reportedPosition, + this); + assert Cursor.compare(reportedPosition, returnedPosition) == 0 : + String.format("Invalid initial position %s (must be %s)\n%s", + Cursor.toString(reportedPosition), + Cursor.toString(returnedPosition), + this); + } + + @Override + public long encodedPosition() + { + assert Cursor.compare(source.encodedPosition(), returnedPosition) == 0 : + String.format("Position changed without advance: %s -> %s\n%s", + Cursor.toString(returnedPosition), + Cursor.toString(source.encodedPosition()), + this); + return returnedPosition; + } + + @Override + public T content() + { + assert !Cursor.isExhausted(returnedPosition) : + String.format("Cannot query content on exhausted cursor.\n%s", + this); + + return source.content(); + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + return source.byteComparableVersion(); + } + + @Override +
public long advance() + { + return verify(source.advance()); + } + + @Override + public long advanceMultiple(Cursor.TransitionsReceiver receiver) + { + advanceMultipleCalledReceiver = false; + chainedReceiver = receiver; + // Note: if the code below calls the receiver (us), returnedPosition will be adjusted to reflect descent. + long position = source.advanceMultiple(this); + chainedReceiver = null; + int depth = Cursor.depth(position); + int prevDepth = Cursor.depth(returnedPosition); + assert !advanceMultipleCalledReceiver || depth == prevDepth + 1 : + String.format("advanceMultiple returned depth %s did not match depth %s after added characters\n%s", + depth, + prevDepth + 1, + this); + return verify(position); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + verifySkipRequest(encodedSkipPosition); + long newPosition = source.skipTo(encodedSkipPosition); + assert Cursor.compare(newPosition, encodedSkipPosition) >= 0 || + (Cursor.isExhausted(encodedSkipPosition) && Cursor.isExhausted(newPosition)) : + String.format("Skip advanced to a position %s before seek target %s\n%s", + Cursor.toString(newPosition), + Cursor.toString(encodedSkipPosition), + this); + return verify(newPosition); + } + + private void verifySkipRequest(long encodedSkipPosition) + { + int skipDepth = Cursor.depth(encodedSkipPosition); + int currDepth = Cursor.depth(returnedPosition); + assert skipDepth <= currDepth + 1 : + String.format("Skip descends more than one level: %s -> %s\n%s", + Cursor.toString(returnedPosition), + Cursor.toString(encodedSkipPosition), + this); + int skipTransition = undecodedTransition(encodedSkipPosition); + if (skipDepth <= currDepth && skipDepth > 0) + assert ((getByte(skipDepth) ^ direction.select(0x00, 0xFF)) << 1) < skipTransition : + String.format("Skip goes backwards to %s where it already visited byte %s\n%s", + Cursor.toString(encodedSkipPosition), + getByte(skipDepth), + this); + + } + + private long verify(long newPosition) + { + int 
newDepth = Cursor.depth(newPosition); + int oldDepth = Cursor.depth(returnedPosition); + assert newDepth <= oldDepth + 1 : + String.format("Cursor advanced more than one level: %s -> %s\n%s", + Cursor.toString(returnedPosition), + Cursor.toString(newPosition), + this); + assert Cursor.direction(newPosition) == direction : + String.format("Invalid direction bit %d in position %s (%016x)\n%s", + (newPosition >> DIRECTION_BIT) & 1, + Cursor.toString(newPosition), + newPosition, + this); + + if (Cursor.isExhausted(newPosition)) + { + assert Cursor.compare(newPosition, Cursor.exhaustedPosition(direction)) == 0 : + String.format("Cursor exhausted state should be %s but was %s\n%s", + Cursor.toString(Cursor.exhaustedPosition(direction)), + Cursor.toString(newPosition), + this); + } + else if (newDepth == 0) + { + // For range/set tries it is possible to ascend back to the root on the return path. + assert Cursor.isOnReturnPath(newPosition) : "Ascend to depth 0 is only possible on the return path"; + assert Cursor.incomingTransition(newPosition) == 0 : "Invalid incoming transition " + Cursor.incomingTransition(newPosition) + " for depth 0"; + } + else + { + if (newDepth <= oldDepth) + { + assert ((getByte(newDepth) ^ direction.select(0x00, 0xFF)) << 1) + < undecodedTransition(newPosition) : + String.format("Cursor went backwards to %s where it already visited byte %s\n%s", + Cursor.toString(newPosition), + getByte(newDepth), + this); + } + int undecodedTransition = undecodedTransition(newPosition) >> 1; + assert undecodedTransition >= 0 && undecodedTransition <= 0xFF : + String.format("Cursor returned invalid incoming transition as %s (%016x)\n%s", + Cursor.toString(newPosition), + newPosition, + this); + addByte(newPosition); + } + returnedPosition = newPosition; + return newPosition; + } + + @Override + @SuppressWarnings("unchecked") + public Plain tailCursor(Direction direction) + { + return new Plain<>((C) source.tailCursor(direction)); + } + + + @Override + public 
void addPathByte(int nextByte) + { + advanceMultipleCalledReceiver = true; + returnedPosition = Cursor.positionForDescentWithByte(returnedPosition, nextByte); + addByte(returnedPosition); + + if (chainedReceiver != null) + chainedReceiver.addPathByte(nextByte); + } + + private void addByte(long asPosition) + { + addByte(Cursor.incomingTransition(asPosition), Cursor.depth(asPosition)); + } + + private void addByte(int nextByteEncoded, int depth) + { + int index = depth - 1; + if (index >= path.length) + path = Arrays.copyOf(path, path.length * 2); + path[index] = (byte) nextByteEncoded; + } + + private int getByte(int depth) + { + return path[depth - 1] & 0xFF; + } + + @Override + public void addPathBytes(DirectBuffer buffer, int pos, int count) + { + advanceMultipleCalledReceiver = true; + for (int i = 0; i < count; ++i) + { + int nextByte = buffer.getByte(pos + i) & 0xFF; + returnedPosition = Cursor.positionForDescentWithByte(returnedPosition, nextByte); + addByte(returnedPosition); + } + if (chainedReceiver != null) + chainedReceiver.addPathBytes(buffer, pos, count); + } + + @Override + public String toString() + { + StringBuilder builder = new StringBuilder(); + builder.append(source.getClass().getTypeName() + .replace(source.getClass().getPackageName() + '.', "")); + builder.append(" pos "); + builder.append(Cursor.toString(returnedPosition)); + if (Cursor.isExhausted(returnedPosition)) + { + builder.append(" exhausted"); + } + else + { + builder.append(" at "); + builder.append(Hex.bytesToHex(path, 0, Cursor.depth(returnedPosition))); + } + return builder.toString(); + } + } + + abstract class WithRanges, C extends RangeCursor> + extends Plain + implements RangeCursor + { + S currentPrecedingState; + S nextPrecedingState; + + WithRanges(C source) + { + super(source); + + currentPrecedingState = verifyCoveringStateProperties(source.precedingState()); + assert currentPrecedingState == null : + String.format("Cursor starts with non-null preceeding state %s\n%s", 
+ currentPrecedingState, + this); + final S content = source.content(); + nextPrecedingState = content != null ? verifyBoundaryStateProperties(content).succedingState(direction) + : currentPrecedingState; + } + + void verifyEndState() + { + // We cannot be carrying an open covering state when exhausted (open-ended sets/ranges must close the range + // by stopping on the root in the return path). + assert currentPrecedingState == null : + String.format("Cursor ends with non-null covering state %s\n%s", + currentPrecedingState, + this); + } + + @Override + public long advance() + { + currentPrecedingState = nextPrecedingState; + return verifyState(super.advance()); + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + currentPrecedingState = nextPrecedingState; + return verifyState(super.advanceMultiple(receiver)); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + return verifySkipState(super.skipTo(encodedSkipPosition)); + } + + @Override + public S precedingState() + { + assert currentPrecedingState == source.precedingState() || + currentPrecedingState != null && currentPrecedingState.equals(source.precedingState()) : + String.format("Preceding state changed without advance: %s -> %s.\n%s", + currentPrecedingState, source.precedingState(), + this); + return currentPrecedingState; + } + + @Override + public S state() + { + S returnedState = source.state(); + if (Cursor.isExhausted(returnedPosition)) + assert returnedState == null : + String.format("Non-null state on exhausted cursor: %s.\n%s", + returnedState, + this); + + return returnedState; + } + + boolean agree(S left, S right) + { + return Objects.equals(left, right); + } + + private long verifyState(long position) + { + if (Cursor.isExhausted(position)) + verifyEndState(); + else + { + S precedingState = source.precedingState(); + boolean equal = agree(currentPrecedingState, precedingState); + assert equal : String.format("Unexpected change to covering state: 
%s -> %s\n%s", + currentPrecedingState, precedingState, this); + currentPrecedingState = precedingState; + + S content = source.content(); + if (content != null) + { + assert agree(currentPrecedingState, content.precedingState(direction)) : + String.format("Range end %s does not close covering state %s\n%s", + content.precedingState(direction), currentPrecedingState, this); + verifyBoundaryStateProperties(content); + nextPrecedingState = content.succedingState(direction); + } + } + + return position; + } + + private long verifySkipState(long encodedSkipPosition) + { + // The covering state information is invalidated by a skip. + if (!Cursor.isExhausted(encodedSkipPosition)) + { + currentPrecedingState = verifyCoveringStateProperties(source.precedingState()); + nextPrecedingState = currentPrecedingState; + } + else + currentPrecedingState = nextPrecedingState = null; + + return verifyState(encodedSkipPosition); + } + + S verifyCoveringStateProperties(S state) + { + if (state == null) + return null; + assert !state.isBoundary() : + String.format("Boundary state %s was returned where a covering state was expected\n%s", + state, + this); + final S precedingState = state.precedingState(Direction.FORWARD); + final S succeedingState = state.succedingState(Direction.FORWARD); + assert precedingState == state && succeedingState == state : + String.format("State %s must return itself its preceding and succeeding state (returned %s/%s)\n%s", + state, + precedingState, + succeedingState, + this); + return state; + } + + S verifyBoundaryStateProperties(S state) + { + if (state == null) + return null; + assert state.isBoundary() : + String.format("Covering state %s was returned where a boundary state was expected\n%s", + state, + this); + final S precedingState = state.precedingState(Direction.FORWARD); + final S succeedingState = state.succedingState(Direction.FORWARD); + verifyCoveringStateProperties(precedingState); + verifyCoveringStateProperties(succeedingState); + return 
state; + } + + + @Override + public abstract WithRanges tailCursor(Direction direction); + + @Override + public String toString() + { + return super.toString() + (Cursor.isExhausted(returnedPosition) ? "" : " state " + state()); + } + } + + + class Range> extends WithRanges> implements RangeCursor + { + Range(RangeCursor source) + { + super(source); + assert currentPrecedingState == null : + String.format("Initial preceding state %s should be null for range cursor\n%s", + currentPrecedingState, this); + } + + @Override + void verifyEndState() + { + assert currentPrecedingState == null : + String.format("End state %s should be null for range cursor\n%s", + currentPrecedingState, this); + } + + @Override + public Range tailCursor(Direction direction) + { + return new Range<>(source.tailCursor(direction)); + } + } + + class TrieSet extends WithRanges implements TrieSetCursor + { + TrieSet(TrieSetCursor source) + { + super(source); + // NOTE(review): the WithRanges constructor asserts a null preceding state and verifyEndState() is not overridden here, so sets get no exemption for non-null start/end preceding states - confirm this is intended + } + + @Override + public TrieSetCursor.RangeState state() + { + return Preconditions.checkNotNull(source.state()); + } + + @Override + public TrieSet tailCursor(Direction direction) + { + return new TrieSet(source.tailCursor(direction)); + } + } + + class DeletionAware> + extends VerificationCursor.Plain> + implements DeletionAwareCursor + { + int deletionBranchDepth; + + DeletionAware(DeletionAwareCursor source) + { + super(source); + this.deletionBranchDepth = -1; + verifyDeletionBranch(0); + } + + @Override + public long advance() + { + return verifyDeletionBranch(super.advance()); + } + + @Override + public long advanceMultiple(TransitionsReceiver receiver) + { + return verifyDeletionBranch(super.advanceMultiple(receiver)); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + return verifyDeletionBranch(super.skipTo(encodedSkipPosition)); + } + + @Override + public RangeCursor deletionBranchCursor(Direction direction) + { + // deletionBranch is already verified + final 
RangeCursor deletionBranch = source.deletionBranchCursor(direction); + if (deletionBranch == null) + return null; + return new Range<>(deletionBranch); + } + + long verifyDeletionBranch(long position) + { + int depth = Cursor.depth(position); + if (depth <= deletionBranchDepth) + deletionBranchDepth = -1; + + var deletionBranch = source.deletionBranchCursor(direction); + if (deletionBranch != null) + { + assert deletionBranchDepth == -1 : + String.format("Deletion branch at position %s covered by another deletion branch at parent depth %s\n%s", + Cursor.toString(position), + deletionBranchDepth, + this); + assert Cursor.compare(deletionBranch.encodedPosition(), Cursor.rootPosition(direction)) == 0 : + String.format("Invalid deletion branch initial position %s\n%s", + Cursor.toString(deletionBranch.encodedPosition()), + this); + assert deletionBranch.precedingState() == null : + String.format("Deletion branch starts with active deletion %s\n%s", + deletionBranch.precedingState(), + this); + deletionBranch.skipTo(Cursor.exhaustedPosition(direction)); + assert deletionBranch.precedingState() == null : + String.format("Deletion branch ends with active deletion %s\n%s", + deletionBranch.precedingState(), + this); + deletionBranchDepth = Cursor.depth(position); + } + return position; + } + + @Override + public DeletionAware tailCursor(Direction direction) + { + return new DeletionAware<>(source.tailCursor(direction)); + } + } +} diff --git a/src/java/org/apache/cassandra/index/SecondaryIndexManager.java b/src/java/org/apache/cassandra/index/SecondaryIndexManager.java index 697709c37add..3944e6fcce85 100644 --- a/src/java/org/apache/cassandra/index/SecondaryIndexManager.java +++ b/src/java/org/apache/cassandra/index/SecondaryIndexManager.java @@ -1644,6 +1644,10 @@ public boolean handles(IndexTransaction.Type type) private static final class WriteTimeTransaction implements UpdateTransaction { private final Index.Indexer[] indexers; + final Row.Builder toRemove = 
BTreeRow.sortedBuilder(); + final Row.Builder toInsert = BTreeRow.sortedBuilder(); + boolean rowUpdated = false; + boolean rowExisted = true; private WriteTimeTransaction(Index.Indexer... indexers) { @@ -1660,32 +1664,28 @@ public void start() public void onPartitionDeletion(DeletionTime deletionTime) { + maybeCompleteRow(); for (Index.Indexer indexer : indexers) indexer.partitionDelete(deletionTime); } public void onRangeTombstone(RangeTombstone tombstone) { + maybeCompleteRow(); for (Index.Indexer indexer : indexers) indexer.rangeTombstone(tombstone); } public void onInserted(Row row) { + maybeCompleteRow(); for (Index.Indexer indexer : indexers) indexer.insertRow(row); } public void onUpdated(Row existing, Row updated) { - final Row.Builder toRemove = BTreeRow.sortedBuilder(); - toRemove.newRow(existing.clustering()); - toRemove.addPrimaryKeyLivenessInfo(existing.primaryKeyLivenessInfo()); - toRemove.addRowDeletion(existing.deletion()); - final Row.Builder toInsert = BTreeRow.sortedBuilder(); - toInsert.newRow(updated.clustering()); - toInsert.addPrimaryKeyLivenessInfo(updated.primaryKeyLivenessInfo()); - toInsert.addRowDeletion(updated.deletion()); + startRow(existing.clustering(), existing.primaryKeyLivenessInfo(), existing.deletion(), updated.primaryKeyLivenessInfo(), updated.deletion()); // diff listener collates the columns to be added & removed from the indexes RowDiffListener diffListener = new RowDiffListener() { @@ -1703,22 +1703,73 @@ public void onComplexDeletion(int i, Clustering clustering, ColumnMetadata colum public void onCell(int i, Clustering clustering, Cell merged, Cell original) { - if (merged != null && !merged.equals(original)) - toInsert.addCell(merged); - - if (merged == null || (original != null && shouldCleanupOldValue(original, merged))) - toRemove.addCell(original); + onCellUpdate(original, merged); } }; Rows.diff(diffListener, updated, existing); - Row oldRow = toRemove.build(); - Row newRow = toInsert.build(); - for (Index.Indexer 
indexer : indexers) - indexer.updateRow(oldRow, newRow); + maybeCompleteRow(); + } + + private void maybeCompleteRow() + { + if (rowUpdated) + { + if (rowExisted) + { + Row oldRow = toRemove.build(); + Row newRow = toInsert.build(); + for (Index.Indexer indexer : indexers) + indexer.updateRow(oldRow, newRow); + } + else + { + Row newRow = toInsert.build(); + for (Index.Indexer indexer : indexers) + indexer.insertRow(newRow); + } + rowUpdated = false; + } + } + + public void startRow(Clustering clustering, + LivenessInfo existingLiveness, + Row.Deletion existingDeletion, + LivenessInfo updatedLiveness, + Row.Deletion updatedDeletion) + { + maybeCompleteRow(); + toInsert.newRow(clustering); + toInsert.addPrimaryKeyLivenessInfo(updatedLiveness); + toInsert.addRowDeletion(updatedDeletion); + if (existingLiveness != null) + { + toRemove.newRow(clustering); + toRemove.addPrimaryKeyLivenessInfo(existingLiveness); + toRemove.addRowDeletion(existingDeletion); + rowExisted = true; + } + else + rowExisted = false; + rowUpdated = true; + } + + public void onCellUpdate(Cell original, Cell merged) + { + if (merged != null && !merged.equals(original)) + toInsert.addCell(merged); + + if (rowExisted && merged == null || (original != null && shouldCleanupOldValue(original, merged))) + toRemove.addCell(original); + } + + public void onComplexColumnDeletion(ColumnMetadata column, DeletionTime deletionTime) + { + toInsert.addComplexDeletion(column, deletionTime); } public void commit() { + maybeCompleteRow(); for (Index.Indexer indexer : indexers) indexer.finish(); } diff --git a/src/java/org/apache/cassandra/index/sai/memory/RowMapping.java b/src/java/org/apache/cassandra/index/sai/memory/RowMapping.java index a0bc124c4ae7..8f4777bbf84c 100644 --- a/src/java/org/apache/cassandra/index/sai/memory/RowMapping.java +++ b/src/java/org/apache/cassandra/index/sai/memory/RowMapping.java @@ -22,6 +22,8 @@ import java.util.Iterator; import java.util.List; +import 
com.google.common.base.Predicates; + import org.apache.cassandra.db.compaction.OperationType; import org.apache.cassandra.db.rows.RangeTombstoneMarker; import org.apache.cassandra.db.rows.Row; @@ -66,7 +68,8 @@ public int size() } }; - private final InMemoryTrie rowMapping = InMemoryTrie.shortLived(TypeUtil.BYTE_COMPARABLE_VERSION); + private final InMemoryTrie rowMapping = InMemoryTrie.shortLivedOrdered(TypeUtil.BYTE_COMPARABLE_VERSION); + private final InMemoryTrie.Mutator rowMappingMutator = rowMapping.mutator((ex, ne) -> ne, Predicates.alwaysFalse()); private volatile boolean complete = false; @@ -168,7 +171,7 @@ public void add(PrimaryKey key, long sstableRowId) throws TrieSpaceExhaustedExce int segmentRowId = (int) sstableRowId; ByteComparable byteComparable = v -> key.asComparableBytes(v); - rowMapping.putSingleton(byteComparable, segmentRowId, (existing, neww) -> neww); + rowMappingMutator.putSingleton(byteComparable, segmentRowId); maxSegmentRowId = Math.max(maxSegmentRowId, segmentRowId); diff --git a/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java b/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java index 4e7b36f84bd5..ae39fb6d1842 100644 --- a/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java +++ b/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java @@ -124,7 +124,7 @@ public TrieMemoryIndex(IndexContext indexContext, Memtable memtable, AbstractBou this.primaryKeysAccumulator = new PrimaryKeysAccumulator(primaryKeysHeapAllocations); this.primaryKeysRemover = new PrimaryKeysRemover(primaryKeysHeapAllocations); this.analyzerTransformsValue = indexContext.getAnalyzerFactory().create().transformValue(); - this.data = InMemoryTrie.longLived(TypeUtil.byteComparableVersionForTermsData(indexContext.version()), TrieMemtable.BUFFER_TYPE, indexContext.columnFamilyStore().readOrdering()); + this.data = 
InMemoryTrie.longLivedOrdered(TypeUtil.byteComparableVersionForTermsData(indexContext.version()), TrieMemtable.BUFFER_TYPE, indexContext.columnFamilyStore().readOrdering()); this.memtable = memtable; } @@ -829,7 +829,7 @@ private Trie getSubtrie(@Nullable Expression expression) upperInclusive = false; } - return data.subtrie(lowerBound, lowerInclusive, upperBound, upperInclusive); + return data.slice(lowerBound, lowerInclusive, upperBound, upperInclusive); } @Override diff --git a/src/java/org/apache/cassandra/index/sai/plan/QueryController.java b/src/java/org/apache/cassandra/index/sai/plan/QueryController.java index 971703d49ed6..711b8fb4bbe3 100644 --- a/src/java/org/apache/cassandra/index/sai/plan/QueryController.java +++ b/src/java/org/apache/cassandra/index/sai/plan/QueryController.java @@ -39,6 +39,8 @@ import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.Lists; import com.google.common.collect.Multimap; + +import org.apache.cassandra.db.CellSourceIdentifier; import org.apache.cassandra.db.Clustering; import org.apache.cassandra.index.FeatureNeedsIndexRebuildException; import org.apache.cassandra.index.sai.disk.format.Version; @@ -89,7 +91,7 @@ import org.apache.cassandra.index.sai.utils.AbortedOperationException; import org.apache.cassandra.index.sai.utils.PrimaryKey; import org.apache.cassandra.index.sai.utils.PrimaryKeyWithSortKey; -import org.apache.cassandra.index.sai.utils.RowWithSourceTable; +import org.apache.cassandra.index.sai.utils.RowWithSource; import org.apache.cassandra.index.sai.utils.RangeUtil; import org.apache.cassandra.index.sai.utils.TypeUtil; import org.apache.cassandra.io.sstable.format.SSTableReader; @@ -347,18 +349,18 @@ public UnfilteredRowIterator getPartition(PrimaryKey key, ColumnFamilyStore.View SinglePartitionReadCommand partition = getPartitionReadCommand(key, executionController); // Class to transform the row to include its source table. 
- Function>> rowTransformer = (Object sourceTable) -> new Transformation<>() + Function>> rowTransformer = (CellSourceIdentifier sourceTable) -> new Transformation<>() { @Override protected Row applyToStatic(Row row) { - return new RowWithSourceTable(row, sourceTable); + return new RowWithSource(row, sourceTable); } @Override protected Row applyToRow(Row row) { - return new RowWithSourceTable(row, sourceTable); + return new RowWithSource(row, sourceTable); } }; diff --git a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java index c2278ac69ff1..e5f3e937c593 100644 --- a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java +++ b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java @@ -37,6 +37,7 @@ import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Iterables; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -857,7 +858,7 @@ public PrimaryKeyIterator(UnfilteredRowIterator partition, // Clone the original Row Row originalRow = (Row) content; ArrayList columnData = new ArrayList<>(originalRow.columnCount() + 1); - columnData.addAll(originalRow.columnData()); + Iterables.addAll(columnData, originalRow); // inject +score as a new column float score = ((PrimaryKeyWithScore) primaryKeyWithSortKey).getExactScore(orderer, originalRow); diff --git a/src/java/org/apache/cassandra/index/sai/utils/CellWithSource.java b/src/java/org/apache/cassandra/index/sai/utils/CellWithSource.java index 891058c24c94..2d6fdc2dc305 100644 --- a/src/java/org/apache/cassandra/index/sai/utils/CellWithSource.java +++ b/src/java/org/apache/cassandra/index/sai/utils/CellWithSource.java @@ -27,8 +27,6 @@ import org.apache.cassandra.db.memtable.Memtable; import org.apache.cassandra.db.rows.Cell; import org.apache.cassandra.db.rows.CellPath; -import 
org.apache.cassandra.db.rows.ColumnData; -import org.apache.cassandra.db.rows.ComplexColumnData; import org.apache.cassandra.io.sstable.SSTableId; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.utils.ObjectSizes; @@ -147,6 +145,12 @@ public Cell withSkippedValue() return wrapIfNew(cell.withSkippedValue()); } + @Override + public Cell withPath(CellPath path) + { + return wrapIfNew(cell.withPath(path)); + } + @Override public Cell clone(ByteBufferCloner cloner) { @@ -159,6 +163,12 @@ public int dataSize() return cell.dataSize(); } + @Override + public int liveDataSize(long nowInSec) + { + return cell.liveDataSize(nowInSec); + } + @Override public long unsharedHeapSizeExcludingData() { @@ -190,16 +200,9 @@ public void digest(Digest digest) } @Override - public ColumnData updateAllTimestamp(long newTimestamp) + public Cell updateAllTimestamp(long newTimestamp) { - ColumnData maybeNewCell = cell.updateAllTimestamp(newTimestamp); - if (maybeNewCell instanceof Cell) - return wrapIfNew((Cell) maybeNewCell); - if (maybeNewCell instanceof ComplexColumnData) - return ((ComplexColumnData) maybeNewCell).transform(this::wrapIfNew); - // It's not clear when we would hit this code path, but it seems we should not - // hit this from SAI. - throw new IllegalStateException("Expected a Cell instance, but got " + maybeNewCell); + return wrapIfNew(cell.updateAllTimestamp(newTimestamp)); } @Override @@ -223,8 +226,7 @@ public Cell purgeDataOlderThan(long timestamp) @Override public int localDeletionTimeAsUnsignedInt() { - // Cannot call cell's localDeletionTimeAsUnsignedInt() because it's protected. 
- throw new UnsupportedOperationException(); + return cell.localDeletionTimeAsUnsignedInt(); } @Override @@ -239,12 +241,6 @@ public long minTimestamp() return cell.minTimestamp(); } - @Override - public int liveDataSize(long nowInSec) - { - return cell.liveDataSize(nowInSec); - } - private Cell wrapIfNew(Cell maybeNewCell) { if (maybeNewCell == null) diff --git a/src/java/org/apache/cassandra/index/sai/utils/CellWithSourceTable.java b/src/java/org/apache/cassandra/index/sai/utils/CellWithSourceTable.java deleted file mode 100644 index 041abded0496..000000000000 --- a/src/java/org/apache/cassandra/index/sai/utils/CellWithSourceTable.java +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.cassandra.index.sai.utils; - -import java.nio.ByteBuffer; - -import org.apache.cassandra.db.DeletionPurger; -import org.apache.cassandra.db.Digest; -import org.apache.cassandra.db.marshal.ValueAccessor; -import org.apache.cassandra.db.rows.Cell; -import org.apache.cassandra.db.rows.CellPath; -import org.apache.cassandra.db.rows.ColumnData; -import org.apache.cassandra.db.rows.ComplexColumnData; -import org.apache.cassandra.schema.ColumnMetadata; -import org.apache.cassandra.utils.memory.ByteBufferCloner; - -/** - * A wrapped {@link Cell} that includes a reference to the cell's source table. - * @param the type of the cell's value - */ -public class CellWithSourceTable extends Cell -{ - private final Cell cell; - private final Object sourceTable; - - public CellWithSourceTable(Cell cell, Object sourceTable) - { - super(cell.column()); - this.cell = cell; - this.sourceTable = sourceTable; - } - - public Object sourceTable() - { - return sourceTable; - } - - @Override - public boolean isCounterCell() - { - return cell.isCounterCell(); - } - - @Override - public T value() - { - return cell.value(); - } - - @Override - public ValueAccessor accessor() - { - return cell.accessor(); - } - - @Override - public long timestamp() - { - return cell.timestamp(); - } - - @Override - public int ttl() - { - return cell.ttl(); - } - - @Override - public long localDeletionTime() - { - return cell.localDeletionTime(); - } - - @Override - public boolean isTombstone() - { - return cell.isTombstone(); - } - - @Override - public boolean isExpiring() - { - return cell.isExpiring(); - } - - @Override - public boolean isLive(long nowInSec) - { - return cell.isLive(nowInSec); - } - - @Override - public CellPath path() - { - return cell.path(); - } - - @Override - public Cell withUpdatedColumn(ColumnMetadata newColumn) - { - return wrapIfNew(cell.withUpdatedColumn(newColumn)); - } - - @Override - public Cell withUpdatedValue(ByteBuffer newValue) - { - return 
wrapIfNew(cell.withUpdatedValue(newValue)); - } - - @Override - public Cell withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, long newLocalDeletionTime) - { - return wrapIfNew(cell.withUpdatedTimestampAndLocalDeletionTime(newTimestamp, newLocalDeletionTime)); - } - - @Override - public Cell withSkippedValue() - { - return wrapIfNew(cell.withSkippedValue()); - } - - @Override - public Cell clone(ByteBufferCloner cloner) - { - return wrapIfNew(cell.clone(cloner)); - } - - @Override - public int dataSize() - { - return cell.dataSize(); - } - - @Override - public int liveDataSize(long nowInSec) - { - return cell.liveDataSize(nowInSec); - } - - @Override - public long unsharedHeapSizeExcludingData() - { - return cell.unsharedHeapSizeExcludingData(); - } - - @Override - public long unsharedHeapSize() - { - return cell.unsharedHeapSize(); - } - - @Override - public void validate() - { - cell.validate(); - } - - @Override - public boolean hasInvalidDeletions() - { - return cell.hasInvalidDeletions(); - } - - @Override - public void digest(Digest digest) - { - cell.digest(digest); - } - - @Override - public ColumnData updateAllTimestamp(long newTimestamp) - { - var maybeNewCell = cell.updateAllTimestamp(newTimestamp); - if (maybeNewCell instanceof Cell) - return wrapIfNew((Cell) maybeNewCell); - if (maybeNewCell instanceof ComplexColumnData) - return ((ComplexColumnData) maybeNewCell).transform(this::wrapIfNew); - // It's not clear when we would hit this code path, but it seems we should not - // hit this from SAI. 
- throw new IllegalStateException("Expected a Cell instance, but got " + maybeNewCell); - } - - @Override - public Cell markCounterLocalToBeCleared() - { - return wrapIfNew(cell.markCounterLocalToBeCleared()); - } - - @Override - public Cell purge(DeletionPurger purger, long nowInSec) - { - return wrapIfNew(cell.purge(purger, nowInSec)); - } - - @Override - public Cell purgeDataOlderThan(long timestamp) - { - return wrapIfNew(cell.purgeDataOlderThan(timestamp)); - } - - @Override - public int localDeletionTimeAsUnsignedInt() - { - return cell.localDeletionTimeAsUnsignedInt(); - } - - @Override - public long maxTimestamp() - { - return cell.maxTimestamp(); - } - - @Override - public long minTimestamp() - { - return cell.minTimestamp(); - } - - private Cell wrapIfNew(Cell maybeNewCell) - { - if (maybeNewCell == null) - return null; - // If the cell's method returned a reference to the same cell, then - // we can skip creating a new wrapper. - if (maybeNewCell == this.cell) - return this; - return new CellWithSourceTable<>(maybeNewCell, sourceTable); - } -} diff --git a/src/java/org/apache/cassandra/index/sai/utils/ComplexColumnWithSource.java b/src/java/org/apache/cassandra/index/sai/utils/ComplexColumnWithSource.java new file mode 100644 index 000000000000..d5c51e77b978 --- /dev/null +++ b/src/java/org/apache/cassandra/index/sai/utils/ComplexColumnWithSource.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.sai.utils; + +import java.util.Iterator; + +import com.google.common.collect.Iterators; + +import org.apache.cassandra.db.CellSourceIdentifier; +import org.apache.cassandra.db.DeletionPurger; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.Digest; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellPath; +import org.apache.cassandra.db.rows.ColumnData; +import org.apache.cassandra.db.rows.ComplexColumnData; +import org.apache.cassandra.utils.BiLongAccumulator; +import org.apache.cassandra.utils.LongAccumulator; +import org.apache.cassandra.utils.memory.Cloner; + +public class ComplexColumnWithSource extends ComplexColumnData +{ + private final ComplexColumnData wrapped; + private final CellSourceIdentifier sourceTable; + + public ComplexColumnWithSource(ComplexColumnData wrapped, CellSourceIdentifier sourceTable) + { + super(wrapped.column()); + this.wrapped = wrapped; + this.sourceTable = sourceTable; + } + + @Override + public int dataSize() + { + return wrapped.dataSize(); + } + + @Override + public int liveDataSize(long nowInSec) + { + return wrapped.liveDataSize(nowInSec); + } + + @Override + public long unsharedHeapSize() + { + return wrapped.unsharedHeapSize(); + } + + @Override + public long unsharedHeapSizeExcludingData() + { + return wrapped.unsharedHeapSizeExcludingData(); + } + + @Override + public void validate() + { + wrapped.validate(); + } + + @Override + public boolean hasInvalidDeletions() + { + return 
wrapped.hasInvalidDeletions(); + } + + @Override + public void digest(Digest digest) + { + wrapped.digest(digest); + } + + /** Clones the wrapped column data and re-wraps the copy so it keeps its source identifier. */ + @Override + public ColumnData clone(Cloner cloner) + { + return wrapIfNew((ComplexColumnData) wrapped.clone(cloner)); + } + + @Override + public ComplexColumnData updateAllTimestamp(long newTimestamp) + { + return wrapIfNew(((ComplexColumnData) wrapped.updateAllTimestamp(newTimestamp))); + } + + @Override + public ComplexColumnData markCounterLocalToBeCleared() + { + return wrapIfNew(((ComplexColumnData) wrapped.markCounterLocalToBeCleared())); + } + + @Override + public boolean hasCells() + { + return wrapped.hasCells(); + } + + @Override + public int cellsCount() + { + return wrapped.cellsCount(); + } + + private Cell wrapCell(Cell c) + { + return c != null ? new CellWithSource<>(c, sourceTable) : null; + } + + @Override + public Cell getCell(CellPath path) + { + return wrapCell(wrapped.getCell(path)); + } + + @Override + public Cell getCellByIndex(int idx) + { + return wrapCell(wrapped.getCellByIndex(idx)); + } + + @Override + public DeletionTime complexDeletion() + { + return wrapped.complexDeletion(); + } + + @Override + public Iterator> iterator() + { + return Iterators.transform(wrapped.iterator(), this::wrapCell); + } + + @Override + public Iterator> reverseIterator() + { + return Iterators.transform(wrapped.reverseIterator(), this::wrapCell); + } + + @Override + public long accumulate(LongAccumulator> accumulator, long initialValue) + { + return wrapped.accumulate((cell, v) -> accumulator.apply(wrapCell(cell), v), initialValue); + } + + @Override + public long accumulate(BiLongAccumulator> accumulator, A arg, long initialValue) + { + return wrapped.accumulate((a, cell, v) -> accumulator.apply(a, wrapCell(cell), v), arg, initialValue); + } + + @Override + public ComplexColumnData purge(DeletionPurger purger, long nowInSec) + { + return wrapIfNew(wrapped.purge(purger, nowInSec)); + } + + @Override + public ComplexColumnData purgeDataOlderThan(long timestamp) + { + return 
wrapIfNew(wrapped.purgeDataOlderThan(timestamp)); + } + + @Override + public long maxTimestamp() + { + return wrapped.maxTimestamp(); + } + + @Override + public long minTimestamp() + { + return wrapped.minTimestamp(); + } + + private ComplexColumnData wrapIfNew(ComplexColumnData maybeNewCell) + { + if (maybeNewCell == null) + return null; + // If the source's method returned a reference to the same source, then + // we can skip creating a new wrapper. + if (maybeNewCell == this.wrapped) + return this; + return new ComplexColumnWithSource(maybeNewCell, sourceTable); + } +} diff --git a/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeyWithSortKey.java b/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeyWithSortKey.java index 73de00c309c7..3e49dc232712 100644 --- a/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeyWithSortKey.java +++ b/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeyWithSortKey.java @@ -77,13 +77,13 @@ public boolean isIndexDataValid(Row row, long nowInSecs) return false; // Check if the row is wrapped and if not, skip the source table check - if (!(cell instanceof CellWithSourceTable)) + if (!(cell instanceof CellWithSource)) { // If the cell is not wrapped, we can't validate the source table, // so we just check if the index data matches the live data return isIndexDataEqualToLiveData(cell.buffer()); } - return sourceTable.equals(((CellWithSourceTable) cell).sourceTable()) + return sourceTable.equals(((CellWithSource) cell).sourceTable()) && isIndexDataEqualToLiveData(cell.buffer()); } diff --git a/src/java/org/apache/cassandra/index/sai/utils/RowWithSource.java b/src/java/org/apache/cassandra/index/sai/utils/RowWithSource.java index 4b796e712e91..928b38c32b78 100644 --- a/src/java/org/apache/cassandra/index/sai/utils/RowWithSource.java +++ b/src/java/org/apache/cassandra/index/sai/utils/RowWithSource.java @@ -19,13 +19,11 @@ package org.apache.cassandra.index.sai.utils; import java.util.Collection; -import 
java.util.Comparator; import java.util.Iterator; import java.util.function.BiConsumer; import java.util.function.Consumer; import java.util.function.Function; -import com.google.common.collect.Collections2; import com.google.common.collect.Iterables; import com.google.common.collect.Iterators; @@ -48,7 +46,6 @@ import org.apache.cassandra.utils.BiLongAccumulator; import org.apache.cassandra.utils.LongAccumulator; import org.apache.cassandra.utils.ObjectSizes; -import org.apache.cassandra.utils.SearchIterator; import org.apache.cassandra.utils.memory.Cloner; /** @@ -136,6 +133,12 @@ public boolean isEmpty() return row.isEmpty(); } + @Override + public boolean isEmptyAfterDeletion() + { + return row.isEmptyAfterDeletion(); + } + @Override public String toString(TableMetadata metadata) { @@ -181,43 +184,18 @@ public Iterable> cells() return Iterables.transform(row.cells(), this::wrapCell); } - @Override - public Collection columnData() - { - return Collections2.transform(row.columnData(), this::wrapColumnData); - } - - @Override - public Iterable> cellsInLegacyOrder(TableMetadata metadata, boolean reversed) - { - return Iterables.transform(row.cellsInLegacyOrder(metadata, reversed), this::wrapCell); - } - @Override public boolean hasComplexDeletion() { return row.hasComplexDeletion(); } - @Override - public boolean hasComplex() - { - return row.hasComplex(); - } - @Override public boolean hasDeletion(long nowInSec) { return row.hasDeletion(nowInSec); } - @Override - public SearchIterator searchIterator() - { - SearchIterator iterator = row.searchIterator(); - return key -> wrapColumnData(iterator.next(key)); - } - @Override public Row filter(ColumnFilter filter, TableMetadata metadata) { @@ -231,15 +209,9 @@ public Row filter(ColumnFilter filter, DeletionTime activeDeletion, boolean setA } @Override - public Row transformAndFilter(LivenessInfo info, Deletion deletion, Function function) - { - return maybeWrapRow(row.transformAndFilter(info, deletion, function)); - } - 
- @Override - public Row transformAndFilter(Function function) + public Row transformAndFilter(Function infoFunction, CellTransformer function) { - return maybeWrapRow(row.transformAndFilter(function)); + return maybeWrapRow(row.transformAndFilter(infoFunction, function)); } @Override @@ -248,12 +220,6 @@ public Row clone(Cloner cloner) return maybeWrapRow(row.clone(cloner)); } - @Override - public Row purgeDataOlderThan(long timestamp, boolean enforceStrictLiveness) - { - return maybeWrapRow(row.purgeDataOlderThan(timestamp, enforceStrictLiveness)); - } - @Override public Row purge(DeletionPurger purger, long nowInSec, boolean enforceStrictLiveness) { @@ -266,6 +232,12 @@ public Row withOnlyQueriedData(ColumnFilter filter) return maybeWrapRow(row.withOnlyQueriedData(filter)); } + @Override + public Row purgeDataOlderThan(long timestamp, boolean enforceStrictLiveness) + { + return maybeWrapRow(row.purgeDataOlderThan(timestamp, enforceStrictLiveness)); + } + @Override public Row markCounterLocalToBeCleared() { @@ -320,6 +292,18 @@ public String toString(TableMetadata metadata, boolean includeClusterKeys, boole return row.toString(metadata, includeClusterKeys, fullDetails); } + @Override + public long minTimestamp() + { + return row.minTimestamp(); + } + + @Override + public long maxTimestamp() + { + return row.maxTimestamp(); + } + @Override public void apply(Consumer function) { @@ -338,12 +322,6 @@ public long accumulate(LongAccumulator accumulator, long initialValu return row.accumulate(accumulator, initialValue); } - @Override - public long accumulate(LongAccumulator accumulator, Comparator comparator, ColumnData from, long initialValue) - { - return row.accumulate(accumulator, comparator, from, initialValue); - } - @Override public long accumulate(BiLongAccumulator accumulator, A arg, long initialValue) { @@ -351,9 +329,9 @@ public long accumulate(BiLongAccumulator accumulator, A arg, } @Override - public long accumulate(BiLongAccumulator accumulator, A arg, 
Comparator comparator, ColumnData from, long initialValue) + public Row mergeWith(Row updateAsRow) { - return row.accumulate(accumulator, arg, comparator, from, initialValue); + return maybeWrapRow(row.mergeWith(updateAsRow)); } @Override @@ -369,7 +347,7 @@ private ColumnData wrapColumnData(ColumnData c) if (c instanceof Cell) return new CellWithSource<>((Cell) c, source); if (c instanceof ComplexColumnData) - return ((ComplexColumnData) c).transform(c1 -> new CellWithSource<>(c1, source)); + return new ComplexColumnWithSource((ComplexColumnData) c, source); throw new IllegalStateException("Unexpected ColumnData type: " + c.getClass().getName()); } @@ -395,16 +373,4 @@ public String toString() ", source=" + source + '}'; } - - @Override - public long minTimestamp() - { - return row.minTimestamp(); - } - - @Override - public long maxTimestamp() - { - return row.maxTimestamp(); - } } diff --git a/src/java/org/apache/cassandra/index/sai/utils/RowWithSourceTable.java b/src/java/org/apache/cassandra/index/sai/utils/RowWithSourceTable.java deleted file mode 100644 index cacebfb280d4..000000000000 --- a/src/java/org/apache/cassandra/index/sai/utils/RowWithSourceTable.java +++ /dev/null @@ -1,405 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.cassandra.index.sai.utils; - -import java.util.Collection; -import java.util.Comparator; -import java.util.Iterator; -import java.util.function.BiConsumer; -import java.util.function.Consumer; -import java.util.function.Function; - -import com.google.common.collect.Collections2; -import com.google.common.collect.Iterables; -import com.google.common.collect.Iterators; - -import org.apache.cassandra.db.Clustering; -import org.apache.cassandra.db.DeletionPurger; -import org.apache.cassandra.db.DeletionTime; -import org.apache.cassandra.db.Digest; -import org.apache.cassandra.db.LivenessInfo; -import org.apache.cassandra.db.filter.ColumnFilter; -import org.apache.cassandra.db.rows.Cell; -import org.apache.cassandra.db.rows.CellPath; -import org.apache.cassandra.db.rows.ColumnData; -import org.apache.cassandra.db.rows.ComplexColumnData; -import org.apache.cassandra.db.rows.Row; -import org.apache.cassandra.schema.ColumnMetadata; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.utils.BiLongAccumulator; -import org.apache.cassandra.utils.LongAccumulator; -import org.apache.cassandra.utils.ObjectSizes; -import org.apache.cassandra.utils.SearchIterator; -import org.apache.cassandra.utils.memory.Cloner; - -/** - * A Row wrapper that has a source object that gets added to cell as part of the getCell call. This can only be used - * validly when all the cells share a common source object. 
- */ -public class RowWithSourceTable implements Row -{ - private static final long EMPTY_SIZE = ObjectSizes.measure(new RowWithSourceTable(null, null)); - - private final Row row; - private final Object source; - - public RowWithSourceTable(Row row, Object source) - { - this.row = row; - this.source = source; - } - - @Override - public Kind kind() - { - return row.kind(); - } - - @Override - public Clustering clustering() - { - return row.clustering(); - } - - @Override - public void digest(Digest digest) - { - row.digest(digest); - } - - @Override - public void validateData(TableMetadata metadata) - { - row.validateData(metadata); - } - - @Override - public boolean hasInvalidDeletions() - { - return row.hasInvalidDeletions(); - } - - @Override - public Collection columns() - { - return row.columns(); - } - - @Override - public int columnCount() - { - return row.columnCount(); - } - - @Override - public Deletion deletion() - { - return row.deletion(); - } - - @Override - public LivenessInfo primaryKeyLivenessInfo() - { - return row.primaryKeyLivenessInfo(); - } - - @Override - public boolean isStatic() - { - return row.isStatic(); - } - - @Override - public boolean isEmpty() - { - return row.isEmpty(); - } - - @Override - public String toString(TableMetadata metadata) - { - return row.toString(metadata); - } - - @Override - public boolean hasLiveData(long nowInSec, boolean enforceStrictLiveness) - { - return row.hasLiveData(nowInSec, enforceStrictLiveness); - } - - @Override - public Cell getCell(ColumnMetadata c) - { - var cell = row.getCell(c); - if (cell == null) - return null; - return new CellWithSourceTable<>(cell, source); - } - - @Override - public Cell getCell(ColumnMetadata c, CellPath path) - { - return wrapCell(row.getCell(c, path)); - } - - @Override - public ComplexColumnData getComplexColumnData(ColumnMetadata c) - { - return (ComplexColumnData) wrapColumnData(row.getComplexColumnData(c)); - } - - @Override - public ColumnData 
getColumnData(ColumnMetadata c) - { - return wrapColumnData(row.getColumnData(c)); - } - - @Override - public Iterable> cells() - { - return Iterables.transform(row.cells(), this::wrapCell); - } - - @Override - public Collection columnData() - { - return Collections2.transform(row.columnData(), this::wrapColumnData); - } - - @Override - public Iterable> cellsInLegacyOrder(TableMetadata metadata, boolean reversed) - { - return Iterables.transform(row.cellsInLegacyOrder(metadata, reversed), this::wrapCell); - } - - @Override - public boolean hasComplexDeletion() - { - return row.hasComplexDeletion(); - } - - @Override - public boolean hasComplex() - { - return row.hasComplex(); - } - - @Override - public boolean hasDeletion(long nowInSec) - { - return row.hasDeletion(nowInSec); - } - - @Override - public SearchIterator searchIterator() - { - var iterator = row.searchIterator(); - return key -> wrapColumnData(iterator.next(key)); - } - - @Override - public Row filter(ColumnFilter filter, TableMetadata metadata) - { - return maybeWrapRow(row.filter(filter, metadata)); - } - - @Override - public Row filter(ColumnFilter filter, DeletionTime activeDeletion, boolean setActiveDeletionToRow, TableMetadata metadata) - { - return maybeWrapRow(row.filter(filter, activeDeletion, setActiveDeletionToRow, metadata)); - } - - @Override - public Row transformAndFilter(LivenessInfo info, Deletion deletion, Function function) - { - return maybeWrapRow(row.transformAndFilter(info, deletion, function)); - } - - @Override - public Row transformAndFilter(Function function) - { - return maybeWrapRow(row.transformAndFilter(function)); - } - - @Override - public Row clone(Cloner cloner) - { - return maybeWrapRow(row.clone(cloner)); - } - - @Override - public Row purge(DeletionPurger purger, long nowInSec, boolean enforceStrictLiveness) - { - return maybeWrapRow(row.purge(purger, nowInSec, enforceStrictLiveness)); - } - - @Override - public Row withOnlyQueriedData(ColumnFilter filter) - { - 
return maybeWrapRow(row.withOnlyQueriedData(filter)); - } - - @Override - public Row purgeDataOlderThan(long timestamp, boolean enforceStrictLiveness) - { - return maybeWrapRow(row.purgeDataOlderThan(timestamp, enforceStrictLiveness)); - } - - @Override - public Row markCounterLocalToBeCleared() - { - return maybeWrapRow(row.markCounterLocalToBeCleared()); - } - - @Override - public Row updateAllTimestamp(long newTimestamp) - { - return maybeWrapRow(row.updateAllTimestamp(newTimestamp)); - } - - @Override - public Row withRowDeletion(DeletionTime deletion) - { - return maybeWrapRow(row.withRowDeletion(deletion)); - } - - @Override - public int dataSize() - { - return row.dataSize(); - } - - @Override - public int liveDataSize(long nowInSec) - { - return row.liveDataSize(nowInSec); - } - - @Override - public long unsharedHeapSizeExcludingData() - { - return row.unsharedHeapSizeExcludingData() + EMPTY_SIZE; - } - - @Override - public String toString(TableMetadata metadata, boolean fullDetails) - { - return row.toString(metadata, fullDetails); - } - - @Override - public long unsharedHeapSize() - { - return row.unsharedHeapSize(); - } - - @Override - public String toString(TableMetadata metadata, boolean includeClusterKeys, boolean fullDetails) - { - return row.toString(metadata, includeClusterKeys, fullDetails); - } - - @Override - public long minTimestamp() - { - return row.minTimestamp(); - } - - @Override - public long maxTimestamp() - { - return row.maxTimestamp(); - } - - @Override - public void apply(Consumer function) - { - row.apply(function); - } - - @Override - public void apply(BiConsumer function, A arg) - { - row.apply(function, arg); - } - - @Override - public long accumulate(LongAccumulator accumulator, long initialValue) - { - return row.accumulate(accumulator, initialValue); - } - - @Override - public long accumulate(LongAccumulator accumulator, Comparator comparator, ColumnData from, long initialValue) - { - return row.accumulate(accumulator, 
comparator, from, initialValue); - } - - @Override - public long accumulate(BiLongAccumulator accumulator, A arg, long initialValue) - { - return row.accumulate(accumulator, arg, initialValue); - } - - @Override - public long accumulate(BiLongAccumulator accumulator, A arg, Comparator comparator, ColumnData from, long initialValue) - { - return row.accumulate(accumulator, arg, comparator, from, initialValue); - } - - @Override - public Iterator iterator() - { - return Iterators.transform(row.iterator(), this::wrapColumnData); - } - - private ColumnData wrapColumnData(ColumnData c) - { - if (c == null) - return null; - if (c instanceof Cell) - return new CellWithSourceTable<>((Cell) c, source); - if (c instanceof ComplexColumnData) - return ((ComplexColumnData) c).transform(c1 -> new CellWithSourceTable<>(c1, source)); - throw new IllegalStateException("Unexpected ColumnData type: " + c.getClass().getName()); - } - - private Cell wrapCell(Cell c) - { - return c != null ? new CellWithSourceTable<>(c, source) : null; - } - - private Row maybeWrapRow(Row r) - { - if (r == null) - return null; - if (r == this.row) - return this; - return new RowWithSourceTable(r, source); - } - - @Override - public String toString() - { - return "RowWithSourceTable{" + - row + - ", source=" + source + - '}'; - } -} diff --git a/src/java/org/apache/cassandra/index/transactions/UpdateTransaction.java b/src/java/org/apache/cassandra/index/transactions/UpdateTransaction.java index b97de9c86598..0770fd883970 100644 --- a/src/java/org/apache/cassandra/index/transactions/UpdateTransaction.java +++ b/src/java/org/apache/cassandra/index/transactions/UpdateTransaction.java @@ -18,58 +18,74 @@ package org.apache.cassandra.index.transactions; +import javax.annotation.Nullable; + +import org.apache.cassandra.db.Clustering; import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.LivenessInfo; import org.apache.cassandra.db.RangeTombstone; +import 
org.apache.cassandra.db.rows.Cell; import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.schema.ColumnMetadata; -/** - * Handling of index updates on the write path. - * - * Instances of an UpdateTransaction are scoped to a single partition update - * A new instance is used for every write, obtained from the - * newUpdateTransaction(PartitionUpdate) method. Likewise, a single - * CleanupTransaction instance is used for each partition processed during a - * compaction or cleanup. - * - * We make certain guarantees about the lifecycle of each UpdateTransaction - * instance. Namely that start() will be called before any other method, and - * commit() will be called at the end of the update. - * Each instance is initialized with 1..many Index.Indexer instances, one per - * registered Index. As with the transaction itself, these are scoped to a - * specific partition update, so implementations can be assured that all indexing - * events they receive relate to the same logical operation. - * - * onPartitionDelete(), onRangeTombstone(), onInserted() and onUpdated() - * calls may arrive in any order, but this should have no impact for the - * Indexers being notified as any events delivered to a single instance - * necessarily relate to a single partition. - * - * The typical sequence of events during a Memtable update would be: - * start() -- no-op, used to notify Indexers of the start of the transaction - * onPartitionDeletion(dt) -- if the PartitionUpdate implies one - * onRangeTombstone(rt)* -- for each in the PartitionUpdate, if any - * - * then: - * onInserted(row)* -- called for each Row not already present in the Memtable - * onUpdated(existing, updated)* -- called for any Row in the update for where a version was already present - * in the Memtable. It's important to note here that existing is the previous - * row from the Memtable and updated is the final version replacing it. 
It is - * *not* the incoming row, but the result of merging the incoming and existing - * rows. - * commit() -- finally, finish is called when the new Partition is swapped into the Memtable - */ +/// Handling of index updates on the write path. +/// +/// Instances of an `UpdateTransaction` are scoped to a single partition update. +/// A new instance is used for every write, obtained from the +/// `newUpdateTransaction(PartitionUpdate)` method. Likewise, a single +/// `CleanupTransaction` instance is used for each partition processed during a +/// compaction or cleanup. +/// +/// We make certain guarantees about the lifecycle of each UpdateTransaction +/// instance. Namely that `start()` will be called before any other method, and +/// `commit()` will be called at the end of the update. +/// Each instance is initialized with 1..many `Index.Indexer` instances, one per +/// registered `Index`. As with the transaction itself, these are scoped to a +/// specific partition update, so implementations can be assured that all indexing +/// events they receive relate to the same logical operation. +/// +/// `onPartitionDeletion()`, `onRangeTombstone()`, `onInserted()` and `onUpdated()` +/// calls may arrive in any order, but this should have no impact for the +/// Indexers being notified as any events delivered to a single instance +/// necessarily relate to a single partition. +/// +/// The typical sequence of events during a Memtable update would be: +/// - `start()` -- no-op, used to notify Indexers of the start of the transaction +/// - `onPartitionDeletion(dt)` -- if the PartitionUpdate implies one +/// - `onRangeTombstone(rt)`* -- for each in the PartitionUpdate, if any +/// then: +/// - `onInserted(row)`* -- called for each Row not already present in the Memtable +/// - `onUpdated(existing, updated)`* -- called for any Row in the update for which a version was already present +/// in the Memtable. 
It's important to note here that existing is the previous +/// row from the Memtable and updated is the final version replacing it. It is +/// *not* the incoming row, but the result of merging the incoming and existing +/// rows. +/// - `commit()` -- finally, finish is called when the new Partition is swapped into the Memtable +/// +/// If the memtable prefers to supply individual cells rather than rows, the `onInserted`/`onUpdated` calls above +/// are replaced with a sequence of `startRow` plus zero or more calls to `onCellUpdate`. public interface UpdateTransaction extends IndexTransaction { void onPartitionDeletion(DeletionTime deletionTime); void onRangeTombstone(RangeTombstone rangeTombstone); + void onInserted(Row row); - /** - * @param existing the existing row from the Memtable - * @param updated the updated version, which includes the update merged with the existing version - */ + /// @param existing the existing row from the Memtable + /// @param updated the updated version, which includes the update merged with the existing version void onUpdated(Row existing, Row updated); + /// Start a new row, whose content will be given through [#onCellUpdate]. + /// The existing liveness may be null, in which case no existing row was present in the data. 
+ void startRow(Clustering clustering, + @Nullable LivenessInfo existingLiveness, + @Nullable Row.Deletion existingDeletion, + LivenessInfo updatedLiveness, + Row.Deletion updatedDeletion); + + void onCellUpdate(@Nullable Cell original, @Nullable Cell merged); + void onComplexColumnDeletion(ColumnMetadata column, DeletionTime deletionTime); + UpdateTransaction NO_OP = new UpdateTransaction() { public void start(){} @@ -77,6 +93,9 @@ public void onPartitionDeletion(DeletionTime deletionTime){} public void onRangeTombstone(RangeTombstone rangeTombstone){} public void onInserted(Row row){} public void onUpdated(Row existing, Row updated){} + public void startRow(Clustering clustering, LivenessInfo existingLiveness, Row.Deletion existingDeletion, LivenessInfo updatedLiveness, Row.Deletion updatedDeletion){} + public void onCellUpdate(Cell original, Cell merged){} + public void onComplexColumnDeletion(ColumnMetadata column, DeletionTime deletionTime){} public void commit(){} }; } diff --git a/src/java/org/apache/cassandra/io/compress/BufferType.java b/src/java/org/apache/cassandra/io/compress/BufferType.java index 5f75958f9762..5e926e5e2bc3 100644 --- a/src/java/org/apache/cassandra/io/compress/BufferType.java +++ b/src/java/org/apache/cassandra/io/compress/BufferType.java @@ -19,6 +19,8 @@ import java.nio.ByteBuffer; +import org.apache.cassandra.utils.ObjectSizes; + public enum BufferType { ON_HEAP @@ -27,6 +29,11 @@ public ByteBuffer allocate(int size) { return ByteBuffer.allocate(size); } + + public long onHeapSizeWithoutData() + { + return ObjectSizes.HEAP_BUFFER_SHALLOW_SIZE; + } }, OFF_HEAP { @@ -34,9 +41,15 @@ public ByteBuffer allocate(int size) { return ByteBuffer.allocateDirect(size); } + + public long onHeapSizeWithoutData() + { + return ObjectSizes.DIRECT_BUFFER_DEEP_SIZE; + } }; public abstract ByteBuffer allocate(int size); + public abstract long onHeapSizeWithoutData(); public static BufferType typeOf(ByteBuffer buffer) { diff --git 
a/src/java/org/apache/cassandra/io/sstable/format/SortedTableScrubber.java b/src/java/org/apache/cassandra/io/sstable/format/SortedTableScrubber.java index 206abf4a4229..9cfc65221674 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SortedTableScrubber.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SortedTableScrubber.java @@ -51,6 +51,7 @@ import org.apache.cassandra.db.partitions.Partition; import org.apache.cassandra.db.rows.AbstractCell; import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellData; import org.apache.cassandra.db.rows.ColumnData; import org.apache.cassandra.db.rows.ComplexColumnData; import org.apache.cassandra.db.rows.Row; @@ -554,30 +555,26 @@ private Row rebuildTimestamptsForOverflowedRows(Row row) if (sstableVersion.hasUIntDeletionTime()) return row; - LivenessInfo livenessInfo = row.primaryKeyLivenessInfo(); - if (livenessInfo.isExpiring() && livenessInfo.localExpirationTime() >= 0) - { - livenessInfo = livenessInfo.withUpdatedTimestampAndLocalDeletionTime(livenessInfo.timestamp(), livenessInfo.localExpirationTime(), false); - } + return row.transformAndFilter(RowMergingSSTableIterator::rebuildTimestampsForOverflowedLivenessInfo, + RowMergingSSTableIterator::rebuildTimestampsForOverflowedCells) + .clone(HeapCloner.instance); + } - return row.transformAndFilter(livenessInfo, row.deletion(), cd -> { - if (cd.column().isSimple()) - { - Cell cell = (Cell)cd; - return cell.isExpiring() && cell.localDeletionTime() >= 0 - ? cell.withUpdatedTimestampAndLocalDeletionTime(cell.timestamp(), cell.localDeletionTime()) - : cell; - } - else - { - ComplexColumnData complexData = (ComplexColumnData)cd; - return complexData.transformAndFilter(cell -> cell.isExpiring() && cell.localDeletionTime() >= 0 - ? 
cell.withUpdatedTimestampAndLocalDeletionTime(cell.timestamp(), cell.localDeletionTime()) - : cell); - } - }).clone(HeapCloner.instance); + private static LivenessInfo rebuildTimestampsForOverflowedLivenessInfo(LivenessInfo livenessInfo) + { + return (livenessInfo.isExpiring() && livenessInfo.localExpirationTime() >= 0) + ? livenessInfo.withUpdatedTimestampAndLocalDeletionTime(livenessInfo.timestamp(), livenessInfo.localExpirationTime(), false) + : livenessInfo; + } + + private static > C rebuildTimestampsForOverflowedCells(C cell) + { + return cell.isExpiring() && cell.localDeletionTime() >= 0 + ? cell.withUpdatedTimestampAndLocalDeletionTime(cell.timestamp(), cell.localDeletionTime()) + : cell; } + private boolean hasOverflowedLocalExpirationTimeRow(Row next) { if (sstableVersion.hasUIntDeletionTime()) @@ -634,6 +631,20 @@ public FixNegativeLocalDeletionTimeIterator(UnfilteredRowIterator iterator, Outp this.negativeLocalExpirationTimeMetrics = negativeLocalDeletionInfoMetrics; } + private static > C fixCellExpirationTime(C cell) + { + return cell.isExpiring() && cell.localDeletionTime() < 0 + ? cell.withUpdatedTimestampAndLocalDeletionTime(cell.timestamp() + 1, AbstractCell.MAX_DELETION_TIME) + : cell; + } + + private static LivenessInfo fixLivenessInfoExpirationTime(LivenessInfo livenessInfo) + { + return (livenessInfo.isExpiring() && livenessInfo.localExpirationTime() < 0) + ? 
livenessInfo.withUpdatedTimestampAndLocalDeletionTime(livenessInfo.timestamp() + 1, AbstractCell.MAX_DELETION_TIME) + : livenessInfo; + } + @Override public UnfilteredRowIterator wrapped() { @@ -692,26 +703,9 @@ private boolean hasNegativeLocalExpirationTime(Row next) private Unfiltered fixNegativeLocalExpirationTime(Row row) { - LivenessInfo livenessInfo = row.primaryKeyLivenessInfo(); - if (livenessInfo.isExpiring() && livenessInfo.localExpirationTime() == Cell.INVALID_DELETION_TIME) - livenessInfo = livenessInfo.withUpdatedTimestampAndLocalDeletionTime(livenessInfo.timestamp() + 1, AbstractCell.MAX_DELETION_TIME_2038_LEGACY_CAP); - - return row.transformAndFilter(livenessInfo, row.deletion(), cd -> { - if (cd.column().isSimple()) - { - Cell cell = (Cell) cd; - return cell.isExpiring() && cell.localDeletionTime() == Cell.INVALID_DELETION_TIME - ? cell.withUpdatedTimestampAndLocalDeletionTime(cell.timestamp() + 1, AbstractCell.MAX_DELETION_TIME_2038_LEGACY_CAP) - : cell; - } - else - { - ComplexColumnData complexData = (ComplexColumnData) cd; - return complexData.transformAndFilter(cell -> cell.isExpiring() && cell.localDeletionTime() == Cell.INVALID_DELETION_TIME - ? 
cell.withUpdatedTimestampAndLocalDeletionTime(cell.timestamp() + 1, AbstractCell.MAX_DELETION_TIME_2038_LEGACY_CAP) - : cell); - } - }).clone(HeapCloner.instance); + return row.transformAndFilter(FixNegativeLocalDeletionTimeIterator::fixLivenessInfoExpirationTime, + FixNegativeLocalDeletionTimeIterator::fixCellExpirationTime) + .clone(HeapCloner.instance); } } diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java index 286c7c979813..4e52ebf57067 100644 --- a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java +++ b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java @@ -40,6 +40,7 @@ import org.apache.cassandra.db.commitlog.IntervalSet; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellData; import org.apache.cassandra.db.rows.EncodingStats; import org.apache.cassandra.io.ISerializer; import org.apache.cassandra.io.sstable.format.Version; @@ -450,8 +451,8 @@ public void serialize(Version version, StatsMetadata component, DataOutputPlus o out.writeLong(component.maxTimestamp); if (version.hasUIntDeletionTime()) { - out.writeInt(Cell.deletionTimeLongToUnsignedInteger(component.minLocalDeletionTime)); - out.writeInt(Cell.deletionTimeLongToUnsignedInteger(component.maxLocalDeletionTime)); + out.writeInt(CellData.deletionTimeLongToUnsignedInteger(component.minLocalDeletionTime)); + out.writeInt(CellData.deletionTimeLongToUnsignedInteger(component.maxLocalDeletionTime)); } else { @@ -627,8 +628,8 @@ public StatsMetadata deserialize(Version version, DataInputPlus in) throws IOExc long maxLocalDeletionTime; if (version.hasUIntDeletionTime()) { - minLocalDeletionTime = Cell.deletionTimeUnsignedIntegerToLong(in.readInt()); - maxLocalDeletionTime = Cell.deletionTimeUnsignedIntegerToLong(in.readInt()); + minLocalDeletionTime = 
CellData.deletionTimeUnsignedIntegerToLong(in.readInt()); + maxLocalDeletionTime = CellData.deletionTimeUnsignedIntegerToLong(in.readInt()); } else { diff --git a/src/java/org/apache/cassandra/service/pager/PagingState.java b/src/java/org/apache/cassandra/service/pager/PagingState.java index ec43068f3830..1e2212bc9f7e 100644 --- a/src/java/org/apache/cassandra/service/pager/PagingState.java +++ b/src/java/org/apache/cassandra/service/pager/PagingState.java @@ -19,10 +19,14 @@ import java.io.IOException; import java.nio.ByteBuffer; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; import com.google.common.annotations.VisibleForTesting; - +import com.google.common.base.Preconditions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -43,7 +47,14 @@ import static org.apache.cassandra.db.TypeSizes.sizeof; import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt; -import static org.apache.cassandra.utils.ByteBufferUtil.*; +import static org.apache.cassandra.utils.ByteBufferUtil.EMPTY_BYTE_BUFFER; +import static org.apache.cassandra.utils.ByteBufferUtil.bytesToHex; +import static org.apache.cassandra.utils.ByteBufferUtil.readWithShortLength; +import static org.apache.cassandra.utils.ByteBufferUtil.readWithVIntLength; +import static org.apache.cassandra.utils.ByteBufferUtil.serializedSizeWithShortLength; +import static org.apache.cassandra.utils.ByteBufferUtil.serializedSizeWithVIntLength; +import static org.apache.cassandra.utils.ByteBufferUtil.writeWithShortLength; +import static org.apache.cassandra.utils.ByteBufferUtil.writeWithVIntLength; import static org.apache.cassandra.utils.vint.VIntCoding.computeUnsignedVIntSize; import static org.apache.cassandra.utils.vint.VIntCoding.getUnsignedVInt; @@ -386,22 +397,10 @@ public static RowMark create(TableMetadata metadata, Row row, ProtocolVersion pr ByteBuffer mark; if 
(protocolVersion.isSmallerOrEqualTo(ProtocolVersion.V3)) { - // We need to be backward compatible with 2.1/2.2 nodes paging states. Which means we have to send - // the full cellname of the "last" cell in the row we get (since that's how 2.1/2.2 nodes will start after - // that last row if they get that paging state). - Iterator> cells = row.cellsInLegacyOrder(metadata, true).iterator(); - if (!cells.hasNext()) - { - // If the last returned row has no cell, this means in 2.1/2.2 terms that we stopped on the row - // marker. Note that this shouldn't happen if the table is COMPACT STORAGE tables. - assert !metadata.isCompactTable(); - mark = encodeCellName(metadata, row.clustering(), EMPTY_BYTE_BUFFER, null); - } - else - { - Cell cell = cells.next(); - mark = encodeCellName(metadata, row.clustering(), cell.column().name.bytes, cell.column().isComplex() ? cell.path().get(0) : null); - } + // In order to be backwards compatible with 2.x, protocol version 3 writes a cell name and path into + // the mark. However, Cassandra 3.0 and later never read the cell information. + // Since we are no longer compatible with 2.x, it suffices to use an empty cell info. 
+ mark = encodeCellName(metadata, row.clustering(), EMPTY_BYTE_BUFFER, null); } else { diff --git a/src/java/org/apache/cassandra/utils/ObjectSizes.java b/src/java/org/apache/cassandra/utils/ObjectSizes.java index 8eff7f77bba1..b6255faf9e6f 100644 --- a/src/java/org/apache/cassandra/utils/ObjectSizes.java +++ b/src/java/org/apache/cassandra/utils/ObjectSizes.java @@ -41,9 +41,10 @@ public class ObjectSizes Guess.UNSAFE) .build(); - private static final long HEAP_BUFFER_SHALLOW_SIZE = measure(ByteBufferUtil.EMPTY_BYTE_BUFFER); + public static final long HEAP_BUFFER_SHALLOW_SIZE = measure(ByteBufferUtil.EMPTY_BYTE_BUFFER); + private static final long DIRECT_BUFFER_SHALLOW_SIZE = measure(ByteBuffer.allocateDirect(0)); - private static final long DIRECT_BUFFER_DEEP_SIZE = measureDeep(ByteBuffer.allocateDirect(0)); + public static final long DIRECT_BUFFER_DEEP_SIZE = measureDeep(ByteBuffer.allocateDirect(0)); public static final long IPV6_SOCKET_ADDRESS_SIZE = ObjectSizes.measureDeep(new InetSocketAddress(getIpvAddress(16), 42)); @@ -146,7 +147,7 @@ public static long sizeOnHeapExcludingDataOf(ByteBuffer[] array) *

    * Non-respect of those assumptions can lead to an invalid value being returned. * @param buffer the buffer to measure @@ -181,9 +182,9 @@ public static long sizeOnHeapOf(ByteBuffer buffer) *
      *
    • That slabs are always created using: {@code buffer.duplicate().position(start).limit(end)} and not through slice
    • *
    • That the input buffers are not read-only buffers
    • - *
    • That the direct buffers that are not slab are not duplicates
    • + *
    • That the direct buffers that are not slab are not duplicates
    • *
    - * Non-respect of those assumptions can lead to an invalid value being returned. T + * Non-respect of those assumptions can lead to an invalid value being returned. T * @param buffer the buffer to measure * @return the heap memory used by the specified byte buffer excluding the data.. */ diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java index 7b18805373cb..710fcd65ac89 100644 --- a/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java +++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java @@ -20,7 +20,6 @@ import java.nio.ByteBuffer; -import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; /** @@ -37,6 +36,12 @@ public interface ByteComparable */ ByteSource asComparableBytes(Version version); + /// Returns a peekable version of the byte-source. This may require additional wrapping. + default ByteSource.Peekable asPeekableBytes(Version version) + { + return ByteSource.peekable(asComparableBytes(version)); + } + enum Version { LEGACY, @@ -75,13 +80,6 @@ default Preencoded preencode(Version version) // Simple factories used for testing - @VisibleForTesting - static ByteComparable of(String s) - { - // Note: This is not prefix-free - return v -> ByteSource.of(s, v); - } - static ByteComparable of(long value) { return v -> ByteSource.of(value); @@ -92,7 +90,7 @@ static ByteComparable of(int value) return v -> ByteSource.of(value); } - interface Preencoded extends ByteComparable + interface Preencoded extends ByteComparable, Comparable { Version encodingVersion(); @@ -113,6 +111,11 @@ default byte[] asByteComparableArray(Version version) { return asComparableBytes(version).remainingBytesToArray(); } + + default int compareTo(ByteComparable other) + { + return compare(this, other, encodingVersion()); + } } /** @@ -168,6 +171,17 @@ static ByteComparable cut(ByteComparable src, int 
cutoff) return version -> ByteSource.cut(src.asComparableBytes(version), cutoff); } + static ByteComparable skipFirst(ByteComparable src, int bytesToSkip) + { + return version -> + { + ByteSource bsrc = src.asComparableBytes(version); + for (int i = 0; i < bytesToSkip; i++) + bsrc.next(); + return bsrc; + }; + } + /** * Return the length of a byte comparable, not including the terminator byte. */ diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java index 9b73c6d5454b..aa739690278b 100644 --- a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java +++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java @@ -63,7 +63,7 @@ default int nextBytes(byte[] dest) /** Value returned if at the end of the stream. */ int END_OF_STREAM = -1; - ByteSource EMPTY = () -> END_OF_STREAM; + Duplicatable EMPTY = preencoded(new byte[0]); /** * Escape value. Used, among other things, to mark the end of subcomponents (so that shorter compares before anything longer). @@ -248,6 +248,24 @@ static ByteSource variableLengthInteger(long value) return new VariableLengthInteger(value); } + /** + * Produce a source for an unsigned integer, stored using variable length encoding. + * The representation uses between 1 and 9 bytes, is prefix-free and compares + * correctly. + */ + static ByteSource variableLengthUnsignedInteger(long value) + { + return new VariableLengthUnsignedInteger(value); + } + + /** + * Returns the direct concatenation of sources (no separators or terminators are added). + */ + static ByteSource concat(ByteSource... sources) + { + return new Concat(sources); + } + /** * Returns a separator for two byte sources, i.e. something that is definitely > prevMax, and <= currMin, assuming * prevMax < currMin. @@ -719,6 +737,34 @@ public int next() } } + /** + * Direct concatenation of byte sources. 
+ */ + static class Concat implements ByteSource + { + private final ByteSource[] srcs; + private int srcnum = 0; + + Concat(ByteSource[] srcs) + { + this.srcs = srcs; + } + + @Override + public int next() + { + while (true) + { + if (srcnum == srcs.length) + return END_OF_STREAM; + int b = srcs[srcnum].next(); + if (b > END_OF_STREAM) + return b; + ++srcnum; + } + } + } + /** * Construct the shortest common prefix of prevMax and currMin that separates those two byte streams. * If {@code useCurr == true} the last byte of the returned stream comes from {@code currMin} and is the first diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java index a18bf2d2bd2e..0c9643eaa269 100644 --- a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java +++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java @@ -241,6 +241,14 @@ public static long getVariableLengthInteger(ByteSource byteSource) } } + /** + * Decode a variable-length unsigned integer. + */ + public static long getVariableLengthUnsignedInteger(ByteSource byteSource) + { + return getVariableLengthUnsignedIntegerXoring(byteSource, 0); + } + /** * Decode a variable-length unsigned integer, passing all bytes read through XOR with the given xorWith parameter. 
* diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/PreencodedByteComparable.java b/src/java/org/apache/cassandra/utils/bytecomparable/PreencodedByteComparable.java index d14f466dcfce..a10155337652 100644 --- a/src/java/org/apache/cassandra/utils/bytecomparable/PreencodedByteComparable.java +++ b/src/java/org/apache/cassandra/utils/bytecomparable/PreencodedByteComparable.java @@ -35,6 +35,20 @@ public Version encodingVersion() return version; } + @Override + public boolean equals(Object other) + { + if (!(other instanceof ByteComparable)) + return false; + return compareTo((ByteComparable) other) == 0; + } + + @Override + public String toString() + { + return byteComparableAsString(encodingVersion()); + } + static class Array extends PreencodedByteComparable { private final byte[] bytes; diff --git a/test/bin/jmh b/test/bin/jmh index ad8f44bd3c3b..d73e8ab02eee 100755 --- a/test/bin/jmh +++ b/test/bin/jmh @@ -127,8 +127,8 @@ cassandra_parms="$cassandra_parms -XX:+PreserveFramePointer" # Create log directory, some tests require that mkdir -p $CASSANDRA_HOME/logs -if [ ! -f $CASSANDRA_HOME/build/apache-cassandra-*.jar ] ; then - echo "$CASSANDRA_HOME/build/apache-cassandra-*.jar does not exist - execute 'ant jar' first" +if [ ! 
-f $CASSANDRA_HOME/build/dse-db-*.jar ] ; then + echo "$CASSANDRA_HOME/build/dse-db-*.jar does not exist - execute 'ant jar' first" exit 1 fi diff --git a/test/burn/org/apache/cassandra/index/sai/LongBM25Test.java b/test/burn/org/apache/cassandra/index/sai/LongBM25Test.java index 51d33a3c656f..6e07d75abf30 100644 --- a/test/burn/org/apache/cassandra/index/sai/LongBM25Test.java +++ b/test/burn/org/apache/cassandra/index/sai/LongBM25Test.java @@ -32,10 +32,13 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; -import org.junit.Before; +import org.junit.Assert; +import org.junit.BeforeClass; import org.junit.Test; import org.slf4j.Logger; +import org.apache.cassandra.db.memtable.AbstractShardedMemtable; + import static org.apache.cassandra.config.CassandraRelevantProperties.MEMTABLE_SHARD_COUNT; public class LongBM25Test extends SAITester @@ -81,12 +84,13 @@ public class LongBM25Test extends SAITester } KeySet keysInserted = new KeySet(); - private final int threadCount = 12; + private static final int threadCount = 12; - @Before - public void setup() throws Throwable + @BeforeClass + public static void setUpClass() { MEMTABLE_SHARD_COUNT.setInt(4 * threadCount); + SAITester.setUpClass(); } @FunctionalInterface @@ -97,6 +101,7 @@ private interface Op public void testConcurrentOps(Op op) throws ExecutionException, InterruptedException { + Assert.assertEquals(4 * threadCount, AbstractShardedMemtable.getDefaultShardCount()); createTable("CREATE TABLE %s (key int primary key, value text)"); // Create analyzed index following BM25Test pattern createIndex("CREATE CUSTOM INDEX ON %s(value) " + diff --git a/test/burn/org/apache/cassandra/index/sai/LongVectorTest.java b/test/burn/org/apache/cassandra/index/sai/LongVectorTest.java index e0cfad104a55..ee255033d3ca 100644 --- a/test/burn/org/apache/cassandra/index/sai/LongVectorTest.java +++ b/test/burn/org/apache/cassandra/index/sai/LongVectorTest.java @@ -28,10 +28,12 @@ import 
java.util.stream.Collectors; import java.util.stream.IntStream; +import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; import org.slf4j.Logger; +import org.apache.cassandra.db.memtable.TrieMemtable; import org.apache.cassandra.service.reads.thresholds.CoordinatorWarnings; import static org.apache.cassandra.config.CassandraRelevantProperties.MEMTABLE_SHARD_COUNT; @@ -48,9 +50,10 @@ public class LongVectorTest extends SAITester private static final int threadCount = 12; @BeforeClass - public static void setShardCount() + public static void setUpClass() { MEMTABLE_SHARD_COUNT.setInt(4 * threadCount); + SAITester.setUpClass(); } @FunctionalInterface @@ -61,6 +64,7 @@ private interface Op public void testConcurrentOps(Op op) throws ExecutionException, InterruptedException { + Assert.assertEquals(4 * threadCount, TrieMemtable.SHARD_COUNT); createTable(String.format("CREATE TABLE %%s (key int primary key, value vector)", dimension)); createIndex("CREATE CUSTOM INDEX ON %s(value) USING 'StorageAttachedIndex' WITH OPTIONS = { 'similarity_function': 'dot_product' }"); diff --git a/test/conf/cassandra.yaml b/test/conf/cassandra.yaml index f871dac3c4ce..9f90128d1818 100644 --- a/test/conf/cassandra.yaml +++ b/test/conf/cassandra.yaml @@ -91,6 +91,10 @@ memtable: shards: 4 trie_stage1: class_name: TrieMemtableStage1 + trie_stage2: + class_name: TrieMemtableStage2 + trie_stage3: + class_name: TrieMemtableStage3 skiplist_sharded: class_name: ShardedSkipListMemtable parameters: diff --git a/test/microbench/org/apache/cassandra/test/microbench/btree/AtomicBTreePartitionUpdateBench.java b/test/microbench/org/apache/cassandra/test/microbench/btree/AtomicBTreePartitionUpdateBench.java index 20d4897428a3..49c61888f0b7 100644 --- a/test/microbench/org/apache/cassandra/test/microbench/btree/AtomicBTreePartitionUpdateBench.java +++ b/test/microbench/org/apache/cassandra/test/microbench/btree/AtomicBTreePartitionUpdateBench.java @@ -54,12 +54,12 @@ import 
org.apache.cassandra.db.partitions.BTreePartitionData; import org.apache.cassandra.db.partitions.BTreePartitionUpdate; import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.db.rows.BTreeComplexColumn; import org.apache.cassandra.db.rows.BTreeRow; import org.apache.cassandra.db.rows.BufferCell; import org.apache.cassandra.db.rows.Cell; import org.apache.cassandra.db.rows.CellPath; import org.apache.cassandra.db.rows.ColumnData; -import org.apache.cassandra.db.rows.ComplexColumnData; import org.apache.cassandra.db.rows.EncodingStats; import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.db.rows.Rows; @@ -285,7 +285,7 @@ Row complexRow() int mapCount = selectSortAndTransform(complexBuffer, complexPaths, columns[0].cellPathComparator(), complexCell); try (BulkIterator iter = BulkIterator.of(complexBuffer)) { - columnBuffer[0] = ComplexColumnData.unsafeConstruct(columns[0], BTree.build(iter, mapCount, UpdateFunction.noOp()), DeletionTime.LIVE); + columnBuffer[0] = BTreeComplexColumn.unsafeConstruct(columns[0], BTree.build(iter, mapCount, UpdateFunction.noOp()), DeletionTime.LIVE); } return bufferToRow(clusterings[0]); } diff --git a/test/microbench/org/apache/cassandra/test/microbench/instance/ReadBenchBase.java b/test/microbench/org/apache/cassandra/test/microbench/instance/ReadBenchBase.java index c44e1675a500..2e940cc2f38b 100644 --- a/test/microbench/org/apache/cassandra/test/microbench/instance/ReadBenchBase.java +++ b/test/microbench/org/apache/cassandra/test/microbench/instance/ReadBenchBase.java @@ -26,6 +26,7 @@ import java.util.function.Supplier; import com.google.common.base.Throwables; +import org.apache.commons.codec.digest.MurmurHash3; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.CQLTester; @@ -51,6 +52,52 @@ public enum Flush @Param({"INMEM", "YES"}) Flush flush = Flush.INMEM; + @Param({"0"}) + double deletionsRatio = 0; + + @Param({"EQUAL"}) + DeletionSpec 
deletionSpec = DeletionSpec.EQUAL; + + public enum DeletionPattern + { + RANDOM, + FROM_START, + SPREAD; + } + + public enum DeletionSpec + { + EQUAL("picid = ?", 0), + SINGLETON_RANGE("picid >= ? AND picid <= ?", 0, 0), + RANGE_TO_NEXT("picid >= ? AND picid < ?", 0, -1), + RANGE_10("picid >= ? AND picid < ?", 0, 10), + RANGE_FROM_START("picid <= ?", 0); + + final int[] argumentShifts; + final String spec; + + DeletionSpec(String spec, int... argumentShifts) + { + this.argumentShifts = argumentShifts; + this.spec = spec; + } + + Object[] convertArgs(Object[] args, ReadBenchBase test) + { + Object[] result = new Object[1 + argumentShifts.length]; + result[0] = args[0]; + long column = (Long) args[1]; + for (int i = 0; i < argumentShifts.length; ++i) + result[i + 1] = column + (argumentShifts[i] >= 0 ? argumentShifts[i] : test.getDiffToNext() * -argumentShifts[i]); + return result; + } + } + + @Param({"RANDOM"}) + DeletionPattern deletionPattern = DeletionPattern.RANDOM; + + long deletionCount; + @Setup(Level.Trial) public void setup() throws Throwable { @@ -64,6 +111,18 @@ public void setup() throws Throwable performWrite(i, BATCH); if (i < count) performWrite(i, (int) (count - i)); + + deletionCount = Math.min((long) (count * deletionsRatio), count); + if (deletionCount > 0) + { + String deleteStatement = "DELETE FROM " + table + " WHERE userid = ? AND " + deletionSpec.spec; + System.err.println("Deleting " + deletionCount + " using " + deleteStatement); + + for (i = 0; i <= deletionCount - BATCH; i += BATCH) + performDelete(deleteStatement, i, BATCH); + if (i < deletionCount) + performDelete(deleteStatement, i, deletionCount - i); + } long writeLength = System.currentTimeMillis() - writeStart; System.err.format("... 
done in %.3f s.\n", writeLength / 1000.0); @@ -109,6 +168,11 @@ public void setup() throws Throwable } + long getDiffToNext() + { + return 1; + } + public void performWrite(String writeStatement, long ofs, long count) throws Throwable { if (threadCount == 1) @@ -148,6 +212,60 @@ public void performWriteThreads(String writeStatement, long ofs, long count) thr assert count == done; } + long deleteIndex(long index) + { + switch (deletionPattern) + { + case FROM_START: + return index; + case RANDOM: + return Integer.remainderUnsigned(MurmurHash3.hash32(index), count); + case SPREAD: + return index * count / deletionCount; + default: + throw new AssertionError(); + } + } + + public void performDelete(String deleteStatement, long ofs, long count) throws Throwable + { + if (threadCount == 1) + performDeleteSerial(deleteStatement, ofs, count); + else + performDeleteThreads(deleteStatement, ofs, count); + } + + public void performDeleteSerial(String deleteStatement, long ofs, long count) throws Throwable + { + for (long i = ofs; i < ofs + count; ++i) + execute(deleteStatement, deletionSpec.convertArgs(writeArguments(deleteIndex(i)), this)); + } + + public void performDeleteThreads(String deleteStatement, long ofs, long count) throws Throwable + { + List> futures = new ArrayList<>(); + for (long i = 0; i < count; ++i) + { + long pos = ofs + i; + futures.add(executorService.submit(() -> + { + try + { + execute(deleteStatement, deletionSpec.convertArgs(writeArguments(deleteIndex(pos)), ReadBenchBase.this)); + return 1; + } + catch (Throwable throwable) + { + throw Throwables.propagate(throwable); + } + })); + } + long done = 0; + for (Future f : futures) + done += f.get(); + assert count == done; + } + @TearDown(Level.Trial) public void teardown() throws InterruptedException { diff --git a/test/microbench/org/apache/cassandra/test/microbench/instance/ReadWidePartitionsBench.java b/test/microbench/org/apache/cassandra/test/microbench/instance/ReadWidePartitionsBench.java index 
bc46493e792f..82355983ca9f 100644 --- a/test/microbench/org/apache/cassandra/test/microbench/instance/ReadWidePartitionsBench.java +++ b/test/microbench/org/apache/cassandra/test/microbench/instance/ReadWidePartitionsBench.java @@ -33,6 +33,12 @@ public Object[] writeArguments(long i) return new Object[] { i % partitions, i, i }; } + @Override + long getDiffToNext() + { + return partitions; + } + Object[] readArguments(long i, long offset) { return new Object[] { (i + offset) % partitions, i }; diff --git a/test/microbench/org/apache/cassandra/test/microbench/instance/SimpleTableWriter.java b/test/microbench/org/apache/cassandra/test/microbench/instance/SimpleTableWriter.java index fdf515fa381f..fa76644997a6 100644 --- a/test/microbench/org/apache/cassandra/test/microbench/instance/SimpleTableWriter.java +++ b/test/microbench/org/apache/cassandra/test/microbench/instance/SimpleTableWriter.java @@ -80,8 +80,8 @@ public void commonSetup() throws Throwable rand = new Random(1); executorService = Executors.newFixedThreadPool(threadCount); partitions = Math.max(1, count / rowsPerPartition); - DatabaseDescriptor.setAutoSnapshot(false); CQLTester.setUpClass(); + DatabaseDescriptor.setAutoSnapshot(false); logger.info("setupClass done."); String memtableSetup = ""; if (!memtableClass.isEmpty()) diff --git a/test/microbench/org/apache/cassandra/test/microbench/tries/InMemoryTrieReadBench.java b/test/microbench/org/apache/cassandra/test/microbench/tries/InMemoryTrieReadBench.java index 8004d00554f4..d3724d5add66 100644 --- a/test/microbench/org/apache/cassandra/test/microbench/tries/InMemoryTrieReadBench.java +++ b/test/microbench/org/apache/cassandra/test/microbench/tries/InMemoryTrieReadBench.java @@ -104,7 +104,7 @@ class Counter implements Trie.ValueConsumer int sum = 0; @Override - public void accept(Byte aByte) + public void content(Byte aByte) { sum += aByte; } @@ -152,7 +152,7 @@ public Void complete() } } Counter counter = new Counter(); - trie.process(counter, 
direction); + trie.process(direction, counter); return counter.sum; } diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java index c953143212fb..837aeb1579cd 100644 --- a/test/unit/org/apache/cassandra/cql3/CQLTester.java +++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java @@ -44,6 +44,7 @@ import java.util.Optional; import java.util.Random; import java.util.Set; +import java.util.TreeSet; import java.util.UUID; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadLocalRandom; @@ -2284,6 +2285,41 @@ public static void assertRowsIgnoringOrderAndExtra(UntypedResultSet result, Obje assertRowsIgnoringOrderInternal(result, true, rows); } + private static int compareTypedLists(List spec, List a, List b) + { + var itSpec = spec.iterator(); + var itA = a.iterator(); + var itB = b.iterator(); + while (itSpec.hasNext()) + { + if (!itA.hasNext()) + return itB.hasNext() ? -1 : 0; + if (!itB.hasNext()) + return 1; + var ea = itA.next(); + var eb = itB.next(); + var espec = itSpec.next(); + var diff = 0; + if (ea == null || eb == null) + { + diff = ea == null ? (eb == null ? 0 : -1) : 1; + } + else try + { + diff = espec.type.compare(ea, eb); + } + catch (UnsupportedOperationException e) + { + diff = ByteBufferUtil.compareUnsigned(ea, eb); + } + + if (diff != 0) + return diff; + } + assert !itA.hasNext() && !itB.hasNext(); + return 0; + } + private static void assertRowsIgnoringOrderInternal(UntypedResultSet result, boolean ignoreExtra, Object[]... 
rows) { if (result == null) @@ -2295,7 +2331,7 @@ private static void assertRowsIgnoringOrderInternal(UntypedResultSet result, boo List meta = result.metadata(); - Set> expectedRows = new HashSet<>(rows.length); + Set> expectedRows = new TreeSet<>((a, b) -> compareTypedLists(meta, a, b)); for (Object[] expected : rows) { Assert.assertEquals("Invalid number of (expected) values provided for row", expected.length, meta.size()); @@ -2317,7 +2353,7 @@ private static void assertRowsIgnoringOrderInternal(UntypedResultSet result, boo expectedRows.add(expectedRow); } - Set> actualRows = new HashSet<>(result.size()); + Set> actualRows = new TreeSet<>((a, b) -> compareTypedLists(meta, a, b)); for (UntypedResultSet.Row actual : result) { List actualRow = new ArrayList<>(meta.size()); diff --git a/test/unit/org/apache/cassandra/cql3/CustomNowInSecondsTest.java b/test/unit/org/apache/cassandra/cql3/CustomNowInSecondsTest.java index f974f50b8e49..dc2d6c21b3c3 100644 --- a/test/unit/org/apache/cassandra/cql3/CustomNowInSecondsTest.java +++ b/test/unit/org/apache/cassandra/cql3/CustomNowInSecondsTest.java @@ -32,6 +32,7 @@ import org.apache.cassandra.db.ExpirationDateOverflowHandling; import org.apache.cassandra.db.ExpirationDateOverflowHandling.ExpirationDateOverflowPolicy; import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellData; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.service.QueryState; @@ -102,7 +103,7 @@ private void testSelectQueryOverflowingIntTimestamps(boolean prepared) // insert a row with an int overflowing timestamp. 
Behavior will depend on the used sstable version String query = format("INSERT INTO %s.%s (id, val) VALUES (0, 0) USING TTL %d", ks, tbl, ttl); - if (Cell.getVersionedMaxDeletiontionTime() == Cell.MAX_DELETION_TIME_2038_LEGACY_CAP) + if (CellData.getVersionedMaxDeletiontionTime() == Cell.MAX_DELETION_TIME_2038_LEGACY_CAP) { Assertions.assertThatThrownBy(() -> executeModify(query, Long.MIN_VALUE, prepared)) .isInstanceOf(InvalidRequestException.class) diff --git a/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java b/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java index 06b79b495e2e..99cdd21c3c2e 100644 --- a/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java +++ b/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java @@ -28,6 +28,7 @@ import java.util.function.ToIntFunction; import com.google.common.collect.Iterables; +import com.google.common.collect.Streams; import org.junit.Test; import org.apache.cassandra.db.*; @@ -530,7 +531,7 @@ int countStaticCells(SSTableReader reader) int countComplexCells(SSTableReader reader) { - return count(reader, x -> x.isRow() ? ((Row) x).columnData().stream().mapToInt(this::countComplex).sum() : 0, x -> 0); + return count(reader, x -> x.isRow() ? 
Streams.stream((Row) x).mapToInt(this::countComplex).sum() : 0, x -> 0); } int countComplex(ColumnData c) diff --git a/test/unit/org/apache/cassandra/cql3/SimpleQueryTest.java b/test/unit/org/apache/cassandra/cql3/SimpleQueryTest.java index f7cda92fc830..306516ea1f45 100644 --- a/test/unit/org/apache/cassandra/cql3/SimpleQueryTest.java +++ b/test/unit/org/apache/cassandra/cql3/SimpleQueryTest.java @@ -17,6 +17,8 @@ */ package org.apache.cassandra.cql3; +import java.util.BitSet; + import org.junit.Test; public class SimpleQueryTest extends CQLTester @@ -320,9 +322,66 @@ public void testRangeTombstones() throws Throwable expected[N + t2] = row("key", 2, t2, "someSemiLargeTextForValue_2_" + t2); } + assertRows(execute("SELECT * FROM %s"), true, expected); + } + + + @Test + public void testDeletionHeirarchy() throws Throwable + { + int N = 3 * 5; + + createTable("CREATE TABLE %s (k text, t int, v1 text, v2 int, PRIMARY KEY (k, t));"); + int offset = 0; + BitSet present = new BitSet(); + + for (int t = 0; t < N; t++) + { + execute("INSERT INTO %s (k, t, v1, v2) values (?, ?, ?, ?)", "key", t, "v" + t, t + 10); + present.set(t); + } + + // Partition tombstone + execute("DELETE FROM %s WHERE k=?", "key"); + present.clear(); + + offset += N / 3; + for (int t = offset; t < N + offset; t++) + { + execute("INSERT INTO %s (k, t, v1, v2) values (?, ?, ?, ?)", "key", t, "v" + t, t + 10); + present.set(t); + } + + // Range tombstone + execute("DELETE FROM %s WHERE k=? AND t>=? 
AND t 1) + { + assertEquals(2, index1.rowsUpdated.size()); + update = index1.rowsUpdated.get(1); + existingRow = update.left; + newRow = update.right; - // check the new row from the update call - assertFalse(newRow.deletion().isLive()); - assertEquals(3L, newRow.deletion().time().markedForDeleteAt()); - assertFalse(newRow.cells().iterator().hasNext()); + + // check the existing row from the update call + assertFalse(existingRow.deletion().isLive()); + assertEquals(2L, existingRow.deletion().time().markedForDeleteAt()); + assertFalse(existingRow.cells().iterator().hasNext()); + + // check the new row from the update call + assertFalse(newRow.deletion().isLive()); + assertEquals(3L, newRow.deletion().time().markedForDeleteAt()); + assertFalse(newRow.cells().iterator().hasNext()); + } } @Test diff --git a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/TombstonesTest.java b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/TombstonesTest.java index 716f51c38571..b2c2bee1e12b 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/TombstonesTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/TombstonesTest.java @@ -17,6 +17,8 @@ */ package org.apache.cassandra.cql3.validation.miscellaneous; +import java.util.Arrays; +import java.util.Set; import java.util.concurrent.TimeUnit; import com.google.common.base.Throwables; @@ -24,31 +26,59 @@ import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.CQLTester; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.filter.TombstoneOverwhelmingException; +import org.apache.cassandra.db.memtable.Memtable; +import org.apache.cassandra.db.memtable.TrieMemtable; +import 
org.apache.cassandra.db.memtable.TrieMemtableStage3; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import static org.junit.Assert.assertFalse; /** * Test that TombstoneOverwhelmingException gets thrown when it should be and doesn't when it shouldn't be. */ +@RunWith(Parameterized.class) public class TombstonesTest extends CQLTester { - static final int ORIGINAL_FAILURE_THRESHOLD = DatabaseDescriptor.getGuardrailsConfig().getTombstoneFailThreshold(); + @Parameterized.Parameter + public String memtableClass; + + @Parameterized.Parameter(1) + public boolean flush; + + @Parameterized.Parameters(name = "{0} flush: {1}") + public static Iterable parameters() + { + return Arrays.asList(new Object[] {"skiplist", false}, + new Object[] {"trie_stage1", true}, // this uses the same partition code as SkipListMemtable + new Object[] {"trie_stage2", false}, // this flushes like SkipListMemtable, no need to test flushed + new Object[] {"trie_stage3", false}, + new Object[] {"trie_stage3", true}, + new Object[] {"trie", false}, + new Object[] {"trie", true}); + } + + static int ORIGINAL_FAILURE_THRESHOLD; static final int FAILURE_THRESHOLD = 100; static final int WARN_THRESHOLD = 50; + public static final Set> tombstoneIndependentMemtables = Set.of(TrieMemtable.class, TrieMemtableStage3.class); + @BeforeClass public static void setUp() throws Throwable { DatabaseDescriptor.daemonInitialization(); + ORIGINAL_FAILURE_THRESHOLD = DatabaseDescriptor.getGuardrailsConfig().getTombstoneFailThreshold(); DatabaseDescriptor.getGuardrailsConfig().setTombstonesThreshold(WARN_THRESHOLD, FAILURE_THRESHOLD); } @@ -62,20 +92,23 @@ public static void tearDown() public void testBelowThresholdSelect() throws Throwable { - String tableName = createTable("CREATE TABLE %s (a text, b text, c text, PRIMARY KEY (a, b));"); + String tableName = createTable("CREATE TABLE %s (a text, b text, c text, PRIMARY KEY (a, b)) WITH 
memtable = '" + memtableClass + "';"); ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName); long oldFailures = cfs.metric.tombstoneFailures.getCount(); long oldWarnings = cfs.metric.tombstoneWarnings.getCount(); + boolean tombstonesCountTowardsThresholds = flush || !tombstoneIndependentMemtables.contains(cfs.getCurrentMemtable().getClass()); // insert exactly the amount of tombstones that shouldn't trigger an exception for (int i = 0; i < FAILURE_THRESHOLD; i++) execute("DELETE FROM %s WHERE a = 'key' and b = '" + i + "'"); + if (flush) + flush(); try { execute("SELECT * FROM %s WHERE a = 'key';"); assertEquals(oldFailures, cfs.metric.tombstoneFailures.getCount()); - assertEquals(oldWarnings + 1, cfs.metric.tombstoneWarnings.getCount()); + assertEquals(oldWarnings + (tombstonesCountTowardsThresholds ? 1 : 0), cfs.metric.tombstoneWarnings.getCount()); } catch (Throwable e) { @@ -86,22 +119,27 @@ public void testBelowThresholdSelect() throws Throwable @Test public void testBeyondThresholdSelect() throws Throwable { - String tableName = createTable("CREATE TABLE %s (a text, b text, c text, PRIMARY KEY (a, b));"); + String tableName = createTable("CREATE TABLE %s (a text, b text, c text, PRIMARY KEY (a, b)) WITH memtable = {'class': '" + memtableClass + "'};"); ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName); long oldFailures = cfs.metric.tombstoneFailures.getCount(); long oldWarnings = cfs.metric.tombstoneWarnings.getCount(); + boolean tombstonesCountTowardsThresholds = flush || !tombstoneIndependentMemtables.contains(cfs.getCurrentMemtable().getClass()); // insert exactly the amount of tombstones that *SHOULD* trigger an exception for (int i = 0; i < FAILURE_THRESHOLD + 1; i++) execute("DELETE FROM %s WHERE a = 'key' and b = '" + i + "'"); + if (flush) + flush(); try { execute("SELECT * FROM %s WHERE a = 'key';"); - fail("SELECT with tombstones beyond the threshold should have failed, but hasn't"); + 
assertFalse("SELECT with tombstones beyond the threshold should have failed, but hasn't", tombstonesCountTowardsThresholds); } catch (Throwable e) { + assertTrue(memtableClass + " should not be affected by the number of tombstones", tombstonesCountTowardsThresholds); + String error = "Expected exception instanceof TombstoneOverwhelmingException instead got " + System.lineSeparator() + Throwables.getStackTraceAsString(e); @@ -114,7 +152,18 @@ public void testBeyondThresholdSelect() throws Throwable @Test public void testAllShadowedSelect() throws Throwable { - String tableName = createTable("CREATE TABLE %s (a text, b text, c text, PRIMARY KEY (a, b));"); + testAllShadowedSelect(false); + } + + @Test + public void testAllShadowedInSeparateSSTable() throws Throwable + { + testAllShadowedSelect(true); + } + + public void testAllShadowedSelect(boolean flushBetween) throws Throwable + { + String tableName = createTable("CREATE TABLE %s (a text, b text, c text, PRIMARY KEY (a, b)) WITH memtable = {'class': '" + memtableClass + "'};"); ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName); long oldFailures = cfs.metric.tombstoneFailures.getCount(); long oldWarnings = cfs.metric.tombstoneWarnings.getCount(); @@ -123,9 +172,15 @@ public void testAllShadowedSelect() throws Throwable for (int i = 0; i < FAILURE_THRESHOLD + 1; i++) execute("INSERT INTO %s (a, b, c) VALUES ('key', 'column" + i + "', null);"); + if (flushBetween) + flush(); + // delete all with a partition level tombstone execute("DELETE FROM %s WHERE a = 'key'"); + if (flush) + flush(); + try { execute("SELECT * FROM %s WHERE a = 'key';"); @@ -141,7 +196,7 @@ public void testAllShadowedSelect() throws Throwable @Test public void testLiveShadowedCellsSelect() throws Throwable { - String tableName = createTable("CREATE TABLE %s (a text, b text, c text, PRIMARY KEY (a, b));"); + String tableName = createTable("CREATE TABLE %s (a text, b text, c text, PRIMARY KEY (a, b)) WITH memtable = 
{'class': '" + memtableClass + "'};"); ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName); long oldFailures = cfs.metric.tombstoneFailures.getCount(); long oldWarnings = cfs.metric.tombstoneWarnings.getCount(); @@ -152,6 +207,9 @@ public void testLiveShadowedCellsSelect() throws Throwable // delete all with a partition level tombstone execute("DELETE FROM %s WHERE a = 'key'"); + if (flush) + flush(); + try { execute("SELECT * FROM %s WHERE a = 'key';"); @@ -167,13 +225,15 @@ public void testLiveShadowedCellsSelect() throws Throwable @Test public void testExpiredTombstones() throws Throwable { - String tableName = createTable("CREATE TABLE %s (a text, b text, c text, PRIMARY KEY (a, b)) WITH gc_grace_seconds = 1;"); + String tableName = createTable("CREATE TABLE %s (a text, b text, c text, PRIMARY KEY (a, b)) WITH gc_grace_seconds = 1 AND memtable = {'class': '" + memtableClass + "'};"); ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName); long oldFailures = cfs.metric.tombstoneFailures.getCount(); long oldWarnings = cfs.metric.tombstoneWarnings.getCount(); for (int i = 0; i < FAILURE_THRESHOLD + 1; i++) execute("INSERT INTO %s (a, b, c) VALUES ('key', 'column" + i + "', null);"); + if (flush) + flush(); // not yet past gc grace - must throw a TOE try @@ -208,18 +268,22 @@ public void testExpiredTombstones() throws Throwable @Test public void testBeyondWarnThresholdSelect() throws Throwable { - String tableName = createTable("CREATE TABLE %s (a text, b text, c text, PRIMARY KEY (a,b));"); + String tableName = createTable("CREATE TABLE %s (a text, b text, c text, PRIMARY KEY (a,b)) WITH memtable = {'class': '" + memtableClass + "'};"); ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName); long oldFailures = cfs.metric.tombstoneFailures.getCount(); long oldWarnings = cfs.metric.tombstoneWarnings.getCount(); + boolean tombstonesCountTowardsThresholds = flush || 
!tombstoneIndependentMemtables.contains(cfs.getCurrentMemtable().getClass()); // insert the number of tombstones that *SHOULD* trigger an Warning for (int i = 0; i < WARN_THRESHOLD + 1; i++) execute("DELETE FROM %s WHERE a = 'key' and b = '" + i + "'"); + if (flush) + flush(); + try { execute("SELECT * FROM %s WHERE a = 'key';"); - assertEquals(oldWarnings + 1, cfs.metric.tombstoneWarnings.getCount()); + assertEquals(oldWarnings + (tombstonesCountTowardsThresholds ? 1 : 0), cfs.metric.tombstoneWarnings.getCount()); assertEquals(oldFailures, cfs.metric.tombstoneFailures.getCount()); } catch (Throwable e) diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/TTLTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/TTLTest.java index 609f17a5ee5d..db3f4587a0f2 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/operations/TTLTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/operations/TTLTest.java @@ -37,7 +37,7 @@ import org.apache.cassandra.db.ExpirationDateOverflowHandling.ExpirationDateOverflowPolicy; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.rows.AbstractCell; -import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellData; import org.apache.cassandra.distributed.shared.WithProperties; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.io.sstable.IScrubber; @@ -70,7 +70,7 @@ public class TTLTest extends CQLTester private Config.CorruptedTombstoneStrategy corruptTombstoneStrategy; // We should start applying overflow policies depending on supported sstable formats. 
Either in year 2038 or 2086 - boolean overflowPoliciesApply = (Clock.Global.currentTimeMillis() / 1000) > (Cell.getVersionedMaxDeletiontionTime() - MAX_TTL); + boolean overflowPoliciesApply = (Clock.Global.currentTimeMillis() / 1000) > (CellData.getVersionedMaxDeletiontionTime() - MAX_TTL); @Before public void before() @@ -397,7 +397,7 @@ private void checkTTLIsCapped(String field) throws Throwable private long computeMaxTTL() { int nowInSecs = (int) (System.currentTimeMillis() / 1000); - return Cell.getVersionedMaxDeletiontionTime() - nowInSecs; + return CellData.getVersionedMaxDeletiontionTime() - nowInSecs; } public void testRecoverOverflowedExpirationWithScrub(boolean simple, boolean clustering, boolean runScrub, boolean runSStableScrub, boolean reinsertOverflowedTTL) throws Throwable diff --git a/test/unit/org/apache/cassandra/db/NativeCellTest.java b/test/unit/org/apache/cassandra/db/NativeCellTest.java index 2c86a3e1a34a..996b7cc71b7d 100644 --- a/test/unit/org/apache/cassandra/db/NativeCellTest.java +++ b/test/unit/org/apache/cassandra/db/NativeCellTest.java @@ -128,7 +128,7 @@ private static Cell rndcell(ColumnMetadata col) { long timestamp = rand.nextLong(); int ttl = rand.nextInt(); - long localDeletionTime = ThreadLocalRandom.current().nextLong(Cell.getVersionedMaxDeletiontionTime() + 1); + long localDeletionTime = ThreadLocalRandom.current().nextLong(CellData.getVersionedMaxDeletiontionTime() + 1); byte[] value = new byte[rand.nextInt(sanesize(expdecay()))]; rand.nextBytes(value); CellPath path = null; diff --git a/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java b/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java index f520d0a628f3..07517cb37c52 100644 --- a/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java +++ b/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java @@ -36,6 +36,7 @@ import org.apache.cassandra.Util; import org.apache.cassandra.schema.ColumnMetadata; import 
org.apache.cassandra.cql3.statements.schema.IndexTarget; +import org.apache.cassandra.cql3.validation.miscellaneous.TombstonesTest; import org.apache.cassandra.db.compaction.CompactionManager; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.marshal.AbstractType; @@ -57,6 +58,7 @@ import org.apache.cassandra.schema.CompactionParams; import org.apache.cassandra.schema.IndexMetadata; import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.MemtableParams; import org.apache.cassandra.schema.SchemaTestUtil; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.ByteBufferUtil; @@ -76,11 +78,14 @@ public class RangeTombstoneTest private static final String CFNAME_INDEXED = "StandardIntegerIndexed"; public static final int GC_GRACE = 5000; - @Parameterized.Parameters(name = "compaction={0}") - public static Iterable compactionParamSets() + @Parameterized.Parameters(name = "compaction={0} memtable={1}") + public static Iterable compactionParamSets() { - return ImmutableSet.of(CompactionParams.stcs(ImmutableMap.of()), - CompactionParams.ucs(ImmutableMap.of())); + return ImmutableSet.of(new Object[] {CompactionParams.stcs(ImmutableMap.of()), "skiplist"}, + new Object[] {CompactionParams.lcs(ImmutableMap.of()), "trie_stage1"}, + new Object[] {CompactionParams.ucs(ImmutableMap.of()), "trie_stage2"}, + new Object[] {CompactionParams.ucs(ImmutableMap.of()), "trie_stage3"}, + new Object[] {CompactionParams.ucs(ImmutableMap.of()), "trie"}); } @BeforeClass @@ -93,14 +98,15 @@ public static void defineSchema() throws ConfigurationException standardCFMD(KSNAME, CFNAME_INDEXED, 1, UTF8Type.instance, Int32Type.instance, Int32Type.instance)); } - public RangeTombstoneTest(CompactionParams compactionParams) + public RangeTombstoneTest(CompactionParams compactionParams, String memtableDef) { + MemtableParams memtableParams = MemtableParams.get(memtableDef); Keyspace ks = Keyspace.open(KSNAME); 
ColumnFamilyStore cfs = ks.getColumnFamilyStore(CFNAME); - SchemaTestUtil.announceTableUpdate(cfs.metadata().unbuild().compaction(compactionParams).build()); + SchemaTestUtil.announceTableUpdate(cfs.metadata().unbuild().compaction(compactionParams).memtable(memtableParams).build()); cfs.disableAutoCompaction(); // don't trigger compaction at 4 sstables cfs = ks.getColumnFamilyStore(CFNAME_INDEXED); - SchemaTestUtil.announceTableUpdate(cfs.metadata().unbuild().compaction(compactionParams).build()); + SchemaTestUtil.announceTableUpdate(cfs.metadata().unbuild().compaction(compactionParams).memtable(memtableParams).build()); cfs.disableAutoCompaction(); // don't trigger compaction at 4 sstables } @@ -727,18 +733,28 @@ public void testOverwritesToDeletedColumns() throws Exception UpdateBuilder.create(cfs.metadata(), key).withTimestamp(0).newRow(1).add("val", 1).applyUnsafe(); - // add a RT which hides the column we just inserted + // add a RT which hides the row we just inserted new RowUpdateBuilder(cfs.metadata(), 1, key).addRangeTombstone(0, 1).build().applyUnsafe(); - // now re-insert that column + // now re-insert that row UpdateBuilder.create(cfs.metadata(), key).withTimestamp(2).newRow(1).add("val", 1).applyUnsafe(); Util.flush(cfs); - // We should have 1 insert and 1 update to the indexed "1" column - // CASSANDRA-6640 changed index update to just update, not insert then delete - assertEquals(1, index.rowsInserted.size()); - assertEquals(1, index.rowsUpdated.size()); + if (TombstonesTest.tombstoneIndependentMemtables.contains(cfs.getCurrentMemtable().getClass())) + { + // TrieMemtable deletes the row on receiving the tombstone (issuing an update), and then inserts it a second + // time. + assertEquals(2, index.rowsInserted.size()); + assertEquals(1, index.rowsUpdated.size()); + } + else + { + // Legacy memtables will keep the row shadowed and update it.
+ assertEquals(1, index.rowsInserted.size()); + assertEquals(1, index.rowsUpdated.size()); + } + } private static ByteBuffer bb(int i) diff --git a/test/unit/org/apache/cassandra/db/RowUpdateBuilder.java b/test/unit/org/apache/cassandra/db/RowUpdateBuilder.java index a52064cd637e..dd3a083630d2 100644 --- a/test/unit/org/apache/cassandra/db/RowUpdateBuilder.java +++ b/test/unit/org/apache/cassandra/db/RowUpdateBuilder.java @@ -72,6 +72,17 @@ public RowUpdateBuilder(TableMetadata metadata, long localDeletionTime, long tim this.updateBuilder.nowInSec(localDeletionTime); } + public RowUpdateBuilder(TableMetadata metadata, DeletionTime partitionDeletion, long nowInSec, long timestamp, Object partitionKey) + { + this(PartitionUpdate.simpleBuilder(metadata, partitionKey)); + + this.updateBuilder.timestamp(partitionDeletion.markedForDeleteAt()); + this.updateBuilder.nowInSec(partitionDeletion.localDeletionTime()); + this.updateBuilder.delete(); + this.updateBuilder.timestamp(timestamp); + this.updateBuilder.nowInSec(nowInSec); + } + public RowUpdateBuilder timestamp(long ts) { updateBuilder.timestamp(ts); diff --git a/test/unit/org/apache/cassandra/db/marshal/AbstractTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/AbstractTypeTest.java index 7654c399943a..1b7427ad5b15 100644 --- a/test/unit/org/apache/cassandra/db/marshal/AbstractTypeTest.java +++ b/test/unit/org/apache/cassandra/db/marshal/AbstractTypeTest.java @@ -58,6 +58,8 @@ import org.junit.BeforeClass; import org.junit.Ignore; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -87,6 +89,7 @@ import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.db.rows.Rows; import org.apache.cassandra.db.rows.SerializationHelper; +import org.apache.cassandra.db.rows.TrieBackedRow; import org.apache.cassandra.db.rows.UnfilteredSerializer; import org.apache.cassandra.io.util.DataInputBuffer; import 
org.apache.cassandra.io.util.DataInputPlus; @@ -146,6 +149,7 @@ import static org.quicktheories.generators.SourceDSL.floats; @SuppressWarnings({ "unchecked", "rawtypes" }) +@RunWith(Parameterized.class) public class AbstractTypeTest { private final static Logger logger = LoggerFactory.getLogger(AbstractTypeTest.class); @@ -155,7 +159,7 @@ public class AbstractTypeTest static { CassandraRelevantProperties.VECTOR_FLOAT_ONLY.setBoolean(false); - + // make sure blob is always the same CassandraRelevantProperties.TEST_BLOB_SHARED_SEED.setInt(42); } @@ -166,6 +170,20 @@ public class AbstractTypeTest .setExpandSuperTypes(true) .setParallel(true)); + enum RowImplementation + { + BTREE, TRIE_BACKED + } + + @Parameterized.Parameter(0) + public static RowImplementation implementation; + + @Parameterized.Parameters(name = "{0}") + public static RowImplementation[] implementations() + { + return RowImplementation.values(); + } + // TODO // withUpdatedUserType/expandUserTypes/referencesDuration - types that recursive check types @@ -1111,7 +1129,7 @@ private static int sign(int value) { return Integer.compare(value, 0); } - + private static void verifyComparison(Comparator leftComparator, Comparator rightComparator, T lv1, T lv2, T rv1, T rv2, int expectedResult, Function desc) { SoftAssertions checks = new SoftAssertions(); @@ -1165,7 +1183,7 @@ private static void verifyComparisonCompatibilityForMultiCell(AbstractType left, UnfilteredSerializer.serializer.serialize(rightRow, rightHelper, out, MessagingService.current_version); try (DataInputBuffer in = new DataInputBuffer(out.getData())) { - Row.Builder builder = BTreeRow.sortedBuilder(); + Row.Builder builder = implementation == RowImplementation.BTREE ? 
BTreeRow.sortedBuilder() : TrieBackedRow.builder(rightTable.regularAndStaticColumns()); builder.addPrimaryKeyLivenessInfo(rightRow.primaryKeyLivenessInfo()); Row leftRow = (Row) UnfilteredSerializer.serializer.deserialize(in, leftHeader, leftHelper, builder); ComplexColumnData leftData1 = leftRow.getComplexColumnData(leftColumn1); @@ -1201,7 +1219,7 @@ private static void verifySerializationCompatibilityForSimpleCells(AbstractType UnfilteredSerializer.serializer.serialize(rightRow, rightHelper, out, MessagingService.current_version); try (DataInputBuffer in = new DataInputBuffer(out.getData())) { - Row.Builder builder = BTreeRow.sortedBuilder(); + Row.Builder builder = implementation == RowImplementation.BTREE ? BTreeRow.sortedBuilder() : TrieBackedRow.builder(rightTable.regularAndStaticColumns()); builder.addPrimaryKeyLivenessInfo(rightRow.primaryKeyLivenessInfo()); Row leftRow = (Row) UnfilteredSerializer.serializer.deserialize(in, leftHeader, leftHelper, builder); Cell leftData = (Cell) leftRow.getColumnData(leftColumn); @@ -1226,7 +1244,7 @@ private static void verifySerializationCompatibilityForComplexCells(AbstractType UnfilteredSerializer.serializer.serialize(rightRow, rightHelper, out, MessagingService.current_version); try (DataInputBuffer in = new DataInputBuffer(out.getData())) { - Row.Builder builder = BTreeRow.sortedBuilder(); + Row.Builder builder = implementation == RowImplementation.BTREE ? 
BTreeRow.sortedBuilder() : TrieBackedRow.builder(rightTable.regularAndStaticColumns()); builder.addPrimaryKeyLivenessInfo(rightRow.primaryKeyLivenessInfo()); Row leftRow = (Row) UnfilteredSerializer.serializer.deserialize(in, leftHeader, leftHelper, builder); ComplexColumnData leftData = leftRow.getComplexColumnData(leftColumn); @@ -1238,8 +1256,11 @@ private static void verifySerializationCompatibilityForComplexCells(AbstractType Cell rightCell = rightData.getCellByIndex(i); checks.assertThat(leftCell.buffer()).describedAs(bytesToHex(leftCell.buffer())).isEqualTo(rightCell.buffer()).describedAs(bytesToHex(rightCell.buffer())); checks.assertThat(leftCell.path().size()).describedAs(typeRelDesc(".cellPathSizeIsEqualTo", left, right)).isEqualTo(rightCell.path().size()); - for (int j = 0; j < leftCell.path().size(); j++) - checks.assertThat(leftCell.path().get(j)).describedAs(bytesToHex(leftCell.path().get(j))).isEqualTo(rightCell.path().get(j)).describedAs(bytesToHex(rightCell.path().get(j))); + + // All collections have one key component of type given by the name comparator. Though bytes returned may vary due to encoding paths as byte + // comparables, the keys have to compare equal. 
+ AbstractType keyType = ((MultiCellCapableType) left).nameComparator(); + checks.assertThat(keyType.compare(leftCell.path().get(0), rightCell.path().get(0))).isZero().describedAs(keyType.getString(leftCell.path().get(0)) + " != " + keyType.getString(rightCell.path().get(0))); } } } diff --git a/test/unit/org/apache/cassandra/db/memtable/MemtableQuickTest.java b/test/unit/org/apache/cassandra/db/memtable/MemtableQuickTest.java index 8fef89ce7c08..184613849c69 100644 --- a/test/unit/org/apache/cassandra/db/memtable/MemtableQuickTest.java +++ b/test/unit/org/apache/cassandra/db/memtable/MemtableQuickTest.java @@ -76,7 +76,9 @@ public static List parameters() } params.add("trie"); params.add("trie_stage1"); - params.add("persistent_memory"); + params.add("trie_stage2", + "trie_stage3", + "persistent_memory"); return params.build(); } diff --git a/test/unit/org/apache/cassandra/db/memtable/MemtableSizeTestBase.java b/test/unit/org/apache/cassandra/db/memtable/MemtableSizeTestBase.java index 071b4c420895..b34db0fcf952 100644 --- a/test/unit/org/apache/cassandra/db/memtable/MemtableSizeTestBase.java +++ b/test/unit/org/apache/cassandra/db/memtable/MemtableSizeTestBase.java @@ -19,15 +19,18 @@ package org.apache.cassandra.db.memtable; +import java.io.BufferedReader; +import java.io.InputStreamReader; import java.lang.reflect.Field; +import java.nio.ByteBuffer; import java.util.List; +import java.util.Random; import com.google.common.collect.ImmutableList; import org.junit.Assert; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -76,19 +79,25 @@ public abstract class MemtableSizeTestBase extends CQLTester @Parameterized.Parameter(0) public String memtableClass = "skiplist"; - @Parameterized.Parameters(name = "{0}") - public static List parameters() + @Parameterized.Parameter(1) + public int valueSize = 8; + + @Parameterized.Parameters(name = "{0} value size {1}") + 
public static List parameters() { // Sharded memtables require Cassandra 5.0+, skip them in compatibility mode StorageCompatibilityMode mode = DatabaseDescriptor.getStorageCompatibilityMode(); boolean skipSharded = mode != null && mode.isBefore(CassandraVersion.CASSANDRA_5_0.major); - ImmutableList.Builder params = ImmutableList.builder(); - params.add("skiplist"); + ImmutableList.Builder params = ImmutableList.builder(); + params.add(new Object[] {"skiplist", 8}); if (!skipSharded) - params.add("skiplist_sharded"); - params.add("trie_stage1"); - params.add("trie"); + params.add(new Object[] {"skiplist_sharded", 32}); + params.add(new Object[] {"trie_stage1", 8}); + params.add(new Object[] {"trie_stage2", 8}); + params.add(new Object[] {"trie_stage3", 8}); + params.add(new Object[] {"trie", 8}); + params.add(new Object[] {"trie", 32}); return params.build(); } @@ -136,7 +145,7 @@ private void buildAndFillTable(String memtableClass) throws Throwable CQLTester.disablePreparedReuseForTest(); keyspace = createKeyspace("CREATE KEYSPACE %s with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 } and durable_writes = false"); - table = createTable(keyspace, "CREATE TABLE %s ( userid bigint, picid bigint, commentid bigint, PRIMARY KEY(userid, picid))" + + table = createTable(keyspace, "CREATE TABLE %s ( userid bigint, picid bigint, commentid blob, PRIMARY KEY(userid, picid))" + " with compression = {'enabled': false}" + " and memtable = '" + memtableClass + "'"); execute("use " + keyspace + ';'); @@ -148,10 +157,46 @@ private void buildAndFillTable(String memtableClass) throws Throwable Util.flush(cfs); } + ByteBuffer valueFor(long v) + { + Random rand = new Random(v); + byte[] bytes = new byte[valueSize]; + rand.nextBytes(bytes); + return ByteBuffer.wrap(bytes); + } + + private static void runCommandAndDumpOutput(String cmd, int lineCount) + { + try + { + // Define the command and its arguments + ProcessBuilder builder = new ProcessBuilder("bash", "-c", 
cmd); + + // Start the process + Process process = builder.start(); + + // Read the output from the command + BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream())); + String line; + while ((line = reader.readLine()) != null && --lineCount >= 0) + { + System.out.println(line); + } + while ((reader.readLine()) != null) {} + + // Wait for the command to finish and get exit code + int exitCode = process.waitFor(); + System.out.println("Exited with code: " + exitCode); + } + catch (Exception e) + { + e.printStackTrace(); + } + } + @Test public void testSize() throws Throwable { - try { buildAndFillTable(memtableClass); @@ -169,7 +214,7 @@ public void testSize() throws Throwable for (i = 0; i < limit; ++i) { for (long j = 0; j < rowsPerPartition; ++j) - execute(writeStatement, i, j, i + j); + execute(writeStatement, i, j, valueFor(i + j)); } logger.info("Deleting {} partitions", deletedPartitions); @@ -202,9 +247,13 @@ public void testSize() throws Throwable if (memtable instanceof TrieMemtable) ((TrieMemtable) memtable).releaseReferencesUnsafe(); + if (memtable instanceof TrieMemtableStage2) + ((TrieMemtableStage2) memtable).releaseReferencesUnsafe(); + if (memtable instanceof TrieMemtableStage3) + ((TrieMemtableStage3) memtable).releaseReferencesUnsafe(); -// System.out.println("Take jmap -histo:live "); -// Thread.sleep(10000); + // To see a summary of the objects on the heap, uncomment this: + // runCommandAndDumpOutput("jmap -histo:live " + ProcessHandle.current().pid(), 25); long deepSizeAfter = meter.measureDeep(memtable); logger.info("Memtable deep size {}", FBUtilities.prettyPrintMemory(deepSizeAfter)); @@ -227,7 +276,7 @@ public void testSize() throws Throwable FBUtilities.prettyPrintMemory(reportedHeap), FBUtilities.prettyPrintMemory(actualHeap - reportedHeap)); System.out.println(message); - Assert.assertTrue(message, Math.abs(reportedHeap - actualHeap) <= maxDifference); + Assert.assertTrue(message, Math.abs(reportedHeap 
- actualHeap) <= Math.max(100 * 1024, maxDifference)); } finally { @@ -247,7 +296,7 @@ public void testRowSize() throws Throwable for (long i = 0; i < partitions; ++i) { for (long j = 0; j < rowsPerPartition; ++j) - execute(writeStatement, i, j, i + j); + execute(writeStatement, i, j, valueFor(i + j)); } long rowSize = memtable.getEstimatedAverageRowSize(); diff --git a/test/unit/org/apache/cassandra/db/memtable/MemtableThreadedTest.java b/test/unit/org/apache/cassandra/db/memtable/MemtableThreadedTest.java index 27252e52c124..dadee81dc9f5 100644 --- a/test/unit/org/apache/cassandra/db/memtable/MemtableThreadedTest.java +++ b/test/unit/org/apache/cassandra/db/memtable/MemtableThreadedTest.java @@ -28,11 +28,13 @@ import com.google.common.collect.ImmutableList; import org.junit.Assert; +import org.junit.Assume; import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.CQLTester; import org.apache.cassandra.cql3.UntypedResultSet; import org.apache.cassandra.db.ColumnFamilyStore; @@ -40,7 +42,7 @@ import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.UNIT_TESTS; -/// This test is a counterpart to InMemoryTrieThreadedTest that makes sure TrieMemtable is wiring the trie consistency +/// This test is a counterpart to InMemoryTrieConsistencyTest that makes sure TrieMemtable is wiring the trie consistency /// machinery correctly. Note that this test always applies mutations the same way (with partition-level forced copying) /// and is effectively doing the same test but checking different correctness properties. 
/// @@ -54,10 +56,13 @@ public class MemtableThreadedTest extends CQLTester @Parameterized.Parameters(name = "{0}") public static List parameters() { - return ImmutableList.of("SkipListMemtable", - "TrieMemtable", - "TrieMemtableStage1", - "PersistentMemoryMemtable"); + return ImmutableList.of("skiplist", + "skiplist_sharded", + "trie", + "trie_stage1", + "trie_stage2", + "trie_stage3", + "persistent_memory"); } @BeforeClass @@ -171,10 +176,12 @@ public void testAtomicUpdates(int PER_MUTATION, boolean checkSequence) throws Exception { + Assume.assumeFalse("Cannot use skiplist_sharded in CC4 compatibility mode.", + DatabaseDescriptor.getStorageCompatibilityMode().isBefore(5) && memtableClass.contains("sharded")); keyspace = createKeyspace("CREATE KEYSPACE %s with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 } and durable_writes = false"); table = createTable(keyspace, "CREATE TABLE %s ( pk bigint, ck bigint, value bigint, seq bigint, PRIMARY KEY(pk, ck))" + " with compression = {'enabled': false}" + - " and memtable = { 'class': '" + memtableClass + "'}" + + " and memtable = '" + memtableClass + "'" + " and compaction = { 'class': 'UnifiedCompactionStrategy', 'min_sstable_size_in_mb': '1' }"); // to trigger splitting of sstables, STAR-1826 execute("use " + keyspace + ';'); diff --git a/test/unit/org/apache/cassandra/db/memtable/TrieMemtableDocTrieMakerTest.java b/test/unit/org/apache/cassandra/db/memtable/TrieMemtableDocTrieMakerTest.java new file mode 100644 index 000000000000..10d336886bcf --- /dev/null +++ b/test/unit/org/apache/cassandra/db/memtable/TrieMemtableDocTrieMakerTest.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.db.memtable; + +import java.util.Map; +import java.util.UUID; + +import org.junit.Test; + +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.db.tries.Direction; +import org.apache.cassandra.db.tries.TrieToDot; +import org.apache.cassandra.db.tries.TrieToMermaid; + +public class TrieMemtableDocTrieMakerTest extends CQLTester +{ + @Test + public void makeTrieDumps() throws Throwable + { + createTable("CREATE TABLE %s (company text, date date, total double, purchases map, PRIMARY KEY (company, date));"); + + execute("INSERT INTO %s (company, date, total, purchases) values (?, '2026-02-10', ?, ?) using timestamp 346", + "ACME", + 256.68, + Map.of(UUID.fromString("d79012af-8b34-4fb4-9799-6c0d29ca4e2f"), 88.67, + UUID.fromString("830b82ce-a7f2-4939-9ea1-46b9d3714848"), 256.68 - 88.67)); + + execute("INSERT INTO %s (company, date, total) values (?, '2026-02-13', ?) using timestamp 385", "ACME", 99.23); + execute("UPDATE %s using timestamp 385 SET purchases[?] = ? WHERE company = ? AND date = '2026-02-13'", + UUID.fromString("dab4819d-c6f5-4c05-b575-4a057d78c99a"), + 99.23, + "ACME"); + + execute("INSERT INTO %s (company, date, total, purchases) values (?, '2026-02-11', ?, ?) 
using timestamp 352", + "IBM", + 542.79, + Map.of(UUID.fromString("3edf143a-a8d1-478f-86b4-d83a72a75170"), 122.12, + UUID.fromString("35441ee9-3ac9-40d0-98e8-9c75b414addb"), 542.79 - 122.12)); + + execute("INSERT INTO %s (company, date, total, purchases) values (?, '2026-02-12', ?, ?) using timestamp 367", + "Apple", + 324.83, + Map.of(UUID.fromString("82b4ce57-d6a0-4470-8747-1c2aa4fc5961"), 324.83)); + execute("DELETE FROM %s using timestamp 329 WHERE company = 'Apple' AND date = '2026-02-09'"); + execute("DELETE FROM %s using timestamp 412 where company='Apple' AND date<='2026-01-31' AND date>='2026-01-01'"); + + TrieMemtable memtable = (TrieMemtable) getCurrentColumnFamilyStore().getCurrentMemtable(); + System.out.println(); + System.out.println(toShortString(memtable.dump())); + System.out.println(); + System.out.println(memtable.mergedTrie.dump(TrieMemtableDocTrieMakerTest::toShortString, TrieMemtableDocTrieMakerTest::toShortString)); + System.out.println(); + System.out.println(memtable.mergedTrie.process(Direction.FORWARD, new TrieToMermaid<>(TrieMemtableDocTrieMakerTest::toShortString, TrieMemtableDocTrieMakerTest::toShortString, x -> String.format("%02x", x), true))); + System.out.println(); + System.out.println(memtable.mergedTrie.process(Direction.FORWARD, new TrieToDot<>(TrieMemtableDocTrieMakerTest::toShortString, TrieMemtableDocTrieMakerTest::toShortString, x -> String.format("%02x", x), true))); + System.out.println(); + } + + public static String toShortString(T val) + { + String v = val.toString(); + return v.replaceAll(", localDeletion=\\d+", "");//.replaceAll("(ts|deletedAt)=\\d+(\\d{3})","$1=$2"); + } +} diff --git a/test/unit/org/apache/cassandra/db/partition/PartitionImplementationTest.java b/test/unit/org/apache/cassandra/db/partition/PartitionImplementationTest.java index e3d9c01b3b54..cd10d0c4faf9 100644 --- a/test/unit/org/apache/cassandra/db/partition/PartitionImplementationTest.java +++ 
b/test/unit/org/apache/cassandra/db/partition/PartitionImplementationTest.java @@ -38,6 +38,8 @@ import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.Util; +import org.apache.cassandra.db.partitions.TrieBackedPartitionStage2; +import org.apache.cassandra.db.partitions.TrieBackedPartitionStage3; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.cql3.ColumnIdentifier; @@ -59,14 +61,18 @@ public class PartitionImplementationTest { enum Implementation { - BTREE(ImmutableBTreePartition::create), - TRIE(TrieBackedPartition::fromIterator); + BTREE(ImmutableBTreePartition::create, false), + TRIE_STAGE_2(TrieBackedPartitionStage2::fromIterator, false), + TRIE_STAGE_3(TrieBackedPartitionStage3::fromIterator, true), + TRIE(TrieBackedPartition::fromIterator, true); final Function creator; + final boolean filterInvalidEndThanStart; - Implementation(Function creator) + Implementation(Function creator, boolean filterInvalidEndThanStart) { this.creator = creator; + this.filterInvalidEndThanStart = filterInvalidEndThanStart; } } @@ -454,7 +460,7 @@ private void testSlicingOfIterators(NavigableSet sortedContent, Par if (reversed) Collections.reverse(slicelist); - assertIteratorsEqual(Iterators.concat(slicelist.toArray(new Iterator[0])), slicedIter); + assertIteratorsEqual(maybeFilterInvalidCloseThenOpen(Iterators.concat(slicelist.toArray(new Iterator[0])), reversed), slicedIter); } } @@ -467,7 +473,47 @@ private Iterator invert(Iterator slice) private Iterator slice(NavigableSet sortedContent, Slices slices) { - return Iterators.concat(streamOf(slices).map(slice -> slice(sortedContent, slice)).iterator()); + Iterator result = Iterators.concat(streamOf(slices).map(slice -> slice(sortedContent, slice)).iterator()); + result = maybeFilterInvalidCloseThenOpen(result, false); + + return result; + } + + private static Iterator maybeFilterInvalidCloseThenOpen(Iterator result, boolean reversed) + 
{ + // Older implementations concatenate the individual slices, which may create an invalid close+open sequence with the same clustering. + // Stage 3 and later fix this problem. + if (!implementation.filterInvalidEndThanStart || !result.hasNext()) + return result; + + List list = new ArrayList<>(); + Clusterable c1 = result.next(); + while (result.hasNext()) + { + Clusterable c2 = result.next(); + if (metadata.comparator.compare(c1.clustering(), c2.clustering()) == 0) + { + assertTrue(c1 instanceof RangeTombstoneBoundMarker); + assertTrue(c2 instanceof RangeTombstoneBoundMarker); + RangeTombstoneBoundMarker m1 = (RangeTombstoneBoundMarker) c1; + RangeTombstoneBoundMarker m2 = (RangeTombstoneBoundMarker) c2; + assertTrue(m1.isClose(reversed)); + assertTrue(m2.isOpen(reversed)); + if (m1.deletionTime().equals(m2.deletionTime())) + c1 = result.hasNext() ? result.next() : null; + else + c1 = RangeTombstoneBoundaryMarker.makeBoundary(reversed, m1.clustering(), m2.clustering(), m1.deletionTime(), m2.deletionTime()); + } + else + { + list.add(c1); + c1 = c2; + } + } + if (c1 != null) + list.add(c1); + result = list.iterator(); + return result; } private Iterator slice(NavigableSet sortedContent, Slice slice) diff --git a/test/unit/org/apache/cassandra/db/partitions/AffectedColumnCountTest.java b/test/unit/org/apache/cassandra/db/partitions/AffectedColumnCountTest.java new file mode 100644 index 000000000000..35498993d2df --- /dev/null +++ b/test/unit/org/apache/cassandra/db/partitions/AffectedColumnCountTest.java @@ -0,0 +1,395 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.partitions; + +import java.util.Arrays; +import java.util.List; + +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.RangeTombstone; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.BufferCell; +import org.apache.cassandra.db.rows.CellPath; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.FBUtilities; + +import static org.junit.Assert.*; + +/// Tests for [PartitionUpdate#affectedColumnCount] and [PartitionUpdate#affectedRowCount]. 
+@RunWith(Parameterized.class) +public class AffectedColumnCountTest +{ + @Parameterized.Parameter(0) + public PartitionUpdate.Factory partitionUpdateFactory; + + @Parameterized.Parameters(name = "type {0}") + public static List factories() + { + return Arrays.asList(TriePartitionUpdate.FACTORY, + TriePartitionUpdateStage3.FACTORY, + TriePartitionUpdateStage2.FACTORY, + BTreePartitionUpdate.FACTORY); + } + + private static TableMetadata metadata; + private static DecoratedKey partitionKey; + private static ColumnMetadata r1; + private static ColumnMetadata r2; + private static ColumnMetadata r3; + private static ColumnMetadata s1; + private static ColumnMetadata s2; + private static ColumnMetadata c1; + private static ColumnMetadata c2; + + @BeforeClass + public static void setUp() + { + DatabaseDescriptor.daemonInitialization(); + metadata = TableMetadata.builder("test_ks", "test_table") + .addPartitionKeyColumn("pk", Int32Type.instance) + .addClusteringColumn("ck", Int32Type.instance) + .addRegularColumn("r1", Int32Type.instance) + .addRegularColumn("r2", Int32Type.instance) + .addRegularColumn("r3", Int32Type.instance) + .addStaticColumn("s1", Int32Type.instance) + .addStaticColumn("s2", Int32Type.instance) + .addRegularColumn("c1", SetType.getInstance(Int32Type.instance, true)) + .addRegularColumn("c2", SetType.getInstance(Int32Type.instance, true)) + .build(); + + partitionKey = DatabaseDescriptor.getPartitioner().decorateKey(ByteBufferUtil.bytes(1)); + r1 = metadata.getColumn(new ColumnIdentifier("r1", false)); + r2 = metadata.getColumn(new ColumnIdentifier("r2", false)); + r3 = metadata.getColumn(new ColumnIdentifier("r3", false)); + s1 = metadata.getColumn(new ColumnIdentifier("s1", false)); + s2 = metadata.getColumn(new ColumnIdentifier("s2", false)); + c1 = metadata.getColumn(new ColumnIdentifier("c1", false)); + c2 = metadata.getColumn(new ColumnIdentifier("c2", false)); + } + + @Test + public void testAffectedColumnCountWithPartitionDeletion() + { + 
PartitionUpdate.Builder builder = partitionUpdateFactory.builder(metadata, partitionKey, metadata.regularAndStaticColumns(), 16); + builder.addPartitionDeletion(DeletionTime.build(1000, FBUtilities.nowInSeconds())); + PartitionUpdate update = builder.build(); + + // With partition deletion, should count all regular and static columns + int expected = metadata.regularAndStaticColumns().size(); + assertEquals(expected, update.affectedColumnCount()); + assertEquals(1, update.affectedRowCount()); + } + + @Test + public void testAffectedColumnCountWithSingleSimpleCell() + { + PartitionUpdate.Builder builder = partitionUpdateFactory.builder(metadata, partitionKey, metadata.regularAndStaticColumns(), 16); + + Row row = BTreeRow.singleCellRow(Clustering.make(ByteBufferUtil.bytes(1)), + BufferCell.live(r1, 1000, ByteBufferUtil.bytes(100))); + builder.add(row); + PartitionUpdate update = builder.build(); + + // Should count 1 cell + assertEquals(1, update.affectedColumnCount()); + assertEquals(1, update.affectedRowCount()); + } + + @Test + public void testAffectedColumnCountWithMultipleSimpleCells() + { + PartitionUpdate.Builder updateBuilder = partitionUpdateFactory.builder(metadata, partitionKey, metadata.regularAndStaticColumns(), 16); + + Row.Builder builder = BTreeRow.unsortedBuilder(); + builder.newRow(Clustering.make(ByteBufferUtil.bytes(1))); + builder.addCell(BufferCell.live(r1, 1000, ByteBufferUtil.bytes(100))); + builder.addCell(BufferCell.live(r2, 1000, ByteBufferUtil.bytes(200))); + builder.addCell(BufferCell.live(r3, 1000, ByteBufferUtil.bytes(300))); + + updateBuilder.add(builder.build()); + PartitionUpdate update = updateBuilder.build(); + + // Should count 3 cells + assertEquals(3, update.affectedColumnCount()); + assertEquals(1, update.affectedRowCount()); + } + + @Test + public void testAffectedColumnCountWithStaticCells() + { + PartitionUpdate.Builder updateBuilder = partitionUpdateFactory.builder(metadata, partitionKey, 
metadata.regularAndStaticColumns(), 16); + + Row.Builder builder = BTreeRow.unsortedBuilder(); + builder.newRow(Clustering.STATIC_CLUSTERING); + builder.addCell(BufferCell.live(s1, 1000, ByteBufferUtil.bytes(100))); + builder.addCell(BufferCell.live(s2, 1000, ByteBufferUtil.bytes(200))); + + updateBuilder.add(builder.build()); + PartitionUpdate update = updateBuilder.build(); + + // Should count 2 static cells + assertEquals(2, update.affectedColumnCount()); + assertEquals(1, update.affectedRowCount()); + } + + @Test + public void testAffectedColumnCountWithComplexColumn() + { + PartitionUpdate.Builder updateBuilder = partitionUpdateFactory.builder(metadata, partitionKey, metadata.regularAndStaticColumns(), 16); + + Row.Builder builder = BTreeRow.unsortedBuilder(); + builder.newRow(Clustering.make(ByteBufferUtil.bytes(1))); + + // Add cells to a complex column (set) + builder.addCell(BufferCell.live(c1, 1000, ByteBufferUtil.bytes(1), CellPath.create(ByteBufferUtil.bytes(10)))); + builder.addCell(BufferCell.live(c1, 1000, ByteBufferUtil.bytes(1), CellPath.create(ByteBufferUtil.bytes(20)))); + builder.addCell(BufferCell.live(c1, 1000, ByteBufferUtil.bytes(1), CellPath.create(ByteBufferUtil.bytes(30)))); + + updateBuilder.add(builder.build()); + PartitionUpdate update = updateBuilder.build(); + + // Should count 1 column (complex columns count as 1, not individual cells) + assertEquals(1, update.affectedColumnCount()); + assertEquals(1, update.affectedRowCount()); + } + + @Test + public void testAffectedColumnCountWithMixedColumns() + { + PartitionUpdate.Builder updateBuilder = partitionUpdateFactory.builder(metadata, partitionKey, metadata.regularAndStaticColumns(), 16); + + // Add static row + Row.Builder staticBuilder = BTreeRow.unsortedBuilder(); + staticBuilder.newRow(Clustering.STATIC_CLUSTERING); + staticBuilder.addCell(BufferCell.live(s1, 1010, ByteBufferUtil.bytes(100))); + updateBuilder.add(staticBuilder.build()); + + // Add regular row with simple and 
complex columns + Row.Builder regularBuilder = BTreeRow.unsortedBuilder(); + regularBuilder.newRow(Clustering.make(ByteBufferUtil.bytes(1))); + regularBuilder.addCell(BufferCell.live(r1, 1000, ByteBufferUtil.bytes(100))); + regularBuilder.addCell(BufferCell.live(r2, 1001, ByteBufferUtil.bytes(200))); + regularBuilder.addCell(BufferCell.live(c1, 1002, ByteBufferUtil.bytes(1), CellPath.create(ByteBufferUtil.bytes(10)))); + regularBuilder.addCell(BufferCell.live(c1, 1002, ByteBufferUtil.bytes(1), CellPath.create(ByteBufferUtil.bytes(20)))); + updateBuilder.add(regularBuilder.build()); + + PartitionUpdate update = updateBuilder.build(); + + // Should count: 1 static + 2 simple regular + 1 complex column = 4 columns (not cells) + assertEquals(4, update.affectedColumnCount()); + assertEquals(2, update.affectedRowCount()); + } + + @Test + public void testAffectedColumnCountWithRangeTombstone() + { + PartitionUpdate.Builder builder = partitionUpdateFactory.builder(metadata, partitionKey, metadata.regularAndStaticColumns(), 16); + + // Add a single range tombstone covering clusterings 1..10 + builder.add(new RangeTombstone(org.apache.cassandra.db.Slice.make(Clustering.make(ByteBufferUtil.bytes(1)), + Clustering.make(ByteBufferUtil.bytes(10))), + DeletionTime.build(1000, FBUtilities.nowInSeconds()))); + PartitionUpdate update = builder.build(); + + // A single range tombstone is expected to be counted once, as affecting all regular columns (factor 1, not one per open/close marker) + int expected = 1 * metadata.regularColumns().size(); + assertEquals(expected, update.affectedColumnCount()); + assertEquals(1, update.affectedRowCount()); + } + + @Test + public void testAffectedColumnCountWithMultipleRangeTombstones() + { + PartitionUpdate.Builder builder = partitionUpdateFactory.builder(metadata, partitionKey, metadata.regularAndStaticColumns(), 16); + + // Add multiple range tombstones + builder.add(new RangeTombstone(org.apache.cassandra.db.Slice.make(Clustering.make(ByteBufferUtil.bytes(1)), + Clustering.make(ByteBufferUtil.bytes(5))), + 
DeletionTime.build(1000, FBUtilities.nowInSeconds()))); + builder.add(new RangeTombstone(org.apache.cassandra.db.Slice.make(Clustering.make(ByteBufferUtil.bytes(10)), + Clustering.make(ByteBufferUtil.bytes(15))), + DeletionTime.build(1001, FBUtilities.nowInSeconds()))); + PartitionUpdate update = builder.build(); + + // Two disjoint range tombstones: each is expected to be counted once, as affecting all regular columns + int expected = 2 * metadata.regularColumns().size(); + assertEquals(expected, update.affectedColumnCount()); + assertEquals(2, update.affectedRowCount()); + } + + @Test + public void testAffectedColumnCountWithOverridingRangeTombstone() + { + PartitionUpdate.Builder builder = partitionUpdateFactory.builder(metadata, partitionKey, metadata.regularAndStaticColumns(), 16); + + // Add a range tombstone over 3..5, then a newer one over 1..7 that fully covers it + builder.add(new RangeTombstone(org.apache.cassandra.db.Slice.make(Clustering.make(ByteBufferUtil.bytes(3)), + Clustering.make(ByteBufferUtil.bytes(5))), + DeletionTime.build(1000, FBUtilities.nowInSeconds()))); + builder.add(new RangeTombstone(org.apache.cassandra.db.Slice.make(Clustering.make(ByteBufferUtil.bytes(1)), + Clustering.make(ByteBufferUtil.bytes(7))), + DeletionTime.build(1001, FBUtilities.nowInSeconds()))); + PartitionUpdate update = builder.build(); + + // The newer 1..7 tombstone covers the older 3..5 one, so only a single range's worth of regular columns is expected + int expected = 1 * metadata.regularColumns().size(); + assertEquals(expected, update.affectedColumnCount()); + assertEquals(1, update.affectedRowCount()); + } + + @Test + public void testAffectedColumnCountWithOverlappingRangeTombstone() + { + PartitionUpdate.Builder builder = partitionUpdateFactory.builder(metadata, partitionKey, metadata.regularAndStaticColumns(), 16); + + // Add two partially overlapping range tombstones + builder.add(new RangeTombstone(org.apache.cassandra.db.Slice.make(Clustering.make(ByteBufferUtil.bytes(1)), + Clustering.make(ByteBufferUtil.bytes(5))), + DeletionTime.build(1000, 
FBUtilities.nowInSeconds()))); + builder.add(new RangeTombstone(org.apache.cassandra.db.Slice.make(Clustering.make(ByteBufferUtil.bytes(3)), + Clustering.make(ByteBufferUtil.bytes(7))), + DeletionTime.build(1001, FBUtilities.nowInSeconds()))); + PartitionUpdate update = builder.build(); + + // Neither tombstone fully covers the other, so both are expected to be counted, each as affecting all regular columns + int expected = 2 * metadata.regularColumns().size(); + assertEquals(expected, update.affectedColumnCount()); + assertEquals(2, update.affectedRowCount()); + } + + @Test + public void testAffectedColumnCountWithRangeTombstoneAndCells() + { + PartitionUpdate.Builder updateBuilder = partitionUpdateFactory.builder(metadata, partitionKey, metadata.regularAndStaticColumns(), 16); + + // Add a range tombstone + updateBuilder.add(new RangeTombstone(org.apache.cassandra.db.Slice.make(Clustering.make(ByteBufferUtil.bytes(1)), + Clustering.make(ByteBufferUtil.bytes(10))), + DeletionTime.build(1000, FBUtilities.nowInSeconds()))); + + // Add a row with cells, at a clustering (20) outside the deleted range + Row.Builder builder = BTreeRow.unsortedBuilder(); + builder.newRow(Clustering.make(ByteBufferUtil.bytes(20))); + builder.addCell(BufferCell.live(r1, 1000, ByteBufferUtil.bytes(100))); + builder.addCell(BufferCell.live(r2, 1000, ByteBufferUtil.bytes(200))); + updateBuilder.add(builder.build()); + + PartitionUpdate update = updateBuilder.build(); + + // Should count: the range tombstone once as all regular columns (5 in this test's schema), plus 2 cells = 7 + int expected = 1 * metadata.regularColumns().size() + 2; + assertEquals(expected, update.affectedColumnCount()); + assertEquals(2, update.affectedRowCount()); + } + + @Test + public void testAffectedColumnCountWithMultipleRows() + { + PartitionUpdate.Builder updateBuilder = partitionUpdateFactory.builder(metadata, partitionKey, metadata.regularAndStaticColumns(), 16); + + // Add first row + Row.Builder builder1 = BTreeRow.unsortedBuilder(); + 
builder1.newRow(Clustering.make(ByteBufferUtil.bytes(1))); + builder1.addCell(BufferCell.live(r1, 1000, ByteBufferUtil.bytes(100))); + updateBuilder.add(builder1.build()); + + // Add second row + Row.Builder builder2 = BTreeRow.unsortedBuilder(); + builder2.newRow(Clustering.make(ByteBufferUtil.bytes(2))); + builder2.addCell(BufferCell.live(r2, 1000, ByteBufferUtil.bytes(200))); + builder2.addCell(BufferCell.live(r3, 1000, ByteBufferUtil.bytes(300))); + updateBuilder.add(builder2.build()); + + PartitionUpdate update = updateBuilder.build(); + + // Should count: 1 + 2 = 3 cells + assertEquals(3, update.affectedColumnCount()); + assertEquals(2, update.affectedRowCount()); + } + + @Test + public void testAffectedColumnCountEmpty() + { + TriePartitionUpdate update = TriePartitionUpdate.emptyUpdate(metadata, partitionKey); + + // Empty update should have 0 affected columns + assertEquals(0, update.affectedColumnCount()); + assertEquals(0, update.affectedRowCount()); + } + + @Test + public void testAffectedColumnCountWithComplexColumnDeletion() + { + PartitionUpdate.Builder updateBuilder = partitionUpdateFactory.builder(metadata, partitionKey, metadata.regularAndStaticColumns(), 16); + + Row.Builder builder = BTreeRow.unsortedBuilder(); + builder.newRow(Clustering.make(ByteBufferUtil.bytes(1))); + + // Add a complex column deletion (marker) + DeletionTime deletion = DeletionTime.build(1000, FBUtilities.nowInSeconds()); + builder.addComplexDeletion(c1, deletion); + + updateBuilder.add(builder.build()); + PartitionUpdate update = updateBuilder.build(); + + // Complex column deletion counts as 1 column + assertEquals(1, update.affectedColumnCount()); + /// Trie-backed partitions only count rows with live data. See [TrieBackedPartition#rowCount]. 
+ if (partitionUpdateFactory != TriePartitionUpdate.FACTORY) + assertEquals(1, update.affectedRowCount()); + } + + @Test + public void testAffectedColumnCountWithComplexColumnDeletionAndLiveData() + { + PartitionUpdate.Builder updateBuilder = partitionUpdateFactory.builder(metadata, partitionKey, metadata.regularAndStaticColumns(), 16); + + Row.Builder builder = BTreeRow.unsortedBuilder(); + builder.newRow(Clustering.make(ByteBufferUtil.bytes(1))); + + // Add a complex column deletion (marker) at timestamp 1000 + DeletionTime deletion = DeletionTime.build(1000, FBUtilities.nowInSeconds()); + builder.addComplexDeletion(c1, deletion); + + // Add newer live cells to the same complex column at timestamp 2000 + builder.addCell(BufferCell.live(c1, 2000, ByteBufferUtil.bytes(1), CellPath.create(ByteBufferUtil.bytes(10)))); + builder.addCell(BufferCell.live(c1, 2000, ByteBufferUtil.bytes(1), CellPath.create(ByteBufferUtil.bytes(20)))); + + updateBuilder.add(builder.build()); + PartitionUpdate update = updateBuilder.build(); + + // Should count the complex column as 1 (deletion marker + cells count as the same column) + assertEquals(1, update.affectedColumnCount()); + assertEquals(1, update.affectedRowCount()); + } +} diff --git a/test/unit/org/apache/cassandra/db/partitions/AtomicBTreePartitionMemtableAccountingTest.java b/test/unit/org/apache/cassandra/db/partitions/AtomicBTreePartitionMemtableAccountingTest.java index 00a7d6f79a56..8f9a13acd6a9 100644 --- a/test/unit/org/apache/cassandra/db/partitions/AtomicBTreePartitionMemtableAccountingTest.java +++ b/test/unit/org/apache/cassandra/db/partitions/AtomicBTreePartitionMemtableAccountingTest.java @@ -226,14 +226,14 @@ void execute() { // Test regular row updates Pair regularRows = makeInitialAndUpdate(r1md, c2md); - PartitionUpdate initial = PartitionUpdate.singleRowUpdate(metadata, partitionKey, regularRows.left); - PartitionUpdate update = PartitionUpdate.singleRowUpdate(metadata, partitionKey, regularRows.right); + 
PartitionUpdate initial = BTreePartitionUpdate.singleRowUpdate(metadata, partitionKey, regularRows.left); + PartitionUpdate update = BTreePartitionUpdate.singleRowUpdate(metadata, partitionKey, regularRows.right); validateUpdates(metadata, partitionKey, Arrays.asList(initial, update)); // Test static row updates Pair staticRows = makeInitialAndUpdate(s3md, c4md); - PartitionUpdate staticInitial = PartitionUpdate.singleRowUpdate(metadata, partitionKey, staticRows.left); - PartitionUpdate staticUpdate = PartitionUpdate.singleRowUpdate(metadata, partitionKey, staticRows.right); + PartitionUpdate staticInitial = BTreePartitionUpdate.singleRowUpdate(metadata, partitionKey, staticRows.left); + PartitionUpdate staticUpdate = BTreePartitionUpdate.singleRowUpdate(metadata, partitionKey, staticRows.right); validateUpdates(metadata, partitionKey, Arrays.asList(staticInitial, staticUpdate)); } @@ -376,7 +376,7 @@ private long getUnreleasableSize(Row updRow, Row exsRow, DeletionTime exsDeletio updDeletion = updRow.deletion().time(); long size = 0; - for (ColumnData exsCd : exsRow.columnData()) + for (ColumnData exsCd : exsRow) { ColumnData updCd = updRow.getColumnData(exsCd.column()); if (exsCd instanceof Cell) diff --git a/test/unit/org/apache/cassandra/db/partitions/TrieBackedPartitionMemtableAccountingTest.java b/test/unit/org/apache/cassandra/db/partitions/TrieBackedPartitionMemtableAccountingTest.java new file mode 100644 index 000000000000..2460863caa17 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/partitions/TrieBackedPartitionMemtableAccountingTest.java @@ -0,0 +1,486 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.partitions; + +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.config.Config; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.LivenessInfo; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.memtable.AbstractAllocatorMemtable; +import org.apache.cassandra.db.memtable.TrieCellData; +import org.apache.cassandra.db.memtable.TrieMemtable; +import org.apache.cassandra.db.rows.BufferCell; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellData; +import org.apache.cassandra.db.rows.CellPath; +import org.apache.cassandra.db.rows.Cells; +import org.apache.cassandra.db.rows.ColumnData; +import org.apache.cassandra.db.rows.ComplexColumnData; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.TrieBackedComplexColumn; +import org.apache.cassandra.db.rows.TrieBackedRow; +import org.apache.cassandra.db.rows.TrieTombstoneMarker; +import 
org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.tries.CellReuseTest; +import org.apache.cassandra.db.tries.InMemoryDeletionAwareTrie; +import org.apache.cassandra.db.tries.TrieSpaceExhaustedException; +import org.apache.cassandra.index.transactions.UpdateTransaction; +import org.apache.cassandra.io.compress.BufferType; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.concurrent.ImmediateFuture; +import org.apache.cassandra.utils.concurrent.OpOrder; +import org.apache.cassandra.utils.memory.MemtableAllocator; +import org.apache.cassandra.utils.memory.MemtableBufferAllocator; +import org.apache.cassandra.utils.memory.MemtableCleaner; +import org.apache.cassandra.utils.memory.MemtablePool; +import org.apache.cassandra.utils.memory.NativeAllocator; + +import static org.assertj.core.api.Assertions.assertThat; + +/* Test memory pool accounting when updating trie-backed partitions. CASSANDRA-18125 hit an issue + * where cells were doubly-counted when releasing, causing negative allocator onHeap ownership which + * crashed memtable flushing. + * + * The aim of the test is to exhaustively test updates to simple and complex cells in all possible + * states and check the accounting is reasonable. It generates an initial row, then an update row + * and checks the allocator ownership is reasonable, then compares usage to a freshly recreated + * instance of the partition. + * + * Replacing existing values does not free up memory and is accounted for when comparing + * the fresh build. 
+ */ +@RunWith(Parameterized.class) +public class TrieBackedPartitionMemtableAccountingTest +{ + public static final int INITIAL_TS = 2000; + public static final int EARLIER_TS = 1000; + public static final int LATER_TS = 3000; + + public static final long NOW_LDT = FBUtilities.nowInSeconds(); + public static final long LATER_LDT = NOW_LDT + 1000; + public static final long EARLIER_LDT = NOW_LDT - 1000; + + public static final int EXPIRED_TTL = 1; + public static final int EXPIRING_TTL = 10000; + + public static final long HEAP_LIMIT = 1 << 20; + public static final long OFF_HEAP_LIMIT = 1 << 20; + public static final float MEMTABLE_CLEANUP_THRESHOLD = 0.25f; + public static final MemtableCleaner DUMMY_CLEANER = () -> ImmediateFuture.failure(new IllegalStateException()); + + @Parameterized.Parameters(name="allocationType={0}") + public static Iterable data() + { + return Arrays.asList(Config.MemtableAllocationType.values()); + } + + @Parameterized.Parameter + public Config.MemtableAllocationType allocationType; + + static TableMetadata metadata; + static DecoratedKey partitionKey; + static ColumnMetadata r1md; + static ColumnMetadata c2md; + static ColumnMetadata s3md; + static ColumnMetadata c4md; + + @BeforeClass + public static void setUp() + { + DatabaseDescriptor.daemonInitialization(); + metadata = TableMetadata.builder("dummy_ks", "dummy_tbl") + .addPartitionKeyColumn("pk", Int32Type.instance) + .addRegularColumn("r1", Int32Type.instance) + .addRegularColumn("c2", SetType.getInstance(Int32Type.instance, true)) + .addStaticColumn("s3", Int32Type.instance) + .addStaticColumn("c4", SetType.getInstance(Int32Type.instance, true)) + .build(); + partitionKey = DatabaseDescriptor.getPartitioner().decorateKey(ByteBufferUtil.bytes(0)); + r1md = metadata.getColumn(new ColumnIdentifier("r1", false)); + c2md = metadata.getColumn(new ColumnIdentifier("c2", false)); + s3md = metadata.getColumn(new ColumnIdentifier("s3", false)); + c4md = metadata.getColumn(new 
ColumnIdentifier("c4", false)); + } + + @Ignore + @Test + public void repro() // For running in the IDE, update with failing testCase parameters to run + { + new TestCase(INITIAL_TS, Cell.NO_TTL, Cell.NO_DELETION_TIME, DeletionTime.build(EARLIER_TS, EARLIER_LDT), 1, + EARLIER_TS, Cell.NO_TTL, Cell.NO_DELETION_TIME, DeletionTime.LIVE, 3).execute(); + } + + @Test + public void exhaustiveTest() + { + // TTLs for initial and updated cells + List ttls = Arrays.asList(Cell.NO_TTL, EXPIRING_TTL, EXPIRED_TTL); + + // Initial local deletion times - a live cell, and a tombstone from now + List initialLDTs = Arrays.asList(Cell.NO_DELETION_TIME, NOW_LDT); + + // Initial complex deletion time for c2 - no deletion, earlier than c2 elements, or concurrent with c2 elements + List initialComplexDeletionTimes = Arrays.asList(DeletionTime.LIVE, + DeletionTime.build(EARLIER_TS, EARLIER_LDT), + DeletionTime.build(INITIAL_TS, NOW_LDT)); + + // Update timestamps - earlier than initial (update ignored), same as initial, or after initial (update supersedes) + List updateTimestamps = Arrays.asList(EARLIER_TS, INITIAL_TS, LATER_TS); + + // Update local deletion times - live cell, earlier tombstone, concurrent tombstone, or future deletion + List updateLDTs = Arrays.asList(Cell.NO_DELETION_TIME, EARLIER_LDT, NOW_LDT, LATER_LDT); + + // Update complex deletion time for c2 - no deletion, earlier than c2 elements, + // concurrent with c2 elements, or after c2 elements + List updateComplexDeletionTimes = Arrays.asList(DeletionTime.LIVE, + DeletionTime.build(EARLIER_TS, EARLIER_LDT), + DeletionTime.build(INITIAL_TS, NOW_LDT), + DeletionTime.build(LATER_TS, LATER_LDT)); + + // Number of cells to put in the initial and update collections - overlapping by one cell + List initialComplexCellCount = Arrays.asList(3, 1); + List updateComplexCellCount = Arrays.asList(3, 1); + + ttls.forEach(initialTTL -> { + initialLDTs.forEach(initialLDT -> { + initialComplexDeletionTimes.forEach(initialCDT -> { + 
initialComplexCellCount.forEach(numC2InitialCells -> { + updateTimestamps.forEach(updateTS -> { + ttls.forEach(updateTTL -> { + updateLDTs.forEach(updateLDT -> { + updateComplexDeletionTimes.forEach(updateCDT -> { + updateComplexCellCount.forEach(numC2UpdateCells -> { + new TestCase(INITIAL_TS, initialTTL, initialLDT, initialCDT, numC2InitialCells, + updateTS, updateTTL, updateLDT, updateCDT, numC2UpdateCells).execute(); + }); + }); + }); + }); + }); + }); + }); + }); + }); + } + + class TestCase + { + int initialTS; + int initialTTL; + long initialLDT; + DeletionTime initialCDT; + int numC2InitialCells; + int updateTS; + int updateTTL; + long updateLDT; + DeletionTime updateCDT; + Integer numC2UpdateCells; + + public TestCase(int initialTS, int initialTTL, long initialLDT, DeletionTime initialCDT, int numC2InitialCells, + int updateTS, int updateTTL, long updateLDT, DeletionTime updateCDT, Integer numC2UpdateCells) + { + this.initialTS = initialTS; + this.initialTTL = initialTTL; + this.initialLDT = initialLDT; + this.initialCDT = initialCDT; + this.numC2InitialCells = numC2InitialCells; + this.updateTS = updateTS; + this.updateTTL = updateTTL; + this.updateLDT = updateLDT; + this.updateCDT = updateCDT; + this.numC2UpdateCells = numC2UpdateCells; + System.out.println(String.format("%s %s %s %s %s %s %s %s %s %s", + initialTS, initialTTL, initialLDT, initialCDT, numC2InitialCells, + updateTS, updateTTL, updateLDT, updateCDT, numC2UpdateCells)); + } + + void execute() + { + // Test regular row updates + Pair regularRows = makeInitialAndUpdate(r1md, c2md); + PartitionUpdate initial = TriePartitionUpdate.singleRowUpdate(metadata, partitionKey, regularRows.left); + PartitionUpdate update = TriePartitionUpdate.singleRowUpdate(metadata, partitionKey, regularRows.right); + validateUpdates(metadata, partitionKey, Arrays.asList(initial, update)); + + // Test static row updates + Pair staticRows = makeInitialAndUpdate(s3md, c4md); + PartitionUpdate staticInitial = 
TriePartitionUpdate.singleRowUpdate(metadata, partitionKey, staticRows.left); + PartitionUpdate staticUpdate = TriePartitionUpdate.singleRowUpdate(metadata, partitionKey, staticRows.right); + validateUpdates(metadata, partitionKey, Arrays.asList(staticInitial, staticUpdate)); + } + + private Pair makeInitialAndUpdate(ColumnMetadata regular, ColumnMetadata complex) + { + final ByteBuffer initialValueBB = ByteBufferUtil.bytes(111); + final ByteBuffer updateValueBB = ByteBufferUtil.bytes(222); + + // Create the initial row to populate the partition with + Row.Builder initialRowBuilder = TrieBackedRow.builder(metadata.regularAndStaticColumns()); + initialRowBuilder.newRow(regular.isStatic() ? Clustering.STATIC_CLUSTERING : Clustering.EMPTY); + + initialRowBuilder.addCell(makeCell(regular, initialTS, initialTTL, initialLDT, initialValueBB, null)); + if (initialCDT != DeletionTime.LIVE) + initialRowBuilder.addComplexDeletion(complex, initialCDT); + int cellPath = 1000; + for (int i = 0; i < numC2InitialCells; i++) + initialRowBuilder.addCell(makeCell(complex, initialTS, initialTTL, initialLDT, + ByteBufferUtil.EMPTY_BYTE_BUFFER, + CellPath.create(ByteBufferUtil.bytes(cellPath--)))); + Row initialRow = initialRowBuilder.build(); + + // Create the update row to modify the partition with + Row.Builder updateRowBuilder = TrieBackedRow.builder(metadata.regularAndStaticColumns()); + updateRowBuilder.newRow(regular.isStatic() ? 
Clustering.STATIC_CLUSTERING : Clustering.EMPTY); + + updateRowBuilder.addCell(makeCell(regular, updateTS, updateTTL, updateLDT, updateValueBB, null)); + if (updateCDT != DeletionTime.LIVE) + updateRowBuilder.addComplexDeletion(complex, updateCDT); + + // Make multiple update cells to make any issues more pronounced + cellPath = 1000; + for (int i = 0; i < numC2UpdateCells; i++) + updateRowBuilder.addCell(makeCell(complex, updateTS, updateTTL, updateLDT, + ByteBufferUtil.EMPTY_BYTE_BUFFER, + CellPath.create(ByteBufferUtil.bytes(cellPath++)))); + Row updateRow = updateRowBuilder.build(); + return Pair.create(initialRow, updateRow); + } + + Cell makeCell(ColumnMetadata column, long timestamp, int ttl, long localDeletionTime, ByteBuffer value, CellPath path) + { + if (localDeletionTime != Cell.NO_DELETION_TIME) // a tombstone never carries a ttl or a value + { + ttl = Cell.NO_TTL; + value = ByteBufferUtil.EMPTY_BYTE_BUFFER; + } + return new BufferCell(column, timestamp, ttl, localDeletionTime, value, path); + } + } + + static BufferType bufferTypeFor(Config.MemtableAllocationType allocationType) + { + switch (allocationType) + { + case heap_buffers: + case unslabbed_heap_buffers: + case unslabbed_heap_buffers_logged: + return BufferType.ON_HEAP; + case offheap_buffers: + case offheap_objects: + return BufferType.OFF_HEAP; + default: + throw new IllegalArgumentException("Unknown allocation type: " + allocationType); + } + } + + void validateUpdates(TableMetadata metadata, DecoratedKey partitionKey, List updates) + { + OpOrder opOrder = new OpOrder(); + opOrder.start(); + UpdateTransaction indexer = UpdateTransaction.NO_OP; + + MemtablePool memtablePool = AbstractAllocatorMemtable.createMemtableAllocatorPoolInternal(allocationType, + HEAP_LIMIT, + OFF_HEAP_LIMIT, + MEMTABLE_CLEANUP_THRESHOLD, + DUMMY_CLEANER); + MemtableAllocator allocator = memtablePool.newAllocator("initial"); + MemtableAllocator recreatedAllocator = memtablePool.newAllocator("recreated"); + try + { + // Prepare a 
partition to receive updates + TrieMemtable.CellDataBufferManager cellDataBufferManager; + if (allocator instanceof NativeAllocator) + cellDataBufferManager = new TrieMemtable.NativeBufferManager((NativeAllocator) allocator); + else + cellDataBufferManager = new TrieMemtable.SlabBufferManager((MemtableBufferAllocator) allocator, + opOrder, + bufferTypeFor(allocationType).onHeapSizeWithoutData()); + InMemoryDeletionAwareTrie trie = + InMemoryDeletionAwareTrie.longLived(TrieBackedPartition.BYTE_COMPARABLE_VERSION, bufferTypeFor(allocationType), opOrder, + new TrieMemtable.TrieSerializer(cellDataBufferManager, null)); + TriePartitionUpdater updater = new TriePartitionUpdater(null, trie); + trie.putRecursive(ByteComparable.EMPTY, new TrieMemtable.PartitionData(null), (x, y) -> y); + TrieBackedPartition partition = new TrieBackedPartition(partitionKey, + metadata.regularAndStaticColumns(), + new EncodingStats(LivenessInfo.NO_TIMESTAMP, LivenessInfo.NO_EXPIRATION_TIME, 0), + 1, // to avoid isEmpty true + 0, + trie, + metadata); + + // For each update, apply it and verify the memory owned by the allocator is not negative + long unreleasable = updates.stream().mapToLong(updateUntyped -> { + TriePartitionUpdate update = TriePartitionUpdate.asTrieUpdate(updateUntyped); + DeletionTime exsDeletion = partition.partitionLevelDeletion(); + DeletionTime updDeletion = update.partitionLevelDeletion(); + long updateUnreleasable = 0; + if (!partition.isEmpty()) + { + for (Row updRow : update.rows()) + { + Row exsRow = partition.getRow(updRow.clustering()); + updateUnreleasable += getUnreleasableSize(updRow, exsRow, exsDeletion, updDeletion); + } + } + updateUnreleasable += getUnreleasableSize(update.staticRow(), partition.staticRow(), exsDeletion, updDeletion); + + OpOrder.Group writeOp = opOrder.getCurrent(); + TrieMemtable.mergeUpdate(trie, allocator, TriePartitionUpdate.asTrieUpdate(update).trie, indexer, writeOp, updater); + opOrder.newBarrier().issue(); + + 
assertThat(allocator.onHeap().owns()).isGreaterThanOrEqualTo(0L); + assertThat(allocator.offHeap().owns()).isGreaterThanOrEqualTo(0L); + return updateUnreleasable; + }).sum(); + CellReuseTest.verifyFreeCellsMatchUnreachable(trie); + + // Now recreate the partition to see if there's a leak in the accounting + + if (recreatedAllocator instanceof NativeAllocator) + cellDataBufferManager = new TrieMemtable.NativeBufferManager((NativeAllocator) recreatedAllocator); + else + cellDataBufferManager = new TrieMemtable.SlabBufferManager((MemtableBufferAllocator) recreatedAllocator, + opOrder, + bufferTypeFor(allocationType).onHeapSizeWithoutData()); + InMemoryDeletionAwareTrie recreatedTrie = + InMemoryDeletionAwareTrie.longLived(TrieBackedPartition.BYTE_COMPARABLE_VERSION, bufferTypeFor(allocationType), opOrder, + new TrieMemtable.TrieSerializer(cellDataBufferManager, null)); + TriePartitionUpdater recreatedUpdater = new TriePartitionUpdater(null, recreatedTrie); + recreatedTrie.putRecursive(ByteComparable.EMPTY, new TrieMemtable.PartitionData(null), (x, y) -> y); + try (UnfilteredRowIterator iter = partition.unfilteredIterator()) + { + TriePartitionUpdate update = TriePartitionUpdate.fromIterator(iter); + opOrder.newBarrier().issue(); + OpOrder.Group writeOp = opOrder.getCurrent(); + TrieMemtable.mergeUpdate(recreatedTrie, recreatedAllocator, TriePartitionUpdate.asTrieUpdate(update).trie, indexer, writeOp, recreatedUpdater); + } + CellReuseTest.verifyFreeCellsMatchUnreachable(recreatedTrie); + + // It is possible that the two tries have different structure (e.g. non-embedded prefixes, split nodes + // instead of sparse etc.). Allow this, but make sure the difference is small. 
+ long trieDiff = trie.usedBufferSpace() - recreatedTrie.usedBufferSpace(); + assertThat(trieDiff).isLessThan(updates.size() * 50); + unreleasable += trieDiff; + + // offheap allocators don't release on heap memory, so expect the same + long unreleasableOnHeap = 0, unreleasableOffHeap = 0; + if (allocator.offHeap().owns() > 0) unreleasableOffHeap = unreleasable; + else unreleasableOnHeap = unreleasable; + + assertThat(recreatedAllocator.offHeap().owns()).isEqualTo(allocator.offHeap().owns() - unreleasableOffHeap); + assertThat(recreatedAllocator.onHeap().owns()).isEqualTo(allocator.onHeap().owns() - unreleasableOnHeap); + } + catch (TrieSpaceExhaustedException e) + { + throw new RuntimeException(e); + } + finally + { + // Release test resources + recreatedAllocator.setDiscarding(); + recreatedAllocator.setDiscarded(); + allocator.setDiscarding(); + allocator.setDiscarded(); + try + { + memtablePool.shutdownAndWait(1, TimeUnit.SECONDS); + } + catch (Throwable tr) + { + // too bad + } + } + } + + private long getUnreleasableSize(Row updRow, Row exsRow, DeletionTime exsDeletion, DeletionTime updDeletion) + { + if (exsRow == null) + return 0; + + if (exsRow.deletion().supersedes(exsDeletion)) + exsDeletion = exsRow.deletion().time(); + if (updRow.deletion().supersedes(updDeletion)) + updDeletion = updRow.deletion().time(); + + long size = 0; + for (ColumnData exsCd : exsRow) + { + ColumnData updCd = updRow.getColumnData(exsCd.column()); + if (exsCd instanceof Cell) + { + Cell exsCell = (Cell) exsCd, updCell = (Cell) updCd; + if (updDeletion.deletes(exsCell)) + size += sizeOf(exsCell); + else if (updCell != null && Cells.reconcile(exsCell, updCell) != exsCell && !exsDeletion.deletes(updCell)) + size += sizeOf(exsCell); + } + else + { + TrieBackedComplexColumn exsCcd = (TrieBackedComplexColumn) exsCd; + ComplexColumnData updCcd = (ComplexColumnData) updCd; + + DeletionTime activeExsDeletion = exsDeletion; + DeletionTime activeUpdDeletion = updDeletion; + if 
(exsCcd.complexDeletion().supersedes(exsDeletion)) + activeExsDeletion = exsCcd.complexDeletion(); + if (updCcd != null && updCcd.complexDeletion().supersedes(updDeletion)) + activeUpdDeletion = updCcd.complexDeletion(); + + for (Cell exsCell : exsCcd) + { + Cell updCell = updCcd == null ? null : updCcd.getCell(exsCell.path()); + + if (activeUpdDeletion.deletes(exsCell)) + size += sizeOf(exsCcd.getCellWithoutPath(exsCell.path())); + else if (updCell != null && Cells.reconcile(exsCell, updCell) != exsCell && !activeExsDeletion.deletes(updCell)) + size += sizeOf(exsCcd.getCellWithoutPath(exsCell.path())); + } + } + } + return size; + } + + private static long sizeOf(CellData cell) + { + return TrieCellData.offTrieSize(cell); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/db/rows/BTreeRowTest.java b/test/unit/org/apache/cassandra/db/rows/BTreeRowTest.java index cfdb9c3ae5f8..73b8d9b8524a 100644 --- a/test/unit/org/apache/cassandra/db/rows/BTreeRowTest.java +++ b/test/unit/org/apache/cassandra/db/rows/BTreeRowTest.java @@ -86,16 +86,30 @@ public void testRowMinTimespampFromPrimaryKeyListener() } @Test - public void testRowMinTimespampFromDeletion() + public void testRowMinTimespampFromDeletionShadowable() { int v1CellTimestamp = 1000; int v2CellTimestamp = 500; int primaryKeyTimestamp = 100; - int localDeletionTime = 50; + int deletionTimestamp = 50; BTreeRow.Builder builder = row(3, cell(v1Metadata, 1, v1CellTimestamp), cell(v2Metadata, 1, v2CellTimestamp)); builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(primaryKeyTimestamp, FBUtilities.nowInSeconds())); - builder.addRowDeletion(new Row.Deletion(DeletionTime.build(localDeletionTime, FBUtilities.nowInSeconds()), true)); + builder.addRowDeletion(new Row.Deletion(DeletionTime.build(deletionTimestamp, FBUtilities.nowInSeconds()), true)); Row row = builder.build(); assertEquals(primaryKeyTimestamp, row.minTimestamp()); } + + @Test + public void testRowMinTimespampFromDeletion() + { + 
int v1CellTimestamp = 1000; + int v2CellTimestamp = 500; + int primaryKeyTimestamp = 100; + int deletionTimestamp = 50; + BTreeRow.Builder builder = row(3, cell(v1Metadata, 1, v1CellTimestamp), cell(v2Metadata, 1, v2CellTimestamp)); + builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(primaryKeyTimestamp, FBUtilities.nowInSeconds())); + builder.addRowDeletion(new Row.Deletion(DeletionTime.build(deletionTimestamp, FBUtilities.nowInSeconds()), false)); + Row row = builder.build(); + assertEquals(deletionTimestamp, row.minTimestamp()); + } } \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/db/rows/ComplexColumnDataTest.java b/test/unit/org/apache/cassandra/db/rows/ComplexColumnDataTest.java index d91f110210ca..6b5ad88b0be8 100644 --- a/test/unit/org/apache/cassandra/db/rows/ComplexColumnDataTest.java +++ b/test/unit/org/apache/cassandra/db/rows/ComplexColumnDataTest.java @@ -38,9 +38,9 @@ public class ComplexColumnDataTest @Test public void testEmptyComplexColumn() { - ComplexColumnData data = new ComplexColumnData(complexColumn, - BTree.empty(), - DeletionTime.LIVE); + ComplexColumnData data = new BTreeComplexColumn(complexColumn, + BTree.empty(), + DeletionTime.LIVE); Assert.assertFalse(data.hasCells()); } @@ -48,27 +48,27 @@ public void testEmptyComplexColumn() public void testNonEmptyComplexColumn() { - ComplexColumnData data = new ComplexColumnData(complexColumn, - BTree.singleton("ignored value"), - DeletionTime.LIVE); + ComplexColumnData data = new BTreeComplexColumn(complexColumn, + BTree.singleton("ignored value"), + DeletionTime.LIVE); Assert.assertTrue(data.hasCells()); } @Test public void testComplexColumnMinTimestampWithDeletion() { - ComplexColumnData data = new ComplexColumnData(complexColumn, - BTree.empty(), - DeletionTime.build(500, 1000)); + ComplexColumnData data = new BTreeComplexColumn(complexColumn, + BTree.empty(), + DeletionTime.build(500, 1000)); Assert.assertEquals("Min timestamp must be equal to deletion 
timestamp", 500, data.minTimestamp()); } @Test public void testComplexColumnMinTimestampWithCells() { - ComplexColumnData data = new ComplexColumnData(complexColumn, - new Cell[]{ new BufferCell(simpleColumn, 100, 0, 200, null, null) }, - DeletionTime.build(500, 1000)); + ComplexColumnData data = new BTreeComplexColumn(complexColumn, + new Cell[]{ new BufferCell(simpleColumn, 100, 0, 200, null, null) }, + DeletionTime.build(500, 1000)); Assert.assertEquals("Min timestamp must be equal to min cell timestamp", 100, data.minTimestamp()); } } \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/db/rows/RowsTest.java b/test/unit/org/apache/cassandra/db/rows/RowsTest.java index 865d33a8b6a1..8429aa2055d6 100644 --- a/test/unit/org/apache/cassandra/db/rows/RowsTest.java +++ b/test/unit/org/apache/cassandra/db/rows/RowsTest.java @@ -32,6 +32,8 @@ import com.google.common.collect.Sets; import org.junit.Assert; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; @@ -47,6 +49,7 @@ import static org.apache.cassandra.db.CellTest.assertCellsEqual; +@RunWith(Parameterized.class) public class RowsTest { private static final String KEYSPACE = "rows_test"; @@ -56,6 +59,20 @@ public class RowsTest private static final ColumnMetadata m; private static final Clustering c1; + enum RowImplementation + { + BTREE, TRIE_BACKED + } + + @Parameterized.Parameter(0) + public static RowImplementation implementation; + + @Parameterized.Parameters(name = "{0}") + public static RowImplementation[] implementations() + { + return RowImplementation.values(); + } + static { DatabaseDescriptor.daemonInitialization(); @@ -217,7 +234,16 @@ private static long secondToTs(long now) private static Row.Builder createBuilder(Clustering c) { - Row.Builder builder = BTreeRow.unsortedBuilder(); + Row.Builder builder; + switch (implementation) + { + 
case TRIE_BACKED: + builder = TrieBackedRow.builder(kcvm.regularAndStaticColumns()); + break; + case BTREE: + default: + builder = BTreeRow.unsortedBuilder(); + } builder.newRow(c); return builder; } @@ -525,7 +551,7 @@ public void mergeRowDeletionSupercedesLiveness() Assert.assertEquals(expectedDeletion, merged.deletion()); Assert.assertEquals(LivenessInfo.EMPTY, merged.primaryKeyLivenessInfo()); - Assert.assertEquals(0, merged.columns().size()); + Assert.assertEquals(0, merged.columnCount()); } @@ -617,95 +643,4 @@ private static Row makeDummyRow(Cell ... cells) return builder.build(); } - - @Test - public void testLegacyCellIterator() - { - // Creates a table with - // - 3 Simple columns: a, c and e - // - 2 Complex columns: b and d - TableMetadata metadata = - TableMetadata.builder("dummy_ks", "dummy_tbl") - .addPartitionKeyColumn("k", BytesType.instance) - .addRegularColumn("a", BytesType.instance) - .addRegularColumn("b", MapType.getInstance(Int32Type.instance, BytesType.instance, true)) - .addRegularColumn("c", BytesType.instance) - .addRegularColumn("d", MapType.getInstance(Int32Type.instance, BytesType.instance, true)) - .addRegularColumn("e", BytesType.instance) - .build(); - - ColumnMetadata a = metadata.getColumn(new ColumnIdentifier("a", false)); - ColumnMetadata b = metadata.getColumn(new ColumnIdentifier("b", false)); - ColumnMetadata c = metadata.getColumn(new ColumnIdentifier("c", false)); - ColumnMetadata d = metadata.getColumn(new ColumnIdentifier("d", false)); - ColumnMetadata e = metadata.getColumn(new ColumnIdentifier("e", false)); - - Row row; - - // Row with only simple columns - - row = makeDummyRow(liveCell(a), - liveCell(c), - liveCell(e)); - - - assertCellOrder(row.cellsInLegacyOrder(metadata, false), - liveCell(a), - liveCell(c), - liveCell(e)); - - assertCellOrder(row.cellsInLegacyOrder(metadata, true), - liveCell(e), - liveCell(c), - liveCell(a)); - - // Row with only complex columns - - row = makeDummyRow(liveCell(b, 1), - 
liveCell(b, 2), - liveCell(d, 3), - liveCell(d, 4)); - - - assertCellOrder(row.cellsInLegacyOrder(metadata, false), - liveCell(b, 1), - liveCell(b, 2), - liveCell(d, 3), - liveCell(d, 4)); - - assertCellOrder(row.cellsInLegacyOrder(metadata, true), - liveCell(d, 4), - liveCell(d, 3), - liveCell(b, 2), - liveCell(b, 1)); - - // Row with mixed simple and complex columns - - row = makeDummyRow(liveCell(a), - liveCell(c), - liveCell(e), - liveCell(b, 1), - liveCell(b, 2), - liveCell(d, 3), - liveCell(d, 4)); - - - assertCellOrder(row.cellsInLegacyOrder(metadata, false), - liveCell(a), - liveCell(b, 1), - liveCell(b, 2), - liveCell(c), - liveCell(d, 3), - liveCell(d, 4), - liveCell(e)); - - assertCellOrder(row.cellsInLegacyOrder(metadata, true), - liveCell(e), - liveCell(d, 4), - liveCell(d, 3), - liveCell(c), - liveCell(b, 2), - liveCell(b, 1), - liveCell(a)); - } } \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/db/rows/TrieBackedRowTest.java b/test/unit/org/apache/cassandra/db/rows/TrieBackedRowTest.java new file mode 100644 index 000000000000..70150545f725 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/rows/TrieBackedRowTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.rows; + +import org.junit.Test; + +import org.apache.cassandra.Util; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.LivenessInfo; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.FBUtilities; + +import static org.junit.Assert.assertEquals; + +public class TrieBackedRowTest +{ + private final TableMetadata metadata = TableMetadata.builder("", "") + .addPartitionKeyColumn("pk", Int32Type.instance) + .addClusteringColumn("ck", Int32Type.instance) + .addRegularColumn("v1", Int32Type.instance) + .addRegularColumn("v2", Int32Type.instance) + .build(); + private final ColumnMetadata v2Metadata = metadata.regularAndStaticColumns().columns(false).getSimple(1); + private final ColumnMetadata v1Metadata = metadata.regularAndStaticColumns().columns(false).getSimple(0); + + private TrieBackedRow.Builder row(int ck, Cell... 
columns) + { + TrieBackedRow.Builder builder = new TrieBackedRow.Builder(metadata.regularAndStaticColumns()); + builder.newRow(Util.clustering(metadata.comparator, ck)); + for (Cell cell : columns) + builder.addCell(cell); + return builder; + } + + private Cell cell(ColumnMetadata metadata, int v, long timestamp) + { + return new BufferCell(metadata, + timestamp, + BufferCell.NO_TTL, + BufferCell.NO_DELETION_TIME, + ByteBufferUtil.bytes(v), + null); + } + + @Test + public void testRowMinTimespampFromCells() + { + int v1CellTimestamp = 1000; + int v2CellTimestamp = 500; + int primaryKeyTimestamp = 2000; + TrieBackedRow.Builder builder = row(1, cell(v1Metadata, 1, v1CellTimestamp), cell(v2Metadata, 1, v2CellTimestamp)); + builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(primaryKeyTimestamp, FBUtilities.nowInSeconds())); + Row row = builder.build(); + assertEquals(v2CellTimestamp, row.minTimestamp()); + } + + @Test + public void testRowMinTimespampFromPrimaryKeyListener() + { + int v1CellTimestamp = 1000; + int v2CellTimestamp = 500; + int primaryKeyTimestamp = 100; + TrieBackedRow.Builder builder = row(2, cell(v1Metadata, 1, v1CellTimestamp), cell(v2Metadata, 1, v2CellTimestamp)); + builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(primaryKeyTimestamp, FBUtilities.nowInSeconds())); + Row row = builder.build(); + assertEquals(primaryKeyTimestamp, row.minTimestamp()); + } + + @Test + public void testRowMinTimespampFromDeletion() + { + int v1CellTimestamp = 1000; + int v2CellTimestamp = 500; + int primaryKeyTimestamp = 100; + int deletionTimestamp = 50; + TrieBackedRow.Builder builder = row(3, cell(v1Metadata, 1, v1CellTimestamp), cell(v2Metadata, 1, v2CellTimestamp)); + builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(primaryKeyTimestamp, FBUtilities.nowInSeconds())); + builder.addRowDeletion(new Row.Deletion(DeletionTime.build(deletionTimestamp, FBUtilities.nowInSeconds()), false)); + Row row = builder.build(); + assertEquals(deletionTimestamp, 
row.minTimestamp()); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/db/rows/TrieTombstoneMarkerTest.java b/test/unit/org/apache/cassandra/db/rows/TrieTombstoneMarkerTest.java new file mode 100644 index 000000000000..bf627e38a1db --- /dev/null +++ b/test/unit/org/apache/cassandra/db/rows/TrieTombstoneMarkerTest.java @@ -0,0 +1,561 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.rows; + +import java.util.Arrays; +import java.util.List; + +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ClusteringComparator; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.tries.Direction; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.quicktheories.core.Gen; + +import static org.junit.Assert.*; +import static org.quicktheories.QuickTheory.qt; +import static org.quicktheories.generators.SourceDSL.integers; + +/** + * Unit tests for TrieTombstoneMarker and its implementations (Covering, Boundary, Point). 
+ */ +public class TrieTombstoneMarkerTest +{ + @BeforeClass + public static void setup() + { + DatabaseDescriptor.daemonInitialization(); + } + + // Test data generators + private static final int MAX_TIMESTAMP = 1000000; + private static final int MAX_LOCAL_DELETION_TIME = 100000; + + private Gen deletionTimeGen() + { + return integers().between(1, MAX_TIMESTAMP) + .zip(integers().between(1, MAX_LOCAL_DELETION_TIME), + (ts, ldt) -> DeletionTime.build(ts, ldt)); + } + + private Gen deletionTimeOrLiveGen() + { + return integers().between(0, MAX_TIMESTAMP) + .zip(integers().between(0, MAX_LOCAL_DELETION_TIME), + (ts, ldt) -> ts == 0 ? DeletionTime.LIVE : DeletionTime.build(ts, ldt)); + } + + // TODO: Test level markers + + // ========== Covering Marker Tests ========== + + @Test + public void testCoveringCreation() + { + DeletionTime dt = DeletionTime.build(100, 50); + TrieTombstoneMarker.Covering marker = TrieTombstoneMarker.covering(dt, TrieTombstoneMarker.Kind.RANGE); + + assertNotNull(marker); + assertFalse(marker.isBoundary()); + assertEquals(marker, marker.precedingState(Direction.FORWARD)); + assertEquals(marker, marker.succedingState(Direction.FORWARD)); + assertEquals(TrieTombstoneMarker.Kind.RANGE, marker.deletionKind()); + assertEquals(dt, marker); + } + + @Test + public void testCoveringIsNotBoundary() + { + TrieTombstoneMarker covering = TrieTombstoneMarker.covering(DeletionTime.build(100, 50), TrieTombstoneMarker.Kind.RANGE); + assertFalse("Covering marker should not be a boundary", covering.isBoundary()); + } + + @Test + public void testCoveringPrecedingState() + { + TrieTombstoneMarker covering = TrieTombstoneMarker.covering(DeletionTime.build(100, 50), TrieTombstoneMarker.Kind.RANGE); + + // Covering markers return themselves as preceding state in both directions + assertEquals(covering, covering.precedingState(Direction.FORWARD)); + assertEquals(covering, covering.precedingState(Direction.REVERSE)); + // Covering markers return themselves as 
succeding state in both directions + assertEquals(covering, covering.succedingState(Direction.FORWARD)); + assertEquals(covering, covering.succedingState(Direction.REVERSE)); + } + + @Test + public void testCoveringCannotConvertToRangeTombstoneMarker() + { + TrieTombstoneMarker covering = TrieTombstoneMarker.covering(DeletionTime.build(100, 50), TrieTombstoneMarker.Kind.RANGE); + ClusteringComparator comparator = new ClusteringComparator(Int32Type.instance); + + try + { + covering.toRangeTombstoneMarker(ByteComparable.EMPTY, + ByteComparable.Version.OSS50, + comparator); + fail("Covering marker should not be convertible to RangeTombstoneMarker"); + } + catch (AssertionError e) + { + // Expected + } + } + + @Test + public void testCoveringMergeWithCovering() + { + qt().forAll(deletionTimeGen(), deletionTimeGen()) + .checkAssert((dt1, dt2) -> { + TrieTombstoneMarker m1 = TrieTombstoneMarker.covering(dt1, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker m2 = TrieTombstoneMarker.covering(dt2, TrieTombstoneMarker.Kind.RANGE); + + TrieTombstoneMarker merged = m1.mergeWith(m2); + + assertNotNull(merged); + assertFalse(merged.isBoundary()); + + // Should keep the higher deletion time + DeletionTime expected = dt1.supersedes(dt2) ? 
dt1 : dt2; + assertEquals(expected, merged); + }); + } + + @Test + public void testCoveringDropShadowed() + { + qt().forAll(deletionTimeGen(), deletionTimeGen()) + .checkAssert((dt1, dt2) -> { + TrieTombstoneMarker marker = TrieTombstoneMarker.covering(dt1, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker deletion = TrieTombstoneMarker.covering(dt2, TrieTombstoneMarker.Kind.RANGE); + + TrieTombstoneMarker result = marker.dropShadowed(deletion); + + if (dt1.supersedes(dt2)) + { + // Marker survives if it supersedes the deletion + assertNotNull(result); + assertEquals(dt1, result); + } + else + { + // Marker is dropped if deletion supersedes it + assertNull(result); + } + }); + } + + // ========== Boundary Marker Tests ========== + + @Test + public void testBoundaryCreation() + { + DeletionTime left = DeletionTime.build(100, 50); + DeletionTime right = DeletionTime.build(200, 60); + + TrieTombstoneMarker.Covering leftCov = TrieTombstoneMarker.covering(left, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker.Covering rightCov = TrieTombstoneMarker.covering(right, TrieTombstoneMarker.Kind.RANGE); + + TrieTombstoneMarker boundary = TrieTombstoneMarker.make(leftCov, rightCov, null); + + assertNotNull(boundary); + assertTrue(boundary.isBoundary()); + } + + @Test + public void testBoundaryWithEqualSidesBecomeCovering() + { + DeletionTime dt = DeletionTime.build(100, 50); + TrieTombstoneMarker.Covering cov = TrieTombstoneMarker.covering(dt, TrieTombstoneMarker.Kind.RANGE); + + TrieTombstoneMarker result = TrieTombstoneMarker.make(cov, cov, null); + + assertNotNull(result); + assertFalse("Equal sides should result in covering marker", result.isBoundary()); + assertEquals(dt, result); + } + + @Test + public void testBoundaryPrecedingState() + { + DeletionTime left = DeletionTime.build(100, 50); + DeletionTime right = DeletionTime.build(200, 60); + + TrieTombstoneMarker.Covering leftCov = TrieTombstoneMarker.covering(left, TrieTombstoneMarker.Kind.RANGE); + 
TrieTombstoneMarker.Covering rightCov = TrieTombstoneMarker.covering(right, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker boundary = TrieTombstoneMarker.make(leftCov, rightCov, null); + + assertEquals(leftCov, boundary.precedingState(Direction.FORWARD)); + assertEquals(rightCov, boundary.precedingState(Direction.REVERSE)); + + assertEquals(rightCov, boundary.succedingState(Direction.FORWARD)); + assertEquals(leftCov, boundary.succedingState(Direction.REVERSE)); + } + + @Test + public void testBoundaryMergeWithBoundary() + { + qt().forAll(deletionTimeOrLiveGen(), deletionTimeOrLiveGen(), + deletionTimeOrLiveGen(), deletionTimeOrLiveGen()) + .assuming((dt1, dt2, dt3, dt4) -> + !dt1.equals(dt2) && !dt3.equals(dt4)) // Ensure we have boundaries + .checkAssert((dt1, dt2, dt3, dt4) -> { + TrieTombstoneMarker.Covering left1 = dt1.isLive() ? null : TrieTombstoneMarker.covering(dt1, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker.Covering right1 = dt2.isLive() ? null : TrieTombstoneMarker.covering(dt2, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker.Covering left2 = dt3.isLive() ? null : TrieTombstoneMarker.covering(dt3, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker.Covering right2 = dt4.isLive() ? 
null : TrieTombstoneMarker.covering(dt4, TrieTombstoneMarker.Kind.RANGE); + + TrieTombstoneMarker b1 = TrieTombstoneMarker.make(left1, right1, null); + TrieTombstoneMarker b2 = TrieTombstoneMarker.make(left2, right2, null); + + TrieTombstoneMarker merged = b1.mergeWith(b2); + + assertNotNull(merged); + + DeletionTime leftMax = DeletionTime.merge(left1, left2); + DeletionTime rightMax = DeletionTime.merge(right1, right2); + assertEquals(leftMax, merged.leftDeletion()); + assertEquals(rightMax, merged.rightDeletion()); + }); + } + + @Test + public void testBoundaryRestrict() + { + DeletionTime left = DeletionTime.build(100, 50); + DeletionTime right = DeletionTime.build(200, 60); + + TrieTombstoneMarker.Covering leftCov = TrieTombstoneMarker.covering(left, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker.Covering rightCov = TrieTombstoneMarker.covering(right, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker boundary = TrieTombstoneMarker.make(leftCov, rightCov, null); + + // Restrict to before only + TrieTombstoneMarker beforeOnly = boundary.restrict(true, false); + assertNotNull(beforeOnly); + assertTrue(beforeOnly.isBoundary()); + assertEquals(leftCov, beforeOnly.leftDeletion()); + assertEquals(null, beforeOnly.rightDeletion()); + + // Restrict to after only + TrieTombstoneMarker afterOnly = boundary.restrict(false, true); + assertNotNull(afterOnly); + assertTrue(afterOnly.isBoundary()); + assertEquals(null, afterOnly.leftDeletion()); + assertEquals(rightCov, afterOnly.rightDeletion()); + + // Restrict to both (should return same) + TrieTombstoneMarker both = boundary.restrict(true, true); + assertSame(boundary, both); + + // Restrict to neither (should return null) + TrieTombstoneMarker neither = boundary.restrict(false, false); + assertNull(neither); + } + + // ========== Point Marker Tests ========== + + @Test + public void testPointCreation() + { + DeletionTime pointDt = DeletionTime.build(150, 55); + TrieTombstoneMarker point = 
TrieTombstoneMarker.point(pointDt, TrieTombstoneMarker.Kind.ROW); + + assertNotNull(point); + assertTrue(point.isBoundary()); + assertEquals(TrieTombstoneMarker.Kind.ROW, point.applicableToPointForward().deletionKind()); + assertEquals(pointDt, point.applicableToPointForward()); + } + + @Test + public void testPointWithCoveringDeletion() + { + DeletionTime pointDt = DeletionTime.build(150, 55); + DeletionTime coveringDt = DeletionTime.build(100, 50); + + TrieTombstoneMarker.Covering pointCov = TrieTombstoneMarker.covering(pointDt, TrieTombstoneMarker.Kind.ROW); + TrieTombstoneMarker.Covering coveringCov = TrieTombstoneMarker.covering(coveringDt, TrieTombstoneMarker.Kind.RANGE); + + TrieTombstoneMarker point = new TrieTombstoneMarker.Point(pointCov, coveringCov); + + assertNotNull(point); + assertEquals(TrieTombstoneMarker.Kind.ROW, point.applicableToPointForward().deletionKind()); + assertEquals(pointDt, point.applicableToPointForward()); + } + + @Test + public void testPointMergeWithCovering() + { + DeletionTime pointDt = DeletionTime.build(150, 55); + DeletionTime coveringDt = DeletionTime.build(100, 50); + + TrieTombstoneMarker point = TrieTombstoneMarker.point(pointDt, TrieTombstoneMarker.Kind.ROW); + TrieTombstoneMarker covering = TrieTombstoneMarker.covering(coveringDt, TrieTombstoneMarker.Kind.RANGE); + + TrieTombstoneMarker merged = point.mergeWith(covering); + + assertNotNull(merged); + assertEquals(TrieTombstoneMarker.Kind.ROW, merged.applicableToPointForward().deletionKind()); + assertEquals(pointDt, merged.applicableToPointForward()); + + // Point should survive if it supersedes covering + if (pointDt.supersedes(coveringDt)) + { + assertEquals(pointDt, merged.pointDeletion()); + } + } + + @Test + public void testPointMergeWithPoint() + { + qt().forAll(deletionTimeGen(), deletionTimeGen()) + .checkAssert((dt1, dt2) -> { + TrieTombstoneMarker p1 = TrieTombstoneMarker.point(dt1, TrieTombstoneMarker.Kind.ROW); + TrieTombstoneMarker p2 = 
TrieTombstoneMarker.point(dt2, TrieTombstoneMarker.Kind.ROW); + + TrieTombstoneMarker merged = p1.mergeWith(p2); + + assertNotNull(merged); + assertEquals(TrieTombstoneMarker.Kind.ROW, merged.applicableToPointForward().deletionKind()); + + // Should keep the higher deletion time + DeletionTime expected = dt1.supersedes(dt2) ? dt1 : dt2; + assertEquals(expected, merged.applicableToPointForward()); + }); + } + + @Test + public void testPointDropShadowed() + { + qt().forAll(deletionTimeGen(), deletionTimeGen()) + .checkAssert((pointDt, deletionDt) -> { + TrieTombstoneMarker point = TrieTombstoneMarker.point(pointDt, TrieTombstoneMarker.Kind.ROW); + TrieTombstoneMarker deletion = TrieTombstoneMarker.covering(deletionDt, TrieTombstoneMarker.Kind.RANGE); + + TrieTombstoneMarker result = point.dropShadowed(deletion); + + if (pointDt.supersedes(deletionDt)) + { + // Point survives if it supersedes the deletion + assertNotNull(result); + assertEquals(TrieTombstoneMarker.Kind.ROW, result.applicableToPointForward().deletionKind()); + assertEquals(pointDt, result.applicableToPointForward()); + } + else + { + // Point is dropped if deletion supersedes it + assertNull(result); + } + }); + } + + // ========== Collection Merge Tests ========== + + @Test + public void testMergeCollection() + { + DeletionTime dt1 = DeletionTime.build(100, 50); + DeletionTime dt2 = DeletionTime.build(200, 60); + DeletionTime dt3 = DeletionTime.build(150, 55); + + List markers = Arrays.asList( + TrieTombstoneMarker.covering(dt1, TrieTombstoneMarker.Kind.RANGE), + TrieTombstoneMarker.covering(dt2, TrieTombstoneMarker.Kind.RANGE), + TrieTombstoneMarker.covering(dt3, TrieTombstoneMarker.Kind.RANGE) + ); + + TrieTombstoneMarker merged = TrieTombstoneMarker.merge(markers); + + assertNotNull(merged); + // Should have the highest deletion time + assertEquals(dt2, merged); + } + + @Test + public void testMergeEmptyCollection() + { + TrieTombstoneMarker merged = TrieTombstoneMarker.merge(Arrays.asList()); + 
assertNull(merged); + } + + // ========== Timestamp Update Tests ========== + + @Test + public void testCoveringWithUpdatedTimestamp() + { + DeletionTime original = DeletionTime.build(100, 50); + TrieTombstoneMarker marker = TrieTombstoneMarker.covering(original, TrieTombstoneMarker.Kind.RANGE); + + long newTimestamp = 200; + TrieTombstoneMarker updated = marker.withUpdatedTimestamp(newTimestamp); + + assertNotNull(updated); + assertEquals(newTimestamp, updated.applicableToPointForward().markedForDeleteAt()); + assertEquals(original.localDeletionTime(), updated.applicableToPointForward().localDeletionTime()); + } + + @Test + public void testPointWithUpdatedTimestamp() + { + DeletionTime pointDt = DeletionTime.build(150, 55); + TrieTombstoneMarker point = TrieTombstoneMarker.point(pointDt, TrieTombstoneMarker.Kind.ROW); + + long newTimestamp = 250; + TrieTombstoneMarker updated = point.withUpdatedTimestamp(newTimestamp); + + if (updated != null) + { + assertEquals(newTimestamp, updated.pointDeletion().markedForDeleteAt()); + assertEquals(pointDt.localDeletionTime(), updated.pointDeletion().localDeletionTime()); + } + } + + // ========== Map Function Tests ========== + + @Test + public void testCoveringMap() + { + DeletionTime original = DeletionTime.build(100, 50); + TrieTombstoneMarker marker = TrieTombstoneMarker.covering(original, TrieTombstoneMarker.Kind.RANGE); + + // Map to a higher timestamp + TrieTombstoneMarker mapped = marker.map(dt -> DeletionTime.build(dt.markedForDeleteAt() + 100, dt.localDeletionTime())); + + assertNotNull(mapped); + assertEquals(200, mapped.applicableToPointForward().markedForDeleteAt()); + } + + @Test + public void testCoveringMapToLive() + { + DeletionTime original = DeletionTime.build(100, 50); + TrieTombstoneMarker marker = TrieTombstoneMarker.covering(original, TrieTombstoneMarker.Kind.RANGE); + + // Map to LIVE + TrieTombstoneMarker mapped = marker.map(dt -> DeletionTime.LIVE); + + assertNull("Mapping to LIVE should return 
null", mapped); + } + + @Test + public void testPointMap() + { + DeletionTime pointDt = DeletionTime.build(150, 55); + TrieTombstoneMarker point = TrieTombstoneMarker.point(pointDt, TrieTombstoneMarker.Kind.ROW); + + // Map to a higher timestamp + TrieTombstoneMarker mapped = point.map(dt -> DeletionTime.build(dt.markedForDeleteAt() + 100, dt.localDeletionTime())); + + if (mapped != null) + { + assertNotNull(mapped.pointDeletion()); + assertEquals(250, mapped.pointDeletion().markedForDeleteAt()); + } + } + + // ========== Memory Size Tests ========== + + @Test + public void testCoveringMemorySize() + { + TrieTombstoneMarker covering = TrieTombstoneMarker.covering(DeletionTime.build(100, 50), TrieTombstoneMarker.Kind.RANGE); + long size = covering.unsharedHeapSize(); + + assertTrue("Covering marker should have positive heap size", size > 0); + } + + @Test + public void testBoundaryMemorySize() + { + DeletionTime left = DeletionTime.build(100, 50); + DeletionTime right = DeletionTime.build(200, 60); + + TrieTombstoneMarker.Covering leftCov = TrieTombstoneMarker.covering(left, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker.Covering rightCov = TrieTombstoneMarker.covering(right, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker boundary = TrieTombstoneMarker.make(leftCov, rightCov, null); + + long size = boundary.unsharedHeapSize(); + + assertTrue("Boundary marker should have positive heap size", size > 0); + assertTrue("Boundary should be larger than covering", size > + TrieTombstoneMarker.covering(left, TrieTombstoneMarker.Kind.RANGE).unsharedHeapSize()); + } + + @Test + public void testPointMemorySize() + { + TrieTombstoneMarker point = TrieTombstoneMarker.point(DeletionTime.build(150, 55), TrieTombstoneMarker.Kind.ROW); + long size = point.unsharedHeapSize(); + + assertTrue("Point marker should have positive heap size", size > 0); + } + + // ========== Property-Based Tests ========== + + @Test + public void testMergeIsCommutative() + { + 
qt().forAll(deletionTimeGen(), deletionTimeGen()) + .checkAssert((dt1, dt2) -> { + TrieTombstoneMarker m1 = TrieTombstoneMarker.covering(dt1, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker m2 = TrieTombstoneMarker.covering(dt2, TrieTombstoneMarker.Kind.RANGE); + + TrieTombstoneMarker merged1 = m1.mergeWith(m2); + TrieTombstoneMarker merged2 = m2.mergeWith(m1); + + assertEquals("Merge should be commutative", + merged1, merged2); + }); + } + + @Test + public void testMergeIsAssociative() + { + qt().forAll(deletionTimeGen(), deletionTimeGen(), deletionTimeGen()) + .checkAssert((dt1, dt2, dt3) -> { + TrieTombstoneMarker m1 = TrieTombstoneMarker.covering(dt1, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker m2 = TrieTombstoneMarker.covering(dt2, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker m3 = TrieTombstoneMarker.covering(dt3, TrieTombstoneMarker.Kind.RANGE); + + TrieTombstoneMarker merged1 = m1.mergeWith(m2).mergeWith(m3); + TrieTombstoneMarker merged2 = m1.mergeWith(m2.mergeWith(m3)); + + assertEquals("Merge should be associative", + merged1, merged2); + }); + } + + @Test + public void testDropShadowedIsIdempotent() + { + qt().forAll(deletionTimeGen(), deletionTimeGen()) + .checkAssert((markerDt, deletionDt) -> { + TrieTombstoneMarker marker = TrieTombstoneMarker.covering(markerDt, TrieTombstoneMarker.Kind.RANGE); + TrieTombstoneMarker deletion = TrieTombstoneMarker.covering(deletionDt, TrieTombstoneMarker.Kind.RANGE); + + TrieTombstoneMarker dropped1 = marker.dropShadowed(deletion); + TrieTombstoneMarker dropped2 = dropped1 != null ? 
dropped1.dropShadowed(deletion) : null; + + assertEquals("dropShadowed should be idempotent", dropped1, dropped2); + }); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/CellReuseTest.java b/test/unit/org/apache/cassandra/db/tries/CellReuseTest.java index 79620e079975..9d3c388315d3 100644 --- a/test/unit/org/apache/cassandra/db/tries/CellReuseTest.java +++ b/test/unit/org/apache/cassandra/db/tries/CellReuseTest.java @@ -26,25 +26,27 @@ import java.util.function.Function; import java.util.function.Predicate; -import com.google.common.collect.Streams; +import com.google.common.annotations.VisibleForTesting; import org.junit.Assert; import org.junit.Test; import org.agrona.collections.IntArrayList; +import org.agrona.concurrent.UnsafeBuffer; import org.apache.cassandra.io.compress.BufferType; import org.apache.cassandra.utils.ByteBufferUtil; -import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.concurrent.OpOrder; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.asString; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.assertMapEquals; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.byteComparableVersion; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.generateKeys; +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.apache.cassandra.db.tries.TrieUtil.asString; +import static org.apache.cassandra.db.tries.TrieUtil.assertMapEquals; +import static org.apache.cassandra.db.tries.TrieUtil.generateKeys; public class CellReuseTest { + private static final boolean VERBOSE = false; static Predicate> FORCE_COPY_PARTITION = features -> { var c = features.content(); if (c != null && c instanceof Boolean) @@ -59,28 +61,53 @@ public class CellReuseTest Random rand = new 
Random(2); @Test - public void testCellReusePartitionCopying() throws Exception + public void testCellReuseBytesPartitionCopying() throws Exception { - testCellReuse(FORCE_COPY_PARTITION); + testCellReuseBytes(FORCE_COPY_PARTITION); } @Test - public void testCellReuseNoCopying() throws Exception + public void testCellReuseBytesNoCopying() throws Exception { - testCellReuse(NO_ATOMICITY); + testCellReuseBytes(NO_ATOMICITY); } - public void testCellReuse(Predicate> forceCopyPredicate) throws Exception + public void testCellReuseBytes(Predicate> forceCopyPredicate) throws Exception { ByteComparable[] src = generateKeys(rand, COUNT); - InMemoryTrie trieLong = makeInMemoryTrie(src, opOrder -> InMemoryTrie.longLived(byteComparableVersion, BufferType.ON_HEAP, opOrder), + InMemoryTrie trieLong = makeInMemoryTrie(src, opOrder -> InMemoryTrie.longLived(VERSION, BufferType.ON_HEAP, opOrder, new TestContentSerializer()), forceCopyPredicate); + verifyFreeCellsMatchUnreachable(trieLong); + } + + @Test + public void testCellReusePojoPartitionCopying() throws Exception + { + testCellReusePojo(FORCE_COPY_PARTITION); + } + + @Test + public void testCellReusePojoNoCopying() throws Exception + { + testCellReusePojo(NO_ATOMICITY); + } + + public void testCellReusePojo(Predicate> forceCopyPredicate) throws Exception + { + ByteComparable[] src = generateKeys(rand, COUNT); + InMemoryTrie trieLong = makeInMemoryTrie(src, opOrder -> InMemoryTrie.longLived(VERSION, BufferType.ON_HEAP, opOrder), + forceCopyPredicate); + + verifyFreeCellsMatchUnreachable(trieLong); + } + + public static void verifyFreeCellsMatchUnreachable(InMemoryBaseTrie trieLong) + { // dump some information first - System.out.println(String.format(" LongLived ON_HEAP sizes %10s %10s count %d", + System.out.println(String.format(" LongLived ON_HEAP sizes %10s %10s", FBUtilities.prettyPrintMemory(trieLong.usedSizeOnHeap()), - FBUtilities.prettyPrintMemory(trieLong.usedSizeOffHeap()), - 
Streams.stream(trieLong.values()).count())); + FBUtilities.prettyPrintMemory(trieLong.usedSizeOffHeap()))); Pair longReachable = reachableCells(trieLong); BitSet reachable = longReachable.left; @@ -93,7 +120,8 @@ public void testCellReuse(Predicate> forceCopy lrobjs * 4 )); - IntArrayList availableList = ((MemoryAllocationStrategy.OpOrderReuseStrategy) trieLong.cellAllocator).indexesInPipeline(); + BufferManagerMultibuf mgr = ((BufferManagerMultibuf) trieLong.bufferManager); + IntArrayList availableList = (mgr.cellAllocator).indexesInPipeline(); BitSet available = new BitSet(reachable.size()); for (int v : availableList) available.set(v >> 5); @@ -107,7 +135,7 @@ public void testCellReuse(Predicate> forceCopy // Check all unreachable cells are marked for reuse BitSet unreachable = new BitSet(reachable.size()); unreachable.or(reachable); - unreachable.flip(0, trieLong.getAllocatedPos() >> 5); + unreachable.flip(0, mgr.getAllocatedPos() >> 5); unreachable.andNot(available); assertCellSetEmpty(unreachable, trieLong, " unreachable cells not marked as available"); } @@ -121,8 +149,8 @@ public void testAbortedMutation() throws Exception { ByteComparable[] src = generateKeys(rand, COUNT); OpOrder order = new OpOrder(); - InMemoryTrie trie = InMemoryTrie.longLived(byteComparableVersion, order); - InMemoryTrie check = InMemoryTrie.shortLived(byteComparableVersion); + InMemoryTrie trie = InMemoryTrie.longLived(VERSION, order); + InMemoryTrie check = InMemoryTrie.shortLived(VERSION); int step = Math.min(100, COUNT / 100); int throwStep = (COUNT + 10) / 5; // do 4 throwing inserts int nextThrow = throwStep; @@ -155,14 +183,14 @@ public void testAbortedMutation() throws Exception check.filteredEntrySet(ByteBuffer.class).iterator()); } - private void assertCellSetEmpty(BitSet set, InMemoryTrie trie, String message) + public static void assertCellSetEmpty(BitSet set, InMemoryBaseTrie trie, String message) { if (set.isEmpty()) return; for (int i = set.nextSetBit(0); i >= 0; i = 
set.nextSetBit(i + 1)) { - System.out.println(String.format("Cell at %d: %08x %08x %08x %08x %08x %08x %08x %08x", + System.out.println(String.format("Cell at %08x: %08x %08x %08x %08x %08x %08x %08x %08x", (i << 5), trie.getIntVolatile((i << 5) + 0), trie.getIntVolatile((i << 5) + 4), @@ -178,19 +206,21 @@ private void assertCellSetEmpty(BitSet set, InMemoryTrie trie, String message Assert.fail(set.cardinality() + message); } - private Pair reachableCells(InMemoryTrie trie) + public static Pair reachableCells(InMemoryBaseTrie trie) { -// System.out.println(trie.dump()); + if (VERBOSE) + System.out.println(trie.dump(Object::toString)); BitSet set = new BitSet(); BitSet objs = new BitSet(); mark(trie, trie.root, set, objs); return Pair.create(set, objs); } - private void mark(InMemoryTrie trie, int node, BitSet set, BitSet objs) + private static void mark(InMemoryBaseTrie trie, int node, BitSet set, BitSet objs) { set.set(node >> 5); -// System.out.println(trie.dumpNode(node)); + if (VERBOSE) + System.out.println(trie.dumpNode(node)); switch (trie.offset(node)) { case InMemoryTrie.SPLIT_OFFSET: @@ -199,14 +229,16 @@ private void mark(InMemoryTrie trie, int node, BitSet set, BitSet objs) int mid = trie.getSplitCellPointer(node, i, InMemoryTrie.SPLIT_START_LEVEL_LIMIT); if (mid != InMemoryTrie.NONE) { -// System.out.println(trie.dumpNode(mid)); + if (VERBOSE) + System.out.println(trie.dumpNode(mid)); set.set(mid >> 5); for (int j = 0; j < InMemoryTrie.SPLIT_OTHER_LEVEL_LIMIT; ++j) { int tail = trie.getSplitCellPointer(mid, j, InMemoryTrie.SPLIT_OTHER_LEVEL_LIMIT); if (tail != InMemoryTrie.NONE) { -// System.out.println(trie.dumpNode(tail)); + if (VERBOSE) + System.out.println(trie.dumpNode(tail)); set.set(tail >> 5); for (int k = 0; k < InMemoryTrie.SPLIT_OTHER_LEVEL_LIMIT; ++k) markChild(trie, trie.getSplitCellPointer(tail, k, InMemoryTrie.SPLIT_OTHER_LEVEL_LIMIT), set, objs); @@ -220,13 +252,9 @@ private void mark(InMemoryTrie trie, int node, BitSet set, BitSet 
objs) markChild(trie, trie.getIntVolatile(node + InMemoryTrie.SPARSE_CHILDREN_OFFSET + i * 4), set, objs); break; case InMemoryTrie.PREFIX_OFFSET: - int content = trie.getIntVolatile(node + InMemoryTrie.PREFIX_CONTENT_OFFSET); - if (content < 0) - objs.set(~content); - else - markChild(trie, content, set, objs); - - markChild(trie, trie.followContentTransition(node), set, objs); + markPrefixContent(trie, node + InMemoryTrie.PREFIX_CONTENT_OFFSET, set, objs); + markPrefixContent(trie, node + InMemoryTrie.PREFIX_ALTERNATE_OFFSET, set, objs); + markChild(trie, trie.followPrefixTransition(node), set, objs); break; default: assert trie.offset(node) <= InMemoryTrie.CHAIN_MAX_OFFSET && trie.offset(node) >= InMemoryTrie.CHAIN_MIN_OFFSET; @@ -235,19 +263,35 @@ private void mark(InMemoryTrie trie, int node, BitSet set, BitSet objs) } } - private void markChild(InMemoryTrie trie, int child, BitSet set, BitSet objs) + private static void markPrefixContent(InMemoryBaseTrie trie, int pointerAddress, BitSet set, BitSet objs) { - if (child == InMemoryTrie.NONE) - return; - if (child > 0) + int content = trie.getIntVolatile(pointerAddress); + markChild(trie, content, set, objs); + } + + private static void markChild(InMemoryBaseTrie trie, int child, BitSet set, BitSet objs) + { + if (!InMemoryTrie.isNullOrLeaf(child)) mark(trie, child, set, objs); - else - objs.set(~child); + + if (InMemoryTrie.isLeaf(child)) + { + int cell = trie.contentManager.cellUsedIfAny(child); + if (cell < 0) + objs.set(~cell); + else + { + set.set(cell >> 5); + if (VERBOSE) + System.out.println(trie.dumpNode(child)); + } + } } static InMemoryTrie makeInMemoryTrie(ByteComparable[] src, Function> creator, - Predicate> forceCopyPredicate) throws TrieSpaceExhaustedException + Predicate> forceCopyPredicate) + throws TrieSpaceExhaustedException { OpOrder order = new OpOrder(); InMemoryTrie trie = creator.apply(order); @@ -271,8 +315,8 @@ static void addToInMemoryTrie(ByteComparable[] src, // (so that all sources 
have the same value). int payload = asString(b).hashCode(); ByteBuffer v = ByteBufferUtil.bytes(payload); - Trie update = Trie.singleton(b, byteComparableVersion, v); - update = InMemoryTrieThreadedTest.withRootMetadata(update, Boolean.TRUE); + Trie update = Trie.singleton(b, VERSION, v); + update = TrieUtil.withRootMetadata(update, Boolean.TRUE); update = update.prefixedBy(source("prefix")); applyUpdating(trie, update, forceCopyPredicate); } @@ -280,7 +324,7 @@ static void addToInMemoryTrie(ByteComparable[] src, static ByteComparable source(String key) { - return ByteComparable.preencoded(byteComparableVersion, key.getBytes(StandardCharsets.UTF_8)); + return ByteComparable.preencoded(VERSION, key.getBytes(StandardCharsets.UTF_8)); } static void addThrowingEntry(ByteComparable b, @@ -289,14 +333,14 @@ static void addThrowingEntry(ByteComparable b, { int payload = asString(b).hashCode(); ByteBuffer v = ByteBufferUtil.bytes(payload); - Trie update = Trie.singleton(b, byteComparableVersion, v); + Trie update = Trie.singleton(b, VERSION, v); // Create an update with two metadata entries, so that the lower is already a copied node. // Abort processing on the lower metadata, where the new branch is not attached yet (so as not to affect the // contents). - update = InMemoryTrieThreadedTest.withRootMetadata(update, Boolean.FALSE); + update = TrieUtil.withRootMetadata(update, Boolean.FALSE); update = update.prefixedBy(source("fix")); - update = InMemoryTrieThreadedTest.withRootMetadata(update, Boolean.TRUE); + update = TrieUtil.withRootMetadata(update, Boolean.TRUE); update = update.prefixedBy(source("pre")); trie.apply(update, @@ -321,4 +365,142 @@ public static void applyUpdating(InMemoryTrie trie, { trie.apply(mutation, (x, y) -> y, needsForcedCopy); } + + class TestContentSerializer implements ContentSerializer + { + + @Override + public int idIfSpecial(Object content, boolean shouldPresentAfterBranch) + { + return content == Boolean.TRUE ? 
0 : -1; + } + + @Override + public int serialize(Object content, boolean shouldPresentAfterBranch, UnsafeBuffer buffer, int offset) throws TrieSpaceExhaustedException + { + ByteBuffer buf = (ByteBuffer) content; + buffer.putInt(offset, buf.remaining()); // 4-byte length prefix, payload follows at offset + 4 + buffer.putBytes(offset + 4, buf, buf.position(), buf.remaining()); + return 0; + } + + @Override + public Object special(int id) + { + return Boolean.TRUE; + } + + @Override + public Object deserialize(UnsafeBuffer buffer, int inBufferPos, int offsetBits) + { + int length = buffer.getInt(inBufferPos); + ByteBuffer buf = ByteBuffer.allocate(length); + buffer.getBytes(inBufferPos + 4, buf, 0, length); // skip the length prefix written by serialize; absolute copy keeps buf.position() at 0 so the result is readable + return buf; + } + + @Override + public void releaseSpecial(int id) + { + + } + + @Override + public boolean releaseNeeded(int offset) + { + return false; + } + + @Override + public void release(UnsafeBuffer buffer, int inBufferPos, int offsetBits) + { + throw new AssertionError("Should not be called"); + } + + @Override + public boolean shouldPreserveSpecialWithoutChildren(int id) + { + return false; + } + + @Override + public boolean shouldPreserveWithoutChildren(int offset) + { + return true; + } + + @Override + public boolean shouldPreserveWithoutChildren(UnsafeBuffer buffer, int inBufferPos, int offsetBits) + { + throw new AssertionError("Should not be called"); + } + + @Override + public boolean shouldPresentSpecialAfterBranch(int id) + { + return false; + } + + @Override + public boolean shouldPresentAfterBranch(int offsetBits) + { + return false; + } + + @VisibleForTesting + @Override + public void releaseReferencesUnsafe() + { + + } + + @Override + public String dumpSpecial(int id) + { + return "PARTITION"; + } + + @Override + public String dumpContent(UnsafeBuffer buffer, int inBufferPos, int offsetBits) + { + return ByteBufferUtil.bytesToHex((ByteBuffer) deserialize(buffer, inBufferPos, offsetBits)); + } + + @Override + public int updateInPlace(UnsafeBuffer buffer, int inBufferPos, int offsetBits, Object
newContent) throws TrieSpaceExhaustedException + { + return serialize(newContent, false, buffer, inBufferPos); + } + + @Override + public void completeMutation() + { + + } + + @Override + public void abortMutation() + { + + } + + @Override + public long usedSizeOffHeap() + { + return 0; + } + + @Override + public long usedSizeOnHeap() + { + return 0; + } + + @VisibleForTesting + @Override + public long unusedReservedOnHeapMemory() + { + return 0; + } + } } diff --git a/test/unit/org/apache/cassandra/db/tries/CollectionMergeTrieTest.java b/test/unit/org/apache/cassandra/db/tries/CollectionMergeTrieTest.java index df6f28c6e19a..62e896f67c3b 100644 --- a/test/unit/org/apache/cassandra/db/tries/CollectionMergeTrieTest.java +++ b/test/unit/org/apache/cassandra/db/tries/CollectionMergeTrieTest.java @@ -20,79 +20,114 @@ import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Collection; import java.util.List; import java.util.Random; import java.util.SortedMap; import java.util.TreeMap; import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; import org.junit.Assert; +import org.junit.BeforeClass; import org.junit.Test; -import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.config.CassandraRelevantProperties; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.*; +import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.addToInMemoryTrie; +import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.makeInMemoryTrie; import static org.apache.cassandra.db.tries.MergeTrieTest.removeDuplicates; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.Preencoded; public class CollectionMergeTrieTest { + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + private static final int COUNT = 15000; private static final Random rand = new Random(); @Test public void testDirect() 
{ - ByteComparable[] src1 = generateKeys(rand, COUNT); - ByteComparable[] src2 = generateKeys(rand, COUNT); - SortedMap content1 = new TreeMap<>(forwardComparator); - SortedMap content2 = new TreeMap<>(forwardComparator); + Preencoded[] src1 = TrieUtil.generateKeys(rand, COUNT); + Preencoded[] src2 = TrieUtil.generateKeys(rand, COUNT); + SortedMap content1 = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); + SortedMap content2 = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); InMemoryTrie trie1 = makeInMemoryTrie(src1, content1, true); InMemoryTrie trie2 = makeInMemoryTrie(src2, content2, true); content1.putAll(content2); // construct directly, trie.merge() will defer to mergeWith on two sources - Trie union = new CollectionMergeTrie<>(ImmutableList.of(trie1, trie2), x -> x.iterator().next()); + Trie union = makeCollectionMergeTrie(trie1, trie2); - assertSameContent(union, content1); + TrieUtil.assertSameContent(union, content1); } @Test public void testWithDuplicates() { - ByteComparable[] src1 = generateKeys(rand, COUNT); - ByteComparable[] src2 = generateKeys(rand, COUNT); - SortedMap content1 = new TreeMap<>(forwardComparator); - SortedMap content2 = new TreeMap<>(forwardComparator); + Preencoded[] src1 = TrieUtil.generateKeys(rand, COUNT); + Preencoded[] src2 = TrieUtil.generateKeys(rand, COUNT); + SortedMap content1 = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); + SortedMap content2 = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); InMemoryTrie trie1 = makeInMemoryTrie(src1, content1, true); InMemoryTrie trie2 = makeInMemoryTrie(src2, content2, true); - addToInMemoryTrie(generateKeys(new Random(5), COUNT), content1, trie1, true); - addToInMemoryTrie(generateKeys(new Random(5), COUNT), content2, trie2, true); + addToInMemoryTrie(TrieUtil.generateKeys(new Random(5), COUNT), content1, trie1, true); + addToInMemoryTrie(TrieUtil.generateKeys(new Random(5), COUNT), content2, trie2, true); content1.putAll(content2); - Trie union = new CollectionMergeTrie<>(ImmutableList.of(trie1, 
trie2), x -> x.iterator().next()); + Trie union = makeCollectionMergeTrie(trie1, trie2); + + TrieUtil.assertSameContent(union, content1); + } - assertSameContent(union, content1); + private static Trie makeCollectionMergeTrie(InMemoryTrie... tries) + { + return dir -> new CollectionMergeCursor.Plain<>(x -> x.iterator().next(), dir, List.of(tries), Trie::cursor); } @Test public void testDistinct() { - ByteComparable[] src1 = generateKeys(rand, COUNT); - SortedMap content1 = new TreeMap<>(forwardComparator); + Preencoded[] src1 = TrieUtil.generateKeys(rand, COUNT); + SortedMap content1 = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); InMemoryTrie trie1 = makeInMemoryTrie(src1, content1, true); - ByteComparable[] src2 = generateKeys(rand, COUNT); + Preencoded[] src2 = TrieUtil.generateKeys(rand, COUNT); src2 = removeDuplicates(src2, content1); - SortedMap content2 = new TreeMap<>(forwardComparator); + SortedMap content2 = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); InMemoryTrie trie2 = makeInMemoryTrie(src2, content2, true); content1.putAll(content2); - Trie union = new CollectionMergeTrie.Distinct<>(ImmutableList.of(trie1, trie2)); + Trie union = mergeDistinctTrie(ImmutableList.of(trie1, trie2)); - assertSameContent(union, content1); + TrieUtil.assertSameContent(union, content1); + } + + private static Trie mergeDistinctTrie(Collection> sources) + { + // This duplicates the code in the private Trie.mergeDistinctTrie + return new Trie() + { + @Override + public Cursor makeCursor(Direction direction) + { + return new CollectionMergeCursor.Plain<>(Trie.throwingResolver(), direction, sources, Trie::cursor); + } + + @Override + public Iterable valuesUnordered() + { + return Iterables.concat(Iterables.transform(sources, Trie::valuesUnordered)); + } + }; } @Test @@ -137,38 +172,38 @@ public void testMultiple(int mergeCount, int count) public void testMultipleDistinct(int mergeCount, int count) { List> tries = new ArrayList<>(mergeCount); - SortedMap content = new 
TreeMap<>(forwardComparator); + SortedMap content = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); for (int i = 0; i < mergeCount; ++i) { - ByteComparable[] src = removeDuplicates(generateKeys(rand, count), content); + Preencoded[] src = removeDuplicates(TrieUtil.generateKeys(rand, count), content); Trie trie = makeInMemoryTrie(src, content, true); tries.add(trie); } Trie union = Trie.mergeDistinct(tries); - assertSameContent(union, content); + TrieUtil.assertSameContent(union, content); } public void testMultipleWithDuplicates(int mergeCount, int count) { List> tries = new ArrayList<>(mergeCount); - SortedMap content = new TreeMap<>(forwardComparator); + SortedMap content = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); for (int i = 0; i < mergeCount; ++i) { - ByteComparable[] src = generateKeys(rand, count); + Preencoded[] src = TrieUtil.generateKeys(rand, count); Trie trie = makeInMemoryTrie(src, content, true); tries.add(trie); } Trie union = Trie.merge(tries, x -> x.iterator().next()); - assertSameContent(union, content); + TrieUtil.assertSameContent(union, content); try { union = Trie.mergeDistinct(tries); - assertSameContent(union, content); + TrieUtil.assertSameContent(union, content); Assert.fail("Expected assertion error for duplicate keys."); } catch (AssertionError e) @@ -176,15 +211,4 @@ public void testMultipleWithDuplicates(int mergeCount, int count) // correct path } } - - private int randomButNot(Random rand, int bound, int avoid) - { - int r; - do - { - r = rand.nextInt(bound); - } - while (r == avoid); - return r; - } } diff --git a/test/unit/org/apache/cassandra/db/tries/CombinedDataPoint.java b/test/unit/org/apache/cassandra/db/tries/CombinedDataPoint.java new file mode 100644 index 000000000000..42732f046f63 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/CombinedDataPoint.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.Objects; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +class CombinedDataPoint implements DataPoint +{ + final LivePoint livePoint; + final DeletionMarker marker; + + public CombinedDataPoint(LivePoint livePoint, DeletionMarker marker) + { + this.livePoint = livePoint; + this.marker = marker; + } + + @Override + public DeletionMarker marker() + { + return marker; + } + + @Override + public LivePoint live() + { + return livePoint; + } + + @Override + public ByteComparable position() + { + return livePoint.position(); + } + + @Override + public DataPoint withMarker(DeletionMarker newMarker) + { + if (newMarker == null) + return livePoint; + else + return new CombinedDataPoint(livePoint, newMarker); + } + + @Override + public DataPoint remap(ByteComparable newKey) + { + return new CombinedDataPoint(livePoint.remap(newKey), marker.remap(newKey)); + } + + @Override + public String toString() + { + return marker.toString() + 'L' + livePoint.timestamp; + } + + public DataPoint toContent() + { + if (marker.isBoundary()) + return this; + return livePoint; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return 
false; + CombinedDataPoint that = (CombinedDataPoint) o; + return Objects.equals(livePoint, that.livePoint) && Objects.equals(marker, that.marker); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/ConsistencyTestBase.java b/test/unit/org/apache/cassandra/db/tries/ConsistencyTestBase.java new file mode 100644 index 000000000000..c8480b2c6343 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/ConsistencyTestBase.java @@ -0,0 +1,715 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Consumer; +import java.util.function.LongUnaryOperator; +import java.util.function.Predicate; + +import com.google.common.collect.Iterables; +import org.junit.Assert; +import org.junit.Test; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.concurrent.OpOrder; + +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.apache.cassandra.db.tries.TrieUtil.generateKeys; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.Preencoded; +import static org.junit.Assert.assertTrue; + +public abstract class ConsistencyTestBase, R extends BaseTrie> +{ + // Note: This should not be run by default with verification to have the higher concurrency of faster writes and reads. + + private static final int COUNT = 30000; + private static final int PROGRESS_UPDATE = Math.max(1, COUNT / 15); + private static final int READERS = 8; + private static final int WALKERS = 2; + private static final Random rand = new Random(); + + /** + * Force copy every modified cell below the partition/enumeration level. Provides atomicity of mutations within the + * partition level as well as consistency. + */ + public final Predicate> FORCE_COPY_PARTITION = features -> isPartition(features.content()); + /** + * Force copy every modified cell below the partition/enumeration level. Provides atomicity of mutations within the + * partition level as well as consistency. 
+ */ + public final Predicate> FORCE_COPY_PARTITION_RANGE_STATE = features -> isPartition(features.content()); + /** + * Force copy every modified cell below the earliest branching point. Provides atomicity of mutations at any level, + * but readers/walkers may see inconsistent views of the data, in the sense that older mutations may be missed + * while newer ones are returned. + */ + public final static Predicate> FORCE_ATOMIC = InMemoryBaseTrie.NodeFeatures::isBranching; + @SuppressWarnings("unchecked") + public static Predicate> forceAtomic() + { + return (Predicate>) (Predicate) FORCE_ATOMIC; + } + /** + * Do not do any additional copying beyond what is required to build the tries safely for concurrent readers. + * Mutations may be partially seen by readers, and older mutations may be missed while newer ones are returned. + */ + public final static Predicate> NO_ATOMICITY = features -> false; + @SuppressWarnings("unchecked") + public static Predicate> noAtomicity() + { + return (Predicate>) (Predicate) NO_ATOMICITY; + } + + abstract R makeTrie(OpOrder readOrder); + + abstract C value(ByteComparable b, ByteComparable cprefix, ByteComparable c, int add, int seqId); + + abstract C metadata(ByteComparable b); + + abstract String pk(C c); + + abstract String ck(C c); + + abstract int seq(C c); + + abstract int value(C c); + + abstract int updateCount(C c); + + abstract T makeSingleton(ByteComparable b, C content); + + abstract T withRootMetadata(T wrapped, C metadata); + + abstract T merge(Collection tries, Trie.CollectionMergeResolver mergeResolver); + + abstract void apply(R trie, + T mutation, + InMemoryBaseTrie.UpsertTransformer mergeResolver, + Predicate> forcedCopyChecker, + Predicate> forcedCopyCheckerRanges) throws TrieSpaceExhaustedException; + + abstract void delete(R trie, + ByteComparable deletionPrefix, + TestRangeState partitionMarker, + RangeTrie deletion, + InMemoryBaseTrie.UpsertTransformer mergeResolver, + Predicate> forcedCopyChecker, + Predicate> 
forcedCopyCheckerRanges) throws TrieSpaceExhaustedException; + + abstract boolean isPartition(C c); + boolean isPartition(TestRangeState c) + { + if (!(c instanceof TestStateMetadata)) + return false; + return isPartition(((TestStateMetadata) c).metadata); + } + + abstract C mergeMetadata(C c1, C c2); + abstract C deleteMetadata(C existing, int entriesCount); + + // To be overridden by deletion branch testing. + Iterable> getEntrySet(BaseTrie trie) + { + return trie.entrySet(); + } + + + abstract void printStats(R trie, Predicate> forcedCopyChecker); + + @Test + public void testConsistentUpdates() throws Exception + { + // Check that multi-path updates with below-partition-level copying are safe for concurrent readers, + // and that content is atomically applied, i.e. that readers see either nothing from the update or all of it, + // and consistent, i.e. that it is not possible to receive some newer updates while missing + // older ones. (For example, if the sequence of additions is 3, 1, 5, without this requirement a reader + // could see an enumeration which lists 3 and 5 but not 1.) + testUpdateConsistency(3, FORCE_COPY_PARTITION, FORCE_COPY_PARTITION_RANGE_STATE, true, true); + // Note: using 3 per mutation, so that the first and second update fit in a sparse in-memory trie block. + } + + @Test + public void testAtomicUpdates() throws Exception + { + // Check that multi-path updates with below-branching-point copying are safe for concurrent readers, + // and that content is atomically applied, i.e. that readers see either nothing from the update or all of it. + testUpdateConsistency(3, forceAtomic(), forceAtomic(), true, false); + } + + @Test + public void testSafeUpdates() throws Exception + { + // Check that multi path updates without additional copying are safe for concurrent readers. 
+ testUpdateConsistency(3, noAtomicity(), noAtomicity(), false, false); + } + + @Test + public void testConsistentSinglePathUpdates() throws Exception + { + // Check that single path updates with below-partition-level copying are safe for concurrent readers, + // and that content is consistent, i.e. that it is not possible to receive some newer updates while missing + // older ones. (For example, if the sequence of additions is 3, 1, 5, without this requirement a reader + // could see an enumeration which lists 3 and 5 but not 1.) + testUpdateConsistency(1, FORCE_COPY_PARTITION, FORCE_COPY_PARTITION_RANGE_STATE, true, true); + } + + + @Test + public void testAtomicSinglePathUpdates() throws Exception + { + // When doing single path updates atomicity comes for free. This only checks that the branching checker is + // not doing anything funny. + testUpdateConsistency(1, forceAtomic(), forceAtomic(), true, false); + } + + @Test + public void testSafeSinglePathUpdates() throws Exception + { + // Check that single path updates without additional copying are safe for concurrent readers. + testUpdateConsistency(1, noAtomicity(), noAtomicity(), true, false); + } + + // The generated keys all start with NEXT_COMPONENT, which makes it impossible to test the precise behavior of the + // partition-level force copying. Strip that byte. 
+ private static ByteComparable[] skipFirst(ByteComparable[] keys) + { + ByteComparable[] result = new ByteComparable[keys.length]; + for (int i = 0; i < keys.length; ++i) + result[i] = skipFirst(keys[i]); + return result; + } + + private static ByteComparable skipFirst(ByteComparable key) + { + return v -> { + var bs = key.asComparableBytes(v); + int n = bs.next(); + assert n != ByteSource.END_OF_STREAM; + return bs; + }; + } + + private static ByteComparable swapTerminator(ByteComparable key, int newTerminator) + { + byte[] bytes = key.asByteComparableArray(VERSION); + bytes[bytes.length - 1] = (byte) newTerminator; + return ByteComparable.preencoded(VERSION, bytes); + } + + static class ThreadWithProgressAck extends Thread + { + final int threadId; + final LongUnaryOperator ackWriteProgress; + final Consumer runnable; + + ThreadWithProgressAck(AtomicInteger threadIdx, Consumer runnable) + { + threadId = threadIdx.getAndIncrement(); + ackWriteProgress = x -> x | (1<> forcedCopyChecker, + Predicate> forcedCopyCheckerRanges, + boolean checkAtomicity, + boolean checkSequence) + throws Exception + { + long seed = rand.nextLong(); + System.out.println("Seed: " + seed); + rand.setSeed(seed); + + ByteComparable[] ckeys = skipFirst(generateKeys(rand, COUNT)); + ByteComparable[] pkeys = skipFirst(generateKeys(rand, Math.min(100, COUNT / 10))); // to guarantee repetition + + /* + * Adds COUNT partitions each with perPartition separate clusterings, where the sum of the values + * of all clusterings is 0. + * If the sum for any walk covering whole partitions is non-zero, we have had non-atomic updates. 
+ */ + + OpOrder readOrder = new OpOrder(); + R trie = makeTrie(readOrder); + ConcurrentLinkedQueue errors = new ConcurrentLinkedQueue<>(); + List threads = new ArrayList<>(); + AtomicBoolean writeCompleted = new AtomicBoolean(false); + AtomicInteger writeProgress = new AtomicInteger(0); + AtomicLong writeProgressAck = new AtomicLong(0); + AtomicInteger threadIdx = new AtomicInteger(0); + + Consumer walkTrie = ackWriteProgress -> + { + while (!writeCompleted.get()) + { + try + { + writeProgressAck.getAndUpdate(ackWriteProgress); + int min = writeProgress.get(); + try (OpOrder.Group group = readOrder.start()) + { + Iterable> entries = getEntrySet(trie); + checkEntries("", min, true, checkAtomicity, false, PER_MUTATION, entries); + } + } + catch (Throwable t) + { + t.printStackTrace(); + errors.add(t); + } + } + }; + + Consumer readTrie = ackWriteProgress -> + { + Random r = ThreadLocalRandom.current(); + while (!writeCompleted.get()) + { + try + { + { + writeProgressAck.getAndUpdate(ackWriteProgress); + ByteComparable key = pkeys[r.nextInt(pkeys.length)]; + int min = writeProgress.get() / (pkeys.length * PER_MUTATION) * PER_MUTATION; + Iterable> entries; + + try (OpOrder.Group group = readOrder.start()) + { + var tail = trie.tailTrie(key); + if (tail != null) + { + entries = getEntrySet(tail); + checkEntries(" in tail " + key.byteComparableAsString(VERSION), min, false, checkAtomicity, checkSequence, PER_MUTATION, entries); + } + else + Assert.assertEquals("Trie key not found when there should be data for it", 0, min); + } + + try (OpOrder.Group group = readOrder.start()) + { + entries = getEntrySet(trie.subtrie(key, key)); + checkEntries(" in branch " + key.byteComparableAsString(VERSION), min, true, checkAtomicity, checkSequence, PER_MUTATION, entries); + } + } + } + catch (Throwable t) + { + t.printStackTrace(); + errors.add(t); + } + } + }; + + for (int i = 0; i < WALKERS; ++i) + threads.add(new ThreadWithProgressAck(threadIdx, walkTrie)); + + for (int i = 0; i 
< READERS; ++i) + threads.add(new ThreadWithProgressAck(threadIdx, readTrie)); + + byte[] choices = new byte[COUNT / PER_MUTATION]; + rand.nextBytes(choices); + threads.add(new Thread() + { + public void run() + { + final Trie.CollectionMergeResolver mergeResolver = new Trie.CollectionMergeResolver<>() + { + @Override + public C resolve(C c1, C c2) + { + if (isPartition(c1) && isPartition(c2)) + return mergeMetadata(c1, c2); + throw new AssertionError("Test error, keys should be distinct."); + } + + public C resolve(Collection contents) + { + return contents.stream().reduce(this::resolve).get(); + } + }; + + try + { + // Insert the data. + int lastUpdate = 0; + for (int i = 0; i < COUNT; i += PER_MUTATION) + { + ByteComparable b = pkeys[(i / PER_MUTATION) % pkeys.length]; + C partitionMarker = metadata(b); + ByteComparable cprefix = null; + if ((choices[i / PER_MUTATION] & 1) == 1) + cprefix = ckeys[i]; // Also test branching point below the partition level + + List sources = new ArrayList<>(); + for (int j = 0; j < PER_MUTATION; ++j) + { + + ByteComparable k = ckeys[i + j]; + T row = makeSingleton(k, + value(b, cprefix, k, + j == 0 ? -PER_MUTATION + 1 : 1, + (i / PER_MUTATION / pkeys.length) * PER_MUTATION + j)); + + if (cprefix != null) + row = row.prefixedBy(cprefix); + + row = withRootMetadata(row, partitionMarker); + row = row.prefixedBy(b); + sources.add(row); + } + + final T mutation = merge(sources, mergeResolver); + + apply(trie, mutation, + (existing, update) -> existing == null ? update : mergeResolver.resolve(existing, update), + forcedCopyChecker, forcedCopyCheckerRanges); + + if (i >= pkeys.length * PER_MUTATION && i - lastUpdate >= PROGRESS_UPDATE) + { + writeProgress.set(i); + lastUpdate = i; + } + } + + writeProgress.set(COUNT); + printStats(trie, forcedCopyChecker); + Thread.sleep(100); // Let the threads check the completed state too. 
+ + // Make sure we can read everything we have inserted from this thread (if this fails, the problem + // is not concurrency). + try (OpOrder.Group group = readOrder.start()) + { + Iterable> entries = getEntrySet(trie); + checkEntries("", COUNT, true, checkAtomicity, false, PER_MUTATION, entries); + } + + InMemoryTrie.UpsertTransformer deleteResolver = (existing, update) -> + { + if (update instanceof TestStateMetadata) + { + assert isPartition(existing); + return deleteMetadata(existing, PER_MUTATION); + } + return null; + }; + + // Now delete the data in the reverse order of the insertion to satisfy the same constraints. + for (int i = COUNT - PER_MUTATION; i >= 0; i -= PER_MUTATION) + { + if (i < writeProgress.get()) + { + // Reduce the writeProgress so that we can start deleting a batch. + writeProgress.set(writeProgress.get() - PROGRESS_UPDATE); + // Wait until all reader threads have completed the current pass. + writeProgressAck.set(0); + while (writeProgressAck.get() + 1 < 1 << threadIdx.get()) + Thread.yield(); + } + + ByteComparable b = pkeys[(i / PER_MUTATION) % pkeys.length]; + TestRangeState partitionMarker = new TestStateMetadata<>(metadata(b)); + List> ranges = new ArrayList<>(); + ByteComparable cprefix = null; + if ((choices[i / PER_MUTATION] & 3) == 3) + { + // Delete the whole branch in one range + ranges.add(makeRangeCovering(ckeys[i])); + } + else + { + // A range for each entry + if ((choices[i / PER_MUTATION] & 1) == 1) + cprefix = ckeys[i]; + for (int j = 0; j < PER_MUTATION; ++j) + ranges.add(makeRangeCovering(ckeys[i + j])); + } + + RangeTrie deletion = RangeTrie.merge(ranges, Trie.throwingResolver()); + if (cprefix != null) + deletion = deletion.prefixedBy(cprefix); + + delete(trie, b, partitionMarker, deletion, deleteResolver, forcedCopyChecker, forcedCopyCheckerRanges); + } + + writeProgress.set(0); + } + catch (Throwable t) + { + t.printStackTrace(); + errors.add(t); + } + finally + { + writeCompleted.set(true); + } + } + }); + + for 
(Thread t : threads) + t.start(); + + for (Thread t : threads) + t.join(); + + printStats(trie, forcedCopyChecker); + + Assert.assertEquals("Writer did not complete", 0, writeProgress.get()); + + assertTrue(Iterables.isEmpty(getEntrySet(trie))); + + if (!errors.isEmpty()) + { + System.out.println(trie.dump()); + for (byte b : choices) + switch (b & 3) + { + case 0: + case 2: + System.out.print('.'); + break; + case 1: + System.out.print('-'); + break; + case 3: + System.out.print('#'); + break; + } + System.out.println(); + Assert.fail("Got errors:\n" + errors); + } + } + + private static RangeTrie makeRangeCovering(ByteComparable cprefix) + { + ByteComparable left = swapTerminator(cprefix, ByteSource.LT_NEXT_COMPONENT); + ByteComparable right = swapTerminator(cprefix, ByteSource.GT_NEXT_COMPONENT); + return RangeTrie.range(left, true, right, true, VERSION, TestRangeState.COVERED); + } + public void checkEntries(String location, + int min, + boolean usePk, + boolean checkAtomicity, + boolean checkConsecutiveIds, + int PER_MUTATION, + Iterable> entries) + { + long sum = 0; + int count = 0; + long idSum = 0; + long idMax = 0; + int updateCount = 0; + for (var en : entries) + { + String path = en.getKey().byteComparableAsString(VERSION); + final C v = en.getValue(); + if (isPartition(v)) + { + Assert.assertEquals("Partition metadata" + location, (usePk ? pk(v) : ""), path); + updateCount += updateCount(v); + continue; + } + String valueKey = (usePk ? pk(v) : "") + ck(v); + Assert.assertEquals(location, valueKey, path); + ++count; + sum += value(v); + int seq = seq(v); + idSum += seq; + if (seq > idMax) + idMax = seq; + } + + assertTrue("Values" + location + " should be at least " + min + ", got " + count, min <= count); + + if (checkAtomicity) + { + // If mutations apply atomically, the row count is always a multiple of the mutation size... 
+ assertTrue("Values" + location + " should be a multiple of " + PER_MUTATION + ", got " + count, count % PER_MUTATION == 0); + // ... and the sum of the values is 0 (as the sum for each individual mutation is 0). + Assert.assertEquals("Value sum" + location, 0, sum); + } + + if (checkConsecutiveIds) + { + // The update count reflected in the partition metadata must match the row count. + Assert.assertEquals("Update count" + location, count, updateCount); + // If mutations apply consistently for the partition, for any row we see we have to have seen all rows that + // were applied before that. In other words, the id sum should be the sum of the integers from 1 to the + // highest id seen in the partition. + Assert.assertEquals("Id sum" + location, idMax * (idMax + 1) / 2, idSum); + } + } + + public static abstract class TestRangeState implements RangeState + { + static final TestRangeState COVERED = new TestRangeCoveringState(); + static final TestRangeState RANGE_START = new TestRangeBoundary(Direction.FORWARD); + static final TestRangeState RANGE_END = new TestRangeBoundary(Direction.REVERSE); + + public static TestRangeState combine(TestRangeState existing, TestRangeState incoming) + { + // This can only be called for TestRangeBoundary as other types should not end up in any persisted tries. 
+ TestRangeBoundary be = (TestRangeBoundary) existing; + TestRangeBoundary bi = (TestRangeBoundary) incoming; + if (be == null) + return bi; + if (be.direction == bi.direction) + return be; + return null; // switch from covered to covered, we should not store anything + } + + @Override + public TestRangeState succedingState(Direction direction) + { + return precedingState(direction.opposite()); + } + } + + static class TestRangeCoveringState extends TestRangeState + { + @Override + public boolean isBoundary() + { + return false; + } + + @Override + public TestRangeCoveringState precedingState(Direction direction) + { + return this; + } + + @Override + public TestRangeState restrict(boolean applicableBefore, boolean applicableAfter) + { + throw new AssertionError(); + } + + @Override + public TestRangeState asBoundary(Direction direction) + { + return direction.isForward() ? RANGE_START : RANGE_END; + } + + @Override + public String toString() + { + return "COVERING"; + } + } + + static class TestRangeBoundary extends TestRangeState + { + final Direction direction; + + TestRangeBoundary(Direction direction) + { + this.direction = direction; + } + + @Override + public boolean isBoundary() + { + return true; + } + + @Override + public TestRangeState precedingState(Direction direction) + { + return direction == this.direction ? null : COVERED; + } + + @Override + public TestRangeState restrict(boolean applicableBefore, boolean applicableAfter) + { + if (direction.isForward() && !applicableBefore || !direction.isForward() && !applicableAfter) + return null; + return this; + } + + @Override + public TestRangeState asBoundary(Direction direction) + { + throw new AssertionError(); + } + + @Override + public String toString() + { + return direction.isForward() ? 
"START" : "END"; + } + } + + static class TestStateMetadata extends TestRangeState + { + final C metadata; + + TestStateMetadata(C metadata) + { + this.metadata = metadata; + } + + @Override + public boolean isBoundary() + { + return true; + } + + @Override + public TestRangeState precedingState(Direction direction) + { + return null; + } + + @Override + public TestRangeState restrict(boolean applicableBefore, boolean applicableAfter) + { + return this; // metadata should survive ranges + } + + @Override + public TestRangeState asBoundary(Direction direction) + { + throw new AssertionError(); + } + + @Override + public String toString() + { + return metadata.toString(); + } + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/CursorTest.java b/test/unit/org/apache/cassandra/db/tries/CursorTest.java new file mode 100644 index 000000000000..77491c2052b8 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/CursorTest.java @@ -0,0 +1,341 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import org.junit.Test; + +import org.quicktheories.core.Gen; +import org.quicktheories.generators.SourceDSL; + +import static org.junit.Assert.*; +import static org.quicktheories.QuickTheory.qt; +import static org.quicktheories.generators.SourceDSL.integers; + +public class CursorTest +{ + static final int EXAMPLES = 100_000; + + private static final Gen DEPTH_GEN = integers().between(0, 1000); + private static final Gen TRANSITION_GEN = integers().between(0, 0xFF); + private static final Gen DIRECTION_GEN = SourceDSL.arbitrary().enumValues(Direction.class); + + @Test + public void testDepth() + { + // Test with forward direction + long pos = Cursor.encode(0, 0, Direction.FORWARD); + assertEquals(0, Cursor.depth(pos)); + + pos = Cursor.encode(1, 0, Direction.FORWARD); + assertEquals(1, Cursor.depth(pos)); + + pos = Cursor.encode(42, 0, Direction.FORWARD); + assertEquals(42, Cursor.depth(pos)); + + // Test with reverse direction + pos = Cursor.encode(0, 0, Direction.REVERSE); + assertEquals(0, Cursor.depth(pos)); + + pos = Cursor.encode(1, 0, Direction.REVERSE); + assertEquals(1, Cursor.depth(pos)); + + // Test with negative depth (exhausted position) + pos = Cursor.EXHAUSTED_POSITION_FORWARD; + assertEquals(-1, Cursor.depth(pos)); + } + + @Test + public void testIsExhausted() + { + // Non-exhausted positions + long pos = Cursor.encode(0, 0, Direction.FORWARD); + assertFalse(Cursor.isExhausted(pos)); + + pos = Cursor.encode(1, 0x12, Direction.FORWARD); + assertFalse(Cursor.isExhausted(pos)); + + // Exhausted positions + assertTrue(Cursor.isExhausted(Cursor.EXHAUSTED_POSITION_FORWARD)); + assertTrue(Cursor.isExhausted(Cursor.EXHAUSTED_POSITION_REVERSE)); + } + + @Test + public void testDepthCorrectionValue() + { + qt().withExamples(EXAMPLES) + .forAll(DEPTH_GEN, DEPTH_GEN, TRANSITION_GEN, DIRECTION_GEN) + .checkAssert((depth, diff, transition, direction) -> { + long pos = Cursor.encode(depth, transition, direction); + 
long diffPos = Cursor.encode(diff, transition, direction); + long adjustment = Cursor.depthCorrectionValue(diffPos); + + long newPos = pos + adjustment; + assertEquals(depth + diff, Cursor.depth(newPos)); + assertEquals(direction, Cursor.direction(newPos)); + assertEquals(transition.intValue(), Cursor.incomingTransition(newPos)); + }); + } + + @Test + public void testIncomingTransition() + { + // Test forward direction + long pos = Cursor.encode(1, 0x12, Direction.FORWARD); + assertEquals(0x12, Cursor.incomingTransition(pos)); + + // Test reverse direction (should be the same as forward) + pos = Cursor.encode(1, 0x12, Direction.REVERSE); + assertEquals(0x12, Cursor.incomingTransition(pos)); + + // Test with different transitions + for (int i = 0; i < 0x100; i++) + { + pos = Cursor.encode(1, i, Direction.FORWARD); + assertEquals(i, Cursor.incomingTransition(pos)); + } + } + + @Test + public void testUndecodedTransition() + { + // Test forward direction + long pos = Cursor.encode(1, 0x12, Direction.FORWARD); + assertEquals(0x12 << 1, VerificationCursor.undecodedTransition(pos)); + + // Test reverse direction (should have bit 0x100 set) + pos = Cursor.encode(1, 0x12, Direction.REVERSE); + assertEquals((0x12 ^ 0xFF) << 1, VerificationCursor.undecodedTransition(pos)); + + // Test with different transitions + for (int i = 0; i < 0x100; i++) + { + // Forward direction + pos = Cursor.encode(1, i, Direction.FORWARD); + assertEquals(i << 1, VerificationCursor.undecodedTransition(pos)); + + // Reverse direction + pos = Cursor.encode(1, i, Direction.REVERSE); + assertEquals((i ^ 0xFF) << 1, VerificationCursor.undecodedTransition(pos)); + } + } + + @Test + public void testDirection() + { + // Test forward direction + long pos = Cursor.encode(1, 0x12, Direction.FORWARD); + assertEquals(Direction.FORWARD, Cursor.direction(pos)); + + // Test reverse direction + pos = Cursor.encode(1, 0x12, Direction.REVERSE); + assertEquals(Direction.REVERSE, Cursor.direction(pos)); + } + + @Test 
+    public void testCompareForward()
+    {
+        testCompare(Direction.FORWARD);
+    }
+
+    @Test
+    public void testCompareReverse()
+    {
+        testCompare(Direction.REVERSE);
+    }
+
+    /**
+     * Property test: for two generated (depth, transition) pairs encoded in the given direction, the sign of
+     * {@code Cursor.compare} must match the expected order, i.e. deeper positions first and, at equal depth,
+     * transitions ordered by {@code direction.lt}.
+     */
+    public void testCompare(Direction direction)
+    {
+        qt().withExamples(EXAMPLES)
+            .forAll(DEPTH_GEN, TRANSITION_GEN, DEPTH_GEN, TRANSITION_GEN)
+            .checkAssert((depth1, transition1, depth2, transition2) -> {
+                long pos1 = Cursor.encode(depth1, transition1, direction);
+                long pos2 = Cursor.encode(depth2, transition2, direction);
+
+                long diff = Cursor.compare(pos1, pos2);
+                int cmp = Long.signum(diff);
+
+                int cmpExpected = Integer.compare(depth2, depth1); // higher depth is earlier
+                // if equal, check directed difference in transitions
+                if (cmpExpected == 0 && transition1.intValue() != transition2.intValue())
+                    cmpExpected = direction.lt(transition1, transition2) ? -1 : 1;
+
+                assertEquals(cmpExpected, cmp);
+            });
+
+    }
+
+    /** Hand-picked comparison cases: equal positions, differing depth, differing transition, differing direction. */
+    @Test
+    public void testCompareSimple()
+    {
+        // Equal positions
+        long pos1 = Cursor.encode(1, 0x12, Direction.FORWARD);
+        long pos2 = Cursor.encode(1, 0x12, Direction.FORWARD);
+        assertEquals(0, Cursor.compare(pos1, pos2));
+
+        // Different depths
+        pos1 = Cursor.encode(2, 0x12, Direction.FORWARD);
+        pos2 = Cursor.encode(1, 0x12, Direction.FORWARD);
+        assertTrue(Cursor.compare(pos1, pos2) < 0);
+        assertTrue(Cursor.compare(pos2, pos1) > 0);
+
+        // Same depth, different transitions
+        pos1 = Cursor.encode(1, 0x11, Direction.FORWARD);
+        pos2 = Cursor.encode(1, 0x12, Direction.FORWARD);
+        assertTrue(Cursor.compare(pos1, pos2) < 0);
+        assertTrue(Cursor.compare(pos2, pos1) > 0);
+
+        // Different directions
+        pos1 = Cursor.encode(1, 0x12, Direction.FORWARD);
+        pos2 = Cursor.encode(1, 0x12, Direction.REVERSE);
+        assertNotEquals(0, Cursor.compare(pos1, pos2));
+    }
+
+    /** The root position must decode to depth 0 and transition 0, and keep the requested direction. */
+    @Test
+    public void testRootPosition()
+    {
+        // Test forward direction
+        long pos = Cursor.rootPosition(Direction.FORWARD);
+        assertEquals(0, Cursor.depth(pos));
+        assertEquals(0, Cursor.incomingTransition(pos));
+        assertEquals(Direction.FORWARD, Cursor.direction(pos));
+
+        // Test reverse direction
+        pos = Cursor.rootPosition(Direction.REVERSE);
+        assertEquals(0, Cursor.depth(pos));
+        assertEquals(0, Cursor.incomingTransition(pos));
+        assertEquals(Direction.REVERSE, Cursor.direction(pos));
+    }
+
+    /** The exhausted position must decode to depth -1 and transition 0, and keep the requested direction. */
+    @Test
+    public void testExhaustedPosition()
+    {
+        // Test forward direction
+        long pos = Cursor.exhaustedPosition(Direction.FORWARD);
+        assertEquals(-1, Cursor.depth(pos));
+        assertEquals(0, Cursor.incomingTransition(pos));
+        assertEquals(Direction.FORWARD, Cursor.direction(pos));
+
+        // Test reverse direction
+        pos = Cursor.exhaustedPosition(Direction.REVERSE);
+        assertEquals(-1, Cursor.depth(pos));
+        assertEquals(0, Cursor.incomingTransition(pos));
+        assertEquals(Direction.REVERSE, Cursor.direction(pos));
+    }
+
+    /** Same expectations as above when the exhausted position is derived from an arbitrary previous position. */
+    @Test
+    public void testExhaustedPositionFromPrevious()
+    {
+        qt().withExamples(EXAMPLES)
+            .forAll(DEPTH_GEN, TRANSITION_GEN, DIRECTION_GEN)
+            .checkAssert((depth, transition, direction) -> {
+                long prevPos = Cursor.encode(depth, transition, direction);
+                long pos = Cursor.exhaustedPosition(prevPos);
+                assertEquals(-1, Cursor.depth(pos));
+                assertEquals(0, Cursor.incomingTransition(pos));
+                assertEquals(direction, Cursor.direction(pos));
+            });
+    }
+
+    /** Round trip: encoding and then decoding must return the original depth, transition and direction. */
+    @Test
+    public void testEncode()
+    {
+        qt().withExamples(EXAMPLES)
+            .forAll(DEPTH_GEN, TRANSITION_GEN, DIRECTION_GEN)
+            .checkAssert((depth, transition, direction) -> {
+                long pos = Cursor.encode(depth, transition, direction);
+                assertEquals(depth.intValue(), Cursor.depth(pos));
+                assertEquals(transition.intValue(), Cursor.incomingTransition(pos));
+                assertEquals(direction, Cursor.direction(pos));
+            });
+    }
+
+    /** Descending with a byte must yield depth + 1 and the given transition, with the direction unchanged. */
+    @Test
+    public void testPositionForDescentWithByte()
+    {
+        qt().withExamples(EXAMPLES)
+            .forAll(DEPTH_GEN, TRANSITION_GEN, TRANSITION_GEN, DIRECTION_GEN)
+            .checkAssert((depth, prev, transition, direction) -> {
+                long pos = Cursor.encode(depth, prev, direction);
+                long newPos = Cursor.positionForDescentWithByte(pos, transition);
+                assertEquals(depth.intValue() + 1, Cursor.depth(newPos));
+                assertEquals(transition.intValue(), Cursor.incomingTransition(newPos));
+                assertEquals(direction, Cursor.direction(newPos));
+            });
+    }
+
+    /**
+     * The skip-branch position must compare after the original, keep its depth and direction, and step the
+     * transition by {@code direction.increase}. When the transition already equals
+     * {@code direction.select(0xFF, 0x00)} (presumably the last value in that direction -- confirm against
+     * Direction.select), the undecoded transition is expected to be 0x200 instead.
+     */
+    @Test
+    public void testPositionForSkippingBranch()
+    {
+        qt().withExamples(EXAMPLES)
+            .forAll(DEPTH_GEN, TRANSITION_GEN, DIRECTION_GEN)
+            .checkAssert((depth, transition, direction) -> {
+                long pos = Cursor.encode(depth, transition, direction);
+                long newPos = Cursor.positionForSkippingBranch(pos);
+                assertTrue(Cursor.compare(pos, newPos) < 0);
+                assertEquals(depth.intValue(), Cursor.depth(newPos));
+                if (transition.intValue() != direction.select(0xFF, 0x00))
+                    assertEquals(transition + direction.increase, Cursor.incomingTransition(newPos));
+                else
+                    assertEquals(0x200, VerificationCursor.undecodedTransition(newPos));
+                assertEquals(direction, Cursor.direction(newPos));
+            });
+    }
+
+    @Test
+    public void testAscendedForward()
+    {
+        testAscended(Direction.FORWARD);
+    }
+
+    @Test
+    public void testAscendedReverse()
+    {
+        testAscended(Direction.REVERSE);
+    }
+
+    /**
+     * Property test for {@code Cursor.ascended}: the generated pair is first adjusted so that the second position
+     * is a valid advance target from the first, then {@code ascended(next, prev)} must be true exactly when
+     * {@code newDepth <= depth}.
+     */
+    public void testAscended(Direction direction)
+    {
+        qt().withExamples(EXAMPLES)
+            .forAll(DEPTH_GEN, TRANSITION_GEN, DEPTH_GEN, TRANSITION_GEN)
+            .checkAssert((depth, transition, newDepth, newTransition) -> {
+                if (depth.intValue() == 0)
+                    transition = 0; // non-zero is not valid for the root position
+                // ensure that the next position is a valid advance target.
+
+                if (newDepth.intValue() == 0)
+                {
+                    // a generated depth of 0 is replaced by depth -1 / transition 0, the values the exhausted
+                    // position decodes to in testExhaustedPosition
+                    newDepth = -1;
+                    newTransition = 0;
+                }
+                else if (newDepth > depth + 1)
+                    newDepth = depth + 1;
+                else if (newDepth.intValue() == depth.intValue() && direction.le(newTransition, transition))
+                {
+                    // same depth but not after the previous transition: step the transition if there is room,
+                    // otherwise lower the previous depth so that the target counts as a descent
+                    if (transition.intValue() != direction.select(0xFF, 0x00))
+                        newTransition = transition + direction.increase;
+                    else
+                        --depth;
+                }
+
+                long prevPos = Cursor.encode(depth, transition, direction);
+                long nextPos = Cursor.encode(newDepth, newTransition, direction);
+
+                assertEquals(newDepth <= depth, Cursor.ascended(nextPos, prevPos));
+            });
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/tries/DataPoint.java b/test/unit/org/apache/cassandra/db/tries/DataPoint.java
new file mode 100644
index 000000000000..d1745e87b6d4
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/tries/DataPoint.java
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.tries;
+
+import java.util.List;
+import java.util.function.BiFunction;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.Streams;
+
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static org.apache.cassandra.db.tries.TrieUtil.VERSION;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/// Test-side view of one entry of a deletion-aware trie: a live value, a deletion marker, or both, at a
+/// byte-comparable position. The static helpers combine such entries and convert between lists and tries.
+public interface DataPoint
+{
+    /// Null-tolerant merge of two live points: returns the non-null one, or `LivePoint.combine` of both.
+    static LivePoint combineLive(LivePoint a, LivePoint b)
+    {
+        if (a == null)
+            return b;
+        if (b == null)
+            return a;
+        return LivePoint.combine(a, b);
+    }
+
+    /// Null-tolerant merge of two deletion markers: returns the non-null one, or `DeletionMarker.combine` of both.
+    static DeletionMarker combineDeletion(DeletionMarker a, DeletionMarker b)
+    {
+        if (a == null)
+            return b;
+        if (b == null)
+            return a;
+        return DeletionMarker.combine(a, b);
+    }
+
+    /// Applies `deletion` to `live`; when either argument is null, `live` is returned unchanged.
+    static LivePoint deleteLive(DeletionMarker deletion, LivePoint live)
+    {
+        if (deletion == null || live == null)
+            return live;
+        return deletion.applyTo(live);
+    }
+
+    /// Same as [#deleteLive(DeletionMarker, LivePoint)] with the arguments swapped.
+    static LivePoint deleteLive(LivePoint live, DeletionMarker deletion)
+    {
+        return deleteLive(deletion, live);
+    }
+
+    DeletionMarker marker();
+    LivePoint live();
+    ByteComparable position();
+
+    DataPoint withMarker(DeletionMarker newMarker);
+    DataPoint remap(ByteComparable newKey);
+
+    /// Renders a position using the test byte-comparable version; a null position is rendered as "null".
+    static String toString(ByteComparable position)
+    {
+        if (position == null)
+            return "null";
+        return position.byteComparableAsString(VERSION);
+    }
+
+    /// Asserts that the deletion markers in the list are well formed: marker positions strictly increase, each
+    /// marker's left side equals the right side of the previous marker (-1 before the first), no marker has
+    /// equal sides, and the last marker leaves nothing open. Points without a marker are skipped.
+    /// Returns the list that was passed in.
+    // NOTE(review): the List parameter and return type carry no type arguments here although the loop iterates
+    // DataPoint elements -- they look stripped; confirm against the original source.
+    static List verify(List dataPoints)
+    {
+        int active = -1;
+        ByteComparable prev = null;
+        for (DataPoint dp : dataPoints)
+        {
+            DeletionMarker marker = dp.marker();
+            if (marker == null)
+                continue;
+            assertTrue("Order violation " + toString(prev) + " vs " + toString(marker.position),
+                       prev == null || ByteComparable.compare(prev, marker.position, VERSION) < 0);
+            assertEquals("Range close violation", active, marker.leftSide);
+            assertTrue(marker.rightSide != marker.leftSide);
+            prev = marker.position;
+            active = marker.rightSide;
+        }
+        assertEquals(-1, active);
+        return dataPoints;
+    }
+
+    /// Joins a live point and a deletion marker into one data point: returns whichever is non-null, or a
+    /// `CombinedDataPoint` holding both. Returns null when both arguments are null.
+    static DataPoint resolve(LivePoint a, DeletionMarker m)
+    {
+        if (a == null)
+            return m;
+        if (m == null)
+            return a;
+        return new CombinedDataPoint(a, m);
+    }
+
+    /// Merges two data points: live parts and markers are combined separately, the combined marker is then
+    /// applied to the combined live part, and the two results are joined with [#resolve].
+    static DataPoint combine(DataPoint a, DataPoint b)
+    {
+        LivePoint live = combine(a.live(), b.live(), LivePoint::combine);
+        DeletionMarker marker = combine(a.marker(), b.marker(), DeletionMarker::combine);
+        if (marker != null && live != null)
+            live = marker.applyTo(live);
+        return resolve(live, marker);
+    }
+
+    /// Null-tolerant combination: returns the non-null argument, or `combiner.apply(a, b)` when both are present.
+    ///
+    /// The type parameter is declared on the method; without the declaration `T` is an unknown symbol, and with
+    /// a raw `BiFunction` the result of `apply` would be `Object` rather than `T`.
+    static <T> T combine(T a, T b, BiFunction<T, T, T> combiner)
+    {
+        if (a == null)
+            return b;
+        if (b == null)
+            return a;
+        return combiner.apply(a, b);
+    }
+
+    DataPoint toContent();
+
+    /// Extract the entries of the merged view of the provided trie (live data and deletions joined with
+    /// [#resolve]) into a list, remapping every value to the key it was found under.
+    // NOTE(review): List and DeletionAwareTrie appear without type arguments here; they look stripped -- confirm
+    // the intended parameterization against the original source before relying on this signature.
+    static List toList(DeletionAwareTrie trie)
+    {
+        return Streams.stream(trie.mergedTrie(DataPoint::resolve).entryIterator())
+                      .map(en -> en.getValue().remap(en.getKey()))
+                      .collect(Collectors.toList());
+    }
+
+    /// Extract the entries of the content-only view of the provided trie into a list, remapping every value to
+    /// the key it was found under.
+    static List contentOnlyList(DeletionAwareTrie trie)
+    {
+        return Streams.stream(trie.contentOnlyTrie().entryIterator())
+                      .map(en -> en.getValue().remap(en.getKey()))
+                      .collect(Collectors.toList());
+    }
+
+    /// Extract the entries of the deletion-only view of the provided trie into a list.
+    static List deletionOnlyList(DeletionAwareTrie trie)
+    {
+        return Streams.stream(trie.deletionOnlyTrie().entryIterator())
+                      .map(en -> en.getValue().remap(en.getKey()))
+                      .collect(Collectors.toList());
+    }
+
+    /// Builds a trie from the given data points, passing `false` for `forceCopy`.
+    static InMemoryDeletionAwareTrie fromList(List list)
+    {
+        return fromList(list, false);
+    }
+
+    /// Builds a short-lived in-memory deletion-aware trie from the given data points.
+    ///
+    /// Live points are applied first, one singleton trie each. Deletion markers are then applied as ranges:
+    /// each effective marker (non-null, with differing sides) that is reached while a deletion is active
+    /// closes or changes the range opened by the previous effective marker.
+    ///
+    /// `forceCopy` is handed to the mutator as the constant predicate `v -> forceCopy`
+    /// (presumably it forces node copying -- confirm against `InMemoryDeletionAwareTrie.mutator`).
+    /// A `TrieSpaceExhaustedException` is rethrown as an `AssertionError`.
+    static InMemoryDeletionAwareTrie fromList(List list, boolean forceCopy)
+    {
+        InMemoryDeletionAwareTrie trie = InMemoryDeletionAwareTrie.shortLived(VERSION);
+        var mutator = trie.mutator(DataPoint::combineLive,
+                                   DataPoint::combineDeletion,
+                                   DataPoint::deleteLive,
+                                   DataPoint::deleteLive,
+                                   false,
+                                   v -> forceCopy);
+
+        try
+        {
+            // If we put a deletion first, the deletion branch will start at the root which works but isn't interesting
+            // enough as a test. So put the live data first.
+            for (DataPoint i : list)
+            {
+                LivePoint live = i.live();
+                if (live != null)
+                    mutator.apply(DeletionAwareTrie.singleton(live.position, VERSION, live));
+            }
+
+            // If we simply put all deletions with putAlternativeRecursive, we won't get correct branches as they
+            // won't always close the intervals they open. Deletions need to be put as ranges instead.
+            int active = -1;
+            int activeStartedAt = -1;
+            for (int i = 0; i < list.size(); ++i)
+            {
+                DeletionMarker marker = list.get(i).marker();
+                // Points without a marker, or whose marker does not change the active deletion, are skipped.
+                if (marker == null || marker.leftSide == marker.rightSide)
+                    continue;
+                assert marker.leftSide == active;
+                if (active != -1)
+                {
+                    // The skip test above already ran for this marker, so it is not repeated here.
+                    DeletionMarker startMarker = list.get(activeStartedAt).marker();
+                    assert startMarker != null;
+                    // The range is rooted at the two positions' shared prefix; both bounds are given relative to it.
+                    int prefixLength = ByteComparable.diffPoint(startMarker.position, marker.position, VERSION) - 1;
+                    mutator.apply(
+                        DeletionAwareTrie.deletedRange(ByteComparable.cut(startMarker.position, prefixLength),
+                                                       ByteComparable.skipFirst(startMarker.position, prefixLength),
+                                                       ByteComparable.skipFirst(marker.position, prefixLength),
+                                                       VERSION, marker.leftSideAsCovering));
+                }
+
+                active = marker.rightSide;
+                activeStartedAt = i;
+            }
+        }
+        catch (TrieSpaceExhaustedException e)
+        {
+            throw new AssertionError(e);
+        }
+        return trie;
+    }
+
+    /// Debug helper: prints the dump of the deletion-aware trie and of its merged view to standard output,
+    /// then returns the trie unchanged.
+    static DeletionAwareTrie dumpDeletionAwareTrie(DeletionAwareTrie trie)
+    {
+        System.out.println("DeletionAware");
+        System.out.println(trie.dump());
+        System.out.println("Merged");
+        System.out.println(trie.mergedTrie(DataPoint::resolve).dump());
+        return trie;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/tries/DeletionAwareIntersectionTest.java b/test/unit/org/apache/cassandra/db/tries/DeletionAwareIntersectionTest.java
new file mode 100644
index 000000000000..f14476096dc8
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/tries/DeletionAwareIntersectionTest.java
@@ -0,0 +1,299 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.tries;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static java.util.Arrays.asList;
+import static org.apache.cassandra.db.tries.DataPoint.fromList;
+import static org.apache.cassandra.db.tries.DataPoint.verify;
+
+/**
+ * Checks that intersecting a deletion-aware trie with sets of ranges produces the same data points as applying the
+ * same ranges to the plain list the trie was built from. The same data is tested as a directly built trie, as a
+ * chain of pairwise merges and as a collection merge.
+ *
+ * Helpers such as {@code array}, {@code before}, {@code flatten}, {@code intersect}, {@code bits} and
+ * {@code VERBOSE} are not defined in this class (presumably inherited from DeletionAwareTestBase -- confirm).
+ */
+@RunWith(Parameterized.class)
+public class DeletionAwareIntersectionTest extends DeletionAwareTestBase
+{
+
+    /** Single-range intersections positioned at every interesting spot relative to the test data. */
+    @Test
+    public void testSubtrie()
+    {
+        {
+            testIntersection("no intersection");
+
+            testIntersection("all",
+                             array(null, null));
+            testIntersection("fully covered range",
+                             array(before(20), before(25)));
+            testIntersection("fully covered range",
+                             array(before(25), before(33)));
+            testIntersection("matching range",
+                             array(before(21), before(24)));
+            testIntersection("touching empty",
+                             array(before(24), before(26)));
+
+            testIntersection("partial left",
+                             array(before(22), before(25)));
+            testIntersection("partial left on change",
+                             array(before(28), before(32)));
+            testIntersection("partial left with null",
+                             array(before(29), null));
+
+
+            testIntersection("partial right",
+                             array(before(25), before(27)));
+            testIntersection("partial right on change",
+                             array(before(25), before(28)));
+            testIntersection("partial right with null",
+                             array(null, before(22)));
+
+            testIntersection("inside range",
+                             array(before(22), before(23)));
+            testIntersection("inside with change",
+                             array(before(27), before(29)));
+
+            testIntersection("point covered",
+                             array(before(16), before(18)));
+            testIntersection("point at range start",
+                             array(before(17), before(18)));
+            testIntersection("point at range end",
+                             array(before(16), before(17)));
+
+
+            testIntersection("start point covered",
+                             array(before(32), before(35)));
+            testIntersection("start point at range start",
+                             array(before(33), before(35)));
+            testIntersection("start point at range end",
+                             array(before(32), before(33)));
+
+
+            testIntersection("end point covered",
+                             array(before(36), before(40)));
+            testIntersection("end point at range start",
+                             array(before(38), before(40)));
+            testIntersection("end point at range end",
+                             array(before(36), before(38)));
+        }
+    }
+
+    /** Intersections with a single set made of several ranges. */
+    @Test
+    public void testRanges()
+    {
+        {
+            testIntersection("fully covered ranges",
+                             array(before(20), before(25), before(25), before(33)));
+            testIntersection("matching ranges",
+                             array(before(21), before(24), before(26), before(31)));
+            testIntersection("touching empty",
+                             array(before(20), before(21), before(24), before(26), before(32), before(33), before(34), before(36)));
+            testIntersection("partial left",
+                             array(before(22), before(25), before(29), null));
+
+            testIntersection("partial right",
+                             array(null, before(22), before(25), before(27)));
+
+            testIntersection("inside ranges",
+                             array(before(22), before(23), before(27), before(29)));
+
+            testIntersection("jumping inside",
+                             array(before(21), before(22), before(23), before(24), before(25), before(26), before(27), before(28), before(29), before(30)));
+        }
+    }
+
+    /** Intersections with two single-range sets, applied in both orders by the recursive check. */
+    @Test
+    public void testRangeOnSubtrie()
+    {
+        {
+            // non-overlapping
+            testIntersection("", array(before(20), before(23)), array(before(24), before(27)));
+            // touching, i.e. still non-overlapping
+            testIntersection("", array(before(20), before(23)), array(before(23), before(27)));
+            // overlapping 1
+            testIntersection("", array(before(20), before(23)), array(before(22), before(27)));
+            // overlapping 2
+            testIntersection("", array(before(20), before(23)), array(before(21), before(27)));
+            // covered 1
+            testIntersection("", array(before(20), before(23)), array(before(20), before(27)));
+            // covered 2
+            testIntersection("", array(before(23), before(27)), array(before(20), before(27)));
+            // covered 3
+            testIntersection("", array(before(21), before(23)), array(before(20), before(27)));
+        }
+    }
+
+    @Test
+    public void testRangesOnRanges()
+    {
+        testIntersections();
+    }
+
+    /** The reference data: the expected content of every trie under test before any intersection is applied. */
+    private List getTestRanges()
+    {
+        return flatten(asList(deletedPoint(17, 20),
+                              livePoint(19, 30),
+                              from(21, 10), deletedPointInside(22, 21, 10), livePoint(23, 31), to(24, 10),
+                              from(26, 11), livePoint(27, 32), change(28, 11, 12).withPoint(22), livePoint(29, 33), to(30, 12),
+                              livePoint(32, 34), from(33, 13).withPoint(23), to(34, 13),
+                              from(36, 14), to(38, 14).withPoint(24), livePoint(39, 35)));
+    }
+
+    /**
+     * Builds the data of {@link #getTestRanges()} as a chain of pairwise {@code mergeWith} calls over four
+     * separately built tries (ranges, an overlapping range, deleted points, live points).
+     */
+    private DeletionAwareTrie mergeGeneratedRanges()
+    {
+        return fromList(asList(from(21, 10), to(24, 10),
+                               from(26, 11), to(29, 11),
+                               from(33, 13), to(34, 13),
+                               from(36, 14), to(38, 14)))
+               .mergeWith(fromList(asList(from(28, 12), to(30, 12))),
+                          LivePoint::combine,
+                          DeletionMarker::combine,
+                          DeletionMarker::applyTo, false)
+               .mergeWith(fromList(flatten(asList(deletedPoint(17, 20),
+                                                  deletedPoint(22, 21),
+                                                  deletedPoint(28, 22),
+                                                  deletedPoint(33, 23),
+                                                  deletedPoint(38, 24)))),
+                          LivePoint::combine,
+                          DeletionMarker::combine,
+                          DeletionMarker::applyTo, false)
+               .mergeWith(fromList(asList(livePoint(19, 30),
+                                          livePoint(23, 31),
+                                          livePoint(27, 32),
+                                          livePoint(29, 33),
+                                          livePoint(32, 34),
+                                          livePoint(39, 35))),
+                          LivePoint::combine,
+                          DeletionMarker::combine,
+                          DeletionMarker::applyTo, false)
+        ;
+    }
+
+    /** Same four source tries as {@link #mergeGeneratedRanges()}, combined with one collection merge. */
+    private DeletionAwareTrie collectionMergeGeneratedRanges()
+    {
+        return DeletionAwareTrie.merge(asList(
+                   fromList(asList(from(21, 10), to(24, 10),
+                                   from(26, 11), to(29, 11),
+                                   from(33, 13), to(34, 13),
+                                   from(36, 14), to(38, 14))),
+                   fromList(asList(from(28, 12), to(30, 12))),
+                   fromList(flatten(asList(deletedPoint(17, 20),
+                                           deletedPoint(22, 21),
+                                           deletedPoint(28, 22),
+                                           deletedPoint(33, 23),
+                                           deletedPoint(38, 24)))),
+                   fromList(asList(livePoint(19, 30),
+                                   livePoint(23, 31),
+                                   livePoint(27, 32),
+                                   livePoint(29, 33),
+                                   livePoint(32, 34),
+                                   livePoint(39, 35)))
+               ),
+               LivePoint::combineCollection,
+               DeletionMarker::combineCollection,
+               DeletionMarker::applyTo,
+               false);
+    }
+
+    private void testIntersections()
+    {
+        testIntersection("");
+
+        ByteComparable[] set1 = array(null, before(24),
+                                      before(25), before(29),
+                                      before(32), null);
+        ByteComparable[] set2 = array(before(14), before(17),
+                                      before(22), before(27),
+                                      before(28), before(30),
+                                      before(32), before(34),
+                                      before(36), before(40));
+        ByteComparable[] set3 = array(before(17), before(18),
+                                      before(19), before(20),
+                                      before(21), before(22),
+                                      before(23), before(24),
+                                      before(25), before(26),
+                                      before(27), before(28),
+                                      before(29), before(30),
+                                      before(31), before(32),
+                                      before(33), before(34),
+                                      before(35), before(36),
+                                      before(37), before(38));
+
+        testIntersections(set1, set2, set3);
+    }
+
+    /** Runs the intersection check for every non-empty combination of the three sets. */
+    private void testIntersections(ByteComparable[] set1, ByteComparable[] set2, ByteComparable[] set3)
+    {
+        testIntersection("1", set1);
+
+        testIntersection("2", set2);
+
+        testIntersection("3", set3);
+
+        testIntersection("12", set1, set2);
+
+        testIntersection("13", set1, set3);
+
+        testIntersection("23", set2, set3);
+
+        testIntersection("123", set1, set2, set3);
+    }
+
+    /**
+     * Runs the intersection check against all three constructions of the reference data.
+     *
+     * @param message label used as the prefix of assertion messages
+     * @param sets    range sets to intersect with; each array lists alternating range boundaries
+     */
+    public void testIntersection(String message, ByteComparable[]... sets)
+    {
+        final List testRanges = getTestRanges();
+        testIntersection(message, fromList(testRanges), testRanges, sets);
+        testIntersection(message + " on merge ", mergeGeneratedRanges(), testRanges, sets); // Mainly tests MergeCursor's skipTo
+        testIntersection(message + " on collection merge ", collectionMergeGeneratedRanges(), testRanges, sets); // Mainly tests MergeCursor's skipTo
+    }
+
+    /**
+     * Recursive check. With no sets left, the trie must match the expected list. Otherwise each remaining set is
+     * applied in turn to both the trie and the expected list, and the check recurses with that set removed, so
+     * every application order is covered.
+     *
+     * @param message     label used as the prefix of assertion messages
+     * @param trie        trie under test, already intersected with the sets applied so far
+     * @param intersected expected data points for {@code trie}
+     * @param sets        range sets still to be applied
+     */
+    void testIntersection(String message, DeletionAwareTrie trie, List intersected, ByteComparable[]... sets)
+    {
+        if (VERBOSE)
+        {
+            System.out.println("Markers: " + intersected);
+            DataPoint.dumpDeletionAwareTrie(trie);
+        }
+        verify(intersected);
+        // Test that intersecting the given trie with the given sets, in any order, results in the expected list.
+        // Checks both forward and reverse iteration direction.
+        if (sets.length == 0)
+        {
+            assertDeletionAwareEqual(message + " forward b" + bits, intersected, trie);
+        }
+        else
+        {
+            for (int toRemove = 0; toRemove < sets.length; ++toRemove)
+            {
+                ByteComparable[] ranges = sets[toRemove];
+                // Diagnostic output only: printed under VERBOSE like the other dumps in this method, since this
+                // line is reached once per set at every recursion level.
+                if (VERBOSE)
+                    System.out.println("Ranges: " + toString(ranges));
+                testIntersection(message + " " + toRemove,
+                                 trie.intersect(TrieSet.ranges(TrieUtil.VERSION, ranges)),
+                                 intersect(intersected, ranges),
+                                 Arrays.stream(sets)
+                                       .filter(x -> x != ranges)
+                                       .toArray(ByteComparable[][]::new)
+                );
+            }
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/tries/DeletionAwareMergeTest.java b/test/unit/org/apache/cassandra/db/tries/DeletionAwareMergeTest.java
new file mode 100644
index 000000000000..7d7a2b0bd1be
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/tries/DeletionAwareMergeTest.java
@@ -0,0 +1,591 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import com.google.common.collect.Lists; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +import static java.util.Arrays.asList; +import static org.apache.cassandra.db.tries.DataPoint.dumpDeletionAwareTrie; +import static org.apache.cassandra.db.tries.DataPoint.fromList; +import static org.apache.cassandra.db.tries.DataPoint.verify; +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; + +@RunWith(Parameterized.class) +public class DeletionAwareMergeTest extends DeletionAwareTestBase +{ + @Parameterized.Parameters(name = "bits per transition {0} deletion point {1}") + public static List mergeData() + { + return IntStream.rangeClosed(1, bitsNeeded) + .boxed() + .flatMap(x -> IntStream.of(4, 13, 22, 31, 40) + .mapToObj(y -> new Object[] { x, y })) + .collect(Collectors.toList()); + } + + + @Parameterized.Parameter(1) + public int deletionPoint = 100; + + private List deletedRanges(ByteComparable... 
dataPoints) + { + List data = new ArrayList<>(asList(dataPoints)); + invertDataRangeList(data); + filterOutEmptyRepetitions(data); + + List markers = new ArrayList<>(); + for (int i = 0; i < data.size(); ++i) + { + ByteComparable pos = data.get(i); + if (pos == null) + pos = i % 2 == 0 ? before(0) : before((1< data) + { + // invert list + if (data.get(0) != null) + data.add(0, null); + else + data.remove(0); + if (data.get(data.size() - 1) != null) + data.add(null); + else + data.remove(data.size() - 1); + } + + private static void filterOutEmptyRepetitions(List data) + { + for (int i = 0; i < data.size() - 1; ++i) + { + if (data.get(i) != null && data.get(i + 1) != null && + ByteComparable.compare(data.get(i), data.get(i + 1), VERSION) == 0) + { + data.remove(i + 1); + data.remove(i); + --i; + } + } + } + + @Test + public void testSubtrie() + { + { + testMerge("no merge"); + + testMerge("all", + deletedRanges(null, null)); + testMerge("fully covered range", + deletedRanges(before(20), before(25))); + testMerge("fully covered range", + deletedRanges(before(25), before(33))); + testMerge("matching range", + deletedRanges(before(21), before(24))); + testMerge("touching empty", + deletedRanges(before(24), before(26))); + + testMerge("partial left", + deletedRanges(before(22), before(25))); + testMerge("partial left on change", + deletedRanges(before(28), before(32))); + testMerge("partial left with null", + deletedRanges(before(29), null)); + + + testMerge("partial right", + deletedRanges(before(25), before(27))); + testMerge("partial right on change", + deletedRanges(before(25), before(28))); + testMerge("partial right with null", + deletedRanges(null, before(22))); + + testMerge("inside range", + deletedRanges(before(22), before(23))); + testMerge("inside with change", + deletedRanges(before(27), before(29))); + + testMerge("empty range inside", + deletedRanges(before(27), before(27))); + + testMerge("point covered", + deletedRanges(before(16), before(18))); + 
testMerge("point at range start", + deletedRanges(before(17), before(18))); + testMerge("point at range end", + deletedRanges(before(16), before(17))); + + + testMerge("start point covered", + deletedRanges(before(32), before(35))); + testMerge("start point at range start", + deletedRanges(before(33), before(35))); + testMerge("start point at range end", + deletedRanges(before(32), before(33))); + + + testMerge("end point covered", + deletedRanges(before(36), before(40))); + testMerge("end point at range start", + deletedRanges(before(38), before(40))); + testMerge("end point at range end", + deletedRanges(before(36), before(38))); + } + } + + @Test + public void testRanges() + { + { + testMerge("fully covered ranges", + deletedRanges(before(20), before(25), before(25), before(33))); + testMerge("matching ranges", + deletedRanges(before(21), before(24), before(26), before(31))); + testMerge("touching empty", + deletedRanges(before(20), before(21), before(24), before(26), before(32), before(33), before(34), before(36))); + testMerge("partial left", + deletedRanges(before(22), before(25), before(29), null)); + + testMerge("partial right", + deletedRanges(null, before(22), before(25), before(27))); + + testMerge("inside ranges", + deletedRanges(before(22), before(23), before(27), before(29))); + + testMerge("jumping inside", + deletedRanges(before(21), before(22), before(23), before(24), before(25), before(26), before(27), before(28), before(29), before(30))); + } + } + + @Test + public void testRangeOnSubtrie() + { + { + // non-overlapping + testMerge("non-overlapping", deletedRanges(before(20), before(23)), deletedRanges(before(24), before(27))); + // touching, i.e. 
still non-overlapping + testMerge("touching", deletedRanges(before(20), before(23)), deletedRanges(before(23), before(27))); + // overlapping 1 + testMerge("overlapping1", deletedRanges(before(20), before(23)), deletedRanges(before(22), before(27))); + // overlapping 2 + testMerge("overlapping2", deletedRanges(before(20), before(23)), deletedRanges(before(21), before(27))); + // covered + testMerge("covered1", deletedRanges(before(20), before(23)), deletedRanges(before(20), before(27))); + // covered 2 + testMerge("covered2", deletedRanges(before(23), before(27)), deletedRanges(before(20), before(27))); + // covered 3 + testMerge("covered3", deletedRanges(before(21), before(23)), deletedRanges(before(20), before(27))); + } + } + + @Test + public void testRangesOnRanges() + { + testMerges(); + } + + private List getTestRanges() + { + return flatten(asList(deletedPoint(17, 20), + livePoint(19, 30), + from(21, 10), deletedPointInside(22, 21, 10), livePoint(23, 31), to(24, 10), + from(26, 11), livePoint(27, 32), change(28, 11, 12).withPoint(22), livePoint(29, 33), to(30, 12), + livePoint(32, 34), from(33, 13).withPoint(23), to(34, 13), + from(36, 14), to(38, 14).withPoint(24), livePoint(39, 35))); + } + + private void testMerges() + { + testMergeWith("", fromList(getTestRanges()), getTestRanges()); + + List set1 = deletedRanges(null, before(24), before(25), before(29), before(32), null); + List set2 = deletedRanges(before(14), before(17), + before(22), before(27), + before(28), before(30), + before(32), before(34), + before(36), before(40)); + List set3 = deletedRanges(before(17), before(18), + before(19), before(20), + before(21), before(22), + before(23), before(24), + before(25), before(26), + before(27), before(28), + before(29), before(30), + before(31), before(32), + before(33), before(34), + before(35), before(36), + before(37), before(38)); + + testMerges(set1, set2, set3); + } + + private void testMerges(List set1, List set2, List set3) + { + testMerge("1", 
set1); + + testMerge("2", set2); + + testMerge("3", set3); + + testMerge("12", set1, set2); + + testMerge("13", set1, set3); + + testMerge("23", set2, set3); + + testMerge("123", set1, set2, set3); + } + + @SafeVarargs + public final void testMerge(String message, List... sets) + { + List testRanges = getTestRanges(); + testMergeWith(message, fromList(testRanges), testRanges, sets); + testCollectionMerge(message + " collection", Lists.newArrayList(fromList(testRanges)), testRanges, sets); + testMergeInMemoryTrie(message + " inmem.apply", fromList(testRanges), testRanges, sets); + testMergeInMemoryTrieIntoSet(message + " inmem.apply into set", fromList(testRanges), testRanges, sets); + } + + + public void testMergeWith(String message, DeletionAwareTrie trie, List merged, List... sets) + { + if (VERBOSE) + { + System.out.println("Markers: " + merged); + dumpDeletionAwareTrie(trie); + } + verify(merged); + // Test that intersecting the given trie with the given sets, in any order, results in the expected list. + // Checks both forward and reverse iteration direction. + if (sets.length == 0) + { + assertDeletionAwareEqual(message + " forward b" + bits, merged, trie); + } + else + { + for (int toRemove = 0; toRemove < sets.length; ++toRemove) + { + List ranges = sets[toRemove]; + InMemoryDeletionAwareTrie adding = fromList(ranges); + if (VERBOSE) + { + System.out.println("Adding: " + ranges); + dumpDeletionAwareTrie(adding); + } + testMergeWith(message + " " + toRemove, + trie.mergeWith(adding, LivePoint::combine, DeletionMarker::combine, DeletionMarker::applyTo, false), + mergeLists(merged, ranges), + Arrays.stream(sets) + .filter(x -> x != ranges) + .toArray(List[]::new) + ); + } + } + } + + public void testCollectionMerge(String message, List> triesToMerge, List merged, List... 
sets) + { + if (VERBOSE) + System.out.println("Markers: " + merged); + verify(merged); + // Test that intersecting the given trie with the given sets, in any order, results in the expected list. + // Checks both forward and reverse iteration direction. + if (sets.length == 0) + { + if (VERBOSE) + { + System.out.println("Sources:"); + triesToMerge.forEach(DataPoint::dumpDeletionAwareTrie); + } + + DeletionAwareTrie trie = DeletionAwareTrie.merge(triesToMerge, + LivePoint::combineCollection, + DeletionMarker::combineCollection, + DeletionMarker::applyTo, + false); + if (VERBOSE) + { + System.out.println("Result:"); + dumpDeletionAwareTrie(trie); + } + + assertDeletionAwareEqual(message + " forward b" + bits, merged, trie); + } + else + { + for (int toRemove = 0; toRemove < sets.length; ++toRemove) + { + List ranges = sets[toRemove]; + if (VERBOSE) + System.out.println("Adding: " + ranges); + triesToMerge.add(fromList(ranges)); + testCollectionMerge(message + " " + toRemove, + triesToMerge, + mergeLists(merged, ranges), + Arrays.stream(sets) + .filter(x -> x != ranges) + .toArray(List[]::new) + ); + triesToMerge.remove(triesToMerge.size() - 1); + } + } + } + + public void testMergeInMemoryTrie(String message, DeletionAwareTrie trie, List merged, List... sets) + { + if (VERBOSE) + { + System.out.println("Markers: " + merged); + dumpDeletionAwareTrie(trie); + } + verify(merged); + // Test that intersecting the given trie with the given sets, in any order, results in the expected list. + // Checks both forward and reverse iteration direction. 
+ if (sets.length == 0) + { + assertDeletionAwareEqual(message + " forward b" + bits, merged, trie); + } + else + { + try + { + for (int toRemove = 0; toRemove < sets.length; ++toRemove) + { + List ranges = sets[toRemove]; + InMemoryDeletionAwareTrie adding = fromList(ranges); + if (VERBOSE) + { + System.out.println("Adding: " + ranges); + dumpDeletionAwareTrie(adding); + } + var dupe = duplicateTrie(trie); + dupe.apply(adding, + DataPoint::combineLive, + DataPoint::combineDeletion, + DataPoint::deleteLive, + DataPoint::deleteLive, + false, + v -> false); + testMergeInMemoryTrie(message + " " + toRemove, + dupe, + mergeLists(merged, ranges), + Arrays.stream(sets) + .filter(x -> x != ranges) + .toArray(List[]::new) + ); + } + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + } + + public void testMergeInMemoryTrieIntoSet(String message, DeletionAwareTrie trie, List merged, List... sets) + { + if (VERBOSE) + { + System.out.println("Markers: " + merged); + dumpDeletionAwareTrie(trie); + } + verify(merged); + // Test that intersecting the given trie with the given sets, in any order, results in the expected list. + // Checks both forward and reverse iteration direction. 
+ if (sets.length == 0) + { + assertDeletionAwareEqual(message + " forward b" + bits, merged, trie); + } + else + { + try + { + for (int toRemove = 0; toRemove < sets.length; ++toRemove) + { + List ranges = sets[toRemove]; + var set = fromList(ranges); + if (VERBOSE) + { + System.out.println("Adding: " + ranges); + dumpDeletionAwareTrie(set); + } + set.apply(trie, + DataPoint::combineLive, + DataPoint::combineDeletion, + DataPoint::deleteLive, + DataPoint::deleteLive, + false, + v -> false); + testMergeInMemoryTrieIntoSet(message + " " + toRemove, + set, + mergeLists(merged, ranges), + Arrays.stream(sets) + .filter(x -> x != ranges) + .toArray(List[]::new) + ); + } + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + } + + InMemoryDeletionAwareTrie duplicateTrie(DeletionAwareTrie trie) + { + try + { + InMemoryDeletionAwareTrie copy = InMemoryDeletionAwareTrie.shortLived(VERSION); + copy.apply(trie, + DataPoint::combineLive, + DataPoint::combineDeletion, + DataPoint::deleteLive, + DataPoint::deleteLive, + false, + v -> false); + return copy; + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + + DeletionMarker delete(int deletionTime, DeletionMarker marker) + { + if (deletionTime < 0 || marker == null) + return marker; + + int newLeft = Math.max(deletionTime, marker.leftSide); + int newRight = Math.max(deletionTime, marker.rightSide); + if (newLeft < 0 && newRight < 0 || newLeft == newRight) + return null; + if (newLeft == marker.leftSide && newRight == marker.rightSide) + return marker; + return new DeletionMarker(marker.position, newLeft, newRight); + } + + LivePoint delete(int deletionTime, LivePoint marker) + { + if (deletionTime < 0 || marker == null) + return marker; + return marker.delete(deletionTime); + } + + DataPoint delete(int deletionTime, DataPoint marker) + { + LivePoint live = delete(deletionTime, marker.live()); + DeletionMarker deletion = delete(deletionTime, marker.marker()); 
+ return DataPoint.resolve(live, deletion); + } + + int leftSide(DataPoint point) + { + if (point.marker() == null) + return -1; + return point.marker().leftSide; + } + + int rightSide(DataPoint point) + { + if (point.marker() == null) + return -1; + return point.marker().rightSide; + } + + List mergeLists(List left, List right) + { + int active = -1; + Iterator rightIt = right.iterator(); + DataPoint nextRight = rightIt.hasNext() ? rightIt.next() : null; + List result = new ArrayList<>(); + for (DataPoint nextLeft : left) + { + while (true) + { + int cmp; + if (nextRight == null) + cmp = -1; + else + cmp = ByteComparable.compare(nextLeft.position(), nextRight.position(), VERSION); + + if (cmp < 0) + { + maybeAdd(result, nextRight != null ? delete(leftSide(nextRight), nextLeft) : nextLeft); + break; + } + + if (cmp == 0) + { + if (nextLeft.marker() == null) + nextRight = delete(active, nextRight); + if (nextRight != null) + maybeAdd(result, DataPoint.combine(nextRight, nextLeft).toContent()); + else + maybeAdd(result, nextLeft); + + nextRight = rightIt.hasNext() ? rightIt.next() : null; + break; + } + else + { + // Must close active if it becomes covered, and must open active if it is no longer covered. + maybeAdd(result, delete(active, nextRight)); + } + + nextRight = rightIt.hasNext() ? rightIt.next() : null; + } + if (nextLeft.marker() != null) + active = nextLeft.marker().rightSide; + } + assert active == -1; + while (nextRight != null) + { + maybeAdd(result, delete(active, nextRight));// deletion is not needed (active == -1), do just in case + nextRight = rightIt.hasNext() ? 
rightIt.next() : null; + } + return result; + } + + static void maybeAdd(List list, T value) + { + if (value == null) + return; + list.add(value); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/DeletionAwareRandomizedTest.java b/test/unit/org/apache/cassandra/db/tries/DeletionAwareRandomizedTest.java new file mode 100644 index 000000000000..85b4a7017332 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/DeletionAwareRandomizedTest.java @@ -0,0 +1,529 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.quicktheories.core.Gen; + +import static java.util.Arrays.asList; +import static org.apache.cassandra.db.tries.DataPoint.fromList; +import static org.apache.cassandra.db.tries.DataPoint.toList; +import static org.apache.cassandra.db.tries.DataPoint.verify; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.quicktheories.QuickTheory.qt; +import static org.quicktheories.generators.SourceDSL.booleans; +import static org.quicktheories.generators.SourceDSL.integers; +import static org.quicktheories.generators.SourceDSL.lists; + +/// Randomized property-based testing for deletion-aware tries using QuickTheories. +/// +/// This test class uses the existing deletion-aware trie test infrastructure to perform +/// comprehensive randomized testing of trie operations, merging, and deletion handling. +/// It complements the structured tests in [DeletionAwareMergeTest] with property-based +/// testing to catch edge cases and verify invariants across a wide range of inputs. +@RunWith(Parameterized.class) +public class DeletionAwareRandomizedTest extends DeletionAwareTestBase +{ + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + private static final int MAX_POINTS = 20; + private static final int MAX_VALUE = 63; // Fits in 6 bits (bitsNeeded) + private static final int MAX_TIMESTAMP = 100; + + /// Generator for random live data points. 
+    /// Creates `LivePoint` instances with random positions and timestamps.
+    ///
+    /// Positions are drawn from `[0, MAX_VALUE]` and timestamps from `[1, MAX_TIMESTAMP]`.
+    private Gen<LivePoint> livePointGen()
+    {
+        Gen<Integer> positions = integers().between(0, MAX_VALUE);
+        Gen<Integer> timestamps = integers().between(1, MAX_TIMESTAMP);
+        return positions.zip(timestamps, (pos, ts) -> new LivePoint(at(pos), ts));
+    }
+
+    /// Generator for random live point lists.
+    /// Creates lists of up to `MAX_POINTS` `LivePoint` instances (typed as `DataPoint`), passed through
+    /// [#sortAndValidateDataPoints] so they are ordered by position before trie construction.
+    private Gen<List<DataPoint>> dataPointListGen()
+    {
+        return lists().of(livePointGen().map(lp -> (DataPoint) lp))
+                      .ofSizeBetween(0, MAX_POINTS)
+                      .map(this::sortAndValidateDataPoints);
+    }
+
+    /// Generator for pairs of data point lists for merge testing.
+    /// Each generated value is a two-element list holding two independently generated point lists.
+    private Gen<List<List<DataPoint>>> dataPointPairGen()
+    {
+        return dataPointListGen().zip(dataPointListGen(), Arrays::asList);
+    }
+
+    /// Compares two data points by their position using the trie byte-comparable version.
+    private static int comparePositions(DataPoint a, DataPoint b)
+    {
+        return ByteComparable.compare(a.position(), b.position(), TrieUtil.VERSION);
+    }
+
+    /// Sorts data points by position and ensures they form a valid deletion-aware sequence.
+    ///
+    /// @param points generated points, in any order and possibly with repeated positions
+    /// @return `points` itself when it is empty; otherwise the result of `verify` on the sorted list with
+    ///         one point per position, or an empty list if `verify` throws an `AssertionError`
+    private List<DataPoint> sortAndValidateDataPoints(List<DataPoint> points)
+    {
+        if (points.isEmpty())
+            return points;
+
+        // Sort by position. The sort of an ordered stream is stable, so points that share a position
+        // stay in their generated order.
+        List<DataPoint> sorted = points.stream()
+                                       .sorted(DeletionAwareRandomizedTest::comparePositions)
+                                       .collect(Collectors.toList());
+
+        // Keep one point per position: a later point in the sorted order replaces the earlier one.
+        List<DataPoint> deduplicated = new ArrayList<>(sorted.size());
+        for (DataPoint point : sorted)
+        {
+            int last = deduplicated.size() - 1;
+            if (last >= 0 && comparePositions(deduplicated.get(last), point) == 0)
+                deduplicated.set(last, point);
+            else
+                deduplicated.add(point);
+        }
+
+        try
+        {
+            return verify(deduplicated);
+        }
+        catch (AssertionError ignored)
+        {
+            // Deliberate best effort: a sequence rejected by verify is replaced with an empty list,
+            // which the property tests skip, rather than failing the generator.
+            return Collections.emptyList();
+        }
+    }
+
+    /// Test that trie construction and iteration are consistent.
+    ///
+    /// **Property**: Converting a list to a trie and back should yield the same list.
+ @Test + public void testTrieConstructionConsistency() + { + qt().forAll(dataPointListGen(), booleans().all()) + .checkAssert((dataPoints, forceCopy) -> { + if (dataPoints.isEmpty()) + return; // Skip empty lists + + DeletionAwareTrie trie = fromList(dataPoints, forceCopy); + List reconstructed = toList(trie); + + assertEquals("Trie construction should be consistent with iteration", + dataPoints, reconstructed); + }); + } + + /// Test that merging tries is commutative for the same resolver. + /// + /// **Property**: `merge(A, B)` should equal `merge(B, A)` when using the same merge resolver. + @Test + public void testMergeCommutativity() + { + qt().forAll(dataPointPairGen(), booleans().all()) + .checkAssert((pair, forcedCopy) -> { + List list1 = pair.get(0); + List list2 = pair.get(1); + + if (list1.isEmpty() && list2.isEmpty()) + return; // Skip empty merges + + DeletionAwareTrie trie1 = fromList(list1, forcedCopy); + DeletionAwareTrie trie2 = fromList(list2, forcedCopy); + + // Merge in both directions + DeletionAwareTrie merged1to2 = + trie1.mergeWith(trie2, LivePoint::combine, DeletionMarker::combine, DeletionMarker::applyTo, false); + DeletionAwareTrie merged2to1 = + trie2.mergeWith(trie1, LivePoint::combine, DeletionMarker::combine, DeletionMarker::applyTo, false); + + List result1 = toList(merged1to2); + List result2 = toList(merged2to1); + + assertEquals("Merge should be commutative", result1, result2); + }); + } + + /// Test that merging with an empty trie is an identity operation. + /// + /// **Property**: `merge(A, empty)` should equal `A`. 
+ @Test + public void testMergeIdentity() + { + qt().forAll(dataPointListGen(), booleans().all()) + .checkAssert((dataPoints, forcedCopy) -> { + if (dataPoints.isEmpty()) + return; // Skip empty lists + + DeletionAwareTrie trie = fromList(dataPoints, forcedCopy); + DeletionAwareTrie empty = fromList(Collections.emptyList(), forcedCopy); + + DeletionAwareTrie merged = + trie.mergeWith(empty, LivePoint::combine, DeletionMarker::combine, DeletionMarker::applyTo, false); + + List original = toList(trie); + List result = toList(merged); + + assertEquals("Merging with empty trie should be identity", original, result); + }); + } + + /// Test that subtrie operations preserve ordering and boundaries. + /// + /// **Property**: A subtrie should contain only elements within the specified range. + @Test + public void testSubtrieRangeInvariant() + { + qt().forAll(dataPointListGen() + .zip(integers().between(0, MAX_VALUE), + integers().between(0, MAX_VALUE), + (points, start, end) -> { + int left = Math.min(start, end); + int right = Math.max(start, end); + return asList(points, left, right); + }), booleans().all()) + .checkAssert((params, forcedCopy) -> { + @SuppressWarnings("unchecked") + List dataPoints = (List) params.get(0); + int left = (Integer) params.get(1); + int right = (Integer) params.get(2); + + if (dataPoints.isEmpty()) + return; // Skip empty lists + + DeletionAwareTrie trie = fromList(dataPoints, forcedCopy); + DeletionAwareTrie subtrie = + trie.subtrie(before(left), before(right)); + + List subtriePoints = toList(subtrie); + + // Verify all points in subtrie are within range + for (DataPoint dp : subtriePoints) + { + ByteComparable pos = dp.position(); + int cmp1 = ByteComparable.compare(pos, before(left), TrieUtil.VERSION); + int cmp2 = ByteComparable.compare(pos, before(right), TrieUtil.VERSION); + + if (cmp1 < 0 || cmp2 >= 0) + { + throw new AssertionError( + String.format("Point %s outside subtrie range [%s, %s)", + 
pos.byteComparableAsString(TrieUtil.VERSION), + before(left).byteComparableAsString(TrieUtil.VERSION), + before(right).byteComparableAsString(TrieUtil.VERSION))); + } + } + }); + } + + /// Test that the optimized `MergeCursor` produces the same results as the safe version. + /// + /// **Property**: Optimized and safe merge cursors should produce identical results. + @Test + public void testOptimizedMergeCursorEquivalence() + { + qt().forAll(dataPointPairGen(), booleans().all()) + .checkAssert((pair, forcedCopy) -> { + List list1 = pair.get(0); + List list2 = pair.get(1); + + if (list1.isEmpty() || list2.isEmpty()) + return; // Skip cases with empty lists + + DeletionAwareTrie trie1 = fromList(list1, forcedCopy); + DeletionAwareTrie trie2 = fromList(list2, forcedCopy); + + // Test both optimized and safe merge using the trie API + DeletionAwareTrie safeMerge = + trie1.mergeWith(trie2, LivePoint::combine, DeletionMarker::combine, DeletionMarker::applyTo, false); + + // Create optimized merge + DeletionAwareTrie optimizedMerge = + trie1.mergeWith(trie2, LivePoint::combine, DeletionMarker::combine, DeletionMarker::applyTo, true); + + List safeResult = toList(safeMerge); + List optimizedResult = toList(optimizedMerge); + + assertEquals("Optimized and safe merge cursors should produce identical results", + safeResult, optimizedResult); + }); + } + + /// Test that deletion markers properly delete live data within their ranges. + /// + /// **Property**: Live data covered by deletion ranges should be removed or modified. 
+ @Test + public void testDeletionApplicationInvariant() + { + qt().forAll(integers().between(0, MAX_VALUE), + integers().between(1, MAX_TIMESTAMP), + integers().between(1, MAX_TIMESTAMP)) + .checkAssert((pos, liveTs, deleteTs) -> { + // Create a live point and a deletion that should affect it + LivePoint live = new LivePoint(at(pos), liveTs); + DeletionMarker deletion = new DeletionMarker(before(pos), -1, deleteTs); + + // Apply deletion to live data + LivePoint result = deletion.applyTo(live); + + if (deleteTs > liveTs) + { + // Deletion should remove the live data (return null) + assertNull("Live data should be deleted when deletion timestamp > live timestamp", + result); + } + else + { + // Deletion should not affect live data with same or newer timestamp + assertEquals("Live data should not be affected by older or equal deletions", live, result); + } + }); + } + + /// Test that trie operations maintain structural invariants. + /// + /// **Property**: All trie operations should preserve the trie's structural integrity. + @Test + public void testTrieStructuralInvariants() + { + qt().forAll(dataPointListGen(), booleans().all()) + .checkAssert((dataPoints, forcedCopy) -> { + if (dataPoints.isEmpty()) + return; // Skip empty lists + + DeletionAwareTrie trie = fromList(dataPoints, forcedCopy); + + // Test that trie construction is consistent + List reconstructed = toList(trie); + + // Verify that the reconstructed list maintains ordering + for (int i = 1; i < reconstructed.size(); i++) + { + ByteComparable prev = reconstructed.get(i - 1).position(); + ByteComparable curr = reconstructed.get(i).position(); + int cmp = ByteComparable.compare(prev, curr, TrieUtil.VERSION); + + if (cmp > 0) + { + throw new AssertionError( + String.format("Trie ordering violation: %s > %s", + prev.byteComparableAsString(TrieUtil.VERSION), + curr.byteComparableAsString(TrieUtil.VERSION))); + } + } + }); + } + + /// Test that range operations are consistent with full trie operations. 
+ /// + /// **Property**: Operating on a range should be equivalent to filtering the full result. + @Test + public void testRangeOperationConsistency() + { + qt().forAll(dataPointListGen() + .zip(integers().between(0, MAX_VALUE), + integers().between(0, MAX_VALUE), + (points, start, end) -> { + int left = Math.min(start, end); + int right = Math.max(start, end); + return asList(points, left, right); + }), + booleans().all()) + .checkAssert((params, forcedCopy) -> { + @SuppressWarnings("unchecked") + List dataPoints = (List) params.get(0); + int left = (Integer) params.get(1); + int right = (Integer) params.get(2); + + if (dataPoints.isEmpty()) + return; // Skip empty lists + + DeletionAwareTrie trie = fromList(dataPoints, forcedCopy); + + // Get subtrie result + DeletionAwareTrie subtrie = + trie.subtrie(before(left), before(right)); + List subtrieResult = toList(subtrie); + + // Get filtered full result + List fullResult = toList(trie); + List filteredResult = fullResult.stream() + .filter(dp -> { + ByteComparable pos = dp.position(); + int cmp1 = ByteComparable.compare(pos, before(left), TrieUtil.VERSION); + int cmp2 = ByteComparable.compare(pos, before(right), TrieUtil.VERSION); + return cmp1 >= 0 && cmp2 < 0; + }) + .collect(Collectors.toList()); + + assertEquals("Subtrie should be equivalent to filtering full trie", + filteredResult, subtrieResult); + }); + } + + /// Test that merge operations are associative. + /// + /// **Property**: `merge(merge(A, B), C)` should equal `merge(A, merge(B, C))`. 
+ @Test + public void testMergeAssociativity() + { + qt().forAll(dataPointListGen() + .zip(dataPointListGen(), dataPointListGen(), Arrays::asList), + booleans().all()) + .checkAssert((triple, forcedCopy) -> { + List list1 = triple.get(0); + List list2 = triple.get(1); + List list3 = triple.get(2); + + if (list1.isEmpty() && list2.isEmpty() && list3.isEmpty()) + return; // Skip all empty + + DeletionAwareTrie trie1 = fromList(list1, forcedCopy); + DeletionAwareTrie trie2 = fromList(list2, forcedCopy); + DeletionAwareTrie trie3 = fromList(list3); + + // Test (A merge B, forcedCopy) merge C + DeletionAwareTrie ab = + trie1.mergeWith(trie2, LivePoint::combine, DeletionMarker::combine, DeletionMarker::applyTo, false); + DeletionAwareTrie ab_c = + ab.mergeWith(trie3, LivePoint::combine, DeletionMarker::combine, DeletionMarker::applyTo, false); + + // Test A merge (B merge C) + DeletionAwareTrie bc = + trie2.mergeWith(trie3, LivePoint::combine, DeletionMarker::combine, DeletionMarker::applyTo, false); + DeletionAwareTrie a_bc = + trie1.mergeWith(bc, LivePoint::combine, DeletionMarker::combine, DeletionMarker::applyTo, false); + + List result1 = toList(ab_c); + List result2 = toList(a_bc); + + assertEquals("Merge should be associative", result1, result2); + }); + } + + + /// Test collection merge functionality using randomized property-based testing. + /// This test verifies that merging multiple tries using collection merge produces + /// the same result as sequential pairwise merges. 
+ @Test + public void testCollectionMerge() + { + qt().forAll(dataPointListGen().zip(dataPointListGen(), dataPointListGen(), Arrays::asList), booleans().all()) + .checkAssert((triple, forcedCopy) -> { + List list1 = triple.get(0); + List list2 = triple.get(1); + List list3 = triple.get(2); + + // Skip cases where all tries are empty or where we have empty tries mixed with non-empty ones + // Collection merge requires at least one non-empty trie and can't handle mixed empty/non-empty + boolean hasEmpty = list1.isEmpty() || list2.isEmpty() || list3.isEmpty(); + boolean hasNonEmpty = !list1.isEmpty() || !list2.isEmpty() || !list3.isEmpty(); + + if (!hasNonEmpty || hasEmpty) + return; // Skip if all empty or if we have any empty tries + + DeletionAwareTrie trie1 = fromList(list1, forcedCopy); + DeletionAwareTrie trie2 = fromList(list2, forcedCopy); + DeletionAwareTrie trie3 = fromList(list3, forcedCopy); + + // Test collection merge + DeletionAwareTrie collectionMerged = + DeletionAwareTrie.merge(Arrays.asList(trie1, trie2, trie3), + LivePoint::combineCollection, + DeletionMarker::combineCollection, + DeletionMarker::applyTo, + false); + + List collectionResult = toList(collectionMerged); + + // Test pairwise merge for comparison + DeletionAwareTrie pairwise12 = + trie1.mergeWith(trie2, LivePoint::combine, DeletionMarker::combine, DeletionMarker::applyTo, false); + DeletionAwareTrie pairwiseMerged = + pairwise12.mergeWith(trie3, LivePoint::combine, DeletionMarker::combine, DeletionMarker::applyTo, false); + + List pairwiseResult = toList(pairwiseMerged); + + assertEquals("Collection merge should equal pairwise merge", pairwiseResult, collectionResult); + }); + } + + /// Test that the optimized collection merge produces the same results as the safe version. + /// This verifies that the deletionsAtFixedPoints optimization works correctly for collection merges. 
+ @Test + public void testOptimizedCollectionMerge() + { + qt().forAll(dataPointListGen().zip(dataPointListGen(), dataPointListGen(), Arrays::asList), booleans().all()) + .checkAssert((triple, forcedCopy) -> { + List list1 = triple.get(0); + List list2 = triple.get(1); + List list3 = triple.get(2); + + // Skip cases where all tries are empty or where we have empty tries mixed with non-empty ones + // Collection merge requires at least one non-empty trie and can't handle mixed empty/non-empty + boolean hasEmpty = list1.isEmpty() || list2.isEmpty() || list3.isEmpty(); + boolean hasNonEmpty = !list1.isEmpty() || !list2.isEmpty() || !list3.isEmpty(); + + if (!hasNonEmpty || hasEmpty) + return; // Skip if all empty or if we have any empty tries + + DeletionAwareTrie trie1 = fromList(list1, forcedCopy); + DeletionAwareTrie trie2 = fromList(list2, forcedCopy); + DeletionAwareTrie trie3 = fromList(list3); + + // Test safe collection merge (deletionsAtFixedPoints = false, forcedCopy) + DeletionAwareTrie safeMerged = dir -> + new CollectionMergeCursor.DeletionAware<>( + LivePoint::combineCollection, + DeletionMarker::combineCollection, + DeletionMarker::applyTo, + false, + dir, + Arrays.asList(trie1, trie2, trie3), + DeletionAwareTrie::cursor); + + // Test optimized collection merge (deletionsAtFixedPoints = true) + DeletionAwareTrie optimizedMerged = dir -> + new CollectionMergeCursor.DeletionAware<>( + LivePoint::combineCollection, + DeletionMarker::combineCollection, + DeletionMarker::applyTo, + true, + dir, + Arrays.asList(trie1, trie2, trie3), + DeletionAwareTrie::cursor); + + List safeResult = toList(safeMerged); + List optimizedResult = toList(optimizedMerged); + + assertEquals("Optimized and safe collection merge should produce identical results", + safeResult, optimizedResult); + }); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/DeletionAwareTailTrieTest.java b/test/unit/org/apache/cassandra/db/tries/DeletionAwareTailTrieTest.java new file mode 100644 
index 000000000000..acd6e86d1b60 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/DeletionAwareTailTrieTest.java @@ -0,0 +1,519 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import com.google.common.base.Predicates; +import com.google.common.collect.Streams; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.apache.cassandra.db.tries.TrieUtil.directComparable; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +/** + * Tests for DeletionAwareTrie#tailTrie and #tailTries methods, specifically focusing on + * the includeCoveringDeletions flag functionality. 
+ */ +public class DeletionAwareTailTrieTest +{ + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + static ByteComparable EMPTY = ByteComparable.preencoded(VERSION, new byte[0]); + static String value1 = "value1"; + static String value2 = "value2"; + static String value3 = "value3"; + static String value4 = "value4"; + static String value5 = "value5"; + static String value6 = "value6"; + static String value7 = "value7"; + static ByteComparable key1 = directComparable("partition/key1"); + static ByteComparable key2 = directComparable("partition/key2"); + static ByteComparable key3 = directComparable("partition/key3"); + static ByteComparable key4 = directComparable("partition/key4"); + static ByteComparable key5 = directComparable("partition/key5"); + static ByteComparable key6 = directComparable("partition/key6"); + static ByteComparable key7 = directComparable("partition/key7"); + static ByteComparable partition = directComparable("partition"); + + static InMemoryDeletionAwareTrie trie = makeTrie(); + + @Test + public void testTailAtPartitionWithCoveringDeletionIncluded() + { + testTailTrie("partition", true, + EMPTY, TestRangeState.open(100), + directComparable("/key1"), value1, + directComparable("/key2"), TestRangeState.boundary(100, 200), + directComparable("/key2"), TestRangeState.boundary(200, 100), + directComparable("/key3"), value3, + directComparable("/key4"), TestRangeState.boundary(100, 150), + directComparable("/key4"), value4, + directComparable("/key5"), TestRangeState.boundary(150, 200), + directComparable("/key5"), value5, + directComparable("/key5"), TestRangeState.boundary(200, 150), + directComparable("/key6"), value6, + directComparable("/key7"), value7, + directComparable("/key7"), TestRangeState.boundary(150, 100), + EMPTY, TestRangeState.close(100)); + } + + @Test + public void testTailBelowPartitionWithCoveringDeletionIncluded() + { + testTailTrie("partition/key", 
true, + EMPTY, TestRangeState.open(100), + directComparable("1"), value1, + directComparable("2"), TestRangeState.boundary(100, 200), + directComparable("2"), TestRangeState.boundary(200, 100), + directComparable("3"), value3, + directComparable("4"), TestRangeState.boundary(100, 150), + directComparable("4"), value4, + directComparable("5"), TestRangeState.boundary(150, 200), + directComparable("5"), value5, + directComparable("5"), TestRangeState.boundary(200, 150), + directComparable("6"), value6, + directComparable("7"), value7, + directComparable("7"), TestRangeState.boundary(150, 100), + EMPTY, TestRangeState.close(100)); + } + + @Test + public void testTailAtDataKeyWithCoveringDeletionIncluded() + { + testTailTrie("partition/key3", true, + EMPTY, TestRangeState.open(100), + EMPTY, value3, + EMPTY, TestRangeState.close(100)); + } + + @Test + public void testTailAtDeletedKeyWithCoveringDeletionIncluded() + { + testTailTrie("partition/key2", true, + EMPTY, TestRangeState.open(200), + EMPTY, TestRangeState.close(200)); + } + + @Test + public void testTailAtDataKeyWithCoveringRangeDeletionIncluded() + { + testTailTrie("partition/key6", true, + EMPTY, TestRangeState.open(150), + EMPTY, value6, + EMPTY, TestRangeState.close(150)); + } + + @Test + public void testTailAtDeletedKeyWithCoveringRangeDeletionIncluded() + { + testTailTrie("partition/key5", true, + EMPTY, TestRangeState.open(200), + EMPTY, value5, + EMPTY, TestRangeState.close(200)); + } + + @Test + public void testTriesCoveringIncluded() + { + testTailTries(true, + tail("partition/key1", + EMPTY, TestRangeState.open(100), + EMPTY, value1, + EMPTY, TestRangeState.close(100)), + tail("partition/key3", + EMPTY, TestRangeState.open(100), + EMPTY, value3, + EMPTY, TestRangeState.close(100)), + tail("partition/key4", + EMPTY, TestRangeState.open(150), + EMPTY, value4, + EMPTY, TestRangeState.close(150)), + tail("partition/key5", + EMPTY, TestRangeState.open(200), + EMPTY, value5, + EMPTY, 
TestRangeState.close(200)), + tail("partition/key6", + EMPTY, TestRangeState.open(150), + EMPTY, value6, + EMPTY, TestRangeState.close(150)), + tail("partition/key7", + EMPTY, TestRangeState.open(150), + EMPTY, value7, + EMPTY, TestRangeState.close(150))); + } + + @Test + public void testTailAtPartitionWithCoveringDeletionExcluded() + { + testTailTrie("partition", false, + EMPTY, TestRangeState.open(100), + directComparable("/key1"), value1, + directComparable("/key2"), TestRangeState.boundary(100, 200), + directComparable("/key2"), TestRangeState.boundary(200, 100), + directComparable("/key3"), value3, + directComparable("/key4"), TestRangeState.boundary(100, 150), + directComparable("/key4"), value4, + directComparable("/key5"), TestRangeState.boundary(150, 200), + directComparable("/key5"), value5, + directComparable("/key5"), TestRangeState.boundary(200, 150), + directComparable("/key6"), value6, + directComparable("/key7"), value7, + directComparable("/key7"), TestRangeState.boundary(150, 100), + EMPTY, TestRangeState.close(100)); + } + + @Test + public void testTailBelowPartitionWithCoveringDeletionExcluded() + { + testTailTrie("partition/key", false, + directComparable("1"), value1, + directComparable("2"), TestRangeState.open(200), + directComparable("2"), TestRangeState.close(200), + directComparable("3"), value3, + directComparable("4"), TestRangeState.open(150), + directComparable("4"), value4, + directComparable("5"), TestRangeState.boundary(150, 200), + directComparable("5"), value5, + directComparable("5"), TestRangeState.boundary(200, 150), + directComparable("6"), value6, + directComparable("7"), value7, + directComparable("7"), TestRangeState.close(150)); + } + + @Test + public void testTailAtDataKeyWithCoveringDeletionExcluded() + { + testTailTrie("partition/key3", false, + EMPTY, value3); + } + + @Test + public void testTailAtDataKeyWithCoveringRangeDeletionExcluded() + { + testTailTrie("partition/key6", false, + EMPTY, value6); + } + + @Test + 
public void testTailAtDeletedKeyWithCoveringRangeDeletionExcluded() + { + testTailTrie("partition/key5", false, + EMPTY, TestRangeState.open(200), + EMPTY, value5, + EMPTY, TestRangeState.close(200)); + } + + @Test + public void testTriesCoveringExcluded() + { + testTailTries(false, + tail("partition/key1", + EMPTY, value1), + tail("partition/key3", + EMPTY, value3), + tail("partition/key4", + // Note: Because in forward the range deletion starts in this branch, the ignoreCoveringDeletions + // option does not treat it as covering and must close it at the end of the tail. + arr(EMPTY, TestRangeState.open(150), + EMPTY, value4, + EMPTY, TestRangeState.close(150)), + arr(EMPTY, value4)), + tail("partition/key5", + EMPTY, TestRangeState.open(200), + EMPTY, value5, + EMPTY, TestRangeState.close(200)), + tail("partition/key6", + EMPTY, value6), + tail("partition/key7", + arr(EMPTY, value7), + // Note: Because in reverse the range deletion starts in this branch, the ignoreCoveringDeletions + // option does not treat it as covering and must close it at the end of the tail. + arr(EMPTY, TestRangeState.open(150), + EMPTY, value7, + EMPTY, TestRangeState.close(150)))); + } + + static Object[] arr(Object... data) + { + return data; + } + + static class TailExpectations + { + final ByteComparable key; + final Object[] forwardExpectations; + final Object[] reverseExpectations; + + TailExpectations(ByteComparable key, Object[] forwardExpectations, Object[] reverseExpectations) + { + this.key = key; + this.forwardExpectations = forwardExpectations; + this.reverseExpectations = reverseExpectations; + } + } + + static TailExpectations tail(String key, Object... 
expectations) + { + return new TailExpectations(directComparable(key), expectations, expectations); + } + + static TailExpectations tail(String key, Object[] fwdExpectations, Object[] revExpectations) + { + return new TailExpectations(directComparable(key), fwdExpectations, revExpectations); + } + + void testTailTrie(String key, boolean includeCoveringDeletions, Object... expectedData) + { + testTailTrie(Direction.FORWARD, key, includeCoveringDeletions, expectedData); + testTailTrie(Direction.REVERSE, key, includeCoveringDeletions, expectedData); + } + void testTailTrie(Direction tailDirection, String key, boolean includeCoveringDeletions, Object... expectedData) + { + // Get the tail trie at the given key, passing the includeCoveringDeletions flag through + DeletionAwareTrie tail = trie.tailTrie(directComparable(key), includeCoveringDeletions); + assertNotNull("Tail trie should not be null", tail); + + // If any deletion state is expected, verify the tail has a deletion branch at its root + if (Stream.of(expectedData).anyMatch(TestRangeState.class::isInstance)) + { + DeletionAwareCursor cursor = tail.cursor(tailDirection); + RangeCursor deletionBranchCursor = cursor.deletionBranchCursor(tailDirection); + assertNotNull("Deletion branch should be present at root when including covering deletions", deletionBranchCursor); + } + + System.out.println(trie.dump()); + System.out.println(tail.dump()); + + // Verify the merged content of the tail matches the expectations (reversed pairwise for reverse iteration). + var list = collectEntriesAsList(tail, tailDirection); + var expected = tailDirection.isForward() ? Arrays.asList(expectedData) + : makeReversedExpectations(expectedData); + + assertEquals(expected, list); + } + + private List makeReversedExpectations(Object[] data) + { + // reverse the order of the (key, value) pairs, keeping each pair's internal order + List reversed = new ArrayList<>(); + for (int i = data.length - 2; i >= 0; i-=2) + { + reversed.add(data[i]); + reversed.add(data[i + 1]); + } + return reversed; + } + + private void testTailTries(boolean includeCoveringDeletions, TailExpectations...
tails) + { + testTailTries(Direction.FORWARD, includeCoveringDeletions, tails); + testTailTries(Direction.REVERSE, includeCoveringDeletions, tails); + } + + private void testTailTries(Direction direction, boolean includeCoveringDeletions, TailExpectations... tails) + { + int idx = direction.select(0, tails.length - 1); + for (var tailEntry : trie.tailTries(direction, Predicates.alwaysTrue(), includeCoveringDeletions)) + { + var tail = tails[idx]; + System.out.println("Trie at " + tailEntry.getKey().byteComparableAsString(VERSION)); + System.out.println(tailEntry.getValue().dump()); + // Check the key + assertEquals(0, ByteComparable.compare(tail.key, tailEntry.getKey(), VERSION)); + + // Verify the content of the tail includes all data. + var list = collectEntriesAsList(tailEntry.getValue(), direction); + var expected = direction.isForward() ? Arrays.asList(tail.forwardExpectations) + : makeReversedExpectations(tail.reverseExpectations); + + assertEquals(expected, list); + idx += direction.increase; + } + } + + private static InMemoryDeletionAwareTrie makeTrie() + { + try + { + // Create an in-memory trie with live data and deletions + InMemoryDeletionAwareTrie trie = InMemoryDeletionAwareTrie.shortLived(VERSION); + + // Add live data at partition/key1, partition/key2, partition/key3 + + trie.putRecursive(key1, value1, (e, u) -> u); + trie.putRecursive(key3, value3, (e, u) -> u); + trie.putRecursive(key4, value4, (e, u) -> u); + trie.putRecursive(key5, value5, (e, u) -> u); + trie.putRecursive(key6, value6, (e, u) -> u); + trie.putRecursive(key7, value7, (e, u) -> u); + + // Deletion at key2 + trie.apply(DeletionAwareTrie.deletedRange(key2, + EMPTY, + true, + EMPTY, + true, + VERSION, + TestRangeState.covering(200)), + (e, u) -> { + throw new AssertionError("Should not merge data"); + }, + TestRangeState::upsert, + (d, v) -> d, // keep covered data + (d, v) -> { + throw new AssertionError(); + }, + false, + x -> false); + // Deletion at key5 + 
trie.apply(DeletionAwareTrie.deletedRange(key5, + EMPTY, + true, + EMPTY, + true, + VERSION, + TestRangeState.covering(200)), + (e, u) -> { + throw new AssertionError("Should not merge data"); + }, + TestRangeState::upsert, + (d, v) -> d, // keep covered data + (d, v) -> { + throw new AssertionError(); + }, + false, + x -> false); + + // Deletion range key4-key7 inclusive + trie.apply(DeletionAwareTrie.deletedRange(partition, + directComparable("/key4"), + true, + directComparable("/key7"), + true, + VERSION, + TestRangeState.covering(150)), + (e, u) -> { + throw new AssertionError("Should not merge data"); + }, + TestRangeState::upsert, + (d, v) -> d, // keep covered data + (d, v) -> { + throw new AssertionError(); + }, + false, + x -> false); + + // Create a covering(100) deletion as a range-trie branch rooted at the empty key + TestRangeState deletion = TestRangeState.covering(100); + RangeTrie rangeTrie = RangeTrie.branch(EMPTY, VERSION, deletion); + + // Apply the deletion branch at partition level + DeletionAwareTrie deletionBranch = + DeletionAwareTrie.deletionBranch(partition, VERSION, rangeTrie); + + trie.apply(deletionBranch, + (e, u) -> { + throw new AssertionError("Should not merge data"); + }, + TestRangeState::upsert, + (d, v) -> d, // keep covered data + (d, v) -> { + throw new AssertionError(); + }, + false, + x -> false); + + return trie; + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + + private static List collectEntriesAsList(DeletionAwareTrie tail, Direction direction) + { + return Streams.stream(tail.mergedTrie((x, y) -> x != null + ? y != null && y.isBoundary() + ? Pair.create(y, x) // deletion first + : x + : y) + .entrySet(direction)) + .flatMap(en -> en.getValue() instanceof Pair + ?
Stream.of(en.getKey(), ((Pair) en.getValue()).left, + en.getKey(), ((Pair) en.getValue()).right) + : Stream.of(en.getKey(), en.getValue())) + .collect(Collectors.toList()); + } + + /** + * Test tailTries iteration with includeCoveringDeletions=true. + */ + @Test + public void testTailTriesWithCoveringDeletionsIncluded() throws Exception + { + InMemoryDeletionAwareTrie trie = makeTrie(); + + // Iterate with includeCoveringDeletions=true + List keys = new ArrayList<>(); + for (var entry : trie.tailTries(Direction.FORWARD, v -> v instanceof String, true)) + { + keys.add(entry.getKey()); + assertNotNull("Tail trie should not be null", entry.getValue()); + } + + assertTrue("Should have found some tail tries", keys.size() > 0); + } + + /** + * Test tailTries iteration with includeCoveringDeletions=false. + */ + @Test + public void testTailTriesWithCoveringDeletionsExcluded() throws Exception + { + InMemoryDeletionAwareTrie trie = makeTrie(); + + // Iterate with includeCoveringDeletions=false + List keys = new ArrayList<>(); + for (var entry : trie.tailTries(Direction.FORWARD, v -> v instanceof String, false)) + { + keys.add(entry.getKey()); + assertNotNull("Tail trie should not be null", entry.getValue()); + } + + assertTrue("Should have found some tail tries", keys.size() > 0); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/db/tries/DeletionAwareTestBase.java b/test/unit/org/apache/cassandra/db/tries/DeletionAwareTestBase.java new file mode 100644 index 000000000000..5dd347bf266b --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/DeletionAwareTestBase.java @@ -0,0 +1,263 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import org.junit.BeforeClass; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +import static org.apache.cassandra.db.tries.DataPoint.contentOnlyList; +import static org.apache.cassandra.db.tries.DataPoint.deletionOnlyList; +import static org.apache.cassandra.db.tries.DataPoint.toList; +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.junit.Assert.assertEquals; + +public class DeletionAwareTestBase +{ + /// Change to true to print debug info + static final boolean VERBOSE = false; + + static final int bitsNeeded = 6; + + @Parameterized.Parameters(name = "bits per transition {0}") + public static List data() + { + return IntStream.rangeClosed(1, bitsNeeded) + .boxed() + .collect(Collectors.toList()); + } + + @Parameterized.Parameter(0) + public int bits = bitsNeeded; + + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + private static String toString(ByteComparable
ranges) + { + if (ranges == null) + return "null"; + return ranges.byteComparableAsString(TrieUtil.VERSION); + } + + private static DeletionMarker makeActiveMarker(int active, int rangeIndex, ByteComparable nextRange) + { + if (active >= 0) // cmp > 0, must convert active to marker + { + if ((rangeIndex & 1) != 0) + return new DeletionMarker(nextRange, active, -1); + else + return new DeletionMarker(nextRange, -1, active); + } + return null; + } + + protected static void assertDeletionAwareEqual(String msg, List merged, DeletionAwareTrie trie) + { + try + { + assertEquals(msg, merged, toList(trie)); + assertEquals(msg + " live", + merged.stream().map(DataPoint::live).filter(x -> x != null).collect(Collectors.toList()), + contentOnlyList(trie)); + assertEquals(msg + " deletions", + merged.stream().map(DataPoint::marker).filter(x -> x != null).collect(Collectors.toList()), + deletionOnlyList(trie)); + System.out.println(msg + " matched."); + } + catch (AssertionError e) + { + System.out.println(); + DataPoint.dumpDeletionAwareTrie(trie); + throw e; + } + } + + static void maybeAdd(List list, T value) + { + if (value == null) + return; + list.add(value); + } + + /// Creates a [ByteComparable] for the provided value by splitting the integer into sequences of "bits" bits.
+ private ByteComparable of(int value, int terminator) + { + // TODO: Also in all other tests of this type + assert value >= 0 && value <= Byte.MAX_VALUE; + + byte[] splitBytes = new byte[(bitsNeeded + bits - 1) / bits + 1]; + int pos = 0; + int mask = (1 << bits) - 1; + for (int i = bitsNeeded - bits; i > 0; i -= bits) + splitBytes[pos++] = (byte) ((value >> i) & mask); + + splitBytes[pos++] = (byte) (value & mask); + splitBytes[pos++] = (byte) terminator; + return ByteComparable.preencoded(VERSION, splitBytes); + } + + ByteComparable at(int value) + { + return of(value, ByteSource.TERMINATOR); + } + + ByteComparable before(int value) + { + return of(value, ByteSource.LT_NEXT_COMPONENT); + } + + ByteComparable after(int value) + { + return of(value, ByteSource.GT_NEXT_COMPONENT); + } + + DeletionMarker from(int where, int value) + { + return new DeletionMarker(before(where), -1, value); + } + + DeletionMarker to(int where, int value) + { + return new DeletionMarker(before(where), value, -1); + } + + DeletionMarker change(int where, int from, int to) + { + return new DeletionMarker(before(where), from, to); + } + + DeletionMarker[] deletedPoint(int where, int value) + { + return deletedPointInside(where, value, -1); + } + + DeletionMarker[] deletedPointInside(int where, int value, int active) + { + return new DeletionMarker[] + { + new DeletionMarker(before(where), active, value), + new DeletionMarker(after(where), value, active) + }; + } + + DataPoint livePoint(int where, int timestamp) + { + return new LivePoint(at(where), timestamp); + } + + protected ByteComparable[] array(ByteComparable... data) + { + return data; + } + + protected List flatten(List pointsOrArrays) + { + return pointsOrArrays.stream() + .flatMap(x -> x instanceof DataPoint ? 
Stream.of((DataPoint) x) : Arrays.stream((DeletionMarker[]) x)) + .collect(Collectors.toList()); + } + + String toString(ByteComparable[] ranges) + { + StringBuilder b = new StringBuilder(); + for (int i = 0; i < ranges.length; i += 2) + { + b.append('['); + b.append(DeletionAwareTestBase.toString(ranges[i])); + b.append(';'); + b.append(DeletionAwareTestBase.toString(ranges[i + 1])); + b.append(')'); + } + return b.toString(); + } + + List intersect(List dataPoints, ByteComparable... ranges) + { + int rangeIndex = 0; + int active = -1; + ByteComparable nextRange = ranges[0]; + if (nextRange == null) + nextRange = ++rangeIndex < ranges.length ? ranges[rangeIndex] : null; + List result = new ArrayList<>(); + for (DataPoint dp : dataPoints) + { + DeletionMarker marker = dp.marker(); + while (true) + { + int cmp; + if (nextRange == null) + cmp = -1; + else + cmp = ByteComparable.compare(dp.position(), nextRange, TrieUtil.VERSION); + + if (cmp < 0) + { + if ((rangeIndex & 1) != 0) + DeletionAwareTestBase.maybeAdd(result, dp); + break; + } + + if (cmp == 0) + { + DeletionMarker adjustedMarker = marker != null ? marker : DeletionAwareTestBase.makeActiveMarker(active, rangeIndex, nextRange); + + if ((rangeIndex & 1) == 0) + DeletionAwareTestBase.maybeAdd(result, dp.withMarker(startOf(adjustedMarker))); + else + DeletionAwareTestBase.maybeAdd(result, dp.withMarker(endOf(adjustedMarker))); // live points are included at starts as well as ends + + nextRange = ++rangeIndex < ranges.length ? ranges[rangeIndex] : null; + break; + } + else + DeletionAwareTestBase.maybeAdd(result, DeletionAwareTestBase.makeActiveMarker(active, rangeIndex, nextRange)); + + nextRange = ++rangeIndex < ranges.length ? ranges[rangeIndex] : null; + } + if (marker != null) + active = marker.rightSide; + } + assert active == -1; + return result; + } + + DeletionMarker startOf(DeletionMarker marker) + { + return marker != null ? 
marker.restrict(false, true) : null; + } + + DeletionMarker endOf(DeletionMarker marker) + { + return marker != null ? marker.restrict(true, false) : null; + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/DeletionBranchConsistencyTest.java b/test/unit/org/apache/cassandra/db/tries/DeletionBranchConsistencyTest.java new file mode 100644 index 000000000000..8f56ddcb4285 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/DeletionBranchConsistencyTest.java @@ -0,0 +1,308 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.Collection; +import java.util.Map; +import java.util.function.BiFunction; +import java.util.function.Predicate; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.concurrent.OpOrder; + +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; + +/// Consistency test for [InMemoryDeletionAwareTrie] that validates concurrent operations +/// with both live data and deletion markers under different atomicity guarantees. 
+/// This test extends [ConsistencyTestBase] to verify that [InMemoryDeletionAwareTrie] maintains +/// correctness and consistency under concurrent access patterns typical of Cassandra's +/// memtable operations with deletions. +@SuppressWarnings("rawtypes") +public class DeletionBranchConsistencyTest +extends ConsistencyTestBase, + InMemoryDeletionAwareTrie> +{ + + @SuppressWarnings("rawtypes") // type does not matter, we are always throwing an exception + static final InMemoryBaseTrie.UpsertTransformer UPSERT_THROW = (x, y) -> { throw new AssertionError(); }; + @SuppressWarnings("rawtypes") // type does not matter, we are always throwing an exception + static final BiFunction BIFUNCTION_THROW = (x, y) -> { throw new AssertionError(); }; + + @Override + InMemoryDeletionAwareTrie makeTrie(OpOrder readOrder) + { + return InMemoryDeletionAwareTrie.longLived(VERSION, readOrder); + } + + @Override + TestStateMetadata value(ByteComparable b, ByteComparable cprefix, ByteComparable c, int add, int seqId) + { + String pk = b.byteComparableAsString(VERSION); + String ck = (cprefix != null ? 
cprefix.byteComparableAsString(VERSION) : "") + c.byteComparableAsString(VERSION); + return new TestStateMetadata<>(new Value(pk, ck, add, seqId)); + } + + @Override + TestStateMetadata metadata(ByteComparable b) + { + return new TestStateMetadata<>(new Metadata(b.byteComparableAsString(VERSION))); + } + + @Override + String pk(TestStateMetadata c) + { + return ((Content)c.metadata).pk; + } + + @Override + String ck(TestStateMetadata c) + { + return ((Value) c.metadata).ck; + } + + @Override + int seq(TestStateMetadata c) + { + return ((Value) c.metadata).seq; + } + + @Override + int value(TestStateMetadata c) + { + return ((Value) c.metadata).value; + } + + @Override + int updateCount(TestStateMetadata c) + { + return ((Metadata) c.metadata).updateCount; + } + + @Override + DeletionAwareTrie makeSingleton(ByteComparable b, TestStateMetadata content) + { + return DeletionAwareTrie.deletionBranch(ByteComparable.EMPTY, VERSION, RangeTrie.point(b, VERSION, true, content)); + } + + @Override + DeletionAwareTrie withRootMetadata(DeletionAwareTrie wrapped, TestStateMetadata metadata) + { + return TrieUtil.withRootMetadata(wrapped, metadata); + } + + @Override + DeletionAwareTrie merge(Collection> tries, + Trie.CollectionMergeResolver mergeResolver) + { + return DeletionAwareTrie.merge(tries, + mergeResolver, + Trie.throwingResolver(), + BIFUNCTION_THROW, + true); // deletionsAtFixedPoints = true for consistency + } + + @Override + void apply(InMemoryDeletionAwareTrie trie, + DeletionAwareTrie mutation, + InMemoryBaseTrie.UpsertTransformer mergeResolver, + Predicate> forcedCopyChecker, + Predicate> forcedCopyCheckerRanges) + throws TrieSpaceExhaustedException + { + trie.mutator(mergeResolver, + (x, y) -> mergeResolver.apply((TestStateMetadata) x, (TestStateMetadata) y), // Use the provided merge resolver for content + UPSERT_THROW, + BIFUNCTION_THROW, + false, + forcedCopyChecker, + forcedCopyCheckerRanges) + .apply(mutation); // Use the provided forced copy checker + } + 
+ @Override + void delete(InMemoryDeletionAwareTrie trie, + ByteComparable deletionPrefix, + TestRangeState partitionMarker, + RangeTrie deletionBranch, + InMemoryBaseTrie.UpsertTransformer mergeResolver, + Predicate> forcedCopyChecker, + Predicate> forcedCopyCheckerRanges) + throws TrieSpaceExhaustedException + { + DeletionAwareTrie deletion = DeletionAwareTrie.deletionBranch(ByteComparable.EMPTY, VERSION, deletionBranch); + deletion = TrieUtil.withRootMetadata(deletion, partitionMarker); + deletion = deletion.prefixedBy(deletionPrefix); + + trie.mutator(mergeResolver, + (existing, incoming) -> (existing instanceof TestStateMetadata) + ? mergeResolver.apply((TestStateMetadata) existing, incoming) + : TestRangeState.combine(existing, incoming), + mergeResolver, + BIFUNCTION_THROW, + false, + forcedCopyCheckerRanges, + forcedCopyCheckerRanges) + .apply(deletion); + } + + @Override + boolean isPartition(TestStateMetadata c) + { + return c != null && ((Content) c.metadata).isPartition(); + } + + @Override + TestStateMetadata mergeMetadata(TestStateMetadata c1, TestStateMetadata c2) + { + if (c1 == null) return c2; + if (c2 == null) return c1; + return toTestStateMetadata(((Metadata) c1.metadata).mergeWith((Metadata) c2.metadata)); + } + + @Override + TestStateMetadata deleteMetadata(TestStateMetadata existing, int entriesToRemove) + { + if (existing == null) return null; + return toTestStateMetadata(((Metadata) existing.metadata).delete(entriesToRemove)); + } + + @Override + Iterable> getEntrySet(BaseTrie trie) + { + return ((DeletionAwareTrie) trie) + .mergedTrie(DeletionBranchConsistencyTest::mergeStateAndMetadata) + .entrySet(); + } + + static TestStateMetadata mergeStateAndMetadata(TestStateMetadata m, TestRangeState s) + { + if (!(s instanceof TestStateMetadata)) + return m; + TestStateMetadata m2 = (TestStateMetadata) s; + if (m == null) + return m2; + return toTestStateMetadata(((Metadata) m.metadata).mergeWith((Metadata) m2.metadata)); + } + + static 
TestStateMetadata toTestStateMetadata(Content c) + { + return c != null ? new TestStateMetadata(c) : null; + } + + @Override + void printStats(InMemoryDeletionAwareTrie trie, + Predicate> forcedCopyChecker) + { + System.out.format("DeletionAware Reuse %s %s on-heap %,d (+%,d) off-heap %,d\n", + ((BufferManagerMultibuf) trie.bufferManager).cellAllocator.getClass().getSimpleName(), + trie.bufferManager.bufferType(), + trie.usedSizeOnHeap(), + trie.unusedReservedOnHeapMemory(), + trie.usedSizeOffHeap()); + } + + // TestStateMetadata hierarchy for deletion-aware consistency testing + abstract static class Content + { + final String pk; + + Content(String pk) + { + this.pk = pk; + } + + abstract boolean isPartition(); + } + + static class Value extends Content + { + final String ck; + final int value; + final int seq; + + Value(String pk, String ck, int value, int seq) + { + super(pk); + this.ck = ck; + this.value = value; + this.seq = seq; + } + + @Override + public String toString() + { + return "Value{" + + "pk='" + pk + '\'' + + ", ck='" + ck + '\'' + + ", value=" + value + + ", seq=" + seq + + '}'; + } + + @Override + boolean isPartition() + { + return false; + } + } + + static class Metadata extends Content + { + int updateCount; + + Metadata(String pk) + { + super(pk); + updateCount = 1; + } + + @Override + boolean isPartition() + { + return true; + } + + Metadata mergeWith(Metadata other) + { + Metadata m = new Metadata(pk); + m.updateCount = updateCount + other.updateCount; + return m; + } + + Metadata delete(int entriesToRemove) + { + assert updateCount >= entriesToRemove; + if (updateCount == entriesToRemove) + return null; + Metadata m = new Metadata(pk); + m.updateCount = updateCount - entriesToRemove; + return m; + } + + @Override + public String toString() + { + return "Metadata{" + + "pk='" + pk + '\'' + + ", updateCount=" + updateCount + + '}'; + } + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/DeletionMarker.java 
b/test/unit/org/apache/cassandra/db/tries/DeletionMarker.java new file mode 100644 index 000000000000..40d8f6c398fd --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/DeletionMarker.java @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Objects; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +public class DeletionMarker implements DataPoint, RangeState +{ + final ByteComparable position; + final int leftSide; + final int rightSide; + + final DeletionMarker leftSideAsCovering; + final DeletionMarker rightSideAsCovering; + + DeletionMarker(ByteComparable position, int leftSide, int rightSide) + { + this.position = position; + this.leftSide = leftSide; + this.rightSide = rightSide; + + if (leftSide == rightSide) + leftSideAsCovering = rightSideAsCovering = this; + else + { + if (this.leftSide < 0) + leftSideAsCovering = null; + else + leftSideAsCovering = new DeletionMarker(this.position, this.leftSide, this.leftSide); + + if (this.rightSide < 0) + rightSideAsCovering = null; + else + rightSideAsCovering = new DeletionMarker(this.position, this.rightSide, this.rightSide); + } + } + + static DeletionMarker combine(DeletionMarker m1, DeletionMarker m2) + { + return combineCollection(Arrays.asList(m1, m2)); + } + + + public static DeletionMarker combineCollection(Collection rangeMarkers) + { + int newLeft = -1; + int newRight = -1; + ByteComparable position = null; + for (DeletionMarker marker : rangeMarkers) + { + newLeft = Math.max(newLeft, marker.leftSide); + newRight = Math.max(newRight, marker.rightSide); + position = marker.position; + } + if (newLeft < 0 && newRight < 0) + return null; + + return new DeletionMarker(position, newLeft, newRight); + } + + DeletionMarker[] withPoint(int value) + { + return new DeletionMarker[] + { + new DeletionMarker(position, leftSide, value), + new DeletionMarker(replaceTerminator(position, ByteSource.GT_NEXT_COMPONENT), value, rightSide) + }; + } + + ByteComparable replaceTerminator(ByteComparable c, int terminator) + { + byte[] key = 
c.asByteComparableArray(TrieUtil.VERSION); + key[key.length - 1] = (byte) terminator; + return ByteComparable.preencoded(TrieUtil.VERSION, key); + } + + @Override + public DeletionMarker marker() + { + return this; + } + + @Override + public LivePoint live() + { + return null; + } + + @Override + public ByteComparable position() + { + return position; + } + + @Override + public DeletionMarker withMarker(DeletionMarker newMarker) + { + return newMarker; + } + + @Override + public DeletionMarker remap(ByteComparable newKey) + { + return new DeletionMarker(newKey, leftSide, rightSide); + } + + @Override + public String toString() + { + + return (leftSide >= 0 ? leftSide + "<" : "") + + '"' + DataPoint.toString(position) + '"' + + (rightSide >= 0 ? "<" + rightSide : "") + + (isBoundary() ? "" : " not reportable"); + } + + @Override + public boolean isBoundary() + { + return leftSide != rightSide; + } + + /** Two markers are equal only when they sit at the same position and carry the same deletion values on both sides. */ + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + DeletionMarker that = (DeletionMarker) o; + return ByteComparable.compare(position, that.position, TrieUtil.VERSION) == 0 + && leftSide == that.leftSide + && rightSide == that.rightSide; + } + + /** Hashes the encoded position bytes so that the hash agrees with the position comparison used by equals. */ + @Override + public int hashCode() + { + return Objects.hash(Arrays.hashCode(position.asByteComparableArray(TrieUtil.VERSION)), leftSide, rightSide); + } + + @Override + public DeletionMarker toContent() + { + return isBoundary() ? this : null; + } + + @Override + public DeletionMarker restrict(boolean applicableBefore, boolean applicableAfter) + { + assert isBoundary(); + if ((applicableBefore || leftSide < 0) && (applicableAfter || rightSide < 0)) + return this; + int newLeft = applicableBefore ? leftSide : -1; + int newRight = applicableAfter ?
rightSide : -1; + if (newLeft >= 0 || newRight >= 0) + return new DeletionMarker(position, newLeft, newRight); + else + return null; + } + + @Override + public DeletionMarker precedingState(Direction direction) + { + return direction.select(leftSideAsCovering, rightSideAsCovering); + } + + @Override + public DeletionMarker succedingState(Direction direction) + { + return direction.select(rightSideAsCovering, leftSideAsCovering); + } + + @Override + public DeletionMarker asBoundary(Direction direction) + { + assert !isBoundary(); + final boolean isForward = direction.isForward(); + int newLeft = !isForward ? leftSide : -1; + int newRight = isForward ? rightSide : -1; + return new DeletionMarker(position, newLeft, newRight); + } + + public LivePoint applyTo(LivePoint content) + { + return content.delete(rightSide); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/FilteringTest.java b/test/unit/org/apache/cassandra/db/tries/FilteringTest.java new file mode 100644 index 000000000000..c2702e9b1cf1 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/FilteringTest.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.nio.ByteBuffer; +import java.util.AbstractMap; +import java.util.Map; +import java.util.NavigableMap; +import java.util.TreeMap; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +import com.google.common.base.Predicates; +import com.google.common.collect.Streams; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.io.compress.BufferType; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +import static org.apache.cassandra.db.tries.MapValuesTest.*; +import static org.junit.Assert.assertEquals; +import static org.quicktheories.QuickTheory.qt; + +/// Property-based tests for [Trie#mapValues] using QuickTheories framework. +/// +/// Tests that the `mapValues` operation correctly transforms all content in a trie +/// through a mapping function while preserving the trie structure and key ordering. +/// Also tests various types of filtered iteration. +public class FilteringTest +{ + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + private static T filter(Class clazz, Object o) + { + return clazz.isInstance(o) ? clazz.cast(o) : null; + } + + /** + * Test multi-typed conversion followed by filtering in different ways. 
+ */ + @Test + public void testTypesAndFiltering() + { + qt().forAll(keyValueListGen()) + .checkAssert(keyValues -> { + if (keyValues.isEmpty()) + return; // Skip empty case + + TrieWithContent trieWithContent = createIntegerTrie(keyValues); + + // Chain multiple mappings + Function multiTypeMapper = x -> { + switch (x % 3) + { + case 0: + return x; + case 1: + return Integer.toString(x); + default: + return ByteBuffer.wrap(Integer.toString(x).getBytes()); + } + }; + + Trie mappedTrie = trieWithContent.trie.mapValues(multiTypeMapper); + + // Verify initial mapping + verifyMappedValues(mappedTrie, trieWithContent.content, multiTypeMapper); + testNullFilteredEntries(trieWithContent, multiTypeMapper); + + checkFiltering(mappedTrie, multiTypeMapper, trieWithContent, Integer.class); + checkFiltering(mappedTrie, multiTypeMapper, trieWithContent, String.class); + checkFiltering(mappedTrie, multiTypeMapper, trieWithContent, ByteBuffer.class); + }); + } + + private void checkFiltering(Trie mappedTrie, + Function multiTypeMapper, + TrieWithContent trieWithContent, + Class clazz) + { + Function extractor = x -> filter(clazz, x); + Trie stringTrie = mappedTrie.mapValues(extractor); + Function filteredMapper = multiTypeMapper.andThen(extractor); + verifyMappedValues(stringTrie, trieWithContent.content, filteredMapper); + + // check filtered values + assertEquals(trieWithContent.content.values() + .stream() + .map(filteredMapper) + .filter(Predicates.notNull()) + .collect(Collectors.toList()), + Streams.stream(mappedTrie.filteredValues(Direction.FORWARD, clazz)).collect(Collectors.toList())); + assertEquals(trieWithContent.content.descendingMap() + .values() + .stream() + .map(filteredMapper) + .filter(Predicates.notNull()) + .collect(Collectors.toList()), + Streams.stream(mappedTrie.filteredValues(Direction.REVERSE, clazz)).collect(Collectors.toList())); + + // check filtered entrySet + assertEquals(trieWithContent.content.entrySet() + .stream() + .map(x -> map(x, 
filteredMapper)) + .filter(Predicates.notNull()) + .collect(Collectors.toList()), + Streams.stream(mappedTrie.filteredEntrySet(Direction.FORWARD, clazz)).collect(Collectors.toList())); + assertEquals(trieWithContent.content.descendingMap() + .entrySet() + .stream() + .map(x -> map(x, filteredMapper)) + .filter(Predicates.notNull()) + .collect(Collectors.toList()), + Streams.stream(mappedTrie.filteredEntrySet(Direction.REVERSE, clazz)).collect(Collectors.toList())); + + testNullFilteredEntries(trieWithContent, filteredMapper); + testTailTries(mappedTrie, clazz, trieWithContent.content, filteredMapper); + testDanglingMetadataCleaner(mappedTrie, clazz); + } + + private static void testDanglingMetadataCleaner(Trie trie, Class clazz) + { + NavigableMap survivors = new TreeMap<>(); + for (var en : trie.entrySet()) + { + Trie tail = trie.tailTrie(en.getKey()); + assert tail != null; + if (tail.filteredValuesIterator(Direction.FORWARD, clazz).hasNext()) + survivors.put(en.getKey(), en.getValue()); + } + + // set up in-memory trie with dangling non-clazz clean-up + InMemoryTrie copy = new InMemoryTrie<>(VERSION, + BufferType.ON_HEAP, + InMemoryBaseTrie.ExpectedLifetime.SHORT, + null, + true, + clazz::isInstance); + try + { + copy.mutator((x, y) -> y, + (Predicate>) x -> false) + .apply(trie); + } + catch (TrieSpaceExhaustedException e) + { + throw new RuntimeException(e); + } + + assertEquals(survivors.entrySet() + .stream() + .collect(Collectors.toList()), + Streams.stream(copy.entrySet()) + .collect(Collectors.toList())); + } + + private Map.Entry map(Map.Entry en, Function mapper) + { + V2 value = mapper.apply(en.getValue()); + if (value == null) + return null; + return new AbstractMap.SimpleEntry<>(en.getKey(), value); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/InMemoryDeletionAwareTrieConsistencyTest.java b/test/unit/org/apache/cassandra/db/tries/InMemoryDeletionAwareTrieConsistencyTest.java new file mode 100644 index 000000000000..cf947041a734 --- 
/dev/null +++ b/test/unit/org/apache/cassandra/db/tries/InMemoryDeletionAwareTrieConsistencyTest.java @@ -0,0 +1,282 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.Collection; +import java.util.function.BiFunction; +import java.util.function.Predicate; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.concurrent.OpOrder; + +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; + +/// Consistency test for [InMemoryDeletionAwareTrie] that validates concurrent operations +/// with both live data and deletion markers under different atomicity guarantees. +/// +/// This test extends [ConsistencyTestBase] to verify that [InMemoryDeletionAwareTrie] maintains +/// correctness and consistency under concurrent access patterns typical of Cassandra's +/// memtable operations with deletions. 
+public class InMemoryDeletionAwareTrieConsistencyTest +extends ConsistencyTestBase, + InMemoryDeletionAwareTrie> +{ + + @SuppressWarnings("rawtypes") // type does not matter, we are always throwing an exception + static final InMemoryBaseTrie.UpsertTransformer UPSERT_THROW = (x, y) -> { throw new AssertionError(); }; + @SuppressWarnings("rawtypes") // type does not matter, we are always throwing an exception + static final BiFunction BIFUNCTION_THROW = (x, y) -> { throw new AssertionError(); }; + + @Override + InMemoryDeletionAwareTrie makeTrie(OpOrder readOrder) + { + return InMemoryDeletionAwareTrie.longLived(VERSION, readOrder); + } + + @Override + Content value(ByteComparable b, ByteComparable cprefix, ByteComparable c, int add, int seqId) + { + String pk = b.byteComparableAsString(VERSION); + String ck = (cprefix != null ? cprefix.byteComparableAsString(VERSION) : "") + c.byteComparableAsString(VERSION); + return new Value(pk, ck, add, seqId); + } + + @Override + Content metadata(ByteComparable b) + { + return new Metadata(b.byteComparableAsString(VERSION)); + } + + @Override + String pk(Content c) + { + return c.pk; + } + + @Override + String ck(Content c) + { + return ((Value) c).ck; + } + + @Override + int seq(Content c) + { + return ((Value) c).seq; + } + + @Override + int value(Content c) + { + return ((Value) c).value; + } + + @Override + int updateCount(Content c) + { + return ((Metadata) c).updateCount; + } + + @Override + DeletionAwareTrie makeSingleton(ByteComparable b, Content content) + { + return DeletionAwareTrie.singleton(b, VERSION, content); + } + + @Override + DeletionAwareTrie withRootMetadata(DeletionAwareTrie wrapped, Content metadata) + { + return TrieUtil.withRootMetadata(wrapped, metadata); + } + + @Override + DeletionAwareTrie merge(Collection> tries, + Trie.CollectionMergeResolver mergeResolver) + { + return DeletionAwareTrie.merge(tries, + mergeResolver, + Trie.throwingResolver(), + BIFUNCTION_THROW, + true); // 
deletionsAtFixedPoints = true for consistency + } + + @Override + void apply(InMemoryDeletionAwareTrie trie, + DeletionAwareTrie mutation, + InMemoryBaseTrie.UpsertTransformer mergeResolver, + Predicate> forcedCopyChecker, + Predicate> forcedCopyCheckerRanges) + throws TrieSpaceExhaustedException + { + trie.mutator(mergeResolver, // Use the provided merge resolver for content + (del, incoming) -> { throw new AssertionError(); }, + (del, incoming) -> { throw new AssertionError(); }, + (del, incoming) -> { throw new AssertionError(); }, + true, // deletionsAtFixedPoints = true for consistency + forcedCopyChecker, + forcedCopyCheckerRanges) + .apply(mutation); // Use the provided forced copy checker + } + + @Override + void delete(InMemoryDeletionAwareTrie trie, + ByteComparable deletionPrefix, + TestRangeState partitionMarker, + RangeTrie deletionBranch, + InMemoryBaseTrie.UpsertTransformer mergeResolver, + Predicate> forcedCopyChecker, + Predicate> forcedCopyCheckerRanges) + throws TrieSpaceExhaustedException + { + DeletionAwareTrie deletion = DeletionAwareTrie.deletionBranch(ByteComparable.EMPTY, VERSION, deletionBranch); + deletion = TrieUtil.withRootMetadata(deletion, partitionMarker); + deletion = deletion.prefixedBy(deletionPrefix); + + trie.mutator(mergeResolver, + (existing, incoming) -> TestRangeState.combine(existing, incoming), + mergeResolver, + BIFUNCTION_THROW, + true, + forcedCopyCheckerRanges, + forcedCopyCheckerRanges) + .apply(deletion); + } + + @Override + boolean isPartition(Content c) + { + return c != null && c.isPartition(); + } + + @Override + Content mergeMetadata(Content c1, Content c2) + { + if (c1 == null) return c2; + if (c2 == null) return c1; + return ((Metadata) c1).mergeWith((Metadata) c2); + } + + @Override + Content deleteMetadata(Content existing, int entriesToRemove) + { + if (existing == null) return null; + return ((Metadata) existing).delete(entriesToRemove); + } + + @Override + void printStats(InMemoryDeletionAwareTrie trie, + 
Predicate> forcedCopyChecker) + { + System.out.format("DeletionAware Reuse %s %s on-heap %,d (+%,d) off-heap %,d\n", + ((BufferManagerMultibuf) trie.bufferManager).cellAllocator.getClass().getSimpleName(), + trie.bufferManager.bufferType(), + trie.usedSizeOnHeap(), + trie.unusedReservedOnHeapMemory(), + trie.usedSizeOffHeap()); + } + + // Content hierarchy for deletion-aware consistency testing + abstract static class Content + { + final String pk; + + Content(String pk) + { + this.pk = pk; + } + + abstract boolean isPartition(); + } + + static class Value extends Content + { + final String ck; + final int value; + final int seq; + + Value(String pk, String ck, int value, int seq) + { + super(pk); + this.ck = ck; + this.value = value; + this.seq = seq; + } + + @Override + public String toString() + { + return "Value{" + + "pk='" + pk + '\'' + + ", ck='" + ck + '\'' + + ", value=" + value + + ", seq=" + seq + + '}'; + } + + @Override + boolean isPartition() + { + return false; + } + } + + static class Metadata extends Content + { + int updateCount; + + Metadata(String pk) + { + super(pk); + updateCount = 1; + } + + @Override + boolean isPartition() + { + return true; + } + + Metadata mergeWith(Metadata other) + { + Metadata m = new Metadata(pk); + m.updateCount = updateCount + other.updateCount; + return m; + } + + Metadata delete(int entriesToRemove) + { + assert updateCount >= entriesToRemove; + if (updateCount == entriesToRemove) + return null; + Metadata m = new Metadata(pk); + m.updateCount = updateCount - entriesToRemove; + return m; + } + + @Override + public String toString() + { + return "Metadata{" + + "pk='" + pk + '\'' + + ", updateCount=" + updateCount + + '}'; + } + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/InMemoryDeletionAwareTrieThreadedTest.java b/test/unit/org/apache/cassandra/db/tries/InMemoryDeletionAwareTrieThreadedTest.java new file mode 100644 index 000000000000..515179dd578a --- /dev/null +++ 
b/test/unit/org/apache/cassandra/db/tries/InMemoryDeletionAwareTrieThreadedTest.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import org.junit.BeforeClass; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.concurrent.OpOrder; + +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; + +/// Multithreaded test for [InMemoryDeletionAwareTrie] that creates both data and deletion entries. +/// +/// This test extends [ThreadedTestBase] to verify that [InMemoryDeletionAwareTrie] works correctly +/// under concurrent access with both live data points and deletion markers being added. 
+public class InMemoryDeletionAwareTrieThreadedTest extends ThreadedTestBase> +{ + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + @Override + LivePoint value(ByteComparable b) + { + // Create a live point with a timestamp based on the key's hash + // This ensures all threads will create the same value for the same key + String keyStr = b.byteComparableAsString(VERSION); + int timestamp = Math.abs(keyStr.hashCode()) % 1000 + 1; // Ensure positive timestamp + return new LivePoint(b, timestamp); + } + + @Override + InMemoryDeletionAwareTrie makeTrie(OpOrder readOrder) + { + return InMemoryDeletionAwareTrie.longLived(VERSION, readOrder); + } + + @Override + void add(InMemoryDeletionAwareTrie trie, ByteComparable b, LivePoint v, int iteration) throws TrieSpaceExhaustedException + { + // Alternate between adding live data and deletion markers + if (iteration % 3 == 0) + { + // Add live data point using apply with singleton trie + DeletionAwareTrie singletonTrie = DeletionAwareTrie.singleton(b, VERSION, v); + trie.apply(singletonTrie, + DataPoint::combineLive, // Combine live data using DataPoint utility + DataPoint::combineDeletion, // Combine deletion markers using DataPoint utility + DataPoint::deleteLive, // Apply deletions to existing data using DataPoint utility + DataPoint::deleteLive, // Apply deletions to incoming data using DataPoint utility + true, // deletionsAtFixedPoints = true (singleton deletions satisfy invariant) + x -> false); // needsForcedCopy = never force copy for this test + } + else if (iteration % 3 == 1) + { + // Add deletion marker using DeletionAwareTrie.deletion + // Create a deletion marker that deletes data with timestamp less than current + int deletionTime = v.timestamp + 10; // Delete older data + DeletionMarker marker = new DeletionMarker(b, deletionTime, deletionTime); + + DeletionAwareTrie deletionTrie = + DeletionAwareTrie.deletedRange(b, b, true, b, true, 
TrieUtil.VERSION, marker); + + trie.apply(deletionTrie, + (existing, incoming) -> existing, // Keep existing live data (no incoming live data in deletion trie) + DataPoint::combineDeletion, // Combine deletion markers using DataPoint utility + DataPoint::deleteLive, // Apply deletions to existing data using DataPoint utility + DataPoint::deleteLive, // Apply deletions to incoming data using DataPoint utility + true, // deletionsAtFixedPoints = true (singleton deletions satisfy invariant) + x -> false); // needsForcedCopy = never force copy for this test + } + else + { + // Add a merge of singleton and deletion + DeletionAwareTrie singletonTrie = DeletionAwareTrie.singleton(b, VERSION, v); + + // Create a deletion marker that deletes data with timestamp less than current + int deletionTime = v.timestamp + 5; // Delete slightly older data + DeletionMarker marker = new DeletionMarker(b, deletionTime, deletionTime); + DeletionAwareTrie deletionTrie = + DeletionAwareTrie.deletedRange(b, b, true, b, true, TrieUtil.VERSION, marker); + + // Merge singleton and deletion into a combined trie + DeletionAwareTrie combinedTrie = + singletonTrie.mergeWith(deletionTrie, + DataPoint::combineLive, + DataPoint::combineDeletion, + DataPoint::deleteLive, + true); // deletionsAtFixedPoints = true + + trie.apply(combinedTrie, + DataPoint::combineLive, // Combine live data using DataPoint utility + DataPoint::combineDeletion, // Combine deletion markers using DataPoint utility + DataPoint::deleteLive, // Apply deletions to existing data using DataPoint utility + DataPoint::deleteLive, // Apply deletions to incoming data using DataPoint utility + true, // deletionsAtFixedPoints = true (singleton deletions satisfy invariant) + x -> false); // needsForcedCopy = never force copy for this test + } + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/InMemoryRangeTrieTest.java b/test/unit/org/apache/cassandra/db/tries/InMemoryRangeTrieTest.java new file mode 100644 index 
000000000000..c6801c47426f --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/InMemoryRangeTrieTest.java @@ -0,0 +1,568 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import com.google.common.base.Predicates; +import com.google.common.base.Throwables; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +@RunWith(Parameterized.class) +public class InMemoryRangeTrieTest +{ + @BeforeClass + public static void enableVerification() + { + 
CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + static int delTime; + + @Parameterized.Parameter(0) + public static boolean addTerminators = false; + + @Parameterized.Parameter(1) + public static boolean forceCopy = false; + + @Parameterized.Parameters(name = "addTerminators: {0} forceCopy: {1}") + public static Object[] parameters() + { + return new Object[][] + { + new Object[]{ false, false }, + new Object[]{ false, true }, + new Object[]{ true, false }, + new Object[]{ true, true }, + }; + } + + @Before + public void init() + { + delTime = 99; + } + + static TestRangeState toMarker(String string) + { + return toMarker(string, delTime--); + } + + static TestRangeState toMarker(String string, int delTime) + { + return new TestRangeState(TrieUtil.directComparable(string), delTime, delTime); + } + + static TestRangeState addMarkerStrings(TestRangeState a, TestRangeState b) + { + if (a == null) + return b; + + TestRangeState c = TestRangeState.combine(a, b); + return new TestRangeState(TrieUtil.directComparable(fromMarker(a) + fromMarker(b)), + c.appliesAfter, + c.leftSide, + c.rightSide); + } + + static String fromMarker(TestRangeState marker) + { + if (marker == null) + return null; + return new String(marker.position.asByteComparableArray(VERSION), StandardCharsets.UTF_8); + } + + @Test + public void testSingle() + { + InMemoryRangeTrie trie = InMemoryRangeTrie.shortLived(VERSION); + putRange(trie, "test", toMarker("test"), (x, y) -> y); + System.out.println("Trie " + trie.dump()); + assertEquals("test", fromMarker(trie.applicableRange(key("test")))); + assertEquals(null, fromMarker(trie.applicableRange(key("tezt")))); + assertEquals(null, fromMarker(trie.applicableRange(key("tast")))); + } + + @Test + public void testSplitMulti() + { + testEntries("testing", "tests", "trials", "trial", "aaaa", "aaaab", "abdddd", "abeeee"); + } + + @Test + public void testSplitMultiBug() + { + testEntriesHex(new String[]{ "0c4143aeff", "0c4143ae69ff" }); + } + + 
@Test + public void testUpdateContent() + { + String[] tests = new String[]{ "testing", "tests", "trials", "trial", "testing", "trial", "trial" }; + String[] values = new String[]{ "testing", "tests", "trials", "trial", "t2", "x2", "y2" }; + InMemoryRangeTrie trie = InMemoryRangeTrie.shortLived(VERSION); + for (int i = 0; i < tests.length; ++i) + { + String test = tests[i]; + String v = values[i]; + System.out.println("Adding " + test + ": " + v); + putRange(trie, test, toMarker(v), InMemoryRangeTrieTest::addMarkerStrings); + System.out.println("Trie " + trie.dump()); + } + + for (int i = 0; i < tests.length; ++i) + { + String test = tests[i]; + assertEquals(IntStream.range(0, tests.length) + .filter(x -> test.startsWith(tests[x])) + .mapToObj(x -> values[x]) + .collect(Collectors.joining()), + fromMarker(trie.applicableRange(key(TrieUtil.directComparable(test))))); + } + } + + @Test + public void testApplyUpdate() throws TrieSpaceExhaustedException + { + // Ranges in pairs. They may touch but not overlap (putRecursive can't handle covering ranges). 
+ String[] bounds = new String[]{ "aaa", "touchAfter", ">touchAfter", "touchTwice", ">touchTwice", ">touch" }; + String[] tests = Arrays.stream(bounds).map(x -> x.substring(1)).toArray(String[]::new); + Boolean[] after = Arrays.stream(bounds).map(x -> x.charAt(0) == '>').toArray(Boolean[]::new); + InMemoryRangeTrie trie = InMemoryRangeTrie.shortLived(VERSION); + for (int i = 0; i < tests.length; i += 2) + { + System.out.println("Adding " + bounds[i] + "-" + bounds[i + 1]); + + trie.apply(RangeTrie.range(bound(tests[i], after[i]), !after[i], bound(tests[i+1], after[i+1]), after[i+1], VERSION, toMarker(bounds[i], i)), + InMemoryRangeTrieTest::addMarkerStrings, + x -> forceCopy); + System.out.println("Trie " + trie.dump()); + } + + for (int i = 0; i < tests.length; ++i) + { + String bound = bounds[i]; + assertEquals(IntStream.range(0, tests.length) + .filter(x -> bounds[x].equals(bound)) + // The ranges have the left side as the marker string + .mapToObj(x -> bounds[x & ~1]) + .collect(Collectors.joining()), + fromMarker(get(trie, bound(tests[i], after[i]), after[i]))); + } + } + + @Test + public void testPutRecursiveUpdate() throws TrieSpaceExhaustedException + { + // Ranges in pairs. They may touch but not overlap (putRecursive can't handle covering ranges). 
+ String[] bounds = new String[]{ "aaa", "touchAfter", ">touchAfter", "touchTwice", ">touchTwice", ">touch" }; + String[] tests = Arrays.stream(bounds).map(x -> x.substring(1)).toArray(String[]::new); + Boolean[] after = Arrays.stream(bounds).map(x -> x.charAt(0) == '>').toArray(Boolean[]::new); + InMemoryRangeTrie trie = InMemoryRangeTrie.shortLived(VERSION); + for (int i = 0; i < tests.length; i += 2) + { + System.out.println("Adding " + bounds[i] + "-" + bounds[i + 1]); + trie.putRecursive(bound(tests[i], after[i]), + toMarker(bounds[i], i).asBoundary(Direction.FORWARD), + after[i], + InMemoryRangeTrieTest::addMarkerStrings); + trie.putRecursive(bound(tests[i + 1], after[i + 1]), + toMarker(bounds[i + 1], i).asBoundary(Direction.REVERSE), + after[i + 1], + InMemoryRangeTrieTest::addMarkerStrings); + System.out.println("Trie " + trie.dump()); + } + + for (int i = 0; i < tests.length; ++i) + { + String bound = bounds[i]; + assertEquals(Arrays.stream(bounds) + .filter(x -> x.equals(bound)) + .collect(Collectors.joining()), + fromMarker(get(trie, bound(tests[i], after[i]), after[i]))); + } + } + + @Test + public void testMultipathApplyEE() throws TrieSpaceExhaustedException + { + testMultipathApply(true, false, + new String[]{ "abc", "ade", "a", "bcd", "bcd", "bceeeee", "bce", "bd" }); + } + + @Test + public void testMultipathApplyIE() throws TrieSpaceExhaustedException + { + // repetitions are acceptable, but our test will fail because the entry there will end up dropped + testMultipathApply(false, false, + new String[]{ "a", "ade", "b", "bcd", "bce", "bceeeee", "bcf", "bd" }); + } + + @Test + public void testMultipathApplyEI() throws TrieSpaceExhaustedException + { + testMultipathApply(true, true, + new String[]{ "abc", "ab", "a", "bcd", "bceeeee", "bce", "bcf", "bd" }); + } + + @Test + public void testMultipathApplyII() throws TrieSpaceExhaustedException + { + testMultipathApply(false, true, + new String[]{ "a", "abc", "ade", "a", "bcd", "bcd", "bce", "bceeeeee", 
"bddddddd", "bd", "efg", "hik" }); + } + + private void testMultipathApply(boolean startsAfter, boolean endsAfter, String[] tests) throws TrieSpaceExhaustedException + { + ByteComparable[] keys = IntStream.range(0, tests.length) + .mapToObj(x -> bound(tests[x], x % 2 == 0 ? startsAfter : endsAfter)) + .toArray(ByteComparable[]::new); + + InMemoryRangeTrie trie = InMemoryRangeTrie.shortLived(VERSION); + trie.apply(RangeTrie.fromSet(TrieSet.ranges(VERSION, !startsAfter, endsAfter, keys), + toMarker("marker", 1)), + InMemoryRangeTrieTest::addMarkerStrings, + x -> forceCopy); + System.out.println("Trie " + trie.dump()); + + for (int i = 0; i < tests.length; ++i) + { + boolean after = i % 2 == 0 ? startsAfter : endsAfter; + assertEquals("for key " + (after ? ">" : "<") + tests[i], "marker", + fromMarker(get(trie, bound(tests[i], after), after))); + } + } + + private T get(BaseTrie trie, ByteComparable key, boolean after) + { + Cursor cursor = trie.cursor(Direction.FORWARD); + ByteSource bytes = key.asComparableBytes(cursor.byteComparableVersion()); + int next = bytes.next(); + long position = cursor.encodedPosition(); + while (next != ByteSource.END_OF_STREAM) + { + long nextPosition = Cursor.positionForDescentWithByte(position, next); + next = bytes.next(); + if (after && next == ByteSource.END_OF_STREAM) + nextPosition |= Cursor.ON_RETURN_PATH_BIT; + if (Cursor.compare(cursor.skipTo(nextPosition), nextPosition) != 0) + return null; + position = nextPosition; + } + return cursor.content(); + } + + private ByteComparable bound(String s, boolean after) + { + return after ? rightBound(s) : leftBound(s); + } + + private void testEntries(String... tests) + { + testEntries(tests, InMemoryRangeTrieTest::bc); + } + + private void testEntriesHex(String[] tests) + { + testEntries(tests, s -> ByteComparable.preencoded(VERSION, ByteBufferUtil.hexToBytes(s))); + // Run the other translations just in case. 
+ testEntries(tests); + } + + private void testEntries(String[] tests, Function mapping) + + { + InMemoryRangeTrie trie = InMemoryRangeTrie.shortLived(VERSION); + for (String test : tests) + { + ByteComparable e = mapping.apply(test); + System.out.println("Adding " + asString(e) + ": " + test); + putRange(trie, e, toMarker(test), (x, y) -> TestRangeState.combine(y, x)); + System.out.println("Trie\n" + trie.dump()); + } + + for (String test : tests) + { + // Entries with greater delTime override ones with smaller. So we will match the leftmost key in the list. + String expected = Arrays.stream(tests).filter(test::startsWith).findFirst().get(); + assertEquals(expected, fromMarker(trie.applicableRange(key(mapping.apply(test))))); + } + } + + static String asString(ByteComparable bc) + { + return bc != null ? bc.byteComparableAsString(VERSION) : "null"; + } + + @Test + public void testCursorDeletionBeforeNearest() throws TrieSpaceExhaustedException + { + testCursorsWithInterveningDeletions(strings("aaebc", "aaecd"), + "aa", "aaec", + strings("aabc", "aacd")); + } + + @Test + public void testCursorRangeDeletionCoversPosition() throws TrieSpaceExhaustedException + { + testCursorsWithInterveningDeletions(strings("aaabc", "aaacde", "bcd", "cde"), + "aaa", "aaacd", + strings("a_", "ab")); + } + + @Test + public void testCursorBranchDeletionCoversPosition() throws TrieSpaceExhaustedException + { + testCursorsWithInterveningDeletions(strings("aaabc", "aaacde", "bcd", "cde"), + "aaa", "aaacd", + strings("aa", "aa")); + } + + private String[] strings(String... strings) + { + return strings; + } + + private void testCursorsWithInterveningDeletions(String[] preparations, + String leftPos, + String rightPos, + String[] insertions) + throws TrieSpaceExhaustedException + { + // Note: if position matches a boundary we may get a false negative when looking for it because it will be on + // the return path in one of the directions. 
If any of these checks fails, it is a test error; please make + // sure the queried positions are not boundaries. + assertFalse(Arrays.asList(preparations).contains(leftPos)); + assertFalse(Arrays.asList(preparations).contains(rightPos)); + assertFalse(Arrays.asList(insertions).contains(leftPos)); + assertFalse(Arrays.asList(insertions).contains(rightPos)); + + // New deletions supersede old + testCursorsWithInterveningDeletions(preparations, leftPos, rightPos, insertions, Direction.FORWARD, false, 1); + testCursorsWithInterveningDeletions(preparations, leftPos, rightPos, insertions, Direction.FORWARD, true, 1); + testCursorsWithInterveningDeletions(preparations, leftPos, rightPos, insertions, Direction.REVERSE, false, 1); + testCursorsWithInterveningDeletions(preparations, leftPos, rightPos, insertions, Direction.REVERSE, true, 1); + + // New deletions are in addition to old + testCursorsWithInterveningDeletions(preparations, leftPos, rightPos, insertions, Direction.FORWARD, false, -1); + testCursorsWithInterveningDeletions(preparations, leftPos, rightPos, insertions, Direction.FORWARD, true, -1); + testCursorsWithInterveningDeletions(preparations, leftPos, rightPos, insertions, Direction.REVERSE, false, -1); + testCursorsWithInterveningDeletions(preparations, leftPos, rightPos, insertions, Direction.REVERSE, true, -1); + + // New deletions group with old + testCursorsWithInterveningDeletions(preparations, leftPos, rightPos, insertions, Direction.FORWARD, false, 0); + testCursorsWithInterveningDeletions(preparations, leftPos, rightPos, insertions, Direction.FORWARD, true, 0); + testCursorsWithInterveningDeletions(preparations, leftPos, rightPos, insertions, Direction.REVERSE, false, 0); + testCursorsWithInterveningDeletions(preparations, leftPos, rightPos, insertions, Direction.REVERSE, true, 0); + } + + private void testCursorsWithInterveningDeletions(String[] preparations, + String leftPos, + String rightPos, + String[] insertions, + Direction dir, + boolean useSkip, +
int delTimeIncrease) + throws TrieSpaceExhaustedException + { + // Note: ranges are inserted one pair at a time, with changing delTime. + delTime = 100; + if (!dir.isForward() && rightPos.startsWith(leftPos)) + { + String t = leftPos; + leftPos = rightPos; + rightPos = t; // swap left and right as prefixes are always before + } + + InMemoryRangeTrie trie = InMemoryRangeTrie.shortLived(VERSION); + insertRanges(trie, preparations, delTimeIncrease); + + final String current = dir.select(leftPos, rightPos); + RangeCursor c = trie.cursor(dir); + TriePathReconstructor paths = new TriePathReconstructor(); + boolean found; + if (useSkip) + found = c.descendAlong(bc(current).asComparableBytes(VERSION)); + else + found = advanceTo(c, bc(current), paths); + + assertTrue(found); + + insertRanges(trie, insertions, delTimeIncrease); + + // Even if the branch c is on is deleted, we should be able to continue iterating it and finding the right data. + String target = dir.select(rightPos, leftPos); + if (useSkip) + found = skipByDifference(c, bc(current), bc(target)); + else + found = advanceTo(c, bc(target), paths); + + assertTrue(found); + + while (!Cursor.isExhausted(c.advanceMultiple(null))) + { + } // let the verification cursor check the correctness of the iteration +} + + ByteComparable maybeInvert(ByteComparable bc, Direction dir) + { + return dir.isForward() ? 
bc : InMemoryTriePutTest.invert(bc); + } + + private boolean advanceTo(RangeCursor c, ByteComparable target, TriePathReconstructor paths) + { + int cmp; + Direction dir = c.direction(); + while (true) + { + cmp = ByteComparable.compare(maybeInvert(target, dir), maybeInvert(ByteComparable.preencoded(VERSION, paths.keyBytes, 0, paths.keyPos), dir), VERSION); + if (cmp == 0) + return true; + if (cmp < 0) + return false; + if (Cursor.isExhausted(c.advance())) + return false; // exhausted + + long position = c.encodedPosition(); + paths.resetPathLength(Cursor.depth(position) - 1); + paths.addPathByte(Cursor.incomingTransition(position)); + } + } + + private boolean skipByDifference(Cursor cursor, ByteComparable a, ByteComparable b) + { + ByteSource.Peekable sa = ByteSource.peekable(a.asComparableBytes(VERSION)); + ByteSource.Peekable sb = ByteSource.peekable(b.asComparableBytes(VERSION)); + int depth = 0; + while (sa.peek() == sb.peek()) + { + sa.next(); + sb.next(); + ++depth; + } + + final int nextByte = sb.next(); + long skipPosition = Cursor.encode(depth + 1, nextByte, cursor.direction()); + long skippedPosition = cursor.skipTo(skipPosition); + if (Cursor.compare(skippedPosition, skipPosition) != 0) + return false; + return cursor.descendAlong(sb); + } + + private void insertRanges(InMemoryRangeTrie trie, String[] insertions, int delTimeIncrease) throws TrieSpaceExhaustedException + { + for (int i = 0; i < insertions.length; i += 2) + { + ByteComparable left = leftBound(insertions[i]); + ByteComparable right = rightBound(insertions[i + 1]); + trie.apply(RangeTrie.range(left, true, right, true, TrieUtil.VERSION, toMarker(insertions[i], delTime)), + (existing, update) -> existing == null ? update : TestRangeState.combine(existing, update), + delTimeIncrease >= 0 ? 
x -> forceCopy : Predicates.alwaysTrue()); // if we delete covered branches, we should be okay with no force copying + delTime += delTimeIncrease; + } + System.out.println("After inserting " + Arrays.toString(insertions) + ":\n" + trie.dump()); + } + + static ByteComparable withTerminator(int terminator, ByteComparable bc) + { + if (addTerminators) + return v -> ByteSource.append(bc.asComparableBytes(v), terminator); + else + return bc; + } + + private static ByteComparable bc(String s) + { + return addTerminators ? ByteComparable.preencoded(VERSION, s.getBytes()) : TrieUtil.directComparable(s); + } + + static ByteComparable leftBound(String s) + { + return leftBound(bc(s)); + } + + static ByteComparable rightBound(String s) + { + return rightBound(bc(s)); + } + + static ByteComparable key(String s) + { + return key(bc(s)); + } + + static ByteComparable leftBound(ByteComparable bc) + { + return withTerminator(0x00, bc); + } + + static ByteComparable rightBound(ByteComparable bc) + { + return withTerminator(0xFF, bc); + } + + static ByteComparable key(ByteComparable bc) + { + return withTerminator(0x80, bc); + } + + + static > void putRange(InMemoryRangeTrie trie, + String s, + S value, + Trie.MergeResolver resolver) + { + putRange(trie, bc(s), value, resolver); + } + + static > void putRange(InMemoryRangeTrie trie, + ByteComparable key, + S value, + Trie.MergeResolver resolver) + { + try + { + trie.apply(RangeTrie.range(leftBound(key), true, rightBound(key), true, TrieUtil.VERSION, value), + (existing, update) -> existing != null ? 
resolver.resolve(existing, update) : update, + x -> forceCopy); + } + catch (TrieSpaceExhaustedException e) + { + throw Throwables.propagate(e); + } + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/InMemoryRangeTrieThreadedTest.java b/test/unit/org/apache/cassandra/db/tries/InMemoryRangeTrieThreadedTest.java new file mode 100644 index 000000000000..b2d5b5cd7e04 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/InMemoryRangeTrieThreadedTest.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import org.junit.BeforeClass; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.concurrent.OpOrder; + +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; + +public class InMemoryRangeTrieThreadedTest extends ThreadedTestBase> +{ + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + + @Override + TestRangeState value(ByteComparable b) + { + return new TestRangeState(b, 1, 1); + } + + @Override + InMemoryRangeTrie makeTrie(OpOrder readOrder) + { + return InMemoryRangeTrie.longLived(VERSION, readOrder); + } + + @Override + void add(InMemoryRangeTrie trie, ByteComparable b, TestRangeState v, int iteration) throws TrieSpaceExhaustedException + { + ByteComparable left = ver -> ByteSource.withTerminator(ByteSource.LT_NEXT_COMPONENT, b.asComparableBytes(ver)); + ByteComparable right = ver -> ByteSource.withTerminator(ByteSource.GT_NEXT_COMPONENT, b.asComparableBytes(ver)); + if (iteration % 2 == 0) + { + trie.putRecursive(left, v, false, (x, y) -> y.asBoundary(Direction.FORWARD)); + trie.putRecursive(right, v, true, (x, y) -> y.asBoundary(Direction.REVERSE)); + } + else + trie.apply(RangeTrie.range(left, true, right, true, TrieUtil.VERSION, v), (x, y) -> y, x -> true); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/InMemoryTrieConsistencyTest.java b/test/unit/org/apache/cassandra/db/tries/InMemoryTrieConsistencyTest.java new file mode 100644 index 000000000000..377cd96702f0 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/InMemoryTrieConsistencyTest.java @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.Collection; +import java.util.function.Predicate; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.concurrent.OpOrder; + +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; + +public class InMemoryTrieConsistencyTest extends ConsistencyTestBase, InMemoryTrie> +{ + @Override + InMemoryTrie makeTrie(OpOrder readOrder) + { + return InMemoryTrie.longLived(VERSION, readOrder); + } + + @Override + Value value(ByteComparable b, ByteComparable cprefix, ByteComparable c, int add, int seqId) + { + return new Value(b.byteComparableAsString(VERSION), + (cprefix != null ? 
cprefix.byteComparableAsString(VERSION) : "") + c.byteComparableAsString(VERSION), add, seqId); + } + + @Override + Content metadata(ByteComparable b) + { + return new Metadata(b.byteComparableAsString(VERSION)); + } + + @Override + String pk(Content c) + { + return c.pk; + } + + @Override + String ck(Content c) + { + return ((Value) c).ck; + } + + @Override + int seq(Content c) + { + return ((Value) c).seq; + } + + @Override + int value(Content c) + { + return ((Value) c).value; + } + + @Override + int updateCount(Content c) + { + return ((Metadata) c).updateCount; + } + + @Override + Trie makeSingleton(ByteComparable b, Content content) + { + return Trie.singleton(b, VERSION, content); + } + + @Override + Trie withRootMetadata(Trie wrapped, Content metadata) + { + return TrieUtil.withRootMetadata(wrapped, metadata); + } + + @Override + Trie merge(Collection> tries, Trie.CollectionMergeResolver mergeResolver) + { + return Trie.merge(tries, mergeResolver); + } + + @Override + void apply(InMemoryTrie trie, + Trie mutation, + InMemoryBaseTrie.UpsertTransformer mergeResolver, + Predicate> forcedCopyChecker, + Predicate> forcedCopyCheckerRanges) + throws TrieSpaceExhaustedException + { + trie.apply(mutation, mergeResolver, forcedCopyChecker); + } + + @Override + void delete(InMemoryTrie trie, + ByteComparable deletionPrefix, + TestRangeState partitionMarker, + RangeTrie deletion, + InMemoryBaseTrie.UpsertTransformer mergeResolver, + Predicate> forcedCopyChecker, + Predicate> forcedCopyCheckerRanges) + throws TrieSpaceExhaustedException + { + deletion = TrieUtil.withRootMetadata(deletion, partitionMarker); + deletion = deletion.prefixedBy(deletionPrefix); + trie.rangeMutator(mergeResolver, forcedCopyCheckerRanges).apply(deletion); + } + + @Override + boolean isPartition(Content c) + { + return c != null && c.isPartition(); + } + + @Override + Content mergeMetadata(Content c1, Content c2) + { + return ((Metadata) c1).mergeWith((Metadata) c2); + } + + @Override + Content 
deleteMetadata(Content c1, int entriesToRemove) + { + return ((Metadata) c1).delete(entriesToRemove); + } + + @Override + void printStats(InMemoryTrie trie, Predicate> forcedCopyChecker) + { + System.out.format("Reuse %s %s atomicity %s on-heap %,d (+%,d) off-heap %,d\n", + ((BufferManagerMultibuf) trie.bufferManager).cellAllocator.getClass().getSimpleName(), + trie.bufferManager.bufferType(), + forcedCopyChecker == this.noAtomicity() ? "none" : + forcedCopyChecker == this.forceAtomic() ? "atomic" : "consistent partition", + trie.usedSizeOnHeap(), + trie.unusedReservedOnHeapMemory(), + trie.usedSizeOffHeap()); + } + + abstract static class Content + { + final String pk; + + Content(String pk) + { + this.pk = pk; + } + + abstract boolean isPartition(); + } + + static class Value extends Content + { + final String ck; + final int value; + final int seq; + + Value(String pk, String ck, int value, int seq) + { + super(pk); + this.ck = ck; + this.value = value; + this.seq = seq; + } + + @Override + public String toString() + { + return "Value{" + + "pk='" + pk + '\'' + + ", ck='" + ck + '\'' + + ", value=" + value + + ", seq=" + seq + + '}'; + } + + @Override + boolean isPartition() + { + return false; + } + } + + static class Metadata extends Content + { + int updateCount; + + Metadata(String pk) + { + super(pk); + updateCount = 1; + } + + @Override + boolean isPartition() + { + return true; + } + + Metadata mergeWith(Metadata other) + { + Metadata m = new Metadata(pk); + m.updateCount = updateCount + other.updateCount; + return m; + } + + Metadata delete(int entriesToRemove) + { + assert updateCount >= entriesToRemove; + if (updateCount == entriesToRemove) + return null; + Metadata m = new Metadata(pk); + m.updateCount = updateCount - entriesToRemove; + return m; + } + + @Override + public String toString() + { + return "Metadata{" + + "pk='" + pk + '\'' + + ", updateCount=" + updateCount + + '}'; + } + } +} diff --git 
a/test/unit/org/apache/cassandra/db/tries/InMemoryTriePutTest.java b/test/unit/org/apache/cassandra/db/tries/InMemoryTriePutTest.java index e5f0044f3612..6b017d207726 100644 --- a/test/unit/org/apache/cassandra/db/tries/InMemoryTriePutTest.java +++ b/test/unit/org/apache/cassandra/db/tries/InMemoryTriePutTest.java @@ -25,8 +25,10 @@ import org.junit.Ignore; import org.junit.Test; +import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; import static org.junit.Assert.fail; public class InMemoryTriePutTest extends InMemoryTrieTestBase @@ -48,7 +50,7 @@ public void testLongKey_StackOverflow() throws TrieSpaceExhaustedException try { - trie.putRecursive(ByteComparable.preencoded(byteComparableVersion, buf), "value", (x, y) -> y); + trie.putRecursive(ByteComparable.preencoded(VERSION, buf), "value", (x, y) -> y); Assert.fail("StackOverflowError expected with a recursive put for very long keys!"); } catch (StackOverflowError soe) @@ -56,42 +58,45 @@ public void testLongKey_StackOverflow() throws TrieSpaceExhaustedException // Expected. } // Using non-recursive put should work. - putSimpleResolve(trie, ByteComparable.preencoded(byteComparableVersion, buf), "value", (x, y) -> y, false); + putSimpleResolve(trie, ByteComparable.preencoded(VERSION, buf), "value", (x, y) -> y, false); } - // This tests that trie space allocation works correctly close to the 2G limit. It is normally disabled because - // the test machines don't provide enough heap memory (test requires ~8G heap to finish). Run it manually when - // InMemoryTrie.allocateBlock is modified. + /// This tests that trie space allocation works correctly close to the 2G limit. It is normally disabled because + /// the test machines don't provide enough heap memory (test requires ~8G heap to finish). 
Run it manually and + /// separately (so that the size limit can be set before the in-memory trie static initialization) when + /// [BufferManagerMultibuf#allocateNewCell] is modified. @Ignore @Test public void testOver1GSize() throws TrieSpaceExhaustedException { + CassandraRelevantProperties.MEMTABLE_TRIE_SIZE_LIMIT.setInt(1024); InMemoryTrie trie = strategy.create(); - trie.advanceAllocatedPos(0x20000000); + BufferManagerMultibuf mgr = ((BufferManagerMultibuf) trie.bufferManager); + mgr.advanceAllocatedPos(0x20000000); String t1 = "test1"; String t2 = "testing2"; String t3 = "onemoretest3"; - trie.putRecursive(ByteComparable.of(t1), t1, (x, y) -> y); - Assert.assertEquals(t1, trie.get(ByteComparable.of(t1))); - Assert.assertNull(trie.get(ByteComparable.of(t2))); + trie.putRecursive(TrieUtil.comparable(t1), t1, (x, y) -> y); + Assert.assertEquals(t1, trie.get(TrieUtil.comparable(t1))); + Assert.assertNull(trie.get(TrieUtil.comparable(t2))); Assert.assertFalse(trie.reachedAllocatedSizeThreshold()); - trie.advanceAllocatedPos(InMemoryTrie.ALLOCATED_SIZE_THRESHOLD + 0x1000); - trie.putRecursive(ByteComparable.of(t2), t2, (x, y) -> y); - Assert.assertEquals(t1, trie.get(ByteComparable.of(t1))); - Assert.assertEquals(t2, trie.get(ByteComparable.of(t2))); - Assert.assertNull(trie.get(ByteComparable.of(t3))); + mgr.advanceAllocatedPos(BufferManagerMultibuf.ALLOCATED_SIZE_THRESHOLD + 0x1000); + trie.putRecursive(TrieUtil.comparable(t2), t2, (x, y) -> y); + Assert.assertEquals(t1, trie.get(TrieUtil.comparable(t1))); + Assert.assertEquals(t2, trie.get(TrieUtil.comparable(t2))); + Assert.assertNull(trie.get(TrieUtil.comparable(t3))); Assert.assertTrue(trie.reachedAllocatedSizeThreshold()); - trie.advanceAllocatedPos(0x7FFFFEE0); // close to 2G - Assert.assertEquals(t1, trie.get(ByteComparable.of(t1))); - Assert.assertEquals(t2, trie.get(ByteComparable.of(t2))); - Assert.assertNull(trie.get(ByteComparable.of(t3))); + mgr.advanceAllocatedPos(-1); // as close to the limit as 
possible, next allocation should trigger an exception + Assert.assertEquals(t1, trie.get(TrieUtil.comparable(t1))); + Assert.assertEquals(t2, trie.get(TrieUtil.comparable(t2))); + Assert.assertNull(trie.get(TrieUtil.comparable(t3))); Assert.assertTrue(trie.reachedAllocatedSizeThreshold()); try { - trie.putRecursive(ByteComparable.of(t3), t3, (x, y) -> y); // should put it over the edge + trie.putRecursive(TrieUtil.comparable(t3), t3, (x, y) -> y); // should put it over the edge fail("InMemoryTrie.SpaceExhaustedError was expected"); } catch (TrieSpaceExhaustedException e) @@ -99,14 +104,14 @@ public void testOver1GSize() throws TrieSpaceExhaustedException // expected } - Assert.assertEquals(t1, trie.get(ByteComparable.of(t1))); - Assert.assertEquals(t2, trie.get(ByteComparable.of(t2))); - Assert.assertNull(trie.get(ByteComparable.of(t3))); + Assert.assertEquals(t1, trie.get(TrieUtil.comparable(t1))); + Assert.assertEquals(t2, trie.get(TrieUtil.comparable(t2))); + Assert.assertNull(trie.get(TrieUtil.comparable(t3))); Assert.assertTrue(trie.reachedAllocatedSizeThreshold()); try { - trie.advanceAllocatedPos(Integer.MAX_VALUE); + mgr.advanceAllocatedPos(Integer.MAX_VALUE); fail("InMemoryTrie.SpaceExhaustedError was expected"); } catch (TrieSpaceExhaustedException e) @@ -114,9 +119,9 @@ public void testOver1GSize() throws TrieSpaceExhaustedException // expected } - Assert.assertEquals(t1, trie.get(ByteComparable.of(t1))); - Assert.assertEquals(t2, trie.get(ByteComparable.of(t2))); - Assert.assertNull(trie.get(ByteComparable.of(t3))); + Assert.assertEquals(t1, trie.get(TrieUtil.comparable(t1))); + Assert.assertEquals(t2, trie.get(TrieUtil.comparable(t2))); + Assert.assertNull(trie.get(TrieUtil.comparable(t3))); Assert.assertTrue(trie.reachedAllocatedSizeThreshold()); trie.discardBuffers(); diff --git a/test/unit/org/apache/cassandra/db/tries/InMemoryTrieTestBase.java b/test/unit/org/apache/cassandra/db/tries/InMemoryTrieTestBase.java index adfad17c4ab2..330e86d4c752 
100644 --- a/test/unit/org/apache/cassandra/db/tries/InMemoryTrieTestBase.java +++ b/test/unit/org/apache/cassandra/db/tries/InMemoryTrieTestBase.java @@ -24,37 +24,44 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import com.google.common.base.Predicates; +import com.google.common.base.Throwables; import com.google.common.collect.HashMultiset; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; import com.google.common.collect.Multiset; import org.junit.Assert; +import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; +import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.io.compress.BufferType; import org.apache.cassandra.utils.ByteBufferUtil; -import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.bytecomparable.ByteSource; +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.Preencoded; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; @RunWith(Parameterized.class) public abstract class InMemoryTrieTestBase { + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + // Set this to true (in combination with smaller count) to dump the tries while debugging a problem. // Do not commit the code with VERBOSE = true. - private static final boolean VERBOSE = false; - - // Set to true by some tests that need prefix-free keys. 
- static boolean prefixFree = false; + static final boolean VERBOSE = false; private static final int COUNT = 100000; - private static final int KEY_CHOICE = 25; - private static final int MIN_LENGTH = 10; - private static final int MAX_LENGTH = 50; Random rand = new Random(); @@ -79,12 +86,12 @@ static ByteSource invert(ByteSource src) @Test public void testSingle() { - ByteComparable e = ByteComparable.of("test"); + Preencoded e = TrieUtil.comparable("test"); InMemoryTrie trie = strategy.create(); putSimpleResolve(trie, e, "test", (x, y) -> y); System.out.println("Trie " + trie.dump()); assertEquals("test", trie.get(e)); - assertEquals(null, trie.get(ByteComparable.of("teste"))); + assertEquals(null, trie.get(TrieUtil.comparable("teste"))); } public enum ReuseStrategy @@ -93,40 +100,43 @@ public enum ReuseStrategy { InMemoryTrie create() { - return InMemoryTrie.shortLived(byteComparableVersion); + return InMemoryTrie.shortLived(VERSION); } }, LONG_LIVED { InMemoryTrie create() { - return InMemoryTrie.longLived(byteComparableVersion, BufferType.OFF_HEAP, null); + return InMemoryTrie.longLived(VERSION, BufferType.OFF_HEAP, null); + } + }, + SHORT_LIVED_ORDERED + { + InMemoryTrie create() + { + return InMemoryTrie.shortLivedOrdered(VERSION); } }; abstract InMemoryTrie create(); } - @Parameterized.Parameters(name="{0} version {1}") + @Parameterized.Parameters(name="{0}") public static List generateData() { var list = new ArrayList(); for (var s : ReuseStrategy.values()) - for (var v : ByteComparable.Version.values()) - list.add(new Object[] {s, v}); + list.add(new Object[] {s}); return list; } @Parameterized.Parameter(0) - public static ReuseStrategy strategy = ReuseStrategy.LONG_LIVED; - - @Parameterized.Parameter(1) - public static ByteComparable.Version byteComparableVersion = ByteComparable.Version.OSS50; + public static ReuseStrategy strategy = ReuseStrategy.SHORT_LIVED; - public static Comparator forwardComparator = - (bytes1, bytes2) -> 
ByteComparable.compare(bytes1, bytes2, byteComparableVersion); - public static Comparator reverseComparator = - (bytes1, bytes2) -> ByteComparable.compare(invert(bytes1), invert(bytes2), byteComparableVersion); + public static Comparator forwardComparator = + (bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION); + public static Comparator reverseComparator = + (bytes1, bytes2) -> ByteComparable.compare(invert(bytes1), invert(bytes2), VERSION); @Test public void testSplitMulti() @@ -152,7 +162,7 @@ public void testSparse00bug() InMemoryTrie trie = strategy.create(); for (String test : tests) { - ByteComparable e = ByteComparable.preencoded(byteComparableVersion, ByteBufferUtil.hexToBytes(test)); + Preencoded e = ByteComparable.preencoded(VERSION, ByteBufferUtil.hexToBytes(test)); System.out.println("Adding " + asString(e) + ": " + test); putSimpleResolve(trie, e, test, (x, y) -> y); } @@ -160,7 +170,7 @@ public void testSparse00bug() System.out.println(trie.dump()); for (String test : tests) - assertEquals(test, trie.get(ByteComparable.preencoded(byteComparableVersion, ByteBufferUtil.hexToBytes(test)))); + assertEquals(test, trie.get(ByteComparable.preencoded(VERSION, ByteBufferUtil.hexToBytes(test)))); Arrays.sort(tests); @@ -184,7 +194,7 @@ public void testUpdateContent() { String test = tests[i]; String v = values[i]; - ByteComparable e = ByteComparable.of(test); + Preencoded e = TrieUtil.comparable(test); System.out.println("Adding " + asString(e) + ": " + v); putSimpleResolve(trie, e, v, (x, y) -> "" + x + y); System.out.println("Trie " + trie.dump()); @@ -198,160 +208,40 @@ public void testUpdateContent() .filter(x -> tests[x] == test) .map(x -> values[x]) .reduce("", (x, y) -> "" + x + y), - trie.get(ByteComparable.of(test))); - } - } - - static class SpecStackEntry - { - Object[] children; - int curChild; - Object content; - SpecStackEntry parent; - - public SpecStackEntry(Object[] spec, Object content, SpecStackEntry parent, Direction 
direction) - { - this.children = spec; - this.content = content; - this.parent = parent; - this.curChild = direction.select(-1, spec.length); - } - } - - public static class CursorFromSpec implements Trie.Cursor - { - SpecStackEntry stack; - int depth; - Direction direction; - - CursorFromSpec(Object[] spec, Direction direction) - { - this.direction = direction; - stack = new SpecStackEntry(spec, null, null, direction); - depth = 0; - } - - public int advance() - { - SpecStackEntry current = stack; - while (current != null && !direction.inLoop(current.curChild += direction.increase, 0, current.children.length - 1)) - { - current = current.parent; - --depth; - } - if (current == null) - { - assert depth == -1; - return depth; - } - - Object child = current.children[current.curChild]; - if (child instanceof Object[]) - stack = new SpecStackEntry((Object[]) child, null, current, direction); - else - stack = new SpecStackEntry(new Object[0], child, current, direction); - - return ++depth; - } - - public int skipTo(int skipDepth, int skipTransition) - { - assert skipDepth <= depth + 1 : "skipTo descends more than one level"; - - while (skipDepth < depth) - { - --depth; - stack = stack.parent; - } - int index = skipTransition - 0x30; - assert direction.gt(index, stack.curChild) : "Backwards skipTo"; - if (direction.gt(index, direction.select(stack.children.length - 1, 0))) - { - --depth; - stack = stack.parent; - return advance(); - } - stack.curChild = index - direction.increase; - return advance(); - } - - public int depth() - { - return depth; - } - - public ByteBuffer content() - { - return (ByteBuffer) stack.content; - } - - public int incomingTransition() - { - SpecStackEntry parent = stack.parent; - return parent != null ? 
parent.curChild + 0x30 : -1; - } - - @Override - public Direction direction() - { - return direction; - } - - @Override - public ByteComparable.Version byteComparableVersion() - { - return byteComparableVersion; - } - - @Override - public Trie tailTrie() - { - throw new UnsupportedOperationException("tailTrie on test cursor"); + trie.get(TrieUtil.comparable(test))); } } - static Trie specifiedTrie(Object[] nodeDef) - { - return new Trie() - { - @Override - protected Cursor cursor(Direction direction) - { - return new CursorFromSpec(nodeDef, direction); - } - }; - } - @Test - public void testEntriesNullChildBug() + public void testEntriesNullChildBug() throws TrieSpaceExhaustedException { Object[] trieDef = new Object[] - { - new Object[] { // 0 - ByteBufferUtil.bytes(1), // 01 - ByteBufferUtil.bytes(2) // 02 - }, - // If requestChild returns null, bad things can happen (DB-2982) - null, // 1 - ByteBufferUtil.bytes(3), // 2 - new Object[] { // 3 - ByteBufferUtil.bytes(4), // 30 - // Also try null on the Remaining.ONE path - null // 31 - }, - ByteBufferUtil.bytes(5), // 4 - // Also test requestUniqueDescendant returning null - new Object[] { // 5 - new Object[] { // 50 - new Object[] { // 500 - null // 5000 - } - } - }, - ByteBufferUtil.bytes(6) // 6 - }; - - SortedMap expected = new TreeMap<>(forwardComparator); + { + new Object[] { // 0 + ByteBufferUtil.bytes(1), // 01 + ByteBufferUtil.bytes(2) // 02 + }, + // If requestChild returns null, bad things can happen (DB-2982) + null, // 1 + ByteBufferUtil.bytes(3), // 2 + new Object[] { // 3 + ByteBufferUtil.bytes(4), // 30 + // Also try null on the Remaining.ONE path + null // 31 + }, + ByteBufferUtil.bytes(5), // 4 + // Also test requestUniqueDescendant returning null + new Object[] { // 5 + new Object[] { // 50 + new Object[] { // 500 + null // 5000 + } + } + }, + ByteBufferUtil.bytes(6) // 6 + }; + + SortedMap expected = new TreeMap<>(forwardComparator); expected.put(comparable("00"), ByteBufferUtil.bytes(1)); 
expected.put(comparable("01"), ByteBufferUtil.bytes(2)); expected.put(comparable("2"), ByteBufferUtil.bytes(3)); @@ -359,28 +249,33 @@ public void testEntriesNullChildBug() expected.put(comparable("4"), ByteBufferUtil.bytes(5)); expected.put(comparable("6"), ByteBufferUtil.bytes(6)); - Trie trie = specifiedTrie(trieDef); + Trie trie = TrieUtil.specifiedTrie(trieDef); System.out.println(trie.dump()); assertSameContent(trie, expected); + + InMemoryTrie inmem = strategy.create(); + inmem.apply(trie, (x, y) -> y, Predicates.alwaysFalse()); + System.out.println(inmem.dump()); + assertSameContent(inmem, expected); } - static ByteComparable comparable(String s) + static Preencoded comparable(String s) { ByteBuffer b = ByteBufferUtil.bytes(s); - return ByteComparable.preencoded(byteComparableVersion, b); + return ByteComparable.preencoded(VERSION, b); } @Test public void testDirect() { - ByteComparable[] src = generateKeys(rand, COUNT); - SortedMap content = new TreeMap<>(forwardComparator); + Preencoded[] src = TrieUtil.generateKeys(rand, COUNT); + SortedMap content = new TreeMap<>(forwardComparator); InMemoryTrie trie = makeInMemoryTrie(src, content, usePut()); int keysize = Arrays.stream(src) - .mapToInt(src1 -> ByteComparable.length(src1, byteComparableVersion)) + .mapToInt(src1 -> ByteComparable.length(src1, VERSION)) .sum(); long ts = ObjectSizes.measureDeep(content); - long onh = ObjectSizes.measureDeep(trie.contentArrays); + long onh = ObjectSizes.measureDeep(((ContentManagerPojo) trie.contentManager).contentArrays); System.out.format("Trie size on heap %,d off heap %,d measured %,d keys %,d treemap %,d\n", trie.usedSizeOnHeap(), trie.usedSizeOffHeap(), onh, keysize, ts); System.out.format("per entry on heap %.2f off heap %.2f measured %.2f keys %.2f treemap %.2f\n", @@ -437,9 +332,9 @@ public void testPrefixEvolution() } @Test - public void testPrefixUnsafeMulti() + public void testPrefixUnsafeChain() { - // Make sure prefixes on inside a multi aren't overwritten 
by embedded metadata node. + // Make sure prefixes on inside a chain aren't overwritten by embedded metadata node. testEntries(new String[] { "test89012345678901234567890", "test8", @@ -454,9 +349,9 @@ public void testPrefixUnsafeMulti() private void testEntries(String[] tests) { - for (Function mapping : - ImmutableList.>of(ByteComparable::of, - s -> ByteComparable.preencoded(byteComparableVersion, s.getBytes()))) + for (Function mapping : + ImmutableList.>of(TrieUtil::comparable, + s -> ByteComparable.preencoded(VERSION, s.getBytes()))) { testEntries(tests, mapping); } @@ -464,18 +359,18 @@ private void testEntries(String[] tests) private void testEntriesHex(String[] tests) { - testEntries(tests, s -> ByteComparable.preencoded(byteComparableVersion, ByteBufferUtil.hexToBytes(s))); + testEntries(tests, s -> ByteComparable.preencoded(VERSION, ByteBufferUtil.hexToBytes(s))); // Run the other translations just in case. testEntries(tests); } - private void testEntries(String[] tests, Function mapping) + private void testEntries(String[] tests, Function mapping) { InMemoryTrie trie = strategy.create(); for (String test : tests) { - ByteComparable e = mapping.apply(test); + Preencoded e = mapping.apply(test); System.out.println("Adding " + asString(e) + ": " + test); putSimpleResolve(trie, e, test, (x, y) -> y); System.out.println("Trie\n" + trie.dump()); @@ -483,10 +378,76 @@ private void testEntries(String[] tests, Function mappin for (String test : tests) assertEquals(test, trie.get(mapping.apply(test))); + + testDeletions(tests, mapping, trie); + + randomizedTestEntries(tests, mapping, trie); + } + + private void testDeletions(String[] tests, Function mapping, InMemoryTrie trie) + { + System.out.println("\nDeleting all entries"); + List toDelete = Arrays.stream(tests).distinct().collect(Collectors.toList()); + while (!toDelete.isEmpty()) + { + int index = rand.nextInt(toDelete.size()); + String entry = toDelete.remove(index); + Preencoded e = mapping.apply(entry); + 
System.out.println("Deleting " + asString(e) + ": " + entry); + delete(trie, e); + System.out.println("Trie\n" + trie.dump()); + + for (String test : toDelete) + assertEquals(test, trie.get(mapping.apply(test))); + } + assertTrue(trie.isEmpty()); + if (((BufferManagerMultibuf) trie.bufferManager).cellAllocator instanceof MemoryAllocationStrategy.OpOrderReuseStrategy) + { + assertEquals(0L, trie.bufferManager.usedBufferSpace()); + assertEquals(0L, ((ContentManagerPojo) trie.contentManager).usedObjectSpace()); + } + } + + private void randomizedTestEntries(String[] tests, Function mapping, InMemoryTrie trie) + { + System.out.println("\nRandomized insert and delete"); + List toInsert = Arrays.stream(tests).distinct().collect(Collectors.toList()); + List inserted = new ArrayList<>(); + + while (!toInsert.isEmpty()) + { + if (rand.nextDouble() > 0.35) + { + // Insert one value + int index = rand.nextInt(toInsert.size()); + String entry = toInsert.remove(index); + Preencoded e = mapping.apply(entry); + System.out.println("Adding " + asString(e) + ": " + entry); + putSimpleResolve(trie, e, entry, (x, y) -> y); + System.out.println("Trie\n" + trie.dump()); + inserted.add(entry); + } + else if (!inserted.isEmpty()) + { + // Delete one value + int index = rand.nextInt(inserted.size()); + String entry = inserted.remove(index); + Preencoded e = mapping.apply(entry); + System.out.println("Deleting " + asString(e) + ": " + entry); + delete(trie, e); + System.out.println("Trie\n" + trie.dump()); + toInsert.add(entry); + } + + for (String test : inserted) + assertEquals(test, trie.get(mapping.apply(test))); + for (String test: toInsert) + assertEquals(null, trie.get(mapping.apply(test))); + } } - static InMemoryTrie makeInMemoryTrie(ByteComparable[] src, - Map content, + static InMemoryTrie makeInMemoryTrie(Preencoded[] src, + Map content, boolean usePut) { @@ -495,18 +456,18 @@ static InMemoryTrie makeInMemoryTrie(ByteComparable[] src, return trie; } - static void 
addToInMemoryTrie(ByteComparable[] src, - Map content, + static void addToInMemoryTrie(Preencoded[] src, + Map content, InMemoryTrie trie, boolean usePut) { - for (ByteComparable b : src) + for (Preencoded b : src) addToInMemoryTrie(content, trie, usePut, b); } - static void addNthToInMemoryTrie(ByteComparable[] src, - Map content, + static void addNthToInMemoryTrie(Preencoded[] src, + Map content, InMemoryTrie trie, boolean usePut, int divisor, @@ -514,7 +475,7 @@ static void addNthToInMemoryTrie(ByteComparable[] src, { int i = 0; - for (ByteComparable b : src) + for (Preencoded b : src) { if (i++ % divisor != remainder) continue; @@ -523,7 +484,7 @@ static void addNthToInMemoryTrie(ByteComparable[] src, } } - private static void addToInMemoryTrie(Map content, InMemoryTrie trie, boolean usePut, ByteComparable b) + private static void addToInMemoryTrie(Map content, InMemoryTrie trie, boolean usePut, Preencoded b) { // Note: Because we don't ensure order when calling resolve, just use a hash of the key as payload // (so that all sources have the same value). @@ -537,11 +498,11 @@ private static void addToInMemoryTrie(Map content, I System.out.println(trie.dump(x -> string(x))); } - static void addToMap(ByteComparable[] src, - Map content) + static void addToMap(Preencoded[] src, + Map content) { - for (ByteComparable b : src) + for (Preencoded b : src) { // Note: Because we don't ensure order when calling resolve, just use a hash of the key as payload // (so that all sources have the same value). @@ -551,18 +512,18 @@ static void addToMap(ByteComparable[] src, } } - private static String string(Object x) + static String string(Object x) { return x instanceof ByteBuffer ? ByteBufferUtil.bytesToHex((ByteBuffer) x) - : x instanceof ByteComparable - ? ((ByteComparable) x).byteComparableAsString(byteComparableVersion) + : x instanceof Preencoded + ? 
((Preencoded) x).byteComparableAsString(VERSION) : x.toString(); } - static void checkGet(Trie trie, Map items) + static void checkGet(BaseTrie trie, Map items) { - for (Map.Entry en : items.entrySet()) + for (Map.Entry en : items.entrySet()) { if (VERBOSE) System.out.println("Checking " + asString(en.getKey()) + ": " + ByteBufferUtil.bytesToHex(en.getValue())); @@ -570,7 +531,7 @@ static void checkGet(Trie trie, Map trie, SortedMap map) + static void assertSameContent(Trie trie, SortedMap map) { assertMapEquals(trie, map, Direction.FORWARD); assertForEachEntryEquals(trie, map, Direction.FORWARD); @@ -582,12 +543,12 @@ static void assertSameContent(Trie trie, SortedMap trie, SortedMap map) + private static void assertValuesEqual(Trie trie, SortedMap map) { assertIterablesEqual(trie.values(), map.values()); } - private static void assertUnorderedValuesEqual(Trie trie, SortedMap map) + private static void assertUnorderedValuesEqual(Trie trie, SortedMap map) { Multiset unordered = HashMultiset.create(); StringBuilder errors = new StringBuilder(); @@ -604,36 +565,36 @@ private static void assertUnorderedValuesEqual(Trie trie, SortedMap< assertEquals("", errors.toString()); } - static Collection maybeReversed(Direction direction, Collection data) + static Collection maybeReversed(Direction direction, Collection data) { return direction.isForward() ? data : reorderBy(data, reverseComparator); } - static Map maybeReversed(Direction direction, Map data) + static Map maybeReversed(Direction direction, Map data) { return direction.isForward() ? 
data : reorderBy(data, reverseComparator); } - private static Map reorderBy(Map data, Comparator comparator) + private static Map reorderBy(Map data, Comparator comparator) { - Map newMap = new TreeMap<>(comparator); + Map newMap = new TreeMap<>(comparator); newMap.putAll(data); return newMap; } - private static void assertForEachEntryEquals(Trie trie, SortedMap map, Direction direction) + private static void assertForEachEntryEquals(Trie trie, SortedMap map, Direction direction) { - Iterator> it = maybeReversed(direction, map).entrySet().iterator(); + Iterator> it = maybeReversed(direction, map).entrySet().iterator(); trie.forEachEntry(direction, (key, value) -> { Assert.assertTrue("Map exhausted first, key " + asString(key), it.hasNext()); - Map.Entry entry = it.next(); - assertEquals(0, ByteComparable.compare(entry.getKey(), key, byteComparableVersion)); + Map.Entry entry = it.next(); + assertEquals(0, ByteComparable.compare(entry.getKey(), key, VERSION)); assertEquals(entry.getValue(), value); }); Assert.assertFalse("Trie exhausted first", it.hasNext()); } - private static void assertForEachValueEquals(Trie trie, SortedMap map) + private static void assertForEachValueEquals(Trie trie, SortedMap map) { Iterator it = map.values().iterator(); trie.forEachValue(value -> { @@ -644,7 +605,7 @@ private static void assertForEachValueEquals(Trie trie, SortedMap trie, SortedMap map, Direction direction) + static void assertMapEquals(Trie trie, SortedMap map, Direction direction) { assertMapEquals(trie.entryIterator(direction), maybeReversed(direction, map).entrySet().iterator()); } @@ -656,29 +617,29 @@ static Collection reorderBy(Collection original, Comparator compara return list; } - static + static void assertMapEquals(Iterator> it1, Iterator> it2) { - List failedAt = new ArrayList<>(); + List failedAt = new ArrayList<>(); StringBuilder b = new StringBuilder(); while (it1.hasNext() && it2.hasNext()) { - Map.Entry en1 = it1.next(); - Map.Entry en2 = it2.next(); + 
Map.Entry en1 = it1.next(); + Map.Entry en2 = it2.next(); b.append(String.format("TreeSet %s:%s\n", asString(en2.getKey()), ByteBufferUtil.bytesToHex(en2.getValue()))); b.append(String.format("Trie %s:%s\n", asString(en1.getKey()), ByteBufferUtil.bytesToHex(en1.getValue()))); - if (ByteComparable.compare(en1.getKey(), en2.getKey(), byteComparableVersion) != 0 || ByteBufferUtil.compareUnsigned(en1.getValue(), en2.getValue()) != 0) + if (ByteComparable.compare(en1.getKey(), en2.getKey(), VERSION) != 0 || ByteBufferUtil.compareUnsigned(en1.getValue(), en2.getValue()) != 0) failedAt.add(en1.getKey()); } while (it1.hasNext()) { - Map.Entry en1 = it1.next(); + Map.Entry en1 = it1.next(); b.append(String.format("Trie %s:%s\n", asString(en1.getKey()), ByteBufferUtil.bytesToHex(en1.getValue()))); failedAt.add(en1.getKey()); } while (it2.hasNext()) { - Map.Entry en2 = it2.next(); + Map.Entry en2 = it2.next(); b.append(String.format("TreeSet %s:%s\n", asString(en2.getKey()), ByteBufferUtil.bytesToHex(en2.getValue()))); failedAt.add(en2.getKey()); } @@ -705,61 +666,21 @@ else if (actual.hasNext()) Assert.fail("Remaing values in actual, starting with " + actual.next()); } - static ByteComparable[] generateKeys(Random rand, int count) - { - ByteComparable[] sources = new ByteComparable[count]; - TreeSet added = new TreeSet<>(forwardComparator); - for (int i = 0; i < count; ++i) - { - sources[i] = generateKey(rand); - if (!added.add(sources[i])) - --i; - } - - // note: not sorted! 
- return sources; - } - - static ByteComparable generateKey(Random rand) - { - return generateKey(rand, MIN_LENGTH, MAX_LENGTH); - } - - static ByteComparable generateKey(Random rand, int minLength, int maxLength) - { - int len = rand.nextInt(maxLength - minLength + 1) + minLength; - byte[] bytes = new byte[len]; - int p = 0; - int length = bytes.length; - while (p < length) - { - int seed = rand.nextInt(KEY_CHOICE); - Random r2 = new Random(seed); - int m = r2.nextInt(5) + 2 + p; - if (m > length) - m = length; - while (p < m) - bytes[p++] = (byte) r2.nextInt(256); - } - return prefixFree ? v -> ByteSource.withTerminator(ByteSource.TERMINATOR, ByteSource.of(bytes, v)) - : ByteComparable.preencoded(byteComparableVersion, bytes); - } - - static String asString(ByteComparable bc) + static String asString(Preencoded bc) { - return bc != null ? bc.byteComparableAsString(byteComparableVersion) : "null"; + return bc != null ? bc.byteComparableAsString(VERSION) : "null"; } - void putSimpleResolve(InMemoryTrie trie, - ByteComparable key, - T value, - Trie.MergeResolver resolver) + void putSimpleResolve(InMemoryTrie trie, + Preencoded key, + T value, + Trie.MergeResolver resolver) { putSimpleResolve(trie, key, value, resolver, usePut()); } - static void putSimpleResolve(InMemoryTrie trie, - ByteComparable key, + static void putSimpleResolve(InMemoryTrie trie, + Preencoded key, T value, Trie.MergeResolver resolver, boolean usePut) @@ -777,4 +698,24 @@ static void putSimpleResolve(InMemoryTrie trie, throw new AssertionError(e); } } + + void delete(InMemoryTrie trie, ByteComparable key) + { + delete(trie, key, usePut()); + } + + static void delete(InMemoryTrie trie, ByteComparable key, boolean usePut) + { + try + { + trie.putSingleton(key, + Boolean.TRUE, + (existing, update) -> update ? 
null : existing, + usePut); + } + catch (TrieSpaceExhaustedException e) + { + throw Throwables.propagate(e); + } + } } diff --git a/test/unit/org/apache/cassandra/db/tries/InMemoryTrieThreadedTest.java b/test/unit/org/apache/cassandra/db/tries/InMemoryTrieThreadedTest.java index 8435772bd067..65da6edb5ed4 100644 --- a/test/unit/org/apache/cassandra/db/tries/InMemoryTrieThreadedTest.java +++ b/test/unit/org/apache/cassandra/db/tries/InMemoryTrieThreadedTest.java @@ -18,592 +18,29 @@ package org.apache.cassandra.db.tries; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.ThreadLocalRandom; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Predicate; - -import org.junit.Assert; -import org.junit.Test; - import org.apache.cassandra.utils.bytecomparable.ByteComparable; -import org.apache.cassandra.utils.bytecomparable.ByteSource; import org.apache.cassandra.utils.concurrent.OpOrder; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.byteComparableVersion; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.generateKeys; +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; -public class InMemoryTrieThreadedTest +public class InMemoryTrieThreadedTest extends ThreadedTestBase> { - private static final int COUNT = 30000; - private static final int OTHERS = COUNT / 10; - private static final int PROGRESS_UPDATE = COUNT / 15; - private static final int READERS = 8; - private static final int WALKERS = 2; - private static final Random rand = new Random(); - - static + @Override + String value(ByteComparable b) { - InMemoryTrieTestBase.prefixFree = true; + return b.byteComparableAsString(VERSION); } - /** - * Force copy every modified cell below the partition/enumeration level. 
Provides atomicity of mutations within the - * partition level as well as consistency. - */ - public static final Predicate> FORCE_COPY_PARTITION = features -> isPartition(features.content()); - /** - * Force copy every modified cell below the earliest branching point. Provides atomicity of mutations at any level, - * but readers/walkers may see inconsistent views of the data, in the sense that older mutations may be missed - * while newer ones are returned. - */ - public static final Predicate> FORCE_ATOMIC = features -> features.isBranching(); - /** - * Do not do any additional copying beyond what is required to build the tries safely for concurrent readers. - * Mutations may be partially seen by readers, and older mutations may be missed while newer ones are returned. - */ - public static final Predicate> NO_ATOMICITY = features -> false; - static Value value(ByteComparable b, ByteComparable cprefix, ByteComparable c, int add, int seqId) + @Override + InMemoryTrie makeTrie(OpOrder readOrder) { - return new Value(b.byteComparableAsString(byteComparableVersion), - (cprefix != null ? 
cprefix.byteComparableAsString(byteComparableVersion) : "") + c.byteComparableAsString(byteComparableVersion), add, seqId); + return InMemoryTrie.longLived(VERSION, readOrder); } - static String value(ByteComparable b) + @Override + void add(InMemoryTrie trie, ByteComparable b, String v, int iteration) throws TrieSpaceExhaustedException { - return b.byteComparableAsString(byteComparableVersion); - } - - @Test - public void testThreaded() throws InterruptedException - { - OpOrder readOrder = new OpOrder(); - ByteComparable[] src = generateKeys(rand, COUNT + OTHERS); - InMemoryTrie trie = InMemoryTrie.longLived(byteComparableVersion, readOrder); - ConcurrentLinkedQueue errors = new ConcurrentLinkedQueue<>(); - List threads = new ArrayList<>(); - AtomicBoolean writeCompleted = new AtomicBoolean(false); - AtomicInteger writeProgress = new AtomicInteger(0); - - for (int i = 0; i < WALKERS; ++i) - threads.add(new Thread(() -> { - try - { - while (!writeCompleted.get()) - { - int min = writeProgress.get(); - int count = 0; - try (OpOrder.Group group = readOrder.start()) - { - for (Map.Entry en : trie.entrySet()) - { - String v = value(en.getKey()); - Assert.assertEquals(en.getKey().byteComparableAsString(byteComparableVersion), v, en.getValue()); - ++count; - } - } - Assert.assertTrue("Got only " + count + " while progress is at " + min, count >= min); - } - } - catch (Throwable t) - { - t.printStackTrace(); - errors.add(t); - } - })); - - for (int i = 0; i < READERS; ++i) - { - threads.add(new Thread(() -> { - try - { - Random r = ThreadLocalRandom.current(); - while (!writeCompleted.get()) - { - int min = writeProgress.get(); - - for (int i1 = 0; i1 < PROGRESS_UPDATE; ++i1) - { - int index = r.nextInt(COUNT + OTHERS); - ByteComparable b = src[index]; - String v = value(b); - try (OpOrder.Group group = readOrder.start()) - { - String result = trie.get(b); - if (result != null) - { - Assert.assertTrue("Got not added " + index + " when COUNT is " + COUNT, - index < COUNT); 
- Assert.assertEquals("Failed " + index, v, result); - } - else if (index < min) - Assert.fail("Failed index " + index + " while progress is at " + min); - } - } - } - } - catch (Throwable t) - { - t.printStackTrace(); - errors.add(t); - } - })); - } - - threads.add(new Thread(() -> { - try - { - for (int i = 0; i < COUNT; i++) - { - ByteComparable b = src[i]; - - // Note: Because we don't ensure order when calling resolve, just use a hash of the key as payload - // (so that all sources have the same value). - String v = value(b); - trie.putSingleton(b, v, (x, y) -> y, i % 2 != 0); - - if (i % PROGRESS_UPDATE == 0) - writeProgress.set(i); - } - } - catch (Throwable t) - { - t.printStackTrace(); - errors.add(t); - } - finally - { - writeCompleted.set(true); - } - })); - - for (Thread t : threads) - t.start(); - - for (Thread t : threads) - t.join(); - - if (!errors.isEmpty()) - Assert.fail("Got errors:\n" + errors); - } - - static abstract class Content - { - final String pk; - - Content(String pk) - { - this.pk = pk; - } - - abstract boolean isPartition(); - } - - static class Value extends Content - { - final String ck; - final int value; - final int seq; - - Value(String pk, String ck, int value, int seq) - { - super(pk); - this.ck = ck; - this.value = value; - this.seq = seq; - } - - @Override - public String toString() - { - return "Value{" + - "pk='" + pk + '\'' + - ", ck='" + ck + '\'' + - ", value=" + value + - ", seq=" + seq + - '}'; - } - - @Override - boolean isPartition() - { - return false; - } - } - - static class Metadata extends Content - { - int updateCount; - - Metadata(String pk) - { - super(pk); - updateCount = 1; - } - - @Override - boolean isPartition() - { - return true; - } - - Metadata mergeWith(Metadata other) - { - Metadata m = new Metadata(pk); - m.updateCount = updateCount + other.updateCount; - return m; - } - - @Override - public String toString() - { - return "Metadata{" + - "pk='" + pk + '\'' + - ", updateCount=" + updateCount + - 
'}'; - } - } - - static boolean isPartition(Content c) - { - return c != null && c.isPartition(); - } - - @Test - public void testConsistentUpdates() throws Exception - { - // Check that multi-path updates with below-partition-level copying are safe for concurrent readers, - // and that content is atomically applied, i.e. that reader see either nothing from the update or all of it, - // and consistent, i.e. that it is not possible to receive some newer updates while missing - // older ones. (For example, if the sequence of additions is 3, 1, 5, without this requirement a reader - // could see an enumeration which lists 3 and 5 but not 1.) - testAtomicUpdates(3, FORCE_COPY_PARTITION, true, true); - // Note: using 3 per mutation, so that the first and second update fit in a sparse in-memory trie block. - } - - @Test - public void testAtomicUpdates() throws Exception - { - // Check that multi-path updates with below-branching-point copying are safe for concurrent readers, - // and that content is atomically applied, i.e. that reader see either nothing from the update or all of it. - testAtomicUpdates(3, FORCE_ATOMIC, true, false); - } - - @Test - public void testSafeUpdates() throws Exception - { - // Check that multi path updates without additional copying are safe for concurrent readers. - testAtomicUpdates(3, NO_ATOMICITY, false, false); - } - - @Test - public void testConsistentSinglePathUpdates() throws Exception - { - // Check that single path updates with below-partition-level copying are safe for concurrent readers, - // and that content is consistent, i.e. that it is not possible to receive some newer updates while missing - // older ones. (For example, if the sequence of additions is 3, 1, 5, without this requirement a reader - // could see an enumeration which lists 3 and 5 but not 1.) 
- testAtomicUpdates(1, FORCE_COPY_PARTITION, true, true); - } - - - @Test - public void testAtomicSinglePathUpdates() throws Exception - { - // When doing single path updates atomicity comes for free. This only checks that the branching checker is - // not doing anything funny. - testAtomicUpdates(1, FORCE_ATOMIC, true, false); - } - - @Test - public void testSafeSinglePathUpdates() throws Exception - { - // Check that single path updates without additional copying are safe for concurrent readers. - testAtomicUpdates(1, NO_ATOMICITY, true, false); - } - - // The generated keys all start with NEXT_COMPONENT, which makes it impossible to test the precise behavior of the - // partition-level force copying. Strip that byte. - private static ByteComparable[] skipFirst(ByteComparable[] keys) - { - ByteComparable[] result = new ByteComparable[keys.length]; - for (int i = 0; i < keys.length; ++i) - result[i] = skipFirst(keys[i]); - return result; - } - - private static ByteComparable skipFirst(ByteComparable key) - { - return v -> { - var bs = key.asComparableBytes(v); - int n = bs.next(); - assert n != ByteSource.END_OF_STREAM; - return bs; - }; - } - - public void testAtomicUpdates(int PER_MUTATION, - Predicate> forcedCopyChecker, - boolean checkAtomicity, - boolean checkSequence) - throws Exception - { - ByteComparable[] ckeys = skipFirst(generateKeys(rand, COUNT)); - ByteComparable[] pkeys = skipFirst(generateKeys(rand, Math.min(100, COUNT / 10))); // to guarantee repetition - - /* - * Adds COUNT partitions each with perPartition separate clusterings, where the sum of the values - * of all clusterings is 0. - * If the sum for any walk covering whole partitions is non-zero, we have had non-atomic updates. 
- */ - - OpOrder readOrder = new OpOrder(); -// InMemoryTrie trie = new InMemoryTrie<>(new MemtableAllocationStrategy.NoReuseStrategy(BufferType.OFF_HEAP)); - InMemoryTrie trie = InMemoryTrie.longLived(byteComparableVersion, readOrder); - ConcurrentLinkedQueue errors = new ConcurrentLinkedQueue<>(); - List threads = new ArrayList(); - AtomicBoolean writeCompleted = new AtomicBoolean(false); - AtomicInteger writeProgress = new AtomicInteger(0); - - for (int i = 0; i < WALKERS; ++i) - threads.add(new Thread() - { - public void run() - { - try - { - Random r = ThreadLocalRandom.current(); - while (!writeCompleted.get()) - { - int min = writeProgress.get(); - try (OpOrder.Group group = readOrder.start()) - { - Iterable> entries = trie.entrySet(); - checkEntries("", min, true, checkAtomicity, false, PER_MUTATION, entries); - } - } - } - catch (Throwable t) - { - t.printStackTrace(); - errors.add(t); - } - } - }); - - for (int i = 0; i < READERS; ++i) - { - ByteComparable[] srcLocal = pkeys; - threads.add(new Thread() - { - public void run() - { - try - { - // await at least one ready partition - while (writeProgress.get() == 0) {} - - Random r = ThreadLocalRandom.current(); - while (!writeCompleted.get()) - { - ByteComparable key = srcLocal[r.nextInt(srcLocal.length)]; - int min = writeProgress.get() / (pkeys.length * PER_MUTATION) * PER_MUTATION; - Iterable> entries; - - try (OpOrder.Group group = readOrder.start()) - { - entries = trie.tailTrie(key).entrySet(); - checkEntries(" in tail " + key.byteComparableAsString(byteComparableVersion), min, false, checkAtomicity, checkSequence, PER_MUTATION, entries); - } - - try (OpOrder.Group group = readOrder.start()) - { - entries = trie.subtrie(key, nextBranch(key)).entrySet(); - checkEntries(" in branch " + key.byteComparableAsString(byteComparableVersion), min, true, checkAtomicity, checkSequence, PER_MUTATION, entries); - } - } - } - catch (Throwable t) - { - t.printStackTrace(); - errors.add(t); - } - } - }); - } - - 
threads.add(new Thread() - { - public void run() - { - ThreadLocalRandom r = ThreadLocalRandom.current(); - final Trie.CollectionMergeResolver mergeResolver = new Trie.CollectionMergeResolver() - { - @Override - public Content resolve(Content c1, Content c2) - { - if (c1.isPartition() && c2.isPartition()) - return ((Metadata) c1).mergeWith((Metadata) c2); - throw new AssertionError("Test error, keys should be distinct."); - } - - public Content resolve(Collection contents) - { - return contents.stream().reduce(this::resolve).get(); - } - }; - - try - { - int lastUpdate = 0; - for (int i = 0; i < COUNT; i += PER_MUTATION) - { - ByteComparable b = pkeys[(i / PER_MUTATION) % pkeys.length]; - Metadata partitionMarker = new Metadata(b.byteComparableAsString(byteComparableVersion)); - ByteComparable cprefix = null; - if (r.nextBoolean()) - cprefix = ckeys[i]; // Also test branching point below the partition level - - List> sources = new ArrayList<>(); - for (int j = 0; j < PER_MUTATION; ++j) - { - - ByteComparable k = ckeys[i + j]; - Trie row = Trie.singleton(k, byteComparableVersion, - value(b, cprefix, k, - j == 0 ? -PER_MUTATION + 1 : 1, - (i / PER_MUTATION / pkeys.length) * PER_MUTATION + j)); - - if (cprefix != null) - row = row.prefixedBy(cprefix); - - row = withRootMetadata(row, partitionMarker); - row = row.prefixedBy(b); - sources.add(row); - } - - final Trie mutation = Trie.merge(sources, mergeResolver); - - trie.apply(mutation, - (existing, update) -> existing == null ? 
update : mergeResolver.resolve(existing, update), - forcedCopyChecker); - - if (i >= pkeys.length * PER_MUTATION && i - lastUpdate >= PROGRESS_UPDATE) - { - writeProgress.set(i); - lastUpdate = i; - } - } - } - catch (Throwable t) - { - t.printStackTrace(); - errors.add(t); - } - finally - { - writeCompleted.set(true); - } - } - }); - - for (Thread t : threads) - t.start(); - - for (Thread t : threads) - t.join(); - - System.out.format("Reuse %s %s atomicity %s on-heap %,d (+%,d) off-heap %,d\n", - trie.cellAllocator.getClass().getSimpleName(), - trie.bufferType, - forcedCopyChecker == NO_ATOMICITY ? "none" : - forcedCopyChecker == FORCE_ATOMIC ? "atomic" : "consistent partition", - trie.usedSizeOnHeap(), - trie.unusedReservedOnHeapMemory(), - trie.usedSizeOffHeap()); - - if (!errors.isEmpty()) - Assert.fail("Got errors:\n" + errors); - } - - static ByteComparable nextBranch(ByteComparable key) - { - return version -> { - byte[] bytes = key.asByteComparableArray(version); - int last = bytes.length - 1; - while (last >= 0 && bytes[last] == ((byte) 0xFF)) - --last; - if (last < 0) - return null; - ++bytes[last]; - return ByteSource.preencoded(bytes, 0, last + 1); - }; - } - - static Trie withRootMetadata(Trie wrapped, T metadata) - { - return wrapped.mergeWith(Trie.singleton(ByteComparable.EMPTY, byteComparableVersion, metadata), Trie.throwingResolver()); - } - - public void checkEntries(String location, - int min, - boolean usePk, - boolean checkAtomicity, - boolean checkConsecutiveIds, - int PER_MUTATION, - Iterable> entries) - { - long sum = 0; - int count = 0; - long idSum = 0; - long idMax = 0; - int updateCount = 0; - for (var en : entries) - { - String path = en.getKey().byteComparableAsString(byteComparableVersion); - if (en.getValue().isPartition()) - { - Metadata m = (Metadata) en.getValue(); - Assert.assertEquals("Partition metadata" + location, (usePk ? 
m.pk : ""), path); - updateCount += m.updateCount; - continue; - } - final Value value = (Value) en.getValue(); - String valueKey = (usePk ? value.pk : "") + value.ck; - Assert.assertEquals(location, valueKey, path); - ++count; - sum += value.value; - int seq = value.seq; - idSum += seq; - if (seq > idMax) - idMax = seq; - } - - Assert.assertTrue("Values" + location + " should be at least " + min + ", got " + count, min <= count); - - if (checkAtomicity) - { - // If mutations apply atomically, the row count is always a multiple of the mutation size... - Assert.assertTrue("Values" + location + " should be a multiple of " + PER_MUTATION + ", got " + count, count % PER_MUTATION == 0); - // ... and the sum of the values is 0 (as the sum for each individual mutation is 0). - Assert.assertEquals("Value sum" + location, 0, sum); - } - - if (checkConsecutiveIds) - { - // The update count reflected in the partition metadata must match the row count. - Assert.assertEquals("Update count" + location, count, updateCount); - // If mutations apply consistently for the partition, for any row we see we have to have seen all rows that - // were applied before that. In other words, the id sum should be the sum of the integers from 1 to the - // highest id seen in the partition. - Assert.assertEquals("Id sum" + location, idMax * (idMax + 1) / 2, idSum); - } + trie.putSingleton(b, v, (x, y) -> y, iteration % 2 != 0); } } diff --git a/test/unit/org/apache/cassandra/db/tries/IntersectedTailsTest.java b/test/unit/org/apache/cassandra/db/tries/IntersectedTailsTest.java new file mode 100644 index 000000000000..57d40ad9ba09 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/IntersectedTailsTest.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.function.BiFunction; + +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; + +public class IntersectedTailsTest +{ + static final BiFunction applyDeletion = (d, v) -> d.applicableAfter ? 
null : v; + @SuppressWarnings("rawtypes") + static final Trie.CollectionMergeResolver onlySame = x -> + { + if (x.stream().distinct().count() != 1) + throw new AssertionError(); + return x.iterator().next(); + }; + + @BeforeClass + public static void setup() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + InMemoryTrieTestBase.strategy = InMemoryTrieTestBase.ReuseStrategy.SHORT_LIVED_ORDERED; + InMemoryTrieTestBase.reverseComparator = InMemoryTrieTestBase.forwardComparator.reversed(); + } + + @Test + public void testRangeApplyCursor() + { + testIntersectedTails((trie, set) -> + { + RangeTrie setAsRangeTrie = RangeTrie.fromSet(set, TrieSetCursor.RangeState.CONTAINED); + return setAsRangeTrie.applyTo(trie, + applyDeletion); + }); + } + + + @Test + public void testIntersection() + { + testIntersectedTails((trie, set) -> trie.intersectSlicing(set.negation())); + } + + @Test + public void testDeletionAwareMerge() + { + testIntersectedTails((trie, set) -> + { + RangeTrie setAsRangeTrie = RangeTrie.fromSet(set, TrieSetCursor.RangeState.CONTAINED); + DeletionAwareTrie data = DeletionAwareTrie.wrap(trie); + DeletionAwareTrie deletions = DeletionAwareTrie.deletionBranch(ByteComparable.EMPTY, VERSION, setAsRangeTrie); + return data.mergeWith(deletions, DeletionAwareTrie.throwingResolver(), DeletionAwareTrie.throwingResolver(), applyDeletion, true) + .contentOnlyTrie(); + }); + } + + @Test + public void testDeletionAwareCollectionMerge() + { + testIntersectedTails((trie, set) -> + { + RangeTrie setAsRangeTrie = RangeTrie.fromSet(set, TrieSetCursor.RangeState.CONTAINED); + DeletionAwareTrie data = DeletionAwareTrie.wrap(trie); + DeletionAwareTrie deletions = DeletionAwareTrie.deletionBranch(ByteComparable.EMPTY, VERSION, setAsRangeTrie); + //noinspection unchecked + return DeletionAwareTrie.merge(List.of(data, deletions, data, deletions), + onlySame, + onlySame, + applyDeletion, + true) + .contentOnlyTrie(); + }); + } + + public void 
testIntersectedTails(BiFunction, TrieSet, Trie> deleter) + { + String[] data = new String[]{ "abc", "acd", "ade", "adgh", "adhi", "adij", + "bbc", "bcd", "bde", "bdgh", "bdhi", "bdij", + "dbc", "dcd", "dde", "ddgh", "ddhi", "ddij", + "ebc", "ecd", "ede", "edgh", "edhi", "edij", + "gbc", "gcd", "gde", "gdgh", "gdhi", "gdij" }; + String[] ranges = new String[]{ "a", "a", + "c", "c", + "eaa", "ede", + "edjj", "ef", + "ga", "gb", + "gd", "gg" }; + + String[] expected = new String[] {"bbc", "bcd", "bde", "bdgh", "bdhi", "bdij", + "dbc", "dcd", "dde", "ddgh", "ddhi", "ddij", + "edgh", "edhi", "edij", + "gcd" }; + + SortedMap expectedAsMap = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); + InMemoryTrie trie = InMemoryTrieTestBase.makeInMemoryTrie(Arrays.stream(data) + .map(TrieUtil::directComparable) + .toArray(ByteComparable.Preencoded[]::new), + expectedAsMap, + true); + + TrieSet set = TrieSet.ranges(TrieUtil.VERSION, true, true, Arrays.stream(ranges) + .map(TrieUtil::directComparable) + .toArray(ByteComparable.Preencoded[]::new)); + Trie del = deleter.apply(trie, set); + + expectedAsMap.clear(); + InMemoryTrie expTrie = InMemoryTrieTestBase.makeInMemoryTrie(Arrays.stream(expected) + .map(TrieUtil::directComparable) + .toArray(ByteComparable.Preencoded[]::new), + expectedAsMap, + true); + TrieUtil.assertSameContent(del, expectedAsMap); + + // Check a random selection of tails. + Random r = new Random(1); + for (int i = 0; i < 10000; ++i) + { + String k = data[r.nextInt(data.length)]; + String f = k.substring(0, r.nextInt(k.length() + 1)); // "" to all of k + + Trie expTail = expTrie.tailTrie(TrieUtil.directComparable(f)); + Trie tail = del.tailTrie(TrieUtil.directComparable(f)); + checkSameTries(expectedAsMap, expTail, tail, f); + + // Take a tail of the tail too. 
+ String s = k.substring(f.length(), f.length() + r.nextInt(k.length() - f.length() + 1)); + expTail = expTrie.tailTrie(TrieUtil.directComparable(f + s)); + if (tail != null) + tail = tail.tailTrie(TrieUtil.directComparable(s)); + checkSameTries(expectedAsMap, expTail, tail, f + s); + } + } + + private static void checkSameTries(SortedMap expectedAsMap, Trie expTail, Trie tail, String k) + { + expectedAsMap.clear(); + if (expTail != null) + { + for (var entries : expTail.entrySet()) + expectedAsMap.put(entries.getKey(), entries.getValue()); + } + + if (tail == null) + tail = Trie.empty(VERSION); + + try + { + TrieUtil.assertSameContent(tail, expectedAsMap); + } + catch (Throwable t) + { + System.err.println("Prefix " + k); + TrieUtil.dumpToOut(tail); + System.err.println(expectedAsMap); + throw t; + } + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/IntersectionTrieTest.java b/test/unit/org/apache/cassandra/db/tries/IntersectionTrieTest.java new file mode 100644 index 000000000000..08c5e04bb7fb --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/IntersectionTrieTest.java @@ -0,0 +1,692 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.List; +import java.util.NavigableMap; +import java.util.Random; +import java.util.TreeMap; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import com.google.common.base.Predicates; +import com.google.common.collect.ImmutableList; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import com.googlecode.concurrenttrees.common.Iterables; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +import static java.util.Arrays.asList; +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.apache.cassandra.db.tries.TrieUtil.asString; +import static org.apache.cassandra.db.tries.TrieUtil.assertMapEquals; +import static org.apache.cassandra.db.tries.TrieUtil.generateKeys; +import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.makeInMemoryTrie; +import static org.apache.cassandra.db.tries.TrieUtil.toBound; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.Preencoded; +import static org.junit.Assert.assertEquals; + +@RunWith(Parameterized.class) +public class IntersectionTrieTest +{ + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + private static final int COUNT = 15000; + Random rand = new Random(); + int seed = rand.nextInt(); + final static int bitsNeeded = 4; + + @Parameterized.Parameters(name = "bits per transition {0} dropDanglingZero {1}") + public static List data() + { + return IntStream.rangeClosed(1, bitsNeeded) + .boxed() + .flatMap(x -> Stream.of(new Object[] { x, false}, + new Object[] {x, true})) + .collect(Collectors.toList()); + 
} + + @Parameterized.Parameter(0) + public int bits = bitsNeeded; + + @Parameterized.Parameter(1) + public boolean dropDanglingZeros = false; + + public static final Trie.CollectionMergeResolver RESOLVER = new Trie.CollectionMergeResolver<>() + { + public Integer resolve(Collection contents) + { + return contents.iterator().next(); + } + + public Integer resolve(Integer b1, Integer b2) + { + return b1; + } + }; + + public interface RangeOp + { + Trie apply(Trie t, ByteComparable left, ByteComparable right); + } + + @Test + public void testIntersectRangeDirect() throws Exception + { + testIntersectRange(COUNT, Trie::subtrie); + } + + @Test + public void testIntersectRangesOneDirect() throws Exception + { + testIntersectRange(COUNT, (t, l, r) -> t.intersect(TrieSet.ranges(VERSION, l, r))); + } + + public void testIntersectRange(int count, RangeOp op) throws Exception + { + System.out.format("intersectrange seed %d\n", ++seed); + rand.setSeed(seed); + Preencoded[] src1 = generateKeys(rand, count); + NavigableMap content1 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION)); + + Trie t1 = makeInMemoryTrie(src1, content1, true); + + checkEqualRange(content1, t1, null, null, op); + checkEqualRange(content1, t1, TrieUtil.generateKeyBound(rand), null, op); + checkEqualRange(content1, t1, null, TrieUtil.generateKeyBound(rand), op); + + Preencoded l = rand.nextBoolean() ? TrieUtil.generateKeyBound(rand) : toBound(src1[rand.nextInt(src1.length)]); + Preencoded r = rand.nextBoolean() ? 
TrieUtil.generateKeyBound(rand) : toBound(src1[rand.nextInt(src1.length)]); + int cmp = ByteComparable.compare(l, r, VERSION); + if (cmp > 0) + { + Preencoded t = l;l = r;r = t; // swap + } + + checkEqualRange(content1, t1, l, r, op); + } + + public void checkEqualRange(NavigableMap content1, + Trie t1, + Preencoded l, + Preencoded r, + RangeOp op) + { + System.out.format("Intersection with [%s:%s]\n", asString(l), asString(r)); + NavigableMap imap = TrieUtil.boundedMap(content1, l, true, r, false); + + Trie intersection = op.apply(t1, l, r); + + assertMapEquals(intersection, imap, Direction.FORWARD); + assertMapEquals(intersection, imap, Direction.REVERSE); + } + + /** + * Extract the values of the provided trie into a list. + */ + private static List toList(Trie trie, Direction direction) + { + return Iterables.toList(trie.values(direction)); + } + + private Trie fromList(int... list) throws TrieSpaceExhaustedException + { + InMemoryTrie trie = InMemoryTrie.shortLivedOrdered(VERSION); + for (int i : list) + { + trie.putRecursive(of(i), i, (ex, n) -> n); + } + return trie; + } + + /** Creates a {@link ByteComparable} for the provided value by splitting the integer in sequences of "bits" bits. 
*/ + private ByteComparable of(int value) + { + // TODO: Also in all other tests of this type + assert value >= 0 && value <= Byte.MAX_VALUE; + + byte[] splitBytes = new byte[(bitsNeeded + bits - 1) / bits]; + int pos = 0; + int mask = (1 << bits) - 1; + for (int i = bitsNeeded - bits; i > 0; i -= bits) + splitBytes[pos++] = (byte) ((value >> i) & mask); + + splitBytes[pos++] = (byte) (value & mask); + if (dropDanglingZeros) + { + while (pos > 0 && splitBytes[pos - 1] == 0) + --pos; + } + return ByteComparable.preencoded(VERSION, splitBytes, 0, pos); + } + + @Test + public void testSimpleSubtrie() throws TrieSpaceExhaustedException + { + { + Trie trie = fromList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9); + + testIntersection("", asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); + + testIntersection("", asList(3, 4, 5, 6), trie, + TrieSet.rangeExclusiveEnd(VERSION, of(3), of(7))); + + testIntersection("", asList(0, 1, 2, 3, 4, 5, 6), trie, + TrieSet.rangeExclusiveEnd(VERSION, null, of(7))); + + testIntersection("", asList(3, 4, 5, 6, 7, 8, 9), trie, + TrieSet.rangeExclusiveEnd(VERSION, of(3), null)); + + testIntersection("", asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie, + TrieSet.rangeExclusiveEnd(VERSION, null, null)); + + testIntersection("", asList(), trie, + TrieSet.rangeExclusiveEnd(VERSION, of(7), of(7))); + } + } + + @Test + public void testRangeOnSubtrie() throws TrieSpaceExhaustedException + { + { + Trie trie = fromList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9); + + // non-overlapping + testIntersection("", asList(), trie, + TrieSet.rangeExclusiveEnd(VERSION, of(0), of(3)), + TrieSet.rangeExclusiveEnd(VERSION, of(4), of(7))); + // touching, i.e. 
still non-overlapping + testIntersection("", asList(), trie, + TrieSet.rangeExclusiveEnd(VERSION, of(0), of(3)), + TrieSet.rangeExclusiveEnd(VERSION, of(3), of(7))); + // overlapping 1 + testIntersection("", asList(2), trie, + TrieSet.rangeExclusiveEnd(VERSION, of(0), of(3)), + TrieSet.rangeExclusiveEnd(VERSION, of(2), of(7))); + // overlapping 2 + testIntersection("", asList(1, 2), trie, + TrieSet.rangeExclusiveEnd(VERSION, of(0), of(3)), + TrieSet.rangeExclusiveEnd(VERSION, of(1), of(7))); + // covered + testIntersection("", asList(0, 1, 2), trie, + TrieSet.rangeExclusiveEnd(VERSION, of(0), of(3)), + TrieSet.rangeExclusiveEnd(VERSION, of(0), of(7))); + // covered 2 + testIntersection("", asList(1, 2), trie, + TrieSet.rangeExclusiveEnd(VERSION, of(1), of(3)), + TrieSet.rangeExclusiveEnd(VERSION, of(0), of(7))); + // nulls + testIntersection("", asList(1, 2), trie, + TrieSet.rangeExclusiveEnd(VERSION, of(1), of(3)), + TrieSet.rangeExclusiveEnd(VERSION, null, null)); + + // null left + testIntersection("", asList(1, 2), trie, + TrieSet.rangeExclusiveEnd(VERSION, of(1), of(7)), + TrieSet.rangeExclusiveEnd(VERSION, null, of(3))); + + // null left contained + testIntersection("", asList(1, 2), trie, + TrieSet.rangeExclusiveEnd(VERSION, of(1), of(3)), + TrieSet.rangeExclusiveEnd(VERSION, null, of(7))); + + // null right + testIntersection("", asList(1, 2), trie, + TrieSet.rangeExclusiveEnd(VERSION, of(0), of(3)), + TrieSet.rangeExclusiveEnd(VERSION, of(1), null)); + + // null right contained + testIntersection("", asList(1, 2), trie, + TrieSet.rangeExclusiveEnd(VERSION, of(1), of(3)), + TrieSet.rangeExclusiveEnd(VERSION, of(0), null)); + } + } + + @Test + public void testSimpleRanges() throws TrieSpaceExhaustedException + { + { + Trie trie = fromList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9); + + testIntersection("", asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); + + testIntersection("", asList(3, 4, 5, 6), trie, + TrieSet.ranges(VERSION, of(3), of(7))); + + testIntersection("", 
asList(3), trie, + TrieSet.ranges(VERSION, of(3), of(4))); + + testIntersection("", asList(0, 1, 2, 3, 4, 5, 6), trie, + TrieSet.ranges(VERSION, null, of(7))); + + testIntersection("", asList(3, 4, 5, 6, 7, 8, 9), trie, + TrieSet.ranges(VERSION, of(3), null)); + + testIntersection("", asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie, + TrieSet.ranges(VERSION, null, null)); + + testIntersection("", asList(3, 4, 5, 7, 8), trie, + TrieSet.ranges(VERSION, of(3), of(6), of(7), of(9))); + + testIntersection("", asList(3, 7, 8), trie, + TrieSet.ranges(VERSION, of(3), of(4), of(7), of(9))); + + testIntersection("", asList(3, 7, 8), trie, + TrieSet.ranges(VERSION, of(3), of(4), of(7), of(9), of(12), of(15))); + + testIntersection("", asList(0, 1, 2, 3, 4, 5, 7, 8), trie, + TrieSet.ranges(VERSION, null, of(6), of(7), of(9))); + + testIntersection("", asList(3, 4, 5, 7, 8, 9), trie, + TrieSet.ranges(VERSION, of(3), of(6), of(7), null)); + + testIntersection("", asList(0, 1, 2, 3, 4, 5, 7, 8, 9), trie, + TrieSet.ranges(VERSION, null, of(6), of(7), null)); + + // Test some touching slices. 
+ testIntersection("", asList(3, 4, 5, 6, 7, 8), trie, + TrieSet.ranges(VERSION, of(3), of(6), of(6), of(9))); + + testIntersection("", asList(3, 4, 5, 7, 8), trie, + TrieSet.ranges(VERSION, of(3), of(6), of(6), of(6), of(7), of(9))); + } + } + + @Test + public void testRangesOnRangesOne() throws TrieSpaceExhaustedException + { + { + Trie trie = fromList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); + + // non-overlapping + testIntersection("non-overlapping", asList(), trie, + TrieSet.ranges(VERSION, of(0), of(4)), + TrieSet.ranges(VERSION, of(4), of(8))); + // touching + testIntersection("touching", asList(3), trie, + TrieSet.ranges(VERSION, of(0), of(4)), + TrieSet.ranges(VERSION, of(3), of(8))); + // overlapping 1 + testIntersection("overlapping A", asList(2, 3), trie, + TrieSet.ranges(VERSION, of(0), of(4)), + TrieSet.ranges(VERSION, of(2), of(8))); + // overlapping 2 + testIntersection("overlapping B", asList(1, 2, 3), trie, + TrieSet.ranges(VERSION, of(0), of(4)), + TrieSet.ranges(VERSION, of(1), of(8))); + // covered + testIntersection("covered same end A", asList(0, 1, 2, 3), trie, + TrieSet.ranges(VERSION, of(0), of(4)), + TrieSet.ranges(VERSION, of(0), of(8))); + // covered 2 + testIntersection("covered same end B", asList(4, 5, 6, 7), trie, + TrieSet.ranges(VERSION, of(4), of(8)), + TrieSet.ranges(VERSION, of(0), of(8))); + // covered 3 + testIntersection("covered", asList(1, 2, 3), trie, + TrieSet.ranges(VERSION, of(1), of(4)), + TrieSet.ranges(VERSION, of(0), of(8))); + } + } + + @Test + public void testRangesOnRanges() throws TrieSpaceExhaustedException + { + testIntersections(fromList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)); + } + + @Test + public void testRangesOnMerge() throws TrieSpaceExhaustedException + { + + testIntersections(Trie.merge(ImmutableList.of(fromList(0, 1, 2, 3, 5, 8, 9, 13, 14), + fromList(4, 6, 7, 9, 10, 11, 12, 13)), + RESOLVER)); + } + + @Test + public void testRangesOnCollectionMerge2() throws 
TrieSpaceExhaustedException + { + { + List> inputs = ImmutableList.of(fromList(0, 1, 2, 3, 5, 8, 9, 13, 14), + fromList(4, 6, 7, 9, 10, 11, 12, 13)); + testIntersections(dir -> new CollectionMergeCursor.Plain<>(RESOLVER, dir, inputs, Trie::cursor)); + } + } + + @Test + public void testRangesOnCollectionMerge3() throws TrieSpaceExhaustedException + { + testIntersections(Trie.merge( + ImmutableList.of(fromList(0, 1, 2, 3, 5, 8, 9, 13, 14), + fromList(4, 6, 9, 10), + fromList(4, 7, 11, 12, 13)), + RESOLVER)); + } + + @Test + public void testRangesOnCollectionMerge10() throws TrieSpaceExhaustedException + { + testIntersections(Trie.merge( + ImmutableList.of(fromList(0, 14), + fromList(1, 2), + fromList(2, 13), + fromList(3), + fromList(4, 7), + fromList(5, 9, 12), + fromList(6, 8), + fromList(7), + fromList(8), + fromList(10, 11)), + RESOLVER)); + } + + private void testIntersections(Trie trie) + { + testIntersection("", asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14), trie); + + TrieSet set1 = TrieSet.ranges(VERSION, null, of(4), of(5), of(9), of(12), null); + TrieSet set2 = TrieSet.ranges(VERSION, of(2), of(7), of(8), of(10), of(12), of(14)); + TrieSet set3 = TrieSet.ranges(VERSION, of(1), of(2), of(3), of(4), of(5), of(6), of(7), of(8), of(9), of(10)); + + testIntersections(trie, set1, set2, set3); + + testSetAlgebraIntersection(trie); + } + + private void testSetAlgebraIntersection(Trie trie) + { + TrieSet set1 = TrieSet.rangeExclusiveEnd(VERSION, null, of(3)) + .union(TrieSet.rangeExclusiveEnd(VERSION, of(2), of(4))) + .union(TrieSet.rangeExclusiveEnd(VERSION, of(5), of(7))) + .union(TrieSet.rangeExclusiveEnd(VERSION, of(7), of(9))) + .union(TrieSet.rangeExclusiveEnd(VERSION, of(14), of(15))) + .union(TrieSet.rangeExclusiveEnd(VERSION, of(12), null)); + TrieSet set2 = TrieSet.rangeExclusiveEnd(VERSION, of(2), of(7)) + .union(TrieSet.ranges(VERSION, null, of(8), of(10), null).negation()) + .union(TrieSet.ranges(VERSION, of(8), of(10), of(12), of(14))); + 
TrieSet set3 = TrieSet.rangeExclusiveEnd(VERSION, of(1), of(2)) + .union(TrieSet.rangeExclusiveEnd(VERSION, of(3), of(4))) + .union(TrieSet.rangeExclusiveEnd(VERSION, of(5), of(6))) + .union(TrieSet.rangeExclusiveEnd(VERSION, of(7), of(8))) + .union(TrieSet.rangeExclusiveEnd(VERSION, of(9), of(10))); + + testIntersections(trie, set1, set2, set3); + } + + private void testIntersections(Trie trie, TrieSet set1, TrieSet set2, TrieSet set3) + { + testIntersection("1", asList(0, 1, 2, 3, 5, 6, 7, 8, 12, 13, 14), trie, set1); + + testIntersection("2", asList(2, 3, 4, 5, 6, 8, 9, 12, 13), trie, set2); + + testIntersection("3", asList(1, 3, 5, 7, 9), trie, set3); + + testIntersection("12", asList(2, 3, 5, 6, 8, 12, 13), trie, set1, set2); + + testIntersection("13", asList(1, 3, 5, 7), trie, set1, set3); + + testIntersection("23", asList(3, 5, 9), trie, set2, set3); + + testIntersection("123", asList(3, 5), trie, set1, set2, set3); + } + + public void testIntersection(String message, List expected, Trie trie, TrieSet... 
sets) + { + testIntersectionTries(message, expected, trie, sets); + testIntersectionSets(message + " setix", expected, trie, TrieSet.rangeExclusiveEnd(VERSION, null, null), sets); + testIntersectionTriesByRangeApplyTo(message + " applyTo", expected, trie, sets); + testIntersectionTriesByMixed(message + " mixed", expected, trie, sets); + testIntersectionInMemoryTrieDelete(message + " delete", expected, trie, sets); + } + + public void checkEqual(String message, List expected, Trie trie) + { + assertEquals(message + " forward", expected, toList(trie, Direction.FORWARD)); + assertEquals(message + " reverse", expected.stream() + .sorted(Comparator.naturalOrder().reversed()) + .collect(Collectors.toList()), + toList(trie, Direction.REVERSE)); + } + + public void testIntersectionSets(String message, List expected, Trie trie, TrieSet intersectedSet, TrieSet[] sets) + { + // Test that intersecting the given trie with the given sets, in any order, results in the expected list. + // Checks both forward and reverse iteration direction. + if (sets.length == 0) + { + checkEqual(message + " b" + bits, expected, trie.intersectSlicing(intersectedSet)); + } + else + { + for (int toRemove = 0; toRemove < sets.length; ++toRemove) + { + TrieSet set = sets[toRemove]; + testIntersectionSets(message + ' ' + toRemove, expected, + trie, + intersectedSet.intersection(set), + Arrays.stream(sets) + .filter(x -> x != set) + .toArray(TrieSet[]::new) + ); + } + } + } + + public void testIntersectionTries(String message, List expected, Trie trie, TrieSet[] sets) + { + // Test that intersecting the given trie with the given sets, in any order, results in the expected list. + // Checks both forward and reverse iteration direction. 
+ if (sets.length == 0) + { + checkEqual(message + " b" + bits, expected, trie); + } + else + { + for (int toRemove = 0; toRemove < sets.length; ++toRemove) + { + TrieSet set = sets[toRemove]; + testIntersectionTries(message + ' ' + toRemove, expected, + trie.intersectSlicing(set), + Arrays.stream(sets) + .filter(x -> x != set) + .toArray(TrieSet[]::new) + ); + } + } + } + + public void testIntersectionTriesByMixed(String message, List expected, Trie trie, TrieSet[] sets) + { + // Test that intersecting the given trie with the given sets, in any order, results in the expected list. + // Checks both forward and reverse iteration direction. + if (sets.length == 0) + { + checkEqual(message + " b" + bits, expected, trie); + } + else + { + for (int toRemove = 0; toRemove < sets.length; ++toRemove) + { + TrieSet set = sets[toRemove]; + Trie next; + if (((toRemove ^ sets.length ^ bits) & 1) == 0) + next = applySet(set, trie); + else + next = trie.intersectSlicing(set); + + testIntersectionTriesByMixed(message + ' ' + toRemove, expected, + next, + Arrays.stream(sets) + .filter(x -> x != set) + .toArray(TrieSet[]::new) + ); + } + } + } + + public void testIntersectionTriesByRangeApplyTo(String message, List expected, Trie trie, TrieSet[] sets) + { + // Test that intersecting the given trie with the given sets, in any order, results in the expected list. + // Checks both forward and reverse iteration direction. + if (sets.length == 0) + { + checkEqual(message + " b" + bits, expected, trie); + } + else + { + for (int toRemove = 0; toRemove < sets.length; ++toRemove) + { + TrieSet set = sets[toRemove]; + testIntersectionTriesByRangeApplyTo(message + ' ' + toRemove, expected, + applySet(set, trie), + Arrays.stream(sets) + .filter(x -> x != set) + .toArray(TrieSet[]::new) + ); + } + } + } + + private Trie applySet(TrieSet set, Trie trie) + { + // Convert the set to a range trie of its negation to apply deletion to anything that is not in the set. 
+ // Do this by reinterpreting the cursor and avoiding verification + // (instead of e.g. RangeTrie.fromSet(set, TrieSetCursor.RangeState.END_START_PREFIX)), + // because some of the sets we use here are open and thus technically not valid range tries. + RangeTrie setAsRangeTrie = new RangeTrie<>() + { + @Override + public RangeCursor makeCursor(Direction direction) + { + throw new AssertionError(); + } + + @Override + public RangeCursor cursor(Direction direction) + { + // We are overriding cursor to disable debug verification (the source cursor is already checked by + // TrieSet.cursor()). + // We also want to negate the cursor in order to delete anything that is not contained in the set. + return set.cursor(direction).negated(); + } + }; + return setAsRangeTrie.applyTo(trie, (range, value) -> range.applicableAfter ? null : value); + } + + private static InMemoryTrie duplicateTrie(Trie trie) + { + try + { + InMemoryTrie dupe = InMemoryTrie.shortLivedOrdered(VERSION); + dupe.apply(trie, (x, y) -> y, Predicates.alwaysFalse()); + return dupe; + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + + public void testIntersectionInMemoryTrieDelete(String message, List expected, Trie trie, TrieSet[] sets) + { + // Test that intersecting the given trie with the given sets, in any order, results in the expected list. + // Checks both forward and reverse iteration direction. 
+ if (sets.length == 0) + { + checkEqual(message + " b" + bits, expected, trie); + } + else + { + try + { + for (int toRemove = 0; toRemove < sets.length; ++toRemove) + { + TrieSet set = sets[toRemove]; + InMemoryTrie ix = duplicateTrie(trie); + ix.delete(set.negation()); + testIntersectionInMemoryTrieDelete(message + ' ' + toRemove, expected, + ix, + Arrays.stream(sets) + .filter(x -> x != set) + .toArray(TrieSet[]::new) + ); + } + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + } + + + @Test + public void testReturnsContentOnPrefix() throws TrieSpaceExhaustedException + { + TrieSet set = TrieSet.branch(VERSION, TrieUtil.directComparable("abc")); + Trie trie = TrieUtil.directTrie("a", "aa", "ab", "abc", "cd"); + Trie expected = TrieUtil.directTrie("a", "ab", "abc"); + assertMapEquals(expected.entrySet(Direction.FORWARD), trie.intersect(set).entrySet(Direction.FORWARD), TrieUtil.FORWARD_COMPARATOR); + assertMapEquals(expected.entrySet(Direction.REVERSE), trie.intersect(set).entrySet(Direction.REVERSE), TrieUtil.REVERSE_COMPARATOR); + assertEquals(expected.process(Direction.FORWARD, new TrieDumper.Plain<>(Object::toString)), trie.intersect(set).dump()); + } + + @Test + public void testReturnsBranchContents() throws TrieSpaceExhaustedException + { + TrieSet set = TrieSet.branch(VERSION, TrieUtil.directComparable("abc")); + Trie trie = TrieUtil.directTrie("aaa", "abc", "abce", "abcfff", "bcd"); + Trie expected = TrieUtil.directTrie("abc", "abce", "abcfff"); + assertMapEquals(expected.entrySet(Direction.FORWARD), trie.intersect(set).entrySet(Direction.FORWARD), TrieUtil.FORWARD_COMPARATOR); + assertMapEquals(expected.entrySet(Direction.REVERSE), trie.intersect(set).entrySet(Direction.REVERSE), TrieUtil.REVERSE_COMPARATOR); + assertEquals(expected.process(Direction.FORWARD, new TrieDumper.Plain<>(Object::toString)), trie.intersect(set).dump()); + } + + @Test + public void testRangeUnderCoveredBranch() + { + TrieSet set1 = 
TrieSet.branch(VERSION, TrieUtil.directComparable("b")); + TrieSet set2 = TrieUtil.directRanges("aa", "ab", "bc", "bd", "ce", "cf"); + TrieSet expected = TrieUtil.directRanges("bc", "bd"); + assertEquals(expected.dump(), set1.intersection(set2).dump()); + } + + @Test + public void testRangeUnderCoveredRoot() + { + TrieSet set1 = TrieSet.branch(VERSION, ByteComparable.EMPTY); + TrieSet set2 = TrieUtil.directRanges("aa", "ab", "bc", "bd", "ce", "cf"); + TrieSet expected = set2; + assertEquals(expected.dump(), set1.intersection(set2).dump()); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/LivePoint.java b/test/unit/org/apache/cassandra/db/tries/LivePoint.java new file mode 100644 index 000000000000..83e2f70b17e8 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/LivePoint.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.util.Collection; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +public class LivePoint implements DataPoint +{ + final ByteComparable position; + final int timestamp; + + public LivePoint(ByteComparable position, int timestamp) + { + this.position = position; + this.timestamp = timestamp; + } + + public LivePoint delete(int timestamp) + { + return this.timestamp < timestamp ? null : this; + } + + @Override + public DeletionMarker marker() + { + return null; + } + + @Override + public LivePoint live() + { + return this; + } + + @Override + public ByteComparable position() + { + return position; + } + + @Override + public DataPoint withMarker(DeletionMarker newMarker) + { + if (newMarker == null) + return this; + else + return new CombinedDataPoint(this, newMarker); + } + + @Override + public LivePoint remap(ByteComparable newKey) + { + return new LivePoint(newKey, timestamp); + } + + @Override + public DataPoint toContent() + { + return this; + } + + @Override + public String toString() + { + return '{' + DataPoint.toString(position) + "}L" + timestamp; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + LivePoint livePoint = (LivePoint) o; + return timestamp == livePoint.timestamp + && ByteComparable.compare(this.position, livePoint.position, TrieUtil.VERSION) == 0; + } + + @Override + public int hashCode() + { + // Hash only the timestamp: equals() compares positions through ByteComparable.compare, + // so equal points always share a timestamp and therefore always share this hash. + return Integer.hashCode(timestamp); + } + + static LivePoint combine(LivePoint a, LivePoint b) + { + return a.timestamp >= b.timestamp ? 
a : b; + } + + static LivePoint combineCollection(Collection values) + { + return values.stream().reduce(LivePoint::combine).orElseThrow(); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/MapValuesTest.java b/test/unit/org/apache/cassandra/db/tries/MapValuesTest.java new file mode 100644 index 000000000000..6e2d32bc8861 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/MapValuesTest.java @@ -0,0 +1,438 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.nio.ByteBuffer; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Random; +import java.util.Set; +import java.util.TreeMap; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.stream.Collectors; + +import com.google.common.base.Predicates; +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; +import com.google.common.collect.Streams; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.quicktheories.core.Gen; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.quicktheories.QuickTheory.qt; +import static org.quicktheories.generators.SourceDSL.integers; +import static org.quicktheories.generators.SourceDSL.lists; + +/// Property-based tests for [Trie#mapValues] using QuickTheories framework. +/// +/// Tests that the `mapValues` operation correctly transforms all content in a trie +/// through a mapping function while preserving the trie structure and key ordering. +/// Also tests various types of filtered iteration. +public class MapValuesTest +{ + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + static final int MAX_KEYS = 100; + static final int MAX_VALUE = 1000; + static final ByteComparable.Version VERSION = TrieUtil.VERSION; + + /** + * Generator for random ByteBuffer values. + */ + static Gen byteBufferGen() + { + return integers().between(0, MAX_VALUE) + .map(i -> { + ByteBuffer buf = ByteBuffer.allocate(4); + buf.putInt(i); + buf.flip(); + return buf; + }); + } + + /** + * Generator for lists of key-value pairs. 
+ */ + static Gen> keyValueListGen() + { + return lists().of(integers().between(0, MAX_VALUE) + .zip(byteBufferGen(), KeyValue::new)) + .ofSizeBetween(0, MAX_KEYS); + } + + /** + * Helper to create and populate a trie with integer values from key-value pairs. + */ + static TrieWithContent createIntegerTrie(List keyValues) + { + InMemoryTrie trie = InMemoryTrie.shortLivedOrdered(VERSION); + NavigableMap content = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); + + try + { + for (KeyValue kv : keyValues) + { + Integer value = kv.integer; + trie.putSingleton(kv.key, value, (existing, update) -> update, true); + content.put(kv.key, value); + } + } + catch (TrieSpaceExhaustedException e) + { + throw new RuntimeException(e); + } + + return new TrieWithContent<>(trie, content); + } + + /** + * Helper to create and populate a trie with ByteBuffer values from key-value pairs. + */ + static TrieWithContent createByteBufferTrie(List keyValues) + { + InMemoryTrie trie = InMemoryTrie.shortLivedOrdered(VERSION); + NavigableMap content = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); + + try + { + for (KeyValue kv : keyValues) + { + ByteBuffer value = kv.value; + trie.putSingleton(kv.key, value, (existing, update) -> update, true); + content.put(kv.key, value); + } + } + catch (TrieSpaceExhaustedException e) + { + throw new RuntimeException(e); + } + + return new TrieWithContent<>(trie, content); + } + + /** + * Verifies that all values in the mapped trie match the expected transformed values. 
+ */ + static void verifyMappedValues(Trie mappedTrie, + NavigableMap sourceContent, + Function mapper) + { + for (Map.Entry entry : sourceContent.entrySet()) + { + T originalValue = entry.getValue(); + V expectedMappedValue = mapper.apply(originalValue); + + V actualMappedValue = mappedTrie.get(entry.getKey()); + assertEquals("Mapped value should match expected", expectedMappedValue, actualMappedValue); + + Trie tail = mappedTrie.tailTrie(entry.getKey()); + assertNotNull("Tail trie should exist for key", tail); + + actualMappedValue = tail.get(ByteComparable.EMPTY); + assertEquals("Mapped value in tail should match expected", expectedMappedValue, actualMappedValue); + + if (expectedMappedValue != null) + { + actualMappedValue = Iterables.getFirst(tail.values(), null); + assertEquals("First value in tail should match expected", expectedMappedValue, actualMappedValue); + + actualMappedValue = Iterables.getLast(tail.values(Direction.REVERSE), null); + assertEquals("Last value in tail should match expected", expectedMappedValue, actualMappedValue); + } // otherwise iteration will skip over the null + } + + assertEquals(sourceContent.values() + .stream() + .map(mapper) + .filter(Predicates.notNull()) + .collect(Collectors.toList()), + Streams.stream(mappedTrie.values()).collect(Collectors.toList())); + assertEquals(sourceContent.descendingMap() + .values() + .stream() + .map(mapper) + .filter(Predicates.notNull()) + .collect(Collectors.toList()), + Streams.stream(mappedTrie.values(Direction.REVERSE)).collect(Collectors.toList())); + + testTailTries(mappedTrie, null, sourceContent, mapper); + } + + static void testTailTries(Trie mappedTrie, + Class clazz, NavigableMap sourceContent, + Function mapper) + { + // test tailTries. We must account for the fact that when a prefix is reported the code will skip over all + // descendants. 
+ Set keys = new HashSet<>(); + String prev = null; + for (var en : sourceContent.entrySet()) + { + if (mapper.apply(en.getValue()) == null) // skip over null mappings + continue; + + String string = en.getKey().toString(); + if (prev != null && string.startsWith(prev)) // skip descendants + continue; + keys.add(string); + prev = string; + } + TreeMap tailFilteredMap = new TreeMap<>(sourceContent); + for (var it = tailFilteredMap.keySet().iterator(); it.hasNext();) + { + var v = it.next(); + if (!keys.contains(v.toString())) + it.remove(); + } + + + assertEquals(tailFilteredMap.values() + .stream() + .map(mapper) + .filter(Predicates.notNull()) + .collect(Collectors.toList()), + Streams.stream(mappedTrie.tailTries(Direction.FORWARD, clazz != null ? clazz::isInstance : Predicates.alwaysTrue())) + .map(t -> t.getValue().get(ByteComparable.EMPTY)) + .collect(Collectors.toList())); + // We can't do reverse iteration because the content in an ordered trie is presented on the return path where + // we can't take a tail trie. + } + + static class NullFilteredEntries extends TrieEntriesIterator.WithNullFiltering + { + private final Function mapper; + + NullFilteredEntries(Function mapper, BaseTrie trie, Direction direction) + { + super(trie, direction); + this.mapper = mapper; + } + + @Override + protected V mapContent(T content, byte[] bytes, int byteLength) + { + return mapper.apply(content); + } + } + + /** + * Helper class to hold a trie and its expected content together. 
+ */ + static class TrieWithContent + { + final InMemoryTrie trie; + final NavigableMap content; + + TrieWithContent(InMemoryTrie trie, NavigableMap content) + { + this.trie = trie; + this.content = content; + } + } + + @Test + public void testMapValuesInteger() + { + qt().forAll(keyValueListGen()) + .checkAssert(keyValues -> { + if (keyValues.isEmpty()) + return; // Skip empty case + + TrieWithContent trieWithContent = createIntegerTrie(keyValues); + + // Apply mapping function: multiply by 2 + Function mapper = x -> x * 2; + Trie mappedTrie = trieWithContent.trie.mapValues(mapper); + + // Verify all values are correctly mapped + verifyMappedValues(mappedTrie, trieWithContent.content, mapper); + + testNullFilteredEntries(trieWithContent, mapper); + }); + } + + static void testNullFilteredEntries(TrieWithContent trieWithContent, Function mapper) + { + assertEquals(trieWithContent.content.values() + .stream() + .map(mapper) + .filter(Predicates.notNull()) + .collect(Collectors.toList()), + Streams.stream(new NullFilteredEntries<>(mapper, trieWithContent.trie, Direction.FORWARD)) + .collect(Collectors.toList())); + assertEquals(trieWithContent.content.descendingMap() + .values() + .stream() + .map(mapper) + .filter(Predicates.notNull()) + .collect(Collectors.toList()), + Streams.stream(new NullFilteredEntries<>(mapper, trieWithContent.trie, Direction.REVERSE)) + .collect(Collectors.toList())); + } + + @Test + public void testMapValuesByteBuffer() + { + qt().forAll(keyValueListGen()) + .checkAssert(keyValues -> { + if (keyValues.isEmpty()) + return; // Skip empty case + + TrieWithContent trieWithContent = createByteBufferTrie(keyValues); + + // Apply mapping function: extract integer and add 100 + Function mapper = buf -> { + ByteBuffer duplicate = buf.duplicate(); + return duplicate.getInt() + 100; + }; + Trie mappedTrie = trieWithContent.trie.mapValues(mapper); + + // Verify all values are correctly mapped + verifyMappedValues(mappedTrie, trieWithContent.content, 
mapper); + + testNullFilteredEntries(trieWithContent, mapper); + }); + } + + @Test + public void testMapValuesStrings() + { + qt().forAll(keyValueListGen()) + .checkAssert(keyValues -> { + if (keyValues.isEmpty()) + return; // Skip empty case + + TrieWithContent trieWithContent = createIntegerTrie(keyValues); + + // Apply mapping + Function mapper = x -> "value_" + x; + Trie mappedTrie = trieWithContent.trie.mapValues(mapper); + + // Verify keys with content are mapped + verifyMappedValues(mappedTrie, trieWithContent.content, mapper); + + testNullFilteredEntries(trieWithContent, mapper); + }); + } + + /** + * Test that chained mapValues operations work correctly. + */ + @Test + public void testChainedMapValues() + { + qt().forAll(keyValueListGen()) + .checkAssert(keyValues -> { + if (keyValues.isEmpty()) + return; // Skip empty case + + TrieWithContent trieWithContent = createIntegerTrie(keyValues); + + // Chain multiple mappings + Function mapper1 = x -> x * 2; + Function mapper2 = x -> "value_" + x; + Function mapper3 = s -> s.length(); + + Trie mappedTrie = trieWithContent.trie.mapValues(mapper1) + .mapValues(mapper2) + .mapValues(mapper3); + + // Compute combined mapper for verification + Function combinedMapper = x -> mapper3.apply(mapper2.apply(mapper1.apply(x))); + + // Verify chained mapping + verifyMappedValues(mappedTrie, trieWithContent.content, combinedMapper); + + testNullFilteredEntries(trieWithContent, combinedMapper); + }); + } + + + @Test + public void testMappingMergeWith() + { + qt().forAll(keyValueListGen(), keyValueListGen()) + .checkAssert((keyValues1, keyValues2) -> { + TrieWithContent trieWithContent1 = createIntegerTrie(keyValues1); + TrieWithContent trieWithContent2 = createByteBufferTrie(keyValues2); + + BiFunction mapper = (x, y) -> + x != null ? y != null ? "" + (x * 2) + ":" + ByteBufferUtil.bytesToHex(y) + : "" + (x * 2) + : y != null ? 
ByteBufferUtil.bytesToHex(y) + : null; + + Trie mappedTrie = trieWithContent1.trie.mappingMergeWith(trieWithContent2.trie, mapper); + var mergedContent = mergeAndMapContent(trieWithContent1.content, trieWithContent2.content, mapper); + + // Verify all values are correctly mapped + verifyMappedValues(mappedTrie, mergedContent, x -> x); + }); + } + + private static + NavigableMap mergeAndMapContent(NavigableMap c1, NavigableMap c2, BiFunction mapper) + { + NavigableMap mergedContent = new TreeMap<>(); + for (K key : Sets.union(c1.keySet(), c2.keySet())) + mergedContent.put(key, mapper.apply(c1.get(key), c2.get(key))); + return mergedContent; + } + + + /** + * Helper class to hold key-value pairs for testing. + */ + static class KeyValue + { + final ByteComparable.Preencoded key; + final int integer; + final ByteBuffer value; + + KeyValue(int intKey, ByteBuffer value) + { + this.integer = intKey; + this.key = TrieUtil.generateKeyAllowingPrefixes(new Random(intKey)); + this.value = value; + } + + @Override + public String toString() + { + return "KeyValue{" + + "key=" + key.byteComparableAsString(key.encodingVersion()) + + ", integer=" + integer + + ", value=" + ByteBufferUtil.bytesToHex(value) + + '}'; + } + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/MergeTrieTest.java b/test/unit/org/apache/cassandra/db/tries/MergeTrieTest.java index 1bd025b213c3..b8879c76eb1c 100644 --- a/test/unit/org/apache/cassandra/db/tries/MergeTrieTest.java +++ b/test/unit/org/apache/cassandra/db/tries/MergeTrieTest.java @@ -24,24 +24,32 @@ import java.util.SortedMap; import java.util.TreeMap; +import org.junit.BeforeClass; import org.junit.Test; -import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.config.CassandraRelevantProperties; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.Preencoded; import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.*; public class MergeTrieTest { + @BeforeClass + public 
static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + private static final int COUNT = 15000; Random rand = new Random(); @Test public void testDirect() { - ByteComparable[] src1 = generateKeys(rand, COUNT); - ByteComparable[] src2 = generateKeys(rand, COUNT); - SortedMap content1 = new TreeMap<>(forwardComparator); - SortedMap content2 = new TreeMap<>(forwardComparator); + Preencoded[] src1 = TrieUtil.generateKeys(rand, COUNT); + Preencoded[] src2 = TrieUtil.generateKeys(rand, COUNT); + SortedMap content1 = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); + SortedMap content2 = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); InMemoryTrie trie1 = makeInMemoryTrie(src1, content1, true); InMemoryTrie trie2 = makeInMemoryTrie(src2, content2, true); @@ -55,45 +63,45 @@ public void testDirect() @Test public void testWithDuplicates() { - ByteComparable[] src1 = generateKeys(rand, COUNT); - ByteComparable[] src2 = generateKeys(rand, COUNT); - SortedMap content1 = new TreeMap<>(forwardComparator); - SortedMap content2 = new TreeMap<>(forwardComparator); + Preencoded[] src1 = TrieUtil.generateKeys(rand, COUNT); + Preencoded[] src2 = TrieUtil.generateKeys(rand, COUNT); + SortedMap content1 = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); + SortedMap content2 = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); InMemoryTrie trie1 = makeInMemoryTrie(src1, content1, true); InMemoryTrie trie2 = makeInMemoryTrie(src2, content2, true); - addToInMemoryTrie(generateKeys(new Random(5), COUNT), content1, trie1, true); - addToInMemoryTrie(generateKeys(new Random(5), COUNT), content2, trie2, true); + addToInMemoryTrie(TrieUtil.generateKeys(new Random(5), COUNT), content1, trie1, true); + addToInMemoryTrie(TrieUtil.generateKeys(new Random(5), COUNT), content2, trie2, true); content1.putAll(content2); Trie union = trie1.mergeWith(trie2, (x, y) -> y); - assertSameContent(union, content1); + TrieUtil.assertSameContent(union, content1); } @Test public void 
testDistinct() { - ByteComparable[] src1 = generateKeys(rand, COUNT); - SortedMap content1 = new TreeMap<>(forwardComparator); + Preencoded[] src1 = TrieUtil.generateKeys(rand, COUNT); + SortedMap content1 = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); InMemoryTrie trie1 = makeInMemoryTrie(src1, content1, true); - ByteComparable[] src2 = generateKeys(rand, COUNT); + Preencoded[] src2 = TrieUtil.generateKeys(rand, COUNT); src2 = removeDuplicates(src2, content1); - SortedMap content2 = new TreeMap<>(forwardComparator); + SortedMap content2 = new TreeMap<>(TrieUtil.FORWARD_COMPARATOR); InMemoryTrie trie2 = makeInMemoryTrie(src2, content2, true); content1.putAll(content2); - Trie union = new MergeTrie.Distinct<>(trie1, trie2); + Trie union = Trie.mergeDistinct(trie1, trie2); - assertSameContent(union, content1); + TrieUtil.assertSameContent(union, content1); } - static ByteComparable[] removeDuplicates(ByteComparable[] keys, SortedMap content1) + static Preencoded[] removeDuplicates(Preencoded[] keys, SortedMap content1) { return Arrays.stream(keys) .filter(key -> !content1.containsKey(key)) - .toArray(ByteComparable[]::new); + .toArray(Preencoded[]::new); } } diff --git a/test/unit/org/apache/cassandra/db/tries/PrefixTailDeletionAwareTrieTest.java b/test/unit/org/apache/cassandra/db/tries/PrefixTailDeletionAwareTrieTest.java new file mode 100644 index 000000000000..41a289174348 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/PrefixTailDeletionAwareTrieTest.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.Map; +import java.util.NavigableMap; + +import javax.annotation.Nonnull; + +import com.google.common.base.Predicates; + +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.Preencoded; + +public class PrefixTailDeletionAwareTrieTest +extends PrefixTailTestBase, + DeletionAwareTrie> +{ + @Override + InMemoryDeletionAwareTrie[] makeArray(int length) + { + return new InMemoryDeletionAwareTrie[length]; + } + + @Override + InMemoryDeletionAwareTrie makeInMemoryTrie() + { + return InMemoryDeletionAwareTrie.shortLived(VERSION); + } + + @Override + void applyPrefixed(InMemoryDeletionAwareTrie destination, ByteComparable prefix, InMemoryDeletionAwareTrie tail, InMemoryBaseTrie.UpsertTransformer upsertTransformer) throws TrieSpaceExhaustedException + { + destination.apply(tail.prefixedBy(prefix), + upsertTransformer, + (x, y) -> (TestRangeState) upsertTransformer.apply(x, y), + (x, y) -> { throw new AssertionError(); }, + (x, y) -> { throw new AssertionError(); }, + true, + Predicates.alwaysFalse()); + } + + @Override + void apply(InMemoryDeletionAwareTrie destination, DeletionAwareTrie tail, UpsertTransformerWithKeys upsertTransformer) throws TrieSpaceExhaustedException + { + class Updater implements InMemoryTrie.UpsertTransformer + { + 
InMemoryDeletionAwareTrie.Mutator mutator = + destination.mutator(this, + this::applyRangeState, + (x, y) -> { throw new AssertionError(); }, + (x, y) -> { throw new AssertionError(); }, + true, + Predicates.alwaysFalse()); + + @Override + public Object apply(Object existing, @Nonnull Object update) + { + return upsertTransformer.apply(existing, update, mutator); + } + + public TestRangeState applyRangeState(TestRangeState existing, @Nonnull TestRangeState update) + { + return (TestRangeState) upsertTransformer.apply(existing, update, mutator); + } + } + new Updater().mutator.apply(tail); + } + + @Override + DeletionAwareTrie merge(InMemoryDeletionAwareTrie[] tries, Trie.CollectionMergeResolver resolver) + { + return DeletionAwareTrie.merge(Arrays.asList(tries), + resolver, + TestRangeState::combineCollection, + (d, v) -> { throw new AssertionError(); }, + true); + } + + @Override + DeletionAwareTrie cast(InMemoryDeletionAwareTrie inMemoryTrie) + { + return inMemoryTrie; + } + + @Override + void addToInMemoryTrie(Preencoded[] src, + NavigableMap content, + InMemoryDeletionAwareTrie trie) + + { + for (Preencoded b : src) + addToInMemoryTrie(content, trie, b); + } + + @Override + void addNthToInMemoryTrie(Preencoded[] src, + NavigableMap content, + InMemoryDeletionAwareTrie trie, + int divisor, + int remainder) + + { + int i = 0; + for (Preencoded b : src) + { + if (i++ % divisor != remainder) + continue; + + addToInMemoryTrie(content, trie, b); + } + } + + private static void addToInMemoryTrie(Map content, InMemoryDeletionAwareTrie trie, Preencoded b) + { + // Note: Because we don't ensure order when calling resolve, just use a hash of the key as payload + // (so that all sources have the same value). 
+ int payload = InMemoryTrieTestBase.asString(b).hashCode() & 0x7fffffff; // must be positive for TestRangeState + ByteBuffer v = ByteBufferUtil.bytes(payload); + content.put(b, v); + if (InMemoryTrieTestBase.VERBOSE) + System.out.println("Adding " + InMemoryTrieTestBase.asString(b) + ": " + ByteBufferUtil.bytesToHex(v)); + + try + { + DeletionAwareTrie toInsert; + toInsert = (payload & 1) == 1 ? DeletionAwareTrie.deletedRange(ByteComparable.EMPTY, b, true, b, true, TrieUtil.VERSION, new TestRangeState(b, payload, payload)) : DeletionAwareTrie.singleton(b, VERSION, v); + + trie.apply(toInsert, + THROWING_UPSERT, + (x, y) -> y, + (x, y) -> { + throw new AssertionError(); + }, + (x, y) -> { + throw new AssertionError(); + }, + true, + Predicates.alwaysFalse()); + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + + if (InMemoryTrieTestBase.VERBOSE) + System.out.println(trie.dump(InMemoryTrieTestBase::string)); + } + + @Override + Trie processContent(DeletionAwareTrie trie) + { + return trie.mergedTrie((v, rs) -> + { + if (v != null) + return (v instanceof ByteBuffer) ? (ByteBuffer) v : null; + + assert rs != null; + // We only want one side of the branch deletion marker pairs. + if (rs.rightSide == -1) + return null; + + return ByteBufferUtil.bytes(rs.rightSide); + }); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/PrefixTailTestBase.java b/test/unit/org/apache/cassandra/db/tries/PrefixTailTestBase.java new file mode 100644 index 000000000000..bde2feb96e1f --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/PrefixTailTestBase.java @@ -0,0 +1,508 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Objects; +import java.util.Random; +import java.util.TreeMap; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Function; + +import com.google.common.collect.Iterables; +import com.google.common.primitives.Bytes; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.Hex; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.checkGet; +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.apache.cassandra.db.tries.TrieUtil.assertIterablesEqual; +import static org.apache.cassandra.db.tries.TrieUtil.assertMapEquals; +import static org.apache.cassandra.db.tries.TrieUtil.generateKey; +import static org.apache.cassandra.db.tries.TrieUtil.generateKeys; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.Preencoded; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +public abstract class 
PrefixTailTestBase, Q extends BaseTrie> +{ + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + private static final int COUNT_TAIL = 5000; + private static final int COUNT_HEAD = 25; + public static final Comparator BYTE_COMPARABLE_COMPARATOR = (a, b) -> ByteComparable.compare(a, b, VERSION); + Random rand = new Random(); + + static final InMemoryBaseTrie.UpsertTransformer THROWING_UPSERT = (e, u) -> { + if (e != null) throw new AssertionError(); + return u; + }; + + static final Function CONTENT_TO_STRING = x -> x instanceof ByteBuffer + ? ByteBufferUtil.bytesToHex((ByteBuffer) x) + : x.toString(); + + static class Tail + { + byte[] prefix; + NavigableMap data; + + public Tail(byte[] prefix, NavigableMap map) + { + this.prefix = prefix; + this.data = map; + } + + public String toString() + { + return "Tail{" + ByteBufferUtil.bytesToHex(ByteBuffer.wrap(prefix)) + '}'; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Tail tail = (Tail) o; + return Arrays.equals(prefix, tail.prefix) && Objects.equals(data, tail.data); + } + } + + static T getRootContent(BaseTrie trie) + { + return trie.get(ByteComparable.EMPTY); + } + + @Test + public void testPrefixTail() throws Exception + { + testPrefixTail(1, false); + } + + @Test + public void testPrefixTailMerge2InHead() throws Exception + { + testPrefixTail(2, false); + } + + @Test + public void testPrefixTailMerge2InTail() throws Exception + { + testPrefixTail(2, true); + } + + @Test + public void testPrefixTailMerge5InHead() throws Exception + { + testPrefixTail(5, false); + } + + @Test + public void testPrefixTailMerge5InTail() throws Exception + { + testPrefixTail(5, true); + } + + static Tail combineTails(Object x, Object y) + { + // Cast failure is a test problem + Tail tx = (Tail) x; + Tail ty = (Tail) y; + var map = new 
TreeMap(BYTE_COMPARABLE_COMPARATOR); + map.putAll(tx.data); + map.putAll(ty.data); + return new Tail(tx.prefix, map); + } + + public void testPrefixTail(int splits, boolean splitInTail) throws Exception + { + Preencoded[] prefixes = generateKeys(rand, COUNT_HEAD); + + NavigableMap data = new TreeMap<>(BYTE_COMPARABLE_COMPARATOR); + final Q trie = splitInTail ? prepareSplitInTailTrie(splits, prefixes, data) + : prepareSplitInHeadTrie(splits, prefixes, data); +// System.out.println(trie.dump(CONTENT_TO_STRING)); + + // Test tailTrie for known prefix + for (int i = 0; i < COUNT_HEAD; ++i) + { + Tail t = data.get(prefixes[i]); + Q tail = trie.tailTrie(prefixes[i]); + assertEquals(t, getRootContent(tail)); + checkContent(processContent(tail), t.data); + } + + // Test tail iteration for given class + for (Direction td : Direction.values()) + { + long count = 0; + for (var en : trie.tailTries(td, Tail.class)) + { + System.out.println(en.getKey().byteComparableAsString(VERSION)); + Q tail = en.getValue(); + Tail t = data.get(en.getKey()); + assertNotNull(t); + assertEquals(t, getRootContent(tail)); + checkContent(processContent(tail), t.data); + ++count; + } + assertEquals(COUNT_HEAD, count); + } + + // test a sample of tail slices + for (int i = rand.nextInt(7); i < COUNT_HEAD; i += 1 + rand.nextInt(7)) + { + Tail t = data.get(prefixes[i]); + int keyCount = t.data.size(); + int firstIndex = rand.nextInt(keyCount - 1); + int lastIndex = firstIndex + rand.nextInt(keyCount - firstIndex); + Preencoded first = rand.nextInt(5) > 0 ? Iterables.get(t.data.keySet(), firstIndex) : null; + Preencoded last = rand.nextInt(5) > 0 ? Iterables.get(t.data.keySet(), lastIndex) : null; + Preencoded prefix = prefixes[i]; + final ByteComparable leftWithPrefix = concat(prefix, first, rand.nextBoolean() ? data.lowerKey(prefix) + : null); + final ByteComparable rightWithPrefix = concat(prefix, last, rand.nextBoolean() ? 
data.higherKey(prefix) + : null); + System.out.println("Between " + (leftWithPrefix == null ? "null" : leftWithPrefix.byteComparableAsString(VERSION)) + " and " + (rightWithPrefix == null ? "null" : rightWithPrefix.byteComparableAsString(VERSION))); + Q tail = trie.subtrie(leftWithPrefix, + rightWithPrefix) + .tailTrie(prefixes[i]); + assertEquals(t, getRootContent(tail)); + checkContent(processContent(tail), subMap(t.data, first, last)); + } + + // Test processSkippingBranches variations + for (Direction td : Direction.values()) + { + final AtomicLong count = new AtomicLong(0); + trie.forEachValueSkippingBranches(td, v -> count.incrementAndGet()); + assertEquals(COUNT_HEAD, count.get()); + + count.set(0); + trie.forEachEntrySkippingBranches(td, (key, tail) -> + { + assertArrayEquals(((Tail) tail).prefix, key.asByteComparableArray(VERSION)); + count.incrementAndGet(); + }); + assertEquals(COUNT_HEAD, count.get()); + } + + // Test filteredValues and filteredEntrySet + for (Direction td : Direction.values()) + { + long count = 0; + for (Tail t : trie.filteredValues(td, Tail.class)) + ++count; + assertEquals(COUNT_HEAD, count); + + count = 0; + for (var en : trie.filteredEntrySet(td, Tail.class)) + { + assertArrayEquals(en.getValue().prefix, en.getKey().asByteComparableArray(VERSION)); + ++count; + } + assertEquals(COUNT_HEAD, count); + + count = 0; + for (var it = new TrieEntriesIterator.WithNullFiltering(trie, td) + { + @Override + protected Tail mapContent(Object content, byte[] bytes, int byteLength) + { + if (!(content instanceof Tail)) + return null; + + Tail tail = (Tail) content; + assertArrayEquals(tail.prefix, Arrays.copyOf(bytes, byteLength)); + return tail; + } + }; it.hasNext(); ) + { + assertNotNull(it.next()); + ++count; + } + assertEquals(COUNT_HEAD, count); + } + } + + private static void checkContent(Trie tail, NavigableMap data) + { + assertMapEquals(tail.entryIterator(Direction.FORWARD), + data.entrySet().iterator()); + 
assertIterablesEqual(tail.values(Direction.FORWARD), + data.values()); + // As the keys are prefix-free, reverse iteration is the inverse of forward. + assertMapEquals(tail.entryIterator(Direction.REVERSE), + data.descendingMap().entrySet().iterator()); + assertIterablesEqual(tail.values(Direction.REVERSE), + data.descendingMap().values()); + checkGet(tail, data); + } + + private static NavigableMap subMap(NavigableMap data, K left, K right) + { + // Subtries are always inclusive. + if (left == null) + return right == null ? data : data.headMap(right, true); + else + return right == null + ? data.tailMap(left, true) + : data.subMap(left, true, right, true); + } + + private static ByteComparable concat(ByteComparable a, ByteComparable b, ByteComparable ifBNull) + { + if (b == null) + return ifBNull; + return ByteComparable.preencoded(VERSION, + Bytes.concat(a.asByteComparableArray(VERSION), + b.asByteComparableArray(VERSION))); + } + + interface UpsertTransformerWithKeys + { + Object apply(Object existing, Object update, InMemoryBaseTrie.Mutator mutator); + } + + abstract T[] makeArray(int length); + abstract T makeInMemoryTrie(); + abstract void applyPrefixed(T destination, ByteComparable prefix, T tail, InMemoryBaseTrie.UpsertTransformer upsertTransformer) throws TrieSpaceExhaustedException; + abstract void apply(T destination, Q tail, UpsertTransformerWithKeys upsertTransformer) throws TrieSpaceExhaustedException; + abstract Q merge(T[] tries, Trie.CollectionMergeResolver resolver); + abstract Q cast(T inMemoryTrie); + abstract void addToInMemoryTrie(Preencoded[] src, NavigableMap content, T tail); + abstract void addNthToInMemoryTrie(Preencoded[] src, NavigableMap content, T tail, int splits, int k); + abstract Trie processContent(Q trie); + + private Q prepareSplitInTailTrie(int splits, Preencoded[] prefixes, Map data) throws TrieSpaceExhaustedException + { + T[] tries = makeArray(splits); + for (int i = 0; i < splits; ++i) + tries[i] = makeInMemoryTrie(); + 
for (int i = 0; i < COUNT_HEAD; ++i) + { + Preencoded[] src = generateKeys(rand, COUNT_TAIL); + NavigableMap allContent = new TreeMap<>(BYTE_COMPARABLE_COMPARATOR); + for (int k = 0; k < splits; ++k) + { + NavigableMap content = new TreeMap<>(BYTE_COMPARABLE_COMPARATOR); + T tail = makeInMemoryTrie(); + addNthToInMemoryTrie(src, content, tail, splits, k); + + Tail t = new Tail(prefixes[i].asByteComparableArray(VERSION), content); + allContent.putAll(content); + tail.putRecursive(ByteComparable.EMPTY, t, THROWING_UPSERT); +// System.out.println(tail.dump(CONTENT_TO_STRING)); + + applyPrefixed(tries[k], prefixes[i], tail, THROWING_UPSERT); + } + Tail t = new Tail(prefixes[i].asByteComparableArray(VERSION), allContent); + data.put(ByteComparable.preencoded(VERSION, t.prefix), t); + } + + return merge(tries, c -> c.stream().reduce(PrefixTailTestBase::combineTails).get()); + } + + + private Q prepareSplitInHeadTrie(int splits, Preencoded[] prefixes, Map data) throws TrieSpaceExhaustedException + { + T[] tries = makeArray(splits); + for (int i = 0; i < splits; ++i) + tries[i] = makeInMemoryTrie(); + int trieIndex = 0; + for (int i = 0; i < prefixes.length; ++i) + { + Preencoded[] src = generateKeys(rand, COUNT_TAIL); + + NavigableMap content = new TreeMap<>(BYTE_COMPARABLE_COMPARATOR); + T tail = makeInMemoryTrie(); + addToInMemoryTrie(src, content, tail); + + Tail t = new Tail(prefixes[i].asByteComparableArray(VERSION), content); + tail.putRecursive(ByteComparable.EMPTY, t, THROWING_UPSERT); +// System.out.println(tail.dump(CONTENT_TO_STRING)); + applyPrefixed(tries[trieIndex], prefixes[i], tail, THROWING_UPSERT); + + data.put(ByteComparable.preencoded(VERSION, t.prefix), t); + trieIndex = (trieIndex + 1) % splits; + } + + return merge(tries, Trie.throwingResolver()); + } + + // also do same prefix updates + + @Test + public void testTailMerge() throws Exception + { + ByteComparable prefix = generateKey(rand); + T trie = makeInMemoryTrie(); + NavigableMap content = new 
TreeMap<>(BYTE_COMPARABLE_COMPARATOR); + + for (int i = 0; i < COUNT_HEAD; ++i) + { + Preencoded[] src = generateKeys(rand, COUNT_TAIL); + T tail = makeInMemoryTrie(); + addToInMemoryTrie(src, content, tail); +// System.out.println(tail.dump(CONTENT_TO_STRING)); + tail.putRecursive(ByteComparable.EMPTY, 1, THROWING_UPSERT); + applyPrefixed(trie, prefix, tail, + (x, y) -> x instanceof Integer ? (Integer) x + (Integer) y : y); + } + +// System.out.println(trie.dump(CONTENT_TO_STRING)); + + Q tail = cast(trie).tailTrie(prefix); + assertEquals(COUNT_HEAD, ((Integer) getRootContent(tail)).intValue()); + checkContent(processContent(tail), content); + + + // Test tail iteration for metadata: every reported tail must itself carry the merged root content + long count = 0; + for (var en : cast(trie).tailTries(Direction.FORWARD, Integer.class)) + { + System.out.println(en.getKey().byteComparableAsString(VERSION)); + Q tt = en.getValue(); + assertNotNull(tt); + assertEquals(COUNT_HEAD, ((Integer) getRootContent(tt)).intValue()); + checkContent(processContent(tt), content); + ++count; + } + assertEquals(1, count); + } + + @Test + public void testKeyProducer() throws Exception + { + + testKeyProducer(generateKeys(rand, COUNT_HEAD)); + } + + @Test + public void testKeyProducerMarkedRoot() throws Exception + { + // Check that path construction works correctly also when the root is the starting position. 
+ testKeyProducer(new Preencoded[] { Preencoded.EMPTY.preencode(VERSION) }); + } + + private void testKeyProducer(Preencoded[] prefixes) throws TrieSpaceExhaustedException + { + NavigableMap data = new TreeMap<>(BYTE_COMPARABLE_COMPARATOR); + final Q trie = prepareSplitInHeadTrie(1, prefixes, data); +// System.out.println(trie.dump(CONTENT_TO_STRING)); + + T dest = makeInMemoryTrie(); + InclusionChecker checker = new InclusionChecker(); + apply(dest, trie, checker); + assertEquals("", checker.output.toString()); + } + + static class InclusionChecker implements UpsertTransformerWithKeys + { + Tail currentTail = null; + StringBuilder output = new StringBuilder(); + + @Override + public Object apply(Object existing, Object update, InMemoryBaseTrie.Mutator mutator) + { + if (existing != null) + output.append("Non-null existing\n"); + + byte[] tailPath = mutator.getCurrentKeyBytesToNearestAncestorSatisfying(Tail.class::isInstance); + byte[] fullPath = mutator.getCurrentKeyBytes(); + String tail = Hex.bytesToHex(tailPath); + String full = Hex.bytesToHex(fullPath); + if (!full.endsWith(tail)) + { + output.append("Tail " + tail + " is not suffix of full path " + full + "\n"); + return update; // can't continue + } + + String msg = "\n@key " + full.substring(0, full.length() - tail.length()) + ":" + tail + "\n"; + + if (update instanceof Tail) + { + // At + if (tailPath.length != fullPath.length) + output.append("Prefix not empty on tail root").append(msg); + Tail t = (Tail) update; + if (!Arrays.equals(t.prefix, fullPath)) + output.append("Tail root path expected ").append(Hex.bytesToHex(t.prefix)).append(msg); + currentTail = t; + } + else + { + if (currentTail == null) + output.append("Null currentTail").append(msg); + + if (update instanceof ByteBuffer) + { + byte[] prefix = Arrays.copyOfRange(fullPath, 0, fullPath.length - tailPath.length); + if (!Arrays.equals(currentTail.prefix, prefix)) + output.append("Prefix expected " + Hex.bytesToHex(currentTail.prefix) + msg); 
+ } + else if (update instanceof TestRangeState) + { + if (!Arrays.equals(currentTail.prefix, fullPath) || !Arrays.equals(currentTail.prefix, tailPath)) + output.append("Prefix expected ").append(Hex.bytesToHex(currentTail.prefix)).append(msg); + // on deletions the tail path is queried separately + tailPath = ((InMemoryDeletionAwareTrie.Mutator) mutator).getDeletionBranchKeyBytes(); + } + + ByteBuffer updateAsBuf = null; + + if (update instanceof TestRangeState) + { + TestRangeState rs = (TestRangeState) update; + if (rs.leftSide >= 0) + updateAsBuf = ByteBufferUtil.bytes(rs.leftSide); + else if (rs.rightSide >= 0) + updateAsBuf = ByteBufferUtil.bytes(rs.rightSide); + else + output.append("Invalid range state ").append(rs); + } + else if (update instanceof ByteBuffer) + updateAsBuf = (ByteBuffer) update; + else + output.append("Not ByteBuffer or TestRangeState ").append(update).append(msg); + + ByteBuffer expected = currentTail.data.get(ByteComparable.preencoded(VERSION, tailPath)); + if (expected == null) + output.append("Suffix not found").append(msg); + else if (!expected.equals(updateAsBuf)) + output.append("Data mismatch ").append(ByteBufferUtil.bytesToHex(updateAsBuf)).append(" expected ").append(ByteBufferUtil.bytesToHex(expected)).append(msg); + } + return update; + } + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/PrefixTailTrieTest.java b/test/unit/org/apache/cassandra/db/tries/PrefixTailTrieTest.java index 1424a8d1f530..2d8dded15cdb 100644 --- a/test/unit/org/apache/cassandra/db/tries/PrefixTailTrieTest.java +++ b/test/unit/org/apache/cassandra/db/tries/PrefixTailTrieTest.java @@ -20,419 +20,80 @@ import java.nio.ByteBuffer; import java.util.Arrays; -import java.util.Comparator; -import java.util.Map; import java.util.NavigableMap; -import java.util.Objects; -import java.util.Random; -import java.util.TreeMap; -import java.util.concurrent.atomic.AtomicLong; -import java.util.function.Function; + +import javax.annotation.Nonnull; import 
com.google.common.base.Predicates; -import com.google.common.collect.Iterables; -import com.google.common.primitives.Bytes; -import org.junit.Test; -import org.apache.cassandra.utils.ByteBufferUtil; -import org.apache.cassandra.utils.Hex; import org.apache.cassandra.utils.bytecomparable.ByteComparable; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.byteComparableVersion; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.addNthToInMemoryTrie; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.addToInMemoryTrie; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.assertIterablesEqual; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.assertMapEquals; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.checkGet; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.generateKey; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.generateKeys; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.Preencoded; -public class PrefixTailTrieTest +public class PrefixTailTrieTest extends PrefixTailTestBase, Trie> { - private static final int COUNT_TAIL = 5000; - private static final int COUNT_HEAD = 25; - public static final Comparator BYTE_COMPARABLE_COMPARATOR = (a, b) -> ByteComparable.compare(a, b, byteComparableVersion); - Random rand = new Random(); - - static - { - // Use prefix-free keys to avoid putting partitions within partitions - InMemoryTrieTestBase.prefixFree = true; - } - - static final InMemoryTrie.UpsertTransformer THROWING_UPSERT = (e, u) -> { - if (e != null) throw new AssertionError(); - return u; - }; - - static final Function CONTENT_TO_STRING = x -> x instanceof ByteBuffer - ? 
ByteBufferUtil.bytesToHex((ByteBuffer) x) - : x.toString(); - - static class Tail - { - byte[] prefix; - NavigableMap data; - - public Tail(byte[] prefix, NavigableMap map) - { - this.prefix = prefix; - this.data = map; - } - - public String toString() - { - return "Tail{" + ByteBufferUtil.bytesToHex(ByteBuffer.wrap(prefix)) + '}'; - } - - @Override - public boolean equals(Object o) - { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - Tail tail = (Tail) o; - return Arrays.equals(prefix, tail.prefix) && Objects.equals(data, tail.data); - } - } - - static T getRootContent(Trie trie) - { - return trie.get(ByteComparable.EMPTY); - } - - @Test - public void testPrefixTail() throws Exception - { - testPrefixTail(1, false); - } - - @Test - public void testPrefixTailMerge2InHead() throws Exception + @Override + InMemoryTrie[] makeArray(int length) { - testPrefixTail(2, false); + return new InMemoryTrie[length]; } - @Test - public void testPrefixTailMerge2InTail() throws Exception + @Override + InMemoryTrie makeInMemoryTrie() { - testPrefixTail(2, true); + return InMemoryTrie.shortLived(VERSION); } - @Test - public void testPrefixTailMerge5InHead() throws Exception + @Override + void applyPrefixed(InMemoryTrie destination, ByteComparable prefix, InMemoryTrie tail, InMemoryBaseTrie.UpsertTransformer upsertTransformer) throws TrieSpaceExhaustedException { - testPrefixTail(5, false); + destination.apply(tail.prefixedBy(prefix), upsertTransformer, Predicates.alwaysFalse()); } - @Test - public void testPrefixTailMerge5InTail() throws Exception + @Override + void apply(InMemoryTrie destination, Trie tail, UpsertTransformerWithKeys upsertTransformer) throws TrieSpaceExhaustedException { - testPrefixTail(5, true); - } - - static Tail combineTails(Object x, Object y) - { - // Cast failure is a test problem - Tail tx = (Tail) x; - Tail ty = (Tail) y; - var map = new TreeMap(BYTE_COMPARABLE_COMPARATOR); - map.putAll(tx.data); - 
map.putAll(ty.data); - return new Tail(tx.prefix, map); - } - - public void testPrefixTail(int splits, boolean splitInTail) throws Exception - { - ByteComparable[] prefixes = generateKeys(rand, COUNT_HEAD); - - NavigableMap data = new TreeMap<>(BYTE_COMPARABLE_COMPARATOR); - final Trie trie = splitInTail ? prepareSplitInTailTrie(splits, prefixes, data) - : prepareSplitInHeadTrie(splits, prefixes, data); -// System.out.println(trie.dump(CONTENT_TO_STRING)); - - // Test tailTrie for known prefix - for (int i = 0; i < COUNT_HEAD; ++i) + class Updater implements InMemoryTrie.UpsertTransformer { - Tail t = data.get(prefixes[i]); - Trie tail = trie.tailTrie(prefixes[i]); - assertEquals(t, getRootContent(tail)); - checkContent(tail, t.data); - } + InMemoryTrie.Mutator mutator = destination.mutator(this, Predicates.alwaysFalse()); - // Test tail iteration for given class - for (Direction td : Direction.values()) - { - long count = 0; - for (var en : trie.tailTries(td, Tail.class)) + @Override + public Object apply(Object existing, @Nonnull Object update) { - System.out.println(en.getKey().byteComparableAsString(byteComparableVersion)); - Trie tail = en.getValue(); - Tail t = data.get(en.getKey()); - assertNotNull(t); - assertEquals(t, getRootContent(tail)); - checkContent(tail, t.data); - ++count; + return upsertTransformer.apply(existing, update, mutator); } - assertEquals(COUNT_HEAD, count); - } - - // test a sample of tail slices - for (int i = rand.nextInt(7); i < COUNT_HEAD; i += 1 + rand.nextInt(7)) - { - Tail t = data.get(prefixes[i]); - int keyCount = t.data.keySet().size(); - int firstIndex = rand.nextInt(keyCount - 1); - int lastIndex = firstIndex + rand.nextInt(keyCount - firstIndex); - ByteComparable first = rand.nextInt(5) > 0 ? Iterables.get(t.data.keySet(), firstIndex) : null; - ByteComparable last = rand.nextInt(5) > 0 ? 
Iterables.get(t.data.keySet(), lastIndex) : null; - ByteComparable prefix = prefixes[i]; - final ByteComparable leftWithPrefix = concat(prefix, first, rand.nextBoolean() ? prefix - : rand.nextBoolean() - ? data.lowerKey(prefix) - : null); - final ByteComparable rightWithPrefix = concat(prefix, last, rand.nextBoolean() ? data.higherKey(prefix) - : null); - Trie tail = trie.subtrie(leftWithPrefix, - rightWithPrefix) - .tailTrie(prefixes[i]); - System.out.println("Between " + (leftWithPrefix == null ? "null" : leftWithPrefix.byteComparableAsString(byteComparableVersion)) + " and " + (rightWithPrefix == null ? "null" : rightWithPrefix.byteComparableAsString(byteComparableVersion))); - assertEquals(first == null ? t : null, getRootContent(tail)); // this behavior will change soon to report all prefixes - checkContent(tail, subMap(t.data, first, last)); - } - - // Test processSkippingBranches variations - for (Direction td : Direction.values()) - { - final AtomicLong count = new AtomicLong(0); - trie.forEachValueSkippingBranches(td, v -> count.incrementAndGet()); - assertEquals(COUNT_HEAD, count.get()); - - count.set(0); - trie.forEachEntrySkippingBranches(td, (key, tail) -> - { - assertArrayEquals(((Tail) tail).prefix, key.asByteComparableArray(byteComparableVersion)); - count.incrementAndGet(); - }); - assertEquals(COUNT_HEAD, count.get()); } + new Updater().mutator.apply(tail); } - private static void checkContent(Trie tail, NavigableMap data) + @Override + Trie merge(InMemoryTrie[] tries, Trie.CollectionMergeResolver resolver) { - - assertMapEquals(tail.filteredEntryIterator(Direction.FORWARD, ByteBuffer.class), - data.entrySet().iterator()); - assertIterablesEqual(tail.filteredValues(Direction.FORWARD, ByteBuffer.class), - data.values()); - // As the keys are prefix-free, reverse iteration is the inverse of forward. 
- assertMapEquals(tail.filteredEntryIterator(Direction.REVERSE, ByteBuffer.class), - data.descendingMap().entrySet().iterator()); - assertIterablesEqual(tail.filteredValues(Direction.REVERSE, ByteBuffer.class), - data.descendingMap().values()); - checkGet(tail, data); + return Trie.merge(Arrays.asList(tries), resolver); } - private static NavigableMap subMap(NavigableMap data, K left, K right) + @Override + Trie cast(InMemoryTrie inMemoryTrie) { - if (left == null) - return right == null ? data : data.headMap(right, false); - else - return right == null - ? data.tailMap(left, true) - : data.subMap(left, true, right, false); + return inMemoryTrie; } - private static ByteComparable concat(ByteComparable a, ByteComparable b, ByteComparable ifBNull) + @Override + void addToInMemoryTrie(Preencoded[] src, NavigableMap content, InMemoryTrie tail) { - if (b == null) - return ifBNull; - return ByteComparable.preencoded(byteComparableVersion, - Bytes.concat(a.asByteComparableArray(byteComparableVersion), - b.asByteComparableArray(byteComparableVersion))); + InMemoryTrieTestBase.addToInMemoryTrie(src, content, tail, true); } - private Trie prepareSplitInTailTrie(int splits, ByteComparable[] prefixes, Map data) throws TrieSpaceExhaustedException + @Override + void addNthToInMemoryTrie(Preencoded[] src, NavigableMap content, InMemoryTrie tail, int splits, int k) { - InMemoryTrie[] tries = new InMemoryTrie[splits]; - for (int i = 0; i < splits; ++i) - tries[i] = InMemoryTrie.shortLived(byteComparableVersion); - for (int i = 0; i < COUNT_HEAD; ++i) - { - ByteComparable[] src = generateKeys(rand, COUNT_TAIL); - NavigableMap allContent = new TreeMap<>(BYTE_COMPARABLE_COMPARATOR); - for (int k = 0; k < splits; ++k) - { - NavigableMap content = new TreeMap<>(BYTE_COMPARABLE_COMPARATOR); - InMemoryTrie tail = InMemoryTrie.shortLived(byteComparableVersion); - addNthToInMemoryTrie(src, content, tail, true, splits, k); - - Tail t = new 
Tail(prefixes[i].asByteComparableArray(byteComparableVersion), content); - allContent.putAll(content); - tail.putRecursive(ByteComparable.EMPTY, t, THROWING_UPSERT); -// System.out.println(tail.dump(CONTENT_TO_STRING)); - tries[k].apply(tail.prefixedBy(prefixes[i]), THROWING_UPSERT, Predicates.alwaysFalse()); - } - Tail t = new Tail(prefixes[i].asByteComparableArray(byteComparableVersion), allContent); - data.put(ByteComparable.preencoded(byteComparableVersion, t.prefix), t); - } - - return Trie.merge(Arrays.asList(tries), c -> c.stream().reduce(PrefixTailTrieTest::combineTails).get()); - } - - - private Trie prepareSplitInHeadTrie(int splits, ByteComparable[] prefixes, Map data) throws TrieSpaceExhaustedException - { - InMemoryTrie[] tries = new InMemoryTrie[splits]; - for (int i = 0; i < splits; ++i) - tries[i] = InMemoryTrie.shortLived(byteComparableVersion); - int trieIndex = 0; - for (int i = 0; i < prefixes.length; ++i) - { - ByteComparable[] src = generateKeys(rand, COUNT_TAIL); - - NavigableMap content = new TreeMap<>(BYTE_COMPARABLE_COMPARATOR); - InMemoryTrie tail = InMemoryTrie.shortLived(byteComparableVersion); - addToInMemoryTrie(src, content, tail, true); - - Tail t = new Tail(prefixes[i].asByteComparableArray(byteComparableVersion), content); - tail.putRecursive(ByteComparable.EMPTY, t, THROWING_UPSERT); -// System.out.println(tail.dump(CONTENT_TO_STRING)); - tries[trieIndex].apply(tail.prefixedBy(prefixes[i]), THROWING_UPSERT, Predicates.alwaysFalse()); - - data.put(ByteComparable.preencoded(byteComparableVersion, t.prefix), t); - trieIndex = (trieIndex + 1) % splits; - } - - return Trie.mergeDistinct(Arrays.asList(tries)); - } - - // also do same prefix updates - - @Test - public void testTailMerge() throws Exception - { - ByteComparable prefix = generateKey(rand); - InMemoryTrie trie = InMemoryTrie.shortLived(byteComparableVersion); - NavigableMap content = new TreeMap<>(BYTE_COMPARABLE_COMPARATOR); - - for (int i = 0; i < COUNT_HEAD; ++i) - { - 
ByteComparable[] src = generateKeys(rand, COUNT_TAIL); - InMemoryTrie tail = InMemoryTrie.shortLived(byteComparableVersion); - addToInMemoryTrie(src, content, tail, true); -// System.out.println(tail.dump(CONTENT_TO_STRING)); - tail.putRecursive(ByteComparable.EMPTY, 1, THROWING_UPSERT); - trie.apply(tail.prefixedBy(prefix), - (x, y) -> x instanceof Integer ? (Integer) x + (Integer) y : y, - Predicates.alwaysFalse()); - } - -// System.out.println(trie.dump(CONTENT_TO_STRING)); - - Trie tail = trie.tailTrie(prefix); - assertEquals(COUNT_HEAD, ((Integer) getRootContent(tail)).intValue()); - assertMapEquals(tail.filteredEntryIterator(Direction.FORWARD, ByteBuffer.class), - content.entrySet().iterator()); - assertIterablesEqual(tail.filteredValues(Direction.FORWARD, ByteBuffer.class), - content.values()); - - - // Test tail iteration for metadata - long count = 0; - for (var en : trie.tailTries(Direction.FORWARD, Integer.class)) - { - System.out.println(en.getKey().byteComparableAsString(byteComparableVersion)); - Trie tt = en.getValue(); - assertNotNull(tt); - assertEquals(COUNT_HEAD, ((Integer) getRootContent(tail)).intValue()); - assertMapEquals(tt.filteredEntryIterator(Direction.FORWARD, ByteBuffer.class), - content.entrySet().iterator()); - assertIterablesEqual(tt.filteredValues(Direction.FORWARD, ByteBuffer.class), - content.values()); - ++count; - } - assertEquals(1, count); + InMemoryTrieTestBase.addNthToInMemoryTrie(src, content, tail, true, splits, k); } - @Test - public void testKeyProducer() throws Exception + @Override + Trie processContent(Trie trie) { - - testKeyProducer(generateKeys(rand, COUNT_HEAD)); - } - - @Test - public void testKeyProducerMarkedRoot() throws Exception - { - // Check that path construction works correctly also when the root is the starting position. 
- testKeyProducer(new ByteComparable[] { ByteComparable.EMPTY }); - } - - private void testKeyProducer(ByteComparable[] prefixes) throws TrieSpaceExhaustedException - { - NavigableMap data = new TreeMap<>(BYTE_COMPARABLE_COMPARATOR); - final Trie trie = prepareSplitInHeadTrie(1, prefixes, data); -// System.out.println(trie.dump(CONTENT_TO_STRING)); - - InMemoryTrie dest = InMemoryTrie.shortLived(byteComparableVersion); - InclusionChecker checker = new InclusionChecker(); - dest.apply(trie, checker, Predicates.alwaysFalse()); - assertEquals("", checker.output.toString()); - } - - static class InclusionChecker implements InMemoryTrie.UpsertTransformerWithKeyProducer - { - Tail currentTail = null; - StringBuilder output = new StringBuilder(); - - @Override - public Object apply(Object existing, Object update, InMemoryTrie.KeyProducer keyProducer) - { - if (existing != null) - output.append("Non-null existing\n"); - - byte[] tailPath = keyProducer.getBytes(Tail.class::isInstance); - byte[] fullPath = keyProducer.getBytes(); - String tail = Hex.bytesToHex(tailPath); - String full = Hex.bytesToHex(fullPath); - if (!full.endsWith(tail)) - { - output.append("Tail " + tail + " is not suffix of full path " + full + "\n"); - return update; // can't continue - } - - String msg = "\n@key " + full.substring(0, full.length() - tail.length()) + ":" + tail + "\n"; - - if (update instanceof Tail) - { - // At - if (tailPath.length != fullPath.length) - output.append("Prefix not empty on tail root" + msg); - Tail t = (Tail) update; - if (!Arrays.equals(t.prefix, fullPath)) - output.append("Tail root path expected " + Hex.bytesToHex(t.prefix) + msg); - currentTail = t; - } - else - { - byte[] prefix = Arrays.copyOfRange(fullPath, 0, fullPath.length - tailPath.length); - if (currentTail == null) - output.append("Null currentTail" + msg); - if (!Arrays.equals(currentTail.prefix, prefix)) - output.append("Prefix expected " + Hex.bytesToHex(currentTail.prefix) + msg); - - if (!(update 
instanceof ByteBuffer)) - output.append("Not ByteBuffer " + update + msg); - ByteBuffer expected = currentTail.data.get(ByteComparable.preencoded(byteComparableVersion, tailPath)); - if (expected == null) - output.append("Suffix not found" + msg); - if (!expected.equals(update)) - output.append("Data mismatch " + ByteBufferUtil.bytesToHex((ByteBuffer) update) + " expected " + ByteBufferUtil.bytesToHex(expected) + msg); - } - return update; - } + return trie.mapValues(x -> x instanceof ByteBuffer ? (ByteBuffer) x : null); } } diff --git a/test/unit/org/apache/cassandra/db/tries/ProcessSkippingBranchesWithPredicateTest.java b/test/unit/org/apache/cassandra/db/tries/ProcessSkippingBranchesWithPredicateTest.java new file mode 100644 index 000000000000..08840eff25e9 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/ProcessSkippingBranchesWithPredicateTest.java @@ -0,0 +1,292 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.junit.Test; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * Tests for processSkippingBranches with acceptance predicate functionality added in commit d9b32443a8. + */ +public class ProcessSkippingBranchesWithPredicateTest +{ + private static class TestContent + { + final String value; + final boolean shouldAccept; + + TestContent(String value, boolean shouldAccept) + { + this.value = value; + this.shouldAccept = shouldAccept; + } + + @Override + public String toString() + { + return "TestContent{" + value + ", accept=" + shouldAccept + '}'; + } + } + + private static ByteComparable bc(String s) + { + return ByteComparable.preencoded(VERSION, s.getBytes(java.nio.charset.StandardCharsets.UTF_8)); + } + + @Test + public void testProcessSkippingBranchesWithPredicateAcceptsAll() throws Exception + { + InMemoryTrie trie = InMemoryTrie.shortLived(VERSION); + + trie.putSingleton(bc("a"), new TestContent("a", true), (x, y) -> y); + trie.putSingleton(bc("ab"), new TestContent("ab", true), (x, y) -> y); + trie.putSingleton(bc("abc"), new TestContent("abc", true), (x, y) -> y); + trie.putSingleton(bc("b"), new TestContent("b", true), (x, y) -> y); + + List collected = new ArrayList<>(); + trie.processSkippingBranches(Direction.FORWARD, + content -> content.shouldAccept, + new Trie.ValueConsumer() + { + @Override + public void content(TestContent content) + { + collected.add(content.value); + } + }); + + assertEquals(Arrays.asList("a", "b"), collected); + } + + @Test + public void testProcessSkippingBranchesWithPredicateRejectsAll() throws Exception + { + InMemoryTrie trie = InMemoryTrie.shortLived(VERSION); + + trie.putSingleton(bc("a"), new 
TestContent("a", false), (x, y) -> y); + trie.putSingleton(bc("ab"), new TestContent("ab", false), (x, y) -> y); + trie.putSingleton(bc("abc"), new TestContent("abc", false), (x, y) -> y); + trie.putSingleton(bc("b"), new TestContent("b", false), (x, y) -> y); + + List collected = new ArrayList<>(); + trie.processSkippingBranches(Direction.FORWARD, + content -> content.shouldAccept, + new Trie.ValueConsumer() + { + @Override + public void content(TestContent content) + { + collected.add(content.value); + } + }); + + assertTrue("Should collect nothing when all rejected", collected.isEmpty()); + } + + @Test + public void testProcessSkippingBranchesWithPredicateSelectiveAcceptance() throws Exception + { + InMemoryTrie trie = InMemoryTrie.shortLived(VERSION); + + trie.putSingleton(bc("a"), new TestContent("a", true), (x, y) -> y); + trie.putSingleton(bc("ab"), new TestContent("ab", false), (x, y) -> y); + trie.putSingleton(bc("abc"), new TestContent("abc", true), (x, y) -> y); + trie.putSingleton(bc("b"), new TestContent("b", false), (x, y) -> y); + trie.putSingleton(bc("c"), new TestContent("c", true), (x, y) -> y); + + List collected = new ArrayList<>(); + trie.processSkippingBranches(Direction.FORWARD, + content -> content.shouldAccept, + new Trie.ValueConsumer() + { + @Override + public void content(TestContent content) + { + collected.add(content.value); + } + }); + + // Should collect "a" (accepted, skips branch including "ab" and "abc"), then "c" (accepted) + // "b" is rejected, so it is not collected; it has no descendants to continue into + assertEquals(Arrays.asList("a", "c"), collected); + } + + @Test + public void testProcessSkippingBranchesWithPredicateReverseDirection() throws Exception + { + InMemoryTrie trie = InMemoryTrie.shortLived(VERSION); + + trie.putSingleton(bc("a"), new TestContent("a", true), (x, y) -> y); + trie.putSingleton(bc("ab"), new TestContent("ab", false), (x, y) -> y); + trie.putSingleton(bc("abc"), new TestContent("abc", true), (x, y) -> y); + trie.putSingleton(bc("b"), new
TestContent("b", true), (x, y) -> y); + + List collected = new ArrayList<>(); + trie.processSkippingBranches(Direction.REVERSE, + content -> content.shouldAccept, + new Trie.ValueConsumer() + { + @Override + public void content(TestContent content) + { + collected.add(content.value); + } + }); + + // In reverse: "b" (accepted, skips branch), then "a" (accepted, skips branch including "ab" and "abc") + assertEquals(Arrays.asList("b", "a"), collected); + } + + @Test + public void testProcessSkippingBranchesWithPredicateEmptyTrie() throws Exception + { + InMemoryTrie trie = InMemoryTrie.shortLived(VERSION); + + List collected = new ArrayList<>(); + trie.processSkippingBranches(Direction.FORWARD, + content -> content.shouldAccept, + new Trie.ValueConsumer() + { + @Override + public void content(TestContent content) + { + collected.add(content.value); + } + }); + + assertTrue("Empty trie should produce no results", collected.isEmpty()); + } + + @Test + public void testProcessSkippingBranchesWithPredicateRootContent() throws Exception + { + InMemoryTrie trie = InMemoryTrie.shortLived(VERSION); + + // Add root content + trie.putSingleton(bc(""), new TestContent("root", true), (x, y) -> y); + trie.putSingleton(bc("a"), new TestContent("a", false), (x, y) -> y); + trie.putSingleton(bc("b"), new TestContent("b", true), (x, y) -> y); + + List collected = new ArrayList<>(); + trie.processSkippingBranches(Direction.FORWARD, + content -> content.shouldAccept, + new Trie.ValueConsumer() + { + @Override + public void content(TestContent content) + { + collected.add(content.value); + } + }); + + // Root is accepted and should skip all branches + assertEquals(Arrays.asList("root"), collected); + } + + @Test + public void testProcessSkippingBranchesWithPredicateRootContentRejected() throws Exception + { + InMemoryTrie trie = InMemoryTrie.shortLived(VERSION); + + // Add root content that will be rejected + trie.putSingleton(bc(""), new TestContent("root", false), (x, y) -> y); + 
trie.putSingleton(bc("a"), new TestContent("a", true), (x, y) -> y); + trie.putSingleton(bc("b"), new TestContent("b", true), (x, y) -> y); + + List collected = new ArrayList<>(); + trie.processSkippingBranches(Direction.FORWARD, + content -> content.shouldAccept, + new Trie.ValueConsumer() + { + @Override + public void content(TestContent content) + { + collected.add(content.value); + } + }); + + // Root is rejected, so we continue to children + assertEquals(Arrays.asList("a", "b"), collected); + } + + @Test + public void testForEachEntrySkippingBranchesWithPredicate() throws Exception + { + InMemoryTrie trie = InMemoryTrie.shortLived(VERSION); + + trie.putSingleton(bc("a"), new TestContent("a", true), (x, y) -> y); + trie.putSingleton(bc("ab"), new TestContent("ab", false), (x, y) -> y); + trie.putSingleton(bc("abc"), new TestContent("abc", true), (x, y) -> y); + trie.putSingleton(bc("b"), new TestContent("b", true), (x, y) -> y); + + List collectedKeys = new ArrayList<>(); + List collectedValues = new ArrayList<>(); + + trie.forEachEntrySkippingBranches(Direction.FORWARD, + content -> content.shouldAccept, + (key, content) -> { + collectedKeys.add(new String(key.asByteComparableArray(VERSION), java.nio.charset.StandardCharsets.UTF_8)); + collectedValues.add(content.value); + }); + + // "a" accepted (skips branch including "ab" and "abc"), "b" accepted (skips branch) + assertEquals(Arrays.asList("a", "b"), collectedKeys); + assertEquals(Arrays.asList("a", "b"), collectedValues); + } + + @Test + public void testProcessSkippingBranchesWithPredicateComplexPaths() throws Exception + { + InMemoryTrie trie = InMemoryTrie.shortLived(VERSION); + + // Create a more complex tree structure + trie.putSingleton(bc("aa"), new TestContent("aa", true), (x, y) -> y); + trie.putSingleton(bc("aaa"), new TestContent("aaa", false), (x, y) -> y); + trie.putSingleton(bc("aaaa"), new TestContent("aaaa", true), (x, y) -> y); + trie.putSingleton(bc("aab"), new TestContent("aab", true), 
(x, y) -> y); + trie.putSingleton(bc("ab"), new TestContent("ab", false), (x, y) -> y); + trie.putSingleton(bc("aba"), new TestContent("aba", true), (x, y) -> y); + trie.putSingleton(bc("b"), new TestContent("b", true), (x, y) -> y); + + List collected = new ArrayList<>(); + trie.processSkippingBranches(Direction.FORWARD, + content -> content.shouldAccept, + new Trie.ValueConsumer() + { + @Override + public void content(TestContent content) + { + collected.add(content.value); + } + }); + + // "aa" accepted (skips entire branch including "aaa", "aaaa", "aab"), "aba" accepted, "b" accepted + // When a node is accepted, ALL its descendants are skipped + assertEquals(Arrays.asList("aa", "aba", "b"), collected); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/RangeTrieIntersectionTest.java b/test/unit/org/apache/cassandra/db/tries/RangeTrieIntersectionTest.java new file mode 100644 index 000000000000..57fc5eb358da --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/RangeTrieIntersectionTest.java @@ -0,0 +1,484 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import com.google.common.collect.Lists; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +import static java.util.Arrays.asList; +import static org.apache.cassandra.db.tries.TestRangeState.fromList; +import static org.apache.cassandra.db.tries.TestRangeState.toList; +import static org.junit.Assert.assertEquals; + +@RunWith(Parameterized.class) +public class RangeTrieIntersectionTest +{ + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + static final int bitsNeeded = 4; + + + @Parameterized.Parameters(name = "bits per transition {0}") + public static List data() + { + return IntStream.rangeClosed(1, bitsNeeded) + .boxed() + .collect(Collectors.toList()); + } + + @Parameterized.Parameter(0) + public int bits = bitsNeeded; + + /** Creates a {@link ByteComparable} for the provided value by splitting the integer in sequences of "bits" bits. 
*/ + private ByteComparable of(int value) + { + assert value >= 0 && value <= Byte.MAX_VALUE; + + byte[] splitBytes = new byte[(bitsNeeded + bits - 1) / bits]; + int pos = 0; + int mask = (1 << bits) - 1; + for (int i = bitsNeeded - bits; i > 0; i -= bits) + splitBytes[pos++] = (byte) ((value >> i) & mask); + + splitBytes[pos] = (byte) (value & mask); + return ByteComparable.preencoded(TrieUtil.VERSION, splitBytes); + } + + private TestRangeState from(int where, int value) + { + return new TestRangeState(of(where), -1, value); + } + + private TestRangeState to(int where, int value) + { + return new TestRangeState(of(where), value, -1); + } + + private TestRangeState change(int where, int from, int to) + { + return new TestRangeState(of(where), from, to); + } + + private TrieSet range(ByteComparable left, ByteComparable right) + { + return TrieSet.rangeInclusiveEnd(TrieUtil.VERSION, left, right); + } + + private TrieSet ranges(ByteComparable... bounds) + { + return TrieSet.ranges(TrieUtil.VERSION, true, true, bounds); + } + + @Test + public void testSubtrie() + { + { + RangeTrie trie = fromList(asList(from(1, 10), to(4, 10), from(6, 11), change(8, 11, 12), to(10, 12))); + + TrieUtil.dumpToOut(trie); + + assertEquals("No intersection", asList(from(1, 10), to(4, 10), from(6, 11), change(8, 11, 12), to(10, 12)), toList(trie, Direction.FORWARD)); + + testIntersection("all", + asList(from(1, 10), to(4, 10), from(6, 11), change(8, 11, 12), to(10, 12)), + trie, + range(null, null)); + testIntersection("fully covered range", + asList(from(1, 10), to(4, 10)), + trie, + range(of(0), of(5))); + testIntersection("fully covered range", + asList(from(6, 11), change(8, 11, 12), to(10, 12)), + trie, + range(of(5), of(13))); + testIntersection("matching range", + asList(from(1, 10), to(4, 10)), + trie, + range(of(1), of(4))); + testIntersection("touching", + asList(from(4, 10), to(4, 10), from(6, 11), to(6, 11)), + trie, + range(of(4), of(6))); + + testIntersection("partial left", + 
asList(from(2, 10), to(4, 10)), + trie, + range(of(2), of(5))); + testIntersection("partial left on change", + asList(from(8, 12), to(10, 12)), + trie, + range(of(8), of(12))); + testIntersection("partial left with null", + asList(from(9, 12), to(10, 12)), + trie, + range(of(9), null)); + + + testIntersection("partial right", + asList(from(6, 11), to(7, 11)), + trie, + range(of(5), of(7))); + testIntersection("partial right on change", + asList(from(6, 11), change(8, 11, 12), to(8, 12)), + trie, + range(of(5), of(8))); + testIntersection("partial right with null", + asList(from(1, 10), to(2, 10)), + trie, + range(null, of(2))); + + testIntersection("inside range", + asList(from(2, 10), to(3, 10)), + trie, + range(of(2), of(3))); + testIntersection("inside with change", + asList(from(7, 11), change(8, 11, 12), to(9, 12)), + trie, + range(of(7), of(9))); + + testIntersection("point inside", + asList(from(7, 11), to(7, 11)), + trie, + range(of(7), of(7))); + } + } + + @Test + public void testRanges() + { + { + RangeTrie trie = fromList(asList(from(1, 10), to(4, 10), from(6, 11), change(8, 11, 12), to(10, 12))); + + testIntersection("fully covered ranges", + asList(from(1, 10), to(4, 10), from(6, 11), change(8, 11, 12), to(10, 12)), + trie, + ranges(of(0), of(13))); + testIntersection("matching ranges", + asList(from(1, 10), to(4, 10), from(6, 11), change(8, 11, 12), to(10, 12)), + trie, + ranges(of(1), of(4), of(6), of(11))); + testIntersection("touching", + asList(from(1, 10), to(1, 10), from(4, 10), to(4, 10), from(6, 11), to(6, 11)), + trie, + ranges(of(0), of(1), of(4), of(6), of(12), of(15))); + testIntersection("partial left", + asList(from(2, 10), to(4, 10), from(9, 12), to(10, 12)), + trie, + ranges(of(2), of(5), of(9), null)); + + testIntersection("partial right", + asList(from(1, 10), to(2, 10), from(6, 11), to(7, 11)), + trie, + ranges(null, of(2), of(5), of(7))); + + testIntersection("inside ranges", + asList(from(2, 10), to(3, 10), from(7, 11), change(8, 
11, 12), to(9, 12)), + trie, + ranges(of(2), of(3), of(7), of(9))); + + testIntersection("jumping inside", + asList(from(1, 10), to(2, 10), from(3, 10), to(4, 10), from(6, 11), to(6, 11), from(7, 11), change(8, 11, 12), to(8, 12), from(9, 12), to(10, 12)), + trie, + ranges(of(1), of(2), of(3), of(4), of(5), of(6), of(7), of(8), of(9), of(10))); + } + } + + @Test + public void testRangeOnSubtrie() + { + { + RangeTrie trie = fromList(asList(from(1, 10), to(4, 10), from(6, 11), change(8, 11, 12), to(10, 12), from(13, 13), to(14, 13))); + + // non-overlapping + testIntersection("", asList(), trie, range(of(0), of(3)), range(of(4), of(7))); + // touching + testIntersection("", asList(from(3, 10), to(3, 10)), trie, range(of(0), of(3)), range(of(3), of(7))); + // overlapping 1 + testIntersection("", asList(from(2, 10), to(3, 10)), trie, range(of(0), of(3)), range(of(2), of(7))); + // overlapping 2 + testIntersection("", asList(from(1, 10), to(3, 10)), trie, range(of(0), of(3)), range(of(1), of(7))); + // covered + testIntersection("", asList(from(1, 10), to(3, 10)), trie, range(of(0), of(3)), range(of(0), of(7))); + // covered + testIntersection("", asList(from(3, 10), to(4, 10), from(6, 11), to(7, 11)), trie, range(of(3), of(7)), range(of(0), of(7))); + // covered 2 + testIntersection("", asList(from(1, 10), to(3, 10)), trie, range(of(1), of(3)), range(of(0), of(7))); + } + } + + @Test + public void testRangesOnRanges() + { + testIntersections(fromList(asList(from(1, 10), to(4, 10), from(6, 11), change(8, 11, 12), to(10, 12), from(13, 13), to(14, 13)))); + } + + private void testIntersections(RangeTrie trie) + { + System.out.println(trie.dump()); + testIntersection("", asList(from(1, 10), to(4, 10), from(6, 11), change(8, 11, 12), to(10, 12), from(13, 13), to(14, 13)), trie); + + TrieSet set1 = ranges(null, of(4), of(5), of(9), of(12), null); + TrieSet set2 = ranges(of(2), of(7), of(8), of(10), of(12), of(14)); + TrieSet set3 = ranges(of(1), of(2), of(3), of(4), of(5), 
of(6), of(7), of(8), of(9), of(10)); + + testIntersections(trie, set1, set2, set3); + + testSetAlgebraIntersection(trie); + } + + private void testSetAlgebraIntersection(RangeTrie trie) + { + TrieSet set1 = range(null, of(3)) + .union(range(of(2), of(4))) + .union(range(of(5), of(7))) + .union(range(of(7), of(9))) + .union(range(of(14), of(15))) + .union(range(of(12), null)); + TrieSet set2 = range(of(2), of(7)) + .union(ranges(null, of(8), of(10), null).negation()) + .union(ranges(of(8), of(10), of(12), of(14))); + TrieSet set3 = range(of(1), of(2)) + .union(range(of(3), of(4))) + .union(range(of(5), of(6))) + .union(range(of(7), of(8))) + .union(range(of(9), of(10))); +// System.out.println("Set 0:\n" + set1.dump()); +// System.out.println("Set 1:\n" + set2.dump()); +// System.out.println("Set 2:\n" + set3.dump()); + + testIntersections(trie, set1, set2, set3); + } + + private void testIntersections(RangeTrie trie, TrieSet set1, TrieSet set2, TrieSet set3) + { + // set1 = ranges(-4, 5-9, 12-); + // set2 = ranges(2-7, 8-10, 12-14); + // set3 = ranges(1-2, 3-4, 5-6, 7-8, 9-10); + testIntersection("1", asList(from(1, 10), to(4, 10), + from(6, 11), change(8, 11, 12), to(9, 12), + from(13, 13), to(14,13)), trie, set1); + + testIntersection("2", asList(from(2, 10), to(4, 10), + from(6, 11), to(7, 11), + from(8, 12), to(10, 12), + from(13, 13), to(14, 13)), trie, set2); + + testIntersection("3", asList(from(1, 10), to(2, 10), + from(3, 10), to(4, 10), + from(6, 11), to(6, 11), + from(7, 11), change(8, 11, 12), to(8, 12), + from(9, 12), to(10, 12)), trie, set3); + + testIntersection("12", asList(from(2, 10), to(4, 10), + from(6, 11), to(7, 11), + from(8, 12), to(9, 12), + from(13, 13), to(14, 13)), trie, set1, set2); + + testIntersection("13", asList(from(1, 10), to(2, 10), + from(3, 10), to(4, 10), + from(6, 11), to(6, 11), + from(7, 11), change(8, 11, 12), to(8, 12), + from(9, 12), to(9, 12)), trie, set1, set3); + + testIntersection("23", asList(from(2, 10), to(2, 10), 
+ from(3, 10), to(4, 10), + from(6, 11), to(6, 11), from(7, 11), to(7, 11), from(8, 12), to(8, 12), + from(9, 12), to(10, 12)), trie, set2, set3); + + testIntersection("123", asList(from(2, 10), to(2, 10), + from(3, 10), to(4, 10), + from(6, 11), to(6, 11), from(7, 11), to(7, 11), + from(8, 12), to(8, 12), from(9, 12), to(9, 12)), trie, set1, set2, set3); + } + + public void testIntersection(String message, List expected, RangeTrie trie, TrieSet... sets) + { + // Test that intersecting the given trie with the given sets, in any order, results in the expected list. + // Checks both forward and reverse iteration direction. + if (sets.length == 0) + { + try + { + assertEquals(message + " forward b" + bits, expected, toList(trie, Direction.FORWARD)); + assertEquals(message + " reverse b" + bits, Lists.reverse(expected), toList(trie, Direction.REVERSE)); + } + catch (AssertionError e) + { + System.out.println("\nFORWARD:\n" + trie.dump(TestRangeState::toStringNoPosition)); + System.out.println("\nREVERSE:\n" + trie.cursor(Direction.REVERSE).process(new TrieDumper.Plain<>(TestRangeState::toStringNoPosition))); + throw e; + } + } + else + { + for (int toRemove = 0; toRemove < sets.length; ++toRemove) + { + TrieSet set = sets[toRemove]; + testIntersection(message + " " + toRemove, expected, + trie.intersect(set), + Arrays.stream(sets) + .filter(x -> x != set) + .toArray(TrieSet[]::new) + ); + } + } + } + + @Test + public void testRangeMethod() + { + ByteComparable left = TrieUtil.directComparable("aa"); + ByteComparable right = TrieUtil.directComparable("bb"); + RangeTrie trie = RangeTrie.range(left, true, right, true, TrieUtil.VERSION, new TestRangeState(ByteComparable.EMPTY, 1, 1)); + RangeTrie expected = TrieUtil.directRangeTrie("aa", "bb"); + TrieUtil.verifyEqualRangeTries(trie, expected); + } + + @Test + public void testSkipToSimple() + { + String[] ranges1 = {"aaa", "ddd"}; + String[] ranges2 = {"bbb", "eee"}; + String[] ixranges = {"bbb", "ddd"}; + String[] points = 
{"___", "aaa", "bbb", "ccc", "ddd", "eee"}; + testIntersectionSkipTo(ranges1, ranges2, ixranges, points); + } + + @Test + public void testRangeUnderCoveredBranchPoint() + { + String[] ranges1 = {"ba", "bc"}; + String[] ranges2 = {"aa", "ab", "bbc", "bbd", "bbfff", "bbffg", "bde", "bdf", "ce", "cf"}; + String[] expected2 = {"bbc", "bbd", "bbfff", "bbffg"}; + String[] ranges3 = {"bbfe", "bbfg"}; + String[] expected3 = {"bbfff", "bbffg"}; + testDirectIntersections(ranges1, ranges2, expected2, ranges3, expected3); + } + + @Test + public void testRangeUnderCoveredBranchRight() + { + String[] ranges1 = {"_a", "_b", "abba", "abf", "d", "e"}; + String[] ranges2 = {"aaa", "aab", "abc", "abd", "abef", "abeg", "abehhh", "abehhi", "ce", "cf"}; + String[] expected2 = {"abc", "abd", "abef", "abeg", "abehhh", "abehhi"}; + String[] ranges3 = {"abehg", "abehi"}; + String[] expected3 = {"abehhh", "abehhi"}; + testDirectIntersections(ranges1, ranges2, expected2, ranges3, expected3); + } + + @Test + public void testRangeUnderCoveredBranchLeft() + { + String[] ranges1 = {"_a", "_b", "abb_", "abe", "d", "e"}; + String[] ranges2 = {"aaa", "aab", "abbac", "abbad", "abbafff", "abbaffg", "abc", "abd", "ce", "cf"}; + String[] expected2 = {"abbac", "abbad", "abbafff", "abbaffg", "abc", "abd"}; + String[] ranges3 = {"abbafe", "abbafg"}; + String[] expected3 = {"abbafff", "abbaffg"}; + testDirectIntersections(ranges1, ranges2, expected2, ranges3, expected3); + } + + private void testDirectIntersections(String[] ranges1, String[] ranges2, String[] expected2, String[] ranges3, String[] expected3) + { + testDirectIntersectionsRangeSet(ranges1, ranges2, expected2, ranges3, expected3); + testDirectIntersectionsRangeSet(ranges2, ranges1, expected2, ranges3, expected3); + } + + private void testDirectIntersectionsRangeSet(String[] ranges1, String[] ranges2, String[] expected2, String[] ranges3, String[] expected3) + { + RangeTrie set1 = TrieUtil.directRangeTrie(ranges1); + TrieSet set2 = 
TrieUtil.directRanges(ranges2); + RangeTrie expected = TrieUtil.directRangeTrie(expected2); + TrieUtil.verifyEqualRangeTries(set1.intersect(set2), expected); + String[] allpoints = Stream.of(ranges1, ranges2, expected2, ranges3, expected3) + .flatMap(Arrays::stream) + .distinct() + .toArray(String[]::new); + verifyIntersectionContainsCorrectness(allpoints, set1, set2); + // check skipTo in a covered branch + TrieSet set3 = TrieUtil.directRanges(ranges3); + expected = TrieUtil.directRangeTrie(expected3); + TrieUtil.verifyEqualRangeTries(set1.intersect(set2).intersect(set3), expected); + TrieUtil.verifyEqualRangeTries(set1.intersect(set3.intersection(set2)), expected); + verifyIntersectionContainsCorrectness(allpoints, set1.intersect(set2), set3); + verifyIntersectionContainsCorrectness(allpoints, set1, set3.intersection(set2)); + } + + private void testIntersectionSkipTo(String[] ranges1, String[] ranges2, String[] ixranges, String[] points) + { + testIntersectionSkipToRangeSet(ranges1, ranges2, ixranges, points); + testIntersectionSkipToRangeSet(ranges2, ranges1, ixranges, points); + } + + private void testIntersectionSkipToRangeSet(String[] ranges1, String[] ranges2, String[] ixranges, String[] points) + { + RangeTrie set1 = TrieUtil.directRangeTrie(ranges1); + TrieSet set2 = TrieUtil.directRanges(ranges2); + RangeTrie ix = TrieUtil.directRangeTrie(ixranges); + TrieUtil.verifyEqualRangeTries(set1.intersect(set2), ix); + + verifyIntersectionContainsCorrectness(points, set1, set2); + + for (int i = 1; i < 1 << points.length; i++) // at least one set bit + { + String[] ranges = new String[Integer.bitCount(i) * 2]; + int p = 0; + for (int j = 0; j < points.length; j++) + { + if ((i & (1 << j)) != 0) + { + ranges[p++] = points[j]; + ranges[p++] = points[j]; + } + } + System.out.println(Arrays.toString(ranges)); + TrieSet set3 = TrieUtil.directRanges(ranges); + RangeTrie expected = TrieUtil.directRangeTrie(Arrays.stream(ranges) + .filter(x -> 
ix.applicableRange(TrieUtil.directComparable(x)) != null) + .toArray(String[]::new)); + TrieUtil.verifyEqualRangeTries(set1.intersect(set2).intersect(set3), expected); + TrieUtil.verifyEqualRangeTries(set1.intersect(set3.intersection(set2)), expected); + } + } + + private static void verifyIntersectionContainsCorrectness(String[] points, RangeTrie trie, TrieSet set) + { + RangeTrie ix = trie.intersect(set); + for (String s : points) + { + ByteComparable bc = TrieUtil.directComparable(s); + assertEquals(s, set.strictlyContains(bc) && trie.applicableRange(bc) != null, ix.applicableRange(bc) != null); + } + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/RangeTrieMergeTest.java b/test/unit/org/apache/cassandra/db/tries/RangeTrieMergeTest.java new file mode 100644 index 000000000000..682c3bed5c0d --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/RangeTrieMergeTest.java @@ -0,0 +1,601 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import com.google.common.collect.Lists; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +import static java.util.Arrays.asList; +import static org.apache.cassandra.db.tries.TestRangeState.fromList; +import static org.apache.cassandra.db.tries.TestRangeState.toList; +import static org.apache.cassandra.db.tries.TestRangeState.verify; +import static org.junit.Assert.assertEquals; + +@RunWith(Parameterized.class) +public class RangeTrieMergeTest +{ + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + static final int bitsNeeded = 6; + + @Parameterized.Parameters(name = "bits per transition {0} open-ended {1}") + public static List data() + { + return IntStream.rangeClosed(1, bitsNeeded) + .boxed() + .flatMap(x -> Stream.of(false, true) + .map(y -> new Object[] { x, y })) + .collect(Collectors.toList()); + } + + @Parameterized.Parameter(0) + public final int bits = bitsNeeded; + + @Parameterized.Parameter(1) + public final boolean useNulls = true; + + /** Creates a {@link ByteComparable} for the provided value by splitting the integer in sequences of "bits" bits. 
*/ + private ByteComparable of(int value) + { + assert value >= 0 && value < 1<< bitsNeeded; + + byte[] splitBytes = new byte[(bitsNeeded + bits - 1) / bits]; + int pos = 0; + int mask = (1 << bits) - 1; + for (int i = bitsNeeded - bits; i > 0; i -= bits) + splitBytes[pos++] = (byte) ((value >> i) & mask); + + splitBytes[pos] = (byte) (value & mask); + return ByteComparable.preencoded(TrieUtil.VERSION, splitBytes); + } + + private TestRangeState from(int where, int value) + { + return new TestRangeState(of(where), false, -1, value); + } + + private TestRangeState fromAfter(int where, int value) + { + return new TestRangeState(of(where), true, -1, value); + } + + private TestRangeState to(int where, int value) + { + return new TestRangeState(of(where), true, value, -1); + } + + private TestRangeState toBefore(int where, int value) + { + return new TestRangeState(of(where), false, value, -1); + } + + private TestRangeState changeBefore(int where, int from, int to) + { + return new TestRangeState(of(where), false, from, to); + } + + private TestRangeState changeAfter(int where, int from, int to) + { + return new TestRangeState(of(where), true, from, to); + } + + private List deletedRanges(ByteComparable... dataPoints) + { + List data = new ArrayList<>(asList(dataPoints)); + invertDataRangeList(data); + filterOutEmptyRepetitions(data); + + List markers = new ArrayList<>(); + for (int i = 0; i < data.size(); ++i) + { + ByteComparable pos = data.get(i); + if (pos == null) + pos = i % 2 == 0 ? useNulls ? null : of(0) : useNulls ? 
null : of((1< data) + { + // invert list + if (data.get(0) != null) + data.add(0, null); + else + data.remove(0); + if (data.get(data.size() - 1) != null) + data.add(null); + else + data.remove(data.size() - 1); + } + + private static void filterOutEmptyRepetitions(List data) + { + for (int i = 0; i < data.size() - 1; ++i) + { + if (data.get(i) != null && data.get(i + 1) != null && + ByteComparable.compare(data.get(i), data.get(i + 1), TrieUtil.VERSION) == 0) + { + data.remove(i + 1); + data.remove(i); + --i; + } + } + } + + @Test + public void testSubtrie() + { + testMerge("no merge"); + + testMerge("all", + deletedRanges(null, null)); + testMerge("fully covered range", + deletedRanges(of(20), of(25))); + testMerge("fully covered range", + deletedRanges(of(25), of(33))); + testMerge("matching range", + deletedRanges(of(21), of(24))); + testMerge("touching empty", + deletedRanges(of(24), of(26))); + + testMerge("partial left", + deletedRanges(of(22), of(25))); + testMerge("partial left on change", + deletedRanges(of(28), of(32))); + testMerge("partial left with null", + deletedRanges(of(29), null)); + + + testMerge("partial right", + deletedRanges(of(25), of(27))); + testMerge("partial right on change", + deletedRanges(of(25), of(28))); + testMerge("partial right with null", + deletedRanges(null, of(22))); + + testMerge("inside range", + deletedRanges(of(22), of(23))); + testMerge("inside with change", + deletedRanges(of(27), of(29))); + + testMerge("empty range inside", + deletedRanges(of(27), of(27))); + + testMerge("point covered", + deletedRanges(of(16), of(18))); + testMerge("point at range start", + deletedRanges(of(17), of(18))); + testMerge("point at range end", + deletedRanges(of(16), of(17))); + + + testMerge("start point covered", + deletedRanges(of(32), of(35))); + testMerge("start point at range start", + deletedRanges(of(33), of(35))); + testMerge("start point at range end", + deletedRanges(of(32), of(33))); + + + testMerge("end point covered", + 
deletedRanges(of(36), of(40))); + testMerge("end point at range start", + deletedRanges(of(38), of(40))); + testMerge("end point at range end", + deletedRanges(of(36), of(38))); + } + + @Test + public void testRanges() + { + { + testMerge("fully covered ranges", + deletedRanges(of(20), of(25), of(25), of(33))); + testMerge("matching ranges", + deletedRanges(of(21), of(24), of(26), of(31))); + testMerge("touching empty", + deletedRanges(of(20), of(21), of(24), of(26), of(32), of(33), of(34), of(36))); + testMerge("partial left", + deletedRanges(of(22), of(25), of(29), null)); + + testMerge("partial right", + deletedRanges(null, of(22), of(25), of(27))); + + testMerge("inside ranges", + deletedRanges(of(22), of(23), of(27), of(29))); + + testMerge("jumping inside", + deletedRanges(of(21), of(22), of(23), of(24), of(25), of(26), of(27), of(28), of(29), of(30))); + } + } + + @Test + public void testRangeOnSubtrie() + { + { + // non-overlapping + testMerge("non-overlapping", deletedRanges(of(20), of(23)), deletedRanges(of(24), of(27))); + // touching, i.e. 
still non-overlapping + testMerge("touching", deletedRanges(of(20), of(23)), deletedRanges(of(23), of(27))); + // overlapping 1 + testMerge("overlapping1", deletedRanges(of(20), of(23)), deletedRanges(of(22), of(27))); + // overlapping 2 + testMerge("overlapping2", deletedRanges(of(20), of(23)), deletedRanges(of(21), of(27))); + // covered + testMerge("covered1", deletedRanges(of(20), of(23)), deletedRanges(of(20), of(27))); + // covered + testMerge("covered2", deletedRanges(of(23), of(27)), deletedRanges(of(20), of(27))); + // covered 2 + testMerge("covered3", deletedRanges(of(21), of(23)), deletedRanges(of(20), of(27))); + } + } + + @Test + public void testRangesOnRanges() + { + testMerges(); + } + + private List getTestRanges() + { + return asList(fromAfter(3, 15), toBefore(5, 15), + from(17, 20), to(17, 20), + from(21, 10), changeBefore(22, 10, 21), changeAfter(22, 21, 10), to(24, 10), + from(26, 11), changeBefore(28, 11, 22), changeAfter(28, 22, 12), to(30, 12), + from(33, 23), changeAfter(33, 23, 13), to(34, 13), + from(36, 14), changeBefore(38, 14, 24), to(38, 24), + fromAfter(40, 15), toBefore(43, 15)); + } + + private void testMerges() + { + testMerge("", fromList(getTestRanges()), getTestRanges()); + + List set1 = deletedRanges(null, of(24), of(25), of(29), of(32), null); + List set2 = deletedRanges(of(14), of(17), + of(22), of(27), + of(28), of(30), + of(32), of(34), + of(36), of(40)); + List set3 = deletedRanges(of(17), of(18), + of(19), of(20), + of(21), of(22), + of(23), of(24), + of(25), of(26), + of(27), of(28), + of(29), of(30), + of(31), of(32), + of(33), of(34), + of(35), of(36), + of(37), of(38)); + + testMerges(set1, set2, set3); + } + + private void testMerges(List set1, List set2, List set3) + { + // set1 = TrieSet.ranges(null, of(24), of(25), of(29), of(32), null); + // set2 = TrieSet.ranges(of(22), of(27), of(28), of(30), of(32), of(34)); + // set3 = TrieSet.ranges(of(21), of(22), of(23), of(24), of(25), of(26), of(27), of(28), of(29), 
of(30)); + // from(21, 10), to(24, 10), from(26, 11), change(28, 11, 12), to(30, 12), from(33, 13), to(34, 13) + testMerge("1", set1); + + testMerge("2", set2); + + testMerge("3", set3); + + testMerge("12", set1, set2); + + testMerge("13", set1, set3); + + testMerge("23", set2, set3); + + testMerge("123", set1, set2, set3); + } + + @SafeVarargs + public final void testMerge(String message, List... sets) + { + List testRanges = getTestRanges(); + testMerge(message, fromList(testRanges), testRanges, sets); + testCollectionMerge(message + " collection", Lists.newArrayList(fromList(testRanges)), testRanges, sets); + testMergeToInMemoryTrie(message + " inmem.apply", fromList(testRanges), testRanges, sets); + } + + + public void testMerge(String message, RangeTrie trie, List merged, List... sets) + { + System.out.println("Markers: " + merged); + verify(merged); + // Test that merging the given trie with the given sets, in any order, results in the expected list. + // Checks both forward and reverse iteration direction. 
+ if (sets.length == 0) + { + try + { + assertEquals(message + " forward b" + bits, merged, toList(trie, Direction.FORWARD)); + assertEquals(message + " reverse b" + bits, Lists.reverse(merged), toList(trie, Direction.REVERSE)); + System.out.println(message + " b" + bits + " matched."); + } + catch (AssertionError e) + { + System.out.println("\n" + trie.dump()); + throw e; + } + } + else + { + for (int toRemove = 0; toRemove < sets.length; ++toRemove) + { + List ranges = sets[toRemove]; + System.out.println("Adding: " + ranges); + testMerge(message + " " + toRemove, + trie.mergeWith(fromList(ranges), TestRangeState::combine), + mergeLists(merged, ranges), + Arrays.stream(sets) + .filter(x -> x != ranges) + .toArray(List[]::new) + ); + } + } + } + + InMemoryRangeTrie duplicateTrie(RangeTrie trie) + { + try + { + InMemoryRangeTrie dupe = InMemoryRangeTrie.shortLived(TrieUtil.VERSION); + dupe.apply(trie, this::upsertMarkers, x -> false); + return dupe; + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + + public void testMergeToInMemoryTrie(String message, InMemoryRangeTrie trie, List merged, List... sets) + { + System.out.println("Markers: " + merged); + verify(merged); + System.out.println("Trie: \n" + trie.dump()); + // Test that intersecting the given trie with the given sets, in any order, results in the expected list. + // Checks both forward and reverse iteration direction. 
+ if (sets.length == 0) + { + try + { + assertEquals(message + " forward b" + bits, merged, toList(trie, Direction.FORWARD)); + assertEquals(message + " reverse b" + bits, Lists.reverse(merged), toList(trie, Direction.REVERSE)); + System.out.println(message + " b" + bits + " matched."); + } + catch (AssertionError e) + { + System.out.println("\n" + trie.dump()); + throw e; + } + } + else + { + try + { + for (int toRemove = 0; toRemove < sets.length; ++toRemove) + { + List ranges = sets[toRemove]; + System.out.println("Adding: " + ranges); + InMemoryRangeTrie dupe = duplicateTrie(trie); + dupe.apply(fromList(ranges), this::upsertMarkers, x -> false); + testMergeToInMemoryTrie(message + " " + toRemove, + dupe, + mergeLists(merged, ranges), + Arrays.stream(sets) + .filter(x -> x != ranges) + .toArray(List[]::new) + ); + } + } + catch (TrieSpaceExhaustedException e) + { + throw new AssertionError(e); + } + } + } + + TestRangeState upsertMarkers(TestRangeState left, TestRangeState right) + { + if (left == null) + return right; + if (right == null) + return left; + return TestRangeState.combine(left, right); + } + + public void testCollectionMerge(String message, List> triesToMerge, List merged, List... sets) + { + System.out.println("Markers: " + merged); + verify(merged); + // Test that merging the given trie with the given sets, in any order, results in the expected list. + // Checks both forward and reverse iteration direction. 
+ if (sets.length == 0) + { + RangeTrie trie = RangeTrie.merge(triesToMerge, TestRangeState::combineCollection); + try + { + assertEquals(message + " forward b" + bits, merged, toList(trie, Direction.FORWARD)); + assertEquals(message + " reverse b" + bits, Lists.reverse(merged), toList(trie, Direction.REVERSE)); + System.out.println(message + " b" + bits + " matched."); + } + catch (AssertionError e) + { + System.out.println("\n" + trie.dump()); + throw e; + } + } + else + { + for (int toRemove = 0; toRemove < sets.length; ++toRemove) + { + List ranges = sets[toRemove]; + System.out.println("Adding: " + ranges); + triesToMerge.add(fromList(ranges)); + testCollectionMerge(message + " " + toRemove, + triesToMerge, + mergeLists(merged, ranges), + Arrays.stream(sets) + .filter(x -> x != ranges) + .toArray(List[]::new) + ); + triesToMerge.remove(triesToMerge.size() - 1); + } + } + } + + int delete(int deletionTime, int data) + { + if (data <= deletionTime) + return -1; + else + return data; + } + + TestRangeState delete(int deletionTime, TestRangeState marker) + { + if (deletionTime < 0) + return marker; + + int newLeft = delete(deletionTime, marker.leftSide); + int newRight = delete(deletionTime, marker.rightSide); + if (newLeft < 0 && newRight < 0 || newLeft == newRight) + return null; + if (newLeft == marker.leftSide && newRight == marker.rightSide) + return marker; + return new TestRangeState(marker.position, newLeft, newRight); + } + + List mergeLists(List left, List right) + { + int active = -1; + Iterator rightIt = right.iterator(); + TestRangeState nextRight = rightIt.hasNext() ? 
rightIt.next() : null; + List result = new ArrayList<>(); + for (TestRangeState nextLeft : left) + { + while (true) + { + int cmp; + if (nextRight == null) + cmp = -1; + else + { + if (nextLeft.position == null || nextRight.position == null) + { + if (nextLeft.position == null && nextRight.position == null) + cmp = Boolean.compare(nextLeft.appliesAfter, nextRight.appliesAfter); + else if (nextLeft.position == null) + cmp = nextLeft.appliesAfter ? 1 : -1; + else // (nextRight.position == null) + cmp = nextRight.appliesAfter ? -1 : 1; + } + else + cmp = ByteComparable.compare(nextLeft.position, nextRight.position, TrieUtil.VERSION); + if (cmp == 0) + cmp = Boolean.compare(nextLeft.appliesAfter, nextRight.appliesAfter); + } + + if (cmp < 0) + { + maybeAdd(result, nextRight != null ? delete(nextRight.leftSide, nextLeft) : nextLeft); + break; + } + + if (cmp == 0) + { + TestRangeState processed = TestRangeState.combine(nextRight, nextLeft).toContent(); + maybeAdd(result, processed); + nextRight = rightIt.hasNext() ? rightIt.next() : null; + break; + } + else + { + // Must close active if it becomes covered, and must open active if it is no longer covered. + if (active >= 0) + { + TestRangeState activeMarker = new TestRangeState(nextRight.position, active, active); + nextRight = TestRangeState.combine(activeMarker, nextRight).toContent(); + } + maybeAdd(result, nextRight); + } + + nextRight = rightIt.hasNext() ? rightIt.next() : null; + } + active = nextLeft.rightSide; + } + assert active == -1; + while (nextRight != null) + { + maybeAdd(result, delete(active, nextRight));// deletion is not needed (active == -1), do just in case + nextRight = rightIt.hasNext() ? 
rightIt.next() : null; + } + return result; + } + + static void maybeAdd(List list, T value) + { + if (value == null) + return; + list.add(value); + } + + @Test + public void testRangeUnderCoveredRange() + { + String[] ranges1 = {"ba", "bb"}; + String[] ranges2 = {"aa", "ab", "bbc", "bbd", "bbfff", "bbfff", "bce", "bcf", "ce", "cf"}; + // We don't currently handle boundaries that are prefixes of entries and we should identify this and throw an exception. + RangeTrie merge = RangeTrie.merge(List.of(TrieUtil.directRangeTrie(1, ranges1), + TrieUtil.directRangeTrie(2, ranges2)), + TestRangeState::combineCollection); + var list = toList(merge, + Direction.FORWARD); + System.out.println(list); + System.out.println(merge.dump()); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/RangesTrieSetTest.java b/test/unit/org/apache/cassandra/db/tries/RangesTrieSetTest.java new file mode 100644 index 000000000000..6be6b1dce0f1 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/RangesTrieSetTest.java @@ -0,0 +1,781 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.NavigableMap; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.function.Function; +import java.util.stream.Collectors; + +import com.google.common.base.Predicates; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +import static org.apache.cassandra.db.tries.TrieUtil.FORWARD_COMPARATOR; +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.EMPTY; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.Preencoded; +import static org.junit.Assert.assertEquals; + +@RunWith(Parameterized.class) +public class RangesTrieSetTest +{ + @Parameterized.Parameter(0) + public boolean endsInclusive = false; + + @Parameterized.Parameter(1) + public boolean negated = false; + + @Parameterized.Parameter(2) + public boolean sendThroughInMemoryTrie = false; + + @Parameterized.Parameters(name = "endInclusive {0} negated {1} through InMemoryTrie {2}") + public static List data() + { + return Arrays.asList(new Object[][] { + { true, false, false }, + { false, false, false }, + { true, true, false }, + { false, true, false }, + { true, false, true }, + { false, false, true }, + { true, true, true }, + { false, true, true } + }); + } + + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + static RangeTrie fullTrie(TrieSet s) + { + return new RangeTrie<>() + { + @Override + public RangeCursor makeCursor(Direction 
direction) + { + throw new AssertionError(); + } + + // Override cursor to disable verification which does not like the content this returns. + // The source is already verified. + @Override + public RangeCursor cursor(Direction direction) + { + return new RangeCursor<>() + { + private final TrieSetCursor cursor = s.cursor(direction); + + public TrieSetCursor.RangeState content() + { + return cursor.state(); + } + + @Override + public TrieSetCursor.RangeState state() + { + return cursor.state(); + } + + public long encodedPosition() + { + return cursor.encodedPosition(); + } + + @Override + public long advance() + { + return cursor.advance(); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + return cursor.skipTo(encodedSkipPosition); + } + + @Override + public RangeCursor tailCursor(Direction dir) + { + throw new AssertionError(); + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + return VERSION; + } + }; + } + }; + } + + static void dumpToOut(TrieSet s) + { + TrieUtil.dumpToOut(fullTrie(s)); + } + + void check(String... boundariesAsStrings) + { + if (!negated) + check(endsInclusive, boundariesAsStrings); + else + checkNegated(endsInclusive, boundariesAsStrings); + } + + TrieSet ranges(boolean endsInclusive, ByteComparable[] boundaries) + { + return dir -> RangesCursor.create(dir, VERSION, true, endsInclusive, boundaries); + } + + void check(boolean endsInclusive, String... boundariesAsStrings) + { + Preencoded[] boundaries = new Preencoded[boundariesAsStrings.length]; + for (int i = 0; i < boundariesAsStrings.length; ++i) + boundaries[i] = boundariesAsStrings[i] != null ? TrieUtil.directComparable(boundariesAsStrings[i]) : null; + + System.out.println("Boundaries: " + Arrays.stream(boundaries).map(x -> x != null ? 
x.byteComparableAsString(VERSION) : null).collect(Collectors.toList())); + if (!boundariesValid(endsInclusive, false, boundaries)) + { + System.out.println("Skipping endsInclusive " + endsInclusive + " because boundaries do not make sense for it"); + return; + } + + TrieSet set = maybeSendThroughInMemoryTrie(ranges(endsInclusive, boundaries)); + check(endsInclusive, false, sendThroughInMemoryTrie, boundaries, set); + verifyTails(endsInclusive, false, sendThroughInMemoryTrie, boundaries, set); + } + + private TrieSet maybeSendThroughInMemoryTrie(TrieSet ranges) + { + if (!sendThroughInMemoryTrie) + return ranges; + + InMemoryRangeTrie inMem = InMemoryRangeTrie.shortLived(VERSION); + try + { + inMem.apply(dir -> ranges.cursor(dir), + (x, y) -> x != null ? x.union(y) : y, + Predicates.alwaysFalse()); + } + catch (TrieSpaceExhaustedException e) + { + throw new RuntimeException(e); + } + return dir -> new TrieSetOverRangeCursor(inMem.cursor(dir)); + } + + void checkNegated(boolean endsInclusive, String... boundariesAsStrings) + { + Preencoded[] boundaries = new Preencoded[boundariesAsStrings.length]; + for (int i = 0; i < boundariesAsStrings.length; ++i) + boundaries[i] = boundariesAsStrings[i] != null ? TrieUtil.directComparable(boundariesAsStrings[i]) : null; + + Preencoded[] negatedBoundaries = getNegatedBoundaries(boundaries, Preencoded[]::new); + System.out.println("Negated boundaries: " + Arrays.stream(negatedBoundaries).map(x -> x != null ? x.byteComparableAsString(VERSION) : null).collect(Collectors.toList())); + if (!boundariesValid(false, endsInclusive, negatedBoundaries)) + { + System.out.println("Skipping negated for endsInclusive " + endsInclusive + " because boundaries do not make sense for it"); + return; + } + + // Go through in-memory first, because we want to test the negation's complexity on top of it. 
+ TrieSet set = maybeSendThroughInMemoryTrie(ranges(endsInclusive, boundaries)); + TrieSet negatedSet = set.negation(); + + System.out.println("Negated set"); + check(false, endsInclusive, sendThroughInMemoryTrie, negatedBoundaries, negatedSet); + verifyTails(false, endsInclusive, sendThroughInMemoryTrie, negatedBoundaries, negatedSet); + } + + private boolean boundariesValid(boolean endsInclusive, boolean startsExclusive, ByteComparable[] boundaries) + { + ByteComparable[] processedBoundaries = new ByteComparable[boundaries.length]; + for (int i = 0; i < processedBoundaries.length; ++i) + { + if (boundaries[i] == null) + { + assert i == 0 || i == processedBoundaries.length - 1; + continue; + } + if (i % 2 == 0) + processedBoundaries[i] = append(boundaries[i], (startsExclusive ? 255 : 0)); + else + processedBoundaries[i] = append(boundaries[i], (endsInclusive ? 255 : 0)); + } + ByteComparable prev = null; + for (ByteComparable v : processedBoundaries) + { + if (prev != null && v != null && ByteComparable.compare(prev, v, VERSION) > 0) + return false; + prev = v; + } + return true; + } + + private static ByteComparable append(ByteComparable bc, int lastByte) + { + return dir -> ByteSource.append(bc.asComparableBytes(VERSION), lastByte); + } + + private static T[] getNegatedBoundaries(T[] boundaries, Function createArray) + { + // If the first entry is not null, drop it; otherwise add a null. + int addLeft = boundaries.length == 0 || boundaries[0] != null ? +1 + : -1; + // If the last entry is not null, drop it; otherwise add a null. If the length is odd, don't adjust anything + // as the length will now become even. + int addRight = boundaries.length == 0 + ? +1 + : boundaries.length % 2 != 0 ? 0 + : boundaries[boundaries.length - 1] != null ? 
+1 + : -1; + + // Add/remove nulls on both sides of the boundaries + T[] negatedBoundaries = createArray.apply(boundaries.length + addLeft + addRight); + for (int i = Math.max(-addLeft, 0); i < boundaries.length + Math.min(addRight, 0); ++i) + negatedBoundaries[i + addLeft] = boundaries[i]; + return negatedBoundaries; + } + + private static TrieSet tailTrie(TrieSet set, ByteComparable prefix, Direction direction) + { + TrieSetCursor c = set.cursor(direction); + if (c.descendAlong(prefix.asComparableBytes(c.byteComparableVersion()))) + return c::tailCursor; + else if (c.precedingIncluded()) + return TrieSet.full(c.byteComparableVersion()); + else + return TrieSet.empty(c.byteComparableVersion()); + } + + private static boolean startsWith(ByteComparable b, ByteComparable prefix) + { + ByteSource sb = b.asComparableBytes(VERSION); + ByteSource pb = prefix.asComparableBytes(VERSION); + int next = pb.next(); + while (next != ByteSource.END_OF_STREAM) + { + if (sb.next() != next) + return false; + next = pb.next(); + } + return true; + } + + private static void verifyTails(boolean endsInclusive, boolean startsExclusive, boolean sendThroughInMemoryTrie, Preencoded[] boundaries, TrieSet set) + { + Set prefixes = new TreeSet<>(FORWARD_COMPARATOR); + for (ByteComparable b : boundaries) + { + if (b == null) + continue; + for (int i = 0; i <= ByteComparable.length(b, VERSION); ++i) + prefixes.add(ByteComparable.cut(b, i).preencode(VERSION)); + } + + for (ByteComparable prefix : prefixes) + { + List tails = null; + int prefixLength = ByteComparable.length(prefix, VERSION); + for (int i = 0; i < boundaries.length; ++i) + { + ByteComparable b = boundaries[i]; + if (b == null || !startsWith(b, prefix)) + continue; + if (tails == null) + { + tails = new ArrayList<>(); + if ((i & 1) != 0) + tails.add(null); + } + + final byte[] byteComparableArray = b.asByteComparableArray(VERSION); + tails.add(ByteComparable.preencoded(VERSION, Arrays.copyOfRange(byteComparableArray, prefixLength, 
byteComparableArray.length))); + } + + for (Direction dir : Direction.values()) + { + System.out.println("Tail for " + prefix.byteComparableAsString(VERSION) + " " + dir); + System.out.println(" tail bounds " + tails.stream().map(x -> x == null ? "null" : x.byteComparableAsString(VERSION)).collect(Collectors.toList())); + TrieSet tail = tailTrie(set, prefix, dir); + check(endsInclusive, startsExclusive, sendThroughInMemoryTrie, tails.toArray(ByteComparable[]::new), tail); + } + } + } + + static void check(boolean endsInclusive, boolean startsExclusive, boolean sendThroughInMemoryTrie, ByteComparable[] boundaries, TrieSet s) + { + dumpToOut(s); + var expectations = getExpectations(endsInclusive, startsExclusive, sendThroughInMemoryTrie, boundaries); + assertTrieEquals(expectations, s); + } + + private static void assertTrieEquals(NavigableMap expectations, TrieSet s) + { + BaseTrie trie = fullTrie(s); + TrieUtil.assertMapEquals(TrieUtil.toStringMap(trie, Direction.FORWARD), + TrieUtil.toStringMap(expectations, PointState::forwardSide)); + TrieUtil.assertMapEquals(TrieUtil.toStringMap(trie, Direction.REVERSE), + TrieUtil.toStringMap(TrieUtil.reorderBy(expectations, TrieUtil.REVERSE_COMPARATOR), PointState::reverseSide)); + } + + static class PointState + { + int firstIndex = Integer.MAX_VALUE; + int lastIndex = Integer.MIN_VALUE; + boolean firstExact = false; + boolean lastExact = false; + boolean firstIsAfter = true; + boolean lastIsAfter = false; + + void addIndex(int index, boolean exact, boolean pointIsAfter) + { + if (index < firstIndex) + { + firstIndex = index; + firstExact = exact; + firstIsAfter = !exact || pointIsAfter; + } + else if (index == firstIndex) + { + firstExact |= exact; + firstIsAfter &= !exact || pointIsAfter; + } + + if (index > lastIndex) + { + lastIndex = index; + lastExact = exact; + lastIsAfter = exact && pointIsAfter; + } + else if (index == lastIndex) + { + lastExact |= exact; + lastIsAfter |= exact && pointIsAfter; + } + } + + static 
PointState empty() + { + PointState state = new PointState(); + state.firstIndex = 0; + state.lastIndex = 1; + return state; + } + + public static Object forwardSide(PointState pointState) + { + boolean applicableBefore = (pointState.firstIndex & 1) == 1; + TrieSetCursor.RangeState b1 = null; + TrieSetCursor.RangeState b2 = null; + // choose to report b1 based on diff between first and last + if (pointState.firstExact) + { + if (pointState.firstIsAfter) + b2 = TrieSetCursor.RangeState.fromProperties(applicableBefore, !applicableBefore); + else + b1 = TrieSetCursor.RangeState.fromProperties(applicableBefore, !applicableBefore); + } + else if (pointState.lastIndex > pointState.firstIndex) + b1 = TrieSetCursor.RangeState.fromProperties(applicableBefore, applicableBefore); + + if (pointState.lastExact) + { + boolean applicableAfter = (pointState.lastIndex & 1) == 1; + if (pointState.lastIsAfter) + b2 = combineFwd(b2, TrieSetCursor.RangeState.fromProperties(applicableAfter, !applicableAfter)); + else + b1 = combineFwd(b1, TrieSetCursor.RangeState.fromProperties(applicableAfter, !applicableAfter)); + } + + if (b1 == null && b2 == null) + return TrieSetCursor.RangeState.fromProperties(applicableBefore, applicableBefore); + if (b1 != null && b2 != null) + return Pair.create(b1, b2); + if (b1 != null) + return b1; + return b2; + } + + static TrieSetCursor.RangeState combineFwd(TrieSetCursor.RangeState b1, TrieSetCursor.RangeState b2) + { + if (b1 == null) + return b2; + return TrieSetCursor.RangeState.fromProperties(b1.applicableBefore, b2.applicableAfter); + } + + public static Object reverseSide(PointState pointState) + { + boolean applicableBefore = (pointState.lastIndex & 1) != 1; + TrieSetCursor.RangeState b1 = null; + TrieSetCursor.RangeState b2 = null; + if (pointState.lastExact) + { + if (pointState.lastIsAfter) + b1 = TrieSetCursor.RangeState.fromProperties(!applicableBefore, applicableBefore); + else + b2 = 
TrieSetCursor.RangeState.fromProperties(!applicableBefore, applicableBefore); + } + else if (pointState.lastIndex > pointState.firstIndex) + b1 = TrieSetCursor.RangeState.fromProperties(applicableBefore, applicableBefore); + + if (pointState.firstExact) + { + boolean applicableAfter = (pointState.firstIndex & 1) != 1; + if (pointState.firstIsAfter) + b1 = combineRev(b1, TrieSetCursor.RangeState.fromProperties(!applicableAfter, applicableAfter)); + else + b2 = combineRev(b2, TrieSetCursor.RangeState.fromProperties(!applicableAfter, applicableAfter)); + } + + if (b1 == null && b2 == null) + return TrieSetCursor.RangeState.fromProperties(applicableBefore, applicableBefore); + if (b1 != null && b2 != null) + return Pair.create(b1, b2); + if (b1 != null) + return b1; + return b2; + } + } + + static TrieSetCursor.RangeState combineRev(TrieSetCursor.RangeState b1, TrieSetCursor.RangeState b2) + { + if (b1 == null) + return b2; + return TrieSetCursor.RangeState.fromProperties(b2.applicableBefore, b1.applicableAfter); + } + + + static NavigableMap getExpectations(boolean endsInclusive, boolean startsExclusive, boolean sendThroughInMemoryTrie, ByteComparable... boundaries) + { + boundaries = maybeDropRepetitions(endsInclusive, startsExclusive, sendThroughInMemoryTrie, boundaries); + + // Leading [null, EMPTY ...] sequence is nonsensical if endsInclusive is not true and causes us trouble. + if (!endsInclusive && + boundaries.length >= 2 && + boundaries[0] == null && + boundaries[1] != null && + ByteComparable.compare(EMPTY, boundaries[1], VERSION) == 0) + { + boundaries = Arrays.copyOfRange(boundaries, 2, boundaries.length); + } + int l = (boundaries.length + 1) & ~1; + // Same for trailing [... 
EMPTY, null] when startsExclusive is true + if (startsExclusive && + boundaries.length >= 2 && + (boundaries.length <= l - 1 || boundaries[l - 1] == null) && + boundaries[l - 2] != null && + ByteComparable.compare(EMPTY, boundaries[l - 2], VERSION) == 0) + { + l -= 2; + boundaries = Arrays.copyOfRange(boundaries, 0, l); + } + + var expectations = new TreeMap(FORWARD_COMPARATOR); + for (int bi = 0; bi < l; ++bi) + { + boolean pointIsAfter = bi % 2 == 0 ? startsExclusive : endsInclusive; + ByteComparable b = bi < boundaries.length ? boundaries[bi] : null; + if (b == null) + { + b = ByteComparable.EMPTY; + pointIsAfter = bi % 2 == 1; // always inclusive left exclusive right + } + int len = ByteComparable.length(b, VERSION); + for (int i = 0; i <= len; ++i) + { + Preencoded v = ByteComparable.cut(b, i).preencode(VERSION); + PointState state = expectations.computeIfAbsent(v, k -> new PointState()); + state.addIndex(bi, i == len, pointIsAfter); + } + } + if (expectations.isEmpty()) + expectations.put(ByteComparable.EMPTY.preencode(VERSION), PointState.empty()); + return expectations; + } + + private static ByteComparable[] maybeDropRepetitions(boolean endsInclusive, boolean startsExclusive, boolean sendThroughInMemoryTrie, ByteComparable[] boundaries) + { + if (sendThroughInMemoryTrie && endsInclusive == startsExclusive) + { + // We need to remove boundary repetitions (which have no effect) because the in-memory trie will not contain + // them at all. 
+ List reworked = null; + int i; + for (i = 0; i < boundaries.length - 1; ++i) + { + if (boundaries[i] != null && + boundaries[i + 1] != null && + ByteComparable.compare(boundaries[i], boundaries[i + 1], VERSION) == 0) + { + if (reworked == null) + reworked = new ArrayList<>(Arrays.asList(boundaries).subList(0, i)); + i += 1; + } + else + if (reworked != null) + reworked.add(boundaries[i]); + } + if (i < boundaries.length && reworked != null) + reworked.add(boundaries[i]); + if (reworked != null) + { + boundaries = reworked.toArray(new ByteComparable[0]); + } + } + return boundaries; + } + + @Test + public void testEmptyInterval() + { + check(); + } + + @Test + public void testFullInterval() + { + check(null, null); + } + + @Test + public void testOneNull() + { + check((String) null); + } + + @Test + public void testLeftNull() + { + check(null, "afg"); + } + + @Test + public void testRightNull() + { + check("abc", null); + } + + @Test + public void testSpan() + { + check("abc", "afg"); + } + + @Test + public void testPoint() + { + check("abc", "abc"); + } + + @Test + public void testDual() + { + check("abc", "afg", "aga", "ajb"); + } + + @Test + public void testHole() + { + check(null, "abc", "afg", null); + } + + // prefixes + + @Test + public void testPrefixLeft() + { + check(" a", " abc"); + } + + @Test + public void testPrefixRight() + { + check(" abc", " a"); + } + + @Test + public void testPrefixHole() + { + check(" a", " aaa", " acc", " a"); + } + + @Test + public void testPrefixLeftHole() + { + check(" a", " aaa", " acc", " d"); + } + + @Test + public void testPrefixRightHole() + { + check(" a", " daa", " dcc", " d"); + } + + + // Repeats aren't valid, because they doubly list a branch + + @Test + public void testRepeatLeft() + { + check("abc", "abc", "abc", null); + } + + @Test + public void testRepeatRight() + { + check(null, "abc", "abc", "abc"); + } + + @Test + public void testPointRepeat() + { + check("abc", "abc", "abc", "abc"); + } + + @Test + public 
void testPointInSpan() + { + check("aa", "abc", "abc", "ad"); + } + + @Test + public void testPrefixRepeatsInSpanOdd() + { + check("aaa", "abc", "abe", "aff"); + } + + @Test + public void testPrefixRepeatsInSpanEven() + { + check("abc", "abe", "aff"); + } + + @Test + public void testBothEmpty() + { + check("", ""); + } + + @Test + public void testLeftEmpty() + { + check("", null); + } + + @Test + public void testOneEmpty() + { + check(""); + } + + @Test + public void testRightEmpty() + { + check(null, ""); + } + + @Test + public void testLong() + { + check("aaa", "aab", "aba", "aca", "acb", "ada", "adba", "adba", "baa", "bba", "bbb", "bbc", "bcc", "bcd"); + } + + @Test + public void testRangeStateFromProperties() + { + for (boolean applicableBefore : List.of(false, true)) + for (boolean applicableAfter : List.of(false, true)) + { + TrieSetCursor.RangeState state = TrieSetCursor.RangeState.fromProperties(applicableBefore, applicableAfter); + assertEquals(applicableBefore, state.applicableBefore); + assertEquals(applicableAfter, state.applicableAfter); + } + } + + private static class TrieSetOverRangeCursor implements TrieSetCursor + { + final RangeCursor source; + + public TrieSetOverRangeCursor(RangeCursor src) + { + source = src; + } + + @Override + public RangeState state() + { + RangeState state = source.state(); + return state != null ? 
state + : RangeState.NOT_CONTAINED; + } + + @Override + public RangeState content() + { + return source.content(); + } + + @Override + public TrieSetCursor tailCursor(Direction direction) + { + return new TrieSetOverRangeCursor(source.tailCursor(direction)); + } + + @Override + public long encodedPosition() + { + return source.encodedPosition(); + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + return source.byteComparableVersion(); + } + + @Override + public long advance() + { + return source.advance(); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + return source.skipTo(encodedSkipPosition); + } + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/ReturnPathSeekTest.java b/test/unit/org/apache/cassandra/db/tries/ReturnPathSeekTest.java new file mode 100644 index 000000000000..b6b7a630a8b3 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/ReturnPathSeekTest.java @@ -0,0 +1,312 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; + +import com.google.common.base.Predicates; +import com.google.common.collect.Iterators; +import com.google.common.collect.Streams; +import org.junit.Assume; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.apache.cassandra.db.tries.TrieUtil.asString; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +@RunWith(Parameterized.class) +public class ReturnPathSeekTest +{ + static final String[] tests = new String[] {"", "testing", "test", "tests", "tested", "tesa", "tesz"}; + + @Parameterized.Parameter(0) + public boolean inMemorySingleton = true; + @Parameterized.Parameter(1) + public boolean useUnion = false; + @Parameterized.Parameter(2) + public boolean intersectTrie = false; + + @Parameterized.Parameters(name="inMemorySingleton={0}, useUnion={1}, intersectTrie={2}") + public static List generateData() + { + var list = new ArrayList(); + for (boolean inMemorySingleton : new boolean[] {true, false}) + for (boolean useUnion : new boolean[] {true, false}) + for (boolean intersectTrie : new boolean[] {true, false}) + list.add(new Object[] {inMemorySingleton, useUnion, intersectTrie}); + return list; + } + + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + Trie makeSingleton(ByteComparable.Preencoded e, String v) 
throws TrieSpaceExhaustedException + { + if (inMemorySingleton) + { + InMemoryTrie tt = InMemoryTrie.shortLivedOrdered(VERSION); + tt.putRecursive(e, v, (x, y) -> y); + return tt; + } + else + return Trie.singletonOrdered(e, e.encodingVersion(), v); + } + + @Test + public void testSingle() throws TrieSpaceExhaustedException + { + Assume.assumeFalse("Test can't use union", useUnion); + for (String test : tests) + { + ByteComparable.Preencoded e = TrieUtil.directComparable(test); + System.out.println("Testing singleton " + asString(e) + ": " + test); + Trie tt = makeSingleton(e, test); + if (intersectTrie) + tt = tt.slice(e, true, e, true); // ineffective but might mess it up + + testOrderedSeek(test, e, tt); + assertEquals(test, Iterators.getOnlyElement(tt.valueIterator(Direction.FORWARD))); + assertEquals(test, Iterators.getOnlyElement(tt.valueIterator(Direction.REVERSE))); + } + } + + @Test + public void testMultiple() throws TrieSpaceExhaustedException + { + Trie trie = useUnion ? buildMergeTrie() : buildInMemoryTrie(); + + System.out.println("Forward\n" + trie.dump()); + System.out.println("Reverse\n" + trie.process(Direction.REVERSE, new TrieDumper.Plain<>(Objects::toString))); + + Set expected = new HashSet<>(Arrays.asList(tests)); + assertEquals(expected, Streams.stream(trie.values(Direction.FORWARD)).collect(Collectors.toSet())); + assertEquals(expected, Streams.stream(trie.values(Direction.REVERSE)).collect(Collectors.toSet())); + + for (String test : tests) + { + ByteComparable.Preencoded e = TrieUtil.directComparable(test); + System.out.println("Testing " + asString(e) + ": " + test); + Trie tt = trie; + if (intersectTrie) + tt = tt.slice(e, true, e, true); + testOrderedSeek(test, e, tt); + } + } + + private InMemoryTrie buildInMemoryTrie() throws TrieSpaceExhaustedException + { + InMemoryTrie trie = InMemoryTrie.shortLivedOrdered(VERSION); + for (String test : tests) + { + ByteComparable.Preencoded e = TrieUtil.directComparable(test); + 
System.out.println("Adding " + asString(e) + ": " + test); + if (inMemorySingleton) + trie.putRecursive(e, test, (x, y) -> y); + else + trie.mutator((String x, String y) -> y, Predicates.alwaysFalse()).apply(makeSingleton(e, test)); + } + return trie; + } + + private Trie buildMergeTrie() throws TrieSpaceExhaustedException + { + List> tries = new ArrayList<>(); + for (String test : tests) + { + ByteComparable.Preencoded e = TrieUtil.directComparable(test); + System.out.println("Adding " + asString(e) + ": " + test); + tries.add(makeSingleton(e, test)); + } + return Trie.merge(tries, Trie.throwingResolver()); + } + + private static void testOrderedSeek(String value, ByteComparable.Preencoded key, Trie trie) + { + assertEquals(value, trie.get(key)); // FORWARD direction + testForwardSeekBeyond(value, key, trie); + testReversedOrderedGet(value, key, trie); + testReversedGetBefore(value, key, trie); + } + + private static boolean descendAlongToReturnPath(Cursor cursor, ByteSource bytes) + { + long position = cursor.encodedPosition(); + int next = bytes.next(); + if (next == ByteSource.END_OF_STREAM) + { + long nextPosition = position | Cursor.ON_RETURN_PATH_BIT; + if (Cursor.compare(position, nextPosition) == 0) + return true; + return Cursor.compare(cursor.skipTo(nextPosition), nextPosition) == 0; + } + while (next != ByteSource.END_OF_STREAM) + { + long nextPosition = Cursor.positionForDescentWithByte(position, next); + next = bytes.next(); + if (next == ByteSource.END_OF_STREAM) + nextPosition |= Cursor.ON_RETURN_PATH_BIT; + if (Cursor.compare(cursor.skipTo(nextPosition), nextPosition) != 0) + return false; + position = nextPosition; + } + return true; + } + + private static void testReversedOrderedGet(String test, ByteComparable.Preencoded e, Trie trie) + { + Cursor c = trie.cursor(Direction.REVERSE); + assertTrue(descendAlongToReturnPath(c, e.getPreencodedBytes())); + assertEquals(test, c.content()); + } + + private static void testReversedGetBefore(String test, 
ByteComparable.Preencoded e, Trie trie) + { + Cursor c = trie.cursor(Direction.REVERSE); + assertFalse(c.descendAlong(e.getPreencodedBytes()) && c.content() == test); + } + + private static void testForwardSeekBeyond(String test, ByteComparable.Preencoded e, Trie trie) + { + Cursor c = trie.cursor(Direction.FORWARD); + assertFalse(descendAlongToReturnPath(c, e.getPreencodedBytes())); + } + + @Test + public void testSeeksAfterDescent() throws TrieSpaceExhaustedException + { + Trie trie = useUnion ? buildMergeTrie() : buildInMemoryTrie(); + + for (String first : tests) + { + for (String second : tests) + { + if (first.compareTo(second) >= 0) + continue; + + Trie tt = trie; + if (intersectTrie) + tt = tt.subtrie(TrieUtil.directComparable(first), TrieUtil.directComparable(second)); + testSeekAfterDescent(first, second, tt); + } + } + } + + @Test + public void testSeeksAfterDescentOnPair() throws TrieSpaceExhaustedException + { + Assume.assumeTrue("Test can't not use union", useUnion); + for (String first : tests) + { + for (String second : tests) + { + if (first.compareTo(second) >= 0) + continue; + + ByteComparable.Preencoded firstKey = TrieUtil.directComparable(first); + ByteComparable.Preencoded secondKey = TrieUtil.directComparable(second); + + Trie tt = makeSingleton(firstKey, first).mergeWith(makeSingleton(secondKey, second), + Trie.throwingResolver()); + if (intersectTrie) + tt = tt.subtrie(firstKey, secondKey); // ineffective but might mess it up + + testSeekAfterDescent(first, second, tt); + } + } + } + + private static void testSeekAfterDescent(String first, String second, Trie trie) + { + System.out.println("Testing " + second + " to " + first + " reverse"); + ByteComparable.Preencoded firstKey = TrieUtil.directComparable(first); + ByteComparable.Preencoded secondKey = TrieUtil.directComparable(second); + + Cursor cursor = trie.cursor(Direction.REVERSE); + descendAlongToReturnPath(cursor, secondKey.getPreencodedBytes()); + 
assertTrue(advanceByDifference(secondKey, firstKey, cursor)); + assertEquals(first, cursor.content()); + + System.out.println("Testing " + first + " to " + second + " forward"); + cursor = trie.cursor(Direction.FORWARD); + descendAlongToReturnPath(cursor, firstKey.getPreencodedBytes()); + assertFalse(advanceByDifference(firstKey, secondKey, cursor)); + } + + private static boolean advanceByDifference(ByteComparable.Preencoded from, ByteComparable.Preencoded to, Cursor cursor) + { + int depth = 0; + ByteSource.Peekable keyTo = to.getPreencodedBytes(); + ByteSource.Peekable keyFrom = from.getPreencodedBytes(); + int lastToByte = 0; // root return path + while (keyTo.peek() == keyFrom.peek()) + { + ++depth; + lastToByte = keyTo.next(); + keyFrom.next(); + } + int next = keyTo.next(); + Direction direction = Cursor.direction(cursor.encodedPosition()); + if (next == ByteSource.END_OF_STREAM) + { + long skipPosition = Cursor.encode(depth, lastToByte, direction) | Cursor.ON_RETURN_PATH_BIT; + if (Cursor.compare(skipPosition, cursor.encodedPosition()) < 0) + return false; // already beyond target + long advancedPosition; + if (Cursor.compare(skipPosition, cursor.encodedPosition()) == 0) + advancedPosition = skipPosition; + else + advancedPosition = cursor.skipTo(skipPosition); + return Cursor.compare(advancedPosition, skipPosition) == 0; + } + else + { + long skipPosition = Cursor.encode(depth + 1, next, direction); + if (keyTo.peek() == ByteSource.END_OF_STREAM) + skipPosition |= Cursor.ON_RETURN_PATH_BIT; + if (Cursor.compare(skipPosition, cursor.encodedPosition()) < 0) + return false; // already beyond target + long advancedPosition; + if (Cursor.compare(skipPosition, cursor.encodedPosition()) == 0) + advancedPosition = skipPosition; + else + advancedPosition = cursor.skipTo(skipPosition); + if (Cursor.compare(advancedPosition, skipPosition) != 0) + return false; + return descendAlongToReturnPath(cursor, keyTo); + } + } +} diff --git 
a/test/unit/org/apache/cassandra/db/tries/SlicedTrieTest.java b/test/unit/org/apache/cassandra/db/tries/SlicedTrieTest.java index df811a2a41ca..071d005a797f 100644 --- a/test/unit/org/apache/cassandra/db/tries/SlicedTrieTest.java +++ b/test/unit/org/apache/cassandra/db/tries/SlicedTrieTest.java @@ -22,36 +22,54 @@ import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; -import java.util.Comparator; import java.util.List; import java.util.NavigableMap; import java.util.Random; import java.util.SortedMap; import java.util.TreeMap; +import java.util.function.Function; +import java.util.stream.Collectors; +import com.google.common.collect.Iterables; +import com.google.common.collect.Streams; import org.junit.Assert; +import org.junit.BeforeClass; import org.junit.Test; -import com.googlecode.concurrenttrees.common.Iterables; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.bytecomparable.ByteComparable; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.asString; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.assertSameContent; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.byteComparableVersion; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.generateKeys; -import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.makeInMemoryTrie; import static java.util.Arrays.asList; +import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.makeInMemoryTrie; +import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.assertSameContent; +import static org.apache.cassandra.db.tries.TrieUtil.FORWARD_COMPARATOR; +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.apache.cassandra.db.tries.TrieUtil.asString; +import static org.apache.cassandra.db.tries.TrieUtil.generateKeys; +import static 
org.apache.cassandra.db.tries.TrieUtil.singleLevelIntTrie; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.EMPTY; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.Preencoded; import static org.junit.Assert.assertEquals; public class SlicedTrieTest { - public static final ByteComparable[] BOUNDARIES = toByteComparable(new String[]{ + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + InMemoryTrieTestBase.strategy = InMemoryTrieTestBase.ReuseStrategy.SHORT_LIVED_ORDERED; + InMemoryTrieTestBase.reverseComparator = InMemoryTrieTestBase.forwardComparator.reversed(); + } + + public static final Preencoded[] BOUNDARIES = toByteComparable(new String[]{ "test1", "test11", "test12", "test13", "test2", "test21", + "test", "te", "s", "q", @@ -64,7 +82,7 @@ public class SlicedTrieTest "\000\000\377", "\377\377" }); - public static final ByteComparable[] KEYS = toByteComparable(new String[]{ + public static final Preencoded[] KEYS = toByteComparable(new String[]{ "test1", "test2", "test55", @@ -72,6 +90,8 @@ public class SlicedTrieTest "test124", "test12", "test21", + "test", + "te", "tease", "sort", "sorting", @@ -84,7 +104,6 @@ public class SlicedTrieTest "\377\377" }); - public static final Comparator BYTE_COMPARABLE_COMPARATOR = (bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, byteComparableVersion); private static final int COUNT = 15000; Random rand = new Random(); @@ -96,22 +115,22 @@ public void testIntersectRangeDirect() public void testIntersectRange(int count) { - ByteComparable[] src1 = generateKeys(rand, count); - NavigableMap content1 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, byteComparableVersion)); + Preencoded[] src1 = generateKeys(rand, count); + NavigableMap content1 = new TreeMap<>(FORWARD_COMPARATOR); InMemoryTrie trie1 = makeInMemoryTrie(src1, content1, true); checkEqualRange(content1, trie1, null, 
true, null, true); - checkEqualRange(content1, trie1, InMemoryTrieTestBase.generateKey(rand), true, null, true); - checkEqualRange(content1, trie1, null, true, InMemoryTrieTestBase.generateKey(rand), true); + checkEqualRange(content1, trie1, TrieUtil.generateKey(rand), true, null, true); + checkEqualRange(content1, trie1, null, true, TrieUtil.generateKey(rand), true); for (int i = 0; i < 4; ++i) { - ByteComparable l = rand.nextBoolean() ? InMemoryTrieTestBase.generateKey(rand) : src1[rand.nextInt(src1.length)]; - ByteComparable r = rand.nextBoolean() ? InMemoryTrieTestBase.generateKey(rand) : src1[rand.nextInt(src1.length)]; - int cmp = ByteComparable.compare(l, r, byteComparableVersion); + Preencoded l = rand.nextBoolean() ? TrieUtil.generateKey(rand) : src1[rand.nextInt(src1.length)]; + Preencoded r = rand.nextBoolean() ? TrieUtil.generateKey(rand) : src1[rand.nextInt(src1.length)]; + int cmp = ByteComparable.compare(l, r, VERSION); if (cmp > 0) { - ByteComparable t = l; + Preencoded t = l; l = r; r = t; // swap } @@ -124,52 +143,136 @@ public void testIntersectRange(int count) } } - private static ByteComparable[] toByteComparable(String[] keys) + private static Preencoded[] toByteComparable(String[] keys) { return Arrays.stream(keys) - .map(x -> ByteComparable.preencoded(byteComparableVersion, x.getBytes(StandardCharsets.UTF_8))) - .toArray(ByteComparable[]::new); + .map(x -> ByteComparable.preencoded(VERSION, x.getBytes(StandardCharsets.UTF_8))) + .toArray(Preencoded[]::new); + } + + @Test + public void testSingletonOrdered() + { + ByteBuffer b = ByteBuffer.allocate(0); + List> singletons = new ArrayList<>(); + TreeMap map = new TreeMap<>(ByteComparable::compare); + for (Preencoded key : KEYS) + { + singletons.add(Trie.singletonOrdered(key, VERSION, b)); + map.put(key, b); + } + Trie trie = Trie.merge(singletons, collection -> b); + + System.out.println(trie.cursor(Direction.FORWARD).process(new TrieDumper.Plain<>(x -> "X"))); + 
System.out.println(trie.cursor(Direction.REVERSE).process(new TrieDumper.Plain<>(x -> "X"))); + + TrieUtil.assertMapEquals(trie.entryIterator(Direction.FORWARD), map.entrySet().iterator()); + TrieUtil.assertMapEquals(trie.entryIterator(Direction.REVERSE), map.descendingMap().entrySet().iterator()); + } + + @Test + public void testSingletonSlice() + { + testSingletonSlice(key -> Trie.singletonOrdered(key, VERSION, true)); + } + + @Test + public void testSingletonSliceTailForward() + { + testSingletonSlice(key -> Trie.singletonOrdered(key, VERSION, true) + .cursor(Direction.FORWARD)::tailCursor); + } + + @Test + public void testSingletonSliceTailReverse() + { + testSingletonSlice(key -> Trie.singletonOrdered(key, VERSION, true) + .cursor(Direction.REVERSE)::tailCursor); } @Test - public void testSingletonSubtrie() + public void testPrefixToEmptySlice() + { + testSingletonSlice(key -> Trie.singletonOrdered(EMPTY, VERSION, true).prefixedBy(key)); + } + + @Test + public void testEmptyPrefixSlice() + { + testSingletonSlice(key -> Trie.singletonOrdered(key, VERSION, true).prefixedBy(EMPTY)); + } + + @Test + public void testSplitPrefixSlice() + { + testSingletonSlice(key -> + { + byte[] bytes = key.asByteComparableArray(VERSION); + int cut = bytes.length / 2; + Preencoded k1 = ByteComparable.preencoded(VERSION, bytes, 0, cut); + Preencoded k2 = ByteComparable.preencoded(VERSION, bytes, cut, bytes.length - cut); + return Trie.singletonOrdered(k2, VERSION, true).prefixedBy(k1); + }); + } + + public void testSingletonSlice(Function> make) { - Arrays.sort(BOUNDARIES, (a, b) -> ByteComparable.compare(a, b, byteComparableVersion)); + Arrays.sort(BOUNDARIES, (a, b) -> ByteComparable.compare(a, b, VERSION)); for (int li = -1; li < BOUNDARIES.length; ++li) { - ByteComparable l = li < 0 ? null : BOUNDARIES[li]; + Preencoded l = li < 0 ? null : BOUNDARIES[li]; for (int ri = Math.max(0, li); ri <= BOUNDARIES.length; ++ri) { - ByteComparable r = ri == BOUNDARIES.length ? 
null : BOUNDARIES[ri]; + Preencoded r = ri == BOUNDARIES.length ? null : BOUNDARIES[ri]; for (int i = li == ri ? 3 : 0; i < 4; ++i) { boolean includeLeft = (i & 1) != 0; boolean includeRight = (i & 2) != 0; - for (ByteComparable key : KEYS) + for (Preencoded key : KEYS) { - int cmp1 = l != null ? ByteComparable.compare(key, l, byteComparableVersion) : 1; - int cmp2 = r != null ? ByteComparable.compare(r, key, byteComparableVersion) : 1; - Trie ix = new SlicedTrie<>(Trie.singleton(key, byteComparableVersion, true), l, includeLeft, r, includeRight); + int cmp1 = l != null ? ByteComparable.compare(key, l, VERSION) : 1; + int cmp2 = r != null ? ByteComparable.compare(r, key, VERSION) : 1; + Trie ix = make.apply(key).slice(l, includeLeft, r, includeRight); boolean expected = true; if (cmp1 < 0 || cmp1 == 0 && !includeLeft) expected = false; if (cmp2 < 0 || cmp2 == 0 && !includeRight) expected = false; - boolean actual = com.google.common.collect.Iterables.getFirst(ix.values(), false); - if (expected != actual) + + try + { + assertEquals(expected, Iterables.getFirst(ix.values(), false)); + } + catch (Throwable t) { System.err.println("Intersection"); System.err.println(ix.dump()); - Assert.fail(String.format("Failed on range %s%s,%s%s key %s expected %s got %s\n", + Assert.fail(String.format("Failed on range %s%s,%s%s key %s\n%s\n", includeLeft ? "[" : "(", - l != null ? l.byteComparableAsString(byteComparableVersion) : null, - r != null ? r.byteComparableAsString(byteComparableVersion) : null, + l != null ? l.byteComparableAsString(VERSION) : null, + r != null ? r.byteComparableAsString(VERSION) : null, includeRight ? 
"]" : ")", - key.byteComparableAsString(byteComparableVersion), - expected, - actual)); + key.byteComparableAsString(VERSION), + t)); + } + + try + { + assertEquals(expected, Iterables.getFirst(ix.values(Direction.REVERSE), false)); + } + catch (Throwable t) + { + System.err.println("Intersection REV"); + System.err.println(ix.cursor(Direction.REVERSE).process(new TrieDumper.Plain<>(Object::toString))); + Assert.fail(String.format("Failed on range %s%s,%s%s REV key %s\n%s\n", + includeLeft ? "[" : "(", + l != null ? l.byteComparableAsString(VERSION) : null, + r != null ? r.byteComparableAsString(VERSION) : null, + includeRight ? "]" : ")", + key.byteComparableAsString(VERSION), + t)); } } } @@ -178,18 +281,18 @@ public void testSingletonSubtrie() } @Test - public void testMemtableSubtrie() + public void testMemtableSlice() { - Arrays.sort(BOUNDARIES, BYTE_COMPARABLE_COMPARATOR); - NavigableMap content1 = new TreeMap<>(BYTE_COMPARABLE_COMPARATOR); + Arrays.sort(BOUNDARIES, FORWARD_COMPARATOR); + NavigableMap content1 = new TreeMap<>(FORWARD_COMPARATOR); InMemoryTrie trie1 = makeInMemoryTrie(KEYS, content1, true); for (int li = -1; li < BOUNDARIES.length; ++li) { - ByteComparable l = li < 0 ? null : BOUNDARIES[li]; + Preencoded l = li < 0 ? null : BOUNDARIES[li]; for (int ri = Math.max(0, li); ri <= BOUNDARIES.length; ++ri) { - ByteComparable r = ri == BOUNDARIES.length ? null : BOUNDARIES[ri]; + Preencoded r = ri == BOUNDARIES.length ? 
null : BOUNDARIES[ri]; for (int i = 0; i < 4; ++i) { boolean includeLeft = (i & 1) != 0; @@ -203,27 +306,27 @@ public void testMemtableSubtrie() } @Test - public void testMergeSubtrie() + public void testMergeSlice() { - testMergeSubtrie(2); + testMergeSlice(2); } @Test - public void testCollectionMergeSubtrie3() + public void testCollectionMergeSlice3() { - testMergeSubtrie(3); + testMergeSlice(3); } @Test - public void testCollectionMergeSubtrie5() + public void testCollectionMergeSlice5() { - testMergeSubtrie(5); + testMergeSlice(5); } - public void testMergeSubtrie(int mergeCount) + public void testMergeSlice(int mergeCount) { - Arrays.sort(BOUNDARIES, BYTE_COMPARABLE_COMPARATOR); - NavigableMap content1 = new TreeMap<>(BYTE_COMPARABLE_COMPARATOR); + Arrays.sort(BOUNDARIES, FORWARD_COMPARATOR); + NavigableMap content1 = new TreeMap<>(FORWARD_COMPARATOR); List> tries = new ArrayList<>(); for (int i = 0; i < mergeCount; ++i) { @@ -237,10 +340,10 @@ public void testMergeSubtrie(int mergeCount) for (int li = -1; li < BOUNDARIES.length; ++li) { - ByteComparable l = li < 0 ? null : BOUNDARIES[li]; + Preencoded l = li < 0 ? null : BOUNDARIES[li]; for (int ri = Math.max(0, li); ri <= BOUNDARIES.length; ++ri) { - ByteComparable r = ri == BOUNDARIES.length ? null : BOUNDARIES[ri]; + Preencoded r = ri == BOUNDARIES.length ? null : BOUNDARIES[ri]; for (int i = 0; i < 4; ++i) { boolean includeLeft = (i & 1) != 0; @@ -253,283 +356,201 @@ public void testMergeSubtrie(int mergeCount) } } - public void checkEqualRange(NavigableMap content1, + public void checkEqualRange(NavigableMap content1, Trie t1, - ByteComparable l, + Preencoded l, boolean includeLeft, - ByteComparable r, + Preencoded r, boolean includeRight) { System.out.println(String.format("Intersection with %s%s:%s%s", includeLeft ? "[" : "(", asString(l), asString(r), includeRight ? "]" : ")")); - SortedMap imap = l == null - ? r == null - ? content1 - : content1.headMap(r, includeRight) - : r == null - ? 
content1.tailMap(l, includeLeft) - : content1.subMap(l, includeLeft, r, includeRight); - - Trie intersection = t1.subtrie(l, includeLeft, r, includeRight); - assertSameContent(intersection, imap); + SortedMap imap = boundedOrderedMap(content1, l, includeLeft, r, includeRight); + Trie intersection = t1.slice(l, includeLeft, r, includeRight); + try + { + assertSameContent(intersection, imap); + } + catch (AssertionError e) + { + System.out.println("\n" + t1.dump(ByteBufferUtil::bytesToHex)); + + System.out.println("\n" + intersection.dump(ByteBufferUtil::bytesToHex)); + throw e; + } if (l == null || r == null) return; // Test intersecting intersection. - intersection = t1.subtrie(l, includeLeft, null, false).subtrie(null, false, r, includeRight); + intersection = t1.slice(l, includeLeft, null, false).slice(null, false, r, includeRight); assertSameContent(intersection, imap); - intersection = t1.subtrie(null, false, r, includeRight).subtrie(l, includeLeft, null, false); + intersection = t1.slice(null, false, r, includeRight).slice(l, includeLeft, null, false); assertSameContent(intersection, imap); } - /** - * Extract the values of the provide trie into a list. - */ - private static List toList(Trie trie) + private static SortedMap boundedOrderedMap(NavigableMap content1, Preencoded l, boolean includeLeft, Preencoded r, boolean includeRight) { - return Iterables.toList(trie.values()); + return l != null ? r != null ? content1.subMap(l, includeLeft, r, includeRight) + : content1.tailMap(l, includeLeft) + : r != null ? content1.headMap(r, includeRight) + : content1; } /** - * Creates a simple trie with a root having the provided number of childs, where each child is a leaf whose content - * is simply the value of the transition leading to it. - * - * In other words, {@code singleLevelIntTrie(4)} creates the following trie: - * Root - * t= 0 1 2 3 - * | | | | - * 0 1 2 3 + * Extract the values of the provided trie into a list. 
*/ - private static Trie singleLevelIntTrie(int childs) + private static List toList(Trie trie, Direction direction) { - return new Trie() - { - @Override - protected Cursor cursor(Direction direction) - { - return new singleLevelCursor(direction); - } - - class singleLevelCursor implements Cursor - { - final Direction direction; - int current = -1; - - singleLevelCursor(Direction direction) - { - this.direction = direction; - current = direction.select(-1, childs); - } - - @Override - public int advance() - { - current += direction.increase; - return depth(); - } - - @Override - public int skipTo(int depth, int transition) - { - if (depth > 1) - return advance(); - if (depth < 1) - transition = direction.select(childs, -1); - - if (direction.isForward()) - current = Math.max(0, transition); - else - current = Math.min(childs - 1, transition); - - return depth(); - } - - @Override - public int depth() - { - if (current == direction.select(-1, childs)) - return 0; - if (direction.inLoop(current, 0, childs - 1)) - return 1; - return -1; - } - - @Override - public int incomingTransition() - { - return current >= childs ? -1 : current; - } - - @Override - public Integer content() - { - return current == direction.select(-1, childs) ? 
-1 : current; - } - - @Override - public Direction direction() - { - return direction; - } - - @Override - public ByteComparable.Version byteComparableVersion() - { - return byteComparableVersion; - } - - @Override - public Trie tailTrie() - { - throw new UnsupportedOperationException("tailTrie on test cursor"); - } - } - }; + return Streams.stream(trie.values(direction)).collect(Collectors.toList()); } /** Creates a single byte {@link ByteComparable} with the provide value */ private static ByteComparable of(int value) { assert value >= 0 && value <= Byte.MAX_VALUE; - return ByteComparable.preencoded(byteComparableVersion, new byte[]{ (byte)value }); + return ByteComparable.preencoded(VERSION, new byte[]{ (byte)value }); + } + + List maybeReversed(Direction direction, List list) + { + if (direction.isForward()) + return list; + List reversed = new ArrayList<>(list); + reversed.sort((x, y) -> Integer.compare(y, x)); + return reversed; + } + + void assertTrieEquals(List expected, Trie trie) + { + assertEquals(expected, toList(trie, Direction.FORWARD)); + assertEquals(maybeReversed(Direction.REVERSE, expected), toList(trie, Direction.REVERSE)); } @Test public void testSimpleIntersectionII() { - Trie trie = singleLevelIntTrie(10); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie)); + Trie trie = singleLevelIntTrie(10, true); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); - Trie intersection = trie.subtrie(of(3), true, of(7), true); - assertEquals(asList(3, 4, 5, 6, 7), toList(intersection)); + Trie intersection = trie.slice(of(3), true, of(7), true); + assertTrieEquals(asList(3, 4, 5, 6, 7), intersection); } @Test public void testSimpleIntersectionEI() { - Trie trie = singleLevelIntTrie(10); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie)); + Trie trie = singleLevelIntTrie(10, true); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); - Trie intersection = trie.subtrie(of(3), false, of(7), 
true); - assertEquals(asList(4, 5, 6, 7), toList(intersection)); + Trie intersection = trie.slice(of(3), false, of(7), true); + assertTrieEquals(asList(4, 5, 6, 7), intersection); } @Test public void testSimpleIntersectionIE() { - Trie trie = singleLevelIntTrie(10); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie)); + Trie trie = singleLevelIntTrie(10, true); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); - Trie intersection = trie.subtrie(of(3), true, of(7), false); - assertEquals(asList(3, 4, 5, 6), toList(intersection)); + Trie intersection = trie.slice(of(3), true, of(7), false); + assertTrieEquals(asList(3, 4, 5, 6), intersection); } @Test public void testSimpleIntersectionEE() { - Trie trie = singleLevelIntTrie(10); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie)); + Trie trie = singleLevelIntTrie(10, true); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); - Trie intersection = trie.subtrie(of(3), false, of(7), false); - assertEquals(asList(4, 5, 6), toList(intersection)); + Trie intersection = trie.slice(of(3), false, of(7), false); + assertTrieEquals(asList(4, 5, 6), intersection); } @Test public void testSimpleLeftIntersectionE() { - Trie trie = singleLevelIntTrie(10); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie)); + Trie trie = singleLevelIntTrie(10, true); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); - Trie intersection = trie.subtrie(of(3), false, null, true); - assertEquals(asList(4, 5, 6, 7, 8, 9), toList(intersection)); + Trie intersection = trie.slice(of(3), false, null, true); + assertTrieEquals(asList(4, 5, 6, 7, 8, 9), intersection); } @Test public void testSimpleLeftIntersectionI() { - Trie trie = singleLevelIntTrie(10); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie)); + Trie trie = singleLevelIntTrie(10, true); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); - Trie 
intersection = trie.subtrie(of(3), true, null, true); - assertEquals(asList(3, 4, 5, 6, 7, 8, 9), toList(intersection)); + Trie intersection = trie.slice(of(3), true, null, true); + assertTrieEquals(asList(3, 4, 5, 6, 7, 8, 9), intersection); } @Test public void testSimpleRightIntersectionE() { - Trie trie = singleLevelIntTrie(10); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie)); + Trie trie = singleLevelIntTrie(10, true); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); - Trie intersection = trie.subtrie(null, true, of(7), false); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6), toList(intersection)); + Trie intersection = trie.slice(null, true, of(7), false); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6), intersection); } @Test public void testSimpleRightIntersectionI() { - Trie trie = singleLevelIntTrie(10); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie)); + Trie trie = singleLevelIntTrie(10, true); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); - Trie intersection = trie.subtrie(null, true, of(7), true); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7), toList(intersection)); + Trie intersection = trie.slice(null, true, of(7), true); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7), intersection); } @Test public void testSimpleNoIntersection() { - Trie trie = singleLevelIntTrie(10); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie)); + Trie trie = singleLevelIntTrie(10, true); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); - Trie intersection = trie.subtrie(null, true, null, true); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(intersection)); + Trie intersection = trie.slice(null, true, null, true); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), intersection); // The two boolean flags don't have a meaning when the bound does not exist. For completeness, also test // with them set to false. 
- intersection = trie.subtrie(null, false, null, false); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(intersection)); + intersection = trie.slice(null, false, null, false); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), intersection); } @Test public void testSimpleEmptyIntersectionLeft() { - Trie trie = singleLevelIntTrie(10); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie)); - - Trie intersection = trie.subtrie(ByteComparable.EMPTY, true, null, true); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(intersection)); - - intersection = trie.subtrie(ByteComparable.EMPTY, false, null, true); - assertEquals(asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(intersection)); + Trie trie = singleLevelIntTrie(10, true); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); - intersection = trie.subtrie(ByteComparable.EMPTY, true, of(5), true); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5), toList(intersection)); - - intersection = trie.subtrie(ByteComparable.EMPTY, false, of(5), true); - assertEquals(asList(0, 1, 2, 3, 4, 5), toList(intersection)); + Trie intersection = trie.slice(ByteComparable.EMPTY, true, null, true); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), intersection); + // Not currently supported + intersection = trie.slice(ByteComparable.EMPTY, false, null, true); + assertTrieEquals(asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), intersection); } @Test public void testSimpleEmptyIntersectionRight() { - Trie trie = singleLevelIntTrie(10); - assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie)); - - Trie intersection = trie.subtrie(null, true, ByteComparable.EMPTY, true); - assertEquals(asList(-1), toList(intersection)); - - intersection = trie.subtrie(null, true, ByteComparable.EMPTY, false); - assertEquals(asList(), toList(intersection)); + Trie trie = singleLevelIntTrie(10, true); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); - intersection = 
trie.subtrie(ByteComparable.EMPTY, true, ByteComparable.EMPTY, true); - assertEquals(asList(-1), toList(intersection)); + Trie intersection = trie.slice(null, true, ByteComparable.EMPTY, true); + assertTrieEquals(asList(-1), intersection); - intersection = trie.subtrie(ByteComparable.EMPTY, false, ByteComparable.EMPTY, true); - assertEquals(asList(), toList(intersection)); + // Not currently supported + intersection = trie.slice(null, true, ByteComparable.EMPTY, false); + assertTrieEquals(asList(), intersection); - intersection = trie.subtrie(ByteComparable.EMPTY, true, ByteComparable.EMPTY, false); - assertEquals(asList(), toList(intersection)); + intersection = trie.slice(ByteComparable.EMPTY, true, ByteComparable.EMPTY, true); + assertTrieEquals(asList(-1), intersection); // (empty, empty) is an invalid call as the "(empty" is greater than "empty)" } @@ -537,28 +558,46 @@ public void testSimpleEmptyIntersectionRight() @Test public void testSubtrieOnSubtrie() { - Trie trie = singleLevelIntTrie(15); + Trie trie = singleLevelIntTrie(15, true); // non-overlapping - Trie intersection = trie.subtrie(of(0), of(4)).subtrie(of(4), of(8)); - assertEquals(asList(), toList(intersection)); + ByteComparable left9 = of(0); + Trie integerTrie6 = trie.slice(left9, true, of(4), false); + ByteComparable left10 = of(4); + Trie intersection = integerTrie6.slice(left10, true, of(8), false); + assertTrieEquals(asList(), intersection); // touching - intersection = trie.subtrie(of(0), true, of(3), true).subtrie(of(3), of(8)); - assertEquals(asList(3), toList(intersection)); + Trie integerTrie5 = trie.slice(of(0), true, of(3), true); + ByteComparable left8 = of(3); + intersection = integerTrie5.slice(left8, true, of(8), false); + assertTrieEquals(asList(3), intersection); // overlapping 1 - intersection = trie.subtrie(of(0), of(4)).subtrie(of(2), of(8)); - assertEquals(asList(2, 3), toList(intersection)); + ByteComparable left6 = of(0); + Trie integerTrie4 = trie.slice(left6, true, 
of(4), false); + ByteComparable left7 = of(2); + intersection = integerTrie4.slice(left7, true, of(8), false); + assertTrieEquals(asList(2, 3), intersection); // overlapping 2 - intersection = trie.subtrie(of(0), of(4)).subtrie(of(1), of(8)); - assertEquals(asList(1, 2, 3), toList(intersection)); + ByteComparable left4 = of(0); + Trie integerTrie3 = trie.slice(left4, true, of(4), false); + ByteComparable left5 = of(1); + intersection = integerTrie3.slice(left5, true, of(8), false); + assertTrieEquals(asList(1, 2, 3), intersection); // covered - intersection = trie.subtrie(of(0), of(4)).subtrie(of(0), of(8)); - assertEquals(asList(0, 1, 2, 3), toList(intersection)); + ByteComparable left2 = of(0); + Trie integerTrie2 = trie.slice(left2, true, of(4), false); + ByteComparable left3 = of(0); + intersection = integerTrie2.slice(left3, true, of(8), false); + assertTrieEquals(asList(0, 1, 2, 3), intersection); // covered 2 - intersection = trie.subtrie(of(4), true, of(8), true).subtrie(of(0), of(8)); - assertEquals(asList(4, 5, 6, 7), toList(intersection)); + Trie integerTrie1 = trie.slice(of(4), true, of(8), true); + ByteComparable left1 = of(0); + intersection = integerTrie1.slice(left1, true, of(8), false); + assertTrieEquals(asList(4, 5, 6, 7), intersection); // covered 3 - intersection = trie.subtrie(of(1), false, of(4), true).subtrie(of(0), of(8)); - assertEquals(asList(2, 3, 4), toList(intersection)); + Trie integerTrie = trie.slice(of(1), false, of(4), true); + ByteComparable left = of(0); + intersection = integerTrie.slice(left, true, of(8), false); + assertTrieEquals(asList(2, 3, 4), intersection); } } diff --git a/test/unit/org/apache/cassandra/db/tries/SubtrieTest.java b/test/unit/org/apache/cassandra/db/tries/SubtrieTest.java new file mode 100644 index 000000000000..ec75a3ad4101 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/SubtrieTest.java @@ -0,0 +1,521 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.NavigableMap; +import java.util.Random; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.function.Function; +import java.util.stream.Collectors; + +import com.google.common.collect.Iterables; +import com.google.common.collect.Streams; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +import static java.util.Arrays.asList; +import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.makeInMemoryTrie; +import static org.apache.cassandra.db.tries.InMemoryTrieTestBase.assertSameContent; +import static org.apache.cassandra.db.tries.TrieUtil.FORWARD_COMPARATOR; +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.apache.cassandra.db.tries.TrieUtil.asString; +import static org.apache.cassandra.db.tries.TrieUtil.generateKeys; +import static org.apache.cassandra.db.tries.TrieUtil.singleLevelIntTrie; +import 
static org.apache.cassandra.db.tries.TrieUtil.toBound; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.EMPTY; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.Preencoded; +import static org.junit.Assert.assertEquals; + +public class SubtrieTest +{ + @BeforeClass + public static void enableVerification() + { + CassandraRelevantProperties.TRIE_DEBUG.setBoolean(true); + } + + public static final Preencoded[] BOUNDARIES = toByteComparable(new String[]{ + "test1", + "test11", + "test12", + "test13", + "test2", + "test21", + "test", + "te", + "s", + "q", + "\000", + "\377", + "\377\000", + "\000\377", + "\000\000", + "\000\000\000", + "\000\000\377", + "\377\377" + }); + public static final Preencoded[] KEYS = toByteComparable(new String[]{ + "test1", + "test2", + "test55", + "test123", + "test124", + "test12", + "test21", + "test", + "te", + "tease", + "sort", + "sorting", + "square", + "\377\000", + "\000\377", + "\000\000", + "\000\000\000", + "\000\000\377", + "\377\377" + }); + + private static final int COUNT = 15000; + Random rand = new Random(); + + @Test + public void testIntersectRangeDirect() + { + testIntersectRange(COUNT); + } + + public void testIntersectRange(int count) + { + Preencoded[] src1 = generateKeys(rand, count); + NavigableMap content1 = new TreeMap<>(FORWARD_COMPARATOR); + + InMemoryTrie trie1 = makeInMemoryTrie(src1, content1, true); + + checkEqualRange(content1, trie1, null, true, null, true); + checkEqualRange(content1, trie1, TrieUtil.generateKey(rand), true, null, true); + checkEqualRange(content1, trie1, null, true, TrieUtil.generateKey(rand), true); + for (int i = 0; i < 4; ++i) + { + Preencoded l = rand.nextBoolean() ? TrieUtil.generateKey(rand) : src1[rand.nextInt(src1.length)]; + Preencoded r = rand.nextBoolean() ? 
TrieUtil.generateKey(rand) : src1[rand.nextInt(src1.length)]; + int cmp = ByteComparable.compare(l, r, VERSION); + if (cmp > 0) + { + Preencoded t = l; + l = r; + r = t; // swap + } + + boolean includeLeft = (i & 1) != 0 || cmp == 0; + boolean includeRight = (i & 2) != 0 || cmp == 0; + checkEqualRange(content1, trie1, l, includeLeft, r, includeRight); + checkEqualRange(content1, trie1, null, includeLeft, r, includeRight); + checkEqualRange(content1, trie1, l, includeLeft, null, includeRight); + } + } + + private static Preencoded[] toByteComparable(String[] keys) + { + return Arrays.stream(keys) + .map(TrieUtil::comparable) + .toArray(Preencoded[]::new); + } + + @Test + public void testSingletonSubtrie() + { + testSingletonSubtrie(key -> Trie.singleton(key, VERSION, true)); + } + + @Test + public void testSingletonSubtrieTailForward() + { + testSingletonSubtrie(key -> Trie.singleton(key, VERSION, true) + .cursor(Direction.FORWARD)::tailCursor); + } + + @Test + public void testSingletonSubtrieTailReverse() + { + testSingletonSubtrie(key -> Trie.singleton(key, VERSION, true) + .cursor(Direction.REVERSE)::tailCursor); + } + + @Test + public void testPrefixToEmptySubtrie() + { + testSingletonSubtrie(key -> Trie.singleton(EMPTY, VERSION, true).prefixedBy(key)); + } + + @Test + public void testEmptyPrefixSubtrie() + { + testSingletonSubtrie(key -> Trie.singleton(key, VERSION, true).prefixedBy(EMPTY)); + } + + @Test + public void testSplitPrefixSubtrie() + { + testSingletonSubtrie(key -> + { + byte[] bytes = key.asByteComparableArray(VERSION); + int cut = bytes.length / 2; + Preencoded k1 = ByteComparable.preencoded(VERSION, bytes, 0, cut); + Preencoded k2 = ByteComparable.preencoded(VERSION, bytes, cut, bytes.length - cut); + return Trie.singleton(k2, VERSION, true).prefixedBy(k1); + }); + } + + public void testSingletonSubtrie(Function> make) + { + Arrays.sort(BOUNDARIES, (a, b) -> ByteComparable.compare(a, b, VERSION)); + for (int li = -1; li < BOUNDARIES.length; ++li) 
+ { + Preencoded l = li < 0 ? null : BOUNDARIES[li]; + for (int ri = Math.max(0, li); ri <= BOUNDARIES.length; ++ri) + { + Preencoded r = ri == BOUNDARIES.length ? null : BOUNDARIES[ri]; + + for (int i = li == ri ? 3 : 0; i < 4; ++i) + { + boolean includeLeft = (i & 1) != 0; + boolean includeRight = (i & 2) != 0; + + for (Preencoded key : KEYS) + { + int cmp1 = l != null ? ByteComparable.compare(key, l, VERSION) : 1; + int cmp2 = r != null ? ByteComparable.compare(r, key, VERSION) : 1; + Trie ix = make.apply(key).subtrie(toBound(l, !includeLeft), toBound(r, includeRight)); + boolean expected = true; + if (cmp1 < 0 || cmp1 == 0 && !includeLeft) + expected = false; + if (cmp2 < 0 || cmp2 == 0 && !includeRight) + expected = false; + + try + { + assertEquals(expected, Iterables.getFirst(ix.values(), false)); + } + catch (Throwable t) + { + System.err.println("Intersection"); + System.err.println(ix.dump()); + Assert.fail(String.format("Failed on range %s%s,%s%s key %s\n%s\n", + includeLeft ? "[" : "(", + l != null ? l.byteComparableAsString(VERSION) : null, + r != null ? r.byteComparableAsString(VERSION) : null, + includeRight ? "]" : ")", + key.byteComparableAsString(VERSION), + t)); + } + + try + { + assertEquals(expected, Iterables.getFirst(ix.values(Direction.REVERSE), false)); + } + catch (Throwable t) + { + System.err.println("Intersection REV"); + System.err.println(ix.cursor(Direction.REVERSE).process(new TrieDumper.Plain<>(Object::toString))); + Assert.fail(String.format("Failed on range %s%s,%s%s REV key %s\n%s\n", + includeLeft ? "[" : "(", + l != null ? l.byteComparableAsString(VERSION) : null, + r != null ? r.byteComparableAsString(VERSION) : null, + includeRight ? 
"]" : ")", + key.byteComparableAsString(VERSION), + t)); + } + } + } + } + } + } + + @Test + public void testMemtableSubtrie() + { + Arrays.sort(BOUNDARIES, FORWARD_COMPARATOR); + NavigableMap content1 = new TreeMap<>(FORWARD_COMPARATOR); + InMemoryTrie trie1 = makeInMemoryTrie(KEYS, content1, true); + + for (int li = -1; li < BOUNDARIES.length; ++li) + { + Preencoded l = li < 0 ? null : BOUNDARIES[li]; + for (int ri = Math.max(0, li); ri <= BOUNDARIES.length; ++ri) + { + Preencoded r = ri == BOUNDARIES.length ? null : BOUNDARIES[ri]; + for (int i = 0; i < 4; ++i) + { + boolean includeLeft = (i & 1) != 0; + boolean includeRight = (i & 2) != 0; + if ((!includeLeft || !includeRight) && li == ri) + continue; + checkEqualRange(content1, trie1, l, includeLeft, r, includeRight); + } + } + } + } + + @Test + public void testMergeSubtrie() + { + testMergeSubtrie(2); + } + + @Test + public void testCollectionMergeSubtrie3() + { + testMergeSubtrie(3); + } + + @Test + public void testCollectionMergeSubtrie5() + { + testMergeSubtrie(5); + } + + public void testMergeSubtrie(int mergeCount) + { + Arrays.sort(BOUNDARIES, FORWARD_COMPARATOR); + NavigableMap content1 = new TreeMap<>(FORWARD_COMPARATOR); + List> tries = new ArrayList<>(); + for (int i = 0; i < mergeCount; ++i) + { + tries.add(makeInMemoryTrie(Arrays.copyOfRange(KEYS, + KEYS.length * i / mergeCount, + KEYS.length * (i + 1) / mergeCount), + content1, + true)); + } + Trie trie1 = Trie.mergeDistinct(tries); + + for (int li = -1; li < BOUNDARIES.length; ++li) + { + Preencoded l = li < 0 ? null : BOUNDARIES[li]; + for (int ri = Math.max(0, li); ri <= BOUNDARIES.length; ++ri) + { + Preencoded r = ri == BOUNDARIES.length ? 
null : BOUNDARIES[ri]; + for (int i = 0; i < 4; ++i) + { + boolean includeLeft = (i & 1) != 0; + boolean includeRight = (i & 2) != 0; + if ((!includeLeft || !includeRight) && li == ri) + continue; + checkEqualRange(content1, trie1, l, includeLeft, r, includeRight); + } + } + } + } + + public void checkEqualRange(NavigableMap content1, + Trie t1, + Preencoded l, + boolean includeLeft, + Preencoded r, + boolean includeRight) + { + System.out.println(String.format("Intersection with %s%s:%s%s", includeLeft ? "[" : "(", asString(l), asString(r), includeRight ? "]" : ")")); + SortedMap imap = TrieUtil.boundedMap(content1, l, includeLeft, r, includeRight); + Trie intersection = t1.subtrie(toBound(l, !includeLeft), toBound(r, includeRight)); + try + { + assertSameContent(intersection, imap); + } + catch (AssertionError e) + { + System.out.println("\n" + t1.dump(ByteBufferUtil::bytesToHex)); + + System.out.println("\n" + intersection.dump(ByteBufferUtil::bytesToHex)); + throw e; + } + + if (l == null || r == null) + return; + + // Test intersecting intersection. + intersection = t1.subtrie(toBound(l, !includeLeft), null).subtrie(null, toBound(r, includeRight)); + assertSameContent(intersection, imap); + + intersection = t1.subtrie(null, toBound(r, includeRight)).subtrie(toBound(l, !includeLeft), null); + assertSameContent(intersection, imap); + } + + /** + * Extract the values of the provided trie into a list. 
+ */ + private static List toList(Trie trie, Direction direction) + { + return Streams.stream(trie.values(direction)).collect(Collectors.toList()); + } + + /** Creates a single byte {@link ByteComparable} with the provided value */ + private static ByteComparable of(int value) + { + assert value >= 0 && value <= Byte.MAX_VALUE; + return ByteComparable.preencoded(VERSION, new byte[]{ (byte)value }); + } + + List maybeReversed(Direction direction, List list) + { + if (direction.isForward()) + return list; + List reversed = new ArrayList<>(list); + reversed.sort((x, y) -> x == -1 ? -1 : y == -1 ? 1 : Integer.compare(y, x)); + return reversed; + } + + void assertTrieEquals(List expected, Trie trie) + { + assertEquals(expected, toList(trie, Direction.FORWARD)); + assertEquals(maybeReversed(Direction.REVERSE, expected), toList(trie, Direction.REVERSE)); + } + + @Test + public void testSimpleIntersection() + { + Trie trie = singleLevelIntTrie(10, false); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); + + Trie intersection = trie.subtrie(of(3), of(7)); + assertTrieEquals(asList(-1, 3, 4, 5, 6, 7), intersection); + } + + @Test + public void testSimpleLeftIntersection() + { + Trie trie = singleLevelIntTrie(10, false); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); + + Trie intersection = trie.subtrie(of(3), null); + assertTrieEquals(asList(-1, 3, 4, 5, 6, 7, 8, 9), intersection); + } + + @Test + public void testSimpleRightIntersection() + { + Trie trie = singleLevelIntTrie(10, false); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); + + Trie intersection = trie.subtrie(null, of(7)); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7), intersection); + } + + @Test + public void testSimpleNoIntersection() + { + Trie trie = singleLevelIntTrie(10, false); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); + + Trie intersection = trie.subtrie(null, null); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 
6, 7, 8, 9), intersection); + } + + @Test + public void testSimpleEmptyIntersectionLeft() + { + Trie trie = singleLevelIntTrie(10, false); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); + + Trie intersection = trie.subtrie(ByteComparable.EMPTY, null); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), intersection); + + intersection = trie.subtrie(ByteComparable.EMPTY, ByteComparable.EMPTY); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), intersection); + } + + @Test + public void testSimpleEmptyIntersectionRight() + { + Trie trie = singleLevelIntTrie(10, false); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), trie); + + Trie intersection = trie.subtrie(null, ByteComparable.EMPTY); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), intersection); + + intersection = trie.subtrie(ByteComparable.EMPTY, ByteComparable.EMPTY); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), intersection); + } + + @Test + public void testSubtrieOnSubtrie() + { + Trie trie = singleLevelIntTrie(15, false); + + // non-overlapping + Trie intersection = trie.subtrie(of(0), of(4)).subtrie(of(5), of(8)); + assertTrieEquals(asList(-1), intersection); + // touching + intersection = trie.subtrie(of(0), of(3)).subtrie(of(3), of(8)); + assertTrieEquals(asList(-1, 3), intersection); + // overlapping 1 + intersection = trie.subtrie(of(0), of(4)).subtrie(of(2), of(8)); + assertTrieEquals(asList(-1, 2, 3, 4), intersection); + // overlapping 2 + intersection = trie.subtrie(of(0), of(4)).subtrie(of(1), of(8)); + assertTrieEquals(asList(-1, 1, 2, 3, 4), intersection); + // covered + intersection = trie.subtrie(of(0), of(4)).subtrie(of(0), of(8)); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4), intersection); + // covered 2 + intersection = trie.subtrie(of(4), of(8)).subtrie(of(0), of(8)); + assertTrieEquals(asList(-1, 4, 5, 6, 7, 8), intersection); + } + + @Test + public void testIntersectedIntersection() + { + Trie trie = 
singleLevelIntTrie(15, false); + + // non-overlapping + Trie intersection = trie.intersect(TrieSet.rangeInclusiveEnd(VERSION, of(0), of(4))) + .intersect(TrieSet.rangeInclusiveEnd(VERSION, of(5), of(8))); + assertTrieEquals(asList(-1), intersection); + // touching + intersection = trie.intersect(TrieSet.rangeInclusiveEnd(VERSION, of(0), of(3))) + .intersect(TrieSet.rangeInclusiveEnd(VERSION, of(3), of(8))); + assertTrieEquals(asList(-1, 3), intersection); + // overlapping 1 + intersection = trie.intersect(TrieSet.rangeInclusiveEnd(VERSION, of(0), of(4))) + .intersect(TrieSet.rangeInclusiveEnd(VERSION, of(2), of(8))); + assertTrieEquals(asList(-1, 2, 3, 4), intersection); + // overlapping 2 + intersection = trie.intersect(TrieSet.rangeInclusiveEnd(VERSION, of(0), of(4))) + .intersect(TrieSet.rangeInclusiveEnd(VERSION, of(1), of(8))); + assertTrieEquals(asList(-1, 1, 2, 3, 4), intersection); + // covered + intersection = trie.intersect(TrieSet.rangeInclusiveEnd(VERSION, of(0), of(4))) + .intersect(TrieSet.rangeInclusiveEnd(VERSION, of(0), of(8))); + assertTrieEquals(asList(-1, 0, 1, 2, 3, 4), intersection); + // covered 2 + intersection = trie.intersect(TrieSet.rangeInclusiveEnd(VERSION, of(4), of(8))) + .intersect(TrieSet.rangeInclusiveEnd(VERSION, of(0), of(8))); + assertTrieEquals(asList(-1, 4, 5, 6, 7, 8), intersection); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/TestRangeState.java b/test/unit/org/apache/cassandra/db/tries/TestRangeState.java new file mode 100644 index 000000000000..170255743312 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/TestRangeState.java @@ -0,0 +1,313 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.Arrays; +import java.util.Collection; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +import com.google.common.base.Predicates; +import com.google.common.base.Throwables; +import com.google.common.collect.Streams; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/// Range state used for testing range tries. It is a general implementation of [RangeState] state that can represent +/// any combination of deletions before, after, as well as at a specific point. It will also hold a position, which is +/// not necessary for the trie logic, but makes it possible to describe a range trie using a list of [TestRangeState] +/// as well as perform some operations on it (see [RangeTrieMergeTest#mergeLists]). 
+class TestRangeState implements RangeState +{ + final ByteComparable position; + final boolean appliesAfter; // part of the position, needs to be remapped for comparisons + + final int leftSide; + final int rightSide; + + final TestRangeState leftState; + final TestRangeState rightState; + + TestRangeState(ByteComparable position, int leftSide, int rightSide) + { + // By default put left starts (which do not have a left side) before branch and ends (which do not have a right + // side) after the branch, and leave switches to the left side. + this(position, rightSide < 0, leftSide, rightSide); + } + + TestRangeState(ByteComparable position, boolean appliesAfter, int leftSide, int rightSide) + { + this.position = position; + this.appliesAfter = appliesAfter; + this.leftSide = leftSide; + this.rightSide = rightSide; + if (leftSide == rightSide) + { + this.leftState = this; + this.rightState = this; + } + else + { + this.leftState = leftSide >= 0 ? new TestRangeState(position, false, leftSide, leftSide) : null; + this.rightState = rightSide >= 0 ? 
new TestRangeState(position, false, rightSide, rightSide) : null; + } + } + + static TestRangeState combine(TestRangeState m1, TestRangeState m2) + { + return combineCollection(Arrays.asList(m1, m2)); + } + + + static TestRangeState upsert(TestRangeState m1, TestRangeState m2) + { + if (m1 == null) + return m2; + return combineCollection(Arrays.asList(m1, m2)); + } + + public static TestRangeState combineCollection(Collection rangeStates) + { + int newLeft = -1; + int newRight = -1; + ByteComparable position = null; + boolean appliesAfter = false; + for (TestRangeState marker : rangeStates) + { + newLeft = Math.max(newLeft, marker.leftSide); + newRight = Math.max(newRight, marker.rightSide); + position = marker.position; + appliesAfter = marker.appliesAfter; + } + if (newLeft < 0 && newRight < 0) + return null; + + return new TestRangeState(position, appliesAfter, newLeft, newRight); + } + + @Override + public int hashCode() + { + return Objects.hash(leftSide, rightSide); // position is ignored by equals(), so it must not feed the hash + } + + @Override + public String toString() + { + return toString('"' + toString(position, appliesAfter) + '"'); + } + + public String toStringNoPosition() + { + return toString("X"); + } + + public String toString(String positionString) + { + + return (leftSide >= 0 ? leftSide + "<" : "") + + positionString + + (rightSide >= 0 ? "<" + rightSide : "") + + (isBoundary() ? "" : " not reportable"); + } + + @Override + public boolean isBoundary() + { + return leftSide != rightSide; + } + + public TestRangeState toContent() + { + return isBoundary() ? 
this : null; + } + + @Override + public TestRangeState precedingState(Direction direction) + { + return direction.select(leftState, rightState); + } + + @Override + public TestRangeState succedingState(Direction direction) + { + return direction.select(rightState, leftState); + } + + @Override + public TestRangeState restrict(boolean applicableBefore, boolean applicableAfter) + { + assert isBoundary(); + if ((applicableBefore || leftSide < 0) && (applicableAfter || rightSide < 0)) + return this; + int newLeft = applicableBefore ? leftSide : -1; + int newRight = applicableAfter ? rightSide : -1; + if (newLeft >= 0 || newRight >= 0) + return new TestRangeState(position, appliesAfter, newLeft, newRight); + else + return null; + } + + @Override + public TestRangeState asBoundary(Direction direction) + { + assert !isBoundary(); + final boolean isForward = direction.isForward(); + int newLeft = !isForward ? leftSide : -1; + int newRight = isForward ? rightSide : -1; + return new TestRangeState(position, appliesAfter, newLeft, newRight); + } + + static String toString(ByteComparable position, boolean appliesAfter) + { + if (position == null) + return "null"; + return position.byteComparableAsString(TrieUtil.VERSION) + (appliesAfter ? 
"↑" : ""); + } + + static List verify(List markers) + { + int active = -1; + TestRangeState prev = null; + for (TestRangeState marker : markers) + { + if (prev != null && prev.position != null && marker != null && marker.position != null) + assertTrue("Order violation " + toString(prev.position, prev.appliesAfter) + " vs " + toString(marker.position, marker.appliesAfter), + ByteComparable.compare(prev.position, marker.position, TrieUtil.VERSION) < 0 || + ByteComparable.compare(prev.position, marker.position, TrieUtil.VERSION) == 0 && !prev.appliesAfter && marker.appliesAfter); + + if (marker != null) + assertEquals("Range close violation", active, marker.leftSide); + else + assertEquals("Open range at end", null, active); + + assertTrue(marker.leftSide != marker.rightSide); + prev = marker; + active = marker.rightSide; + } + assertEquals("Unclosed range", -1, active); + return markers; + } + + static class TestRangeStateIterator extends TrieEntriesIterator + { + boolean onReturnPath; + + TestRangeStateIterator(RangeTrie trie, Direction direction) + { + super(trie.cursor(direction), Predicates.alwaysTrue()); + } + + @Override + public void onReturnPath() + { + onReturnPath = true; + } + + @Override + protected TestRangeState mapContent(TestRangeState content, byte[] bytes, int byteLength) + { + ByteComparable key = ByteComparable.preencoded(byteComparableVersion(), Arrays.copyOf(bytes, byteLength)); + boolean appliesAfter = onReturnPath == direction().isForward(); + onReturnPath = false; + return remap(content, key, appliesAfter); + } + } + + /** + * Extract the values of the provided trie into a list. + */ + static List toList(RangeTrie trie, Direction direction) + { + return Streams.stream(new TestRangeStateIterator(trie, direction)) + .collect(Collectors.toList()); + } + + /** + * Extract the values of the provided trie into a map. 
+ */ + static Map toStringMap(RangeTrie trie, Direction direction) + { + return Streams.stream(new TestRangeStateIterator(trie, direction)) + .collect(Collectors.toMap(x -> TrieUtil.asString(x.position), + x -> x.toString(), + (x, y) -> '(' + x + ',' + ')', + LinkedHashMap::new)); + } + + static TestRangeState remap(TestRangeState dm, ByteComparable newKey, boolean appliesAfter) + { + return new TestRangeState(newKey, appliesAfter, dm.leftSide, dm.rightSide); + } + + static InMemoryRangeTrie fromList(List list) + { + InMemoryRangeTrie trie = InMemoryRangeTrie.shortLived(TrieUtil.VERSION); + for (TestRangeState i : list) + { + try + { + ByteComparable pos = i.position; + if (pos == null) + pos = ByteComparable.EMPTY; + trie.putRecursive(pos, i, i.appliesAfter, (ex, n) -> n); + } + catch (TrieSpaceExhaustedException e) + { + throw Throwables.propagate(e); + } + } + return trie; + } + + @Override + public boolean equals(Object other) + { + if (other == null || !(other instanceof TestRangeState)) + return false; + TestRangeState otherMarker = (TestRangeState) other; + return otherMarker.leftSide == leftSide && otherMarker.rightSide == rightSide; + } + + static TestRangeState covering(int value) + { + return new TestRangeState(ByteComparable.EMPTY, value, value); + } + + static TestRangeState open(int value) + { + return new TestRangeState(ByteComparable.EMPTY, -1, value); + } + + static TestRangeState close(int value) + { + return new TestRangeState(ByteComparable.EMPTY, value, -1); + } + + static TestRangeState boundary(int left, int right) + { + return new TestRangeState(ByteComparable.EMPTY, left, right); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/ThreadedTestBase.java b/test/unit/org/apache/cassandra/db/tries/ThreadedTestBase.java new file mode 100644 index 000000000000..d8a5b7cdcf01 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/ThreadedTestBase.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one 
+ * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.tries; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +import org.junit.Assert; +import org.junit.Test; + +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.concurrent.OpOrder; + +import static org.apache.cassandra.db.tries.TrieUtil.VERSION; +import static org.apache.cassandra.db.tries.TrieUtil.generateKeys; + +public abstract class ThreadedTestBase> +{ + // Note: This should not be run by default with verification to have the higher concurrency of faster writes and reads. 
+ + private static final int COUNT = 30000; + private static final int OTHERS = COUNT / 10; + private static final int PROGRESS_UPDATE = COUNT / 15; + private static final int READERS = 0; + private static final int WALKERS = 1; + private static final Random rand = new Random(); + + abstract T value(ByteComparable b); + abstract R makeTrie(OpOrder readOrder); + abstract void add(R trie, ByteComparable b, T v, int iteration) throws TrieSpaceExhaustedException; + + @Test + public void testThreaded() throws InterruptedException + { + OpOrder readOrder = new OpOrder(); + ByteComparable[] src = generateKeys(rand, COUNT + OTHERS); + R trie = makeTrie(readOrder); + ConcurrentLinkedQueue errors = new ConcurrentLinkedQueue<>(); + List threads = new ArrayList<>(); + AtomicBoolean writeCompleted = new AtomicBoolean(false); + AtomicInteger writeProgress = new AtomicInteger(0); + + for (int i = 0; i < WALKERS; ++i) + threads.add(new Thread(() -> { + try + { + while (!writeCompleted.get()) + { + int min = writeProgress.get(); + int count = 0; + try (OpOrder.Group group = readOrder.start()) + { + for (Map.Entry en : trie.entrySet()) + { + T v = value(en.getKey()); + Assert.assertEquals(en.getKey().byteComparableAsString(VERSION), v, en.getValue()); + ++count; + } + } + Assert.assertTrue("Got only " + count + " while progress is at " + min, count >= min); + } + } + catch (Throwable t) + { + t.printStackTrace(); + errors.add(t); + } + })); + + for (int i = 0; i < READERS; ++i) + { + threads.add(new Thread(() -> { + try + { + Random r = ThreadLocalRandom.current(); + while (!writeCompleted.get()) + { + int min = writeProgress.get(); + + for (int i1 = 0; i1 < PROGRESS_UPDATE; ++i1) + { + int index = r.nextInt(COUNT + OTHERS); + ByteComparable b = src[index]; + T v = value(b); + try (OpOrder.Group group = readOrder.start()) + { + T result = trie.get(b); + if (result != null) + { + Assert.assertTrue("Got not added " + index + " when COUNT is " + COUNT, + index < COUNT); + 
Assert.assertEquals("Failed " + index, v, result); + } + else if (index < min) + Assert.fail("Failed index " + index + " while progress is at " + min); + } + } + } + } + catch (Throwable t) + { + t.printStackTrace(); + errors.add(t); + } + })); + } + + threads.add + (new Thread(() -> { // writer: must be a thread too, so it runs concurrently with the walkers/readers started below + try + { + for (int i = 0; i < COUNT; i++) + { + ByteComparable b = src[i]; + + // Note: Because we don't ensure order when calling resolve, just use a hash of the key as payload + // (so that all sources have the same value). + T v = value(b); + add(trie, b, v, i); + + if (i % PROGRESS_UPDATE == 0) + writeProgress.set(i); + } + } + catch (Throwable t) + { + t.printStackTrace(); + errors.add(t); + } + finally + { + writeCompleted.set(true); + } + })); + + for (Thread t : threads) + t.start(); + + for (Thread t : threads) + t.join(); + + if (!errors.isEmpty()) + Assert.fail("Got errors:\n" + errors); + } +} diff --git a/test/unit/org/apache/cassandra/db/tries/TrieToDot.java b/test/unit/org/apache/cassandra/db/tries/TrieToDot.java index fd47c0be3d17..6ccacdbaa048 100644 --- a/test/unit/org/apache/cassandra/db/tries/TrieToDot.java +++ b/test/unit/org/apache/cassandra/db/tries/TrieToDot.java @@ -24,31 +24,46 @@ /** * A class for dumping the structure of a trie to a graphviz/dot representation for making trie graphs. 
*/ -class TrieToDot extends TriePathReconstructor implements Trie.Walker +public class TrieToDot> extends TriePathReconstructor implements DeletionAwareCursor.DeletionAwareWalker { private final StringBuilder b; private final Function contentToString; + private final Function deletionBoundaryToString; private final Function transitionToString; private final boolean useMultiByte; private int prevPos; private int currNodeTextPos; + private int depthAdjustment; + private boolean inDeletionBranch; public TrieToDot(Function contentToString, Function transitionToString, boolean useMultiByte) + { + this(contentToString, null, transitionToString, useMultiByte); + } + + public TrieToDot(Function contentToString, + Function deletionBoundaryToString, + Function transitionToString, + boolean useMultiByte) { this.contentToString = contentToString; + this.deletionBoundaryToString = deletionBoundaryToString; this.transitionToString = transitionToString; this.useMultiByte = useMultiByte; this.b = new StringBuilder(); b.append("digraph G {\n" + " splines=curved"); addNodeDefinition(nodeString(0)); + depthAdjustment = 0; + inDeletionBranch = false; } @Override public void resetPathLength(int newLength) { + newLength += depthAdjustment; super.resetPathLength(newLength); prevPos = newLength; } @@ -87,11 +102,11 @@ private void addNodeDefinition(String newNode) private String nodeString(int keyPos) { - StringBuilder b = new StringBuilder(); - b.append("Node_"); + StringBuilder r = new StringBuilder(); + r.append(inDeletionBranch ? 
"NodeD_" : "Node_"); for (int i = 0; i < keyPos; ++i) - b.append(transitionToString.apply(keyBytes[i] & 0xFF)); - return b.toString(); + r.append(transitionToString.apply(keyBytes[i] & 0xFF)); + return r.toString(); } @Override @@ -120,4 +135,36 @@ public String complete() b.append("\n}\n"); return b.toString(); } + + @Override + public boolean enterDeletionsBranch() + { + newLineAndIndent(); + String oldNode = nodeString(keyPos); + b.append(oldNode); + inDeletionBranch = true; + String newNode = nodeString(keyPos); + b.append(" -> "); + addNodeDefinition(newNode); + + newLineAndIndent(); + b.append("{ rank=same; ").append(oldNode).append("; ").append(newNode).append("; }"); + + depthAdjustment = keyPos; + return true; + } + + @Override + public void deletionMarker(D marker) + { + b.replace(currNodeTextPos, b.length(), String.format("%s [shape=doublecircle label=\"%s\"]", nodeString(keyPos), deletionBoundaryToString.apply(marker))); + } + + @Override + public void exitDeletionsBranch() + { + resetPathLength(0); + depthAdjustment = 0; + inDeletionBranch = false; + } } diff --git a/test/unit/org/apache/cassandra/db/tries/TrieToDotTest.java b/test/unit/org/apache/cassandra/db/tries/TrieToDotTest.java index b10de587bba3..d6e3dffdc2e2 100644 --- a/test/unit/org/apache/cassandra/db/tries/TrieToDotTest.java +++ b/test/unit/org/apache/cassandra/db/tries/TrieToDotTest.java @@ -20,22 +20,21 @@ import org.junit.Test; -import org.apache.cassandra.utils.bytecomparable.ByteComparable; - public class TrieToDotTest { @Test public void testToDotContent() throws Exception { - InMemoryTrie trie = InMemoryTrie.shortLived(ByteComparable.Version.OSS50); + InMemoryTrie trie = InMemoryTrie.shortLived(TrieUtil.VERSION); String s = "Trie node types and manipulation mechanisms. 
The main purpose of this is to allow for handling tries directly as" + " they are on disk without any serialization, and to enable the creation of such files."; s = s.toLowerCase(); for (String word : s.split("[^a-z]+")) - trie.putRecursive(InMemoryTrieTestBase.comparable(word), word, (x, y) -> y); + trie.putRecursive(TrieUtil.directComparable(word), word, (x, y) -> y); - System.out.println(trie.process(new TrieToDot(Object::toString, + System.out.println(trie.process(Direction.FORWARD, + new TrieToDot(Object::toString, x -> Character.toString((char) ((int) x)), - true), Direction.FORWARD)); + true))); } } diff --git a/test/unit/org/apache/cassandra/db/tries/TrieToMermaid.java b/test/unit/org/apache/cassandra/db/tries/TrieToMermaid.java index a75d301c1df0..aeedbaaefc77 100644 --- a/test/unit/org/apache/cassandra/db/tries/TrieToMermaid.java +++ b/test/unit/org/apache/cassandra/db/tries/TrieToMermaid.java @@ -24,20 +24,32 @@ /** * A class for dumping the structure of a trie to a graphviz/dot representation for making trie graphs. 
*/ -class TrieToMermaid extends TriePathReconstructor implements Trie.Walker +public class TrieToMermaid> extends TriePathReconstructor implements DeletionAwareCursor.DeletionAwareWalker { private final StringBuilder b; private final Function contentToString; + private final Function deletionBoundaryToString; private final Function transitionToString; private final boolean useMultiByte; private int prevPos; private int currNodeTextPos; + private int depthAdjustment; + private boolean inDeletionBranch; public TrieToMermaid(Function contentToString, Function transitionToString, boolean useMultiByte) + { + this(contentToString, null, transitionToString, useMultiByte); + } + + public TrieToMermaid(Function contentToString, + Function deletionBoundaryToString, + Function transitionToString, + boolean useMultiByte) { this.contentToString = contentToString; + this.deletionBoundaryToString = deletionBoundaryToString; this.transitionToString = transitionToString; this.useMultiByte = useMultiByte; this.b = new StringBuilder(); @@ -46,11 +58,14 @@ public TrieToMermaid(Function contentToString, addNodeDefinition(nodeString(0)); newLineAndIndent(); b.append("style " + nodeString(0) + " fill:darkgrey"); + depthAdjustment = 0; + inDeletionBranch = false; } @Override public void resetPathLength(int newLength) { + newLength += depthAdjustment; super.resetPathLength(newLength); prevPos = newLength; } @@ -86,11 +101,11 @@ private void addNodeDefinition(String newNode) private String nodeString(int keyPos) { - StringBuilder b = new StringBuilder(); - b.append("Node_"); + StringBuilder r = new StringBuilder(); + r.append(inDeletionBranch ? 
"NodeD_" : "Node_"); for (int i = 0; i < keyPos; ++i) - b.append(transitionToString.apply(keyBytes[i] & 0xFF)); - return b.toString(); + r.append(transitionToString.apply(keyBytes[i] & 0xFF)); + return r.toString(); } @Override @@ -110,7 +125,7 @@ public void addPathBytes(DirectBuffer buffer, int pos, int count) @Override public void content(T content) { - b.replace(currNodeTextPos, b.length(), String.format("%s(((%s)))", nodeString(keyPos), contentToString.apply(content))); + b.replace(currNodeTextPos, b.length(), String.format("%s(((\"%s\")))", nodeString(keyPos), contentToString.apply(content))); } @Override @@ -119,4 +134,33 @@ public String complete() b.append("\n"); return b.toString(); } + + @Override + public boolean enterDeletionsBranch() + { + newLineAndIndent(); + b.append(nodeString(keyPos)); + inDeletionBranch = true; + depthAdjustment = keyPos; + + String newNode = nodeString(keyPos); + b.append(" ----> "); + addNodeDefinition(newNode); + + return true; + } + + @Override + public void deletionMarker(D marker) + { + b.replace(currNodeTextPos, b.length(), String.format("%s(((\"%s\")))", nodeString(keyPos), deletionBoundaryToString.apply(marker))); + } + + @Override + public void exitDeletionsBranch() + { + resetPathLength(0); + depthAdjustment = 0; + inDeletionBranch = false; + } } diff --git a/test/unit/org/apache/cassandra/db/tries/TrieToMermaidTest.java b/test/unit/org/apache/cassandra/db/tries/TrieToMermaidTest.java index f19c1eb53e09..7989ce001570 100644 --- a/test/unit/org/apache/cassandra/db/tries/TrieToMermaidTest.java +++ b/test/unit/org/apache/cassandra/db/tries/TrieToMermaidTest.java @@ -20,22 +20,21 @@ import org.junit.Test; -import org.apache.cassandra.utils.bytecomparable.ByteComparable; - public class TrieToMermaidTest { @Test public void testToMermaidContent() throws Exception { - InMemoryTrie trie = InMemoryTrie.shortLived(ByteComparable.Version.OSS50); + InMemoryTrie trie = InMemoryTrie.shortLived(TrieUtil.VERSION); // This was used as 
a basis the graphs in BTIFormat.md String s = "a allow an and any are as node of on the this to trie types with without"; s = s.toLowerCase(); for (String word : s.split("[^a-z]+")) - trie.putRecursive(InMemoryTrieTestBase.comparable(word), word, (x, y) -> y); + trie.putRecursive(TrieUtil.directComparable(word), word, (x, y) -> y); - System.out.println(trie.process(new TrieToMermaid(Object::toString, + System.out.println(trie.process(Direction.FORWARD, + new TrieToMermaid(Object::toString, x -> Character.toString((char) ((int) x)), - false), Direction.FORWARD)); + false))); } } diff --git a/test/unit/org/apache/cassandra/db/tries/TrieUtil.java b/test/unit/org/apache/cassandra/db/tries/TrieUtil.java new file mode 100644 index 000000000000..56066fcd1024 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/tries/TrieUtil.java @@ -0,0 +1,740 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.tries; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Random; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Objects; +import com.google.common.collect.HashMultiset; +import com.google.common.collect.Lists; +import com.google.common.collect.Multiset; +import com.google.common.collect.Sets; +import com.google.common.collect.Streams; +import org.junit.Assert; + +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.EMPTY; +import static org.apache.cassandra.utils.bytecomparable.ByteComparable.Preencoded; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +public class TrieUtil +{ + static final ByteComparable.Version VERSION = ByteComparable.Version.OSS50; + public static final Comparator REVERSE_COMPARATOR = (bytes1, bytes2) -> ByteComparable.compare(invert(bytes1), invert(bytes2), VERSION); + public static final Comparator FORWARD_COMPARATOR = (bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION); + private static final int KEY_CHOICE = 25; + private static final int MIN_LENGTH = 10; + private static final int MAX_LENGTH = 50; + + static Map toStringMap(BaseTrie trie, Direction direction) + { + return Streams.stream(trie.entryIterator(direction)) + 
.collect(Collectors.toMap(x -> asString(x.getKey()), + x -> x.getValue().toString(), + (x, y) -> '(' + x + ',' + y + ')', + LinkedHashMap::new)); + } + + static Map toStringMap(Map map, Function mapper) + { + return map.entrySet() + .stream() + .collect(Collectors.toMap(x -> asString(x.getKey()), + x -> mapper.apply(x.getValue()).toString(), + (x, y) -> '(' + x + ',' + y + ')', + LinkedHashMap::new)); + } + + static void assertMapEquals(Iterable> actual, + Iterable> expected, + Comparator comparator) + { + Map values1 = collectAsStrings(actual, comparator); + Map values2 = collectAsStrings(expected, comparator); + assertMapEquals(values1, values2); + } + + static void assertMapEquals(Map actual, Map expected) + { + if (actual.equals(expected)) + return; + + // If the maps are not equal, we want to print out the differences in a way that is easy to read. + final Set allKeys = Sets.union(actual.keySet(), expected.keySet()); + Set keyDifference = allKeys.stream() + .filter(k -> !Objects.equal(actual.get(k), expected.get(k))) + .collect(Collectors.toCollection(TreeSet::new)); + System.err.println("All data"); + dumpDiff(actual, expected, allKeys); + System.err.println("\nDifferences"); + dumpDiff(actual, expected, keyDifference); + fail("Maps are not equal at " + keyDifference); + } + + private static void dumpDiff(Map actual, Map expected, Set set) + { + for (String key : set) + { + String v1 = actual.get(key); + if (v1 != null) + System.err.println(String.format("Actual %s:%s", key, v1)); + String v2 = expected.get(key); + if (v2 != null) + System.err.println(String.format("Expected %s:%s", key, v2)); + } + } + + private static Map collectAsStrings(Iterable> container, + Comparator comparator) + { + var map = new LinkedHashMap(); + Preencoded prevKey = null; + for (var e : container) + { + var key = e.getKey(); + if (prevKey != null && comparator.compare(prevKey, key) >= 0) + fail("Keys are not sorted: " + asString(prevKey) + " >= " + asString(key)); + prevKey = key; 
+ map.put(asString(key), e.getValue().toString()); + } + return map; + } + + static ByteComparable invert(ByteComparable b) + { + return version -> invert(b.asComparableBytes(version)); + } + + static ByteSource invert(ByteSource src) + { + return () -> + { + int v = src.next(); + if (v == ByteSource.END_OF_STREAM) + return v; + return v ^ 0xFF; + }; + } + + static SpecStackEntry makeSpecStackEntry(Direction direction, Object spec, SpecStackEntry parent) + { + assert !(spec instanceof Pair); + if (spec instanceof Object[]) + { + final Object[] specArray = (Object[]) spec; + return new SpecStackEntry(specArray, null, parent, direction.select(-1, specArray.length)); + } + else + return new SpecStackEntry(new Object[0], spec, parent, direction.select(-1, 1)); + + } + + static Trie specifiedTrie(Object[] nodeDef) + { + return dir -> new CursorFromSpec<>(nodeDef, dir); + } + + static Preencoded directComparable(String s) + { + ByteBuffer b = ByteBufferUtil.bytes(s); + return ByteComparable.preencoded(VERSION, b); + } + + @VisibleForTesting + static Preencoded comparable(String s) + { + return ((ByteComparable) (v -> ByteSource.withTerminator(ByteSource.TERMINATOR, ByteSource.of(s, v)))).preencode(VERSION); + } + + static void assertSameContent(Trie trie, SortedMap map) + { + // Don't use a loop for the direction to see it in the stack path in case of failure. 
+ assertMapEquals(trie, map, Direction.FORWARD); + assertForEachEntryEquals(trie, map, Direction.FORWARD); + assertValuesEqual(trie, map, Direction.FORWARD); + assertForEachValueEquals(trie, map, Direction.FORWARD); + + assertMapEquals(trie, map, Direction.REVERSE); + assertForEachEntryEquals(trie, map, Direction.REVERSE); + assertValuesEqual(trie, map, Direction.REVERSE); + assertForEachValueEquals(trie, map, Direction.REVERSE); + + assertUnorderedValuesEqual(trie, map); + checkGet(trie, map); + } + + static void checkGet(Trie trie, Map items) + { + for (Map.Entry en : items.entrySet()) + { + assertEquals(en.getValue(), trie.get(en.getKey())); + } + } + + private static void assertValuesEqual(Trie trie, SortedMap map, Direction direction) + { + assertIterablesEqual(trie.values(direction), maybeReversed(direction, map).values()); + } + + private static void assertUnorderedValuesEqual(Trie trie, SortedMap map) + { + Multiset unordered = HashMultiset.create(); + StringBuilder errors = new StringBuilder(); + for (ByteBuffer b : trie.valuesUnordered()) + unordered.add(b); + + for (ByteBuffer b : map.values()) + if (!unordered.remove(b)) + errors.append("\nMissing value in valuesUnordered: ").append(ByteBufferUtil.bytesToHex(b)); + + for (ByteBuffer b : unordered) + errors.append("\nExtra value in valuesUnordered: ").append(ByteBufferUtil.bytesToHex(b)); + + assertEquals("", errors.toString()); + } + + static Map maybeReversed(Direction direction, Map data) + { + return direction.isForward() ? 
data : reorderBy(data, REVERSE_COMPARATOR); + } + + static Map reorderBy(Map data, Comparator comparator) + { + Map newMap = new TreeMap<>(comparator); + newMap.putAll(data); + return newMap; + } + + private static void assertForEachEntryEquals(Trie trie, SortedMap map, Direction direction) + { + Iterator> it = maybeReversed(direction, map).entrySet().iterator(); + trie.forEachEntry(direction, (key, value) -> { + Assert.assertTrue("Map exhausted first, key " + asString(key), it.hasNext()); + Map.Entry entry = it.next(); + assertEquals(0, ByteComparable.compare(entry.getKey(), key, VERSION)); + assertEquals(entry.getValue(), value); + }); + Assert.assertFalse("Trie exhausted first", it.hasNext()); + } + + private static void assertForEachValueEquals(Trie trie, SortedMap map, Direction direction) + { + Iterator it = maybeReversed(direction, map).values().iterator(); + trie.forEachValue(direction, value -> { + Assert.assertTrue("Map exhausted first, value " + ByteBufferUtil.bytesToHex(value), it.hasNext()); + ByteBuffer entry = it.next(); + assertEquals("Map " + ByteBufferUtil.bytesToHex(entry) + " vs trie " + ByteBufferUtil.bytesToHex(value), entry, value); + }); + Assert.assertFalse("Trie exhausted first", it.hasNext()); + } + + static void assertMapEquals(Trie trie, SortedMap map, Direction direction) + { + assertMapEquals(trie.entryIterator(direction), maybeReversed(direction, map).entrySet().iterator()); + } + + static void assertMapEquals(Iterator> actual, + Iterator> expected) + { + List failedAt = new ArrayList<>(); + StringBuilder b = new StringBuilder(); + while (actual.hasNext() && expected.hasNext()) + { + Map.Entry en1 = actual.next(); + Map.Entry en2 = expected.next(); + b.append(String.format("Expected %s:%s\n", asString(en2.getKey()), ByteBufferUtil.bytesToHex(en2.getValue()))); + b.append(String.format("Actual %s:%s\n", asString(en1.getKey()), ByteBufferUtil.bytesToHex(en1.getValue()))); + if (ByteComparable.compare(en1.getKey(), en2.getKey(), 
VERSION) != 0 || ByteBufferUtil.compareUnsigned(en1.getValue(), en2.getValue()) != 0) + failedAt.add(en1.getKey()); + } + while (actual.hasNext()) + { + Map.Entry en1 = actual.next(); + b.append(String.format("Actual %s:%s\n", asString(en1.getKey()), ByteBufferUtil.bytesToHex(en1.getValue()))); + failedAt.add(en1.getKey()); + } + while (expected.hasNext()) + { + Map.Entry en2 = expected.next(); + b.append(String.format("Expected %s:%s\n", asString(en2.getKey()), ByteBufferUtil.bytesToHex(en2.getValue()))); + failedAt.add(en2.getKey()); + } + if (!failedAt.isEmpty()) + { + String message = "Failed at " + Lists.transform(failedAt, TrieUtil::asString); + System.err.println(message); + System.err.println(b); + Assert.fail(message); + } + } + + static > void assertIterablesEqual(Iterable expectedIterable, Iterable actualIterable) + { + Iterator expected = expectedIterable.iterator(); + Iterator actual = actualIterable.iterator(); + while (actual.hasNext() && expected.hasNext()) + { + Assert.assertEquals(actual.next(), expected.next()); + } + if (expected.hasNext()) + Assert.fail("Remaing values in expected, starting with " + expected.next()); + else if (actual.hasNext()) + Assert.fail("Remaing values in actual, starting with " + actual.next()); + } + + static Preencoded[] generateKeys(Random rand, int count) + { + Preencoded[] sources = new Preencoded[count]; + TreeSet added = new TreeSet<>(FORWARD_COMPARATOR); + for (int i = 0; i < count; ++i) + { + sources[i] = generateKey(rand); + if (!added.add(sources[i])) + --i; + } + + // note: not sorted! 
+ return sources; + } + + static Preencoded generateKeyAllowingPrefixes(Random rand) + { + return generateKey(rand, MIN_LENGTH, MAX_LENGTH, -1); + } + + static Preencoded generateKey(Random rand) + { + return generateKey(rand, MIN_LENGTH, MAX_LENGTH, ByteSource.TERMINATOR); + } + + static Preencoded generateKeyBound(Random rand) + { + return generateKey(rand, MIN_LENGTH, MAX_LENGTH, ByteSource.LT_NEXT_COMPONENT); + } + + static Preencoded generateKey(Random rand, int minLength, int maxLength, int terminator) + { + int len = rand.nextInt(maxLength - minLength + 1) + minLength; + byte[] bytes = new byte[len]; + int p = 0; + int length = bytes.length; + while (p < length) + { + int seed = rand.nextInt(KEY_CHOICE); + Random r2 = new Random(seed); + int m = r2.nextInt(5) + 2 + p; + if (m > length) + m = length; + while (p < m) + bytes[p++] = (byte) r2.nextInt(256); + } + if (terminator != -1) + return ((ByteComparable)(v -> ByteSource.withTerminator(terminator, ByteSource.of(bytes, v)))).preencode(VERSION); + else + return ByteComparable.preencoded(VERSION, bytes); + } + + public static Trie withRootMetadata(Trie wrapped, T metadata) + { + return wrapped.mergeWith(Trie.singleton(ByteComparable.EMPTY, VERSION, metadata), Trie.throwingResolver()); + } + + public static > RangeTrie withRootMetadata(RangeTrie wrapped, S metadata) + { + return wrapped.mergeWith(RangeTrie.point(ByteComparable.EMPTY, VERSION, true, metadata), Trie.throwingResolver()); + } + + public static > DeletionAwareTrie withRootMetadata(DeletionAwareTrie wrapped, T metadata) + { + return wrapped.mergeWith(DeletionAwareTrie.singleton(ByteComparable.EMPTY, VERSION, metadata), + Trie.throwingResolver(), + Trie.throwingResolver(), + (d,t) -> { throw new AssertionError(); }, + false); + } + + static Trie directTrie(String... 
points) throws TrieSpaceExhaustedException + { + InMemoryTrie trie = InMemoryTrie.shortLived(VERSION); + for (String s : points) + trie.putRecursive(directComparable(s), s, (ex, n) -> n); + return trie; + } + + static TrieSet directRanges(String... ranges) + { + return TrieSet.ranges(VERSION, true, true, Arrays.stream(ranges) + .map(TrieUtil::directComparable) + .toArray(ByteComparable[]::new)); + } + + static RangeTrie directRangeTrie(int value, String... keys) + { + return RangeTrie.fromSet(directRanges(keys), new TestRangeState(EMPTY, value, value)); + } + + static RangeTrie directRangeTrie(String... keys) + { + return directRangeTrie(1, keys); + } + + static void verifyEqualRangeTries(RangeTrie trie, RangeTrie expected) + { +// System.out.println("Actual: \n" + trie.dump(TestRangeState::toStringNoPosition)); +// System.out.println("Expected:\n" + expected.cursor(Direction.FORWARD).process(new TrieDumper.Plain<>(TestRangeState::toStringNoPosition))); + assertMapEquals(TestRangeState.toStringMap(trie, Direction.FORWARD), + TestRangeState.toStringMap(expected, Direction.FORWARD)); + assertMapEquals(TestRangeState.toStringMap(trie, Direction.REVERSE), + TestRangeState.toStringMap(expected, Direction.REVERSE)); + } + + static Preencoded toBound(Preencoded bc) + { + return toBound(bc, false); + } + + static Preencoded toBound(Preencoded bc, boolean greater) + { + if (bc == null) + return null; + + byte[] data = bc.getPreencodedBytes().remainingBytesToArray(); + data[data.length - 1] = (byte) (greater ? ByteSource.GT_NEXT_COMPONENT : ByteSource.LT_NEXT_COMPONENT); + return ByteComparable.preencoded(bc.encodingVersion(), data); + } + + static String asString(ByteComparable bc) + { + return bc != null ? 
bc.byteComparableAsString(VERSION) : "null"; + } + + static NavigableMap boundedMap(NavigableMap sourceMap, Preencoded ll, boolean includeLeft, Preencoded rr, boolean includeRight) + { + // Our slice has somewhat different semantics: + // - prefixes are not supported, i.e. a range like (a, aaa) cannot be used + // - inclusivity extends to the branches of each bound + Preencoded l = !includeLeft ? nudge(ll) : ll; + Preencoded r = includeRight ? nudge(rr) : rr; + + return l == null + ? r == null + ? sourceMap + : sourceMap.headMap(r, false) + : r == null + ? sourceMap.tailMap(l, true) + : sourceMap.subMap(l, true, r, false); + } + + static Preencoded nudge(Preencoded v) + { + if (v == null) + return null; + + byte[] data = v.getPreencodedBytes().remainingBytesToArray(); + int len = data.length; + while (len > 0 && data[len-1] == -1) + --len; + + if (len == 0) + return null; + + ++data[len - 1]; + return ByteComparable.preencoded(v.encodingVersion(), data, 0, len); + } + + /// Creates a simple trie with a root having the provided number of childs, where each child is a leaf whose content + /// is simply the value of the transition leading to it. 
+ /// In other words, `singleLevelIntTrie(4)` creates the following trie: + /// ``` + /// Root + /// t= 0 1 2 3 + /// | | | | + /// 0 1 2 3 + /// ``` + static Trie singleLevelIntTrie(int childs, boolean sliceCompatibleContent) + { + return new Trie<>() + { + @Override + public Cursor makeCursor(Direction direction) + { + return new SingleLevelCursor(direction); + } + + class SingleLevelCursor implements Cursor + { + final Direction direction; + int current; + final boolean presentContentOnReturnPath; + + SingleLevelCursor(Direction direction) + { + this.direction = direction; + current = direction.select(-1, childs); + presentContentOnReturnPath = !direction.isForward() && sliceCompatibleContent; + } + + @Override + public long advance() + { + current += direction.increase; + return encodedPosition(); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + int depth = Cursor.depth(encodedSkipPosition); + int transition = Cursor.incomingTransition(encodedSkipPosition); + + if (Cursor.isOnReturnPath(encodedSkipPosition) && !presentContentOnReturnPath) + transition += direction.increase; + + if (depth > 1) + return advance(); + if (depth < 1) + transition = exhausted(); + + if (direction.isForward()) + current = Math.max(0, transition); + else + current = Math.min(childs - 1, transition); + + return encodedPosition(); + } + + int exhausted() + { + int lastPos = direction.select(childs, -1); + if (presentContentOnReturnPath) + lastPos += direction.increase; + return lastPos; + } + + @Override + public long encodedPosition() + { + if (current == direction.select(-1, childs)) + return Cursor.rootPosition(direction); + else if (presentContentOnReturnPath && current == direction.select(childs, -1)) + return Cursor.rootPosition(direction) | Cursor.ON_RETURN_PATH_BIT; + else if (direction.inLoop(current, 0, childs - 1)) + return Cursor.encode(1, current, direction) | + (presentContentOnReturnPath ? 
Cursor.ON_RETURN_PATH_BIT : 0); + return Cursor.exhaustedPosition(direction); + } + + @Override + public Integer content() + { + if (presentContentOnReturnPath != Cursor.isOnReturnPath(encodedPosition())) + return null; + return current == childs ? -1 : current; + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + return VERSION; + } + + @Override + public Cursor tailCursor(Direction d) + { + if (current == -1) + return makeCursor(d); + else + throw new UnsupportedOperationException("tailTrie on test cursor"); + } + } + }; + } + + static String dump(BaseTrie s, Direction direction) + { + return s.process(direction, new TrieDumper.Plain<>(Object::toString)); + } + + static void dumpToOut(BaseTrie s) + { + System.out.println("Forward:"); + System.out.println(dump(s, Direction.FORWARD)); + System.out.println("Reverse:"); + System.out.println(dump(s, Direction.REVERSE)); + } + + static class SpecStackEntry + { + Object[] children; + int curChild; + Object content; + SpecStackEntry parent; + + public SpecStackEntry(Object[] spec, Object content, SpecStackEntry parent, int curChild) + { + this.children = spec; + this.content = content; + this.parent = parent; + this.curChild = curChild; + } + } + + public static class CursorFromSpec implements Cursor + { + SpecStackEntry stack; + Direction direction; + long position; + + CursorFromSpec(Object[] spec, Direction direction) + { + this.direction = direction; + stack = makeSpecStackEntry(direction, spec, null); + position = Cursor.rootPosition(direction); + } + + @Override + public long advance() + { + SpecStackEntry current = stack; + Object child; + int depth = Cursor.depth(position); + do + { + while (current != null + && (current.children.length == 0 + || !direction.inLoop(current.curChild += direction.increase, 0, current.children.length - 1))) + { + current = current.parent; + --depth; + } + if (current == null) + { + stack = null; + return position = Cursor.exhaustedPosition(direction); + } + 
+ child = current.children[current.curChild]; + } + while (child == null); + stack = makeSpecStackEntry(direction, child, current); + + return position = encode(++depth); + } + + @Override + public long skipTo(long encodedSkipPosition) + { + int skipDepth = Cursor.depth(encodedSkipPosition); + int skipTransition = Cursor.incomingTransition(encodedSkipPosition); + if (Cursor.isOnReturnPath(encodedSkipPosition)) + skipTransition += direction.increase; + int depth = Cursor.depth(position); + assert skipDepth <= depth + 1 : "skipTo descends more than one level"; + + while (stack != null && skipDepth <= depth) + { + --depth; + stack = stack.parent; + } + if (stack == null) + { + return position = Cursor.exhaustedPosition(direction); + } + + int index = skipTransition - 0x30; + assert direction.gt(index, stack.curChild) : "Backwards skipTo"; + if (direction.gt(index, direction.select(stack.children.length - 1, 0))) + { + --depth; + stack = stack.parent; + return advance(); + } + stack.curChild = index - direction.increase; + return advance(); + } + + @Override + public long encodedPosition() + { + return position; + } + + @SuppressWarnings("unchecked") + @Override + public T content() + { + return (T) stack.content; + } + + private long encode(int depth) + { + return Cursor.encode(depth, stack.parent.curChild + 0x30, direction); + } + + @Override + public ByteComparable.Version byteComparableVersion() + { + return VERSION; + } + + @Override + public Cursor tailCursor(Direction direction) + { + throw new UnsupportedOperationException("not implemented"); + } + + @Override + public String toString() + { + StringBuilder stringBuilder = new StringBuilder(); + stringBuilder.append(Cursor.toString(position)); + if (stack.content != null) + stringBuilder.append(" content ") + .append(stack.content); + stringBuilder.append(" children "); + stringBuilder.append(IntStream.range(0, stack.children.length) + .filter(i -> stack.children[i] != null) + .mapToObj(i -> (i + 1 == 
stack.curChild ? "*" : "") + (char) (i + 0x30)) + .reduce("", (x, y) -> x + y)); + return stringBuilder.toString(); + } + } +} diff --git a/test/unit/org/apache/cassandra/index/sai/cql/FeaturesVersionSupportTest.java b/test/unit/org/apache/cassandra/index/sai/cql/FeaturesVersionSupportTest.java index 7119345da352..4e31d64b22af 100644 --- a/test/unit/org/apache/cassandra/index/sai/cql/FeaturesVersionSupportTest.java +++ b/test/unit/org/apache/cassandra/index/sai/cql/FeaturesVersionSupportTest.java @@ -270,7 +270,8 @@ public void testIndexMetaForNumRows() assertNumRowsMemtable(bodyIndexName, DATASET.length, totalTermsCount); assertNumRowsMemtable(mapIndexName, DATASET.length); execute("DELETE FROM %s WHERE id = ?", 4); - // Deletion is not tracked by Memindex + // Since TrieMemtable stage 3, deletion is tracked by Memindex + totalTermsCount -= calculateTotalTermsForRow(4); assertNumRowsMemtable(bodyIndexName, DATASET.length, totalTermsCount); // Test an update to a different value for the analyzed index execute("UPDATE %s SET body = ? WHERE id = ?", DATASET[10][DATASET_BODY_COLUMN], 6); @@ -280,8 +281,6 @@ public void testIndexMetaForNumRows() execute("UPDATE %s SET body = ? 
WHERE id = ?", DATASET[6][DATASET_BODY_COLUMN], 10); totalTermsCount += calculateTotalTermsForRow(6) - calculateTotalTermsForRow(10); assertNumRowsMemtable(bodyIndexName, DATASET.length, totalTermsCount); - // Flush will account for the deleted row - totalTermsCount -= calculateTotalTermsForRow(4); flush(); assertNumRowsAndTotalTermsSSTable(scoreIndexName, DATASET.length - 1, DATASET.length - 1); assertNumRowsAndTotalTermsSSTable(bodyIndexName, DATASET.length - 1, totalTermsCount); diff --git a/test/unit/org/apache/cassandra/index/sai/cql/LuceneUpdateDeleteTest.java b/test/unit/org/apache/cassandra/index/sai/cql/LuceneUpdateDeleteTest.java index 0d3afacccd08..898c9d37aa2a 100644 --- a/test/unit/org/apache/cassandra/index/sai/cql/LuceneUpdateDeleteTest.java +++ b/test/unit/org/apache/cassandra/index/sai/cql/LuceneUpdateDeleteTest.java @@ -605,9 +605,20 @@ public void testRangeDeletionThenOverwrite() throws Throwable // delete range execute("DELETE FROM %s WHERE pk = 0"); - // Still expect both rows to be in the index because range deletion doesn't remove from index - searchMemtable(indexName, "indexed", 0); - searchMemtable(indexName, "random", 1, 0); + if (version.equals(Version.AA)) + { + // Still expect both rows to be in the index because range deletion doesn't remove from index + searchMemtable(indexName, "indexed", 0); + searchMemtable(indexName, "random", 1, 0); + searchMemtable(indexName, "something", 0); // range deleted, but not yet removed + } + else + { + // The range deletion causes an update since trie memtable stage 3 + searchMemtable(indexName, "indexed"); + searchMemtable(indexName, "random", 1); + searchMemtable(indexName, "something"); + } // Overwrite the value for the first of the 2 rows in partition 0 execute("INSERT INTO %s (pk, x, val) VALUES (0, 0, 'random')"); @@ -619,15 +630,15 @@ public void testRangeDeletionThenOverwrite() throws Throwable searchMemtable(indexName, "phrase", 1, 0); // random is in all 3 memtable index rows, but only 2 
partitions, and AA indexes partition keys searchMemtable(indexName, "random", 1, 0); + searchMemtable(indexName, "something", 0); // range deleted, but not yet removed } else { searchMemtable(indexName, "indexed"); // overwritten, and the update removes the value searchMemtable(indexName, "phrase", 1); // was deleted/overwritten in 0, so just in 1 now - searchMemtable(indexName, "random", 1, 0, 0); // random is in all 3 memtable index rows + searchMemtable(indexName, "random", 1, 0); // random is in all 3 memtable index rows + searchMemtable(indexName, "something"); } - // True for all versions - searchMemtable(indexName, "something", 0); // range deleted, but not yet removed } @Test diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/trie/TrieTermsDictionaryTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/trie/TrieTermsDictionaryTest.java index faa1f323793d..68f6c71fc412 100644 --- a/test/unit/org/apache/cassandra/index/sai/disk/v1/trie/TrieTermsDictionaryTest.java +++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/trie/TrieTermsDictionaryTest.java @@ -505,7 +505,7 @@ private ByteComparable primaryKey(Function asByteCompara private void testForDifferentByteComparableEncodings(ThrowingConsumer> test) throws Exception { test.accept(s -> ByteComparable.preencoded(VERSION, ByteBufferUtil.bytes(s))); - test.accept(ByteComparable::of); + test.accept(s -> v -> ByteSource.of(s, v)); } @FunctionalInterface diff --git a/test/unit/org/apache/cassandra/index/sai/metrics/QueryMetricsTest.java b/test/unit/org/apache/cassandra/index/sai/metrics/QueryMetricsTest.java index 04378ddd83eb..5efb995b98af 100644 --- a/test/unit/org/apache/cassandra/index/sai/metrics/QueryMetricsTest.java +++ b/test/unit/org/apache/cassandra/index/sai/metrics/QueryMetricsTest.java @@ -28,7 +28,6 @@ import org.junit.rules.ExpectedException; import com.datastax.driver.core.ResultSet; - import org.apache.cassandra.config.CassandraRelevantProperties; import 
org.apache.cassandra.cql3.UntypedResultSet; import org.apache.cassandra.db.ReadCommand; @@ -490,20 +489,24 @@ private void testQueryKindMetrics(boolean perTable, boolean perQuery) createIndex("CREATE CUSTOM INDEX ON %s(n) USING 'StorageAttachedIndex'"); createIndex("CREATE CUSTOM INDEX ON %s(v) USING 'StorageAttachedIndex'"); - // insert some data int numPartitions = 11; int numRowsPerPartition = 13; int numRows = numPartitions * numRowsPerPartition; + + // Create an sstable with rows to be deleted, so that memtable cannot make them disappear. + execute("INSERT INTO %s (k, c, n, v) VALUES (?, ?, 1, [1, 1])", numPartitions, numRowsPerPartition); + execute("INSERT INTO %s (k, c, n, v) VALUES (?, ?, 1, [1, 1])", numPartitions + 1, numRowsPerPartition); + flush(); + + // insert some data for (int k = 0; k < numPartitions; k++) for (int c = 0; c < numRowsPerPartition; c++) execute("INSERT INTO %s (k, c, n, v) VALUES (?, ?, 1, [1, 1])", k, c); // add a partition tombstone - execute("INSERT INTO %s (k, c, n, v) VALUES (?, ?, 1, [1, 1])", numPartitions, numRowsPerPartition); execute("DELETE FROM %s WHERE k = ?", numPartitions); // add a row range tombstone - execute("INSERT INTO %s (k, c, n, v) VALUES (?, ?, 1, [1, 1])", numPartitions + 1, numRowsPerPartition); execute("DELETE FROM %s WHERE k = ? 
AND c > 0", numPartitions + 1); // filter query (goes to the general, filter and range query metrics) diff --git a/test/unit/org/apache/cassandra/index/sai/plan/SingleRestrictionEstimatedRowCountTest.java b/test/unit/org/apache/cassandra/index/sai/plan/SingleRestrictionEstimatedRowCountTest.java index d068b6eedfd0..64e57a79ebbb 100644 --- a/test/unit/org/apache/cassandra/index/sai/plan/SingleRestrictionEstimatedRowCountTest.java +++ b/test/unit/org/apache/cassandra/index/sai/plan/SingleRestrictionEstimatedRowCountTest.java @@ -25,6 +25,8 @@ import java.util.Map; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; import org.apache.cassandra.Util; import org.apache.cassandra.cql3.CQL3Type; @@ -46,8 +48,18 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.fail; +@RunWith(Parameterized.class) public class SingleRestrictionEstimatedRowCountTest extends SAITester { + @Parameterized.Parameter + public String memtableDef; + + @Parameterized.Parameters(name = "{0}") + public static String[] memtableClasses() + { + return new String[]{ "trie", "trie_stage3", "trie_stage2", "trie_stage1" }; + } + static protected Map, ColumnFamilyStore> tables = new HashMap<>(); static Version[] versions = new Version[]{ Version.DB, Version.EB }; static CQL3Type.Native[] types = new CQL3Type.Native[]{ INT, DECIMAL, VARINT }; @@ -132,7 +144,8 @@ void createTables() SAIUtil.setCurrentVersion(version); for (CQL3Type.Native type : types) { - createTable("CREATE TABLE %s (pk text PRIMARY KEY, age " + type + ')'); + createTable("CREATE TABLE %s (pk text PRIMARY KEY, age " + type + ") " + + "WITH memtable = '" + memtableDef + "'"); createIndex("CREATE CUSTOM INDEX ON %s(age) USING 'StorageAttachedIndex'"); tables.put(tablesEntryKey(version, type), getCurrentColumnFamilyStore()); } @@ -182,6 +195,7 @@ void doTest(Version version, CQL3Type.Native type, double minExpectedRows, doubl long totalRows = 
controller.planFactory.tableMetrics.rows; assertEquals(0, cfs.metrics().liveSSTableCount.getValue().intValue()); + assertEquals(100, totalRows); Plan plan = controller.buildPlan(); assert plan instanceof Plan.RowsIteration; diff --git a/test/unit/org/apache/cassandra/index/sai/utils/CellWithSourceTableTest.java b/test/unit/org/apache/cassandra/index/sai/utils/CellWithSourceTest.java similarity index 69% rename from test/unit/org/apache/cassandra/index/sai/utils/CellWithSourceTableTest.java rename to test/unit/org/apache/cassandra/index/sai/utils/CellWithSourceTest.java index 0462f9eb567f..46d2878b6bf5 100644 --- a/test/unit/org/apache/cassandra/index/sai/utils/CellWithSourceTableTest.java +++ b/test/unit/org/apache/cassandra/index/sai/utils/CellWithSourceTest.java @@ -18,6 +18,7 @@ package org.apache.cassandra.index.sai.utils; +import org.apache.cassandra.db.CellSourceIdentifier; import org.apache.cassandra.db.DeletionPurger; import org.apache.cassandra.db.Digest; import org.apache.cassandra.db.marshal.Int32Type; @@ -43,12 +44,13 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -public class CellWithSourceTableTest { +public class CellWithSourceTest +{ private ColumnMetadata column; private Cell wrappedCell; - private Object sourceTable; - private CellWithSourceTable cellWithSourceTable; + private CellSourceIdentifier sourceTable = RowWithSourceTest.source; + private CellWithSource cellWithSource; private final long timestamp = System.currentTimeMillis(); // We use a 4 byte array because the Int32Type is used in the test @@ -59,83 +61,82 @@ public void setUp() { column = ColumnMetadata.regularColumn("keyspace1", "table1", "name1", Int32Type.instance); wrappedCell = new ArrayCell(column, timestamp, Cell.NO_TTL, Cell.NO_DELETION_TIME, value, null); - sourceTable = new Object(); - cellWithSourceTable = new CellWithSourceTable<>(wrappedCell, sourceTable); + cellWithSource = new CellWithSource<>(wrappedCell, sourceTable); } @Test public 
void testSourceTable() { - assertEquals(sourceTable, cellWithSourceTable.sourceTable()); + assertEquals(sourceTable, cellWithSource.sourceTable()); } @Test public void testIsCounterCell() { - assertEquals(wrappedCell.isCounterCell(), cellWithSourceTable.isCounterCell()); + assertEquals(wrappedCell.isCounterCell(), cellWithSource.isCounterCell()); } @Test public void testValue() { - assertEquals(wrappedCell.value(), cellWithSourceTable.value()); + assertEquals(wrappedCell.value(), cellWithSource.value()); } @Test public void testAccessor() { - assertEquals(wrappedCell.accessor(), cellWithSourceTable.accessor()); + assertEquals(wrappedCell.accessor(), cellWithSource.accessor()); } @Test public void testTimestamp() { - assertEquals(wrappedCell.timestamp(), cellWithSourceTable.timestamp()); + assertEquals(wrappedCell.timestamp(), cellWithSource.timestamp()); } @Test public void testTtl() { - assertEquals(wrappedCell.ttl(), cellWithSourceTable.ttl()); + assertEquals(wrappedCell.ttl(), cellWithSource.ttl()); } @Test public void testLocalDeletionTime() { - assertEquals(wrappedCell.localDeletionTime(), cellWithSourceTable.localDeletionTime()); + assertEquals(wrappedCell.localDeletionTime(), cellWithSource.localDeletionTime()); } @Test public void testIsTombstone() { - assertEquals(wrappedCell.isTombstone(), cellWithSourceTable.isTombstone()); + assertEquals(wrappedCell.isTombstone(), cellWithSource.isTombstone()); } @Test public void testIsExpiring() { - assertEquals(wrappedCell.isExpiring(), cellWithSourceTable.isExpiring()); + assertEquals(wrappedCell.isExpiring(), cellWithSource.isExpiring()); } @Test public void testIsLive() { var nowInSec = 0; - assertEquals(wrappedCell.isLive(nowInSec), cellWithSourceTable.isLive(nowInSec)); + assertEquals(wrappedCell.isLive(nowInSec), cellWithSource.isLive(nowInSec)); } @Test public void testPath() { - assertEquals(wrappedCell.path(), cellWithSourceTable.path()); + assertEquals(wrappedCell.path(), cellWithSource.path()); } @Test 
public void testWithUpdatedColumn() { - var originalColumn = cellWithSourceTable.column(); + var originalColumn = cellWithSource.column(); var newColumn = ColumnMetadata.regularColumn("keyspace1", "table1", "name2", Int32Type.instance); - var resultColumn = cellWithSourceTable.withUpdatedColumn(newColumn).column(); + var resultColumn = cellWithSource.withUpdatedColumn(newColumn).column(); assertNotEquals(originalColumn, resultColumn); assertEquals(newColumn, resultColumn); } @@ -144,8 +145,8 @@ public void testWithUpdatedColumn() public void testWithUpdatedValue() { ByteBuffer newValue = ByteBuffer.allocate(4); - var oldValue = cellWithSourceTable.value(); - var resultValue = cellWithSourceTable.withUpdatedValue(newValue).value(); + var oldValue = cellWithSource.value(); + var resultValue = cellWithSource.withUpdatedValue(newValue).value(); assertNotEquals(oldValue, resultValue); assertTrue(resultValue instanceof byte[]); assertArrayEquals(newValue.array(), (byte[]) resultValue); @@ -156,10 +157,10 @@ public void testWithUpdatedTimestampAndLocalDeletionTime() { long newTimestamp = 1234567890L; int newLocalDeletionTime = 987654321; - var originalTimestamp = cellWithSourceTable.timestamp(); - var originalDeletionTime = cellWithSourceTable.localDeletionTime(); - var resultTimestamp = cellWithSourceTable.withUpdatedTimestampAndLocalDeletionTime(newTimestamp, newLocalDeletionTime).timestamp(); - var resultDeletionTime = cellWithSourceTable.withUpdatedTimestampAndLocalDeletionTime(newTimestamp, newLocalDeletionTime).localDeletionTime(); + var originalTimestamp = cellWithSource.timestamp(); + var originalDeletionTime = cellWithSource.localDeletionTime(); + var resultTimestamp = cellWithSource.withUpdatedTimestampAndLocalDeletionTime(newTimestamp, newLocalDeletionTime).timestamp(); + var resultDeletionTime = cellWithSource.withUpdatedTimestampAndLocalDeletionTime(newTimestamp, newLocalDeletionTime).localDeletionTime(); assertNotEquals(originalTimestamp, resultTimestamp); 
assertEquals(newTimestamp, resultTimestamp); assertNotEquals(originalDeletionTime, resultDeletionTime); @@ -169,19 +170,19 @@ public void testWithUpdatedTimestampAndLocalDeletionTime() @Test public void testWithSkippedValue() { - var originalValue = cellWithSourceTable.value(); - var resultValue = cellWithSourceTable.withSkippedValue().value(); + var originalValue = cellWithSource.value(); + var resultValue = cellWithSource.withSkippedValue().value(); assertNotEquals(originalValue, resultValue); } @Test public void testClone() { - var resultClone = cellWithSourceTable.clone(HeapCloner.instance); + var resultClone = cellWithSource.clone(HeapCloner.instance); // The reference is not equal here because we have a non-zero length value - assertNotSame(cellWithSourceTable, resultClone); + assertNotSame(cellWithSource, resultClone); // Now make the value 0 length and we should get the same reference - var skippedCell = cellWithSourceTable.withSkippedValue(); + var skippedCell = cellWithSource.withSkippedValue(); var clonedSkippedCell = skippedCell.clone(HeapCloner.instance); assertSame(skippedCell, clonedSkippedCell); } @@ -189,33 +190,33 @@ public void testClone() @Test public void testDataSize() { - assertEquals(wrappedCell.dataSize(), cellWithSourceTable.dataSize()); + assertEquals(wrappedCell.dataSize(), cellWithSource.dataSize()); } @Test public void testUnsharedHeapSizeExcludingData() { - assertEquals(wrappedCell.unsharedHeapSizeExcludingData(), cellWithSourceTable.unsharedHeapSizeExcludingData()); + assertEquals(wrappedCell.unsharedHeapSizeExcludingData(), cellWithSource.unsharedHeapSizeExcludingData()); } @Test public void testValidate() { wrappedCell.validate(); - cellWithSourceTable.validate(); + cellWithSource.validate(); } @Test public void testHasInvalidDeletions() { - assertEquals(wrappedCell.hasInvalidDeletions(), cellWithSourceTable.hasInvalidDeletions()); + assertEquals(wrappedCell.hasInvalidDeletions(), cellWithSource.hasInvalidDeletions()); } @Test 
public void testDigest() { var digest1 = Digest.forValidator(); - cellWithSourceTable.digest(digest1); + cellWithSource.digest(digest1); var digest2 = Digest.forValidator(); wrappedCell.digest(digest2); assertArrayEquals(digest1.digest(), digest2.digest()); @@ -225,7 +226,7 @@ public void testDigest() public void testUpdateAllTimestamp() { long newTimestamp = 1234567890L; - var resultData = cellWithSourceTable.updateAllTimestamp(newTimestamp); + var resultData = cellWithSource.updateAllTimestamp(newTimestamp); assertEquals(newTimestamp, resultData.minTimestamp()); assertEquals(newTimestamp, resultData.minTimestamp()); } @@ -233,8 +234,8 @@ public void testUpdateAllTimestamp() @Test public void testMarkCounterLocalToBeCleared() { - var resultCell = cellWithSourceTable.markCounterLocalToBeCleared(); - assertSame(cellWithSourceTable, resultCell); + var resultCell = cellWithSource.markCounterLocalToBeCleared(); + assertSame(cellWithSource, resultCell); } @Test @@ -246,7 +247,7 @@ public void testPurge() long purgeSame = 98765; when(mockCell.purge(any(), eq(purgeNull))).thenReturn(null); when(mockCell.purge(any(), eq(purgeSame))).thenReturn(mockCell); - var cell = new CellWithSourceTable<>(mockCell, sourceTable); + var cell = new CellWithSource<>(mockCell, sourceTable); assertNull(cell.purge(purger, purgeNull)); assertSame(cell, cell.purge(purger, purgeSame)); } @@ -254,12 +255,12 @@ public void testPurge() @Test public void testMaxTimestamp() { - assertEquals(timestamp, cellWithSourceTable.maxTimestamp()); + assertEquals(timestamp, cellWithSource.maxTimestamp()); } @Test public void testMinTimestamp() { - assertEquals(timestamp, cellWithSourceTable.minTimestamp()); + assertEquals(timestamp, cellWithSource.minTimestamp()); } } \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/index/sai/utils/RowWithSourceTableTest.java b/test/unit/org/apache/cassandra/index/sai/utils/RowWithSourceTest.java similarity index 56% rename from 
test/unit/org/apache/cassandra/index/sai/utils/RowWithSourceTableTest.java rename to test/unit/org/apache/cassandra/index/sai/utils/RowWithSourceTest.java index 69de13b69fee..3f03904a35ca 100644 --- a/test/unit/org/apache/cassandra/index/sai/utils/RowWithSourceTableTest.java +++ b/test/unit/org/apache/cassandra/index/sai/utils/RowWithSourceTest.java @@ -20,19 +20,21 @@ import java.nio.ByteBuffer; +import org.apache.cassandra.db.CellSourceIdentifier; import org.apache.cassandra.db.Clustering; import org.apache.cassandra.db.DeletionTime; import org.apache.cassandra.db.Digest; -import org.apache.cassandra.db.LivenessInfo; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.marshal.ListType; import org.apache.cassandra.db.rows.ArrayCell; import org.apache.cassandra.db.rows.BTreeRow; import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellData; import org.apache.cassandra.db.rows.CellPath; import org.apache.cassandra.db.rows.ComplexColumnData; import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.io.sstable.SSTableId; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.ObjectSizes; @@ -42,9 +44,9 @@ import org.junit.Test; import static org.junit.Assert.*; -public class RowWithSourceTableTest { +public class RowWithSourceTest { - private RowWithSourceTable rowWithSourceTable; + private RowWithSource rowWithSource; private TableMetadata tableMetadata; private ColumnMetadata complexColumn; private ColumnMetadata column; @@ -52,7 +54,26 @@ public class RowWithSourceTableTest { private Cell complexCell; private Cell cell; private Row originalRow; - private final Object source = new Object(); + static final CellSourceIdentifier source = new SSTableId() + { + @Override + public int compareTo(Object o) + { + return 0; + } + + @Override + public ByteBuffer asBytes() + { + 
return null; + } + + @Override + public boolean isEqualSource(CellSourceIdentifier other) + { + return other == this; + } + }; // We use a 4 byte array because the Int32Type is used in the test private final byte[] value = new byte[]{0,0,0,1}; @@ -75,26 +96,26 @@ public void setUp() builder.addCell(complexCell); builder.addCell(cell); originalRow = builder.build(); - rowWithSourceTable = new RowWithSourceTable(originalRow, source); + rowWithSource = new RowWithSource(originalRow, source); } @Test public void testKind() { - assertEquals(originalRow.kind(), rowWithSourceTable.kind()); + assertEquals(originalRow.kind(), rowWithSource.kind()); } @Test public void testClustering() { - assertEquals(originalRow.clustering(), rowWithSourceTable.clustering()); + assertEquals(originalRow.clustering(), rowWithSource.clustering()); } @Test public void testDigest() { var digest1 = Digest.forValidator(); - rowWithSourceTable.digest(digest1); + rowWithSource.digest(digest1); var digest2 = Digest.forValidator(); originalRow.digest(digest2); assertArrayEquals(digest1.digest(), digest2.digest()); @@ -103,75 +124,75 @@ public void testDigest() @Test public void testValidateData() { - rowWithSourceTable.validateData(tableMetadata); + rowWithSource.validateData(tableMetadata); } @Test public void testHasInvalidDeletions() { - assertFalse(rowWithSourceTable.hasInvalidDeletions()); + assertFalse(rowWithSource.hasInvalidDeletions()); } @Test public void testColumns() { - assertEquals(2, rowWithSourceTable.columns().size()); - assertTrue(rowWithSourceTable.columns().contains(complexColumn)); - assertTrue(rowWithSourceTable.columns().contains(column)); + assertEquals(2, rowWithSource.columns().size()); + assertTrue(rowWithSource.columns().contains(complexColumn)); + assertTrue(rowWithSource.columns().contains(column)); } @Test public void testColumnCount() { - assertEquals(2, rowWithSourceTable.columnCount()); - assertEquals(originalRow.columnCount(), rowWithSourceTable.columnCount()); + 
assertEquals(2, rowWithSource.columnCount()); + assertEquals(originalRow.columnCount(), rowWithSource.columnCount()); } @Test public void testDeletion() { - assertEquals(originalRow.deletion(), rowWithSourceTable.deletion()); + assertEquals(originalRow.deletion(), rowWithSource.deletion()); } @Test public void testPrimaryKeyLivenessInfo() { - assertEquals(originalRow.primaryKeyLivenessInfo(), rowWithSourceTable.primaryKeyLivenessInfo()); + assertEquals(originalRow.primaryKeyLivenessInfo(), rowWithSource.primaryKeyLivenessInfo()); } @Test public void testIsStatic() { - assertEquals(originalRow.isStatic(), rowWithSourceTable.isStatic()); + assertEquals(originalRow.isStatic(), rowWithSource.isStatic()); } @Test public void testIsEmpty() { - assertFalse(rowWithSourceTable.isEmpty()); - assertEquals(originalRow.isEmpty(), rowWithSourceTable.isEmpty()); + assertFalse(rowWithSource.isEmpty()); + assertEquals(originalRow.isEmpty(), rowWithSource.isEmpty()); } @Test public void testToString() { - assertEquals(originalRow.toString(tableMetadata), rowWithSourceTable.toString(tableMetadata)); + assertEquals(originalRow.toString(tableMetadata), rowWithSource.toString(tableMetadata)); } @Test public void testHasLiveData() { assertTrue(originalRow.hasLiveData(1000, false)); - assertTrue(rowWithSourceTable.hasLiveData(1000, false)); + assertTrue(rowWithSource.hasLiveData(1000, false)); } @Test public void testGetCellWithCorrectColumn() { - var resultCell = rowWithSourceTable.getCell(column); - assertTrue(resultCell instanceof CellWithSourceTable); + var resultCell = rowWithSource.getCell(column); + assertTrue(resultCell instanceof CellWithSource); // This mapping is the whole point of these two classes. 
- assertSame(source, ((CellWithSourceTable)resultCell).sourceTable()); + assertSame(source, ((CellWithSource)resultCell).sourceTable()); assertSame(cell.value(), resultCell.value()); } @@ -179,52 +200,52 @@ public void testGetCellWithCorrectColumn() public void testGetCellWithMissingColumn() { var diffCol = ColumnMetadata.regularColumn("keyspace1", "table1", "name2", Int32Type.instance); - assertNull(rowWithSourceTable.getCell(diffCol)); + assertNull(rowWithSource.getCell(diffCol)); } @Test public void testGetCellWithPath() { - Cell resultCell = rowWithSourceTable.getCell(complexColumn, complexCellPath); - assertTrue(resultCell instanceof CellWithSourceTable); + Cell resultCell = rowWithSource.getCell(complexColumn, complexCellPath); + assertTrue(resultCell instanceof CellWithSource); // This mapping is the whole point of these two classes. - assertSame(source, ((CellWithSourceTable)resultCell).sourceTable()); + assertSame(source, ((CellWithSource)resultCell).sourceTable()); assertSame(cell.value(), resultCell.value()); } @Test public void testGetComplexColumnData() { - var complexColumnData = rowWithSourceTable.getComplexColumnData(complexColumn); + var complexColumnData = rowWithSource.getComplexColumnData(complexColumn); var firstCell = complexColumnData.iterator().next(); - assertTrue(firstCell instanceof CellWithSourceTable); - assertSame(source, ((CellWithSourceTable)firstCell).sourceTable()); + assertTrue(firstCell instanceof CellWithSource); + assertSame(source, ((CellWithSource)firstCell).sourceTable()); } @Test public void testGetColumnData() { - var simpleColumnData = rowWithSourceTable.getColumnData(column); - assertTrue(simpleColumnData instanceof CellWithSourceTable); - assertSame(source, ((CellWithSourceTable)simpleColumnData).sourceTable()); - var complexColumnData = rowWithSourceTable.getColumnData(complexColumn); + var simpleColumnData = rowWithSource.getColumnData(column); + assertTrue(simpleColumnData instanceof CellWithSource); + 
assertSame(source, ((CellWithSource)simpleColumnData).sourceTable()); + var complexColumnData = rowWithSource.getColumnData(complexColumn); assertTrue(complexColumnData instanceof ComplexColumnData); var firstCell = ((ComplexColumnData)complexColumnData).iterator().next(); - assertTrue(firstCell instanceof CellWithSourceTable); - assertSame(source, ((CellWithSourceTable)firstCell).sourceTable()); + assertTrue(firstCell instanceof CellWithSource); + assertSame(source, ((CellWithSource)firstCell).sourceTable()); } @Test public void testCells() { var cells = originalRow.cells().iterator(); - var wrappedCells = rowWithSourceTable.cells().iterator(); + var wrappedCells = rowWithSource.cells().iterator(); while (cells.hasNext()) { var cell = cells.next(); var wrappedCell = wrappedCells.next(); - assertTrue(wrappedCell instanceof CellWithSourceTable); - assertSame(source, ((CellWithSourceTable)wrappedCell).sourceTable()); + assertTrue(wrappedCell instanceof CellWithSource); + assertSame(source, ((CellWithSource)wrappedCell).sourceTable()); assertSame(cell.value(), wrappedCell.value()); } assertFalse(wrappedCells.hasNext()); @@ -233,15 +254,15 @@ public void testCells() @Test public void testColumnData() { - var columnDataCollection = rowWithSourceTable.columnData(); - assertEquals(2, columnDataCollection.size()); + var columnDataCollection = rowWithSource; + assertEquals(2, columnDataCollection.columnCount()); var iter = columnDataCollection.iterator(); while (iter.hasNext()) { var columnData = iter.next(); - if (columnData instanceof CellWithSourceTable) + if (columnData instanceof CellWithSource) { - assertSame(source, ((CellWithSourceTable)columnData).sourceTable()); + assertSame(source, ((CellWithSource)columnData).sourceTable()); } else if (columnData instanceof ComplexColumnData) { @@ -249,8 +270,8 @@ else if (columnData instanceof ComplexColumnData) while (complexIter.hasNext()) { var cell = complexIter.next(); - assertTrue(cell instanceof CellWithSourceTable); - 
assertSame(source, ((CellWithSourceTable)cell).sourceTable()); + assertTrue(cell instanceof CellWithSource); + assertSame(source, ((CellWithSource)cell).sourceTable()); } } else @@ -261,93 +282,65 @@ else if (columnData instanceof ComplexColumnData) } - @Test - public void testCellsInLegacyOrder() - { - var cells = originalRow.cellsInLegacyOrder(tableMetadata, false).iterator(); - var wrappedCells = rowWithSourceTable.cellsInLegacyOrder(tableMetadata, false).iterator(); - while (cells.hasNext()) - { - var cell = cells.next(); - var wrappedCell = wrappedCells.next(); - assertTrue(wrappedCell instanceof CellWithSourceTable); - assertSame(source, ((CellWithSourceTable)wrappedCell).sourceTable()); - assertSame(cell.value(), wrappedCell.value()); - } - assertFalse(wrappedCells.hasNext()); - } - @Test public void testHasComplexDeletion() { - assertFalse(rowWithSourceTable.hasComplexDeletion()); - } - - @Test - public void testHasComplex() - { - assertTrue(rowWithSourceTable.hasComplex()); + assertFalse(rowWithSource.hasComplexDeletion()); } @Test public void testHasDeletion() { - assertFalse(rowWithSourceTable.hasDeletion(1000)); - } - - @Test - public void testSearchIterator() - { - var iterator = rowWithSourceTable.searchIterator(); - var columnData = iterator.next(column); - assertTrue(columnData instanceof CellWithSourceTable); - assertSame(source, ((CellWithSourceTable)columnData).sourceTable()); - assertNull(iterator.next(column)); + assertFalse(rowWithSource.hasDeletion(1000)); } @Test public void testFilter() { - assertSame(rowWithSourceTable, rowWithSourceTable.filter(ColumnFilter.all(tableMetadata), tableMetadata)); + assertSame(rowWithSource, rowWithSource.filter(ColumnFilter.all(tableMetadata), tableMetadata)); } @Test public void testFilterWithDeletion() { - assertSame(rowWithSourceTable, rowWithSourceTable.filter(ColumnFilter.all(tableMetadata), DeletionTime.LIVE, true, tableMetadata)); + assertSame(rowWithSource, 
rowWithSource.filter(ColumnFilter.all(tableMetadata), DeletionTime.LIVE, true, tableMetadata)); } @Test public void testTransformAndFilter() { - assertSame(rowWithSourceTable, rowWithSourceTable.transformAndFilter(LivenessInfo.EMPTY, Row.Deletion.LIVE, c -> c)); + assertNull(rowWithSource.transformAndFilter(li -> li, RowWithSourceTest::toNull)); + assertSame(rowWithSource, rowWithSource.transformAndFilter(li -> li, RowWithSourceTest::unchanged)); } - @Test - public void testTransformAndFilterWithFunction() + private static > C toNull(C c) + { + return null; + } + + private static > C unchanged(C c) { - assertNull(rowWithSourceTable.transformAndFilter(c -> null)); - assertSame(rowWithSourceTable, rowWithSourceTable.transformAndFilter(c -> c)); + return c; } @Test public void testClone() { - assertTrue(rowWithSourceTable.clone(HeapCloner.instance) instanceof RowWithSourceTable); + assertTrue(rowWithSource.clone(HeapCloner.instance) instanceof RowWithSource); } @Test public void testDataSize() { - assertEquals(originalRow.dataSize(), rowWithSourceTable.dataSize()); + assertEquals(originalRow.dataSize(), rowWithSource.dataSize()); } @Test public void testUnsharedHeapSizeExcludingData() { - var wrapperSize = ObjectSizes.measure(new RowWithSourceTable(null, null)); + var wrapperSize = ObjectSizes.measure(new RowWithSource(null, null)); assertEquals(originalRow.unsharedHeapSizeExcludingData() + wrapperSize, - rowWithSourceTable.unsharedHeapSizeExcludingData()); + rowWithSource.unsharedHeapSizeExcludingData()); } } diff --git a/test/unit/org/apache/cassandra/metrics/ClientRequestRowAndColumnMetricsTest.java b/test/unit/org/apache/cassandra/metrics/ClientRequestRowAndColumnMetricsTest.java index c0f659361e0d..5fcb6ced4088 100644 --- a/test/unit/org/apache/cassandra/metrics/ClientRequestRowAndColumnMetricsTest.java +++ b/test/unit/org/apache/cassandra/metrics/ClientRequestRowAndColumnMetricsTest.java @@ -371,7 +371,8 @@ public void shouldRecordWriteMetricsForIntraRowBatch() 
throws Exception client.connect(false); String first = String.format("INSERT INTO %s.%s (pk, ck, v1, v2) VALUES (1, 2, 3, 4)", KEYSPACE, currentTable()); - String second = String.format("DELETE FROM %s.%s WHERE pk = 1 AND ck > 1", KEYSPACE, currentTable()); + // If deletion affects the insert above, we may get incorrect row/column counts. + String second = String.format("DELETE FROM %s.%s WHERE pk = 1 AND ck > 3", KEYSPACE, currentTable()); List> values = ImmutableList.of(Collections.emptyList(), Collections.emptyList()); BatchMessage batch = new BatchMessage(BatchStatement.Type.LOGGED, ImmutableList.of(first, second), values, QueryOptions.DEFAULT); diff --git a/test/unit/org/apache/cassandra/service/pager/PagingStateTest.java b/test/unit/org/apache/cassandra/service/pager/PagingStateTest.java index 05160521d4e4..41d9aaa6dee1 100644 --- a/test/unit/org/apache/cassandra/service/pager/PagingStateTest.java +++ b/test/unit/org/apache/cassandra/service/pager/PagingStateTest.java @@ -40,30 +40,6 @@ public static void setupDD() DatabaseDescriptor.daemonInitialization(); } - @Test - public void testSerializationBackwardCompatibility() - { - /* - * Tests that the serialized paging state for the native protocol V3 is backward compatible - * with what old nodes generate. For that, it compares the serialized format to the hard-coded - * value of the same state generated on a 2.1. 
For the curious, said hardcoded value has been - * generated by the following code: - * ByteBuffer pk = ByteBufferUtil.bytes("someKey"); - * CellName cn = CellNames.compositeSparse(new ByteBuffer[]{ ByteBufferUtil.bytes("c1"), ByteBufferUtil.bytes(42) }, - * new ColumnIdentifier("myCol", false), - * false); - * PagingState state = new PagingState(pk, cn.toByteBuffer(), 10); - * System.out.println("PagingState = " + ByteBufferUtil.bytesToHex(state.serialize())); - */ - PagingState state = Util.makeSomePagingState(ProtocolVersion.V3); - - String serializedState = ByteBufferUtil.bytesToHex(state.serialize(ProtocolVersion.V3)); - // Note that we don't assert exact equality because we know 3.0 nodes include the "remainingInPartition" number - // that is not present on 2.1/2.2 nodes. We know this is ok however because we know that 2.1/2.2 nodes will ignore - // anything remaining once they have properly deserialized a paging state. - assertTrue(serializedState.startsWith("0007736f6d654b65790014000263310000040000002a0000056d79636f6c000000000a")); - } - @Test public void testSerializeV3DeserializeV3() { diff --git a/test/unit/org/apache/cassandra/service/reads/DataResolverTest.java b/test/unit/org/apache/cassandra/service/reads/DataResolverTest.java index e7df8fd169be..61315188dad7 100644 --- a/test/unit/org/apache/cassandra/service/reads/DataResolverTest.java +++ b/test/unit/org/apache/cassandra/service/reads/DataResolverTest.java @@ -36,7 +36,6 @@ import org.apache.cassandra.db.DeletionInfo; import org.apache.cassandra.db.DeletionTime; import org.apache.cassandra.db.EmptyIterators; -import org.apache.cassandra.db.MutableDeletionInfo; import org.apache.cassandra.db.Mutation; import org.apache.cassandra.db.RangeTombstone; import org.apache.cassandra.db.ReadCommand; @@ -660,10 +659,9 @@ public void testRepairRangeTombstoneWithPartitionDeletion2() // 1st "stream": a partition deletion and a range tombstone RangeTombstone rt1 = tombstone("0", true , "9", true, 11, nowInSec); 
- PartitionUpdate upd1 = new RowUpdateBuilder(cfm, nowInSec, 1L, dk) + PartitionUpdate upd1 = new RowUpdateBuilder(cfm, DeletionTime.build(10, nowInSec), nowInSec, 1L, dk) .addRangeTombstone(rt1) .buildUpdate(); - ((MutableDeletionInfo)upd1.deletionInfo()).add(DeletionTime.build(10, nowInSec)); UnfilteredPartitionIterator iter1 = iter(upd1); // 2nd "stream": a range tombstone that is covered by the other stream rt diff --git a/test/unit/org/apache/cassandra/utils/CassandraGenerators.java b/test/unit/org/apache/cassandra/utils/CassandraGenerators.java index 801eb60ae61c..0926ba1ed9b9 100644 --- a/test/unit/org/apache/cassandra/utils/CassandraGenerators.java +++ b/test/unit/org/apache/cassandra/utils/CassandraGenerators.java @@ -33,7 +33,6 @@ import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.Stream; - import javax.annotation.Nullable; import com.google.common.collect.ImmutableList; @@ -54,8 +53,8 @@ import org.apache.cassandra.db.marshal.CompositeType; import org.apache.cassandra.db.marshal.EmptyType; import org.apache.cassandra.db.marshal.TimeUUIDType; -import org.apache.cassandra.db.rows.Cell; import org.apache.cassandra.db.marshal.UserType; +import org.apache.cassandra.db.rows.CellData; import org.apache.cassandra.dht.ByteOrderedPartitioner; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.LocalPartitioner; @@ -125,7 +124,7 @@ public final class CassandraGenerators private static final Gen SINGLE_PARTITION_READ_COMMAND_GEN = gen(rnd -> { TableMetadata metadata = TABLE_METADATA_GEN.generate(rnd); - long nowInSec = rnd.next(Constraint.between(1, Cell.getVersionedMaxDeletiontionTime())); + long nowInSec = rnd.next(Constraint.between(1, CellData.getVersionedMaxDeletiontionTime())); ByteBuffer key = partitionKeyDataGen(metadata).generate(rnd); //TODO support all fields of SinglePartitionReadCommand return SinglePartitionReadCommand.create(metadata, nowInSec, key, Slices.ALL);