Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
39eb2ba
Remove duplicate Row/CellWithSourceTable
blambov Apr 27, 2026
0031098
TrieMemtable Stage 3
blambov Mar 13, 2025
a1244ce
Change trie interfaces to combine depth and incoming character
blambov Nov 12, 2025
1808900
Change trie interfaces to permit stopping on the return path
blambov Nov 20, 2025
9364fd5
Test fixes
blambov Apr 7, 2026
180c0af
Copy TrieBackedPartition, TriePartitionUpdate, TriePartritionUpdater …
blambov Dec 5, 2025
06cb4a9
Implements cell-level trie
blambov Dec 10, 2025
bc6fe06
Test fixes
blambov Apr 7, 2026
d1fe7e9
Permit in-memory tries to store bytes in the trie structure
blambov Jan 7, 2026
9212642
Implement InMemoryRangeCursor.getNearestContent directly
blambov Feb 17, 2026
2567618
Provide merged rows to indexer
blambov Apr 9, 2026
f43f32e
Switch to indexing by cells rather than rows
blambov Apr 9, 2026
65c3972
Remove unused imports
blambov Apr 29, 2026
4a4b634
JMH test fixes
blambov Apr 30, 2026
3afd5c5
Improve column counting
blambov Apr 30, 2026
53df4f2
Store TrieBackedRow's livenessInfo and deletion for reuse
blambov May 11, 2026
c1a1f2d
Improve deletion branch walks for rows and partitions
blambov May 12, 2026
81e7231
Fix CASSANDRA-21353
blambov May 12, 2026
45f94f1
Fix TrieMemtable behaviour on changing schema
blambov May 15, 2026
0a6fcb9
Address test failures
blambov May 19, 2026
1ba6e68
Correct `assertRowsIgnoringOrderInternal` for nulls
blambov May 20, 2026
762b7ac
Restore ProtocolVersion.V3 compatibility without V2 support
blambov May 22, 2026
9041b25
Fix QueryMetricsTest
blambov May 22, 2026
80f3b79
Fix issues in payload cell tracking
blambov May 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -733,10 +733,10 @@ public enum CassandraRelevantProperties

/** Class used to discover/load the proper SAI index components file for a given sstable. */
SAI_ANN_USE_SYNTHETIC_SCORE("cassandra.sai.ann_use_synthetic_score", "false"),

/** The current version of the SAI on-disk index format. */
SAI_CURRENT_VERSION("cassandra.sai.latest.version", "ec"),

SAI_CUSTOM_COMPONENTS_DISCOVERY_CLASS("cassandra.sai.custom_components_discovery_class"),
SAI_ENABLE_EDGES_CACHE("cassandra.sai.enable_edges_cache", "false"),
SAI_ENABLE_GENERAL_ORDER_BY("cassandra.sai.general_order_by", "true"),
Expand Down Expand Up @@ -874,8 +874,7 @@ public enum CassandraRelevantProperties
// NVQ number of subvectors. This isn't really expected to change much so we're only exposing
// it as a global variable in case it's needed.
SAI_VECTOR_NVQ_NUM_SUB_VECTORS("cassandra.sai.vector.nvq_num_sub_vectors", "2"),

// The allowed ratio of extra rows (that map to "holes" in the ordinal space) to total rows indexed in the graph
// The allowed ratio of extra rows (that map to "holes" in the ordinal space) to total rows indexed in the graph
// Higher percentages will result in more memory utilized to store the extra postings mappings and larger graph
// file sizes to store the empty nodes.
SAI_VECTOR_ORDINAL_HOLE_DENSITY_LIMIT("cassandra.sai.vector.ordinal_hole_density_limit", "0.01"),
Expand All @@ -890,7 +889,6 @@ public enum CassandraRelevantProperties
* build a potential result set for search-then-sort query execution.
*/
SAI_VECTOR_SEARCH_MAX_MATERIALIZE_KEYS("cassandra.sai.vector_search.max_materialized_keys", "16000"),

/** Controls the maximum top-k limit for vector search */
SAI_VECTOR_SEARCH_MAX_TOP_K("cassandra.sai.vector_search.max_top_k", "1000"),
SAI_VECTOR_USE_PRUNING_DEFAULT("cassandra.sai.jvector.use_pruning_default", "1000"),
Expand Down Expand Up @@ -1059,6 +1057,8 @@ public enum CassandraRelevantProperties
// i.e. that all replicas except for at most one in the cluster (across all DCs) must accept the write for it to be successful.
THREE_MEANS_ALL_BUT_ONE("dse.consistency_level.three_means_all_but_one", "false"),
TOLERATE_SSTABLE_SIZE("cassandra.tolerate_sstable_size"),
/** To be used for tests: whether trie cursors should be verified for correctness. */
TRIE_DEBUG("cassandra.debug_tries"),
/**
* Allows to set custom current trie index format. This node will produce sstables in this format.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ private void updatePerBatchMetrics(Collection<? extends IMutation> mutations)
for (PartitionUpdate update : mutation.getPartitionUpdates())
{
for (Row row : update.rows())
nrUpdatedColumns += row.columns().size();
nrUpdatedColumns += row.columnCount();
}
}
metrics.update(type, nrUpdatedPartitions, nrUpdatedColumns);
Expand Down
9 changes: 5 additions & 4 deletions src/java/org/apache/cassandra/db/DeletionTime.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

import org.apache.cassandra.cache.IMeasurableMemory;
import org.apache.cassandra.db.rows.Cell;
import org.apache.cassandra.db.rows.CellData;
import org.apache.cassandra.io.ISerializer;
import org.apache.cassandra.io.sstable.format.Version;
import org.apache.cassandra.io.util.DataInputPlus;
Expand Down Expand Up @@ -63,14 +64,14 @@
// You should use 'build' instead to not workaround validations, corruption detections, etc
static DeletionTime buildUnsafeWithUnsignedInteger(long markedForDeleteAt, int localDeletionTimeUnsignedInteger)
{
return CassandraUInt.compare(Cell.MAX_DELETION_TIME_UNSIGNED_INTEGER, localDeletionTimeUnsignedInteger) < 0

Check failure on line 67 in src/java/org/apache/cassandra/db/DeletionTime.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Use static access with "org.apache.cassandra.db.rows.CellData" for "MAX_DELETION_TIME_UNSIGNED_INTEGER".

See more on https://sonarcloud.io/project/issues?id=cassandra-stargazer&issues=AZ4lqocAek4mpMlU4E1a&open=AZ4lqocAek4mpMlU4E1a&pullRequest=2308
? new InvalidDeletionTime(markedForDeleteAt)
: new DeletionTime(markedForDeleteAt, localDeletionTimeUnsignedInteger);
}

private DeletionTime(long markedForDeleteAt, long localDeletionTime)
protected DeletionTime(long markedForDeleteAt, long localDeletionTime)
{
this(markedForDeleteAt, Cell.deletionTimeLongToUnsignedInteger(localDeletionTime));
this(markedForDeleteAt, CellData.deletionTimeLongToUnsignedInteger(localDeletionTime));
}

private DeletionTime(long markedForDeleteAt, int localDeletionTimeUnsignedInteger)
Expand All @@ -95,7 +96,7 @@
*/
public long localDeletionTime()
{
return Cell.deletionTimeUnsignedIntegerToLong(localDeletionTimeUnsignedInteger);
return CellData.deletionTimeUnsignedIntegerToLong(localDeletionTimeUnsignedInteger);
}

/**
Expand Down Expand Up @@ -170,7 +171,7 @@
return deletes(info.timestamp());
}

public boolean deletes(Cell<?> cell)
public boolean deletes(CellData<?, ?> cell)
{
return deletes(cell.timestamp());
}
Expand Down Expand Up @@ -306,7 +307,7 @@
{
public void serialize(DeletionTime delTime, DataOutputPlus out) throws IOException
{
int ldt = delTime.localDeletionTime() == Cell.NO_DELETION_TIME ? Integer.MAX_VALUE : (int) min(delTime.localDeletionTime(), (long)Integer.MAX_VALUE - 1);

Check failure on line 310 in src/java/org/apache/cassandra/db/DeletionTime.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Use static access with "org.apache.cassandra.db.rows.CellData" for "NO_DELETION_TIME".

See more on https://sonarcloud.io/project/issues?id=cassandra-stargazer&issues=AZ4lqocAek4mpMlU4E1b&open=AZ4lqocAek4mpMlU4E1b&pullRequest=2308
out.writeInt(ldt);
out.writeLong(delTime.markedForDeleteAt);
}
Expand Down Expand Up @@ -354,7 +355,7 @@
@Override
public long localDeletionTime()
{
return Cell.INVALID_DELETION_TIME;

Check failure on line 358 in src/java/org/apache/cassandra/db/DeletionTime.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Use static access with "org.apache.cassandra.db.rows.CellData" for "INVALID_DELETION_TIME".

See more on https://sonarcloud.io/project/issues?id=cassandra-stargazer&issues=AZ4lqocAek4mpMlU4E1d&open=AZ4lqocAek4mpMlU4E1d&pullRequest=2308
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.apache.cassandra.config.CassandraRelevantProperties;
import org.apache.cassandra.db.rows.BufferCell;
import org.apache.cassandra.db.rows.Cell;
import org.apache.cassandra.db.rows.CellData;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.service.ClientWarn;
Expand Down Expand Up @@ -79,7 +80,7 @@

// Check for localExpirationTime overflow (CASSANDRA-14092) to apply a policy if needed
long nowInSecs = currentTimeMillis() / 1000;
if (((long) ttl + nowInSecs) > Cell.getVersionedMaxDeletiontionTime())
if (((long) ttl + nowInSecs) > CellData.getVersionedMaxDeletiontionTime())

Check warning on line 83 in src/java/org/apache/cassandra/db/ExpirationDateOverflowHandling.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Remove this unnecessary cast to "long".

See more on https://sonarcloud.io/project/issues?id=cassandra-stargazer&issues=AZ4lqocYek4mpMlU4E1e&open=AZ4lqocYek4mpMlU4E1e&pullRequest=2308
{
switch (policy)
{
Expand Down Expand Up @@ -121,13 +122,13 @@
{

long localExpirationTime = (long) (nowInSec + timeToLive);
long cellMaxDeletionTime = Cell.getVersionedMaxDeletiontionTime();
long cellMaxDeletionTime = CellData.getVersionedMaxDeletiontionTime();
return localExpirationTime <= cellMaxDeletionTime ? localExpirationTime : cellMaxDeletionTime;
}

private static String getMaxExpirationDateTS()
{
return Cell.getVersionedMaxDeletiontionTime() == Cell.MAX_DELETION_TIME_2038_LEGACY_CAP ? "2038-01-19T03:14:06+00:00"
return CellData.getVersionedMaxDeletiontionTime() == Cell.MAX_DELETION_TIME_2038_LEGACY_CAP ? "2038-01-19T03:14:06+00:00"

Check failure on line 131 in src/java/org/apache/cassandra/db/ExpirationDateOverflowHandling.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Use static access with "org.apache.cassandra.db.rows.CellData" for "MAX_DELETION_TIME_2038_LEGACY_CAP".

See more on https://sonarcloud.io/project/issues?id=cassandra-stargazer&issues=AZ4lqocYek4mpMlU4E1f&open=AZ4lqocYek4mpMlU4E1f&pullRequest=2308
: "2106-02-07T06:28:13+00:00";
}
}
25 changes: 25 additions & 0 deletions src/java/org/apache/cassandra/db/IDataSize.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.cassandra.db;

/// Shared interface for providing data size information
public interface IDataSize
{
int dataSize();
}
9 changes: 5 additions & 4 deletions src/java/org/apache/cassandra/db/LivenessInfo.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import org.apache.cassandra.cache.IMeasurableMemory;
import org.apache.cassandra.db.rows.Cell;
import org.apache.cassandra.db.rows.CellData;
import org.apache.cassandra.serializers.MarshalException;
import org.apache.cassandra.utils.ObjectSizes;

Expand All @@ -37,7 +38,7 @@
* unaffected (of course, the rest of said row data might be ttl'ed on its own but this is
* separate).
*/
public class LivenessInfo implements IMeasurableMemory
public class LivenessInfo implements IMeasurableMemory, IDataSize
{
public static final long NO_TIMESTAMP = Long.MIN_VALUE;
public static final int NO_TTL = Cell.NO_TTL;
Expand Down Expand Up @@ -329,13 +330,13 @@ public LivenessInfo withUpdatedTimestamp(long newTimestamp)
}
}

private static class ExpiringLivenessInfo extends LivenessInfo
protected static class ExpiringLivenessInfo extends LivenessInfo
{
private final int ttl;
private final long localExpirationTime;
private static final long UNSHARED_HEAP_SIZE = ObjectSizes.measure(new ExpiringLivenessInfo(-1, -1, -1));

private ExpiringLivenessInfo(long timestamp, int ttl, long localExpirationTime)
protected ExpiringLivenessInfo(long timestamp, int ttl, long localExpirationTime)
{
super(timestamp);
assert ttl != NO_TTL && localExpirationTime != NO_EXPIRATION_TIME;
Expand Down Expand Up @@ -375,7 +376,7 @@ public void digest(Digest digest)
// As of 5.0, local expiration times are encoded as unsigned integers on disk, so we can do the
// same thing here to populate the digest. This supports extended TTLs, but also maintains digest
// compatibility with previous versions, avoiding false digest mismatches during upgrades.
digest.updateWithInt(Cell.deletionTimeLongToUnsignedInteger(localExpirationTime));
digest.updateWithInt(CellData.deletionTimeLongToUnsignedInteger(localExpirationTime));
digest.updateWithInt(ttl);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,7 @@ public UnfilteredRowIterator queryMemtableAndDisk(ColumnFamilyStore cfs, ReadExe

public UnfilteredRowIterator queryMemtableAndDisk(ColumnFamilyStore cfs,
ColumnFamilyStore.ViewFragment view,
Function<Object, Transformation<BaseRowIterator<?>>> rowTransformer,
Function<CellSourceIdentifier, Transformation<BaseRowIterator<?>>> rowTransformer,
ReadExecutionController executionController)
{
assert executionController != null && executionController.validForReadOn(cfs);
Expand All @@ -700,7 +700,7 @@ private UnfilteredRowIterator queryMemtableAndDiskInternal(ColumnFamilyStore cfs

private UnfilteredRowIterator queryMemtableAndDiskInternal(ColumnFamilyStore cfs,
ColumnFamilyStore.ViewFragment view,
Function<Object, Transformation<BaseRowIterator<?>>> rowTransformer,
Function<CellSourceIdentifier, Transformation<BaseRowIterator<?>>> rowTransformer,
ReadExecutionController controller,
long startTimeNanos)
{
Expand Down Expand Up @@ -985,7 +985,7 @@ private boolean queriesMulticellType()
* no collection or counters are included).
* This method assumes the filter is a {@code ClusteringIndexNamesFilter}.
*/
private UnfilteredRowIterator queryMemtableAndSSTablesInTimestampOrder(ColumnFamilyStore cfs, ColumnFamilyStore.ViewFragment view, Function<Object, Transformation<BaseRowIterator<?>>> rowTransformer, ClusteringIndexNamesFilter filter, ReadExecutionController controller, long startTimeNanos)
private UnfilteredRowIterator queryMemtableAndSSTablesInTimestampOrder(ColumnFamilyStore cfs, ColumnFamilyStore.ViewFragment view, Function<CellSourceIdentifier, Transformation<BaseRowIterator<?>>> rowTransformer, ClusteringIndexNamesFilter filter, ReadExecutionController controller, long startTimeNanos)
{
if (Tracing.traceSinglePartitions())
Tracing.trace("Acquiring sstable references");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import org.apache.cassandra.db.filter.ColumnFilter;
import org.apache.cassandra.db.rows.Unfiltered;
import org.apache.cassandra.db.rows.UnfilteredRowIterator;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.schema.TableMetadataRef;
import org.apache.cassandra.utils.Clock;
import org.apache.cassandra.utils.FBUtilities;
Expand Down Expand Up @@ -86,7 +87,7 @@
* the estimate is updated only whenever the number of operations on the memtable increases significantly from the
* last update. This estimate is not very accurate but should be ok for planning or diagnostic purposes.
*/
private volatile MemtableAverageRowSize estimatedAverageRowSize;
protected volatile MemtableAverageRowSize estimatedAverageRowSize;

Check warning on line 90 in src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Use a thread-safe type; adding "volatile" is not enough to make this field thread-safe.

See more on https://sonarcloud.io/project/issues?id=cassandra-stargazer&issues=AZ4sIriPfdkBgWhNh5GL&open=AZ4sIriPfdkBgWhNh5GL&pullRequest=2308

@VisibleForTesting
static MemtablePool createMemtableAllocatorPool()
Expand Down Expand Up @@ -128,9 +129,10 @@
public AbstractAllocatorMemtable(AtomicReference<CommitLogPosition> commitLogLowerBound, TableMetadataRef metadataRef, Owner owner)
{
super(metadataRef, commitLogLowerBound);
this.allocator = MEMORY_POOL.newAllocator(metadataRef.toString());
this.initialComparator = metadata.get().comparator;
this.initialFactory = metadata().params.memtable.factory();
TableMetadata tableMetadata = metadataRef.get();
this.allocator = MEMORY_POOL.newAllocator(tableMetadata.toString());
this.initialComparator = tableMetadata.comparator;
this.initialFactory = tableMetadata.params.memtable.factory();
this.owner = owner;
scheduleFlush();
}
Expand Down Expand Up @@ -170,8 +172,9 @@
switch (reason)
{
case SCHEMA_CHANGE:
return initialComparator != metadata().comparator // If the CF comparator has changed, because our partitions reference the old one
|| !initialFactory.equals(metadata().params.memtable.factory()); // If a different type of memtable is requested
TableMetadata tableMetadata = metadata.get(); // do not use metadata() as this may be overridden
return initialComparator != tableMetadata.comparator // If the CF comparator has changed, because our partitions reference the old one
|| !initialFactory.equals(tableMetadata.params.memtable.factory()); // If a different type of memtable is requested
case OWNED_RANGES_CHANGE:
return false; // by default we don't use the local ranges, thus this has no effect
default:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,15 @@
package org.apache.cassandra.db.memtable;

import org.apache.cassandra.db.DataRange;
import org.apache.cassandra.db.IDataSize;
import org.apache.cassandra.db.filter.ColumnFilter;
import org.apache.cassandra.db.rows.Row;
import org.apache.cassandra.db.rows.TrieTombstoneMarker;
import org.apache.cassandra.db.rows.Unfiltered;
import org.apache.cassandra.db.rows.UnfilteredRowIterator;
import org.apache.cassandra.db.tries.DeletionAwareTrie;
import org.apache.cassandra.db.tries.Direction;
import org.apache.cassandra.db.tries.Trie;
import org.apache.cassandra.io.sstable.SSTableReadsListener;

class MemtableAverageRowSize
Expand All @@ -32,6 +37,62 @@ class MemtableAverageRowSize
public final long rowSize;
public final long operations;

static class SizeCalculator implements DeletionAwareTrie.ValueConsumer<Object, TrieTombstoneMarker>
{
long totalSize = 0;
long count = 0;

@Override
public void content(Object o)
{
if (o instanceof IDataSize)
{
totalSize += ((IDataSize) o).dataSize();
++count;
}
}

@Override
public void deletionMarker(TrieTombstoneMarker marker)
{
// Count one side of the marker
TrieTombstoneMarker.Covering startedDeletion = marker.rightDeletion();
if (startedDeletion != null)
{
totalSize += startedDeletion.dataSize();
++count;
}
}
}

public MemtableAverageRowSize(Memtable memtable, DeletionAwareTrie<Object, TrieTombstoneMarker> trie)
{
// If this is a trie-based memtable, get the row sizes from the trie elements. This achieves two things:
// - makes sure the size used is the size reflected in the memtable's dataSize
// (which e.g. excludes clustering keys)
// - avoids the conversion to Row, which has non-trivial cost

SizeCalculator sizeCalculator = new SizeCalculator();
trie.process(Direction.FORWARD, sizeCalculator);

this.rowSize = sizeCalculator.count > 0 ? sizeCalculator.totalSize / sizeCalculator.count : 0;
this.operations = memtable.operationCount();
}

public MemtableAverageRowSize(Memtable memtable, Trie<?> trie)
{
// If this is a trie-based memtable, get the row sizes from the trie elements. This achieves two things:
// - makes sure the size used is the size reflected in the memtable's dataSize
// (which e.g. excludes clustering keys)
// - avoids the conversion to Row, which has non-trivial cost


SizeCalculator sizeCalculator = new SizeCalculator();
trie.process(Direction.FORWARD, sizeCalculator);

this.rowSize = sizeCalculator.count > 0 ? sizeCalculator.totalSize / sizeCalculator.count : 0;
this.operations = memtable.operationCount();
}

public MemtableAverageRowSize(Memtable memtable)
{
Expand Down
Loading
Loading