Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
f3ded4a
Initial commit of splitting queue entries
ohadzeliger Feb 13, 2026
d7e130d
Style
ohadzeliger Feb 16, 2026
a3ef2ef
Implement serializer and allowed version/clear options
ohadzeliger Feb 18, 2026
515d431
Merge branch 'main' into split-queue-entries
ohadzeliger Feb 19, 2026
6bdd1cf
Rebase from main
ohadzeliger Feb 19, 2026
783c3b1
Complete unsplitting
ohadzeliger Feb 20, 2026
8656bdb
Small fixes.
ohadzeliger Feb 23, 2026
fc24b5b
Add more tests for large document.
ohadzeliger Feb 24, 2026
c2a3cc6
Add useVersionInKey dimension to SplitHelperTestConfig
ohadzeliger Feb 24, 2026
8c260e0
Add VersioningSplitKeyValueHelper coverage to SplitHelperTest
ohadzeliger Feb 24, 2026
493a40f
Refactor writeDummyRecord/writeDummyKV and add deleteWithSplitMultipl…
ohadzeliger Feb 25, 2026
57f960d
Add scanMultipleRecordsWithVersioningKey and writeDummyRecordsMultipl…
ohadzeliger Feb 25, 2026
84598da
Add scanContinuationsMultipleTransactions
ohadzeliger Feb 25, 2026
7071e40
cleanup comments
ohadzeliger Feb 26, 2026
0f89ece
Add tests for the case where we overwrite same record with version in…
ohadzeliger Feb 26, 2026
e7dd234
Create separate test with multiple transaction, revert split helper t…
ohadzeliger Feb 28, 2026
cf54a2d
Convert more tests
ohadzeliger Mar 1, 2026
05c958e
Cleanup
ohadzeliger Mar 1, 2026
fb0085c
More test changes, extract TestConfig and reuse from both test classes.
ohadzeliger Mar 2, 2026
8f7038e
typos, hashcode
ohadzeliger Mar 2, 2026
c1b641c
PR comments.
ohadzeliger Mar 3, 2026
1a68f22
PR comments: Add backwards-compatible method to API
ohadzeliger Mar 4, 2026
d7deeed
More PR comments
ohadzeliger Mar 4, 2026
e9da6a3
Style
ohadzeliger Mar 4, 2026
f93aa9f
Split two test methods that were too long
ohadzeliger Mar 5, 2026
96eccea
Revert API changes, use tuple.hasIncompleteVersion, adjust tests
ohadzeliger Mar 8, 2026
4524208
More refactoring, add asserts for key size in tests
ohadzeliger Mar 9, 2026
db31556
Removed helper and implementation
ohadzeliger Mar 9, 2026
3e68faa
More test fixes
ohadzeliger Mar 9, 2026
f196baf
PR comments
ohadzeliger Mar 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

package com.apple.foundationdb.record.provider.foundationdb;

import com.apple.foundationdb.annotation.API;
import com.apple.foundationdb.tuple.Tuple;

import javax.annotation.Nonnull;
Expand All @@ -33,7 +34,8 @@
* any splits have been removed), and its version. It also includes sizing information describing
* the record's on-disk footprint.
*/
class FDBRawRecord implements FDBStoredSizes {
@API(API.Status.INTERNAL)
public class FDBRawRecord implements FDBStoredSizes {
@Nonnull private final Tuple primaryKey;
@Nonnull private final byte[] rawRecord;
@Nullable private final FDBRecordVersion version;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import com.apple.foundationdb.record.FDBRecordStoreProperties;
import com.apple.foundationdb.record.RecordCoreArgumentException;
import com.apple.foundationdb.record.RecordCoreException;
import com.apple.foundationdb.record.RecordCoreInternalException;
import com.apple.foundationdb.record.RecordCoreStorageException;
import com.apple.foundationdb.record.RecordCursor;
import com.apple.foundationdb.record.RecordCursorContinuation;
Expand Down Expand Up @@ -121,7 +122,7 @@
* @param sizeInfo optional size information to populate
*/
@SuppressWarnings("PMD.CloseResource")
public static void saveWithSplit(@Nonnull final FDBRecordContext context, @Nonnull final Subspace subspace,

Check warning on line 125 in fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/SplitHelper.java

View check run for this annotation

fdb.teamscale.io / Teamscale | Findings

fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/SplitHelper.java#L125

Method `saveWithSplit` has 10 parameters but no more than 7 parameters are allowed https://fdb.teamscale.io/findings/details/foundationdb-fdb-record-layer?t=FORK_MR%2F3970%2Fohadzeliger%2Fsplit-queue-entries%3AHEAD&id=0D47AD459B4AAA8546CC3A078C556123
@Nonnull final Tuple key, @Nonnull final byte[] serialized, @Nullable final FDBRecordVersion version,
final boolean splitLongRecords, final boolean omitUnsplitSuffix,
final boolean clearBasedOnPreviousSizeInfo, @Nullable final FDBStoredSizes previousSizeInfo,
Expand All @@ -132,17 +133,26 @@
.addLogInfo(LogMessageKeys.SUBSPACE, ByteArrayUtil2.loggable(subspace.pack()))
.addLogInfo(LogMessageKeys.VERSION, version);
}
final Transaction tr = context.ensureActive();
boolean hasVersionInKey = key.hasIncompleteVersionstamp();
if ((version != null) && hasVersionInKey) {
// Cannot have versionStamps in BOTH key and value
throw new RecordCoreArgumentException("Cannot save versionStamp in both key and value");
}
if (serialized.length > SplitHelper.SPLIT_RECORD_SIZE) {
if (!splitLongRecords) {
throw new RecordCoreException("Record is too long to be stored in a single value; consider split_long_records")
.addLogInfo(LogMessageKeys.KEY_TUPLE, key)
.addLogInfo(LogMessageKeys.SUBSPACE, ByteArrayUtil2.loggable(subspace.pack()))
.addLogInfo(LogMessageKeys.VALUE_SIZE, serialized.length);
}
writeSplitRecord(context, subspace, key, serialized, clearBasedOnPreviousSizeInfo, previousSizeInfo, sizeInfo);
writeSplitRecord(context, subspace, key, serialized, hasVersionInKey, clearBasedOnPreviousSizeInfo, previousSizeInfo, sizeInfo);
} else {
if (splitLongRecords || previousSizeInfo == null || previousSizeInfo.isVersionedInline()) {
// An incomplete version in the key means that we shouldn't delete previous k/v pairs using these keys since,
// in the DB, from previous transactions, the versions would have already been completed (and so wouldn't match)
Comment on lines +150 to +151
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is still a little unclear. It might be that we're conceiving of the problem in different ways.

My point has been that you can (and should) clear out the keys from the version mutation cache if you update the same key (with an incomplete version) multiple times in the same transaction. This comment is pointing out the (also true) statement that it is incorrect for the user to provide a Tuple with an incomplete version associated with a previous transaction.

I might rephrase it as something like:

We currently do not support clearing out data associated with keys with an incomplete version. If data was committed with an incomplete version in its key, then the version would have been filled in with a complete version at commit time, and the user should use the updated key to modify it. An incomplete version should thus only be used if there are multiple updates to the same value during one transaction. We need to consult the context's versionMutationCache to handle that correctly, which is currently unimplemented

It probably could be more succinct. I kind of feel like the more important point is just that we need to search through a data structure that we currently don't look through (the versionMutationCache) to do the right thing here, but I do kind of see the point that any incomplete keys committed during previous transactions are now accessed via different complete keys

if (!hasVersionInKey && (splitLongRecords || previousSizeInfo == null || previousSizeInfo.isVersionedInline())) {
// Note that the clearPreviousSplitRecords also removes version splits from the context cache
// This is not currently supported for the case where we have versions in the keys since we can't trace the old values down
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we can't know the previous value, how can we have a previousSizeInfo?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When shouldClearBeforeWrite()==true the assumption is that we have values we can use to clear the previous keys (with or without the sizeInfo).
when shouldClearBeforeWrite()==false then sizeInfo is ignored as we don't call clear at all (and chances are we will see null there).

Copy link
Contributor Author

@ohadzeliger ohadzeliger Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the changed logic, if the key has an incomplete version, then we can't clear older records based on size info since the size info either has complete version (previous transaction was committed) or it has incomplete versions (in which case the keys are not in the RYW cache)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I don't understand this, but it seems like we should be able to do this. The versionMutationCache always uses a fixed value (all \xff bytes) for the uncommitted version, so two keys with the same FDBRecordVersion in it should appear in the same location of the versionMutationCache.

As it happens, I think this is fine for the use case of the pending writes queue, as a different local version is claimed with each insert. Theoretically, you could imagine leveraging this code to support versionstamps in primary keys (which we currently don't, but this gets us pretty close to being able to support), in which case this may be more important

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The issue faced was that the cache contains specific keys with split suffixes which can span multiple cache entries. In the "incomplete version in key" case we would need to replace them all when a new record gets written with the same local version. One way to introduce that would be to group all the splits by the local version and replace the entire group every time, but that seems a little excessive since there is no use case for this currently.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we do group all of the keys by their local version. The mutations currently go into a NavigableMap, which is sorted by unsigned byte order. Incomplete versions all get assigned 10 \xff bytes as their global version, and so they'll all be there in the cache in consecutive keys. If there's no use case for updating large values within the same transaction as they're included, then we could punt this, though

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, added a comment about supporting in a future PR

// Will be improved on in a separate PR
clearPreviousSplitRecord(context, subspace, key, clearBasedOnPreviousSizeInfo, previousSizeInfo);
}
final Tuple recordKey;
Expand All @@ -151,8 +161,7 @@
} else {
recordKey = key;
}
final byte[] keyBytes = subspace.pack(recordKey);
tr.set(keyBytes, serialized);
final byte[] keyBytes = writeSplitValue(context, subspace, recordKey, serialized);
if (sizeInfo != null) {
sizeInfo.set(keyBytes, serialized);
sizeInfo.setSplit(false);
Expand All @@ -162,23 +171,25 @@
}

@SuppressWarnings("PMD.CloseResource")
private static void writeSplitRecord(@Nonnull final FDBRecordContext context, @Nonnull final Subspace subspace,

Check warning on line 174 in fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/SplitHelper.java

View check run for this annotation

fdb.teamscale.io / Teamscale | Findings

fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/SplitHelper.java#L174

Method `writeSplitRecord` has 8 parameters but no more than 7 parameters are allowed https://fdb.teamscale.io/findings/details/foundationdb-fdb-record-layer?t=FORK_MR%2F3970%2Fohadzeliger%2Fsplit-queue-entries%3AHEAD&id=27052715EC4F8D12D6A8BB40DE100D7E
@Nonnull final Tuple key, @Nonnull final byte[] serialized,
boolean hasVersionInKey,
final boolean clearBasedOnPreviousSizeInfo, @Nullable final FDBStoredSizes previousSizeInfo,
@Nullable SizeInfo sizeInfo) {
final Transaction tr = context.ensureActive();
final Subspace keySplitSubspace = subspace.subspace(key);
clearPreviousSplitRecord(context, subspace, key, clearBasedOnPreviousSizeInfo, previousSizeInfo);
if (!hasVersionInKey) {
// Note that the clearPreviousSplitRecords also removes version splits from the context cache
// This is not currently supported for the case where we have versions in the keys since we can't trace the old values down
clearPreviousSplitRecord(context, subspace, key, clearBasedOnPreviousSizeInfo, previousSizeInfo);
}
long index = SplitHelper.START_SPLIT_RECORD;
int offset = 0;
while (offset < serialized.length) {
int nextOffset = offset + SplitHelper.SPLIT_RECORD_SIZE;
if (nextOffset > serialized.length) {
nextOffset = serialized.length;
}
final byte[] keyBytes = keySplitSubspace.pack(index);
final byte[] valueBytes = Arrays.copyOfRange(serialized, offset, nextOffset);
tr.set(keyBytes, valueBytes);
final byte[] keyBytes = writeSplitValue(context, subspace, key.add(index), valueBytes);
if (sizeInfo != null) {
if (offset == 0) {
sizeInfo.set(keyBytes, valueBytes);
Expand All @@ -192,6 +203,28 @@
}
}

private static byte[] writeSplitValue(FDBRecordContext context, Subspace subspace, Tuple recordKey, byte[] serialized) {
byte[] keyBytes;
if (recordKey.hasIncompleteVersionstamp()) {
keyBytes = subspace.packWithVersionstamp(recordKey);
byte[] current = context.addVersionMutation(
MutationType.SET_VERSIONSTAMPED_KEY,
keyBytes,
serialized);
if (current != null) {
// This should never happen. It means that the same key (and suffix) and local version were used for a subsequent
// write. It is most likely not an intended flow and this check will protect against that.
// It is an incomplete check since the same record can have different suffixes to the primary key that would not collide.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// It is an incomplete check since the same record can have different suffixes to the primary key that would not collide.
// It is an incomplete check since the same record can have different suffixes to the primary key that would not collide.
// Namely, if one is split, and one is not split they will not overlap. Anything else and the would.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, done.

// Namely, if one is split, and one is not split they will not overlap. Anything else would.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Namely, if one is split, and one is not split they will not overlap. Anything else and the would.
// Namely, if one is split, and one is not split they will not overlap. Anything else would.

throw new RecordCoreInternalException("Key with version overwritten");
}
} else {
keyBytes = subspace.pack(recordKey);
context.ensureActive().set(keyBytes, serialized);
}
return keyBytes;
}

@SuppressWarnings("PMD.CloseResource")
private static void writeVersion(@Nonnull final FDBRecordContext context, @Nonnull final Subspace subspace, @Nonnull final Tuple key,
@Nullable final FDBRecordVersion version, @Nullable final SizeInfo sizeInfo) {
Expand All @@ -201,11 +234,11 @@
}
return;
}
final Transaction tr = context.ensureActive();
// At this point we know the key does not have a version
final byte[] keyBytes = subspace.pack(key.add(RECORD_VERSION));
final byte[] valueBytes = packVersion(version);
if (version.isComplete()) {
tr.set(keyBytes, valueBytes);
context.ensureActive().set(keyBytes, valueBytes);
} else {
context.addVersionMutation(MutationType.SET_VERSIONSTAMPED_VALUE, keyBytes, valueBytes);
context.addToLocalVersionCache(keyBytes, version.getLocalVersion());
Expand Down
Loading
Loading