Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/modules/gigamap/pages/indexing/bitmap/types.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,17 @@ Each long-bit corresponds to an entry, meaning there will be 64 entries max, no
Currently it only supports equality queries.
Range queries are not supported yet.

=== Zero and Negative Values

All binary indexers support zero and negative values.
Internally, each indexer maps the key to a non-zero long representation because the bitmap index uses bit positions as array indices, and a value of `0L` would have no bits set.

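For sub-64-bit types (`Byte`, `Short`, `Integer`, `Float`), this mapping is collision-free — no two distinct keys ever map to the same internal representation. The same holds for `Double`, whose zero sentinel is a NaN bit pattern that `Double.doubleToLongBits` never returns.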

`BinaryIndexerLong` uses `Long.MAX_VALUE` as the internal representation for zero.
As a consequence, `Long.MAX_VALUE` is not supported as an index key and will throw an `IllegalArgumentException`.
All other long values are fully supported.

== Choosing Between Regular and Binary Indexers

The bitmap index offers two families of indexers: **regular indexers** (e.g. `IndexerString`, `IndexerLong`) and **binary indexers** (e.g. `BinaryIndexerString`, `BinaryIndexerLong`). The right choice depends on your data characteristics and query needs.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,16 @@ final void initializeTransientState()
/**
 * Refreshes the cached entry count and the out-of-range bit mask from the current
 * {@code entries} array.
 */
private void updateCachedEntriesLength()
{
    this.entriesCount = this.entries.length;

    // The mask has 1-bits for positions outside the entries array range,
    // used by fitsInEntries() to detect keys requiring out-of-range bit positions.
    // When entriesCount == Long.SIZE (64), all bit positions are covered,
    // so the mask must be 0. A naive (1L << 64) wraps to 1L in Java
    // (long shift distances are taken mod 64), which would produce an
    // all-ones mask and cause every query to return empty results.
    // The mask is therefore assigned exactly once, through the guarded expression.
    this.entriesLengthCheckMask = this.entriesCount >= Long.SIZE
        ? 0L
        : ~((1L << this.entriesCount) - 1);
}

final BitmapEntry<E, I, Long>[] entries()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,15 +115,6 @@ static void validate(final Long key, final BinaryIndexer<?> indexer)
{
throw new IllegalArgumentException("Null keys are not allowed in index " + indexer.name());
}
validate(key.longValue(), indexer);
}

/**
 * Rejects keys that are zero or negative.
 *
 * @param key the key to check
 * @param indexer the indexer whose name is included in the error message
 * @throws IllegalArgumentException if {@code key} is not strictly positive
 */
static void validate(final long key, final BinaryIndexer<?> indexer)
{
    if(key > 0)
    {
        return;
    }
    throw new IllegalArgumentException("Only positive values are allowed: " + key + " in index " + indexer.name());
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,40 @@ protected Abstract()
{
super();
}

/**
 * Supplies the number to index for the given entity by delegating to
 * {@code getByte(entity)}.
 *
 * @param entity the entity to read the key from
 * @return whatever {@code getByte} returns for {@code entity}
 */
@Override
protected final Byte getNumber(final E entity)
{
return this.getByte(entity);
}


/**
 * Maps a {@code Byte} key to the non-zero {@code long} used by the binary bitmap index.
 * <p>
 * Zero cannot be stored as {@code 0L}: a long without any set bit addresses no bitmap
 * entry, so the entity would silently be skipped during indexing. Zero is therefore
 * encoded as the sentinel {@code 1L << 8} (= 256), one above the largest unsigned byte
 * (255), which no non-zero byte can produce.
 * <p>
 * Every non-zero byte is encoded as its unsigned value. For positive bytes this equals
 * {@code longValue()} (backwards compatible with existing storages); negative bytes land
 * in [128, 255] using only 8 bits instead of being sign-extended to 64 bits.
 *
 * @param number the byte key to encode; it is unboxed, so it must not be {@code null}
 * @return the non-zero long representation of {@code number}
 */
@Override
protected long toLong(final Byte number)
{
    final byte value = number.byteValue();
    return value != 0
        ? Byte.toUnsignedLong(value)
        : 1L << Byte.SIZE;
}

protected abstract Byte getByte(final E entity);

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,31 @@ protected final Double getNumber(final E entity)
return this.getDouble(entity);
}

/**
 * Converts a Double value to a non-zero long suitable for the binary bitmap index.
 * <p>
 * The bitmap index uses bit positions in a long as array indices. A value of {@code 0L}
 * has no bits set and would cause the entity to be silently skipped during indexing.
 * <p>
 * The IEEE 754 bit representation ({@code doubleToLongBits}) is used as the bitmap key.
 * For non-zero bit patterns, the value is used as-is (backwards compatible with existing storages).
 * Only {@code 0.0} produces a zero bit pattern ({@code -0.0} has bit 63 set and is non-zero).
 * <p>
 * For zero, the sentinel {@code 0x7FF0000000000001L} is used. This is a non-canonical
 * IEEE 754 NaN encoding (exponent all-ones, non-zero mantissa) that {@code doubleToLongBits}
 * can never return for any double value, because it normalizes all NaN representations
 * to the canonical {@code 0x7FF8000000000000L}. This makes the sentinel completely
 * collision-free.
 *
 * @param number the double key to encode; it is unboxed, so it must not be {@code null}
 * @return the bit pattern of {@code number}, or the NaN sentinel when that pattern is {@code 0L}
 */
@Override
protected long toLong(final Double number)
{
    // There must be no early return of the raw bits here: it would make the
    // zero check below unreachable and let 0.0 through as 0L.
    final long bits = Double.doubleToLongBits(number);
    if(bits == 0L)
    {
        return 0x7FF0000000000001L;
    }
    return bits;
}

protected abstract Double getDouble(final E entity);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,32 @@ protected final Float getNumber(final E entity)
return this.getFloat(entity);
}

/**
 * Converts a Float value to a non-zero long suitable for the binary bitmap index.
 * <p>
 * The bitmap index uses bit positions in a long as array indices. A value of {@code 0L}
 * has no bits set and would cause the entity to be silently skipped during indexing.
 * <p>
 * The IEEE 754 bit representation ({@code floatToIntBits}) is used as the bitmap key.
 * For non-zero bit patterns, the unsigned int-to-long conversion is used. This is identical
 * to sign extension for positive floats (backwards compatible with existing storages)
 * and maps negative floats to the upper unsigned 32-bit range using only 32 bits,
 * avoiding the 64-bit sign extension that plain int-to-long widening would produce.
 * <p>
 * Only {@code 0.0f} produces a zero bit pattern ({@code -0.0f} has bit 31 set and is non-zero).
 * For this case, a sentinel value of {@code 1L << 32} (= 2^32) is used. This is exactly one
 * above the maximum unsigned 32-bit value, so it can never collide with any non-zero
 * float's bit representation.
 *
 * @param number the float key to encode; it is unboxed, so it must not be {@code null}
 * @return the unsigned 32-bit pattern of {@code number}, or {@code 1L << 32} when that pattern is zero
 */
@Override
protected long toLong(final Float number)
{
    // There must be no early return of the raw (sign-extended) bits here: it would
    // make the zero check and the unsigned conversion below unreachable.
    final int bits = Float.floatToIntBits(number);
    if(bits == 0)
    {
        return 1L << Integer.SIZE;
    }
    return Integer.toUnsignedLong(bits);
}

protected abstract Float getFloat(final E entity);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,40 @@ protected Abstract()
{
super();
}

/**
 * Supplies the number to index for the given entity by delegating to
 * {@code getInteger(entity)}.
 *
 * @param entity the entity to read the key from
 * @return whatever {@code getInteger} returns for {@code entity}
 */
@Override
protected final Integer getNumber(final E entity)
{
return this.getInteger(entity);
}


/**
 * Maps an {@code Integer} key to the non-zero {@code long} used by the binary bitmap index.
 * <p>
 * Zero cannot be stored as {@code 0L}: a long without any set bit addresses no bitmap
 * entry, so the entity would silently be skipped during indexing. Zero is therefore
 * encoded as the sentinel {@code 1L << 32} (= 2^32), one above the largest unsigned int
 * (2^32-1), which no non-zero int can produce.
 * <p>
 * Every non-zero int is encoded as its unsigned value. For positive ints this equals
 * {@code longValue()} (backwards compatible with existing storages); negative ints land
 * in [2^31, 2^32-1] using only 32 bits instead of being sign-extended to 64 bits.
 *
 * @param number the int key to encode; it is unboxed, so it must not be {@code null}
 * @return the non-zero long representation of {@code number}
 */
@Override
protected long toLong(final Integer number)
{
    final int value = number.intValue();
    return value != 0
        ? Integer.toUnsignedLong(value)
        : 1L << Integer.SIZE;
}

protected abstract Integer getInteger(final E entity);

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
* An interface that extends {@link BinaryIndexerNumber} specifically for using {@link Long} as the key type.
* It provides indexing capabilities, optimized for binary operations and high-cardinality indices,
* while working with entities of type {@code E}.
* <p>
* <b>Restriction:</b> {@code Long.MAX_VALUE} is not supported as an index key and will throw an
* {@link IllegalArgumentException}. It is reserved internally as a sentinel for zero in the binary
* bitmap index. All other long values, including zero and negative values, are fully supported.
*
* @param <E> the type of entities being indexed
*/
Expand All @@ -37,15 +41,49 @@ protected Abstract()
{
super();
}

/**
 * Supplies the number to index for the given entity by delegating to
 * {@code getLong(entity)}.
 *
 * @param entity the entity to read the key from
 * @return whatever {@code getLong} returns for {@code entity}
 */
@Override
protected final Long getNumber(final E entity)
{
return this.getLong(entity);
}


/**
 * Maps a {@code Long} key to the non-zero {@code long} used by the binary bitmap index.
 * <p>
 * Zero cannot be stored as {@code 0L}: a long without any set bit addresses no bitmap
 * entry, so the entity would silently be skipped during indexing.
 * <p>
 * Non-zero values pass through unchanged (backwards compatible with existing storages).
 * Negative values have the sign bit set and may use up to 64 bitmap entries, which is
 * inherent to the 64-bit representation.
 * <p>
 * Zero is encoded as {@code Long.MAX_VALUE}. Unlike the sub-64-bit types, the full long
 * range leaves no spare bit position for a collision-free sentinel, so
 * {@code Long.MAX_VALUE} is given up as a key in favour of the far more common zero.
 *
 * @param number the long key to encode; it is unboxed, so it must not be {@code null}
 * @return {@code Long.MAX_VALUE} for zero, otherwise {@code number} itself
 * @throws IllegalArgumentException if the value is {@code Long.MAX_VALUE}
 */
@Override
protected long toLong(final Long number)
{
    final long value = number.longValue();
    if(value == Long.MAX_VALUE)
    {
        throw new IllegalArgumentException(
            "Long.MAX_VALUE is not supported as an index key because it is reserved as the zero sentinel in the binary bitmap index."
        );
    }
    return value == 0L ? Long.MAX_VALUE : value;
}

protected abstract Long getLong(final E entity);

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,40 @@ protected Abstract()
{
super();
}

/**
 * Supplies the number to index for the given entity by delegating to
 * {@code getShort(entity)}.
 *
 * @param entity the entity to read the key from
 * @return whatever {@code getShort} returns for {@code entity}
 */
@Override
protected final Short getNumber(final E entity)
{
return this.getShort(entity);
}


/**
 * Maps a {@code Short} key to the non-zero {@code long} used by the binary bitmap index.
 * <p>
 * Zero cannot be stored as {@code 0L}: a long without any set bit addresses no bitmap
 * entry, so the entity would silently be skipped during indexing. Zero is therefore
 * encoded as the sentinel {@code 1L << 16} (= 65536), one above the largest unsigned
 * short (65535), which no non-zero short can produce.
 * <p>
 * Every non-zero short is encoded as its unsigned value. For positive shorts this equals
 * {@code longValue()} (backwards compatible with existing storages); negative shorts land
 * in [32768, 65535] using only 16 bits instead of being sign-extended to 64 bits.
 *
 * @param number the short key to encode; it is unboxed, so it must not be {@code null}
 * @return the non-zero long representation of {@code number}
 */
@Override
protected long toLong(final Short number)
{
    final short value = number.shortValue();
    return value != 0
        ? Short.toUnsignedLong(value)
        : 1L << Short.SIZE;
}

protected abstract Short getShort(final E entity);

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,17 @@ protected final String getValue(final E entity)
return this.getString(entity);
}

/**
* Packs the string's UTF-8 bytes into long values (8 bytes per long), ensuring no position
* is {@code 0L} since the composite bitmap index treats {@code 0L} as "empty position".
* <p>
* For non-zero packed values, the raw encoding is used (backwards compatible with existing storages).
* Only a position whose bytes are all null ({@code 0x00}) — a full aligned run of 8 or a shorter all-null tail — produces {@code 0L},
* which is mapped to {@code Long.MAX_VALUE}. This is collision-free because
* {@code Long.MAX_VALUE} ({@code 0x7FFFFFFFFFFFFFFF}) requires {@code 0xFF} bytes in
* positions 0-6, and {@code 0xFF} is invalid in UTF-8 — it can never appear in the
* output of {@code String.getBytes(UTF_8)}.
*/
@Override
protected long[] fillCarrier(final String value, final long[] carrier)
{
Expand All @@ -100,14 +111,22 @@ protected long[] fillCarrier(final String value, final long[] carrier)
{
result = new long[size];
}

for(int i = 0; i < bytes.length; i++)
{
final int arrayIndex = i / 8;
final int bitPosition = (i % 8) * 8;
result[arrayIndex] |= ((long)(bytes[i] & 0xFF)) << bitPosition;
}


for(int i = 0; i < size; i++)
{
if(result[i] == 0L)
{
result[i] = Long.MAX_VALUE;
}
}

return result;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,25 @@ protected int compositeSize()
return 2;
}

/**
 * Fills the composite carrier with the UUID's bit representation, ensuring no position
 * is {@code 0L} since the composite bitmap index treats {@code 0L} as "empty position".
 * <p>
 * For non-zero bit halves, the raw value is used (backwards compatible with existing storages).
 * For zero halves, {@code Long.MAX_VALUE} is used as a sentinel. UUIDs with a half equal
 * to {@code Long.MAX_VALUE} would collide with zero, but this is astronomically unlikely
 * (~1 in 2^64) and both cases were previously broken (silently skipped during indexing).
 *
 * @param uuid the UUID whose two 64-bit halves are written
 * @param carrier the target array; index 0 receives the most significant half and
 *        index 1 the least significant half
 */
@Override
protected void fillCarrier(final UUID uuid, final long[] carrier)
{
    // Each slot is written exactly once, already passed through ensureNonZero,
    // so a raw 0L half never reaches the carrier.
    carrier[0] = ensureNonZero(uuid.getMostSignificantBits());
    carrier[1] = ensureNonZero(uuid.getLeastSignificantBits());
}

/**
 * Replaces a zero value with {@code Long.MAX_VALUE} and passes every other value through.
 *
 * @param bits the value to check
 * @return {@code Long.MAX_VALUE} if {@code bits} is {@code 0L}, otherwise {@code bits}
 */
private static long ensureNonZero(final long bits)
{
    if(bits != 0L)
    {
        return bits;
    }
    return Long.MAX_VALUE;
}

@Override
Expand Down
Loading