Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions faiss/factory_tools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ const std::map<faiss::ScalarQuantizer::QuantizerType, std::string> sq_types = {
{faiss::ScalarQuantizer::QT_bf16, "SQbf16"},
{faiss::ScalarQuantizer::QT_8bit_direct_signed, "SQ8_direct_signed"},
{faiss::ScalarQuantizer::QT_8bit_direct, "SQ8_direct"},
{faiss::ScalarQuantizer::QT_1bit_tqmse, "SQtqmse1"},
{faiss::ScalarQuantizer::QT_2bit_tqmse, "SQtqmse2"},
{faiss::ScalarQuantizer::QT_3bit_tqmse, "SQtqmse3"},
{faiss::ScalarQuantizer::QT_4bit_tqmse, "SQtqmse4"},
{faiss::ScalarQuantizer::QT_8bit_tqmse, "SQtqmse8"},
};

int get_hnsw_M(const faiss::IndexHNSW* index) {
Expand Down
29 changes: 29 additions & 0 deletions faiss/impl/ScalarQuantizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,29 @@ ScalarQuantizer::ScalarQuantizer() {}

void ScalarQuantizer::set_derived_sizes() {
switch (qtype) {
case QT_1bit_tqmse:
code_size = (d + 7) / 8;
bits = 1;
break;
case QT_2bit_tqmse:
code_size = (d * 2 + 7) / 8;
bits = 2;
break;
case QT_3bit_tqmse:
code_size = (d * 3 + 7) / 8;
bits = 3;
break;
case QT_8bit:
case QT_8bit_uniform:
case QT_8bit_direct:
case QT_8bit_direct_signed:
case QT_8bit_tqmse:
code_size = d;
bits = 8;
break;
case QT_4bit:
case QT_4bit_uniform:
case QT_4bit_tqmse:
code_size = (d + 1) / 2;
bits = 4;
break;
Expand Down Expand Up @@ -107,6 +121,21 @@ void ScalarQuantizer::train(size_t n, const float* x) {
case QT_8bit_direct_signed:
// no training necessary
break;
case QT_1bit_tqmse:
scalar_quantizer::train_TurboQuantMSE(d, 1, trained);
break;
case QT_2bit_tqmse:
scalar_quantizer::train_TurboQuantMSE(d, 2, trained);
break;
case QT_3bit_tqmse:
scalar_quantizer::train_TurboQuantMSE(d, 3, trained);
break;
case QT_4bit_tqmse:
scalar_quantizer::train_TurboQuantMSE(d, 4, trained);
break;
case QT_8bit_tqmse:
scalar_quantizer::train_TurboQuantMSE(d, 8, trained);
break;
default:
break;
}
Expand Down
5 changes: 5 additions & 0 deletions faiss/impl/ScalarQuantizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ struct ScalarQuantizer : Quantizer {
QT_bf16,
QT_8bit_direct_signed, ///< fast indexing of signed int8s ranging from
///< [-128 to 127]
QT_1bit_tqmse, ///< TurboQuant MSE-optimized, x bits per component
QT_2bit_tqmse,
QT_3bit_tqmse,
QT_4bit_tqmse,
QT_8bit_tqmse,
QT_count
};

Expand Down
17 changes: 16 additions & 1 deletion faiss/impl/index_read.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -868,7 +868,7 @@ void read_ScalarQuantizer(
READ1(qtype_int);
FAISS_THROW_IF_NOT_FMT(
qtype_int >= ScalarQuantizer::QT_8bit &&
qtype_int <= ScalarQuantizer::QT_8bit_direct_signed,
qtype_int < ScalarQuantizer::QT_count,
"invalid ScalarQuantizer qtype %d",
qtype_int);
ivsc->qtype = static_cast<ScalarQuantizer::QuantizerType>(qtype_int);
Expand Down Expand Up @@ -906,6 +906,21 @@ void read_ScalarQuantizer(
case ScalarQuantizer::QT_count:
expected = 0;
break;
case ScalarQuantizer::QT_1bit_tqmse:
expected = 2 + 1; // 2^bits centroids + (2^bits - 1) boundaries
break;
case ScalarQuantizer::QT_2bit_tqmse:
expected = 4 + 3;
break;
case ScalarQuantizer::QT_3bit_tqmse:
expected = 8 + 7;
break;
case ScalarQuantizer::QT_4bit_tqmse:
expected = 16 + 15;
break;
case ScalarQuantizer::QT_8bit_tqmse:
expected = 256 + 255;
break;
}
if (ivsc->trained.empty() && expected > 0) {
// Empty trained is only valid for untrained indices.
Expand Down
87 changes: 87 additions & 0 deletions faiss/impl/scalar_quantizer/quantizers.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@

#pragma once

#include <algorithm>

#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/ScalarQuantizer.h>
#include <faiss/impl/simdlib/simdlib_dispatch.h>
#include <faiss/utils/bf16.h>
Expand Down Expand Up @@ -113,6 +116,90 @@ struct QuantizerTemplate<
}
};

/*******************************************************************
 * TurboQuant MSE quantizer
 *******************************************************************/

/// Primary template, parameterized by the per-component code width
/// (NBits, 1..8) and the SIMD level. The scalar implementation is the
/// SIMDLevel::NONE specialization below; SIMD-specific source files
/// (e.g. sq-avx2.cpp) provide accelerated specializations.
template <int NBits, SIMDLevel SL>
struct QuantizerTurboQuantMSE;

/** Scalar (non-SIMD) TurboQuant MSE quantizer.
 *
 * Every vector component is quantized independently to one of
 * 2^NBits centroids shared across all dimensions. `trained` holds
 * the 2^NBits centroid values followed by the 2^NBits - 1 sorted
 * decision boundaries between adjacent cells (2 * 2^NBits - 1 floats
 * in total). Codes are bit-packed: component i occupies bits
 * [i * NBits, (i + 1) * NBits) of the code buffer, low bits first
 * within each byte.
 */
template <int NBits>
struct QuantizerTurboQuantMSE<NBits, SIMDLevel::NONE>
        : ScalarQuantizer::SQuantizer {
    static_assert(NBits >= 1 && NBits <= 8);

    // number of centroids, 2^NBits
    static constexpr size_t kCentroidsCount = size_t(1) << NBits;
    // mask keeping the low NBits of a packed index (max 0xff for NBits=8)
    static constexpr uint16_t kIndexMask =
            static_cast<uint16_t>((1u << NBits) - 1);

    const size_t d;          // vector dimension
    const float* centroids;  // 2^NBits reconstruction values
    const float* boundaries; // 2^NBits - 1 sorted decision thresholds

    /// @param d_in     vector dimension
    /// @param trained  centroids followed by boundaries; must contain
    ///                 exactly 2 * 2^NBits - 1 floats (throws otherwise).
    ///                 Pointers into it are retained: `trained` must
    ///                 outlive this quantizer.
    QuantizerTurboQuantMSE(size_t d_in, const std::vector<float>& trained)
            : d(d_in), centroids(nullptr), boundaries(nullptr) {
        FAISS_THROW_IF_NOT(trained.size() == 2 * kCentroidsCount - 1);
        centroids = trained.data();
        boundaries = trained.data() + kCentroidsCount;
    }

    /// Map a scalar to its cell index via binary search: upper_bound
    /// returns the first boundary > x, so the offset equals the number
    /// of boundaries <= x, i.e. the centroid index in [0, 2^NBits).
    FAISS_ALWAYS_INLINE uint8_t select_index(float x) const {
        return static_cast<uint8_t>(
                std::upper_bound(
                        boundaries, boundaries + (kCentroidsCount - 1), x) -
                boundaries);
    }

    /// OR the NBits of `idx` into the packed code at component position i.
    /// NOTE(review): uses |=, so the code buffer is assumed to be
    /// zero-initialized by the caller — confirm at encode call sites.
    FAISS_ALWAYS_INLINE void encode_index(uint8_t idx, uint8_t* code, size_t i)
            const {
        const size_t bit_offset = i * NBits;
        const size_t byte_offset = bit_offset >> 3;
        const size_t bit_shift = bit_offset & 7;
        // at most 7 (shift) + 8 (index) = 15 significant bits: fits uint16_t
        const uint16_t packed = static_cast<uint16_t>(idx & kIndexMask)
                << bit_shift;
        code[byte_offset] |= packed & 0xff;
        if (bit_shift + NBits > 8) {
            // the index straddles a byte boundary: spill the high bits
            code[byte_offset + 1] |= packed >> 8;
        }
    }

    /// Extract the NBits index of component i from the packed code
    /// (inverse of encode_index).
    FAISS_ALWAYS_INLINE uint8_t
    decode_index(const uint8_t* code, size_t i) const {
        const size_t bit_offset = i * NBits;
        const size_t byte_offset = bit_offset >> 3;
        const size_t bit_shift = bit_offset & 7;

        uint16_t packed = code[byte_offset];
        if (bit_shift + NBits > 8) {
            // index straddles a byte boundary: pull in the next byte
            packed |= static_cast<uint16_t>(code[byte_offset + 1]) << 8;
        }
        return static_cast<uint8_t>((packed >> bit_shift) & kIndexMask);
    }

    /// Quantize all d components of x into the packed code buffer.
    void encode_vector(const float* x, uint8_t* code) const final {
        for (size_t i = 0; i < d; i++) {
            encode_index(select_index(x[i]), code, i);
        }
    }

    /// Reconstruct all d components from the packed code buffer.
    void decode_vector(const uint8_t* code, float* x) const final {
        for (size_t i = 0; i < d; i++) {
            x[i] = centroids[decode_index(code, i)];
        }
    }

    /// Reconstruct a single component (used by distance computers).
    FAISS_ALWAYS_INLINE float reconstruct_component(
            const uint8_t* code,
            size_t i) const {
        return centroids[decode_index(code, i)];
    }
};

/// Fallback for SIMD levels without a dedicated specialization:
/// reuse the scalar (SIMDLevel::NONE) implementation unchanged.
template <int NBits, SIMDLevel SL>
struct QuantizerTurboQuantMSE : QuantizerTurboQuantMSE<NBits, SIMDLevel::NONE> {
    using QuantizerTurboQuantMSE<NBits, SIMDLevel::NONE>::
            QuantizerTurboQuantMSE;
};

/*******************************************************************
* FP16 quantizer
*******************************************************************/
Expand Down
107 changes: 107 additions & 0 deletions faiss/impl/scalar_quantizer/sq-avx2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

#include <faiss/impl/simdlib/simdlib_avx2.h>

#include <cstring>

#include <faiss/impl/scalar_quantizer/codecs.h>
#include <faiss/impl/scalar_quantizer/distance_computers.h>
#include <faiss/impl/scalar_quantizer/quantizers.h>
Expand All @@ -21,6 +23,61 @@ namespace scalar_quantizer {

using simd8float32 = faiss::simd8float32_tpl<SIMDLevel::AVX2>;

namespace {

/// Unaligned 16-bit load through memcpy (no strict-aliasing violation).
/// NOTE(review): uses host byte order while the packed code layout is
/// little-endian — assumes a little-endian host; confirm if big-endian
/// targets are ever supported.
FAISS_ALWAYS_INLINE uint16_t load_u16(const uint8_t* ptr) {
    uint16_t value;
    std::memcpy(&value, ptr, sizeof(value));
    return value;
}

/// Unaligned 32-bit load through memcpy (same caveats as load_u16).
FAISS_ALWAYS_INLINE uint32_t load_u32(const uint8_t* ptr) {
    uint32_t value;
    std::memcpy(&value, ptr, sizeof(value));
    return value;
}

/// Assemble 3 bytes into a little-endian 24-bit value. Loading byte by
/// byte avoids a 4-byte read that could run past the end of the code
/// buffer on the final 3-bit group.
FAISS_ALWAYS_INLINE uint32_t load_u24(const uint8_t* ptr) {
    return static_cast<uint32_t>(ptr[0]) |
            (static_cast<uint32_t>(ptr[1]) << 8) |
            (static_cast<uint32_t>(ptr[2]) << 16);
}

/// Expand 8 consecutive 1-bit indices (components [i, i+8); i assumed
/// to be a multiple of 8) into eight u32 lanes in [0, 1]:
/// lane k receives bit k of the loaded byte.
FAISS_ALWAYS_INLINE __m256i unpack_8x1bit_to_u32(const uint8_t* code, int i) {
    const uint32_t packed = code[static_cast<size_t>(i) >> 3];
    const __m256i shifts = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    const __m256i indices =
            _mm256_srlv_epi32(_mm256_set1_epi32(packed), shifts);
    return _mm256_and_si256(indices, _mm256_set1_epi32(0x1));
}

/// Expand 8 consecutive 2-bit indices (16 packed bits) into eight u32
/// lanes in [0, 3]. i assumed to be a multiple of 8.
FAISS_ALWAYS_INLINE __m256i unpack_8x2bit_to_u32(const uint8_t* code, int i) {
    const uint32_t packed = load_u16(code + (static_cast<size_t>(i) >> 2));
    const __m256i shifts = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
    const __m256i indices =
            _mm256_srlv_epi32(_mm256_set1_epi32(packed), shifts);
    return _mm256_and_si256(indices, _mm256_set1_epi32(0x3));
}

/// Expand 8 consecutive 3-bit indices (24 packed bits) into eight u32
/// lanes in [0, 7]. Each 8-component group spans exactly 3 bytes, hence
/// the (i / 8) * 3 byte offset. i assumed to be a multiple of 8.
FAISS_ALWAYS_INLINE __m256i unpack_8x3bit_to_u32(const uint8_t* code, int i) {
    const uint32_t packed =
            load_u24(code + ((static_cast<size_t>(i) >> 3) * 3));
    const __m256i shifts = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21);
    const __m256i indices =
            _mm256_srlv_epi32(_mm256_set1_epi32(packed), shifts);
    return _mm256_and_si256(indices, _mm256_set1_epi32(0x7));
}

/// Expand 8 consecutive 4-bit indices (32 packed bits) into eight u32
/// lanes in [0, 15]. i assumed to be a multiple of 8.
FAISS_ALWAYS_INLINE __m256i unpack_8x4bit_to_u32(const uint8_t* code, int i) {
    const uint32_t packed = load_u32(code + (static_cast<size_t>(i) >> 1));
    const __m256i shifts = _mm256_setr_epi32(0, 4, 8, 12, 16, 20, 24, 28);
    const __m256i indices =
            _mm256_srlv_epi32(_mm256_set1_epi32(packed), shifts);
    return _mm256_and_si256(indices, _mm256_set1_epi32(0xf));
}

} // namespace

/**********************************************************
* Codecs
**********************************************************/
Expand Down Expand Up @@ -168,6 +225,56 @@ struct QuantizerTemplate<
}
};

/**********************************************************
* TurboQuant MSE quantizer
**********************************************************/

#define DEFINE_TQMSE_AVX2_SPECIALIZATION(NBITS, INDEX_EXPR) \
template <> \
struct QuantizerTurboQuantMSE<NBITS, SIMDLevel::AVX2> \
: QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE> { \
using Base = QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE>; \
\
QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained) \
: Base(d, trained) { \
assert(d % 8 == 0); \
} \
\
FAISS_ALWAYS_INLINE simd8float32 \
reconstruct_8_components(const uint8_t* code, int i) const { \
const __m256i indices = (INDEX_EXPR); \
return simd8float32(_mm256_i32gather_ps( \
this->centroids, indices, sizeof(float))); \
} \
}

DEFINE_TQMSE_AVX2_SPECIALIZATION(1, unpack_8x1bit_to_u32(code, i));
DEFINE_TQMSE_AVX2_SPECIALIZATION(2, unpack_8x2bit_to_u32(code, i));
DEFINE_TQMSE_AVX2_SPECIALIZATION(3, unpack_8x3bit_to_u32(code, i));
DEFINE_TQMSE_AVX2_SPECIALIZATION(4, unpack_8x4bit_to_u32(code, i));

#undef DEFINE_TQMSE_AVX2_SPECIALIZATION

/// AVX2 specialization for the 8-bit width: each component is a whole
/// byte, so no bit unpacking is needed — load 8 code bytes, zero-extend
/// them to 32-bit lanes and gather the matching centroid values.
template <>
struct QuantizerTurboQuantMSE<8, SIMDLevel::AVX2>
        : QuantizerTurboQuantMSE<8, SIMDLevel::NONE> {
    using Base = QuantizerTurboQuantMSE<8, SIMDLevel::NONE>;

    QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained)
            : Base(d, trained) {
        // this path reconstructs components 8 at a time
        assert(d % 8 == 0);
    }

    /// Reconstruct components [i, i+8) as centroids[code[i + k]] per lane.
    FAISS_ALWAYS_INLINE simd8float32
    reconstruct_8_components(const uint8_t* code, int i) const {
        // reinterpret_cast (not a C-style cast) per the named-cast rule;
        // _mm_loadl_epi64 performs an unaligned 8-byte load.
        const __m128i packed = _mm_loadl_epi64(
                reinterpret_cast<const __m128i*>(
                        code + static_cast<size_t>(i)));
        const __m256i indices = _mm256_cvtepu8_epi32(packed);
        return simd8float32(
                _mm256_i32gather_ps(this->centroids, indices, sizeof(float)));
    }
};

/**********************************************************
* FP16 Quantizer
**********************************************************/
Expand Down
Loading
Loading