Skip to content

Commit 0ccaf40

Browse files
authored
add compression support in GetMsgPackedValueAsString (#327)
Adds support for retrieving a compressed version of the msgpacked value. If the value is already stored compressed, the stored compressed value is used directly.
1 parent 74bdaa4 commit 0ccaf40

32 files changed

Lines changed: 481 additions & 69 deletions

.github/workflows/keyvi.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ jobs:
3333
brew update
3434
# workaround for https://github.com/actions/setup-python/issues/577
3535
brew list -1 | grep python | while read formula; do brew unlink $formula; brew link --overwrite $formula; done
36-
brew install zlib snappy boost
36+
brew install zlib snappy boost@1.85
37+
brew link boost@1.85
3738
- name: checkout from git
3839
uses: actions/checkout@v4
3940

.github/workflows/python-cibuildwheel.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ jobs:
6464
run: |
6565
brew update && \
6666
brew list -1 | grep python | while read formula; do brew unlink $formula; brew link --overwrite $formula; done && \
67-
brew install ccache zlib snappy boost
67+
brew install ccache zlib snappy boost@1.85
68+
brew link boost@1.85
6869
6970
- name: set mac deployment target X64
7071
if: runner.os == 'macOS' && runner.arch == 'X64'
@@ -102,7 +103,7 @@ jobs:
102103
CIBW_BEFORE_BUILD: pip install -r python/requirements.txt
103104

104105
# testing
105-
CIBW_TEST_REQUIRES: pytest
106+
CIBW_TEST_REQUIRES: pytest python-snappy zstd
106107
CIBW_TEST_COMMAND: >
107108
python -m pytest {package}/tests &&
108109
python -m pytest {package}/integration-tests
@@ -139,6 +140,7 @@ jobs:
139140
python setup.py sdist -d wheelhouse && \
140141
python -m pip uninstall -y autowrap && \
141142
python -m pip install wheelhouse/*.tar.gz -v && \
143+
python -m pip install python-snappy zstd && \
142144
python -m pytest tests && \
143145
python -m pip uninstall -y keyvi
144146

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ else ()
7272
message(FATAL_ERROR "Can not find Boost")
7373
endif ()
7474
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
75-
set(_KEYVI_LINK_LIBRARIES_STATIC "${_KEYVI_LINK_LIBRARIES_STATIC} boost_program_options boost_iostreams boost_filesystem boost_system boost_regex boost_thread")
75+
set(_KEYVI_LINK_LIBRARIES_STATIC "${_KEYVI_LINK_LIBRARIES_STATIC} boost_program_options boost_iostreams boost_filesystem boost_system boost_regex boost_thread-mt")
7676
else ()
7777
set(_KEYVI_LINK_LIBRARIES_DYNAMIC "${_KEYVI_LINK_LIBRARIES_DYNAMIC} boost_program_options boost_iostreams boost_filesystem boost_system boost_regex boost_thread")
7878
endif ()

keyvi/bin/keyvi_c/c_api.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,29 @@ keyvi_bytes keyvi_match_get_msgpacked_value(const struct keyvi_match* match) {
182182
return keyvi_bytes{data_size, static_cast<const uint8_t*>(data_ptr)};
183183
}
184184

185+
keyvi_bytes keyvi_match_get_msgpacked_value_compressed(const struct keyvi_match* match,
186+
keyvi::compression::CompressionAlgorithm compression) {
187+
const keyvi_bytes empty_keyvi_bytes{0, nullptr};
188+
189+
if (!match->obj_) {
190+
return empty_keyvi_bytes;
191+
}
192+
193+
const std::string compressed_value = match->obj_->GetMsgPackedValueAsString(compression);
194+
195+
const size_t data_size = compressed_value.size();
196+
if (0 == data_size) {
197+
return empty_keyvi_bytes;
198+
}
199+
auto* data_ptr = malloc(data_size);
200+
if (nullptr == data_ptr) {
201+
return empty_keyvi_bytes;
202+
}
203+
memcpy(data_ptr, compressed_value.c_str(), data_size);
204+
205+
return keyvi_bytes{data_size, static_cast<const uint8_t*>(data_ptr)};
206+
}
207+
185208
char* keyvi_match_get_matched_string(const keyvi_match* match) {
186209
return std_2_c_string(match->obj_ ? match->obj_->GetMatchedString() : "");
187210
}

keyvi/include/keyvi/c_api/c_api.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ extern "C" {
3232
#include <stddef.h>
3333
#include <stdint.h>
3434

35+
#include "keyvi/compression/compression_algorithm.h"
36+
3537
struct keyvi_dictionary;
3638
struct keyvi_match;
3739
struct keyvi_match_iterator;
@@ -92,6 +94,9 @@ char* keyvi_match_get_value_as_string(const struct keyvi_match*);
9294

9395
keyvi_bytes keyvi_match_get_msgpacked_value(const struct keyvi_match*);
9496

97+
keyvi_bytes keyvi_match_get_msgpacked_value_compressed(const struct keyvi_match*,
98+
keyvi::compression::CompressionAlgorithm);
99+
95100
char* keyvi_match_get_matched_string(const struct keyvi_match*);
96101

97102
//////////////////////
keyvi/include/keyvi/compression/compression_algorithm.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/* * keyvi - A key value store.
2+
*
3+
* Copyright 2025 Hendrik Muhs<hendrik.muhs@gmail.com>
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
#ifndef KEYVI_COMPRESSION_COMPRESSION_ALGORITHM_H_
19+
#define KEYVI_COMPRESSION_COMPRESSION_ALGORITHM_H_
20+
21+
namespace keyvi {
22+
namespace compression {
23+
24+
enum CompressionAlgorithm {
25+
NO_COMPRESSION = 0,
26+
ZLIB_COMPRESSION = 1,
27+
SNAPPY_COMPRESSION = 2,
28+
ZSTD_COMPRESSION = 3,
29+
};
30+
31+
} /* namespace compression */
32+
} /* namespace keyvi */
33+
34+
#endif // KEYVI_COMPRESSION_COMPRESSION_ALGORITHM_H_

keyvi/include/keyvi/compression/compression_selector.h

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,13 @@
2525
#ifndef KEYVI_COMPRESSION_COMPRESSION_SELECTOR_H_
2626
#define KEYVI_COMPRESSION_COMPRESSION_SELECTOR_H_
2727

28+
#include <memory>
2829
#include <string>
2930

3031
#include <boost/algorithm/string.hpp>
3132
#include <boost/lexical_cast.hpp>
3233

34+
#include "keyvi/compression/compression_algorithm.h"
3335
#include "keyvi/compression/compression_strategy.h"
3436
#include "keyvi/compression/snappy_compression_strategy.h"
3537
#include "keyvi/compression/zlib_compression_strategy.h"
@@ -64,8 +66,8 @@ inline CompressionStrategy* compression_strategy(const std::string& name = "") {
6466
typedef std::string (*decompress_func_t)(const std::string&);
6567
typedef void (CompressionStrategy::*compress_mem_fn_t)(buffer_t*, const char*, size_t);
6668

67-
inline decompress_func_t decompressor_by_code(const std::string& s) {
68-
switch (s[0]) {
69+
inline decompress_func_t decompressor_by_code(const CompressionAlgorithm algorithm) {
70+
switch (algorithm) {
6971
case NO_COMPRESSION:
7072
TRACE("unpack uncompressed string");
7173
return RawCompressionStrategy::DoDecompress;
@@ -79,8 +81,29 @@ inline decompress_func_t decompressor_by_code(const std::string& s) {
7981
TRACE("unpack zstd compressed string");
8082
return ZstdCompressionStrategy::DoDecompress;
8183
default:
82-
throw std::invalid_argument("Invalid compression code " +
83-
boost::lexical_cast<std::string>(static_cast<int>(s[0])));
84+
throw std::invalid_argument("Invalid compression algorithm " +
85+
boost::lexical_cast<std::string>(static_cast<int>(algorithm)));
86+
}
87+
}
88+
89+
inline decompress_func_t decompressor_from_string(const std::string& s) {
90+
return decompressor_by_code(static_cast<CompressionAlgorithm>(s[0]));
91+
}
92+
93+
/** Returns an instance of a compression strategy by enum. */
94+
inline compression_strategy_t compression_strategy_by_code(const CompressionAlgorithm algorithm) {
95+
switch (algorithm) {
96+
case NO_COMPRESSION:
97+
return std::make_unique<RawCompressionStrategy>();
98+
case ZLIB_COMPRESSION:
99+
return std::make_unique<ZlibCompressionStrategy>();
100+
case SNAPPY_COMPRESSION:
101+
return std::make_unique<SnappyCompressionStrategy>();
102+
case ZSTD_COMPRESSION:
103+
return std::make_unique<ZstdCompressionStrategy>();
104+
default:
105+
throw std::invalid_argument("Invalid compression algorithm " +
106+
boost::lexical_cast<std::string>(static_cast<int>(algorithm)));
84107
}
85108
}
86109

keyvi/include/keyvi/compression/compression_strategy.h

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -26,23 +26,18 @@
2626
#define KEYVI_COMPRESSION_COMPRESSION_STRATEGY_H_
2727

2828
#include <cstring>
29+
#include <memory>
2930
#include <string>
3031
#include <vector>
3132

33+
#include "keyvi/compression/compression_algorithm.h"
3234
#include "keyvi/dictionary/fsa/internal/constants.h"
3335

3436
namespace keyvi {
3537
namespace compression {
3638

37-
enum CompressionCode {
38-
NO_COMPRESSION = 0,
39-
ZLIB_COMPRESSION = 1,
40-
SNAPPY_COMPRESSION = 2,
41-
ZSTD_COMPRESSION = 3,
42-
};
43-
4439
// buffer type which is realloc-able
45-
typedef std::vector<char> buffer_t;
40+
using buffer_t = std::vector<char>;
4641

4742
/**
4843
* The base class of every compression strategy.
@@ -64,6 +59,12 @@ struct CompressionStrategy {
6459
return std::string(buf.data(), buf.size());
6560
}
6661

62+
inline std::string CompressWithoutHeader(const std::string& raw) {
63+
buffer_t buf;
64+
Compress(&buf, raw.data(), raw.size());
65+
return std::string(buf.data() + 1, buf.size() - 1);
66+
}
67+
6768
/**
6869
* By the time this function is called, the length field added in Compress()
6970
* will have been removed.
@@ -77,6 +78,8 @@ struct CompressionStrategy {
7778
virtual uint64_t GetFileVersionMin() const = 0;
7879
};
7980

81+
using compression_strategy_t = std::unique_ptr<CompressionStrategy>;
82+
8083
/**
8184
* A compression strategy that does almost nothing; i.e. it only adds
8285
* the length field.
@@ -90,12 +93,6 @@ struct RawCompressionStrategy final : public CompressionStrategy {
9093
std::memcpy(buffer->data() + 1, raw, raw_size);
9194
}
9295

93-
static inline std::string DoCompress(const char* raw, size_t raw_size) {
94-
buffer_t buf;
95-
DoCompress(&buf, raw, raw_size);
96-
return std::string(buf.data(), buf.size());
97-
}
98-
9996
inline std::string Decompress(const std::string& compressed) { return DoDecompress(compressed); }
10097

10198
static inline std::string DoDecompress(const std::string& compressed) { return compressed.substr(1); }

keyvi/include/keyvi/compression/snappy_compression_strategy.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,6 @@ struct SnappyCompressionStrategy final : public CompressionStrategy {
4747
buffer->resize(output_length + 1);
4848
}
4949

50-
static inline std::string DoCompress(const char* raw, size_t raw_size) {
51-
buffer_t buf;
52-
DoCompress(&buf, raw, raw_size);
53-
return std::string(buf.data(), buf.size());
54-
}
55-
5650
inline std::string Decompress(const std::string& compressed) { return DoDecompress(compressed); }
5751

5852
static std::string DoDecompress(const std::string& compressed) {

keyvi/include/keyvi/dictionary/fsa/automata.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,13 @@ class Automata final {
394394
return value_store_reader_->GetRawValueAsString(state_value);
395395
}
396396

397+
std::string GetMsgPackedValueAsString(uint64_t state_value,
398+
const compression::CompressionAlgorithm compression_algorithm =
399+
compression::CompressionAlgorithm::NO_COMPRESSION) const {
400+
assert(value_store_reader_);
401+
return value_store_reader_->GetMsgPackedValueAsString(state_value, compression_algorithm);
402+
}
403+
397404
std::string GetStatistics() const {
398405
return dictionary_properties_->GetStatistics();
399406
}

0 commit comments

Comments
 (0)