From 17fd016cb60ec77223ccc9616b064e1a957368fb Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Fri, 5 Jun 2026 20:26:40 +0800 Subject: [PATCH 01/38] record_rotater --- doc/draft.md | 107 +++++++++++++++ src/core/quantizer/CMakeLists.txt | 2 +- src/core/quantizer/quantizer_params.h | 6 +- src/core/quantizer/record_rotater.cc | 184 ++++++++++++++++++++++++++ src/core/quantizer/record_rotater.h | 104 +++++++++++++++ 5 files changed, 401 insertions(+), 2 deletions(-) create mode 100644 doc/draft.md create mode 100644 src/core/quantizer/record_rotater.cc create mode 100644 src/core/quantizer/record_rotater.h diff --git a/doc/draft.md b/doc/draft.md new file mode 100644 index 000000000..0192aaf69 --- /dev/null +++ b/doc/draft.md @@ -0,0 +1,107 @@ +## 量化方案新增旋转功能 + +1. 动机: +Int8量化采用 per-vector min-max 量化,即用每个向量自身的最小/最大值来确定量化区间 [-127, 127],误差主要来自: + - 维度间的值分布不均匀:某些维度的值远大于其他维度,导致量化区间被少数极端维度"撑开",大部分维度的量化精度被浪费。 + - 非各向同性分布:真实embedding数据的能量往往集中在少数方向上。 +随机旋转在保持距离不变的同时,会将向量的能量均匀分散到所有维度,使每个维度的值分布更接近高斯分布,从而减小per-vector min-max量化的量化误差。 + +2. 修改类型: +一种可选的量化参数 +```yaml +// 构建侧新增量化配置选项: +ConverterParams: + integer_streaming.converter.enable_rotate: !!bool true +// 搜索侧不做变化 +``` +``` +Build 阶段: + Converter::init() → 读取 enable_rotate=true,创建 rabitqlib::Rotator + Converter::transform() → 每条向量: rotator->rotate(x) → [normalize] → int8 量化 + Converter::dump() → 将 rotator 数据写入独立 segment + Streamer::dump() → 写入 meta + HNSW 图数据(不感知 converter) + meta.set_reformer() → reformer_params 中写入 enable_rotate=true + +Search 阶段: + Index::Open() → reformer_->load(storage_) → 从 segment 加载 rotator + Reformer::transform() → 每条 query: rotator->rotate(q) → [normalize] → int8 量化 +``` +## Int8StreamingConverter具体实现 + +### 1: 新增参数定义 +```cpp +//! IntegerStreamingConverter +static const std::string INTEGER_STREAMING_CONVERTER_ENABLE_ROTATE = + "integer_streaming.converter.enable_rotate"; +//! 
IntegerStreamingReformer +static const std::string INTEGER_STREAMING_REFORMER_ENABLE_ROTATE = + "integer_streaming.reformer.enable_rotate"; +``` + +### 2. 新增矩阵旋转工具类 [DONE] +1. 便于拓展,将旋转功能抽象到统一的文件`/root/code/zvec/src/core/quantizer/record_rotater.h`和`record_rotater.cc`中(pimpl模式,rabitqlib依赖仅在.cc中) +2. 实现方式参考/root/code/zvec/src/core/algorithm/hnsw_rabitq中的旋转方式,具体实现调用第三方库/root/code/zvec/thirdparty/RaBitQ-Library +3. 包含功能: + 1. O(d \log d)复杂度的快速旋转 + 2. 保存矩阵(通过IndexDumper写入segment,含CRC + 32字节对齐) + 3. 加载矩阵(通过IndexStorage读取segment,含CRC校验) +```cpp +class RecordRotator { + public: + RecordRotator(); + ~RecordRotator(); + + //! Move-only (pimpl with unique_ptr) + RecordRotator(RecordRotator &&) noexcept; + RecordRotator &operator=(RecordRotator &&) noexcept; + RecordRotator(const RecordRotator &) = delete; + RecordRotator &operator=(const RecordRotator &) = delete; + + //! Initialize the rotator + //! @param dimension original vector dimension + //! @param padded_dim padded dimension (rounded up for SIMD alignment) + //! @param rotator_type rotation algorithm (default: FhtKac) + void init(size_t dimension, size_t padded_dim, + RecordRotatorType rotator_type = RecordRotatorType::FhtKac); + + //! Rotate a single vector + //! @param in input vector of size >= dimension + //! @param out output buffer of size >= padded_dim + void rotate(const float *in, float *out) const; + + //! Rotate a single vector into a managed buffer + //! @param in input vector of size >= dimension + //! @return vector of size padded_dim containing rotated result + std::vector rotate(const float *in) const; + + //! Return the serialized size of the rotator in bytes + size_t dump_bytes() const; + + //! Dump the rotator data to an IndexDumper as a named segment. + //! Writes the raw rotator bytes, appends padding for 32-byte alignment, + //! and registers the segment meta (id, size, padding, crc). 
+ int dump(const IndexDumper::Pointer &dumper, + const std::string &seg_id = RECORD_ROTATOR_SEG_ID) const; + + //! Load the rotator data from an IndexStorage segment. + //! Reads the serialized rotator bytes and reconstructs the rotator. + int load(IndexStorage::Pointer storage, + const std::string &seg_id = RECORD_ROTATOR_SEG_ID); + + //! Return the original dimension + size_t dimension() const; + + //! Return the padded dimension + size_t padded_dim() const; + + //! Return the rotator type + RecordRotatorType rotator_type() const; + + //! Check if the rotator is initialized + bool initialized() const; + + private: + struct Impl; + std::unique_ptr impl_; +}; +``` diff --git a/src/core/quantizer/CMakeLists.txt b/src/core/quantizer/CMakeLists.txt index 80b4f612a..459b8b88c 100644 --- a/src/core/quantizer/CMakeLists.txt +++ b/src/core/quantizer/CMakeLists.txt @@ -10,7 +10,7 @@ cc_library( NAME core_quantizer STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc - LIBS zvec_ailego core_framework + LIBS zvec_ailego core_framework rabitqlib INCS . ${PROJECT_ROOT_DIR}/src/core LDFLAGS "${CORE_QUANTIZER_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" diff --git a/src/core/quantizer/quantizer_params.h b/src/core/quantizer/quantizer_params.h index 622361660..9b34a4b30 100644 --- a/src/core/quantizer/quantizer_params.h +++ b/src/core/quantizer/quantizer_params.h @@ -108,12 +108,16 @@ static const std::string COSINE_REFORMER_FORCED_HALF_FLOAT = //! IntegerStreamingConverter static const std::string INTEGER_STREAMING_CONVERTER_ENABLE_NORMALIZE = "integer_streaming.converter.enable_normalize"; +static const std::string INTEGER_STREAMING_CONVERTER_ENABLE_ROTATE = + "integer_streaming.converter.enable_rotate"; -//! IntegerStreamingConverter +//! 
IntegerStreamingReformer static const std::string INTEGER_STREAMING_REFORMER_ENABLE_NORMALIZE = "integer_streaming.reformer.enable_normalize"; static const std::string INTEGER_STREAMING_REFORMER_IS_EUCLIDEAN = "integer_streaming.reformer.is_euclidean"; +static const std::string INTEGER_STREAMING_REFORMER_ENABLE_ROTATE = + "integer_streaming.reformer.enable_rotate"; //! DoubleBitConverter static const std::string DOUBLE_BIT_CONVERTER_TRAIN_SAMPLE_COUNT = diff --git a/src/core/quantizer/record_rotater.cc b/src/core/quantizer/record_rotater.cc new file mode 100644 index 000000000..1e55d9f89 --- /dev/null +++ b/src/core/quantizer/record_rotater.cc @@ -0,0 +1,184 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "record_rotater.h" +#include +#include +#include +#include "zvec/core/framework/index_error.h" +#include "zvec/core/framework/index_logger.h" + +namespace zvec { +namespace core { + +// All rabitqlib types are confined to this translation unit via pimpl. +struct RecordRotator::Impl { + size_t dimension{0}; + size_t padded_dim{0}; + RecordRotatorType type{RecordRotatorType::FhtKac}; + std::unique_ptr> rotator; + + static rabitqlib::RotatorType to_rabitq(RecordRotatorType t) { + return t == RecordRotatorType::Matrix + ? 
rabitqlib::RotatorType::MatrixRotator + : rabitqlib::RotatorType::FhtKacRotator; + } +}; + +RecordRotator::RecordRotator() : impl_(std::make_unique()) {} + +RecordRotator::~RecordRotator() = default; + +RecordRotator::RecordRotator(RecordRotator &&) noexcept = default; +RecordRotator &RecordRotator::operator=(RecordRotator &&) noexcept = default; + +void RecordRotator::init(size_t dimension, size_t padded_dim, + RecordRotatorType rotator_type) { + impl_->dimension = dimension; + impl_->padded_dim = padded_dim; + impl_->type = rotator_type; + impl_->rotator.reset(rabitqlib::choose_rotator( + dimension, Impl::to_rabitq(rotator_type), padded_dim)); +} + +void RecordRotator::rotate(const float *in, float *out) const { + impl_->rotator->rotate(in, out); +} + +std::vector RecordRotator::rotate(const float *in) const { + std::vector out(impl_->padded_dim); + impl_->rotator->rotate(in, out.data()); + return out; +} + +size_t RecordRotator::dump_bytes() const { + return impl_->rotator->dump_bytes(); +} + +int RecordRotator::dump(const IndexDumper::Pointer &dumper, + const std::string &seg_id) const { + if (!dumper) { + LOG_ERROR("RecordRotator::dump: null dumper"); + return IndexError_InvalidArgument; + } + if (!impl_->rotator) { + LOG_ERROR("RecordRotator::dump: rotator not initialized"); + return IndexError_NoReady; + } + + auto align_size = [](size_t size) -> size_t { + return (size + 0x1F) & (~0x1F); + }; + + // Serialize rotator to buffer + const size_t data_size = impl_->rotator->dump_bytes(); + std::vector buffer(data_size); + impl_->rotator->save(buffer.data()); + + // Write rotator data + size_t written = dumper->write(buffer.data(), data_size); + if (written != data_size) { + LOG_ERROR("RecordRotator::dump: write failed, written=%zu, expected=%zu", + written, data_size); + return IndexError_WriteData; + } + uint32_t crc = ailego::Crc32c::Hash(buffer.data(), data_size, 0); + + // Write padding for 32-byte alignment + size_t padding_size = align_size(data_size) - 
data_size; + if (padding_size > 0) { + std::string padding(padding_size, '\0'); + if (dumper->write(padding.data(), padding_size) != padding_size) { + LOG_ERROR("RecordRotator::dump: padding write failed"); + return IndexError_WriteData; + } + } + + // Register segment meta + int ret = dumper->append(seg_id, data_size, padding_size, crc); + if (ret != 0) { + LOG_ERROR("RecordRotator::dump: append segment meta failed, ret=%d", ret); + return ret; + } + + LOG_DEBUG("RecordRotator::dump done: seg=%s, data_size=%zu, padding=%zu", + seg_id.c_str(), data_size, padding_size); + return 0; +} + +int RecordRotator::load(IndexStorage::Pointer storage, + const std::string &seg_id) { + if (!storage) { + LOG_ERROR("RecordRotator::load: null storage"); + return IndexError_InvalidArgument; + } + + auto segment = storage->get(seg_id); + if (!segment) { + LOG_ERROR("RecordRotator::load: segment '%s' not found", seg_id.c_str()); + return IndexError_InvalidFormat; + } + + // Read the rotator data from the segment + const size_t data_size = segment->data_size(); + IndexStorage::MemoryBlock block; + size_t read_size = segment->read(0, block, data_size); + if (read_size != data_size) { + LOG_ERROR("RecordRotator::load: read failed, read=%zu, expected=%zu", + read_size, data_size); + return IndexError_InvalidFormat; + } + + // Verify CRC if available + uint32_t expected_crc = segment->data_crc(); + if (expected_crc != 0) { + uint32_t actual_crc = ailego::Crc32c::Hash(block.data(), data_size, 0); + if (actual_crc != expected_crc) { + LOG_ERROR( + "RecordRotator::load: CRC mismatch, expected=0x%08x, actual=0x%08x", + expected_crc, actual_crc); + return IndexError_InvalidFormat; + } + } + + // Reconstruct the rotator from serialized data + impl_->rotator.reset(rabitqlib::choose_rotator( + impl_->dimension, Impl::to_rabitq(impl_->type), impl_->padded_dim)); + impl_->rotator->load(reinterpret_cast(block.data())); + + LOG_DEBUG( + "RecordRotator::load done: seg=%s, dim=%zu, padded_dim=%zu, " + 
"data_size=%zu", + seg_id.c_str(), impl_->dimension, impl_->padded_dim, data_size); + return 0; +} + +size_t RecordRotator::dimension() const { + return impl_->dimension; +} + +size_t RecordRotator::padded_dim() const { + return impl_->padded_dim; +} + +RecordRotatorType RecordRotator::rotator_type() const { + return impl_->type; +} + +bool RecordRotator::initialized() const { + return impl_->rotator != nullptr; +} + +} // namespace core +} // namespace zvec diff --git a/src/core/quantizer/record_rotater.h b/src/core/quantizer/record_rotater.h new file mode 100644 index 000000000..30377e0d0 --- /dev/null +++ b/src/core/quantizer/record_rotater.h @@ -0,0 +1,104 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include "zvec/core/framework/index_dumper.h" +#include "zvec/core/framework/index_storage.h" + +namespace zvec { +namespace core { + +//! Segment ID used when dumping/loading the rotator data +inline const std::string RECORD_ROTATOR_SEG_ID{"integer_streaming.rotator"}; + +//! Rotator type exposed without rabitqlib dependency +enum class RecordRotatorType : uint8_t { + FhtKac = 0, //!< O(d log d) FHT-based Kac random rotation (default) + Matrix = 1, //!< O(d^2) explicit random matrix rotation +}; + +/*! RecordRotator wraps rabitqlib::Rotator for per-vector rotation. 
+ * + * All rabitqlib types are hidden behind a pimpl to avoid leaking + * rabitqlib headers to consumers of this class. + * + * Provides O(d log d) fast rotation (FHT-based Kac random rotation), + * as well as serialization (save/load) of the rotation matrix. + * Used by IntegerStreamingConverter/Reformer when enable_rotate is true. + */ +class RecordRotator { + public: + RecordRotator(); + ~RecordRotator(); + + //! Move-only (pimpl with unique_ptr) + RecordRotator(RecordRotator &&) noexcept; + RecordRotator &operator=(RecordRotator &&) noexcept; + RecordRotator(const RecordRotator &) = delete; + RecordRotator &operator=(const RecordRotator &) = delete; + + //! Initialize the rotator + //! @param dimension original vector dimension + //! @param padded_dim padded dimension (rounded up for SIMD alignment) + //! @param rotator_type rotation algorithm (default: FhtKac) + void init(size_t dimension, size_t padded_dim, + RecordRotatorType rotator_type = RecordRotatorType::FhtKac); + + //! Rotate a single vector + //! @param in input vector of size >= dimension + //! @param out output buffer of size >= padded_dim + void rotate(const float *in, float *out) const; + + //! Rotate a single vector into a managed buffer + //! @param in input vector of size >= dimension + //! @return vector of size padded_dim containing rotated result + std::vector rotate(const float *in) const; + + //! Return the serialized size of the rotator in bytes + size_t dump_bytes() const; + + //! Dump the rotator data to an IndexDumper as a named segment. + //! Writes the raw rotator bytes, appends padding for 32-byte alignment, + //! and registers the segment meta (id, size, padding, crc). + int dump(const IndexDumper::Pointer &dumper, + const std::string &seg_id = RECORD_ROTATOR_SEG_ID) const; + + //! Load the rotator data from an IndexStorage segment. + //! Reads the serialized rotator bytes and reconstructs the rotator. 
+ int load(IndexStorage::Pointer storage, + const std::string &seg_id = RECORD_ROTATOR_SEG_ID); + + //! Return the original dimension + size_t dimension() const; + + //! Return the padded dimension + size_t padded_dim() const; + + //! Return the rotator type + RecordRotatorType rotator_type() const; + + //! Check if the rotator is initialized + bool initialized() const; + + private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace core +} // namespace zvec From 72e2592394fc7bd04c1ae3522562177d17431eb9 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Sun, 7 Jun 2026 23:44:35 +0800 Subject: [PATCH 02/38] init --- config/construct.yaml | 23 +++++ config/search_baseline.yaml | 19 ++++ config/search_current.yaml | 19 ++++ doc/draft.md | 66 +++++++++++--- src/core/interface/index.cc | 9 +- .../quantizer/integer_quantizer_converter.cc | 62 +++++++++++-- .../quantizer/integer_quantizer_reformer.cc | 53 ++++++++++- src/core/quantizer/record_rotater.cc | 88 ++++++++++++++++--- src/core/quantizer/record_rotater.h | 21 +++-- 9 files changed, 315 insertions(+), 45 deletions(-) create mode 100644 config/construct.yaml create mode 100644 config/search_baseline.yaml create mode 100644 config/search_current.yaml diff --git a/config/construct.yaml b/config/construct.yaml new file mode 100644 index 000000000..45084d962 --- /dev/null +++ b/config/construct.yaml @@ -0,0 +1,23 @@ +BuilderCommon: + BuilderClass: HnswStreamer + BuildFile: /root/data/gist/gist_train.vecs + # NeedTrain: true #是否需要走train流程 + # TrainFile: /root/data/cohere/1m/cohere_train_vector_1m.norm.zvec.vecs + DumpPath: ./flase.tmp + IndexPath: /root/data/gist/index/gist.random2.l2.fp32.index + + ThreadCount: 16 + + MetricName: SquaredEuclidean + # ConverterName: CosineFp16Converter + ConverterName: Int8StreamingConverter + + DisableIdMap: true + +ConverterParams: + integer_streaming.converter.enable_rotate: !!bool true + +BuilderParams: #各Builder方法的params参数 + 
proxima.hnsw.streamer.efconstruction: !!int 500 + proxima.hnsw.streamer.use_id_map: !!bool false + proxima.hnsw.streamer.max_neighbor_count: !!int 15 \ No newline at end of file diff --git a/config/search_baseline.yaml b/config/search_baseline.yaml new file mode 100644 index 000000000..8b750f8e6 --- /dev/null +++ b/config/search_baseline.yaml @@ -0,0 +1,19 @@ +IndexCommon: + IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":960,"index_type":"kHNSW","metric_type":"kL2sq","quantizer_param":{"type":"kInt8"},"m":15}' + IndexPath: /root/data/gist/index/gist.l2.fp32.index + TopK: 100 #指定返回topk,recall时多topk用逗号隔开,bench时会使用最大值 + QueryFile: /root/data/gist/query.txt + QueryType: float #指定query向量类型,需要与索引类型一致(FP16索引使用float查询,内部会自动转换) + QueryFirstSep: ";" #指定query第一分隔符,用于分割key和特征 + QuerySecondSep: " " #指定query第二分隔符,用于分割特征各维度 + GroundTruthFile: /root/data/gist/ground_truth.txt + + RecallThreadCount: 16 + BenchThreadCount: 16 #指定bench并发数(召回并发直接使用cpu核数) + BenchIterCount: 100000 #指定bench执行条目数,当query量较少时会对query重复使用 + CompareById: true + LogLevel: info + +QueryConfig: + QueryParam: '{"index_type":"kHNSW","ef_search":180}' + diff --git a/config/search_current.yaml b/config/search_current.yaml new file mode 100644 index 000000000..e1fb907c5 --- /dev/null +++ b/config/search_current.yaml @@ -0,0 +1,19 @@ +IndexCommon: + IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":960,"index_type":"kHNSW","metric_type":"kL2sq","quantizer_param":{"type":"kInt8"},"m":15}' + IndexPath: /root/data/gist/index/gist.random2.l2.fp32.index + TopK: 100 #指定返回topk,recall时多topk用逗号隔开,bench时会使用最大值 + QueryFile: /root/data/gist/query.txt + QueryType: float #指定query向量类型,需要与索引类型一致(FP16索引使用float查询,内部会自动转换) + QueryFirstSep: ";" #指定query第一分隔符,用于分割key和特征 + QuerySecondSep: " " #指定query第二分隔符,用于分割特征各维度 + GroundTruthFile: /root/data/gist/ground_truth.txt + + RecallThreadCount: 16 + BenchThreadCount: 16 #指定bench并发数(召回并发直接使用cpu核数) + BenchIterCount: 100000 #指定bench执行条目数,当query量较少时会对query重复使用 
+ CompareById: true + LogLevel: info + +QueryConfig: + QueryParam: '{"index_type":"kHNSW","ef_search":180}' + diff --git a/doc/draft.md b/doc/draft.md index 0192aaf69..e7142f914 100644 --- a/doc/draft.md +++ b/doc/draft.md @@ -28,7 +28,7 @@ Search 阶段: ``` ## Int8StreamingConverter具体实现 -### 1: 新增参数定义 +### 1: 新增参数定义 [DONE] ```cpp //! IntegerStreamingConverter static const std::string INTEGER_STREAMING_CONVERTER_ENABLE_ROTATE = @@ -43,8 +43,9 @@ static const std::string INTEGER_STREAMING_REFORMER_ENABLE_ROTATE = 2. 实现方式参考/root/code/zvec/src/core/algorithm/hnsw_rabitq中的旋转方式,具体实现调用第三方库/root/code/zvec/thirdparty/RaBitQ-Library 3. 包含功能: 1. O(d \log d)复杂度的快速旋转 - 2. 保存矩阵(通过IndexDumper写入segment,含CRC + 32字节对齐) - 3. 加载矩阵(通过IndexStorage读取segment,含CRC校验) + 2. dump:保存矩阵(通过IndexDumper写入segment,含自描述Header + rabitqlib blob + CRC + 32字节对齐) + 3. open:从Storage加载序列化旋转器(通过IndexStorage读取segment,从Header解析type/dim/padded_dim,无需预先init,含CRC校验) + 4. load:加载用户自定义旋转矩阵(MatrixRotator,行主序 dim x padded_dim) ```cpp class RecordRotator { public: @@ -74,20 +75,27 @@ class RecordRotator { //! @return vector of size padded_dim containing rotated result std::vector rotate(const float *in) const; - //! Return the serialized size of the rotator in bytes + //! Return the serialized size of the rotator in bytes (header + blob) size_t dump_bytes() const; - //! Dump the rotator data to an IndexDumper as a named segment. - //! Writes the raw rotator bytes, appends padding for 32-byte alignment, - //! and registers the segment meta (id, size, padding, crc). + //! Dump the rotator to an IndexDumper as a named segment. + //! Format: [Header: type(1B)|origin_dim(4B)|padded_dim(4B)] [rabitqlib blob] + //! Appends padding for 32-byte alignment, registers segment meta (id, size, padding, crc). int dump(const IndexDumper::Pointer &dumper, const std::string &seg_id = RECORD_ROTATOR_SEG_ID) const; - //! Load the rotator data from an IndexStorage segment. - //! 
Reads the serialized rotator bytes and reconstructs the rotator. - int load(IndexStorage::Pointer storage, + //! Open the rotator from an IndexStorage segment (self-describing, no init needed). + //! Parses header to get type/dimension/padded_dim, then reconstructs the rotator. + int open(IndexStorage::Pointer storage, const std::string &seg_id = RECORD_ROTATOR_SEG_ID); + //! Load a user-specified rotation matrix. + //! Always uses MatrixRotator internally. + //! @param matrix row-major matrix of shape dimension x padded_dim + //! @param dimension original vector dimension + //! @param padded_dim padded dimension (must be multiple of 64) + int load(const float *matrix, size_t dimension, size_t padded_dim); + //! Return the original dimension size_t dimension() const; @@ -105,3 +113,41 @@ class RecordRotator { std::unique_ptr impl_; }; ``` +### 3. 修改 IntegerStreaming 的 Converter 和 Reformer [DONE] + +1. 修改文件:`integer_quantizer_converter.cc` 和 `integer_quantizer_reformer.cc` +2. Converter 修改: + 1. 新增 `#include "record_rotater.h"` 和成员变量 `enable_rotate_`, `rotator_`(无 `padded_dim_`,由 `rotator_->padded_dim()` 派生) + 2. `init()` 读取 `enable_rotate` 标记,创建 FhtKacRotator(padded_dim=向上取64倍数),将 `enable_rotate` 写入 reformer_params + 3. `transform()` 将 `rotator_` 传入 Holder,Holder 通过 `rotator_->padded_dim()` 获取对齐维度 + 4. `dump()` 调用 `rotator_->dump(dumper)` 保存旋转矩阵(自描述格式) + 5. Holder Iterator 的 `encode_record()` 管线:rotate → normalize → quantize +3. Reformer 修改: + 1. `init()` 仅读取 `enable_rotate` 标记(维度信息从序列化数据自描述获取) + 2. `load()` 创建 rotator,调用 `rotator_->open(storage)` 加载旋转矩阵(open 内部从 header 解析 type/dim/padded_dim) + 3. 所有 `transform()`/`convert()` 方法在量化前应用旋转 + 4. `revert()` 在旋转模式下拒绝反量化 + +### 4. 修改 Index::Open() [DONE] +1. 修改代码:`src/core/interface/index.cc` +2. 在 `Index::Open()` 中 streamer 打开后,调用 `reformer_->load(storage_)` 加载序列化数据(旋转矩阵等) +3. 对无序列化数据的 reformer(如非旋转模式),`load()` 为 no-op 直接返回 0,不干扰运行时功能 + +### 5. 修改运行时测试代码 +1. 
修改代码:/root/code/zvec/tools/core/local_builder.cc,使其可以保存旋转矩阵 +2. 编译代码: +```cpp +cmake -DCMAKE_BUILD_TYPE=Release .. +make -j$(nproc) +``` +3. 测试代码: +索引构建: +```cpp +./build/bin/local_builder /root/code/zvec/config/construct.yaml +``` +搜索测试: +```cpp +./build/bin/bench /root/code/zvec/config/search_baseline.yaml +./build/bin/bench /root/code/zvec/config/search_current.yaml +``` +4. 运行代码,并修改错误 diff --git a/src/core/interface/index.cc b/src/core/interface/index.cc index d482f1292..84df683a8 100644 --- a/src/core/interface/index.cc +++ b/src/core/interface/index.cc @@ -294,8 +294,13 @@ int Index::Open(const std::string &file_path, StorageOptions storage_options) { return core::IndexError_Runtime; } - // converter/reformer/metric are created in IndexFactory::CreateIndex - // TODO: init + // Load reformer data from storage (e.g., rotation matrix for IntegerStreaming) + if (reformer_ != nullptr) { + if (reformer_->load(storage_) != 0) { + LOG_ERROR("Failed to load reformer, path: %s", file_path.c_str()); + return core::IndexError_Runtime; + } + } // TODO: context pool if (!init_context()) { // to validate if any error, will be overwritten diff --git a/src/core/quantizer/integer_quantizer_converter.cc b/src/core/quantizer/integer_quantizer_converter.cc index f812b6e3c..7e9defdc6 100644 --- a/src/core/quantizer/integer_quantizer_converter.cc +++ b/src/core/quantizer/integer_quantizer_converter.cc @@ -19,6 +19,7 @@ #include #include #include "record_quantizer.h" +#include "record_rotater.h" #include "../metric/metric_params.h" namespace zvec { @@ -378,10 +379,14 @@ class IntegerStreamingConverter : public IndexConverter { meta_ = index_meta; params.get(INTEGER_STREAMING_CONVERTER_ENABLE_NORMALIZE, &enable_normalize_); + params.get(INTEGER_STREAMING_CONVERTER_ENABLE_ROTATE, &enable_rotate_); ailego::Params reformer_params; if (enable_normalize_) { reformer_params.set(INTEGER_STREAMING_REFORMER_ENABLE_NORMALIZE, true); } + if (enable_rotate_) { + 
reformer_params.set(INTEGER_STREAMING_REFORMER_ENABLE_ROTATE, true); + } is_euclidean_ = index_meta.metric_name() == "MipsSquaredEuclidean" || index_meta.metric_name() == "SquaredEuclidean" || @@ -390,6 +395,17 @@ class IntegerStreamingConverter : public IndexConverter { reformer_params.set(INTEGER_STREAMING_REFORMER_IS_EUCLIDEAN, true); } + // Compute padded dimension and create rotator if rotation is enabled + size_t padded_dim = index_meta.dimension(); + if (enable_rotate_) { + size_t dim = index_meta.dimension(); + padded_dim = ((dim + 63) / 64) * 64; + rotator_ = std::make_shared(); + rotator_->init(dim, padded_dim); + LOG_DEBUG("IntegerStreamingConverter: rotation enabled, dim=%zu, " + "padded_dim=%zu", + dim, padded_dim); + } if (data_type_ == IndexMeta::DataType::DT_INT8) { meta_.set_converter("Int8StreamingConverter", 0, params); @@ -409,7 +425,7 @@ class IntegerStreamingConverter : public IndexConverter { metric_params.set(QUANTIZED_INTEGER_METRIC_ORIGIN_METRIC_PARAMS, index_meta.metric_params()); meta_.set_metric("QuantizedInteger", 0, metric_params); - meta_.set_meta(data_type_, meta_.dimension() + ExtraDimension(data_type_)); + meta_.set_meta(data_type_, padded_dim + ExtraDimension(data_type_)); return 0; } @@ -433,12 +449,20 @@ class IntegerStreamingConverter : public IndexConverter { *stats_.mutable_transformed_count() += holder->count(); holder_ = std::make_shared( - holder, data_type_, enable_normalize_, is_euclidean_); + holder, data_type_, enable_normalize_, is_euclidean_, rotator_); return 0; } //! 
Dump index into storage - int dump(const IndexDumper::Pointer & /*dumper*/) override { + int dump(const IndexDumper::Pointer &dumper) override { + if (enable_rotate_ && rotator_) { + int ret = rotator_->dump(dumper); + if (ret != 0) { + LOG_ERROR("IntegerStreamingConverter: dump rotator failed, ret=%d", ret); + return ret; + } + stats_.set_dumped_size(stats_.dumped_size() + rotator_->dump_bytes()); + } return 0; } @@ -469,6 +493,7 @@ class IntegerStreamingConverter : public IndexConverter { : owner_(owner), buffer_(owner->element_size(), 0), normalize_buffer_(owner->front_->element_size(), 0), + rotate_buffer_(owner->padded_dim() * sizeof(float), 0), front_iter_(std::move(iter)) { this->encode_record(); } @@ -503,17 +528,24 @@ class IntegerStreamingConverter : public IndexConverter { if (front_iter_->is_valid()) { const float *vec = reinterpret_cast(front_iter_->data()); + size_t pdim = owner_->padded_dim(); + if (owner_->rotator_) { + float *rotate_buf = + reinterpret_cast(rotate_buffer_.data()); + owner_->rotator_->rotate(vec, rotate_buf); + vec = rotate_buf; + } if (owner_->enable_normalize_) { float norm = 0.0; memcpy((void *)normalize_buffer_.data(), vec, - owner_->front_->element_size()); + pdim * sizeof(float)); ailego::Normalizer::L2((float *)normalize_buffer_.data(), - owner_->dimension_, &norm); + pdim, &norm); vec = (float *)normalize_buffer_.data(); } RecordQuantizer::quantize_record( - vec, owner_->dimension_, owner_->data_type(), + vec, pdim, owner_->data_type(), owner_->is_euclidean_, buffer_.data()); } } @@ -522,18 +554,27 @@ class IntegerStreamingConverter : public IndexConverter { const IntegerStreamingConverterHolder *owner_{nullptr}; std::vector buffer_{}; std::string normalize_buffer_{}; + std::string rotate_buffer_{}; IndexHolder::Iterator::Pointer front_iter_{}; }; //! 
Constructor IntegerStreamingConverterHolder(IndexHolder::Pointer front, IndexMeta::DataType tp, - bool enable_normalize, bool is_euclidean) + bool enable_normalize, bool is_euclidean, + std::shared_ptr rotator) : front_(std::move(front)), data_type_(tp), dimension_(front_->dimension()), enable_normalize_(enable_normalize), - is_euclidean_(is_euclidean) {} + is_euclidean_(is_euclidean), + rotator_(std::move(rotator)) {} + + //! Retrieve padded dimension + size_t padded_dim(void) const { + return rotator_ ? rotator_->padded_dim() + : static_cast(dimension_); + } //! Retrieve count of elements in holder (-1 indicates unknown) size_t count(void) const override { @@ -542,7 +583,7 @@ class IntegerStreamingConverter : public IndexConverter { //! Retrieve dimension size_t dimension(void) const override { - return dimension_ + ExtraDimension(data_type_); + return padded_dim() + ExtraDimension(data_type_); } //! Retrieve type information @@ -576,6 +617,7 @@ class IntegerStreamingConverter : public IndexConverter { uint32_t dimension_{0}; bool enable_normalize_{false}; bool is_euclidean_{false}; + std::shared_ptr rotator_{}; }; static size_t ExtraDimension(IndexMeta::DataType type) { @@ -593,7 +635,9 @@ class IntegerStreamingConverter : public IndexConverter { IndexHolder::Pointer holder_{}; IndexMeta::DataType data_type_{}; bool enable_normalize_{false}; + bool enable_rotate_{false}; bool is_euclidean_{false}; + std::shared_ptr rotator_{}; }; INDEX_FACTORY_REGISTER_CONVERTER_ALIAS( diff --git a/src/core/quantizer/integer_quantizer_reformer.cc b/src/core/quantizer/integer_quantizer_reformer.cc index 4228d0fda..a263b9985 100644 --- a/src/core/quantizer/integer_quantizer_reformer.cc +++ b/src/core/quantizer/integer_quantizer_reformer.cc @@ -19,6 +19,7 @@ #include #include #include "record_quantizer.h" +#include "record_rotater.h" namespace zvec { namespace core { @@ -286,6 +287,7 @@ class IntegerStreamingReformer : public IndexReformer { int init(const ailego::Params ¶ms) 
override { params.get(INTEGER_STREAMING_REFORMER_ENABLE_NORMALIZE, &enable_normalize_); params.get(INTEGER_STREAMING_REFORMER_IS_EUCLIDEAN, &is_euclidean_); + params.get(INTEGER_STREAMING_REFORMER_ENABLE_ROTATE, &enable_rotate_); return 0; } @@ -295,7 +297,18 @@ class IntegerStreamingReformer : public IndexReformer { } //! Load index from container - int load(IndexStorage::Pointer) override { + int load(IndexStorage::Pointer storage) override { + if (enable_rotate_) { + rotator_ = std::make_shared(); + int ret = rotator_->open(storage); + if (ret != 0) { + LOG_ERROR("IntegerStreamingReformer: load rotator failed, ret=%d", ret); + return ret; + } + LOG_DEBUG("IntegerStreamingReformer: rotator loaded, origin_dim=%zu, " + "padded_dim=%zu", + rotator_->dimension(), rotator_->padded_dim()); + } return 0; } @@ -319,10 +332,16 @@ class IntegerStreamingReformer : public IndexReformer { ometa->set_meta(data_type_, qmeta.dimension() + extra_dimension_); out->resize(ometa->element_size()); const float *vec = reinterpret_cast(query); + std::unique_ptr rotate_buffer; + if (enable_rotate_ && rotator_) { + rotate_buffer.reset(new float[qmeta.dimension()]); + rotator_->rotate(vec, rotate_buffer.get()); + vec = rotate_buffer.get(); + } std::unique_ptr normalized; if (enable_normalize_) { normalized.reset(new float[qmeta.dimension()]); - vec = normalize(query, qmeta, normalized.get()); + vec = normalize(vec, qmeta, normalized.get()); } RecordQuantizer::quantize_record(vec, qmeta.dimension(), data_type_, @@ -344,13 +363,21 @@ class IntegerStreamingReformer : public IndexReformer { *ometa = qmeta; ometa->set_meta(data_type_, qmeta.dimension() + extra_dimension_); out->resize(count * ometa->element_size()); + std::unique_ptr rotate_buffer; std::unique_ptr normalized; + if (enable_rotate_ && rotator_) { + rotate_buffer.reset(new float[qmeta.dimension()]); + } if (enable_normalize_) { normalized.reset(new float[qmeta.dimension()]); } for (size_t i = 0; i < count; ++i) { const float *vec 
= reinterpret_cast(query) + i * qmeta.dimension(); + if (enable_rotate_ && rotator_) { + rotator_->rotate(vec, rotate_buffer.get()); + vec = rotate_buffer.get(); + } if (enable_normalize_) { vec = normalize(vec, qmeta, normalized.get()); } @@ -378,10 +405,16 @@ class IntegerStreamingReformer : public IndexReformer { ometa->set_meta(data_type_, rmeta.dimension() + extra_dimension_); out->resize(ometa->element_size()); const float *vec = reinterpret_cast(record); + std::unique_ptr rotate_buffer; + if (enable_rotate_ && rotator_) { + rotate_buffer.reset(new float[rmeta.dimension()]); + rotator_->rotate(vec, rotate_buffer.get()); + vec = rotate_buffer.get(); + } std::unique_ptr normalized; if (enable_normalize_) { normalized.reset(new float[rmeta.dimension()]); - vec = normalize(record, rmeta, normalized.get()); + vec = normalize(vec, rmeta, normalized.get()); } RecordQuantizer::quantize_record(vec, rmeta.dimension(), data_type_, @@ -404,13 +437,21 @@ class IntegerStreamingReformer : public IndexReformer { *ometa = rmeta; ometa->set_meta(data_type_, rmeta.dimension() + extra_dimension_); out->resize(count * ometa->element_size()); + std::unique_ptr rotate_buffer; std::unique_ptr normalized; + if (enable_rotate_ && rotator_) { + rotate_buffer.reset(new float[rmeta.dimension()]); + } if (enable_normalize_) { normalized.reset(new float[rmeta.dimension()]); } for (size_t i = 0; i < count; ++i) { const float *vec = reinterpret_cast(records) + i * rmeta.dimension(); + if (enable_rotate_ && rotator_) { + rotator_->rotate(vec, rotate_buffer.get()); + vec = rotate_buffer.get(); + } if (enable_normalize_) { vec = normalize(vec, rmeta, normalized.get()); } @@ -445,6 +486,10 @@ class IntegerStreamingReformer : public IndexReformer { int revert(const void *in, const IndexQueryMeta &qmeta, std::string *out) const override { + if (enable_rotate_) { + LOG_ERROR("Unsupported revert for rotated value"); + return IndexError_Unsupported; + } if (enable_normalize_) { LOG_ERROR("Unsupported 
revert for normalized value"); @@ -465,6 +510,8 @@ class IntegerStreamingReformer : public IndexReformer { uint32_t extra_dimension_{0}; bool enable_normalize_{false}; bool is_euclidean_{false}; + bool enable_rotate_{false}; + std::shared_ptr rotator_{}; }; INDEX_FACTORY_REGISTER_REFORMER_ALIAS( diff --git a/src/core/quantizer/record_rotater.cc b/src/core/quantizer/record_rotater.cc index 1e55d9f89..33c9e8ca5 100644 --- a/src/core/quantizer/record_rotater.cc +++ b/src/core/quantizer/record_rotater.cc @@ -24,6 +24,15 @@ namespace core { // All rabitqlib types are confined to this translation unit via pimpl. struct RecordRotator::Impl { + //! Self-describing header prepended to the rabitqlib blob on dump + struct Header { + uint8_t type; + uint32_t origin_dim; + uint32_t padded_dim; + }; + + static constexpr size_t kHeaderSize = sizeof(Header); // 9 bytes + size_t dimension{0}; size_t padded_dim{0}; RecordRotatorType type{RecordRotatorType::FhtKac}; @@ -34,6 +43,12 @@ struct RecordRotator::Impl { ? rabitqlib::RotatorType::MatrixRotator : rabitqlib::RotatorType::FhtKacRotator; } + + static RecordRotatorType from_rabitq(uint8_t t) { + return t == static_cast(RecordRotatorType::Matrix) + ? 
RecordRotatorType::Matrix + : RecordRotatorType::FhtKac; + } }; RecordRotator::RecordRotator() : impl_(std::make_unique()) {} @@ -63,7 +78,7 @@ std::vector RecordRotator::rotate(const float *in) const { } size_t RecordRotator::dump_bytes() const { - return impl_->rotator->dump_bytes(); + return Impl::kHeaderSize + impl_->rotator->dump_bytes(); } int RecordRotator::dump(const IndexDumper::Pointer &dumper, @@ -81,10 +96,17 @@ int RecordRotator::dump(const IndexDumper::Pointer &dumper, return (size + 0x1F) & (~0x1F); }; - // Serialize rotator to buffer - const size_t data_size = impl_->rotator->dump_bytes(); + // Serialize: [Header: type|origin_dim|padded_dim] [rabitqlib blob] + const size_t blob_size = impl_->rotator->dump_bytes(); + const size_t data_size = Impl::kHeaderSize + blob_size; std::vector buffer(data_size); - impl_->rotator->save(buffer.data()); + + Impl::Header header; + header.type = static_cast(impl_->type); + header.origin_dim = static_cast(impl_->dimension); + header.padded_dim = static_cast(impl_->padded_dim); + std::memcpy(buffer.data(), &header, Impl::kHeaderSize); + impl_->rotator->save(buffer.data() + Impl::kHeaderSize); // Write rotator data size_t written = dumper->write(buffer.data(), data_size); @@ -117,53 +139,91 @@ int RecordRotator::dump(const IndexDumper::Pointer &dumper, return 0; } -int RecordRotator::load(IndexStorage::Pointer storage, +int RecordRotator::open(IndexStorage::Pointer storage, const std::string &seg_id) { if (!storage) { - LOG_ERROR("RecordRotator::load: null storage"); + LOG_ERROR("RecordRotator::open: null storage"); return IndexError_InvalidArgument; } auto segment = storage->get(seg_id); if (!segment) { - LOG_ERROR("RecordRotator::load: segment '%s' not found", seg_id.c_str()); + LOG_ERROR("RecordRotator::open: segment '%s' not found", seg_id.c_str()); return IndexError_InvalidFormat; } - // Read the rotator data from the segment + // Read the rotator data from the segment (header + blob) const size_t data_size = 
segment->data_size(); + if (data_size <= Impl::kHeaderSize) { + LOG_ERROR("RecordRotator::open: data too small (%zu bytes)", data_size); + return IndexError_InvalidFormat; + } + IndexStorage::MemoryBlock block; size_t read_size = segment->read(0, block, data_size); if (read_size != data_size) { - LOG_ERROR("RecordRotator::load: read failed, read=%zu, expected=%zu", + LOG_ERROR("RecordRotator::open: read failed, read=%zu, expected=%zu", read_size, data_size); return IndexError_InvalidFormat; } - // Verify CRC if available + // Verify CRC if available (covers header + blob) uint32_t expected_crc = segment->data_crc(); if (expected_crc != 0) { uint32_t actual_crc = ailego::Crc32c::Hash(block.data(), data_size, 0); if (actual_crc != expected_crc) { LOG_ERROR( - "RecordRotator::load: CRC mismatch, expected=0x%08x, actual=0x%08x", + "RecordRotator::open: CRC mismatch, expected=0x%08x, actual=0x%08x", expected_crc, actual_crc); return IndexError_InvalidFormat; } } - // Reconstruct the rotator from serialized data + // Parse self-describing header + const char *raw = reinterpret_cast(block.data()); + Impl::Header header; + std::memcpy(&header, raw, Impl::kHeaderSize); + + impl_->type = Impl::from_rabitq(header.type); + impl_->dimension = static_cast(header.origin_dim); + impl_->padded_dim = static_cast(header.padded_dim); + + // Reconstruct the rotator from header info and load blob impl_->rotator.reset(rabitqlib::choose_rotator( impl_->dimension, Impl::to_rabitq(impl_->type), impl_->padded_dim)); - impl_->rotator->load(reinterpret_cast(block.data())); + impl_->rotator->load(raw + Impl::kHeaderSize); LOG_DEBUG( - "RecordRotator::load done: seg=%s, dim=%zu, padded_dim=%zu, " + "RecordRotator::open done: seg=%s, dim=%zu, padded_dim=%zu, " "data_size=%zu", seg_id.c_str(), impl_->dimension, impl_->padded_dim, data_size); return 0; } +int RecordRotator::load(const float *matrix, size_t dimension, + size_t padded_dim) { + if (!matrix) { + LOG_ERROR("RecordRotator::load: null 
matrix"); + return IndexError_InvalidArgument; + } + if (dimension == 0 || padded_dim == 0) { + LOG_ERROR("RecordRotator::load: invalid dims %zu x %zu", dimension, + padded_dim); + return IndexError_InvalidArgument; + } + + impl_->dimension = dimension; + impl_->padded_dim = padded_dim; + impl_->type = RecordRotatorType::Matrix; + impl_->rotator.reset(rabitqlib::choose_rotator( + dimension, rabitqlib::RotatorType::MatrixRotator, padded_dim)); + impl_->rotator->load(reinterpret_cast(matrix)); + + LOG_DEBUG("RecordRotator::load done: dim=%zu, padded_dim=%zu", + dimension, padded_dim); + return 0; +} + size_t RecordRotator::dimension() const { return impl_->dimension; } diff --git a/src/core/quantizer/record_rotater.h b/src/core/quantizer/record_rotater.h index 30377e0d0..132c552dc 100644 --- a/src/core/quantizer/record_rotater.h +++ b/src/core/quantizer/record_rotater.h @@ -69,20 +69,27 @@ class RecordRotator { //! @return vector of size padded_dim containing rotated result std::vector rotate(const float *in) const; - //! Return the serialized size of the rotator in bytes + //! Return the serialized size of the rotator in bytes (header + blob) size_t dump_bytes() const; - //! Dump the rotator data to an IndexDumper as a named segment. - //! Writes the raw rotator bytes, appends padding for 32-byte alignment, - //! and registers the segment meta (id, size, padding, crc). + //! Dump the rotator to an IndexDumper as a named segment. + //! Format: [Header: type(1B)|origin_dim(4B)|padded_dim(4B)] [rabitqlib blob] + //! Appends padding for 32-byte alignment, registers segment meta (id, size, padding, crc). int dump(const IndexDumper::Pointer &dumper, const std::string &seg_id = RECORD_ROTATOR_SEG_ID) const; - //! Load the rotator data from an IndexStorage segment. - //! Reads the serialized rotator bytes and reconstructs the rotator. - int load(IndexStorage::Pointer storage, + //! Open the rotator from an IndexStorage segment (self-describing, no init needed). + //! 
Parses header to get type/dimension/padded_dim, then reconstructs the rotator. + int open(IndexStorage::Pointer storage, const std::string &seg_id = RECORD_ROTATOR_SEG_ID); + //! Load a user-specified rotation matrix. + //! Always uses MatrixRotator internally. + //! @param matrix row-major matrix of shape dimension x padded_dim + //! @param dimension original vector dimension + //! @param padded_dim padded dimension (must be multiple of 64) + int load(const float *matrix, size_t dimension, size_t padded_dim); + //! Return the original dimension size_t dimension() const; From 89251a25b95f98885dd5474427889c3f3f8f3397 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Mon, 8 Jun 2026 11:14:08 +0800 Subject: [PATCH 03/38] error --- config/construct.yaml | 2 +- config/search_baseline.yaml | 4 ++-- config/search_current.yaml | 2 +- doc/draft.md | 8 +++++++- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/config/construct.yaml b/config/construct.yaml index 45084d962..481484a72 100644 --- a/config/construct.yaml +++ b/config/construct.yaml @@ -20,4 +20,4 @@ ConverterParams: BuilderParams: #各Builder方法的params参数 proxima.hnsw.streamer.efconstruction: !!int 500 proxima.hnsw.streamer.use_id_map: !!bool false - proxima.hnsw.streamer.max_neighbor_count: !!int 15 \ No newline at end of file + proxima.hnsw.streamer.max_neighbor_count: !!int 15 diff --git a/config/search_baseline.yaml b/config/search_baseline.yaml index 8b750f8e6..a1bd334f2 100644 --- a/config/search_baseline.yaml +++ b/config/search_baseline.yaml @@ -1,8 +1,8 @@ IndexCommon: IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":960,"index_type":"kHNSW","metric_type":"kL2sq","quantizer_param":{"type":"kInt8"},"m":15}' - IndexPath: /root/data/gist/index/gist.l2.fp32.index + IndexPath: /root/data/gist/index/gist.random.l2.fp32.index TopK: 100 #指定返回topk,recall时多topk用逗号隔开,bench时会使用最大值 - QueryFile: /root/data/gist/query.txt + QueryFile: /root/data/gist/query_random.txt QueryType: 
float #指定query向量类型,需要与索引类型一致(FP16索引使用float查询,内部会自动转换) QueryFirstSep: ";" #指定query第一分隔符,用于分割key和特征 QuerySecondSep: " " #指定query第二分隔符,用于分割特征各维度 diff --git a/config/search_current.yaml b/config/search_current.yaml index e1fb907c5..d4ba76ba5 100644 --- a/config/search_current.yaml +++ b/config/search_current.yaml @@ -1,6 +1,6 @@ IndexCommon: IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":960,"index_type":"kHNSW","metric_type":"kL2sq","quantizer_param":{"type":"kInt8"},"m":15}' - IndexPath: /root/data/gist/index/gist.random2.l2.fp32.index + IndexPath: /root/data/gist/index/gist.random2.l2.int8.index TopK: 100 #指定返回topk,recall时多topk用逗号隔开,bench时会使用最大值 QueryFile: /root/data/gist/query.txt QueryType: float #指定query向量类型,需要与索引类型一致(FP16索引使用float查询,内部会自动转换) diff --git a/doc/draft.md b/doc/draft.md index e7142f914..40edde366 100644 --- a/doc/draft.md +++ b/doc/draft.md @@ -134,10 +134,16 @@ class RecordRotator { 3. 对无序列化数据的 reformer(如非旋转模式),`load()` 为 no-op 直接返回 0,不干扰运行时功能 ### 5. 修改运行时测试代码 +0. 测试原始功能是否有问题: +``` +./build/bin/bench /root/code/zvec/config/search_baseline.yaml +./build/bin/recall /root/code/zvec/config/search_baseline.yaml +``` +查看是否能正常运行,以检查原始功能是否出现问题 1. 修改代码:/root/code/zvec/tools/core/local_builder.cc,使其可以保存旋转矩阵 2. 编译代码: ```cpp -cmake -DCMAKE_BUILD_TYPE=Release .. +cmake -DENABLE_SKYLAKE=ON -DCMAKE_BUILD_TYPE=Release .. make -j$(nproc) ``` 3. 
测试代码: From 46a08a55d26bf9aa1776f8cdc09e777eb0542d29 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Mon, 8 Jun 2026 16:06:14 +0800 Subject: [PATCH 04/38] Int8StreamingConverter --- config/construct.yaml | 2 +- config/search_current.yaml | 2 +- config/search_tmp.yaml | 19 ++ doc/draft.md | 22 +- .../quantizer/integer_quantizer_converter.cc | 15 +- src/core/quantizer/record_rotater.cc | 55 ++--- src/core/quantizer/record_rotater.h | 8 +- .../zvec/core/framework/index_converter.h | 8 + tools/core/local_builder.cc | 207 ++++-------------- 9 files changed, 131 insertions(+), 207 deletions(-) create mode 100644 config/search_tmp.yaml diff --git a/config/construct.yaml b/config/construct.yaml index 481484a72..6bfc54d3c 100644 --- a/config/construct.yaml +++ b/config/construct.yaml @@ -4,7 +4,7 @@ BuilderCommon: # NeedTrain: true #是否需要走train流程 # TrainFile: /root/data/cohere/1m/cohere_train_vector_1m.norm.zvec.vecs DumpPath: ./flase.tmp - IndexPath: /root/data/gist/index/gist.random2.l2.fp32.index + IndexPath: /root/data/gist/index/gist.random2.l2.int8.index ThreadCount: 16 diff --git a/config/search_current.yaml b/config/search_current.yaml index d4ba76ba5..68675a031 100644 --- a/config/search_current.yaml +++ b/config/search_current.yaml @@ -1,5 +1,5 @@ IndexCommon: - IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":960,"index_type":"kHNSW","metric_type":"kL2sq","quantizer_param":{"type":"kInt8"},"m":15}' + IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":960,"index_type":"kHNSW","metric_type":"kL2sq","quantizer_param":{"type":"kInt8","integer_streaming.converter.enable_rotate":true,"integer_streaming.reformer.enable_rotate":true},"m":15}' IndexPath: /root/data/gist/index/gist.random2.l2.int8.index TopK: 100 #指定返回topk,recall时多topk用逗号隔开,bench时会使用最大值 QueryFile: /root/data/gist/query.txt diff --git a/config/search_tmp.yaml b/config/search_tmp.yaml new file mode 100644 index 000000000..ba48b7a30 --- /dev/null +++ 
b/config/search_tmp.yaml @@ -0,0 +1,19 @@ +IndexCommon: + IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":768,"index_type":"kHNSW","metric_type":"kCosine","quantizer_param":{"type":"kInt8"},"m":15}' + IndexPath: /root/code/VectorDBBench/db/cohere-1m/0/dense.qindex.5.proxima + TopK: 100 #指定返回topk,recall时多topk用逗号隔开,bench时会使用最大值 + QueryFile: /root/data/cohere/1m/cohere_test_vector_1m.1000.norm.txt + QueryType: float #指定query向量类型,需要与索引类型一致(FP16索引使用float查询,内部会自动转换) + QueryFirstSep: ";" #指定query第一分隔符,用于分割key和特征 + QuerySecondSep: " " #指定query第二分隔符,用于分割特征各维度 + GroundTruthFile: /root/data/cohere/1m/neighbors.txt + + RecallThreadCount: 16 + BenchThreadCount: 16 #指定bench并发数(召回并发直接使用cpu核数) + BenchIterCount: 100000 #指定bench执行条目数,当query量较少时会对query重复使用 + CompareById: true + LogLevel: info + +QueryConfig: + QueryParam: '{"index_type":"kHNSW","ef_search":180}' + diff --git a/doc/draft.md b/doc/draft.md index 40edde366..aa1c552ac 100644 --- a/doc/draft.md +++ b/doc/draft.md @@ -120,7 +120,7 @@ class RecordRotator { 1. 新增 `#include "record_rotater.h"` 和成员变量 `enable_rotate_`, `rotator_`(无 `padded_dim_`,由 `rotator_->padded_dim()` 派生) 2. `init()` 读取 `enable_rotate` 标记,创建 FhtKacRotator(padded_dim=向上取64倍数),将 `enable_rotate` 写入 reformer_params 3. `transform()` 将 `rotator_` 传入 Holder,Holder 通过 `rotator_->padded_dim()` 获取对齐维度 - 4. `dump()` 调用 `rotator_->dump(dumper)` 保存旋转矩阵(自描述格式) + 4. `dump()` 已删除(DumpPath 已移除),改为 `dump_to_storage()` 调用 `rotator_->dump(storage)` 保存旋转矩阵(自描述格式) 5. Holder Iterator 的 `encode_record()` 管线:rotate → normalize → quantize 3. Reformer 修改: 1. `init()` 仅读取 `enable_rotate` 标记(维度信息从序列化数据自描述获取) @@ -133,17 +133,29 @@ class RecordRotator { 2. 在 `Index::Open()` 中 streamer 打开后,调用 `reformer_->load(storage_)` 加载序列化数据(旋转矩阵等) 3. 对无序列化数据的 reformer(如非旋转模式),`load()` 为 no-op 直接返回 0,不干扰运行时功能 -### 5. 修改运行时测试代码 -0. 测试原始功能是否有问题: +### 5. 修改local_builder.cc,使其可以保存旋转矩阵 [DONE] +1. 
删除 DumpPath 相关代码(AlignSize、dump_meta_segment、dump_taglist 辅助函数,check_config 中 DumpPath 检查,do_build/do_build_sparse 中的 DUMP 代码块) +2. 保留 IndexPath 流式构建路径,保留 UseTrainer 路径的 IndexDumper(写入 TrainerIndexPath) +3. RecordRotator 新增 `dump(IndexStorage::Pointer)` 重载,将旋转矩阵写入 IndexStorage segment +4. IndexConverter 基类新增 `dump_to_storage()` 虚方法(默认 no-op),IntegerStreamingConverter 重写以持久化 rotator +5. local_builder.cc 中 `convert_holder()`/`convert_sparse_holder()` 输出 converter 指针,`build_by_streamer()`/`build_sparse_by_streamer()` 在 `streamer->open(storage)` 后调用 `converter->dump_to_storage(storage)` +6. 删除 RecordRotator::dump(IndexDumper) 死代码(DumpPath 已删除,无调用者) +7. 修改文件清单: + - `tools/core/local_builder.cc`:删除 DumpPath 代码,添加 converter 传递和 dump_to_storage 调用 + - `src/core/quantizer/record_rotater.h/cc`:新增 dump(IndexStorage),删除 dump(IndexDumper) + - `src/include/zvec/core/framework/index_converter.h`:新增 dump_to_storage() 虚方法 + - `src/core/quantizer/integer_quantizer_converter.cc`:重写 dump_to_storage(),删除 dump(IndexDumper) override + +### 6. 修改运行时测试代码 +1. 测试原始功能是否有问题: ``` ./build/bin/bench /root/code/zvec/config/search_baseline.yaml ./build/bin/recall /root/code/zvec/config/search_baseline.yaml ``` 查看是否能正常运行,以检查原始功能是否出现问题 -1. 修改代码:/root/code/zvec/tools/core/local_builder.cc,使其可以保存旋转矩阵 2. 编译代码: ```cpp -cmake -DENABLE_SKYLAKE=ON -DCMAKE_BUILD_TYPE=Release .. +cmake -DCMAKE_BUILD_TYPE=Release .. make -j$(nproc) ``` 3. 测试代码: diff --git a/src/core/quantizer/integer_quantizer_converter.cc b/src/core/quantizer/integer_quantizer_converter.cc index 7e9defdc6..4643aea68 100644 --- a/src/core/quantizer/integer_quantizer_converter.cc +++ b/src/core/quantizer/integer_quantizer_converter.cc @@ -453,15 +453,20 @@ class IntegerStreamingConverter : public IndexConverter { return 0; } - //! Dump index into storage - int dump(const IndexDumper::Pointer &dumper) override { + //! 
Dump index into storage (no-op: DumpPath removed, use dump_to_storage instead) + int dump(const IndexDumper::Pointer & /*dumper*/) override { return 0; } + + //! Dump converter state to IndexStorage for streaming build + int dump_to_storage(const IndexStorage::Pointer &storage) override { if (enable_rotate_ && rotator_) { - int ret = rotator_->dump(dumper); + int ret = rotator_->dump(storage); if (ret != 0) { - LOG_ERROR("IntegerStreamingConverter: dump rotator failed, ret=%d", ret); + LOG_ERROR( + "IntegerStreamingConverter: dump rotator to storage failed, ret=%d", + ret); return ret; } - stats_.set_dumped_size(stats_.dumped_size() + rotator_->dump_bytes()); + LOG_DEBUG("IntegerStreamingConverter: rotator dumped to storage"); } return 0; } diff --git a/src/core/quantizer/record_rotater.cc b/src/core/quantizer/record_rotater.cc index 33c9e8ca5..e9c12fb78 100644 --- a/src/core/quantizer/record_rotater.cc +++ b/src/core/quantizer/record_rotater.cc @@ -81,14 +81,14 @@ size_t RecordRotator::dump_bytes() const { return Impl::kHeaderSize + impl_->rotator->dump_bytes(); } -int RecordRotator::dump(const IndexDumper::Pointer &dumper, +int RecordRotator::dump(const IndexStorage::Pointer &storage, const std::string &seg_id) const { - if (!dumper) { - LOG_ERROR("RecordRotator::dump: null dumper"); + if (!storage) { + LOG_ERROR("RecordRotator::dump(storage): null storage"); return IndexError_InvalidArgument; } if (!impl_->rotator) { - LOG_ERROR("RecordRotator::dump: rotator not initialized"); + LOG_ERROR("RecordRotator::dump(storage): rotator not initialized"); return IndexError_NoReady; } @@ -99,6 +99,7 @@ int RecordRotator::dump(const IndexDumper::Pointer &dumper, // Serialize: [Header: type|origin_dim|padded_dim] [rabitqlib blob] const size_t blob_size = impl_->rotator->dump_bytes(); const size_t data_size = Impl::kHeaderSize + blob_size; + const size_t total_size = align_size(data_size); std::vector buffer(data_size); Impl::Header header; @@ -108,34 +109,34 @@ int 
RecordRotator::dump(const IndexDumper::Pointer &dumper, std::memcpy(buffer.data(), &header, Impl::kHeaderSize); impl_->rotator->save(buffer.data() + Impl::kHeaderSize); - // Write rotator data - size_t written = dumper->write(buffer.data(), data_size); - if (written != data_size) { - LOG_ERROR("RecordRotator::dump: write failed, written=%zu, expected=%zu", - written, data_size); - return IndexError_WriteData; + // Append segment to storage + int ret = storage->append(seg_id, total_size); + if (ret != 0) { + LOG_ERROR("RecordRotator::dump(storage): append segment '%s' failed, ret=%d", + seg_id.c_str(), ret); + return ret; } - uint32_t crc = ailego::Crc32c::Hash(buffer.data(), data_size, 0); - - // Write padding for 32-byte alignment - size_t padding_size = align_size(data_size) - data_size; - if (padding_size > 0) { - std::string padding(padding_size, '\0'); - if (dumper->write(padding.data(), padding_size) != padding_size) { - LOG_ERROR("RecordRotator::dump: padding write failed"); - return IndexError_WriteData; - } + + auto segment = storage->get(seg_id); + if (!segment) { + LOG_ERROR("RecordRotator::dump(storage): get segment '%s' failed", + seg_id.c_str()); + return IndexError_WriteData; } - // Register segment meta - int ret = dumper->append(seg_id, data_size, padding_size, crc); - if (ret != 0) { - LOG_ERROR("RecordRotator::dump: append segment meta failed, ret=%d", ret); - return ret; + size_t written = segment->write(0, buffer.data(), data_size); + if (written != data_size) { + LOG_ERROR( + "RecordRotator::dump(storage): write failed, written=%zu, expected=%zu", + written, data_size); + return IndexError_WriteData; } + segment->resize(data_size); + segment->update_data_crc(ailego::Crc32c::Hash(buffer.data(), data_size, 0)); - LOG_DEBUG("RecordRotator::dump done: seg=%s, data_size=%zu, padding=%zu", - seg_id.c_str(), data_size, padding_size); + LOG_DEBUG( + "RecordRotator::dump(storage) done: seg=%s, data_size=%zu, total=%zu", + seg_id.c_str(), data_size, 
total_size); return 0; } diff --git a/src/core/quantizer/record_rotater.h b/src/core/quantizer/record_rotater.h index 132c552dc..d187e2528 100644 --- a/src/core/quantizer/record_rotater.h +++ b/src/core/quantizer/record_rotater.h @@ -17,7 +17,6 @@ #include #include #include -#include "zvec/core/framework/index_dumper.h" #include "zvec/core/framework/index_storage.h" namespace zvec { @@ -72,10 +71,9 @@ class RecordRotator { //! Return the serialized size of the rotator in bytes (header + blob) size_t dump_bytes() const; - //! Dump the rotator to an IndexDumper as a named segment. - //! Format: [Header: type(1B)|origin_dim(4B)|padded_dim(4B)] [rabitqlib blob] - //! Appends padding for 32-byte alignment, registers segment meta (id, size, padding, crc). - int dump(const IndexDumper::Pointer &dumper, + //! Dump the rotator to an IndexStorage as a named segment. + //! Same self-describing format as the dumper variant. + int dump(const IndexStorage::Pointer &storage, const std::string &seg_id = RECORD_ROTATOR_SEG_ID) const; //! Open the rotator from an IndexStorage segment (self-describing, no init needed). diff --git a/src/include/zvec/core/framework/index_converter.h b/src/include/zvec/core/framework/index_converter.h index 53ac1c7a2..4dc26468f 100644 --- a/src/include/zvec/core/framework/index_converter.h +++ b/src/include/zvec/core/framework/index_converter.h @@ -18,6 +18,7 @@ #include #include #include +#include #include "zvec/core/framework/index_reformer.h" namespace zvec { @@ -196,6 +197,13 @@ class IndexConverter : public IndexModule { //! Dump index into storage virtual int dump(const IndexDumper::Pointer &dumper) = 0; + //! Dump converter state (e.g. rotator) to IndexStorage for streaming build. + //! Default is no-op; override in subclasses that need storage persistence. + virtual int dump_to_storage(const IndexStorage::Pointer &storage) { + (void)storage; + return 0; + } + //! 
Retrieve statistics virtual const Stats &stats(void) const = 0; diff --git a/tools/core/local_builder.cc b/tools/core/local_builder.cc index 52ae8321d..57fc6b6ca 100644 --- a/tools/core/local_builder.cc +++ b/tools/core/local_builder.cc @@ -35,7 +35,6 @@ #include "zvec/core/framework/index_reformer.h" #include "zvec/core/framework/index_streamer.h" #include "index_meta_helper.h" -#include "meta_segment_common.h" #include "vecs_index_holder.h" #ifdef __clang__ @@ -206,10 +205,6 @@ bool check_config(YAML::Node &config_root) { return false; } } - if (!common["DumpPath"]) { - LOG_ERROR("Can not find [DumpPath] in config"); - return false; - } if (!config_root["BuilderParams"]) { LOG_ERROR("Can not find [BuilderParams] in config"); return false; @@ -217,75 +212,6 @@ bool check_config(YAML::Node &config_root) { return true; } -static inline size_t AlignSize(size_t size) { - return (size + 0x1F) & (~0x1F); -} - -bool dump_meta_segment(const IndexDumper::Pointer &dumper, - const std::string &segment_id, const void *data, - size_t size, size_t &writes) { - size_t len = dumper->write(data, size); - if (len != size) { - LOG_ERROR("Dump segment %s data failed, expect: %lu, actual: %lu", - segment_id.c_str(), size, len); - return false; - } - - size_t padding_size = AlignSize(size) - size; - if (padding_size > 0) { - std::string padding(padding_size, '\0'); - if (dumper->write(padding.data(), padding_size) != padding_size) { - LOG_ERROR("Append padding failed, size %lu", padding_size); - return false; - } - } - - uint32_t crc = ailego::Crc32c::Hash(data, size); - int ret = dumper->append(segment_id, size, padding_size, crc); - if (ret != 0) { - LOG_ERROR("Dump segment %s meta failed, ret=%d", segment_id.c_str(), ret); - return false; - } - - writes = len + padding_size; - - return true; -} - -int dump_taglist(IndexDumper::Pointer dumper, size_t num_vecs, - const void *key_base, const void *taglist_data, - uint64_t taglist_size) { - TagListHeader taglist_header; - - 
taglist_header.num_vecs = num_vecs; - - size_t total_writes; - - bool ret = - dump_meta_segment(dumper, TAGLIST_HEADER_SEGMENT_NAME, &taglist_header, - sizeof(TagListHeader), total_writes); - if (ret == false) { - LOG_ERROR("dump taglist meta failed"); - return IndexError_WriteData; - } - - ret = dump_meta_segment(dumper, TAGLIST_KEY_SEGMENT_NAME, key_base, - num_vecs * sizeof(uint64_t), total_writes); - if (ret == false) { - LOG_ERROR("dump taglist key failed"); - return IndexError_WriteData; - } - - ret = dump_meta_segment(dumper, TAGLIST_DATA_SEGMENT_NAME, taglist_data, - taglist_size, total_writes); - if (ret == false) { - LOG_ERROR("dump taglist data failed"); - return IndexError_WriteData; - } - - return 0; -} - int do_build_sparse_by_streamer(IndexStreamer::Pointer &streamer, uint32_t thread_count) { int ret; @@ -422,7 +348,8 @@ int do_build_sparse_by_streamer(IndexStreamer::Pointer &streamer, } int build_sparse_by_streamer(IndexStreamer::Pointer &streamer, - YAML::Node &config_common) { + YAML::Node &config_common, + const IndexConverter::Pointer &converter) { if (!config_common["IndexPath"]) { LOG_ERROR("Miss params IndexPath for Streamer"); return IndexError_InvalidArgument; @@ -451,6 +378,15 @@ int build_sparse_by_streamer(IndexStreamer::Pointer &streamer, return IndexError_Runtime; } + // Dump converter state (e.g. rotator) to storage for streaming build + if (converter) { + ret = converter->dump_to_storage(storage); + if (ret != 0) { + LOG_ERROR("Failed to dump converter to storage, ret=%d", ret); + return ret; + } + } + size_t thread_count = config_common["ThreadCount"] ? 
config_common["ThreadCount"].as() : std::thread::hardware_concurrency(); @@ -593,7 +529,8 @@ int do_build_by_streamer(IndexStreamer::Pointer &streamer, } int build_by_streamer(IndexStreamer::Pointer &streamer, - YAML::Node &config_common) { + YAML::Node &config_common, + const IndexConverter::Pointer &converter) { if (!config_common["IndexPath"]) { LOG_ERROR("Miss params IndexPath for Streamer"); return IndexError_InvalidArgument; @@ -624,6 +561,15 @@ int build_by_streamer(IndexStreamer::Pointer &streamer, return IndexError_Runtime; } + // Dump converter state (e.g. rotator) to storage for streaming build + if (converter) { + ret = converter->dump_to_storage(storage); + if (ret != 0) { + LOG_ERROR("Failed to dump converter to storage, ret=%d", ret); + return ret; + } + } + size_t thread_count = config_common["ThreadCount"] ? config_common["ThreadCount"].as() : std::thread::hardware_concurrency(); @@ -646,7 +592,8 @@ int build_by_streamer(IndexStreamer::Pointer &streamer, IndexSparseHolder::Pointer convert_sparse_holder( const std::string &name, const ailego::Params ¶ms, - VecsIndexSparseHolder::Pointer &in_holder, IndexMeta &index_meta) { + VecsIndexSparseHolder::Pointer &in_holder, IndexMeta &index_meta, + IndexConverter::Pointer *out_converter) { IndexSparseHolder::Pointer cast_holder = std::dynamic_pointer_cast(in_holder); if (name.empty()) { @@ -679,13 +626,17 @@ IndexSparseHolder::Pointer convert_sparse_holder( index_meta = converter->meta(); + if (out_converter) { + *out_converter = converter; + } return converter->sparse_result(); } IndexHolder::Pointer convert_holder(const std::string &name, const ailego::Params ¶ms, VecsIndexHolder::Pointer &in_holder, - IndexMeta &index_meta) { + IndexMeta &index_meta, + IndexConverter::Pointer *out_converter) { IndexHolder::Pointer cast_holder = std::dynamic_pointer_cast(in_holder); if (name.empty()) { @@ -718,6 +669,9 @@ IndexHolder::Pointer convert_holder(const std::string &name, index_meta = converter->meta(); + if 
(out_converter) { + *out_converter = converter; + } return converter->result(); } @@ -782,8 +736,9 @@ int do_build_sparse(YAML::Node &config_root, YAML::Node &config_common) { } cout << "Created builder " << builder_class << endl; + IndexConverter::Pointer build_converter; IndexSparseHolder::Pointer cv_build_holder = convert_sparse_holder( - converter_name, converter_params, build_holder, meta); + converter_name, converter_params, build_holder, meta, &build_converter); if (!cv_build_holder) { LOG_ERROR("Convert holder failed."); return -1; @@ -819,7 +774,7 @@ int do_build_sparse(YAML::Node &config_root, YAML::Node &config_common) { } IndexSparseHolder::Pointer cv_train_holder = convert_sparse_holder( - converter_name, converter_params, train_holder, meta); + converter_name, converter_params, train_holder, meta, nullptr); if (!cv_train_holder) { LOG_ERROR("Convert train holder failed."); return -1; @@ -846,7 +801,7 @@ int do_build_sparse(YAML::Node &config_root, YAML::Node &config_common) { if (builder != nullptr) { ret = builder->build(std::move(cv_build_holder)); } else { - ret = build_sparse_by_streamer(streamer, config_common); + ret = build_sparse_by_streamer(streamer, config_common, build_converter); } size_t build_time = timer.milli_seconds(); if (ret < 0) { @@ -856,45 +811,6 @@ int do_build_sparse(YAML::Node &config_root, YAML::Node &config_common) { cout << "Build finished, consume " << build_time << "ms." << endl; signal(SIGINT, SIG_DFL); - // DUMP - IndexDumper::Pointer dumper = IndexFactory::CreateDumper("FileDumper"); - if (!dumper) { - LOG_ERROR("Failed to create FileDumper."); - return -1; - } - string dump_prefix = config_common["DumpPath"].as(); - ret = dumper->create(dump_prefix); - if (ret != 0) { - LOG_ERROR("Failed to create in dumper, ret=%d", ret); - return -1; - } - timer.reset(); - ret = streamer ? 
streamer->dump(dumper) : builder->dump(dumper); - size_t dump_time = timer.milli_seconds(); - if (ret == IndexError_NotImplemented) { - LOG_WARN("Dump index not implemented"); - } else if (ret < 0) { - LOG_ERROR("Failed to dump in builder, ret=%d", ret); - return -1; - } - - if (build_holder->has_taglist()) { - size_t taglist_size{0}; - const void *taglist_data = build_holder->get_taglist_data(taglist_size); - const void *key_base = build_holder->get_key_base(); - - dump_taglist(dumper, build_holder->get_num_vecs(), key_base, taglist_data, - taglist_size); - } - - ret = dumper->close(); - if (ret != 0) { - LOG_ERROR("Dumper failed to close, ret=%d", ret); - return -1; - } - std::cout << "Dump to [" << dump_prefix << "] finished, consume " << dump_time - << "ms." << std::endl; - if (builder) { auto &stats = reinterpret_cast(builder.get())->stats(); @@ -987,8 +903,10 @@ int do_build(YAML::Node &config_root, YAML::Node &config_common) { cout << "Created builder " << builder_class << endl; + IndexConverter::Pointer build_converter; IndexHolder::Pointer cv_build_holder = - convert_holder(converter_name, converter_params, build_holder, meta); + convert_holder(converter_name, converter_params, build_holder, meta, + &build_converter); if (!cv_build_holder) { LOG_ERROR("Convert holder failed."); return -1; @@ -1080,7 +998,8 @@ int do_build(YAML::Node &config_root, YAML::Node &config_common) { // support fp16 convert IndexHolder::Pointer cv_train_holder = - convert_holder(converter_name, converter_params, train_holder, meta); + convert_holder(converter_name, converter_params, train_holder, meta, + nullptr); if (!cv_train_holder) { LOG_ERROR("Convert train holder failed."); return -1; @@ -1137,7 +1056,8 @@ int do_build(YAML::Node &config_root, YAML::Node &config_common) { train_holder->set_metric(metric_name, metric_params); } IndexHolder::Pointer cv_train_holder = - convert_holder(converter_name, converter_params, train_holder, meta); + convert_holder(converter_name, 
converter_params, train_holder, meta, + nullptr); if (!cv_train_holder) { LOG_ERROR("Convert train holder failed."); return -1; @@ -1177,7 +1097,7 @@ int do_build(YAML::Node &config_root, YAML::Node &config_common) { retrieval_mode = "dense"; } - ret = build_by_streamer(streamer, config_common); + ret = build_by_streamer(streamer, config_common, build_converter); } size_t build_time = timer.milli_seconds(); if (ret < 0) { @@ -1187,45 +1107,6 @@ int do_build(YAML::Node &config_root, YAML::Node &config_common) { cout << "Build finished, consume " << build_time << "ms." << endl; signal(SIGINT, SIG_DFL); - // DUMP - IndexDumper::Pointer dumper = IndexFactory::CreateDumper("FileDumper"); - if (!dumper) { - LOG_ERROR("Failed to create FileDumper."); - return -1; - } - string dump_prefix = config_common["DumpPath"].as(); - ret = dumper->create(dump_prefix); - if (ret != 0) { - LOG_ERROR("Failed to create in dumper, ret=%d", ret); - return -1; - } - timer.reset(); - ret = streamer ? streamer->dump(dumper) : builder->dump(dumper); - size_t dump_time = timer.milli_seconds(); - if (ret == IndexError_NotImplemented) { - LOG_WARN("Dump index not implemented"); - } else if (ret < 0) { - LOG_ERROR("Failed to dump in builder, ret=%d", ret); - return -1; - } - - if (build_holder->has_taglist()) { - size_t taglist_size{0}; - const void *taglist_data = build_holder->get_taglist_data(taglist_size); - const void *key_base = build_holder->get_key_base(); - - dump_taglist(dumper, build_holder->get_num_vecs(), key_base, taglist_data, - taglist_size); - } - - ret = dumper->close(); - if (ret != 0) { - LOG_ERROR("Dumper failed to close, ret=%d", ret); - return -1; - } - std::cout << "Dump to [" << dump_prefix << "] finished, consume " << dump_time - << "ms." 
<< std::endl; - if (builder) { auto &stats = reinterpret_cast(builder.get())->stats(); From 8c553f1c610eaa3221e420e5b33681b518c47f57 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Mon, 8 Jun 2026 19:23:15 +0800 Subject: [PATCH 05/38] debug --- config/construct2.yaml | 23 +++++ config/search_baseline2.yaml | 19 ++++ config/search_current.yaml | 2 +- config/search_current2.yaml | 19 ++++ doc/draft.md | 87 ++++++++++--------- src/core/CMakeLists.txt | 6 ++ src/core/quantizer/CMakeLists.txt | 10 +++ .../quantizer/integer_quantizer_reformer.cc | 26 ++++-- tools/core/local_builder.cc | 13 ++- 9 files changed, 156 insertions(+), 49 deletions(-) create mode 100644 config/construct2.yaml create mode 100644 config/search_baseline2.yaml create mode 100644 config/search_current2.yaml diff --git a/config/construct2.yaml b/config/construct2.yaml new file mode 100644 index 000000000..e0bbc1387 --- /dev/null +++ b/config/construct2.yaml @@ -0,0 +1,23 @@ +BuilderCommon: + BuilderClass: HnswStreamer + BuildFile: /root/data/cohere/1m/cohere_train_vector_1m.norm.zvec.vecs + # NeedTrain: true #是否需要走train流程 + # TrainFile: /root/data/cohere/1m/cohere_train_vector_1m.norm.zvec.vecs + DumpPath: ./flase.tmp + IndexPath: /root/data/cohere/1m/cohere.random2.cosine.int8.index + + ThreadCount: 16 + + MetricName: Cosine + ConverterName: CosineFp16Converter + # ConverterName: Int8StreamingConverter + + DisableIdMap: true + +ConverterParams: + integer_streaming.converter.enable_rotate: !!bool true + +BuilderParams: #各Builder方法的params参数 + proxima.hnsw.streamer.efconstruction: !!int 500 + proxima.hnsw.streamer.use_id_map: !!bool false + proxima.hnsw.streamer.max_neighbor_count: !!int 15 diff --git a/config/search_baseline2.yaml b/config/search_baseline2.yaml new file mode 100644 index 000000000..100f6e197 --- /dev/null +++ b/config/search_baseline2.yaml @@ -0,0 +1,19 @@ +IndexCommon: + IndexConfig: 
'{"use_id_map":false,"data_type":"DT_FP32","dimension":768,"index_type":"kHNSW","metric_type":"kCosine","quantizer_param":{"type":"kInt8"},"m":15}' + IndexPath: /root/data/cohere/1m/index/cohere.random.cosine.int8.index + TopK: 100 #指定返回topk,recall时多topk用逗号隔开,bench时会使用最大值 + QueryFile: /root/data/cohere/1m/cohere_test_vector_1m.1000.norm.random.txt + QueryType: float #指定query向量类型,需要与索引类型一致(FP16索引使用float查询,内部会自动转换) + QueryFirstSep: ";" #指定query第一分隔符,用于分割key和特征 + QuerySecondSep: " " #指定query第二分隔符,用于分割特征各维度 + GroundTruthFile: /root/data/cohere/1m/neighbors.txt + + RecallThreadCount: 16 + BenchThreadCount: 16 #指定bench并发数(召回并发直接使用cpu核数) + BenchIterCount: 100000 #指定bench执行条目数,当query量较少时会对query重复使用 + CompareById: true + LogLevel: info + +QueryConfig: + QueryParam: '{"index_type":"kHNSW","ef_search":180}' + diff --git a/config/search_current.yaml b/config/search_current.yaml index 68675a031..d4ba76ba5 100644 --- a/config/search_current.yaml +++ b/config/search_current.yaml @@ -1,5 +1,5 @@ IndexCommon: - IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":960,"index_type":"kHNSW","metric_type":"kL2sq","quantizer_param":{"type":"kInt8","integer_streaming.converter.enable_rotate":true,"integer_streaming.reformer.enable_rotate":true},"m":15}' + IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":960,"index_type":"kHNSW","metric_type":"kL2sq","quantizer_param":{"type":"kInt8"},"m":15}' IndexPath: /root/data/gist/index/gist.random2.l2.int8.index TopK: 100 #指定返回topk,recall时多topk用逗号隔开,bench时会使用最大值 QueryFile: /root/data/gist/query.txt diff --git a/config/search_current2.yaml b/config/search_current2.yaml new file mode 100644 index 000000000..44995b51c --- /dev/null +++ b/config/search_current2.yaml @@ -0,0 +1,19 @@ +IndexCommon: + IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":768,"index_type":"kHNSW","metric_type":"kCosine","quantizer_param":{"type":"kInt8"},"m":15}' + IndexPath: 
/root/data/cohere/1m/index/cohere.random2.cosine.int8.index + TopK: 100 #指定返回topk,recall时多topk用逗号隔开,bench时会使用最大值 + QueryFile: /root/data/cohere/1m/cohere_test_vector_1m.1000.norm.txt + QueryType: float #指定query向量类型,需要与索引类型一致(FP16索引使用float查询,内部会自动转换) + QueryFirstSep: ";" #指定query第一分隔符,用于分割key和特征 + QuerySecondSep: " " #指定query第二分隔符,用于分割特征各维度 + GroundTruthFile: /root/data/cohere/1m/neighbors.txt + + RecallThreadCount: 16 + BenchThreadCount: 16 #指定bench并发数(召回并发直接使用cpu核数) + BenchIterCount: 100000 #指定bench执行条目数,当query量较少时会对query重复使用 + CompareById: true + LogLevel: info + +QueryConfig: + QueryParam: '{"index_type":"kHNSW","ef_search":180}' + diff --git a/doc/draft.md b/doc/draft.md index aa1c552ac..8c5850ba7 100644 --- a/doc/draft.md +++ b/doc/draft.md @@ -16,15 +16,18 @@ ConverterParams: ``` ``` Build 阶段: - Converter::init() → 读取 enable_rotate=true,创建 rabitqlib::Rotator - Converter::transform() → 每条向量: rotator->rotate(x) → [normalize] → int8 量化 - Converter::dump() → 将 rotator 数据写入独立 segment - Streamer::dump() → 写入 meta + HNSW 图数据(不感知 converter) - meta.set_reformer() → reformer_params 中写入 enable_rotate=true + Converter::init() → 读取 enable_rotate=true,创建 rabitqlib::Rotator + Converter::transform() → 每条向量: rotator->rotate(x) → [normalize] → int8 量化 + Converter::dump_to_storage() → 将 rotator 写入 IndexStorage segment(自描述格式) + Reformer::load(storage) → 从 segment 加载 rotator(构建时由 local_builder 调用) + Reformer::convert() → 每条向量: rotator->rotate(x) → [normalize] → int8 量化 → 写入 HNSW + Streamer::dump() → 写入 meta + HNSW 图数据(不感知 converter) + meta.set_reformer() → reformer_params 中写入 enable_rotate=true Search 阶段: - Index::Open() → reformer_->load(storage_) → 从 segment 加载 rotator - Reformer::transform() → 每条 query: rotator->rotate(q) → [normalize] → int8 量化 + Index::Open() → reformer_->load(storage_) → 自动检测 storage 中的 rotator segment + 若存在则加载(无需搜索侧配置 enable_rotate),若不存在则为 no-op + Reformer::transform() → 每条 query: rotator->rotate(q) → [normalize] → int8 量化 ``` ## Int8StreamingConverter具体实现 
@@ -43,8 +46,8 @@ static const std::string INTEGER_STREAMING_REFORMER_ENABLE_ROTATE = 2. 实现方式参考/root/code/zvec/src/core/algorithm/hnsw_rabitq中的旋转方式,具体实现调用第三方库/root/code/zvec/thirdparty/RaBitQ-Library 3. 包含功能: 1. O(d \log d)复杂度的快速旋转 - 2. dump:保存矩阵(通过IndexDumper写入segment,含自描述Header + rabitqlib blob + CRC + 32字节对齐) - 3. open:从Storage加载序列化旋转器(通过IndexStorage读取segment,从Header解析type/dim/padded_dim,无需预先init,含CRC校验) + 2. dump(IndexStorage):将旋转矩阵写入 IndexStorage segment(自描述Header + rabitqlib blob + 32字节对齐) + 3. open:从Storage加载序列化旋转器(通过IndexStorage读取segment,从Header解析type/dim/padded_dim,无需预先init) 4. load:加载用户自定义旋转矩阵(MatrixRotator,行主序 dim x padded_dim) ```cpp class RecordRotator { @@ -78,10 +81,10 @@ class RecordRotator { //! Return the serialized size of the rotator in bytes (header + blob) size_t dump_bytes() const; - //! Dump the rotator to an IndexDumper as a named segment. + //! Dump the rotator to an IndexStorage as a named segment. //! Format: [Header: type(1B)|origin_dim(4B)|padded_dim(4B)] [rabitqlib blob] - //! Appends padding for 32-byte alignment, registers segment meta (id, size, padding, crc). - int dump(const IndexDumper::Pointer &dumper, + //! Appends padding for 32-byte alignment. + int dump(const IndexStorage::Pointer &storage, const std::string &seg_id = RECORD_ROTATOR_SEG_ID) const; //! Open the rotator from an IndexStorage segment (self-describing, no init needed). @@ -124,8 +127,8 @@ class RecordRotator { 5. Holder Iterator 的 `encode_record()` 管线:rotate → normalize → quantize 3. Reformer 修改: 1. `init()` 仅读取 `enable_rotate` 标记(维度信息从序列化数据自描述获取) - 2. `load()` 创建 rotator,调用 `rotator_->open(storage)` 加载旋转矩阵(open 内部从 header 解析 type/dim/padded_dim) - 3. 所有 `transform()`/`convert()` 方法在量化前应用旋转 + 2. `load(storage)` 自动检测 storage 中的 rotator segment(通过 `storage->get(RECORD_ROTATOR_SEG_ID)` 探测),若存在则创建 rotator 并调用 `rotator_->open(storage)` 加载,设置 `enable_rotate_=true`;若不存在则为 no-op。**搜索侧无需在配置中显式指定 enable_rotate** + 3. 
`transform()`/`convert()` 方法在量化前应用旋转(`convert()` 供构建侧 `do_build_by_streamer()` 调用) 4. `revert()` 在旋转模式下拒绝反量化 ### 4. 修改 Index::Open() [DONE] @@ -140,32 +143,36 @@ class RecordRotator { 4. IndexConverter 基类新增 `dump_to_storage()` 虚方法(默认 no-op),IntegerStreamingConverter 重写以持久化 rotator 5. local_builder.cc 中 `convert_holder()`/`convert_sparse_holder()` 输出 converter 指针,`build_by_streamer()`/`build_sparse_by_streamer()` 在 `streamer->open(storage)` 后调用 `converter->dump_to_storage(storage)` 6. 删除 RecordRotator::dump(IndexDumper) 死代码(DumpPath 已删除,无调用者) -7. 修改文件清单: - - `tools/core/local_builder.cc`:删除 DumpPath 代码,添加 converter 传递和 dump_to_storage 调用 +7. `do_build_by_streamer()` 新增 storage 参数,reformer `init()` 后调用 `reformer->load(storage)` 加载 rotator,确保构建侧数据向量被旋转 +8. 修改文件清单: + - `tools/core/local_builder.cc`:删除 DumpPath 代码,添加 converter 传递和 dump_to_storage 调用,`do_build_by_streamer()` 传入 storage 并加载 reformer - `src/core/quantizer/record_rotater.h/cc`:新增 dump(IndexStorage),删除 dump(IndexDumper) - `src/include/zvec/core/framework/index_converter.h`:新增 dump_to_storage() 虚方法 - `src/core/quantizer/integer_quantizer_converter.cc`:重写 dump_to_storage(),删除 dump(IndexDumper) override - -### 6. 修改运行时测试代码 -1. 测试原始功能是否有问题: -``` -./build/bin/bench /root/code/zvec/config/search_baseline.yaml -./build/bin/recall /root/code/zvec/config/search_baseline.yaml -``` -查看是否能正常运行,以检查原始功能是否出现问题 -2. 编译代码: -```cpp -cmake -DCMAKE_BUILD_TYPE=Release .. -make -j$(nproc) -``` -3. 测试代码: -索引构建: -```cpp -./build/bin/local_builder /root/code/zvec/config/construct.yaml -``` -搜索测试: -```cpp -./build/bin/bench /root/code/zvec/config/search_baseline.yaml -./build/bin/bench /root/code/zvec/config/search_current.yaml -``` -4. 运行代码,并修改错误 + - `src/core/quantizer/integer_quantizer_reformer.cc`:`load()` 改为自动检测 storage 中的 rotator segment + +### 6. 搜索侧自动检测旋转器 [DONE] +1. `IntegerStreamingReformer::load(storage)` 自动检测 storage 中的 `RECORD_ROTATOR_SEG_ID` segment +2. 若 segment 存在,创建 rotator 并从 storage 加载,设置 `enable_rotate_=true` +3. 
若 segment 不存在,为 no-op(非旋转索引正常工作) +4. 搜索侧配置 `search_current.yaml` 无需指定 `enable_rotate`,旋转信息完全由索引文件自描述 +5. 修改文件:`src/core/quantizer/integer_quantizer_reformer.cc` + +### 7. 编译配置修复 [DONE] +1. `record_rotater.cc` 包含 rabitqlib 的 `rotator.hpp`,其中 `flip_sign()` 和 `kacs_walk()` 使用编译时 `#if defined(__AVX2__)` 宏守卫 +2. 需要在 CMake 中为 `record_rotater.cc` 添加 `-march=core-avx2` 编译标志(即 `RABITQ_ARCH_FLAG`) +3. 该文件被两个 CMake 目标编译,均需要添加: + - `src/core/CMakeLists.txt`:`zvec_core` 目标 + - `src/core/quantizer/CMakeLists.txt`:`core_quantizer_objects` 目标(recall/bench 链接此目标,容易遗漏) +4. 修改文件:`src/core/CMakeLists.txt`、`src/core/quantizer/CMakeLists.txt` + +### 8. 端到端验证 [DONE] +1. 编译:`cmake -DCMAKE_BUILD_TYPE=Release .. && make -j$(nproc)` +2. 构建索引:`./build/bin/local_builder config/construct.yaml`(ConverterParams 中指定 `integer_streaming.converter.enable_rotate: true`) +3. 搜索测试:`./build/bin/bench config/search_current.yaml`、`./build/bin/recall config/search_current.yaml` +4. 实验结果(gist 100万条 960维 FP32 → INT8,ef_search=180): + +| 配置 | Recall@100 | QPS | +|---|---|---| +| Baseline(无旋转) | 84.317 | 21,715 | +| 旋转索引 | 84.165 | 22,847 | diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 2b676787d..e0b9870ad 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -12,6 +12,12 @@ if(RABITQ_SUPPORTED AND AUTO_DETECT_ARCH) ) set(HNSW_RABITQ_FILES_FULL ${HNSW_RABITQ_FILES}) list(TRANSFORM HNSW_RABITQ_FILES_FULL PREPEND "algorithm/hnsw_rabitq/") + + # record_rotater.cc includes rabitqlib's rotator.hpp which uses AVX2 + # intrinsics in flip_sign() and kacs_walk(), so it also needs the + # RABITQ_ARCH_FLAG at compile time. 
+ list(APPEND HNSW_RABITQ_FILES_FULL "quantizer/record_rotater.cc") + foreach(FILE ${HNSW_RABITQ_FILES_FULL}) set_source_files_properties( ${FILE} diff --git a/src/core/quantizer/CMakeLists.txt b/src/core/quantizer/CMakeLists.txt index 459b8b88c..0d9569551 100644 --- a/src/core/quantizer/CMakeLists.txt +++ b/src/core/quantizer/CMakeLists.txt @@ -6,6 +6,16 @@ if(NOT APPLE) "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") endif() +# record_rotater.cc includes rabitqlib's rotator.hpp which uses AVX2 +# intrinsics (flip_sign, kacs_walk), so it needs the AVX2 compile flag. +if(RABITQ_SUPPORTED AND RABITQ_ARCH_FLAG) + set_source_files_properties( + record_rotater.cc + PROPERTIES + COMPILE_FLAGS "${RABITQ_ARCH_FLAG}" + ) +endif() + cc_library( NAME core_quantizer STATIC SHARED STRICT ALWAYS_LINK diff --git a/src/core/quantizer/integer_quantizer_reformer.cc b/src/core/quantizer/integer_quantizer_reformer.cc index a263b9985..40b4de989 100644 --- a/src/core/quantizer/integer_quantizer_reformer.cc +++ b/src/core/quantizer/integer_quantizer_reformer.cc @@ -297,17 +297,31 @@ class IntegerStreamingReformer : public IndexReformer { } //! Load index from container + //! Auto-detects rotation by checking for rotator segment in storage. + //! No need for enable_rotate in search config. 
int load(IndexStorage::Pointer storage) override { - if (enable_rotate_) { + // If config explicitly enables rotate but rotator not yet loaded, try storage + // If config doesn't enable rotate, still try storage (auto-detect) + if (enable_rotate_ || storage->get(RECORD_ROTATOR_SEG_ID)) { rotator_ = std::make_shared(); int ret = rotator_->open(storage); if (ret != 0) { - LOG_ERROR("IntegerStreamingReformer: load rotator failed, ret=%d", ret); - return ret; + if (enable_rotate_) { + // Config said enable_rotate but storage has no rotator — error + LOG_ERROR( + "IntegerStreamingReformer: load rotator failed, ret=%d", ret); + rotator_.reset(); + return ret; + } + // No rotator in storage, rotation not available + rotator_.reset(); + } else { + enable_rotate_ = true; + LOG_DEBUG( + "IntegerStreamingReformer: rotator auto-loaded, origin_dim=%zu, " + "padded_dim=%zu", + rotator_->dimension(), rotator_->padded_dim()); } - LOG_DEBUG("IntegerStreamingReformer: rotator loaded, origin_dim=%zu, " - "padded_dim=%zu", - rotator_->dimension(), rotator_->padded_dim()); } return 0; } diff --git a/tools/core/local_builder.cc b/tools/core/local_builder.cc index 57fc6b6ca..f53e3ec8a 100644 --- a/tools/core/local_builder.cc +++ b/tools/core/local_builder.cc @@ -400,7 +400,8 @@ int build_sparse_by_streamer(IndexStreamer::Pointer &streamer, } int do_build_by_streamer(IndexStreamer::Pointer &streamer, - uint32_t thread_count, RetrievalMode retrieval_mode) { + uint32_t thread_count, RetrievalMode retrieval_mode, + const IndexStorage::Pointer &storage = nullptr) { int ret; ailego::ThreadPool pool(thread_count, false); std::atomic finished{0}; @@ -422,6 +423,14 @@ int do_build_by_streamer(IndexStreamer::Pointer &streamer, return IndexError_NoExist; } reformer->init(meta.reformer_params()); + // Load reformer state from storage (e.g. 
rotator for IntegerStreaming) + if (storage) { + ret = reformer->load(storage); + if (ret != 0) { + LOG_ERROR("Failed to load reformer from storage, ret=%d", ret); + return ret; + } + } } } @@ -585,7 +594,7 @@ int build_by_streamer(IndexStreamer::Pointer &streamer, LOG_DEBUG("thread count: %zu, retrieval mode: %s", thread_count, retrieval_mode == 1 ? "Dense" : "Sparse"); - do_build_by_streamer(streamer, thread_count, retrieval_mode); + do_build_by_streamer(streamer, thread_count, retrieval_mode, storage); return 0; } From 2c4b532d51482aa2bfbde82b3e80702a47847053 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Mon, 8 Jun 2026 19:40:28 +0800 Subject: [PATCH 06/38] CosineInt8Converter --- config/construct2.yaml | 4 +- doc/draft.md | 6 +++ src/core/quantizer/cosine_converter.cc | 62 +++++++++++++++++++++++--- src/core/quantizer/cosine_reformer.cc | 59 ++++++++++++++++++++---- 4 files changed, 113 insertions(+), 18 deletions(-) diff --git a/config/construct2.yaml b/config/construct2.yaml index e0bbc1387..86383b35e 100644 --- a/config/construct2.yaml +++ b/config/construct2.yaml @@ -4,12 +4,12 @@ BuilderCommon: # NeedTrain: true #是否需要走train流程 # TrainFile: /root/data/cohere/1m/cohere_train_vector_1m.norm.zvec.vecs DumpPath: ./flase.tmp - IndexPath: /root/data/cohere/1m/cohere.random2.cosine.int8.index + IndexPath: /root/data/cohere/1m/index/cohere.random2.cosine.int8.index ThreadCount: 16 MetricName: Cosine - ConverterName: CosineFp16Converter + ConverterName: CosineInt8Converter # ConverterName: Int8StreamingConverter DisableIdMap: true diff --git a/doc/draft.md b/doc/draft.md index 8c5850ba7..4889d4ae2 100644 --- a/doc/draft.md +++ b/doc/draft.md @@ -176,3 +176,9 @@ class RecordRotator { |---|---|---| | Baseline(无旋转) | 84.317 | 21,715 | | 旋转索引 | 84.165 | 22,847 | + +## CosineInt8Converter具体实现 +1. 模仿Int8StreamingConverter具体实现,将功能拓展到CosineInt8Converter +2. 构建索引:`./build/bin/local_builder config/construct2.yaml` +3. 
baseline测试:`./build/bin/bench config/search_baseline2.yaml`、`./build/bin/recall config/search_baseline2.yaml` +4. 搜索测试:`./build/bin/bench config/search_current2.yaml`、`./build/bin/recall config/search_current2.yaml` \ No newline at end of file diff --git a/src/core/quantizer/cosine_converter.cc b/src/core/quantizer/cosine_converter.cc index ded1e3eb5..a67072866 100644 --- a/src/core/quantizer/cosine_converter.cc +++ b/src/core/quantizer/cosine_converter.cc @@ -19,6 +19,7 @@ #include #include #include "record_quantizer.h" +#include "record_rotater.h" #include "../metric/metric_params.h" namespace zvec { @@ -54,6 +55,11 @@ class CosineConverterHolder : public IndexHolder { type_ == IndexMeta::DataType::DT_INT8) { buffer_.resize(element_size, 0); } + + // Allocate rotate buffer if owner has a rotator + if (owner_->rotator_) { + rotate_buffer_.resize(owner_->rotator_->padded_dim()); + } } this->convert_record(); @@ -116,17 +122,30 @@ class CosineConverterHolder : public IndexHolder { original_element_size); float *buf = reinterpret_cast(&normalize_buffer_[0]); + const float *vec = buf; + + // Apply rotation if enabled + if (owner_->rotator_) { + owner_->rotator_->rotate(vec, rotate_buffer_.data()); + vec = rotate_buffer_.data(); + } float norm = 0.0f; - ailego::Normalizer::L2(buf, original_dimension_, &norm); + ailego::Normalizer::L2( + const_cast(vec), + owner_->rotator_ ? 
owner_->rotator_->padded_dim() + : original_dimension_, + &norm); if (type_ == IndexMeta::DataType::DT_FP32) { + ::memcpy(reinterpret_cast(&normalize_buffer_[0]), + vec, original_dimension_ * sizeof(float)); ::memcpy(reinterpret_cast(&normalize_buffer_[0]) + original_dimension_, &norm, NORM_SIZE); } else if (type_ == IndexMeta::DataType::DT_FP16) { ailego::FloatHelper::ToFP16( - buf, original_dimension_, + const_cast(vec), original_dimension_, reinterpret_cast(&buffer_[0])); ::memcpy( @@ -135,8 +154,7 @@ class CosineConverterHolder : public IndexHolder { } else if (type_ == IndexMeta::DataType::DT_INT4 || type_ == IndexMeta::DataType::DT_INT8) { RecordQuantizer::quantize_record( - reinterpret_cast(normalize_buffer_.data()), - original_dimension_, type_, false, &buffer_[0]); + vec, original_dimension_, type_, false, &buffer_[0]); ::memcpy(reinterpret_cast(&buffer_[0]) + element_size - NORM_SIZE, @@ -149,6 +167,7 @@ class CosineConverterHolder : public IndexHolder { const CosineConverterHolder *owner_{nullptr}; std::string buffer_{}; std::string normalize_buffer_{}; + std::vector rotate_buffer_; IndexHolder::Iterator::Pointer front_iter_{}; size_t dimension_{0u}; size_t original_dimension_{0u}; @@ -159,11 +178,13 @@ class CosineConverterHolder : public IndexHolder { //! Constructor CosineConverterHolder(IndexHolder::Pointer front, IndexMeta::DataType original_type, - IndexMeta::DataType type) + IndexMeta::DataType type, + std::shared_ptr rotator = nullptr) : front_(std::move(front)), original_type_(original_type), type_(type), - dimension_(front_->dimension()) {} + dimension_(front_->dimension()), + rotator_(std::move(rotator)) {} //! Retrieve count of elements in holder (-1 indicates unknown) size_t count(void) const override { @@ -222,6 +243,7 @@ class CosineConverterHolder : public IndexHolder { IndexMeta::DataType original_type_{}; IndexMeta::DataType type_{}; uint32_t dimension_{0}; + std::shared_ptr rotator_{}; }; /*! 
Converter of Cosine @@ -264,7 +286,23 @@ class CosineConverter : public IndexConverter { return IndexError_Unsupported; } + // Read rotation config + params.get(INTEGER_STREAMING_CONVERTER_ENABLE_ROTATE, &enable_rotate_); + ailego::Params reformer_params; + if (enable_rotate_) { + reformer_params.set(INTEGER_STREAMING_REFORMER_ENABLE_ROTATE, true); + } + + // Compute padded dimension and create rotator if rotation is enabled + if (enable_rotate_) { + size_t dim = index_meta.dimension(); + size_t padded_dim = ((dim + 63) / 64) * 64; + rotator_ = std::make_shared(); + rotator_->init(dim, padded_dim); + LOG_DEBUG("CosineConverter: rotation enabled, dim=%zu, padded_dim=%zu", + dim, padded_dim); + } if (dst_type_ == IndexMeta::DataType::DT_INT8) { meta_.set_converter("CosineInt8Converter", 0, params); @@ -333,7 +371,7 @@ class CosineConverter : public IndexConverter { *stats_.mutable_transformed_count() += holder->count(); holder_ = std::make_shared( - holder, holder->data_type(), dst_type_); + holder, holder->data_type(), dst_type_, rotator_); return 0; } @@ -342,6 +380,14 @@ class CosineConverter : public IndexConverter { return 0; } + //! Dump converter state to storage (rotator) + int dump_to_storage(const IndexStorage::Pointer &storage) override { + if (rotator_) { + return rotator_->dump(storage); + } + return 0; + } + //! 
Retrieve statistics const Stats &stats(void) const override { return stats_; @@ -378,6 +424,8 @@ class CosineConverter : public IndexConverter { IndexHolder::Pointer holder_{}; IndexMeta::DataType original_type_{IndexMeta::DataType::DT_UNDEFINED}; IndexMeta::DataType dst_type_{IndexMeta::DataType::DT_UNDEFINED}; + bool enable_rotate_{false}; + std::shared_ptr rotator_{}; }; INDEX_FACTORY_REGISTER_CONVERTER_ALIAS(CosineNormalizeConverter, diff --git a/src/core/quantizer/cosine_reformer.cc b/src/core/quantizer/cosine_reformer.cc index d6080b8d9..e35d040c8 100644 --- a/src/core/quantizer/cosine_reformer.cc +++ b/src/core/quantizer/cosine_reformer.cc @@ -18,6 +18,7 @@ #include #include #include "record_quantizer.h" +#include "record_rotater.h" namespace zvec { namespace core { @@ -43,7 +44,8 @@ class CosineReformer : public IndexReformer { dst_type_(IndexMeta::DataType::DT_UNDEFINED) {} //! Initialize Reformer - int init(const ailego::Params & /*params*/) override { + int init(const ailego::Params ¶ms) override { + params.get(INTEGER_STREAMING_REFORMER_ENABLE_ROTATE, &enable_rotate_); return 0; } @@ -53,7 +55,26 @@ class CosineReformer : public IndexReformer { } //! Load index from container - int load(IndexStorage::Pointer) override { + //! Auto-detects rotation by checking for rotator segment in storage. 
+ int load(IndexStorage::Pointer storage) override { + if (enable_rotate_ || storage->get(RECORD_ROTATOR_SEG_ID)) { + rotator_ = std::make_shared(); + int ret = rotator_->open(storage); + if (ret != 0) { + if (enable_rotate_) { + LOG_ERROR("CosineReformer: load rotator failed, ret=%d", ret); + rotator_.reset(); + return ret; + } + rotator_.reset(); + } else { + enable_rotate_ = true; + LOG_DEBUG( + "CosineReformer: rotator auto-loaded, origin_dim=%zu, " + "padded_dim=%zu", + rotator_->dimension(), rotator_->padded_dim()); + } + } return 0; } @@ -83,28 +104,46 @@ class CosineReformer : public IndexReformer { ometa->set_meta(dst_type_, qmeta.dimension() + ExtraDimension(dst_type_)); out->resize(ometa->element_size()); - float norm = 0.0f; size_t origin_dimension = qmeta.dimension(); + const float *vec = reinterpret_cast(query); + + // Apply rotation if enabled + std::unique_ptr rotate_buffer; + if (enable_rotate_ && rotator_) { + rotate_buffer.reset(new float[rotator_->padded_dim()]); + rotator_->rotate(vec, rotate_buffer.get()); + vec = rotate_buffer.get(); + origin_dimension = rotator_->padded_dim(); + } + + // Normalize (L2) + float norm = 0.0f; std::string normalized_buffer(reinterpret_cast(query), qmeta.element_size()); - float *buf = reinterpret_cast(&normalized_buffer[0]); - - ailego::Normalizer::L2(buf, origin_dimension, &norm); + if (enable_rotate_ && rotator_) { + // Already rotated, normalize the rotated vector + ailego::Normalizer::L2(const_cast(vec), origin_dimension, + &norm); + } else { + ailego::Normalizer::L2(buf, origin_dimension, &norm); + vec = buf; + } ::memcpy(reinterpret_cast(&(*out)[0]) + ometa->element_size() - NORM_SIZE, &norm, NORM_SIZE); if (dst_type_ == IndexMeta::DataType::DT_FP32) { - ::memcpy(reinterpret_cast(&(*out)[0]), buf, + ::memcpy(reinterpret_cast(&(*out)[0]), vec, ometa->element_size() - NORM_SIZE); } else if (dst_type_ == IndexMeta::DataType::DT_FP16) { - RecordQuantizer::quantize_record(buf, origin_dimension, dst_type_, + 
RecordQuantizer::quantize_record(const_cast(vec), + qmeta.dimension(), dst_type_, false, &(*out)[0]); } else if (dst_type_ == IndexMeta::DataType::DT_INT4 || dst_type_ == IndexMeta::DataType::DT_INT8) { - RecordQuantizer::quantize_record(buf, qmeta.dimension(), dst_type_, + RecordQuantizer::quantize_record(vec, qmeta.dimension(), dst_type_, false, &(*out)[0]); } } else if (type == IndexMeta::DataType::DT_FP16) { @@ -262,6 +301,8 @@ class CosineReformer : public IndexReformer { //! Members IndexMeta::DataType original_type_{IndexMeta::DataType::DT_UNDEFINED}; IndexMeta::DataType dst_type_{IndexMeta::DataType::DT_UNDEFINED}; + bool enable_rotate_{false}; + std::shared_ptr rotator_{}; }; INDEX_FACTORY_REGISTER_REFORMER_ALIAS(CosineNormalizeReformer, CosineReformer, From aa209b431f4b2fbbb6c841bfdbfb5360825ffc90 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Mon, 8 Jun 2026 20:38:25 +0800 Subject: [PATCH 07/38] python --- doc/draft.md | 86 ++++++++++++++++++- .../python/model/param/python_param.cc | 24 ++++-- src/core/interface/index.cc | 11 +++ .../column/vector_column/engine_helper.hpp | 2 + src/db/index/common/proto_converter.cc | 4 +- src/db/proto/zvec.proto | 3 + src/include/zvec/core/interface/index_param.h | 1 + .../core/interface/index_param_builders.h | 5 ++ src/include/zvec/db/index_params.h | 27 ++++-- 9 files changed, 150 insertions(+), 13 deletions(-) diff --git a/doc/draft.md b/doc/draft.md index 4889d4ae2..a321510df 100644 --- a/doc/draft.md +++ b/doc/draft.md @@ -181,4 +181,88 @@ class RecordRotator { 1. 模仿Int8StreamingConverter具体实现,将功能拓展到CosineInt8Converter 2. 构建索引:`./build/bin/local_builder config/construct2.yaml` 3. baseline测试:`./build/bin/bench config/search_baseline2.yaml`、`./build/bin/recall config/search_baseline2.yaml` -4. 搜索测试:`./build/bin/bench config/search_current2.yaml`、`./build/bin/recall config/search_current2.yaml` \ No newline at end of file +4. 
搜索测试:`./build/bin/bench config/search_current2.yaml`、`./build/bin/recall config/search_current2.yaml` + +## python层接口 +1. 当前的enable_rotate仅仅支持int8,如果不是int8却有该配置,默认无效并警告 +2. 实现时上层和下层尽量解耦合 + +### 实现方案 [DONE] + +#### 层级解耦设计 +``` +Python SDK (HnswIndexParam) + ↓ pybind11 +DB Layer (HnswIndexParams) — enable_rotate_ 用户接口 + ↓ engine_helper.hpp +Core Layer (BaseIndexParam) — enable_rotate 内部字段 + ↓ index.cc CreateAndInitConverterReformer() +Converter Layer — converter_params.set("integer_streaming.converter.enable_rotate", true) +``` + +#### 修改文件清单 +1. `src/include/zvec/db/index_params.h`:`HnswIndexParams` 新增 `enable_rotate_` 成员、构造函数参数、getter/setter、clone/to_string/operator== +2. `src/include/zvec/core/interface/index_param.h`:`BaseIndexParam` 新增 `enable_rotate` 字段 +3. `src/include/zvec/core/interface/index_param_builders.h`:`BaseIndexParamBuilder` 新增 `WithEnableRotate()` 方法 +4. `src/db/index/column/vector_column/engine_helper.hpp`:HNSW 分支调用 `WithEnableRotate(db_index_params->enable_rotate())` +5. `src/core/interface/index.cc`:`CreateAndInitConverterReformer()` 检查 `index_param.enable_rotate`,仅 INT8 生效,非 INT8 打印 WARN +6. `src/db/proto/zvec.proto`:`HnswIndexParams` message 新增 `enable_rotate = 5` +7. `src/db/index/common/proto_converter.cc`:`FromPb`/`ToPb` 处理 `enable_rotate` +8. `src/binding/python/model/param/python_param.cc`:`HnswIndexParam` pybind11 绑定新增 `enable_rotate` 参数、property、to_dict/repr/pickle + +#### Python 使用方式 +```python +from zvec import HnswIndexParam, MetricType, QuantizeType + +# 创建带旋转的 INT8 索引 +params = HnswIndexParam( + metric_type=MetricType.COSINE, + m=15, + ef_construction=500, + quantize_type=QuantizeType.INT8, + enable_rotate=True, # 新增参数 +) +print(params) +# {"metric_type":COSINE, "m":15, "ef_construction":500, "quantize_type":INT8, "use_contiguous_memory":false, "enable_rotate":true} +``` + +## 对接 VectorDBBench +1. 环境: +``` +conda activate baseline +``` +2. 
原始指令 +``` +vectordbbench zvec \ +--path /root/code/VectorDBBench/db/cohere-1m \ +--db-label 16c64g-v0.1 \ +--case-type Performance768D1M \ +--num-concurrency 16 \ +--quantize-type int8 \ +--m 15 \ +--ef-search 180 \ +--skip-drop-old \ +--skip-load +``` +3. 对接随机旋转方式,改为: +``` +vectordbbench zvec \ +--path /root/code/VectorDBBench/db/cohere-1m-exp \ +--db-label 16c64g-v0.1 \ +--case-type Performance768D1M \ +--num-concurrency 16 \ +--quantize-type int8 \ +--m 15 \ +--ef-search 180 \ +--enable_rotate \ +--skip-drop-old \ +--skip-load +``` +从而开启随机旋转方式,然后测试 +4. 修改/root/code/VectorDBBench完成对接, +``` +cd /root/code/VectorDBBench +pip install -e . +``` +进行安装 +5. 进行测试,没有--skip-drop-old --skip-load为构建,有则为搜索 diff --git a/src/binding/python/model/param/python_param.cc b/src/binding/python/model/param/python_param.cc index 4da41f926..8f0fd763a 100644 --- a/src/binding/python/model/param/python_param.cc +++ b/src/binding/python/model/param/python_param.cc @@ -421,13 +421,14 @@ encapsulates its construction hyperparameters. {'metric_type': 'IP', 'm': 16, 'ef_construction': 200, 'quantize_type': 'INT8', 'use_contiguous_memory': True} )pbdoc"); hnsw_params - .def(py::init(), + .def(py::init(), py::arg("metric_type") = MetricType::IP, py::arg("m") = core_interface::kDefaultHnswNeighborCnt, py::arg("ef_construction") = core_interface::kDefaultHnswEfConstruction, py::arg("quantize_type") = QuantizeType::UNDEFINED, - py::arg("use_contiguous_memory") = false) + py::arg("use_contiguous_memory") = false, + py::arg("enable_rotate") = false) .def_property_readonly( "m", &HnswIndexParams::m, "int: Maximum number of neighbors per node in upper layers.") @@ -439,6 +440,11 @@ encapsulates its construction hyperparameters. "bool: Whether to allocate a single contiguous memory arena for " "all HNSW graph nodes. Improves cache locality and search " "throughput at the cost of peak memory usage. 
Defaults to False.") + .def_property_readonly( + "enable_rotate", &HnswIndexParams::enable_rotate, + "bool: Whether to apply random rotation before INT8 quantization " + "to reduce quantization error. Only effective with " + "quantize_type=INT8. Defaults to False.") .def( "to_dict", [](const HnswIndexParams &self) -> py::dict { @@ -450,6 +456,7 @@ encapsulates its construction hyperparameters. dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); dict["use_contiguous_memory"] = self.use_contiguous_memory(); + dict["enable_rotate"] = self.enable_rotate(); return dict; }, "Convert to dictionary with all fields") @@ -464,20 +471,25 @@ encapsulates its construction hyperparameters. ", \"quantize_type\":" + quantize_type_to_string(self.quantize_type()) + ", \"use_contiguous_memory\":" + - (self.use_contiguous_memory() ? "true" : "false") + "}"; + (self.use_contiguous_memory() ? "true" : "false") + + ", \"enable_rotate\":" + + (self.enable_rotate() ? "true" : "false") + "}"; }) .def(py::pickle( [](const HnswIndexParams &self) { return py::make_tuple(self.metric_type(), self.m(), self.ef_construction(), self.quantize_type(), - self.use_contiguous_memory()); + self.use_contiguous_memory(), + self.enable_rotate()); }, [](py::tuple t) { - if (t.size() != 5) + if (t.size() != 5 && t.size() != 6) throw std::runtime_error("Invalid state for HnswIndexParams"); + bool enable_rotate = t.size() >= 6 ? 
t[5].cast() : false; return std::make_shared( t[0].cast(), t[1].cast(), t[2].cast(), - t[3].cast(), t[4].cast()); + t[3].cast(), t[4].cast(), + enable_rotate); })); // binding hnsw rabitq index params diff --git a/src/core/interface/index.cc b/src/core/interface/index.cc index 84df683a8..4a40cbb77 100644 --- a/src/core/interface/index.cc +++ b/src/core/interface/index.cc @@ -178,6 +178,17 @@ int Index::CreateAndInitConverterReformer(const QuantizerParam ¶m, } } + // Pass enable_rotate to converter_params (only effective for INT8) + if (index_param.enable_rotate) { + if (param.type == QuantizerType::kInt8) { + converter_params.set("integer_streaming.converter.enable_rotate", true); + } else { + LOG_WARN( + "enable_rotate is only supported for INT8 quantizer, " + "ignoring for current quantizer type"); + } + } + proxima_index_meta_.set_converter(converter_name, 0, converter_params); converter_ = core::IndexFactory::CreateConverter(converter_name); if (converter_ == nullptr || diff --git a/src/db/index/column/vector_column/engine_helper.hpp b/src/db/index/column/vector_column/engine_helper.hpp index 29569dcb7..6843b0bf9 100644 --- a/src/db/index/column/vector_column/engine_helper.hpp +++ b/src/db/index/column/vector_column/engine_helper.hpp @@ -380,6 +380,8 @@ class ProximaEngineHelper { db_index_params->ef_construction()); index_param_builder->WithUseContiguousMemory( db_index_params->use_contiguous_memory()); + index_param_builder->WithEnableRotate( + db_index_params->enable_rotate()); return index_param_builder->Build(); } diff --git a/src/db/index/common/proto_converter.cc b/src/db/index/common/proto_converter.cc index faf0cf0e3..f39d005ab 100644 --- a/src/db/index/common/proto_converter.cc +++ b/src/db/index/common/proto_converter.cc @@ -22,7 +22,8 @@ HnswIndexParams::OPtr ProtoConverter::FromPb( MetricTypeCodeBook::Get(params_pb.base().metric_type()), params_pb.m(), params_pb.ef_construction(), QuantizeTypeCodeBook::Get(params_pb.base().quantize_type()), - 
params_pb.use_contiguous_memory()); + params_pb.use_contiguous_memory(), + params_pb.enable_rotate()); return params; } @@ -36,6 +37,7 @@ proto::HnswIndexParams ProtoConverter::ToPb(const HnswIndexParams *params) { params_pb.set_ef_construction(params->ef_construction()); params_pb.set_m(params->m()); params_pb.set_use_contiguous_memory(params->use_contiguous_memory()); + params_pb.set_enable_rotate(params->enable_rotate()); return params_pb; } diff --git a/src/db/proto/zvec.proto b/src/db/proto/zvec.proto index ad6cfb158..dc423372c 100644 --- a/src/db/proto/zvec.proto +++ b/src/db/proto/zvec.proto @@ -100,6 +100,9 @@ message HnswIndexParams { // arena for all graph nodes, which improves cache locality / search // throughput at the cost of peak memory usage. Defaults to false. bool use_contiguous_memory = 4; + // When enabled, vectors are rotated before INT8 quantization to reduce + // quantization error. Only effective with quantize_type=INT8. + bool enable_rotate = 5; } message HnswRabitqIndexParams { diff --git a/src/include/zvec/core/interface/index_param.h b/src/include/zvec/core/interface/index_param.h index cd617b237..33eac004c 100644 --- a/src/include/zvec/core/interface/index_param.h +++ b/src/include/zvec/core/interface/index_param.h @@ -244,6 +244,7 @@ class BaseIndexParam : public SerializableBase { bool is_huge_page = false; DataType data_type = DataType::DT_UNDEFINED; bool use_id_map = true; + bool enable_rotate = false; // IndexMeta meta; ailego::Params params; diff --git a/src/include/zvec/core/interface/index_param_builders.h b/src/include/zvec/core/interface/index_param_builders.h index 75fc67b9f..e43408ee8 100644 --- a/src/include/zvec/core/interface/index_param_builders.h +++ b/src/include/zvec/core/interface/index_param_builders.h @@ -88,6 +88,11 @@ class BaseIndexParamBuilder { // : public return static_cast(*this); } + ActualIndexParamBuilderType &WithEnableRotate(bool enable_rotate) { + param->enable_rotate = enable_rotate; + return 
static_cast(*this); + } + virtual std::shared_ptr Build() = 0; protected: diff --git a/src/include/zvec/db/index_params.h b/src/include/zvec/db/index_params.h index c19cf8028..b55efeb8e 100644 --- a/src/include/zvec/db/index_params.h +++ b/src/include/zvec/db/index_params.h @@ -165,11 +165,13 @@ class HnswIndexParams : public VectorIndexParams { MetricType metric_type, int m = core_interface::kDefaultHnswNeighborCnt, int ef_construction = core_interface::kDefaultHnswEfConstruction, QuantizeType quantize_type = QuantizeType::UNDEFINED, - bool use_contiguous_memory = false) + bool use_contiguous_memory = false, + bool enable_rotate = false) : VectorIndexParams(IndexType::HNSW, metric_type, quantize_type), m_(m), ef_construction_(ef_construction), - use_contiguous_memory_(use_contiguous_memory) {} + use_contiguous_memory_(use_contiguous_memory), + enable_rotate_(enable_rotate) {} using OPtr = std::shared_ptr; @@ -177,7 +179,8 @@ class HnswIndexParams : public VectorIndexParams { Ptr clone() const override { return std::make_shared(metric_type_, m_, ef_construction_, quantize_type_, - use_contiguous_memory_); + use_contiguous_memory_, + enable_rotate_); } std::string to_string() const override { @@ -186,7 +189,9 @@ class HnswIndexParams : public VectorIndexParams { std::ostringstream oss; oss << base_str << ",m:" << m_ << ",ef_construction:" << ef_construction_ << ",use_contiguous_memory:" - << (use_contiguous_memory_ ? "true" : "false") << "}"; + << (use_contiguous_memory_ ? "true" : "false") + << ",enable_rotate:" + << (enable_rotate_ ? 
"true" : "false") << "}"; return oss.str(); } @@ -200,7 +205,9 @@ class HnswIndexParams : public VectorIndexParams { quantize_type() == static_cast(other).quantize_type() && use_contiguous_memory_ == static_cast(other) - .use_contiguous_memory_; + .use_contiguous_memory_ && + enable_rotate_ == static_cast(other) + .enable_rotate_; } void set_m(int m) { @@ -223,6 +230,13 @@ class HnswIndexParams : public VectorIndexParams { return use_contiguous_memory_; } + void set_enable_rotate(bool enable_rotate) { + enable_rotate_ = enable_rotate; + } + bool enable_rotate() const { + return enable_rotate_; + } + protected: int m_; int ef_construction_; @@ -231,6 +245,9 @@ class HnswIndexParams : public VectorIndexParams { // the cost of peak memory usage. Defaults to false for backward // compatibility. bool use_contiguous_memory_{false}; + // When enabled, vectors are rotated before INT8 quantization to reduce + // quantization error. Only effective with quantize_type=INT8. + bool enable_rotate_{false}; }; class HnswRabitqIndexParams : public VectorIndexParams { From efb044396ec02249c9187b8985e898d373da0c78 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 9 Jun 2026 15:01:01 +0800 Subject: [PATCH 08/38] over --- doc/draft.md | 70 ++++++- examples/python/int8_rotate_build.py | 237 +++++++++++++++++++++ examples/python/int8_rotate_query.py | 228 +++++++++++++++++++++ logs/vectordb_bench.log | 295 +++++++++++++++++++++++++++ src/core/interface/index.cc | 6 + 5 files changed, 826 insertions(+), 10 deletions(-) create mode 100644 examples/python/int8_rotate_build.py create mode 100644 examples/python/int8_rotate_query.py create mode 100644 logs/vectordb_bench.log diff --git a/doc/draft.md b/doc/draft.md index a321510df..b1bbfa38d 100644 --- a/doc/draft.md +++ b/doc/draft.md @@ -40,6 +40,13 @@ static const std::string INTEGER_STREAMING_CONVERTER_ENABLE_ROTATE = static const std::string INTEGER_STREAMING_REFORMER_ENABLE_ROTATE = 
"integer_streaming.reformer.enable_rotate"; ``` +【check】: CosineInt8Converter层和IntegerStreamingConverter共用同一个标志不太好: +改为: +```cpp +integer_streaming.converter.enable_rotate +consine.converter.enable_rotate +``` + ### 2. 新增矩阵旋转工具类 [DONE] 1. 便于拓展,将旋转功能抽象到统一的文件`/root/code/zvec/src/core/quantizer/record_rotater.h`和`record_rotater.cc`中(pimpl模式,rabitqlib依赖仅在.cc中) @@ -116,6 +123,7 @@ class RecordRotator { std::unique_ptr impl_; }; ``` +【check】: 当前直接复用rabitq的旋转方法,可能不太好,待修正 ### 3. 修改 IntegerStreaming 的 Converter 和 Reformer [DONE] 1. 修改文件:`integer_quantizer_converter.cc` 和 `integer_quantizer_reformer.cc` @@ -130,7 +138,7 @@ class RecordRotator { 2. `load(storage)` 自动检测 storage 中的 rotator segment(通过 `storage->get(RECORD_ROTATOR_SEG_ID)` 探测),若存在则创建 rotator 并调用 `rotator_->open(storage)` 加载,设置 `enable_rotate_=true`;若不存在则为 no-op。**搜索侧无需在配置中显式指定 enable_rotate** 3. `transform()`/`convert()` 方法在量化前应用旋转(`convert()` 供构建侧 `do_build_by_streamer()` 调用) 4. `revert()` 在旋转模式下拒绝反量化 - +【check】: ### 4. 修改 Index::Open() [DONE] 1. 修改代码:`src/core/interface/index.cc` 2. 在 `Index::Open()` 中 streamer 打开后,调用 `reformer_->load(storage_)` 加载序列化数据(旋转矩阵等) @@ -209,6 +217,9 @@ Converter Layer — converter_params.set("integer_streaming.converter.enable_rot 6. `src/db/proto/zvec.proto`:`HnswIndexParams` message 新增 `enable_rotate = 5` 7. `src/db/index/common/proto_converter.cc`:`FromPb`/`ToPb` 处理 `enable_rotate` 8. `src/binding/python/model/param/python_param.cc`:`HnswIndexParam` pybind11 绑定新增 `enable_rotate` 参数、property、to_dict/repr/pickle +9. `src/core/interface/index.cc`:`Index::Open()` 新增 `create_new` 时 `converter_->dump_to_storage()` 逻辑,修复 DB 构建路径 Reformer 加载 rotator 失败 +10. `examples/python/int8_rotate_build.py`:新增 Python INT8+rotate 构建示例 +11. 
`examples/python/int8_rotate_query.py`:新增 Python INT8+rotate 查询示例 #### Python 使用方式 ```python @@ -226,6 +237,37 @@ print(params) # {"metric_type":COSINE, "m":15, "ef_construction":500, "quantize_type":INT8, "use_contiguous_memory":false, "enable_rotate":true} ``` +#### Python 示例脚本(已完成) +模仿 `dco_build.py` / `dco_query.py`,将 `construct2.yaml` / `search_current2.yaml` 在 Python 层实现。 + +- **构建脚本**:`examples/python/int8_rotate_build.py` + - 读取 `.zvec.vecs` 文件,创建 Collection(INT8 + enable_rotate=True + COSINE) + - 触发 CosineInt8Converter + FhtKacRotator + - 插入 → optimize → flush +- **查询脚本**:`examples/python/int8_rotate_query.py` + - 打开已构建的 Collection,加载 Reformer(自动从 storage 检测 rotator) + - 执行 search + recall 评估 + +#### 额外修复 +`src/core/interface/index.cc` `Index::Open()` 新增 L310-314:DB 构建路径(`create_new=true`)下,先将 Converter 的 rotator dump 到 storage,再让 Reformer load。修复前 DB optimize 阶段会因 Reformer 找不到 rotator segment 而失败。 + +```cpp +// When building a new index, dump converter state (e.g., rotator) to +// storage so the reformer can load it. +if (storage_options.create_new && converter_ != nullptr) { + converter_->dump_to_storage(storage_); +} +``` + +#### 测试结果(Cohere 1M, dim=768, Cosine, INT8+rotate, m=15, ef_construction=500) + +| 指标 | 数值 | +|------|------| +| 插入速度 | ~20k docs/s | +| HNSW 构建 | 111.5s | +| QPS (ef=180) | ~1344 | +| recall@100 | 94.03% | + ## 对接 VectorDBBench 1. 环境: ``` @@ -254,15 +296,23 @@ vectordbbench zvec \ --quantize-type int8 \ --m 15 \ --ef-search 180 \ ---enable_rotate \ +--enable-rotate \ --skip-drop-old \ --skip-load ``` -从而开启随机旋转方式,然后测试 -4. 修改/root/code/VectorDBBench完成对接, -``` -cd /root/code/VectorDBBench -pip install -e . -``` -进行安装 -5. 进行测试,没有--skip-drop-old --skip-load为构建,有则为搜索 +4. 
修改 `/root/code/VectorDBBench` 完成对接(已完成): + - `vectordb_bench/backend/clients/zvec/cli.py`:新增 `--enable-rotate` CLI flag + - `vectordb_bench/backend/clients/zvec/config.py`:`ZvecHNSWIndexConfig` 新增 `enable_rotate: bool = False` + - `vectordb_bench/backend/clients/zvec/zvec.py`:`_parse_index_param()` 传递 `enable_rotate` 到 `HnswIndexParam` + - `pip install -e .` 安装 +5. 测试结果(已完成,Cohere 1M, 768D, Cosine, INT8+rotate, m=15, ef_search=180) + +| 指标 | 数值 | +|------|------| +| 插入耗时 | 33.6s | +| 优化耗时 | 109.8s | +| 并发 QPS (16线程) | **13,989** | +| recall@100 | **93.97%** | +| NDCG | 94.91% | +| 串行延迟 p99 | 1.4ms | +| 串行延迟 p95 | 0.8ms | diff --git a/examples/python/int8_rotate_build.py b/examples/python/int8_rotate_build.py new file mode 100644 index 000000000..f519ee672 --- /dev/null +++ b/examples/python/int8_rotate_build.py @@ -0,0 +1,237 @@ +""" +Zvec Python API — INT8 + Random Rotation Build Example +======================================================= + +Builds a zvec Collection with INT8 quantization and random rotation +(CosineInt8Converter + FhtKacRotator) enabled. + +The key configuration is: + quantize_type=QuantizeType.INT8, enable_rotate=True + +This triggers the C++ CosineInt8Converter to: + 1. Create a FhtKacRotator (random orthogonal rotation matrix) + 2. Rotate all data vectors before INT8 quantization + 3. 
Store the rotator state in the index meta for search-side query rotation + +Equivalent C++ config (construct2.yaml): + ConverterName: CosineInt8Converter + ConverterParams: + integer_streaming.converter.enable_rotate: !!bool true + +Input : /root/data/cohere/1m/cohere_train_vector_1m.norm.zvec.vecs +Output: /root/data/cohere/1m/db/cohere_cosine_int8_rotate + +Usage:: + + conda activate baseline + python int8_rotate_build.py +""" + +from __future__ import annotations + +import mmap +import os +import shutil +import struct +import time + +import numpy as np + +import zvec +from zvec import ( + CollectionOption, + DataType, + Doc, + FieldSchema, + HnswIndexParam, + InvertIndexParam, + LogLevel, + LogType, + MetricType, + OptimizeOption, + QuantizeType, + VectorSchema, +) + +# ==================== Configuration ==================== + +VECS_FILE = "/root/data/cohere/1m/cohere_train_vector_1m.norm.zvec.vecs" +COLLECTION_PATH = "/root/data/cohere/1m/db/cohere_cosine_int8_rotate" + +DIMENSION = 768 +METRIC_TYPE = MetricType.COSINE +HNSW_M = 15 +EF_CONSTRUCTION = 500 + +INSERT_BATCH_SIZE = 1000 + +# ==================== .zvec.vecs Parser ==================== + +VECS_HEADER_FMT = " None: + print("=" * 60) + print(" Zvec Python API — INT8 + Rotate Build Example") + print("=" * 60) + + # ---- Step 1: Init zvec ---- + print("\n[Step 1] Initializing zvec ...") + zvec.init(log_type=LogType.CONSOLE, log_level=LogLevel.INFO) + print(" Done.") + + # ---- Step 2: Parse .zvec.vecs header ---- + print(f"\n[Step 2] Parsing vecs file: {VECS_FILE}") + num_vecs, meta_size, data_start, offsets = parse_vecs_file(VECS_FILE) + dense_offset, dense_size = offsets["dense"] + key_offset, key_size = offsets["key"] + + elem_size = dense_size // num_vecs + vec_dim_floats = elem_size // 4 + print(f" num_vecs: {num_vecs:,}, dim: {vec_dim_floats}") + assert vec_dim_floats == DIMENSION + + # ---- Step 3: Create collection with INT8 + enable_rotate ---- + print(f"\n[Step 3] Creating collection at 
{COLLECTION_PATH} ...") + print(f" quantize_type = QuantizeType.INT8 + enable_rotate=True") + print(f" metric_type = MetricType.COSINE") + print(f" → CosineInt8Converter + FhtKacRotator") + + index_param = HnswIndexParam( + metric_type=METRIC_TYPE, + m=HNSW_M, + ef_construction=EF_CONSTRUCTION, + quantize_type=QuantizeType.INT8, + enable_rotate=True, + ) + print(f" index_param = {index_param}") + + schema = zvec.CollectionSchema( + name="cohere_cosine_int8_rotate", + fields=[ + FieldSchema( + "id", + DataType.INT64, + nullable=False, + index_param=InvertIndexParam(enable_range_optimization=True), + ), + ], + vectors=[ + VectorSchema( + "embedding", + DataType.VECTOR_FP32, + dimension=DIMENSION, + index_param=index_param, + ), + ], + ) + + os.makedirs(os.path.dirname(COLLECTION_PATH), exist_ok=True) + + if os.path.exists(COLLECTION_PATH): + print(f" Removing existing collection ...") + shutil.rmtree(COLLECTION_PATH) + + collection = zvec.create_and_open( + path=COLLECTION_PATH, + schema=schema, + option=CollectionOption(read_only=False, enable_mmap=True), + ) + print(f" Collection created: {collection.schema.name}") + + # ---- Step 4: Read vectors via mmap and insert ---- + print(f"\n[Step 4] Inserting {num_vecs:,} vectors " + f"(batch_size={INSERT_BATCH_SIZE}) ...") + + insert_start = time.perf_counter() + total_inserted = 0 + + with open(VECS_FILE, "rb") as f: + with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: + dense_abs = data_start + dense_offset + key_abs = data_start + key_offset + + batch_docs: list[Doc] = [] + + for i in range(num_vecs): + key_val = struct.unpack_from("= INSERT_BATCH_SIZE: + results = collection.insert(batch_docs) + ok = sum(1 for r in results if r.ok()) + total_inserted += ok + if (total_inserted % 50_000) == 0 or total_inserted == num_vecs: + elapsed = time.perf_counter() - insert_start + speed = total_inserted / elapsed if elapsed > 0 else 0 + print(f" [{total_inserted:>8,} / {num_vecs:,}] " + f"{speed:.0f} docs/s") + 
batch_docs.clear() + + if batch_docs: + results = collection.insert(batch_docs) + ok = sum(1 for r in results if r.ok()) + total_inserted += ok + + insert_elapsed = time.perf_counter() - insert_start + print(f"\n Insert complete: {total_inserted:,} docs " + f"in {insert_elapsed:.1f}s " + f"({total_inserted / insert_elapsed:.0f} docs/s)") + + # ---- Step 5: Optimize (build HNSW graph with INT8 + rotation) ---- + print(f"\n[Step 5] Optimizing collection (building HNSW index with " + f"INT8 + rotation) ...") + opt_start = time.perf_counter() + collection.optimize(option=OptimizeOption()) + opt_elapsed = time.perf_counter() - opt_start + print(f" Optimize done in {opt_elapsed:.1f}s") + + # ---- Step 6: Flush ---- + print(f"\n[Step 6] Flushing collection ...") + collection.flush() + print(f" Doc count: {collection.stats.doc_count:,}") + print(" Done.") + + print(f"\n{'=' * 60}") + print(f" Build complete!") + print(f" Collection saved to: {COLLECTION_PATH}") + print(f" Run int8_rotate_query.py to search and evaluate.") + print(f"{'=' * 60}") + + +if __name__ == "__main__": + main() diff --git a/examples/python/int8_rotate_query.py b/examples/python/int8_rotate_query.py new file mode 100644 index 000000000..f57c3e4e8 --- /dev/null +++ b/examples/python/int8_rotate_query.py @@ -0,0 +1,228 @@ +""" +Zvec Python API — INT8 + Random Rotation Query Example +======================================================= + +Opens an INT8 + rotation enabled collection (built by int8_rotate_build.py), +runs vector searches, and evaluates recall against ground truth. + +The reformer (CosineInt8Reformer) is automatically loaded from the stored +index meta during collection.open(), which rotates query vectors using the +same FhtKacRotator that was used during build. 
+ +Equivalent C++ config (search_current2.yaml): + IndexConfig: '{"quantizer_param":{"type":"kInt8"},"metric_type":"kCosine","m":15,...}' + QueryConfig: '{"index_type":"kHNSW","ef_search":180}' + +Configuration: + Collection : /root/data/cohere/1m/db/cohere_cosine_int8_rotate + TopK : 100 + QueryFile : /root/data/cohere/1m/cohere_test_vector_1m.1000.norm.txt + GroundTruth: /root/data/cohere/1m/neighbors.txt + ef_search : 180 + +Usage:: + + conda activate baseline + python int8_rotate_query.py +""" + +from __future__ import annotations + +import os +import time +from typing import Optional + +import numpy as np + +import zvec +from zvec import ( + CollectionOption, + HnswQueryParam, + LogLevel, + LogType, + Query, +) + +# ==================== Configuration ==================== + +COLLECTION_PATH = "/root/data/cohere/1m/db/cohere_cosine_int8_rotate" +QUERY_FILE = "/root/data/cohere/1m/cohere_test_vector_1m.1000.norm.txt" +GROUNDTRUTH_FILE = "/root/data/cohere/1m/neighbors.txt" + +DIMENSION = 768 +TOPK = 100 +EF_SEARCH = 180 +MAX_QUERIES = 1000 +WARMUP_ROUNDS = 1 +MEASURE_ROUNDS = 3 + + +# ==================== File Parsers ==================== + +def parse_query_file( + path: str, + dimension: int, + first_sep: str = ";", + second_sep: str = " ", + max_queries: int = 0, +) -> list[tuple[Optional[str], np.ndarray]]: + """Parse query file in ``key;v1 v2 v3 ...`` format.""" + queries: list[tuple[Optional[str], np.ndarray]] = [] + + with open(path, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + + parts = line.split(first_sep, 1) + key = parts[0].strip() if parts else None + + vec_str = parts[1].strip().rstrip(first_sep).strip() if len(parts) > 1 else "" + vec_strs = vec_str.split(second_sep) if vec_str else [] + vector = np.array([float(v) for v in vec_strs], dtype=np.float32) + + if len(vector) != dimension: + print(f" Warning: query {key} has dim={len(vector)}, " + f"expected {dimension}, skipping") + continue + + queries.append((key, 
vector)) + if max_queries and len(queries) >= max_queries: + break + + return queries + + +def parse_groundtruth_file( + path: str, + first_sep: str = ";", + second_sep: str = " ", +) -> dict[str, list[str]]: + """Parse ground truth file in ``key;id1 id2 id3 ...`` format.""" + gt: dict[str, list[str]] = {} + + with open(path, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + + parts = line.split(first_sep, 1) + key = parts[0].strip() + + ids_str = parts[1].strip().rstrip(first_sep).strip() if len(parts) > 1 else "" + ids = ids_str.split(second_sep) if ids_str else [] + gt[key] = ids + + return gt + + +# ==================== Main ==================== + +def main() -> None: + print("=" * 60) + print(" Zvec Python API — INT8 + Rotate Query Example") + print("=" * 60) + + # ---- Step 1: Init zvec ---- + print("\n[Step 1] Initializing zvec ...") + zvec.init(log_type=LogType.CONSOLE, log_level=LogLevel.INFO) + print(" Done.") + + # ---- Step 2: Open collection ---- + print(f"\n[Step 2] Opening collection: {COLLECTION_PATH}") + collection = zvec.open( + path=COLLECTION_PATH, + option=CollectionOption(read_only=True, enable_mmap=True), + ) + print(f" Collection : {collection.schema.name}") + print(f" Doc count : {collection.stats.doc_count:,}") + print(f" Dimension : {DIMENSION}") + print(f" TopK : {TOPK}") + print(f" ef_search : {EF_SEARCH}") + + # ---- Step 3: Load queries ---- + print(f"\n[Step 3] Loading queries from: {QUERY_FILE}") + queries = parse_query_file(QUERY_FILE, DIMENSION, + max_queries=MAX_QUERIES) + num_queries = len(queries) + print(f" Loaded {num_queries} queries.") + + # ---- Step 4: Load ground truth ---- + gt: dict[str, list[str]] = {} + if os.path.exists(GROUNDTRUTH_FILE): + print(f"\n[Step 4] Loading ground truth from: {GROUNDTRUTH_FILE}") + gt = parse_groundtruth_file(GROUNDTRUTH_FILE) + print(f" Loaded ground truth for {len(gt)} queries.") + else: + print(f"\n[Step 4] Ground truth not found, skipping recall eval.") + + 
# ---- Step 5: Run rounds (warmup + measured) ---- + total_rounds = WARMUP_ROUNDS + MEASURE_ROUNDS + print(f"\n[Step 5] Running {total_rounds} rounds " + f"({WARMUP_ROUNDS} warmup + {MEASURE_ROUNDS} measured), " + f"{num_queries} queries/round ...") + + round_qps_list: list[float] = [] + round_recall_list: list[float] = [] + + for rnd in range(total_rounds): + is_warmup = rnd < WARMUP_ROUNDS + label = "warmup" if is_warmup else f"measured-{rnd - WARMUP_ROUNDS + 1}" + + search_start = time.perf_counter() + total_recall = 0.0 + matched = 0 + + for idx, (key, vec) in enumerate(queries): + vq = Query( + field_name="embedding", + vector=vec.tolist(), + param=HnswQueryParam(ef=EF_SEARCH), + ) + results = collection.query(queries=vq, topk=TOPK) + qid = key if key is not None else str(idx) + + if qid in gt: + gt_ids = set(gt[qid][:TOPK]) + if gt_ids: + hit = sum(1 for d in results if d.id in gt_ids) + recall = hit / len(gt_ids) + total_recall += recall + matched += 1 + + search_elapsed = time.perf_counter() - search_start + rnd_qps = num_queries / search_elapsed if search_elapsed > 0 else 0 + rnd_recall = (total_recall / matched * 100) if matched > 0 else 0.0 + + if is_warmup: + print(f" [Round {rnd + 1}/{total_rounds}] {label}: " + f"QPS={rnd_qps:.1f} recall@{TOPK}={rnd_recall:.2f}% (discarded)") + else: + round_qps_list.append(rnd_qps) + round_recall_list.append(rnd_recall) + print(f" [Round {rnd + 1}/{total_rounds}] {label}: " + f"QPS={rnd_qps:.1f} recall@{TOPK}={rnd_recall:.2f}%") + + # ---- Step 6: Summary ---- + avg_qps = sum(round_qps_list) / len(round_qps_list) if round_qps_list else 0 + avg_recall = sum(round_recall_list) / len(round_recall_list) if round_recall_list else 0 + min_qps = min(round_qps_list) if round_qps_list else 0 + max_qps = max(round_qps_list) if round_qps_list else 0 + + print(f"\n[Step 6] Summary") + print(f" Warmup rounds : {WARMUP_ROUNDS}") + print(f" Measured rounds : {MEASURE_ROUNDS}") + print(f" Queries/round : {num_queries}") + print(f" 
Avg QPS : {avg_qps:.1f} (min={min_qps:.1f}, max={max_qps:.1f})") + if round_recall_list: + print(f" Avg recall@{TOPK} : {avg_recall:.2f}%") + else: + print(f" Avg recall@{TOPK} : N/A (no ground truth)") + + print(f"\n{'=' * 60}") + + +if __name__ == "__main__": + main() diff --git a/logs/vectordb_bench.log b/logs/vectordb_bench.log new file mode 100644 index 000000000..6407fc45e --- /dev/null +++ b/logs/vectordb_bench.log @@ -0,0 +1,295 @@ +2026-06-09 11:34:55,964 | INFO |Task: +TaskConfig(db=, db_config=ZvecConfig(db_label='16c64g-v0.1', version='', note='', path='/root/code/VectorDBBench/db/cohere-1m-exp'), db_case_config=ZvecHNSWIndexConfig(metric_type=None, M=15, ef_construction=500, ef_search=180, quantize_type='int8', is_using_refiner=False, enable_rotate=True), case_config=CaseConfig(case_id=, custom_case={}, k=100, concurrency_search_config=ConcurrencySearchConfig(num_concurrency=[16], concurrency_duration=30, concurrency_timeout=3600)), stages=['search_serial', 'search_concurrent'], load_concurrency=0) + (cli.py:659) +2026-06-09 11:34:55,964 | INFO |generated uuid for the tasks: 38274539699b459baa5d743642157fef (interface.py:73) +2026-06-09 11:34:55,992 | INFO | DB | CaseType Dataset Filter | task_label (task_runner.py:411) +2026-06-09 11:34:55,992 | INFO | ----------- | ------------ -------------------- ------- | ------- (task_runner.py:411) +2026-06-09 11:34:55,992 | INFO | Zvec-16c64g-v0.1 | Performance Cohere-MEDIUM-1M 0.0 | 38274539699b459baa5d743642157fef (task_runner.py:411) +2026-06-09 11:34:55,992 | INFO |task submitted: id=38274539699b459baa5d743642157fef, 38274539699b459baa5d743642157fef, case number: 1 (interface.py:248) +2026-06-09 11:34:56,541 | INFO |[1/1] start case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, drop_old=False (interface.py:178) +2026-06-09 11:34:56,541 | INFO |Starting run 
(task_runner.py:149) +2026-06-09 11:34:56,569 | INFO |Search config: {} (zvec.py:58) +2026-06-09 11:34:57,889 | INFO |Read the entire file into memory: test.parquet (dataset.py:396) +2026-06-09 11:34:57,926 | INFO |Read the entire file into memory: neighbors.parquet (dataset.py:396) +2026-06-09 11:34:57,971 | INFO |Start performance case (task_runner.py:194) +2026-06-09 11:34:58,522 | INFO |Start search 30s in concurrency 16, filters: type= filter_rate=0.0 gt_file_name='neighbors.parquet' (mp_runner.py:129) +2026-06-09 11:35:08,555 | INFO |Syncing all process and start concurrency search, concurrency=16 (mp_runner.py:136) +2026-06-09 11:35:38,687 | INFO |SpawnProcess-1:5 search 30s: actual_dur=30.0502s, count=230, qps in this process: 7.6539 (mp_runner.py:101) +2026-06-09 11:35:38,687 | INFO |SpawnProcess-1:8 search 30s: actual_dur=30.0507s, count=231, qps in this process: 7.687 (mp_runner.py:101) +2026-06-09 11:35:38,688 | INFO |SpawnProcess-1:14 search 30s: actual_dur=30.0513s, count=230, qps in this process: 7.6536 (mp_runner.py:101) +2026-06-09 11:35:38,688 | INFO |SpawnProcess-1:13 search 30s: actual_dur=30.0499s, count=232, qps in this process: 7.7205 (mp_runner.py:101) +2026-06-09 11:35:38,688 | INFO |SpawnProcess-1:6 search 30s: actual_dur=30.0426s, count=230, qps in this process: 7.6558 (mp_runner.py:101) +2026-06-09 11:35:38,689 | INFO |SpawnProcess-1:11 search 30s: actual_dur=30.043s, count=230, qps in this process: 7.6557 (mp_runner.py:101) +2026-06-09 11:35:38,689 | INFO |SpawnProcess-1:4 search 30s: actual_dur=30.0523s, count=230, qps in this process: 7.6533 (mp_runner.py:101) +2026-06-09 11:35:38,689 | INFO |SpawnProcess-1:3 search 30s: actual_dur=30.0565s, count=230, qps in this process: 7.6523 (mp_runner.py:101) +2026-06-09 11:35:38,689 | INFO |SpawnProcess-1:2 search 30s: actual_dur=30.063s, count=231, qps in this process: 7.6839 (mp_runner.py:101) +2026-06-09 11:35:38,689 | INFO |SpawnProcess-1:12 search 30s: actual_dur=30.0617s, count=230, qps 
in this process: 7.6509 (mp_runner.py:101) +2026-06-09 11:35:38,689 | INFO |SpawnProcess-1:16 search 30s: actual_dur=30.048s, count=232, qps in this process: 7.721 (mp_runner.py:101) +2026-06-09 11:35:38,690 | INFO |SpawnProcess-1:7 search 30s: actual_dur=30.052s, count=232, qps in this process: 7.72 (mp_runner.py:101) +2026-06-09 11:35:38,690 | INFO |SpawnProcess-1:9 search 30s: actual_dur=30.0589s, count=232, qps in this process: 7.7182 (mp_runner.py:101) +2026-06-09 11:35:38,690 | INFO |SpawnProcess-1:10 search 30s: actual_dur=30.051s, count=230, qps in this process: 7.6537 (mp_runner.py:101) +2026-06-09 11:35:38,690 | INFO |SpawnProcess-1:15 search 30s: actual_dur=30.0517s, count=230, qps in this process: 7.6535 (mp_runner.py:101) +2026-06-09 11:35:38,690 | INFO |SpawnProcess-1:17 search 30s: actual_dur=30.064s, count=231, qps in this process: 7.6836 (mp_runner.py:101) +2026-06-09 11:35:38,691 | INFO |End search in concurrency 16: dur=30.135677246842533s, total_count=3691, qps=122.4794 (mp_runner.py:152) +2026-06-09 11:35:39,052 | INFO |Update largest qps with concurrency 16: current max_qps=122.4794 (mp_runner.py:156) +2026-06-09 11:35:39,052 | INFO |SpawnProcess-1 start serial search (serial_runner.py:217) +2026-06-09 11:35:39,687 | INFO |SpawnProcess-1:18 start search the entire test_data to get recall and latency (serial_runner.py:158) +2026-06-09 11:37:03,768 | INFO |SpawnProcess-1:18 search entire test_data: cost=83.7335s, queries=1000, avg_recall=0.9695, avg_ndcg=0.9779, avg_latency=0.0837, p99=0.0902, p95=0.0849 (serial_runner.py:198) +2026-06-09 11:37:03,890 | INFO |Performance case got result: Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=122.4794, serial_latency_p99=np.float64(0.0902), serial_latency_p95=np.float64(0.0849), recall=np.float64(0.9695), ndcg=np.float64(0.9779), conc_num_list=[16], conc_qps_list=[122.4794], conc_latency_p99_list=[np.float64(0.1534800931811332)], 
conc_latency_p95_list=[np.float64(0.1342107669916004)], conc_latency_avg_list=[np.float64(0.13012447999361362)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]) (task_runner.py:232) +2026-06-09 11:37:03,890 | INFO |[1/1] finish case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, result=Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=122.4794, serial_latency_p99=np.float64(0.0902), serial_latency_p95=np.float64(0.0849), recall=np.float64(0.9695), ndcg=np.float64(0.9779), conc_num_list=[16], conc_qps_list=[122.4794], conc_latency_p99_list=[np.float64(0.1534800931811332)], conc_latency_p95_list=[np.float64(0.1342107669916004)], conc_latency_avg_list=[np.float64(0.13012447999361362)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]), label=ResultLabel.NORMAL (interface.py:180) +2026-06-09 11:37:03,891 | INFO |Task summary: run_id=38274, task_label=38274539699b459baa5d743642157fef (models.py:478) +2026-06-09 11:37:03,891 | INFO |DB | db_label case label | load_dur qps latency(p99) latency(p95) recall max_load_count | label (models.py:478) +2026-06-09 11:37:03,891 | INFO |---- | -------------- 
--------------------------------------------- -------------------------------- | ----------- ----------- --------------- --------------- ------------- -------------- | ----- (models.py:478) +2026-06-09 11:37:03,891 | INFO |Zvec | 16c64g-v0.1 Search Performance Test (1M Dataset, 768 Dim) 38274539699b459baa5d743642157fef | 0.0 122.4794 0.0902 0.0849 0.9695 0 | :) (models.py:478) +2026-06-09 11:37:03,891 | INFO |write results to disk /root/code/VectorDBBench/vectordb_bench/results/Zvec/result_20260609_38274539699b459baa5d743642157fef_zvec.json (models.py:315) +2026-06-09 11:37:03,891 | INFO |Success to finish task: label=38274539699b459baa5d743642157fef, run_id=38274539699b459baa5d743642157fef (interface.py:219) +2026-06-09 13:30:19,806 | INFO |Task: +TaskConfig(db=, db_config=ZvecConfig(db_label='16c64g-v0.1', version='', note='', path='/root/code/VectorDBBench/db/cohere-1m-exp'), db_case_config=ZvecHNSWIndexConfig(metric_type=None, M=15, ef_construction=500, ef_search=180, quantize_type='int8', is_using_refiner=False, enable_rotate=True), case_config=CaseConfig(case_id=, custom_case={}, k=100, concurrency_search_config=ConcurrencySearchConfig(num_concurrency=[16], concurrency_duration=30, concurrency_timeout=3600)), stages=['search_serial', 'search_concurrent'], load_concurrency=0) + (cli.py:659) +2026-06-09 13:30:19,806 | INFO |generated uuid for the tasks: d98092fa3b88400ab07ede7482192284 (interface.py:73) +2026-06-09 13:30:19,834 | INFO | DB | CaseType Dataset Filter | task_label (task_runner.py:411) +2026-06-09 13:30:19,834 | INFO | ----------- | ------------ -------------------- ------- | ------- (task_runner.py:411) +2026-06-09 13:30:19,834 | INFO | Zvec-16c64g-v0.1 | Performance Cohere-MEDIUM-1M 0.0 | d98092fa3b88400ab07ede7482192284 (task_runner.py:411) +2026-06-09 13:30:19,834 | INFO |task submitted: id=d98092fa3b88400ab07ede7482192284, d98092fa3b88400ab07ede7482192284, case number: 1 (interface.py:248) +2026-06-09 13:30:20,378 | INFO |[1/1] start case: 
{'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, drop_old=False (interface.py:178) +2026-06-09 13:30:20,379 | INFO |Starting run (task_runner.py:149) +2026-06-09 13:30:20,406 | INFO |Search config: {} (zvec.py:58) +2026-06-09 13:30:21,725 | INFO |Read the entire file into memory: test.parquet (dataset.py:396) +2026-06-09 13:30:21,762 | INFO |Read the entire file into memory: neighbors.parquet (dataset.py:396) +2026-06-09 13:30:21,809 | INFO |Start performance case (task_runner.py:194) +2026-06-09 13:30:22,370 | INFO |Start search 30s in concurrency 16, filters: type= filter_rate=0.0 gt_file_name='neighbors.parquet' (mp_runner.py:129) +2026-06-09 13:30:32,408 | INFO |Syncing all process and start concurrency search, concurrency=16 (mp_runner.py:136) +2026-06-09 13:31:02,526 | INFO |SpawnProcess-1:17 search 30s: actual_dur=30.0343s, count=233, qps in this process: 7.7578 (mp_runner.py:101) +2026-06-09 13:31:02,536 | INFO |SpawnProcess-1:11 search 30s: actual_dur=30.0289s, count=232, qps in this process: 7.7259 (mp_runner.py:101) +2026-06-09 13:31:02,536 | INFO |SpawnProcess-1:9 search 30s: actual_dur=30.0297s, count=233, qps in this process: 7.759 (mp_runner.py:101) +2026-06-09 13:31:02,536 | INFO |SpawnProcess-1:16 search 30s: actual_dur=30.0532s, count=232, qps in this process: 7.7196 (mp_runner.py:101) +2026-06-09 13:31:02,543 | INFO |SpawnProcess-1:6 search 30s: actual_dur=30.0533s, count=231, qps in this process: 7.6863 (mp_runner.py:101) +2026-06-09 13:31:02,543 | INFO |SpawnProcess-1:3 search 30s: actual_dur=30.0535s, count=231, qps in this process: 7.6863 (mp_runner.py:101) +2026-06-09 13:31:02,561 | INFO |SpawnProcess-1:14 search 30s: actual_dur=30.024s, count=232, qps in this process: 7.7272 (mp_runner.py:101) +2026-06-09 13:31:02,563 | INFO |SpawnProcess-1:10 search 30s: actual_dur=30.0738s, count=233, qps in this process: 
7.7476 (mp_runner.py:101) +2026-06-09 13:31:02,572 | INFO |SpawnProcess-1:13 search 30s: actual_dur=30.0603s, count=233, qps in this process: 7.7511 (mp_runner.py:101) +2026-06-09 13:31:02,572 | INFO |SpawnProcess-1:5 search 30s: actual_dur=30.0773s, count=234, qps in this process: 7.78 (mp_runner.py:101) +2026-06-09 13:31:02,573 | INFO |SpawnProcess-1:2 search 30s: actual_dur=30.0848s, count=233, qps in this process: 7.7448 (mp_runner.py:101) +2026-06-09 13:31:02,573 | INFO |SpawnProcess-1:4 search 30s: actual_dur=30.0807s, count=233, qps in this process: 7.7458 (mp_runner.py:101) +2026-06-09 13:31:02,574 | INFO |SpawnProcess-1:8 search 30s: actual_dur=30.0949s, count=233, qps in this process: 7.7422 (mp_runner.py:101) +2026-06-09 13:31:02,574 | INFO |SpawnProcess-1:15 search 30s: actual_dur=30.0926s, count=233, qps in this process: 7.7428 (mp_runner.py:101) +2026-06-09 13:31:02,643 | INFO |SpawnProcess-1:7 search 30s: actual_dur=30.1135s, count=232, qps in this process: 7.7042 (mp_runner.py:101) +2026-06-09 13:31:02,648 | INFO |SpawnProcess-1:12 search 30s: actual_dur=30.1185s, count=232, qps in this process: 7.7029 (mp_runner.py:101) +2026-06-09 13:31:02,650 | INFO |End search in concurrency 16: dur=30.241203671321273s, total_count=3720, qps=123.011 (mp_runner.py:152) +2026-06-09 13:31:02,998 | INFO |Update largest qps with concurrency 16: current max_qps=123.011 (mp_runner.py:156) +2026-06-09 13:31:02,998 | INFO |SpawnProcess-1 start serial search (serial_runner.py:217) +2026-06-09 13:31:03,635 | INFO |SpawnProcess-1:18 start search the entire test_data to get recall and latency (serial_runner.py:158) +2026-06-09 13:32:27,471 | INFO |SpawnProcess-1:18 search entire test_data: cost=83.4878s, queries=1000, avg_recall=0.9695, avg_ndcg=0.9779, avg_latency=0.0835, p99=0.085, p95=0.0845 (serial_runner.py:198) +2026-06-09 13:32:27,596 | INFO |Performance case got result: Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, 
qps=123.011, serial_latency_p99=np.float64(0.085), serial_latency_p95=np.float64(0.0845), recall=np.float64(0.9695), ndcg=np.float64(0.9779), conc_num_list=[16], conc_qps_list=[123.011], conc_latency_p99_list=[np.float64(0.14689669567625965)], conc_latency_p95_list=[np.float64(0.13342893943190576)], conc_latency_avg_list=[np.float64(0.1292198499749785)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]) (task_runner.py:232) +2026-06-09 13:32:27,596 | INFO |[1/1] finish case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, result=Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=123.011, serial_latency_p99=np.float64(0.085), serial_latency_p95=np.float64(0.0845), recall=np.float64(0.9695), ndcg=np.float64(0.9779), conc_num_list=[16], conc_qps_list=[123.011], conc_latency_p99_list=[np.float64(0.14689669567625965)], conc_latency_p95_list=[np.float64(0.13342893943190576)], conc_latency_avg_list=[np.float64(0.1292198499749785)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]), label=ResultLabel.NORMAL (interface.py:180) +2026-06-09 13:32:27,597 | INFO |Task summary: run_id=d9809, task_label=d98092fa3b88400ab07ede7482192284 (models.py:478) 
+2026-06-09 13:32:27,597 | INFO |DB | db_label case label | load_dur qps latency(p99) latency(p95) recall max_load_count | label (models.py:478) +2026-06-09 13:32:27,597 | INFO |---- | -------------- --------------------------------------------- -------------------------------- | ----------- ---------- --------------- --------------- ------------- -------------- | ----- (models.py:478) +2026-06-09 13:32:27,597 | INFO |Zvec | 16c64g-v0.1 Search Performance Test (1M Dataset, 768 Dim) d98092fa3b88400ab07ede7482192284 | 0.0 123.011 0.085 0.0845 0.9695 0 | :) (models.py:478) +2026-06-09 13:32:27,597 | INFO |write results to disk /root/code/VectorDBBench/vectordb_bench/results/Zvec/result_20260609_d98092fa3b88400ab07ede7482192284_zvec.json (models.py:315) +2026-06-09 13:32:27,597 | INFO |Success to finish task: label=d98092fa3b88400ab07ede7482192284, run_id=d98092fa3b88400ab07ede7482192284 (interface.py:219) +2026-06-09 14:44:02,774 | INFO |Task: +TaskConfig(db=, db_config=ZvecConfig(db_label='16c64g-v0.1', version='', note='', path='/root/code/VectorDBBench/db/cohere-1m-exp'), db_case_config=ZvecHNSWIndexConfig(metric_type=None, M=15, ef_construction=500, ef_search=180, quantize_type='int8', is_using_refiner=False, enable_rotate=True), case_config=CaseConfig(case_id=, custom_case={}, k=100, concurrency_search_config=ConcurrencySearchConfig(num_concurrency=[16], concurrency_duration=30, concurrency_timeout=3600)), stages=['drop_old', 'load', 'search_serial', 'search_concurrent'], load_concurrency=0) + (cli.py:659) +2026-06-09 14:44:02,774 | INFO |generated uuid for the tasks: 602b05d4b2b04d68ac77fa1311ecb8a1 (interface.py:73) +2026-06-09 14:44:02,802 | INFO | DB | CaseType Dataset Filter | task_label (task_runner.py:411) +2026-06-09 14:44:02,802 | INFO | ----------- | ------------ -------------------- ------- | ------- (task_runner.py:411) +2026-06-09 14:44:02,802 | INFO | Zvec-16c64g-v0.1 | Performance Cohere-MEDIUM-1M 0.0 | 602b05d4b2b04d68ac77fa1311ecb8a1 
(task_runner.py:411) +2026-06-09 14:44:02,802 | INFO |task submitted: id=602b05d4b2b04d68ac77fa1311ecb8a1, 602b05d4b2b04d68ac77fa1311ecb8a1, case number: 1 (interface.py:248) +2026-06-09 14:44:03,344 | INFO |[1/1] start case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, drop_old=True (interface.py:178) +2026-06-09 14:44:03,345 | INFO |Starting run (task_runner.py:149) +2026-06-09 14:44:03,371 | INFO |Search config: {} (zvec.py:58) +2026-06-09 14:44:05,173 | INFO |Read the entire file into memory: test.parquet (dataset.py:396) +2026-06-09 14:44:05,210 | INFO |Read the entire file into memory: neighbors.parquet (dataset.py:396) +2026-06-09 14:44:05,259 | INFO |Start performance case (task_runner.py:194) +2026-06-09 14:44:05,911 | INFO |(SpawnProcess-1:1) Start concurrent insert, batch_size=100, max_workers=4 (concurrent_runner.py:187) +2026-06-09 14:44:05,911 | INFO |Get iterator for shuffle_train.parquet (dataset.py:428) +2026-06-09 14:44:33,099 | INFO |(SpawnProcess-1:1) Finish concurrent insert, count=1000000, dur=27.19s (concurrent_runner.py:208) +2026-06-09 14:46:29,510 | INFO |Finish loading the entire dataset into VectorDB, insert_duration=33.62117011426017, optimize_duration=109.80431694490835 load_duration(insert + optimize) = 143.4255 (task_runner.py:204) +2026-06-09 14:46:30,070 | INFO |Start search 30s in concurrency 16, filters: type= filter_rate=0.0 gt_file_name='neighbors.parquet' (mp_runner.py:129) +2026-06-09 14:46:40,094 | INFO |Syncing all process and start concurrency search, concurrency=16 (mp_runner.py:136) +2026-06-09 14:47:10,175 | INFO |SpawnProcess-1:17 search 30s: actual_dur=30.0219s, count=25457, qps in this process: 847.9477 (mp_runner.py:101) +2026-06-09 14:47:10,175 | INFO |SpawnProcess-1:19 search 30s: actual_dur=30.0268s, count=26710, qps in this process: 889.5387 (mp_runner.py:101) +2026-06-09 
14:47:10,176 | INFO |SpawnProcess-1:18 search 30s: actual_dur=30.0279s, count=26538, qps in this process: 883.7781 (mp_runner.py:101) +2026-06-09 14:47:10,176 | INFO |SpawnProcess-1:12 search 30s: actual_dur=30.0301s, count=26581, qps in this process: 885.1452 (mp_runner.py:101) +2026-06-09 14:47:10,176 | INFO |SpawnProcess-1:15 search 30s: actual_dur=30.0277s, count=26796, qps in this process: 892.376 (mp_runner.py:101) +2026-06-09 14:47:10,176 | INFO |SpawnProcess-1:14 search 30s: actual_dur=30.022s, count=26131, qps in this process: 870.395 (mp_runner.py:101) +2026-06-09 14:47:10,176 | INFO |SpawnProcess-1:5 search 30s: actual_dur=30.0192s, count=26042, qps in this process: 867.5115 (mp_runner.py:101) +2026-06-09 14:47:10,175 | INFO |SpawnProcess-1:11 search 30s: actual_dur=30.0187s, count=26256, qps in this process: 874.6548 (mp_runner.py:101) +2026-06-09 14:47:10,176 | INFO |SpawnProcess-1:4 search 30s: actual_dur=30.0303s, count=26661, qps in this process: 887.8033 (mp_runner.py:101) +2026-06-09 14:47:10,177 | INFO |SpawnProcess-1:10 search 30s: actual_dur=30.0208s, count=26373, qps in this process: 878.4909 (mp_runner.py:101) +2026-06-09 14:47:10,176 | INFO |SpawnProcess-1:6 search 30s: actual_dur=30.0265s, count=26415, qps in this process: 879.7229 (mp_runner.py:101) +2026-06-09 14:47:10,177 | INFO |SpawnProcess-1:8 search 30s: actual_dur=30.0203s, count=26138, qps in this process: 870.6775 (mp_runner.py:101) +2026-06-09 14:47:10,178 | INFO |SpawnProcess-1:13 search 30s: actual_dur=30.0185s, count=26339, qps in this process: 877.4256 (mp_runner.py:101) +2026-06-09 14:47:10,179 | INFO |SpawnProcess-1:7 search 30s: actual_dur=30.0158s, count=26613, qps in this process: 886.633 (mp_runner.py:101) +2026-06-09 14:47:10,181 | INFO |SpawnProcess-1:9 search 30s: actual_dur=30.0149s, count=26753, qps in this process: 891.324 (mp_runner.py:101) +2026-06-09 14:47:10,187 | INFO |SpawnProcess-1:16 search 30s: actual_dur=30.0168s, count=25916, qps in this process: 
863.3832 (mp_runner.py:101) +2026-06-09 14:47:10,241 | INFO |End search in concurrency 16: dur=30.14720565499738s, total_count=421719, qps=13988.6597 (mp_runner.py:152) +2026-06-09 14:47:10,587 | INFO |Update largest qps with concurrency 16: current max_qps=13988.6597 (mp_runner.py:156) +2026-06-09 14:47:10,591 | INFO |SpawnProcess-1 start serial search (serial_runner.py:217) +2026-06-09 14:47:11,232 | INFO |SpawnProcess-1:20 start search the entire test_data to get recall and latency (serial_runner.py:158) +2026-06-09 14:47:12,265 | INFO |SpawnProcess-1:20 search entire test_data: cost=0.7515s, queries=1000, avg_recall=0.9397, avg_ndcg=0.9491, avg_latency=0.0008, p99=0.0014, p95=0.0008 (serial_runner.py:198) +2026-06-09 14:47:12,389 | INFO |Performance case got result: Metric(max_load_count=0, insert_duration=33.6212, optimize_duration=109.8043, load_duration=143.4255, qps=13988.6597, serial_latency_p99=np.float64(0.0014), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9397), ndcg=np.float64(0.9491), conc_num_list=[16], conc_qps_list=[13988.6597], conc_latency_p99_list=[np.float64(0.0022724022064358014)], conc_latency_p95_list=[np.float64(0.001238895207643509)], conc_latency_avg_list=[np.float64(0.001135618022694545)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]) (task_runner.py:232) +2026-06-09 14:47:12,390 | INFO |[1/1] finish case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, result=Metric(max_load_count=0, insert_duration=33.6212, optimize_duration=109.8043, load_duration=143.4255, 
qps=13988.6597, serial_latency_p99=np.float64(0.0014), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9397), ndcg=np.float64(0.9491), conc_num_list=[16], conc_qps_list=[13988.6597], conc_latency_p99_list=[np.float64(0.0022724022064358014)], conc_latency_p95_list=[np.float64(0.001238895207643509)], conc_latency_avg_list=[np.float64(0.001135618022694545)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]), label=ResultLabel.NORMAL (interface.py:180) +2026-06-09 14:47:12,390 | INFO |Task summary: run_id=602b0, task_label=602b05d4b2b04d68ac77fa1311ecb8a1 (models.py:478) +2026-06-09 14:47:12,390 | INFO |DB | db_label case label | load_dur qps latency(p99) latency(p95) recall max_load_count | label (models.py:478) +2026-06-09 14:47:12,390 | INFO |---- | -------------- --------------------------------------------- -------------------------------- | ----------- ------------- --------------- --------------- ------------- -------------- | ----- (models.py:478) +2026-06-09 14:47:12,390 | INFO |Zvec | 16c64g-v0.1 Search Performance Test (1M Dataset, 768 Dim) 602b05d4b2b04d68ac77fa1311ecb8a1 | 143.4255 13988.6597 0.0014 0.0008 0.9397 0 | :) (models.py:478) +2026-06-09 14:47:12,390 | INFO |write results to disk /root/code/VectorDBBench/vectordb_bench/results/Zvec/result_20260609_602b05d4b2b04d68ac77fa1311ecb8a1_zvec.json (models.py:315) +2026-06-09 14:47:12,390 | INFO |Success to finish task: label=602b05d4b2b04d68ac77fa1311ecb8a1, run_id=602b05d4b2b04d68ac77fa1311ecb8a1 (interface.py:219) +2026-06-09 14:51:42,424 | INFO |Task: +TaskConfig(db=, db_config=ZvecConfig(db_label='16c64g-v0.1', version='', note='', 
path='/root/code/VectorDBBench/db/cohere-1m'), db_case_config=ZvecHNSWIndexConfig(metric_type=None, M=15, ef_construction=500, ef_search=180, quantize_type='int8', is_using_refiner=False, enable_rotate=False), case_config=CaseConfig(case_id=, custom_case={}, k=100, concurrency_search_config=ConcurrencySearchConfig(num_concurrency=[16], concurrency_duration=30, concurrency_timeout=3600)), stages=['search_serial', 'search_concurrent'], load_concurrency=0) + (cli.py:659) +2026-06-09 14:51:42,424 | INFO |generated uuid for the tasks: d802e43419c4461c97e75a0aacd207cb (interface.py:73) +2026-06-09 14:51:42,452 | INFO | DB | CaseType Dataset Filter | task_label (task_runner.py:411) +2026-06-09 14:51:42,452 | INFO | ----------- | ------------ -------------------- ------- | ------- (task_runner.py:411) +2026-06-09 14:51:42,452 | INFO | Zvec-16c64g-v0.1 | Performance Cohere-MEDIUM-1M 0.0 | d802e43419c4461c97e75a0aacd207cb (task_runner.py:411) +2026-06-09 14:51:42,452 | INFO |task submitted: id=d802e43419c4461c97e75a0aacd207cb, d802e43419c4461c97e75a0aacd207cb, case number: 1 (interface.py:248) +2026-06-09 14:51:43,001 | INFO |[1/1] start case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, drop_old=False (interface.py:178) +2026-06-09 14:51:43,001 | INFO |Starting run (task_runner.py:149) +2026-06-09 14:51:43,029 | INFO |Search config: {} (zvec.py:58) +2026-06-09 14:51:44,295 | INFO |Read the entire file into memory: test.parquet (dataset.py:396) +2026-06-09 14:51:44,332 | INFO |Read the entire file into memory: neighbors.parquet (dataset.py:396) +2026-06-09 14:51:44,381 | INFO |Start performance case (task_runner.py:194) +2026-06-09 14:51:44,928 | INFO |Start search 30s in concurrency 16, filters: type= filter_rate=0.0 gt_file_name='neighbors.parquet' (mp_runner.py:129) +2026-06-09 14:51:54,965 | INFO |Syncing all process and start 
concurrency search, concurrency=16 (mp_runner.py:136) +2026-06-09 14:52:25,060 | INFO |SpawnProcess-1:12 search 30s: actual_dur=30.0302s, count=26187, qps in this process: 872.0222 (mp_runner.py:101) +2026-06-09 14:52:25,060 | INFO |SpawnProcess-1:9 search 30s: actual_dur=30.0337s, count=26145, qps in this process: 870.5221 (mp_runner.py:101) +2026-06-09 14:52:25,060 | INFO |SpawnProcess-1:10 search 30s: actual_dur=30.0268s, count=27097, qps in this process: 902.4272 (mp_runner.py:101) +2026-06-09 14:52:25,060 | INFO |SpawnProcess-1:3 search 30s: actual_dur=30.0349s, count=26504, qps in this process: 882.4401 (mp_runner.py:101) +2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:13 search 30s: actual_dur=30.032s, count=26064, qps in this process: 867.8743 (mp_runner.py:101) +2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:11 search 30s: actual_dur=30.0363s, count=25922, qps in this process: 863.0224 (mp_runner.py:101) +2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:7 search 30s: actual_dur=30.039s, count=26177, qps in this process: 871.4338 (mp_runner.py:101) +2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:6 search 30s: actual_dur=30.0356s, count=26352, qps in this process: 877.3589 (mp_runner.py:101) +2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:14 search 30s: actual_dur=30.0397s, count=26370, qps in this process: 877.8383 (mp_runner.py:101) +2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:17 search 30s: actual_dur=30.0374s, count=26668, qps in this process: 887.8265 (mp_runner.py:101) +2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:5 search 30s: actual_dur=30.0367s, count=26174, qps in this process: 871.4007 (mp_runner.py:101) +2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:16 search 30s: actual_dur=30.041s, count=26194, qps in this process: 871.9417 (mp_runner.py:101) +2026-06-09 14:52:25,062 | INFO |SpawnProcess-1:8 search 30s: actual_dur=30.0352s, count=26298, qps in this process: 875.5727 (mp_runner.py:101) +2026-06-09 14:52:25,062 | INFO |SpawnProcess-1:4 search 
30s: actual_dur=30.035s, count=26270, qps in this process: 874.6462 (mp_runner.py:101) +2026-06-09 14:52:25,062 | INFO |SpawnProcess-1:2 search 30s: actual_dur=30.0416s, count=26479, qps in this process: 881.4111 (mp_runner.py:101) +2026-06-09 14:52:25,062 | INFO |SpawnProcess-1:15 search 30s: actual_dur=30.041s, count=26340, qps in this process: 876.8017 (mp_runner.py:101) +2026-06-09 14:52:25,126 | INFO |End search in concurrency 16: dur=30.160955973900855s, total_count=421241, qps=13966.434 (mp_runner.py:152) +2026-06-09 14:52:25,497 | INFO |Update largest qps with concurrency 16: current max_qps=13966.434 (mp_runner.py:156) +2026-06-09 14:52:25,500 | INFO |SpawnProcess-1 start serial search (serial_runner.py:217) +2026-06-09 14:52:26,132 | INFO |SpawnProcess-1:18 start search the entire test_data to get recall and latency (serial_runner.py:158) +2026-06-09 14:52:27,149 | INFO |SpawnProcess-1:18 search entire test_data: cost=0.7386s, queries=1000, avg_recall=0.9285, avg_ndcg=0.9405, avg_latency=0.0007, p99=0.0017, p95=0.0008 (serial_runner.py:198) +2026-06-09 14:52:27,269 | INFO |Performance case got result: Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=13966.434, serial_latency_p99=np.float64(0.0017), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9285), ndcg=np.float64(0.9405), conc_num_list=[16], conc_qps_list=[13966.434], conc_latency_p99_list=[np.float64(0.0023291470482945417)], conc_latency_p95_list=[np.float64(0.001239514909684658)], conc_latency_avg_list=[np.float64(0.0011369578863421963)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]) (task_runner.py:232) 
+2026-06-09 14:52:27,270 | INFO |[1/1] finish case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, result=Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=13966.434, serial_latency_p99=np.float64(0.0017), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9285), ndcg=np.float64(0.9405), conc_num_list=[16], conc_qps_list=[13966.434], conc_latency_p99_list=[np.float64(0.0023291470482945417)], conc_latency_p95_list=[np.float64(0.001239514909684658)], conc_latency_avg_list=[np.float64(0.0011369578863421963)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]), label=ResultLabel.NORMAL (interface.py:180) +2026-06-09 14:52:27,270 | INFO |Task summary: run_id=d802e, task_label=d802e43419c4461c97e75a0aacd207cb (models.py:478) +2026-06-09 14:52:27,270 | INFO |DB | db_label case label | load_dur qps latency(p99) latency(p95) recall max_load_count | label (models.py:478) +2026-06-09 14:52:27,270 | INFO |---- | -------------- --------------------------------------------- -------------------------------- | ----------- ------------ --------------- --------------- ------------- -------------- | ----- (models.py:478) +2026-06-09 14:52:27,270 | INFO |Zvec | 16c64g-v0.1 Search Performance Test (1M Dataset, 768 Dim) d802e43419c4461c97e75a0aacd207cb | 0.0 13966.434 0.0017 0.0008 0.9285 0 | :) (models.py:478) +2026-06-09 14:52:27,270 | INFO |write results to disk /root/code/VectorDBBench/vectordb_bench/results/Zvec/result_20260609_d802e43419c4461c97e75a0aacd207cb_zvec.json 
(models.py:315) +2026-06-09 14:52:27,270 | INFO |Success to finish task: label=d802e43419c4461c97e75a0aacd207cb, run_id=d802e43419c4461c97e75a0aacd207cb (interface.py:219) +2026-06-09 14:54:01,400 | INFO |Task: +TaskConfig(db=, db_config=ZvecConfig(db_label='16c64g-v0.1', version='', note='', path='/root/code/VectorDBBench/db/cohere-1m'), db_case_config=ZvecHNSWIndexConfig(metric_type=None, M=15, ef_construction=500, ef_search=180, quantize_type='int8', is_using_refiner=False, enable_rotate=False), case_config=CaseConfig(case_id=, custom_case={}, k=100, concurrency_search_config=ConcurrencySearchConfig(num_concurrency=[16], concurrency_duration=30, concurrency_timeout=3600)), stages=['search_serial', 'search_concurrent'], load_concurrency=0) + (cli.py:659) +2026-06-09 14:54:01,400 | INFO |generated uuid for the tasks: e6ff48f902df4da487e0b7a350dce2bb (interface.py:73) +2026-06-09 14:54:01,463 | INFO | DB | CaseType Dataset Filter | task_label (task_runner.py:411) +2026-06-09 14:54:01,463 | INFO | ----------- | ------------ -------------------- ------- | ------- (task_runner.py:411) +2026-06-09 14:54:01,463 | INFO | Zvec-16c64g-v0.1 | Performance Cohere-MEDIUM-1M 0.0 | e6ff48f902df4da487e0b7a350dce2bb (task_runner.py:411) +2026-06-09 14:54:01,463 | INFO |task submitted: id=e6ff48f902df4da487e0b7a350dce2bb, e6ff48f902df4da487e0b7a350dce2bb, case number: 1 (interface.py:248) +2026-06-09 14:54:02,023 | INFO |[1/1] start case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, drop_old=False (interface.py:178) +2026-06-09 14:54:02,023 | INFO |Starting run (task_runner.py:149) +2026-06-09 14:54:02,049 | INFO |Search config: {} (zvec.py:58) +2026-06-09 14:54:02,049 | WARNING |[1/1] case {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 
'db': 'Zvec-16c64g-v0.1'} failed to run, reason=__init__(): incompatible constructor arguments. The following argument types are supported: + 1. _zvec.param.HnswIndexParam(metric_type: _zvec.typing.MetricType = , m: typing.SupportsInt | typing.SupportsIndex = 50, ef_construction: typing.SupportsInt | typing.SupportsIndex = 500, quantize_type: _zvec.typing.QuantizeType = , use_contiguous_memory: bool = False) + +Invoked with: kwargs: metric_type=, m=15, ef_construction=500, quantize_type=, enable_rotate=False (interface.py:200) +2026-06-09 14:54:02,050 | INFO |Task summary: run_id=e6ff4, task_label=e6ff48f902df4da487e0b7a350dce2bb (models.py:478) +2026-06-09 14:54:02,050 | INFO |DB | db_label case label | load_dur qps latency(p99) latency(p95) recall max_load_count | label (models.py:478) +2026-06-09 14:54:02,050 | INFO |---- | -------------- --------------------------------------------- -------------------------------- | ----------- ---------- --------------- --------------- ------------- -------------- | ----- (models.py:478) +2026-06-09 14:54:02,050 | INFO |Zvec | 16c64g-v0.1 Search Performance Test (1M Dataset, 768 Dim) e6ff48f902df4da487e0b7a350dce2bb | 0.0 0.0 0.0 0.0 0.0 0 | x (models.py:478) +2026-06-09 14:54:02,050 | INFO |write results to disk /root/code/VectorDBBench/vectordb_bench/results/Zvec/result_20260609_e6ff48f902df4da487e0b7a350dce2bb_zvec.json (models.py:315) +2026-06-09 14:54:02,050 | INFO |Success to finish task: label=e6ff48f902df4da487e0b7a350dce2bb, run_id=e6ff48f902df4da487e0b7a350dce2bb (interface.py:219) +2026-06-09 14:56:41,670 | INFO |Task: +TaskConfig(db=, db_config=ZvecConfig(db_label='16c64g-v0.1', version='', note='', path='/root/code/VectorDBBench/db/cohere-1m'), db_case_config=ZvecHNSWIndexConfig(metric_type=None, M=15, ef_construction=500, ef_search=180, quantize_type='int8', is_using_refiner=False, enable_rotate=False), case_config=CaseConfig(case_id=, custom_case={}, k=100, 
concurrency_search_config=ConcurrencySearchConfig(num_concurrency=[16], concurrency_duration=30, concurrency_timeout=3600)), stages=['search_serial', 'search_concurrent'], load_concurrency=0) + (cli.py:659) +2026-06-09 14:56:41,670 | INFO |generated uuid for the tasks: d15c6088018a44188967789891ac1acf (interface.py:73) +2026-06-09 14:56:41,699 | INFO | DB | CaseType Dataset Filter | task_label (task_runner.py:411) +2026-06-09 14:56:41,699 | INFO | ----------- | ------------ -------------------- ------- | ------- (task_runner.py:411) +2026-06-09 14:56:41,699 | INFO | Zvec-16c64g-v0.1 | Performance Cohere-MEDIUM-1M 0.0 | d15c6088018a44188967789891ac1acf (task_runner.py:411) +2026-06-09 14:56:41,699 | INFO |task submitted: id=d15c6088018a44188967789891ac1acf, d15c6088018a44188967789891ac1acf, case number: 1 (interface.py:248) +2026-06-09 14:56:42,244 | INFO |[1/1] start case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, drop_old=False (interface.py:178) +2026-06-09 14:56:42,244 | INFO |Starting run (task_runner.py:149) +2026-06-09 14:56:42,271 | INFO |Search config: {} (zvec.py:58) +2026-06-09 14:56:43,535 | INFO |Read the entire file into memory: test.parquet (dataset.py:396) +2026-06-09 14:56:43,571 | INFO |Read the entire file into memory: neighbors.parquet (dataset.py:396) +2026-06-09 14:56:43,620 | INFO |Start performance case (task_runner.py:194) +2026-06-09 14:56:44,166 | INFO |Start search 30s in concurrency 16, filters: type= filter_rate=0.0 gt_file_name='neighbors.parquet' (mp_runner.py:129) +2026-06-09 14:56:54,201 | INFO |Syncing all process and start concurrency search, concurrency=16 (mp_runner.py:136) +2026-06-09 14:57:24,290 | INFO |SpawnProcess-1:17 search 30s: actual_dur=30.0277s, count=26591, qps in this process: 885.549 (mp_runner.py:101) +2026-06-09 14:57:24,290 | INFO |SpawnProcess-1:16 search 30s: 
actual_dur=30.0308s, count=26392, qps in this process: 878.8311 (mp_runner.py:101) +2026-06-09 14:57:24,290 | INFO |SpawnProcess-1:15 search 30s: actual_dur=30.0376s, count=26429, qps in this process: 879.8639 (mp_runner.py:101) +2026-06-09 14:57:24,290 | INFO |SpawnProcess-1:13 search 30s: actual_dur=30.0259s, count=26041, qps in this process: 867.2846 (mp_runner.py:101) +2026-06-09 14:57:24,290 | INFO |SpawnProcess-1:10 search 30s: actual_dur=30.0327s, count=26277, qps in this process: 874.9463 (mp_runner.py:101) +2026-06-09 14:57:24,290 | INFO |SpawnProcess-1:12 search 30s: actual_dur=30.0355s, count=26648, qps in this process: 887.2168 (mp_runner.py:101) +2026-06-09 14:57:24,290 | INFO |SpawnProcess-1:7 search 30s: actual_dur=30.0227s, count=26570, qps in this process: 884.997 (mp_runner.py:101) +2026-06-09 14:57:24,291 | INFO |SpawnProcess-1:11 search 30s: actual_dur=30.0203s, count=26348, qps in this process: 877.6728 (mp_runner.py:101) +2026-06-09 14:57:24,291 | INFO |SpawnProcess-1:5 search 30s: actual_dur=30.0397s, count=26510, qps in this process: 882.4988 (mp_runner.py:101) +2026-06-09 14:57:24,291 | INFO |SpawnProcess-1:3 search 30s: actual_dur=30.0361s, count=26048, qps in this process: 867.2231 (mp_runner.py:101) +2026-06-09 14:57:24,291 | INFO |SpawnProcess-1:4 search 30s: actual_dur=30.0223s, count=26209, qps in this process: 872.9844 (mp_runner.py:101) +2026-06-09 14:57:24,291 | INFO |SpawnProcess-1:6 search 30s: actual_dur=30.0202s, count=25908, qps in this process: 863.0189 (mp_runner.py:101) +2026-06-09 14:57:24,291 | INFO |SpawnProcess-1:9 search 30s: actual_dur=30.0184s, count=25627, qps in this process: 853.7097 (mp_runner.py:101) +2026-06-09 14:57:24,293 | INFO |SpawnProcess-1:14 search 30s: actual_dur=30.0199s, count=26950, qps in this process: 897.7378 (mp_runner.py:101) +2026-06-09 14:57:24,293 | INFO |SpawnProcess-1:2 search 30s: actual_dur=30.0197s, count=26408, qps in this process: 879.689 (mp_runner.py:101) +2026-06-09 14:57:24,302 | 
INFO |SpawnProcess-1:8 search 30s: actual_dur=30.0233s, count=27081, qps in this process: 901.9994 (mp_runner.py:101) +2026-06-09 14:57:24,355 | INFO |End search in concurrency 16: dur=30.154128178954124s, total_count=422037, qps=13995.9941 (mp_runner.py:152) +2026-06-09 14:57:24,752 | INFO |Update largest qps with concurrency 16: current max_qps=13995.9941 (mp_runner.py:156) +2026-06-09 14:57:24,755 | INFO |SpawnProcess-1 start serial search (serial_runner.py:217) +2026-06-09 14:57:25,398 | INFO |SpawnProcess-1:18 start search the entire test_data to get recall and latency (serial_runner.py:158) +2026-06-09 14:57:26,433 | INFO |SpawnProcess-1:18 search entire test_data: cost=0.7526s, queries=1000, avg_recall=0.9285, avg_ndcg=0.9405, avg_latency=0.0008, p99=0.0018, p95=0.0008 (serial_runner.py:198) +2026-06-09 14:57:26,557 | INFO |Performance case got result: Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=13995.9941, serial_latency_p99=np.float64(0.0018), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9285), ndcg=np.float64(0.9405), conc_num_list=[16], conc_qps_list=[13995.9941], conc_latency_p99_list=[np.float64(0.0022443212196230893)], conc_latency_p95_list=[np.float64(0.0012407672591507434)], conc_latency_avg_list=[np.float64(0.0011347746009090012)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]) (task_runner.py:232) +2026-06-09 14:57:26,557 | INFO |[1/1] finish case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, 
result=Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=13995.9941, serial_latency_p99=np.float64(0.0018), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9285), ndcg=np.float64(0.9405), conc_num_list=[16], conc_qps_list=[13995.9941], conc_latency_p99_list=[np.float64(0.0022443212196230893)], conc_latency_p95_list=[np.float64(0.0012407672591507434)], conc_latency_avg_list=[np.float64(0.0011347746009090012)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]), label=ResultLabel.NORMAL (interface.py:180) +2026-06-09 14:57:26,557 | INFO |Task summary: run_id=d15c6, task_label=d15c6088018a44188967789891ac1acf (models.py:478) +2026-06-09 14:57:26,557 | INFO |DB | db_label case label | load_dur qps latency(p99) latency(p95) recall max_load_count | label (models.py:478) +2026-06-09 14:57:26,557 | INFO |---- | -------------- --------------------------------------------- -------------------------------- | ----------- ------------- --------------- --------------- ------------- -------------- | ----- (models.py:478) +2026-06-09 14:57:26,557 | INFO |Zvec | 16c64g-v0.1 Search Performance Test (1M Dataset, 768 Dim) d15c6088018a44188967789891ac1acf | 0.0 13995.9941 0.0018 0.0008 0.9285 0 | :) (models.py:478) +2026-06-09 14:57:26,558 | INFO |write results to disk /root/code/VectorDBBench/vectordb_bench/results/Zvec/result_20260609_d15c6088018a44188967789891ac1acf_zvec.json (models.py:315) +2026-06-09 14:57:26,558 | INFO |Success to finish task: label=d15c6088018a44188967789891ac1acf, run_id=d15c6088018a44188967789891ac1acf (interface.py:219) +2026-06-09 14:59:33,023 | INFO |Task: +TaskConfig(db=, 
db_config=ZvecConfig(db_label='16c64g-v0.1', version='', note='', path='/root/code/VectorDBBench/db/cohere-1m-cos2l2'), db_case_config=ZvecHNSWIndexConfig(metric_type=None, M=15, ef_construction=500, ef_search=180, quantize_type='int8', is_using_refiner=False, enable_rotate=False), case_config=CaseConfig(case_id=, custom_case={'name': 'Cohere1M-cos2l2', 'description': 'This is a customized dataset.', 'load_timeout': 36000, 'optimize_timeout': 36000, 'dataset_config': {'name': 'Cohere1M-cos2l2', 'dir': '/root/code/VectorDBBench/datasets/cohere-1m-cos2l2', 'size': '1000000', 'dim': '768', 'metric_type': 'L2', 'file_count': '1', 'use_shuffled': False, 'with_gt': True}}, k=100, concurrency_search_config=ConcurrencySearchConfig(num_concurrency=[16], concurrency_duration=30, concurrency_timeout=3600)), stages=['search_serial', 'search_concurrent'], load_concurrency=0) + (cli.py:659) +2026-06-09 14:59:33,023 | INFO |generated uuid for the tasks: 4cc4609939544e4ba8d162ec00835a51 (interface.py:73) +2026-06-09 14:59:33,051 | INFO | DB | CaseType Dataset Filter | task_label (task_runner.py:411) +2026-06-09 14:59:33,051 | INFO | ----------- | ------------ -------------------- ------- | ------- (task_runner.py:411) +2026-06-09 14:59:33,051 | INFO | Zvec-16c64g-v0.1 | Performance Cohere1M-cos2l2-Custom-1M 0.0 | 4cc4609939544e4ba8d162ec00835a51 (task_runner.py:411) +2026-06-09 14:59:33,051 | INFO |task submitted: id=4cc4609939544e4ba8d162ec00835a51, 4cc4609939544e4ba8d162ec00835a51, case number: 1 (interface.py:248) +2026-06-09 14:59:33,600 | INFO |[1/1] start case: {'label': , 'name': 'Cohere1M-cos2l2', 'dataset': {'data': {'name': 'Cohere1M-cos2l2', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, drop_old=False (interface.py:178) +2026-06-09 14:59:33,600 | INFO |Starting run (task_runner.py:149) +2026-06-09 14:59:33,627 | INFO |Search config: {} (zvec.py:58) +2026-06-09 14:59:33,820 | INFO |Read the entire file into memory: test.parquet 
(dataset.py:396) +2026-06-09 14:59:33,857 | INFO |Read the entire file into memory: neighbors.parquet (dataset.py:396) +2026-06-09 14:59:33,904 | INFO |Start performance case (task_runner.py:194) +2026-06-09 14:59:34,452 | INFO |Start search 30s in concurrency 16, filters: type= filter_rate=0.0 gt_file_name='neighbors.parquet' (mp_runner.py:129) +2026-06-09 14:59:44,486 | INFO |Syncing all process and start concurrency search, concurrency=16 (mp_runner.py:136) +2026-06-09 15:00:14,576 | INFO |SpawnProcess-1:15 search 30s: actual_dur=30.0329s, count=25112, qps in this process: 836.1497 (mp_runner.py:101) +2026-06-09 15:00:14,576 | INFO |SpawnProcess-1:8 search 30s: actual_dur=30.0366s, count=25472, qps in this process: 848.0321 (mp_runner.py:101) +2026-06-09 15:00:14,576 | INFO |SpawnProcess-1:9 search 30s: actual_dur=30.0269s, count=25685, qps in this process: 855.3997 (mp_runner.py:101) +2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:5 search 30s: actual_dur=30.0257s, count=24790, qps in this process: 825.626 (mp_runner.py:101) +2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:10 search 30s: actual_dur=30.0275s, count=24552, qps in this process: 817.6505 (mp_runner.py:101) +2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:16 search 30s: actual_dur=30.0326s, count=25096, qps in this process: 835.6253 (mp_runner.py:101) +2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:12 search 30s: actual_dur=30.0429s, count=25224, qps in this process: 839.5994 (mp_runner.py:101) +2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:13 search 30s: actual_dur=30.0397s, count=25566, qps in this process: 851.0737 (mp_runner.py:101) +2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:6 search 30s: actual_dur=30.0284s, count=25365, qps in this process: 844.7004 (mp_runner.py:101) +2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:17 search 30s: actual_dur=30.0343s, count=25512, qps in this process: 849.4288 (mp_runner.py:101) +2026-06-09 15:00:14,578 | INFO |SpawnProcess-1:3 search 30s: 
actual_dur=30.0363s, count=25159, qps in this process: 837.6198 (mp_runner.py:101) +2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:2 search 30s: actual_dur=30.038s, count=25444, qps in this process: 847.0604 (mp_runner.py:101) +2026-06-09 15:00:14,578 | INFO |SpawnProcess-1:11 search 30s: actual_dur=30.0297s, count=24896, qps in this process: 829.0459 (mp_runner.py:101) +2026-06-09 15:00:14,578 | INFO |SpawnProcess-1:7 search 30s: actual_dur=30.0366s, count=25161, qps in this process: 837.678 (mp_runner.py:101) +2026-06-09 15:00:14,578 | INFO |SpawnProcess-1:14 search 30s: actual_dur=30.0442s, count=25354, qps in this process: 843.89 (mp_runner.py:101) +2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:4 search 30s: actual_dur=30.0216s, count=25171, qps in this process: 838.4297 (mp_runner.py:101) +2026-06-09 15:00:14,639 | INFO |End search in concurrency 16: dur=30.152784225996584s, total_count=403559, qps=13383.8055 (mp_runner.py:152) +2026-06-09 15:00:14,988 | INFO |Update largest qps with concurrency 16: current max_qps=13383.8055 (mp_runner.py:156) +2026-06-09 15:00:14,991 | INFO |SpawnProcess-1 start serial search (serial_runner.py:217) +2026-06-09 15:00:15,620 | INFO |SpawnProcess-1:18 start search the entire test_data to get recall and latency (serial_runner.py:158) +2026-06-09 15:00:16,653 | INFO |SpawnProcess-1:18 search entire test_data: cost=0.7505s, queries=1000, avg_recall=0.9428, avg_ndcg=0.9512, avg_latency=0.0008, p99=0.0016, p95=0.0008 (serial_runner.py:198) +2026-06-09 15:00:16,772 | INFO |Performance case got result: Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=13383.8055, serial_latency_p99=np.float64(0.0016), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9428), ndcg=np.float64(0.9512), conc_num_list=[16], conc_qps_list=[13383.8055], conc_latency_p99_list=[np.float64(0.002614401644095773)], conc_latency_p95_list=[np.float64(0.0012562056072056293)], 
conc_latency_avg_list=[np.float64(0.001186844353695998)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]) (task_runner.py:232) +2026-06-09 15:00:16,773 | INFO |[1/1] finish case: {'label': , 'name': 'Cohere1M-cos2l2', 'dataset': {'data': {'name': 'Cohere1M-cos2l2', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, result=Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=13383.8055, serial_latency_p99=np.float64(0.0016), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9428), ndcg=np.float64(0.9512), conc_num_list=[16], conc_qps_list=[13383.8055], conc_latency_p99_list=[np.float64(0.002614401644095773)], conc_latency_p95_list=[np.float64(0.0012562056072056293)], conc_latency_avg_list=[np.float64(0.001186844353695998)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]), label=ResultLabel.NORMAL (interface.py:180) +2026-06-09 15:00:16,773 | INFO |Task summary: run_id=4cc46, task_label=4cc4609939544e4ba8d162ec00835a51 (models.py:478) +2026-06-09 15:00:16,773 | INFO |DB | db_label case label | load_dur qps latency(p99) latency(p95) recall max_load_count | label (models.py:478) +2026-06-09 15:00:16,773 | INFO |---- | -------------- --------------- -------------------------------- | ----------- ------------- --------------- --------------- 
------------- -------------- | ----- (models.py:478) +2026-06-09 15:00:16,773 | INFO |Zvec | 16c64g-v0.1 Cohere1M-cos2l2 4cc4609939544e4ba8d162ec00835a51 | 0.0 13383.8055 0.0016 0.0008 0.9428 0 | :) (models.py:478) +2026-06-09 15:00:16,773 | INFO |write results to disk /root/code/VectorDBBench/vectordb_bench/results/Zvec/result_20260609_4cc4609939544e4ba8d162ec00835a51_zvec.json (models.py:315) +2026-06-09 15:00:16,774 | INFO |Success to finish task: label=4cc4609939544e4ba8d162ec00835a51, run_id=4cc4609939544e4ba8d162ec00835a51 (interface.py:219) diff --git a/src/core/interface/index.cc b/src/core/interface/index.cc index 4a40cbb77..e72ab4f15 100644 --- a/src/core/interface/index.cc +++ b/src/core/interface/index.cc @@ -307,6 +307,12 @@ int Index::Open(const std::string &file_path, StorageOptions storage_options) { // Load reformer data from storage (e.g., rotation matrix for IntegerStreaming) if (reformer_ != nullptr) { + // When building a new index, dump converter state (e.g., rotator) to + // storage so the reformer can load it. This is needed for + // enable_rotate with INT8 quantization. 
+ if (storage_options.create_new && converter_ != nullptr) { + converter_->dump_to_storage(storage_); + } if (reformer_->load(storage_) != 0) { LOG_ERROR("Failed to load reformer, path: %s", file_path.c_str()); return core::IndexError_Runtime; From 342b8a3e0655079397bde5db285de4f6c6722fa3 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 9 Jun 2026 19:46:35 +0800 Subject: [PATCH 09/38] debug --- config/construct2.yaml | 2 +- doc/draft.md | 12 ++++++------ src/core/interface/index.cc | 7 ++++++- src/core/quantizer/cosine_converter.cc | 4 ++-- src/core/quantizer/cosine_reformer.cc | 2 +- src/core/quantizer/quantizer_params.h | 4 ++++ 6 files changed, 20 insertions(+), 11 deletions(-) diff --git a/config/construct2.yaml b/config/construct2.yaml index 86383b35e..176363fc5 100644 --- a/config/construct2.yaml +++ b/config/construct2.yaml @@ -15,7 +15,7 @@ BuilderCommon: DisableIdMap: true ConverterParams: - integer_streaming.converter.enable_rotate: !!bool true + cosine.converter.enable_rotate: !!bool true BuilderParams: #各Builder方法的params参数 proxima.hnsw.streamer.efconstruction: !!int 500 diff --git a/doc/draft.md b/doc/draft.md index b1bbfa38d..c38b22f6f 100644 --- a/doc/draft.md +++ b/doc/draft.md @@ -39,12 +39,12 @@ static const std::string INTEGER_STREAMING_CONVERTER_ENABLE_ROTATE = //! IntegerStreamingReformer static const std::string INTEGER_STREAMING_REFORMER_ENABLE_ROTATE = "integer_streaming.reformer.enable_rotate"; -``` -【check】: CosineInt8Converter层和IntegerStreamingConverter共用同一个标志不太好: -改为: -```cpp -integer_streaming.converter.enable_rotate -consine.converter.enable_rotate +//! CosineConverter +static const std::string COSINE_CONVERTER_ENABLE_ROTATE = + "cosine.converter.enable_rotate"; +//! 
CosineReformer +static const std::string COSINE_REFORMER_ENABLE_ROTATE = + "cosine.reformer.enable_rotate"; ``` diff --git a/src/core/interface/index.cc b/src/core/interface/index.cc index e72ab4f15..9378eaf7d 100644 --- a/src/core/interface/index.cc +++ b/src/core/interface/index.cc @@ -181,7 +181,12 @@ int Index::CreateAndInitConverterReformer(const QuantizerParam ¶m, // Pass enable_rotate to converter_params (only effective for INT8) if (index_param.enable_rotate) { if (param.type == QuantizerType::kInt8) { - converter_params.set("integer_streaming.converter.enable_rotate", true); + if (index_param.metric_type == MetricType::kCosine) { + converter_params.set("cosine.converter.enable_rotate", true); + } else { + converter_params.set("integer_streaming.converter.enable_rotate", + true); + } } else { LOG_WARN( "enable_rotate is only supported for INT8 quantizer, " diff --git a/src/core/quantizer/cosine_converter.cc b/src/core/quantizer/cosine_converter.cc index a67072866..251f61684 100644 --- a/src/core/quantizer/cosine_converter.cc +++ b/src/core/quantizer/cosine_converter.cc @@ -287,11 +287,11 @@ class CosineConverter : public IndexConverter { } // Read rotation config - params.get(INTEGER_STREAMING_CONVERTER_ENABLE_ROTATE, &enable_rotate_); + params.get(COSINE_CONVERTER_ENABLE_ROTATE, &enable_rotate_); ailego::Params reformer_params; if (enable_rotate_) { - reformer_params.set(INTEGER_STREAMING_REFORMER_ENABLE_ROTATE, true); + reformer_params.set(COSINE_REFORMER_ENABLE_ROTATE, true); } // Compute padded dimension and create rotator if rotation is enabled diff --git a/src/core/quantizer/cosine_reformer.cc b/src/core/quantizer/cosine_reformer.cc index e35d040c8..50d4a3a80 100644 --- a/src/core/quantizer/cosine_reformer.cc +++ b/src/core/quantizer/cosine_reformer.cc @@ -45,7 +45,7 @@ class CosineReformer : public IndexReformer { //! 
Initialize Reformer int init(const ailego::Params ¶ms) override { - params.get(INTEGER_STREAMING_REFORMER_ENABLE_ROTATE, &enable_rotate_); + params.get(COSINE_REFORMER_ENABLE_ROTATE, &enable_rotate_); return 0; } diff --git a/src/core/quantizer/quantizer_params.h b/src/core/quantizer/quantizer_params.h index 9b34a4b30..3c6e4e3b2 100644 --- a/src/core/quantizer/quantizer_params.h +++ b/src/core/quantizer/quantizer_params.h @@ -100,10 +100,14 @@ static const std::string INT4_QUANTIZER_REFORMER_METRIC = //! CosineConverter static const std::string COSINE_CONVERTER_FORCED_HALF_FLOAT = "cosine.converter.forced_half_float"; +static const std::string COSINE_CONVERTER_ENABLE_ROTATE = + "cosine.converter.enable_rotate"; //! CosineReformer static const std::string COSINE_REFORMER_FORCED_HALF_FLOAT = "cosine.reformer.forced_half_float"; +static const std::string COSINE_REFORMER_ENABLE_ROTATE = + "cosine.reformer.enable_rotate"; //! IntegerStreamingConverter static const std::string INTEGER_STREAMING_CONVERTER_ENABLE_NORMALIZE = From b23a648d86e16c512fedaf5ee6f809f51b0c47dc Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 9 Jun 2026 19:56:39 +0800 Subject: [PATCH 10/38] add int8 rotate --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 38c769e2a..79b96ceb6 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,7 @@ allure-* !build_android.sh !build_ios.sh +# congfig +doc +config +example/python \ No newline at end of file From 4d465424c9144e021336e973994a5e12c5c5d8a8 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 9 Jun 2026 19:59:48 +0800 Subject: [PATCH 11/38] add int8 rotate --- .gitignore | 6 +- config/construct.yaml | 23 --- config/construct2.yaml | 23 --- config/search_baseline.yaml | 19 --- config/search_baseline2.yaml | 19 --- config/search_current.yaml | 19 --- config/search_current2.yaml | 19 --- config/search_tmp.yaml | 19 --- doc/draft.md | 318 ----------------------------------- 9 
files changed, 3 insertions(+), 462 deletions(-) delete mode 100644 config/construct.yaml delete mode 100644 config/construct2.yaml delete mode 100644 config/search_baseline.yaml delete mode 100644 config/search_baseline2.yaml delete mode 100644 config/search_current.yaml delete mode 100644 config/search_current2.yaml delete mode 100644 config/search_tmp.yaml delete mode 100644 doc/draft.md diff --git a/.gitignore b/.gitignore index 79b96ceb6..556a6535c 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,6 @@ allure-* !build_ios.sh # congfig -doc -config -example/python \ No newline at end of file +doc/ +config/ +example/python/ \ No newline at end of file diff --git a/config/construct.yaml b/config/construct.yaml deleted file mode 100644 index 6bfc54d3c..000000000 --- a/config/construct.yaml +++ /dev/null @@ -1,23 +0,0 @@ -BuilderCommon: - BuilderClass: HnswStreamer - BuildFile: /root/data/gist/gist_train.vecs - # NeedTrain: true #是否需要走train流程 - # TrainFile: /root/data/cohere/1m/cohere_train_vector_1m.norm.zvec.vecs - DumpPath: ./flase.tmp - IndexPath: /root/data/gist/index/gist.random2.l2.int8.index - - ThreadCount: 16 - - MetricName: SquaredEuclidean - # ConverterName: CosineFp16Converter - ConverterName: Int8StreamingConverter - - DisableIdMap: true - -ConverterParams: - integer_streaming.converter.enable_rotate: !!bool true - -BuilderParams: #各Builder方法的params参数 - proxima.hnsw.streamer.efconstruction: !!int 500 - proxima.hnsw.streamer.use_id_map: !!bool false - proxima.hnsw.streamer.max_neighbor_count: !!int 15 diff --git a/config/construct2.yaml b/config/construct2.yaml deleted file mode 100644 index 176363fc5..000000000 --- a/config/construct2.yaml +++ /dev/null @@ -1,23 +0,0 @@ -BuilderCommon: - BuilderClass: HnswStreamer - BuildFile: /root/data/cohere/1m/cohere_train_vector_1m.norm.zvec.vecs - # NeedTrain: true #是否需要走train流程 - # TrainFile: /root/data/cohere/1m/cohere_train_vector_1m.norm.zvec.vecs - DumpPath: ./flase.tmp - IndexPath: 
/root/data/cohere/1m/index/cohere.random2.cosine.int8.index - - ThreadCount: 16 - - MetricName: Cosine - ConverterName: CosineInt8Converter - # ConverterName: Int8StreamingConverter - - DisableIdMap: true - -ConverterParams: - cosine.converter.enable_rotate: !!bool true - -BuilderParams: #各Builder方法的params参数 - proxima.hnsw.streamer.efconstruction: !!int 500 - proxima.hnsw.streamer.use_id_map: !!bool false - proxima.hnsw.streamer.max_neighbor_count: !!int 15 diff --git a/config/search_baseline.yaml b/config/search_baseline.yaml deleted file mode 100644 index a1bd334f2..000000000 --- a/config/search_baseline.yaml +++ /dev/null @@ -1,19 +0,0 @@ -IndexCommon: - IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":960,"index_type":"kHNSW","metric_type":"kL2sq","quantizer_param":{"type":"kInt8"},"m":15}' - IndexPath: /root/data/gist/index/gist.random.l2.fp32.index - TopK: 100 #指定返回topk,recall时多topk用逗号隔开,bench时会使用最大值 - QueryFile: /root/data/gist/query_random.txt - QueryType: float #指定query向量类型,需要与索引类型一致(FP16索引使用float查询,内部会自动转换) - QueryFirstSep: ";" #指定query第一分隔符,用于分割key和特征 - QuerySecondSep: " " #指定query第二分隔符,用于分割特征各维度 - GroundTruthFile: /root/data/gist/ground_truth.txt - - RecallThreadCount: 16 - BenchThreadCount: 16 #指定bench并发数(召回并发直接使用cpu核数) - BenchIterCount: 100000 #指定bench执行条目数,当query量较少时会对query重复使用 - CompareById: true - LogLevel: info - -QueryConfig: - QueryParam: '{"index_type":"kHNSW","ef_search":180}' - diff --git a/config/search_baseline2.yaml b/config/search_baseline2.yaml deleted file mode 100644 index 100f6e197..000000000 --- a/config/search_baseline2.yaml +++ /dev/null @@ -1,19 +0,0 @@ -IndexCommon: - IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":768,"index_type":"kHNSW","metric_type":"kCosine","quantizer_param":{"type":"kInt8"},"m":15}' - IndexPath: /root/data/cohere/1m/index/cohere.random.cosine.int8.index - TopK: 100 #指定返回topk,recall时多topk用逗号隔开,bench时会使用最大值 - QueryFile: 
/root/data/cohere/1m/cohere_test_vector_1m.1000.norm.random.txt - QueryType: float #指定query向量类型,需要与索引类型一致(FP16索引使用float查询,内部会自动转换) - QueryFirstSep: ";" #指定query第一分隔符,用于分割key和特征 - QuerySecondSep: " " #指定query第二分隔符,用于分割特征各维度 - GroundTruthFile: /root/data/cohere/1m/neighbors.txt - - RecallThreadCount: 16 - BenchThreadCount: 16 #指定bench并发数(召回并发直接使用cpu核数) - BenchIterCount: 100000 #指定bench执行条目数,当query量较少时会对query重复使用 - CompareById: true - LogLevel: info - -QueryConfig: - QueryParam: '{"index_type":"kHNSW","ef_search":180}' - diff --git a/config/search_current.yaml b/config/search_current.yaml deleted file mode 100644 index d4ba76ba5..000000000 --- a/config/search_current.yaml +++ /dev/null @@ -1,19 +0,0 @@ -IndexCommon: - IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":960,"index_type":"kHNSW","metric_type":"kL2sq","quantizer_param":{"type":"kInt8"},"m":15}' - IndexPath: /root/data/gist/index/gist.random2.l2.int8.index - TopK: 100 #指定返回topk,recall时多topk用逗号隔开,bench时会使用最大值 - QueryFile: /root/data/gist/query.txt - QueryType: float #指定query向量类型,需要与索引类型一致(FP16索引使用float查询,内部会自动转换) - QueryFirstSep: ";" #指定query第一分隔符,用于分割key和特征 - QuerySecondSep: " " #指定query第二分隔符,用于分割特征各维度 - GroundTruthFile: /root/data/gist/ground_truth.txt - - RecallThreadCount: 16 - BenchThreadCount: 16 #指定bench并发数(召回并发直接使用cpu核数) - BenchIterCount: 100000 #指定bench执行条目数,当query量较少时会对query重复使用 - CompareById: true - LogLevel: info - -QueryConfig: - QueryParam: '{"index_type":"kHNSW","ef_search":180}' - diff --git a/config/search_current2.yaml b/config/search_current2.yaml deleted file mode 100644 index 44995b51c..000000000 --- a/config/search_current2.yaml +++ /dev/null @@ -1,19 +0,0 @@ -IndexCommon: - IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":768,"index_type":"kHNSW","metric_type":"kCosine","quantizer_param":{"type":"kInt8"},"m":15}' - IndexPath: /root/data/cohere/1m/index/cohere.random2.cosine.int8.index - TopK: 100 #指定返回topk,recall时多topk用逗号隔开,bench时会使用最大值 - QueryFile: 
/root/data/cohere/1m/cohere_test_vector_1m.1000.norm.txt - QueryType: float #指定query向量类型,需要与索引类型一致(FP16索引使用float查询,内部会自动转换) - QueryFirstSep: ";" #指定query第一分隔符,用于分割key和特征 - QuerySecondSep: " " #指定query第二分隔符,用于分割特征各维度 - GroundTruthFile: /root/data/cohere/1m/neighbors.txt - - RecallThreadCount: 16 - BenchThreadCount: 16 #指定bench并发数(召回并发直接使用cpu核数) - BenchIterCount: 100000 #指定bench执行条目数,当query量较少时会对query重复使用 - CompareById: true - LogLevel: info - -QueryConfig: - QueryParam: '{"index_type":"kHNSW","ef_search":180}' - diff --git a/config/search_tmp.yaml b/config/search_tmp.yaml deleted file mode 100644 index ba48b7a30..000000000 --- a/config/search_tmp.yaml +++ /dev/null @@ -1,19 +0,0 @@ -IndexCommon: - IndexConfig: '{"use_id_map":false,"data_type":"DT_FP32","dimension":768,"index_type":"kHNSW","metric_type":"kCosine","quantizer_param":{"type":"kInt8"},"m":15}' - IndexPath: /root/code/VectorDBBench/db/cohere-1m/0/dense.qindex.5.proxima - TopK: 100 #指定返回topk,recall时多topk用逗号隔开,bench时会使用最大值 - QueryFile: /root/data/cohere/1m/cohere_test_vector_1m.1000.norm.txt - QueryType: float #指定query向量类型,需要与索引类型一致(FP16索引使用float查询,内部会自动转换) - QueryFirstSep: ";" #指定query第一分隔符,用于分割key和特征 - QuerySecondSep: " " #指定query第二分隔符,用于分割特征各维度 - GroundTruthFile: /root/data/cohere/1m/neighbors.txt - - RecallThreadCount: 16 - BenchThreadCount: 16 #指定bench并发数(召回并发直接使用cpu核数) - BenchIterCount: 100000 #指定bench执行条目数,当query量较少时会对query重复使用 - CompareById: true - LogLevel: info - -QueryConfig: - QueryParam: '{"index_type":"kHNSW","ef_search":180}' - diff --git a/doc/draft.md b/doc/draft.md deleted file mode 100644 index c38b22f6f..000000000 --- a/doc/draft.md +++ /dev/null @@ -1,318 +0,0 @@ -## 量化方案新增旋转功能 - -1. 动机: -Int8量化采用 per-vector min-max 量化,即用每个向量自身的最小/最大值来确定量化区间 [-127, 127],误差主要来自: - - 维度间的值分布不均匀:某些维度的值远大于其他维度,导致量化区间被少数极端维度"撑开",大部分维度的量化精度被浪费。 - - 非各向同性分布:真实embedding数据的能量往往集中在少数方向上。 -随机旋转在保持距离不变的同时,会将向量的能量均匀分散到所有维度,使每个维度的值分布更接近高斯分布,从而减小per-vector min-max量化的量化误差。 - -2. 
修改类型: -一种可选的量化参数 -```yaml -// 构建侧新增量化配置选项: -ConverterParams: - integer_streaming.converter.enable_rotate: !!bool true -// 搜索侧不做变化 -``` -``` -Build 阶段: - Converter::init() → 读取 enable_rotate=true,创建 rabitqlib::Rotator - Converter::transform() → 每条向量: rotator->rotate(x) → [normalize] → int8 量化 - Converter::dump_to_storage() → 将 rotator 写入 IndexStorage segment(自描述格式) - Reformer::load(storage) → 从 segment 加载 rotator(构建时由 local_builder 调用) - Reformer::convert() → 每条向量: rotator->rotate(x) → [normalize] → int8 量化 → 写入 HNSW - Streamer::dump() → 写入 meta + HNSW 图数据(不感知 converter) - meta.set_reformer() → reformer_params 中写入 enable_rotate=true - -Search 阶段: - Index::Open() → reformer_->load(storage_) → 自动检测 storage 中的 rotator segment - 若存在则加载(无需搜索侧配置 enable_rotate),若不存在则为 no-op - Reformer::transform() → 每条 query: rotator->rotate(q) → [normalize] → int8 量化 -``` -## Int8StreamingConverter具体实现 - -### 1: 新增参数定义 [DONE] -```cpp -//! IntegerStreamingConverter -static const std::string INTEGER_STREAMING_CONVERTER_ENABLE_ROTATE = - "integer_streaming.converter.enable_rotate"; -//! IntegerStreamingReformer -static const std::string INTEGER_STREAMING_REFORMER_ENABLE_ROTATE = - "integer_streaming.reformer.enable_rotate"; -//! CosineConverter -static const std::string COSINE_CONVERTER_ENABLE_ROTATE = - "cosine.converter.enable_rotate"; -//! CosineReformer -static const std::string COSINE_REFORMER_ENABLE_ROTATE = - "cosine.reformer.enable_rotate"; -``` - - -### 2. 新增矩阵旋转工具类 [DONE] -1. 便于拓展,将旋转功能抽象到统一的文件`/root/code/zvec/src/core/quantizer/record_rotater.h`和`record_rotater.cc`中(pimpl模式,rabitqlib依赖仅在.cc中) -2. 实现方式参考/root/code/zvec/src/core/algorithm/hnsw_rabitq中的旋转方式,具体实现调用第三方库/root/code/zvec/thirdparty/RaBitQ-Library -3. 包含功能: - 1. O(d \log d)复杂度的快速旋转 - 2. dump(IndexStorage):将旋转矩阵写入 IndexStorage segment(自描述Header + rabitqlib blob + 32字节对齐) - 3. open:从Storage加载序列化旋转器(通过IndexStorage读取segment,从Header解析type/dim/padded_dim,无需预先init) - 4. 
load:加载用户自定义旋转矩阵(MatrixRotator,行主序 dim x padded_dim) -```cpp -class RecordRotator { - public: - RecordRotator(); - ~RecordRotator(); - - //! Move-only (pimpl with unique_ptr) - RecordRotator(RecordRotator &&) noexcept; - RecordRotator &operator=(RecordRotator &&) noexcept; - RecordRotator(const RecordRotator &) = delete; - RecordRotator &operator=(const RecordRotator &) = delete; - - //! Initialize the rotator - //! @param dimension original vector dimension - //! @param padded_dim padded dimension (rounded up for SIMD alignment) - //! @param rotator_type rotation algorithm (default: FhtKac) - void init(size_t dimension, size_t padded_dim, - RecordRotatorType rotator_type = RecordRotatorType::FhtKac); - - //! Rotate a single vector - //! @param in input vector of size >= dimension - //! @param out output buffer of size >= padded_dim - void rotate(const float *in, float *out) const; - - //! Rotate a single vector into a managed buffer - //! @param in input vector of size >= dimension - //! @return vector of size padded_dim containing rotated result - std::vector rotate(const float *in) const; - - //! Return the serialized size of the rotator in bytes (header + blob) - size_t dump_bytes() const; - - //! Dump the rotator to an IndexStorage as a named segment. - //! Format: [Header: type(1B)|origin_dim(4B)|padded_dim(4B)] [rabitqlib blob] - //! Appends padding for 32-byte alignment. - int dump(const IndexStorage::Pointer &storage, - const std::string &seg_id = RECORD_ROTATOR_SEG_ID) const; - - //! Open the rotator from an IndexStorage segment (self-describing, no init needed). - //! Parses header to get type/dimension/padded_dim, then reconstructs the rotator. - int open(IndexStorage::Pointer storage, - const std::string &seg_id = RECORD_ROTATOR_SEG_ID); - - //! Load a user-specified rotation matrix. - //! Always uses MatrixRotator internally. - //! @param matrix row-major matrix of shape dimension x padded_dim - //! @param dimension original vector dimension - //! 
@param padded_dim padded dimension (must be multiple of 64) - int load(const float *matrix, size_t dimension, size_t padded_dim); - - //! Return the original dimension - size_t dimension() const; - - //! Return the padded dimension - size_t padded_dim() const; - - //! Return the rotator type - RecordRotatorType rotator_type() const; - - //! Check if the rotator is initialized - bool initialized() const; - - private: - struct Impl; - std::unique_ptr impl_; -}; -``` -【check】: 当前直接复用rabitq的旋转方法,可能不太好,待修正 -### 3. 修改 IntegerStreaming 的 Converter 和 Reformer [DONE] - -1. 修改文件:`integer_quantizer_converter.cc` 和 `integer_quantizer_reformer.cc` -2. Converter 修改: - 1. 新增 `#include "record_rotater.h"` 和成员变量 `enable_rotate_`, `rotator_`(无 `padded_dim_`,由 `rotator_->padded_dim()` 派生) - 2. `init()` 读取 `enable_rotate` 标记,创建 FhtKacRotator(padded_dim=向上取64倍数),将 `enable_rotate` 写入 reformer_params - 3. `transform()` 将 `rotator_` 传入 Holder,Holder 通过 `rotator_->padded_dim()` 获取对齐维度 - 4. `dump()` 已删除(DumpPath 已移除),改为 `dump_to_storage()` 调用 `rotator_->dump(storage)` 保存旋转矩阵(自描述格式) - 5. Holder Iterator 的 `encode_record()` 管线:rotate → normalize → quantize -3. Reformer 修改: - 1. `init()` 仅读取 `enable_rotate` 标记(维度信息从序列化数据自描述获取) - 2. `load(storage)` 自动检测 storage 中的 rotator segment(通过 `storage->get(RECORD_ROTATOR_SEG_ID)` 探测),若存在则创建 rotator 并调用 `rotator_->open(storage)` 加载,设置 `enable_rotate_=true`;若不存在则为 no-op。**搜索侧无需在配置中显式指定 enable_rotate** - 3. `transform()`/`convert()` 方法在量化前应用旋转(`convert()` 供构建侧 `do_build_by_streamer()` 调用) - 4. `revert()` 在旋转模式下拒绝反量化 -【check】: -### 4. 修改 Index::Open() [DONE] -1. 修改代码:`src/core/interface/index.cc` -2. 在 `Index::Open()` 中 streamer 打开后,调用 `reformer_->load(storage_)` 加载序列化数据(旋转矩阵等) -3. 对无序列化数据的 reformer(如非旋转模式),`load()` 为 no-op 直接返回 0,不干扰运行时功能 - -### 5. 修改local_builder.cc,使其可以保存旋转矩阵 [DONE] -1. 删除 DumpPath 相关代码(AlignSize、dump_meta_segment、dump_taglist 辅助函数,check_config 中 DumpPath 检查,do_build/do_build_sparse 中的 DUMP 代码块) -2. 
保留 IndexPath 流式构建路径,保留 UseTrainer 路径的 IndexDumper(写入 TrainerIndexPath) -3. RecordRotator 新增 `dump(IndexStorage::Pointer)` 重载,将旋转矩阵写入 IndexStorage segment -4. IndexConverter 基类新增 `dump_to_storage()` 虚方法(默认 no-op),IntegerStreamingConverter 重写以持久化 rotator -5. local_builder.cc 中 `convert_holder()`/`convert_sparse_holder()` 输出 converter 指针,`build_by_streamer()`/`build_sparse_by_streamer()` 在 `streamer->open(storage)` 后调用 `converter->dump_to_storage(storage)` -6. 删除 RecordRotator::dump(IndexDumper) 死代码(DumpPath 已删除,无调用者) -7. `do_build_by_streamer()` 新增 storage 参数,reformer `init()` 后调用 `reformer->load(storage)` 加载 rotator,确保构建侧数据向量被旋转 -8. 修改文件清单: - - `tools/core/local_builder.cc`:删除 DumpPath 代码,添加 converter 传递和 dump_to_storage 调用,`do_build_by_streamer()` 传入 storage 并加载 reformer - - `src/core/quantizer/record_rotater.h/cc`:新增 dump(IndexStorage),删除 dump(IndexDumper) - - `src/include/zvec/core/framework/index_converter.h`:新增 dump_to_storage() 虚方法 - - `src/core/quantizer/integer_quantizer_converter.cc`:重写 dump_to_storage(),删除 dump(IndexDumper) override - - `src/core/quantizer/integer_quantizer_reformer.cc`:`load()` 改为自动检测 storage 中的 rotator segment - -### 6. 搜索侧自动检测旋转器 [DONE] -1. `IntegerStreamingReformer::load(storage)` 自动检测 storage 中的 `RECORD_ROTATOR_SEG_ID` segment -2. 若 segment 存在,创建 rotator 并从 storage 加载,设置 `enable_rotate_=true` -3. 若 segment 不存在,为 no-op(非旋转索引正常工作) -4. 搜索侧配置 `search_current.yaml` 无需指定 `enable_rotate`,旋转信息完全由索引文件自描述 -5. 修改文件:`src/core/quantizer/integer_quantizer_reformer.cc` - -### 7. 编译配置修复 [DONE] -1. `record_rotater.cc` 包含 rabitqlib 的 `rotator.hpp`,其中 `flip_sign()` 和 `kacs_walk()` 使用编译时 `#if defined(__AVX2__)` 宏守卫 -2. 需要在 CMake 中为 `record_rotater.cc` 添加 `-march=core-avx2` 编译标志(即 `RABITQ_ARCH_FLAG`) -3. 该文件被两个 CMake 目标编译,均需要添加: - - `src/core/CMakeLists.txt`:`zvec_core` 目标 - - `src/core/quantizer/CMakeLists.txt`:`core_quantizer_objects` 目标(recall/bench 链接此目标,容易遗漏) -4. 修改文件:`src/core/CMakeLists.txt`、`src/core/quantizer/CMakeLists.txt` - -### 8. 
端到端验证 [DONE] -1. 编译:`cmake -DCMAKE_BUILD_TYPE=Release .. && make -j$(nproc)` -2. 构建索引:`./build/bin/local_builder config/construct.yaml`(ConverterParams 中指定 `integer_streaming.converter.enable_rotate: true`) -3. 搜索测试:`./build/bin/bench config/search_current.yaml`、`./build/bin/recall config/search_current.yaml` -4. 实验结果(gist 100万条 960维 FP32 → INT8,ef_search=180): - -| 配置 | Recall@100 | QPS | -|---|---|---| -| Baseline(无旋转) | 84.317 | 21,715 | -| 旋转索引 | 84.165 | 22,847 | - -## CosineInt8Converter具体实现 -1. 模仿Int8StreamingConverter具体实现,将功能拓展到CosineInt8Converter -2. 构建索引:`./build/bin/local_builder config/construct2.yaml` -3. baseline测试:`./build/bin/bench config/search_baseline2.yaml`、`./build/bin/recall config/search_baseline2.yaml` -4. 搜索测试:`./build/bin/bench config/search_current2.yaml`、`./build/bin/recall config/search_current2.yaml` - -## python层接口 -1. 当前的enable_rotate仅仅支持int8,如果不是int8却有该配置,默认无效并警告 -2. 实现时上层和下层尽量解耦合 - -### 实现方案 [DONE] - -#### 层级解耦设计 -``` -Python SDK (HnswIndexParam) - ↓ pybind11 -DB Layer (HnswIndexParams) — enable_rotate_ 用户接口 - ↓ engine_helper.hpp -Core Layer (BaseIndexParam) — enable_rotate 内部字段 - ↓ index.cc CreateAndInitConverterReformer() -Converter Layer — converter_params.set("integer_streaming.converter.enable_rotate", true) -``` - -#### 修改文件清单 -1. `src/include/zvec/db/index_params.h`:`HnswIndexParams` 新增 `enable_rotate_` 成员、构造函数参数、getter/setter、clone/to_string/operator== -2. `src/include/zvec/core/interface/index_param.h`:`BaseIndexParam` 新增 `enable_rotate` 字段 -3. `src/include/zvec/core/interface/index_param_builders.h`:`BaseIndexParamBuilder` 新增 `WithEnableRotate()` 方法 -4. `src/db/index/column/vector_column/engine_helper.hpp`:HNSW 分支调用 `WithEnableRotate(db_index_params->enable_rotate())` -5. `src/core/interface/index.cc`:`CreateAndInitConverterReformer()` 检查 `index_param.enable_rotate`,仅 INT8 生效,非 INT8 打印 WARN -6. `src/db/proto/zvec.proto`:`HnswIndexParams` message 新增 `enable_rotate = 5` -7. 
`src/db/index/common/proto_converter.cc`:`FromPb`/`ToPb` 处理 `enable_rotate` -8. `src/binding/python/model/param/python_param.cc`:`HnswIndexParam` pybind11 绑定新增 `enable_rotate` 参数、property、to_dict/repr/pickle -9. `src/core/interface/index.cc`:`Index::Open()` 新增 `create_new` 时 `converter_->dump_to_storage()` 逻辑,修复 DB 构建路径 Reformer 加载 rotator 失败 -10. `examples/python/int8_rotate_build.py`:新增 Python INT8+rotate 构建示例 -11. `examples/python/int8_rotate_query.py`:新增 Python INT8+rotate 查询示例 - -#### Python 使用方式 -```python -from zvec import HnswIndexParam, MetricType, QuantizeType - -# 创建带旋转的 INT8 索引 -params = HnswIndexParam( - metric_type=MetricType.COSINE, - m=15, - ef_construction=500, - quantize_type=QuantizeType.INT8, - enable_rotate=True, # 新增参数 -) -print(params) -# {"metric_type":COSINE, "m":15, "ef_construction":500, "quantize_type":INT8, "use_contiguous_memory":false, "enable_rotate":true} -``` - -#### Python 示例脚本(已完成) -模仿 `dco_build.py` / `dco_query.py`,将 `construct2.yaml` / `search_current2.yaml` 在 Python 层实现。 - -- **构建脚本**:`examples/python/int8_rotate_build.py` - - 读取 `.zvec.vecs` 文件,创建 Collection(INT8 + enable_rotate=True + COSINE) - - 触发 CosineInt8Converter + FhtKacRotator - - 插入 → optimize → flush -- **查询脚本**:`examples/python/int8_rotate_query.py` - - 打开已构建的 Collection,加载 Reformer(自动从 storage 检测 rotator) - - 执行 search + recall 评估 - -#### 额外修复 -`src/core/interface/index.cc` `Index::Open()` 新增 L310-314:DB 构建路径(`create_new=true`)下,先将 Converter 的 rotator dump 到 storage,再让 Reformer load。修复前 DB optimize 阶段会因 Reformer 找不到 rotator segment 而失败。 - -```cpp -// When building a new index, dump converter state (e.g., rotator) to -// storage so the reformer can load it. 
-if (storage_options.create_new && converter_ != nullptr) { - converter_->dump_to_storage(storage_); -} -``` - -#### 测试结果(Cohere 1M, dim=768, Cosine, INT8+rotate, m=15, ef_construction=500) - -| 指标 | 数值 | -|------|------| -| 插入速度 | ~20k docs/s | -| HNSW 构建 | 111.5s | -| QPS (ef=180) | ~1344 | -| recall@100 | 94.03% | - -## 对接 VectorDBBench -1. 环境: -``` -conda activate baseline -``` -2. 原始指令 -``` -vectordbbench zvec \ ---path /root/code/VectorDBBench/db/cohere-1m \ ---db-label 16c64g-v0.1 \ ---case-type Performance768D1M \ ---num-concurrency 16 \ ---quantize-type int8 \ ---m 15 \ ---ef-search 180 \ ---skip-drop-old \ ---skip-load -``` -3. 对接随机旋转方式,改为: -``` -vectordbbench zvec \ ---path /root/code/VectorDBBench/db/cohere-1m-exp \ ---db-label 16c64g-v0.1 \ ---case-type Performance768D1M \ ---num-concurrency 16 \ ---quantize-type int8 \ ---m 15 \ ---ef-search 180 \ ---enable-rotate \ ---skip-drop-old \ ---skip-load -``` -4. 修改 `/root/code/VectorDBBench` 完成对接(已完成): - - `vectordb_bench/backend/clients/zvec/cli.py`:新增 `--enable-rotate` CLI flag - - `vectordb_bench/backend/clients/zvec/config.py`:`ZvecHNSWIndexConfig` 新增 `enable_rotate: bool = False` - - `vectordb_bench/backend/clients/zvec/zvec.py`:`_parse_index_param()` 传递 `enable_rotate` 到 `HnswIndexParam` - - `pip install -e .` 安装 -5. 
测试结果(已完成,Cohere 1M, 768D, Cosine, INT8+rotate, m=15, ef_search=180) - -| 指标 | 数值 | -|------|------| -| 插入耗时 | 33.6s | -| 优化耗时 | 109.8s | -| 并发 QPS (16线程) | **13,989** | -| recall@100 | **93.97%** | -| NDCG | 94.91% | -| 串行延迟 p99 | 1.4ms | -| 串行延迟 p95 | 0.8ms | From 494bd745e9669f582490640e6f4dd6cb198745ab Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 9 Jun 2026 20:03:36 +0800 Subject: [PATCH 12/38] add int8 rotate --- .gitignore | 2 +- examples/python/int8_rotate_build.py | 237 --------------------------- examples/python/int8_rotate_query.py | 228 -------------------------- 3 files changed, 1 insertion(+), 466 deletions(-) delete mode 100644 examples/python/int8_rotate_build.py delete mode 100644 examples/python/int8_rotate_query.py diff --git a/.gitignore b/.gitignore index 556a6535c..8fb272c92 100644 --- a/.gitignore +++ b/.gitignore @@ -55,4 +55,4 @@ allure-* # congfig doc/ config/ -example/python/ \ No newline at end of file +examples/python/ \ No newline at end of file diff --git a/examples/python/int8_rotate_build.py b/examples/python/int8_rotate_build.py deleted file mode 100644 index f519ee672..000000000 --- a/examples/python/int8_rotate_build.py +++ /dev/null @@ -1,237 +0,0 @@ -""" -Zvec Python API — INT8 + Random Rotation Build Example -======================================================= - -Builds a zvec Collection with INT8 quantization and random rotation -(CosineInt8Converter + FhtKacRotator) enabled. - -The key configuration is: - quantize_type=QuantizeType.INT8, enable_rotate=True - -This triggers the C++ CosineInt8Converter to: - 1. Create a FhtKacRotator (random orthogonal rotation matrix) - 2. Rotate all data vectors before INT8 quantization - 3. 
Store the rotator state in the index meta for search-side query rotation - -Equivalent C++ config (construct2.yaml): - ConverterName: CosineInt8Converter - ConverterParams: - integer_streaming.converter.enable_rotate: !!bool true - -Input : /root/data/cohere/1m/cohere_train_vector_1m.norm.zvec.vecs -Output: /root/data/cohere/1m/db/cohere_cosine_int8_rotate - -Usage:: - - conda activate baseline - python int8_rotate_build.py -""" - -from __future__ import annotations - -import mmap -import os -import shutil -import struct -import time - -import numpy as np - -import zvec -from zvec import ( - CollectionOption, - DataType, - Doc, - FieldSchema, - HnswIndexParam, - InvertIndexParam, - LogLevel, - LogType, - MetricType, - OptimizeOption, - QuantizeType, - VectorSchema, -) - -# ==================== Configuration ==================== - -VECS_FILE = "/root/data/cohere/1m/cohere_train_vector_1m.norm.zvec.vecs" -COLLECTION_PATH = "/root/data/cohere/1m/db/cohere_cosine_int8_rotate" - -DIMENSION = 768 -METRIC_TYPE = MetricType.COSINE -HNSW_M = 15 -EF_CONSTRUCTION = 500 - -INSERT_BATCH_SIZE = 1000 - -# ==================== .zvec.vecs Parser ==================== - -VECS_HEADER_FMT = " None: - print("=" * 60) - print(" Zvec Python API — INT8 + Rotate Build Example") - print("=" * 60) - - # ---- Step 1: Init zvec ---- - print("\n[Step 1] Initializing zvec ...") - zvec.init(log_type=LogType.CONSOLE, log_level=LogLevel.INFO) - print(" Done.") - - # ---- Step 2: Parse .zvec.vecs header ---- - print(f"\n[Step 2] Parsing vecs file: {VECS_FILE}") - num_vecs, meta_size, data_start, offsets = parse_vecs_file(VECS_FILE) - dense_offset, dense_size = offsets["dense"] - key_offset, key_size = offsets["key"] - - elem_size = dense_size // num_vecs - vec_dim_floats = elem_size // 4 - print(f" num_vecs: {num_vecs:,}, dim: {vec_dim_floats}") - assert vec_dim_floats == DIMENSION - - # ---- Step 3: Create collection with INT8 + enable_rotate ---- - print(f"\n[Step 3] Creating collection at 
{COLLECTION_PATH} ...") - print(f" quantize_type = QuantizeType.INT8 + enable_rotate=True") - print(f" metric_type = MetricType.COSINE") - print(f" → CosineInt8Converter + FhtKacRotator") - - index_param = HnswIndexParam( - metric_type=METRIC_TYPE, - m=HNSW_M, - ef_construction=EF_CONSTRUCTION, - quantize_type=QuantizeType.INT8, - enable_rotate=True, - ) - print(f" index_param = {index_param}") - - schema = zvec.CollectionSchema( - name="cohere_cosine_int8_rotate", - fields=[ - FieldSchema( - "id", - DataType.INT64, - nullable=False, - index_param=InvertIndexParam(enable_range_optimization=True), - ), - ], - vectors=[ - VectorSchema( - "embedding", - DataType.VECTOR_FP32, - dimension=DIMENSION, - index_param=index_param, - ), - ], - ) - - os.makedirs(os.path.dirname(COLLECTION_PATH), exist_ok=True) - - if os.path.exists(COLLECTION_PATH): - print(f" Removing existing collection ...") - shutil.rmtree(COLLECTION_PATH) - - collection = zvec.create_and_open( - path=COLLECTION_PATH, - schema=schema, - option=CollectionOption(read_only=False, enable_mmap=True), - ) - print(f" Collection created: {collection.schema.name}") - - # ---- Step 4: Read vectors via mmap and insert ---- - print(f"\n[Step 4] Inserting {num_vecs:,} vectors " - f"(batch_size={INSERT_BATCH_SIZE}) ...") - - insert_start = time.perf_counter() - total_inserted = 0 - - with open(VECS_FILE, "rb") as f: - with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: - dense_abs = data_start + dense_offset - key_abs = data_start + key_offset - - batch_docs: list[Doc] = [] - - for i in range(num_vecs): - key_val = struct.unpack_from("= INSERT_BATCH_SIZE: - results = collection.insert(batch_docs) - ok = sum(1 for r in results if r.ok()) - total_inserted += ok - if (total_inserted % 50_000) == 0 or total_inserted == num_vecs: - elapsed = time.perf_counter() - insert_start - speed = total_inserted / elapsed if elapsed > 0 else 0 - print(f" [{total_inserted:>8,} / {num_vecs:,}] " - f"{speed:.0f} docs/s") - 
batch_docs.clear() - - if batch_docs: - results = collection.insert(batch_docs) - ok = sum(1 for r in results if r.ok()) - total_inserted += ok - - insert_elapsed = time.perf_counter() - insert_start - print(f"\n Insert complete: {total_inserted:,} docs " - f"in {insert_elapsed:.1f}s " - f"({total_inserted / insert_elapsed:.0f} docs/s)") - - # ---- Step 5: Optimize (build HNSW graph with INT8 + rotation) ---- - print(f"\n[Step 5] Optimizing collection (building HNSW index with " - f"INT8 + rotation) ...") - opt_start = time.perf_counter() - collection.optimize(option=OptimizeOption()) - opt_elapsed = time.perf_counter() - opt_start - print(f" Optimize done in {opt_elapsed:.1f}s") - - # ---- Step 6: Flush ---- - print(f"\n[Step 6] Flushing collection ...") - collection.flush() - print(f" Doc count: {collection.stats.doc_count:,}") - print(" Done.") - - print(f"\n{'=' * 60}") - print(f" Build complete!") - print(f" Collection saved to: {COLLECTION_PATH}") - print(f" Run int8_rotate_query.py to search and evaluate.") - print(f"{'=' * 60}") - - -if __name__ == "__main__": - main() diff --git a/examples/python/int8_rotate_query.py b/examples/python/int8_rotate_query.py deleted file mode 100644 index f57c3e4e8..000000000 --- a/examples/python/int8_rotate_query.py +++ /dev/null @@ -1,228 +0,0 @@ -""" -Zvec Python API — INT8 + Random Rotation Query Example -======================================================= - -Opens an INT8 + rotation enabled collection (built by int8_rotate_build.py), -runs vector searches, and evaluates recall against ground truth. - -The reformer (CosineInt8Reformer) is automatically loaded from the stored -index meta during collection.open(), which rotates query vectors using the -same FhtKacRotator that was used during build. 
- -Equivalent C++ config (search_current2.yaml): - IndexConfig: '{"quantizer_param":{"type":"kInt8"},"metric_type":"kCosine","m":15,...}' - QueryConfig: '{"index_type":"kHNSW","ef_search":180}' - -Configuration: - Collection : /root/data/cohere/1m/db/cohere_cosine_int8_rotate - TopK : 100 - QueryFile : /root/data/cohere/1m/cohere_test_vector_1m.1000.norm.txt - GroundTruth: /root/data/cohere/1m/neighbors.txt - ef_search : 180 - -Usage:: - - conda activate baseline - python int8_rotate_query.py -""" - -from __future__ import annotations - -import os -import time -from typing import Optional - -import numpy as np - -import zvec -from zvec import ( - CollectionOption, - HnswQueryParam, - LogLevel, - LogType, - Query, -) - -# ==================== Configuration ==================== - -COLLECTION_PATH = "/root/data/cohere/1m/db/cohere_cosine_int8_rotate" -QUERY_FILE = "/root/data/cohere/1m/cohere_test_vector_1m.1000.norm.txt" -GROUNDTRUTH_FILE = "/root/data/cohere/1m/neighbors.txt" - -DIMENSION = 768 -TOPK = 100 -EF_SEARCH = 180 -MAX_QUERIES = 1000 -WARMUP_ROUNDS = 1 -MEASURE_ROUNDS = 3 - - -# ==================== File Parsers ==================== - -def parse_query_file( - path: str, - dimension: int, - first_sep: str = ";", - second_sep: str = " ", - max_queries: int = 0, -) -> list[tuple[Optional[str], np.ndarray]]: - """Parse query file in ``key;v1 v2 v3 ...`` format.""" - queries: list[tuple[Optional[str], np.ndarray]] = [] - - with open(path, "r") as f: - for line in f: - line = line.strip() - if not line: - continue - - parts = line.split(first_sep, 1) - key = parts[0].strip() if parts else None - - vec_str = parts[1].strip().rstrip(first_sep).strip() if len(parts) > 1 else "" - vec_strs = vec_str.split(second_sep) if vec_str else [] - vector = np.array([float(v) for v in vec_strs], dtype=np.float32) - - if len(vector) != dimension: - print(f" Warning: query {key} has dim={len(vector)}, " - f"expected {dimension}, skipping") - continue - - queries.append((key, 
vector)) - if max_queries and len(queries) >= max_queries: - break - - return queries - - -def parse_groundtruth_file( - path: str, - first_sep: str = ";", - second_sep: str = " ", -) -> dict[str, list[str]]: - """Parse ground truth file in ``key;id1 id2 id3 ...`` format.""" - gt: dict[str, list[str]] = {} - - with open(path, "r") as f: - for line in f: - line = line.strip() - if not line: - continue - - parts = line.split(first_sep, 1) - key = parts[0].strip() - - ids_str = parts[1].strip().rstrip(first_sep).strip() if len(parts) > 1 else "" - ids = ids_str.split(second_sep) if ids_str else [] - gt[key] = ids - - return gt - - -# ==================== Main ==================== - -def main() -> None: - print("=" * 60) - print(" Zvec Python API — INT8 + Rotate Query Example") - print("=" * 60) - - # ---- Step 1: Init zvec ---- - print("\n[Step 1] Initializing zvec ...") - zvec.init(log_type=LogType.CONSOLE, log_level=LogLevel.INFO) - print(" Done.") - - # ---- Step 2: Open collection ---- - print(f"\n[Step 2] Opening collection: {COLLECTION_PATH}") - collection = zvec.open( - path=COLLECTION_PATH, - option=CollectionOption(read_only=True, enable_mmap=True), - ) - print(f" Collection : {collection.schema.name}") - print(f" Doc count : {collection.stats.doc_count:,}") - print(f" Dimension : {DIMENSION}") - print(f" TopK : {TOPK}") - print(f" ef_search : {EF_SEARCH}") - - # ---- Step 3: Load queries ---- - print(f"\n[Step 3] Loading queries from: {QUERY_FILE}") - queries = parse_query_file(QUERY_FILE, DIMENSION, - max_queries=MAX_QUERIES) - num_queries = len(queries) - print(f" Loaded {num_queries} queries.") - - # ---- Step 4: Load ground truth ---- - gt: dict[str, list[str]] = {} - if os.path.exists(GROUNDTRUTH_FILE): - print(f"\n[Step 4] Loading ground truth from: {GROUNDTRUTH_FILE}") - gt = parse_groundtruth_file(GROUNDTRUTH_FILE) - print(f" Loaded ground truth for {len(gt)} queries.") - else: - print(f"\n[Step 4] Ground truth not found, skipping recall eval.") - - 
# ---- Step 5: Run rounds (warmup + measured) ---- - total_rounds = WARMUP_ROUNDS + MEASURE_ROUNDS - print(f"\n[Step 5] Running {total_rounds} rounds " - f"({WARMUP_ROUNDS} warmup + {MEASURE_ROUNDS} measured), " - f"{num_queries} queries/round ...") - - round_qps_list: list[float] = [] - round_recall_list: list[float] = [] - - for rnd in range(total_rounds): - is_warmup = rnd < WARMUP_ROUNDS - label = "warmup" if is_warmup else f"measured-{rnd - WARMUP_ROUNDS + 1}" - - search_start = time.perf_counter() - total_recall = 0.0 - matched = 0 - - for idx, (key, vec) in enumerate(queries): - vq = Query( - field_name="embedding", - vector=vec.tolist(), - param=HnswQueryParam(ef=EF_SEARCH), - ) - results = collection.query(queries=vq, topk=TOPK) - qid = key if key is not None else str(idx) - - if qid in gt: - gt_ids = set(gt[qid][:TOPK]) - if gt_ids: - hit = sum(1 for d in results if d.id in gt_ids) - recall = hit / len(gt_ids) - total_recall += recall - matched += 1 - - search_elapsed = time.perf_counter() - search_start - rnd_qps = num_queries / search_elapsed if search_elapsed > 0 else 0 - rnd_recall = (total_recall / matched * 100) if matched > 0 else 0.0 - - if is_warmup: - print(f" [Round {rnd + 1}/{total_rounds}] {label}: " - f"QPS={rnd_qps:.1f} recall@{TOPK}={rnd_recall:.2f}% (discarded)") - else: - round_qps_list.append(rnd_qps) - round_recall_list.append(rnd_recall) - print(f" [Round {rnd + 1}/{total_rounds}] {label}: " - f"QPS={rnd_qps:.1f} recall@{TOPK}={rnd_recall:.2f}%") - - # ---- Step 6: Summary ---- - avg_qps = sum(round_qps_list) / len(round_qps_list) if round_qps_list else 0 - avg_recall = sum(round_recall_list) / len(round_recall_list) if round_recall_list else 0 - min_qps = min(round_qps_list) if round_qps_list else 0 - max_qps = max(round_qps_list) if round_qps_list else 0 - - print(f"\n[Step 6] Summary") - print(f" Warmup rounds : {WARMUP_ROUNDS}") - print(f" Measured rounds : {MEASURE_ROUNDS}") - print(f" Queries/round : {num_queries}") - print(f" 
Avg QPS : {avg_qps:.1f} (min={min_qps:.1f}, max={max_qps:.1f})") - if round_recall_list: - print(f" Avg recall@{TOPK} : {avg_recall:.2f}%") - else: - print(f" Avg recall@{TOPK} : N/A (no ground truth)") - - print(f"\n{'=' * 60}") - - -if __name__ == "__main__": - main() From 252fbcc1889983cb83aebebdcfbcf0cfece30366 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 9 Jun 2026 20:04:46 +0800 Subject: [PATCH 13/38] add int8 rotate --- .gitignore | 3 +- logs/vectordb_bench.log | 295 ---------------------------------------- 2 files changed, 2 insertions(+), 296 deletions(-) delete mode 100644 logs/vectordb_bench.log diff --git a/.gitignore b/.gitignore index 8fb272c92..53fb938be 100644 --- a/.gitignore +++ b/.gitignore @@ -55,4 +55,5 @@ allure-* # congfig doc/ config/ -examples/python/ \ No newline at end of file +examples/python/ +logs/ \ No newline at end of file diff --git a/logs/vectordb_bench.log b/logs/vectordb_bench.log deleted file mode 100644 index 6407fc45e..000000000 --- a/logs/vectordb_bench.log +++ /dev/null @@ -1,295 +0,0 @@ -2026-06-09 11:34:55,964 | INFO |Task: -TaskConfig(db=, db_config=ZvecConfig(db_label='16c64g-v0.1', version='', note='', path='/root/code/VectorDBBench/db/cohere-1m-exp'), db_case_config=ZvecHNSWIndexConfig(metric_type=None, M=15, ef_construction=500, ef_search=180, quantize_type='int8', is_using_refiner=False, enable_rotate=True), case_config=CaseConfig(case_id=, custom_case={}, k=100, concurrency_search_config=ConcurrencySearchConfig(num_concurrency=[16], concurrency_duration=30, concurrency_timeout=3600)), stages=['search_serial', 'search_concurrent'], load_concurrency=0) - (cli.py:659) -2026-06-09 11:34:55,964 | INFO |generated uuid for the tasks: 38274539699b459baa5d743642157fef (interface.py:73) -2026-06-09 11:34:55,992 | INFO | DB | CaseType Dataset Filter | task_label (task_runner.py:411) -2026-06-09 11:34:55,992 | INFO | ----------- | ------------ -------------------- ------- | ------- (task_runner.py:411) 
-2026-06-09 11:34:55,992 | INFO | Zvec-16c64g-v0.1 | Performance Cohere-MEDIUM-1M 0.0 | 38274539699b459baa5d743642157fef (task_runner.py:411) -2026-06-09 11:34:55,992 | INFO |task submitted: id=38274539699b459baa5d743642157fef, 38274539699b459baa5d743642157fef, case number: 1 (interface.py:248) -2026-06-09 11:34:56,541 | INFO |[1/1] start case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, drop_old=False (interface.py:178) -2026-06-09 11:34:56,541 | INFO |Starting run (task_runner.py:149) -2026-06-09 11:34:56,569 | INFO |Search config: {} (zvec.py:58) -2026-06-09 11:34:57,889 | INFO |Read the entire file into memory: test.parquet (dataset.py:396) -2026-06-09 11:34:57,926 | INFO |Read the entire file into memory: neighbors.parquet (dataset.py:396) -2026-06-09 11:34:57,971 | INFO |Start performance case (task_runner.py:194) -2026-06-09 11:34:58,522 | INFO |Start search 30s in concurrency 16, filters: type= filter_rate=0.0 gt_file_name='neighbors.parquet' (mp_runner.py:129) -2026-06-09 11:35:08,555 | INFO |Syncing all process and start concurrency search, concurrency=16 (mp_runner.py:136) -2026-06-09 11:35:38,687 | INFO |SpawnProcess-1:5 search 30s: actual_dur=30.0502s, count=230, qps in this process: 7.6539 (mp_runner.py:101) -2026-06-09 11:35:38,687 | INFO |SpawnProcess-1:8 search 30s: actual_dur=30.0507s, count=231, qps in this process: 7.687 (mp_runner.py:101) -2026-06-09 11:35:38,688 | INFO |SpawnProcess-1:14 search 30s: actual_dur=30.0513s, count=230, qps in this process: 7.6536 (mp_runner.py:101) -2026-06-09 11:35:38,688 | INFO |SpawnProcess-1:13 search 30s: actual_dur=30.0499s, count=232, qps in this process: 7.7205 (mp_runner.py:101) -2026-06-09 11:35:38,688 | INFO |SpawnProcess-1:6 search 30s: actual_dur=30.0426s, count=230, qps in this process: 7.6558 (mp_runner.py:101) -2026-06-09 11:35:38,689 | INFO |SpawnProcess-1:11 
search 30s: actual_dur=30.043s, count=230, qps in this process: 7.6557 (mp_runner.py:101) -2026-06-09 11:35:38,689 | INFO |SpawnProcess-1:4 search 30s: actual_dur=30.0523s, count=230, qps in this process: 7.6533 (mp_runner.py:101) -2026-06-09 11:35:38,689 | INFO |SpawnProcess-1:3 search 30s: actual_dur=30.0565s, count=230, qps in this process: 7.6523 (mp_runner.py:101) -2026-06-09 11:35:38,689 | INFO |SpawnProcess-1:2 search 30s: actual_dur=30.063s, count=231, qps in this process: 7.6839 (mp_runner.py:101) -2026-06-09 11:35:38,689 | INFO |SpawnProcess-1:12 search 30s: actual_dur=30.0617s, count=230, qps in this process: 7.6509 (mp_runner.py:101) -2026-06-09 11:35:38,689 | INFO |SpawnProcess-1:16 search 30s: actual_dur=30.048s, count=232, qps in this process: 7.721 (mp_runner.py:101) -2026-06-09 11:35:38,690 | INFO |SpawnProcess-1:7 search 30s: actual_dur=30.052s, count=232, qps in this process: 7.72 (mp_runner.py:101) -2026-06-09 11:35:38,690 | INFO |SpawnProcess-1:9 search 30s: actual_dur=30.0589s, count=232, qps in this process: 7.7182 (mp_runner.py:101) -2026-06-09 11:35:38,690 | INFO |SpawnProcess-1:10 search 30s: actual_dur=30.051s, count=230, qps in this process: 7.6537 (mp_runner.py:101) -2026-06-09 11:35:38,690 | INFO |SpawnProcess-1:15 search 30s: actual_dur=30.0517s, count=230, qps in this process: 7.6535 (mp_runner.py:101) -2026-06-09 11:35:38,690 | INFO |SpawnProcess-1:17 search 30s: actual_dur=30.064s, count=231, qps in this process: 7.6836 (mp_runner.py:101) -2026-06-09 11:35:38,691 | INFO |End search in concurrency 16: dur=30.135677246842533s, total_count=3691, qps=122.4794 (mp_runner.py:152) -2026-06-09 11:35:39,052 | INFO |Update largest qps with concurrency 16: current max_qps=122.4794 (mp_runner.py:156) -2026-06-09 11:35:39,052 | INFO |SpawnProcess-1 start serial search (serial_runner.py:217) -2026-06-09 11:35:39,687 | INFO |SpawnProcess-1:18 start search the entire test_data to get recall and latency (serial_runner.py:158) -2026-06-09 
11:37:03,768 | INFO |SpawnProcess-1:18 search entire test_data: cost=83.7335s, queries=1000, avg_recall=0.9695, avg_ndcg=0.9779, avg_latency=0.0837, p99=0.0902, p95=0.0849 (serial_runner.py:198) -2026-06-09 11:37:03,890 | INFO |Performance case got result: Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=122.4794, serial_latency_p99=np.float64(0.0902), serial_latency_p95=np.float64(0.0849), recall=np.float64(0.9695), ndcg=np.float64(0.9779), conc_num_list=[16], conc_qps_list=[122.4794], conc_latency_p99_list=[np.float64(0.1534800931811332)], conc_latency_p95_list=[np.float64(0.1342107669916004)], conc_latency_avg_list=[np.float64(0.13012447999361362)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]) (task_runner.py:232) -2026-06-09 11:37:03,890 | INFO |[1/1] finish case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, result=Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=122.4794, serial_latency_p99=np.float64(0.0902), serial_latency_p95=np.float64(0.0849), recall=np.float64(0.9695), ndcg=np.float64(0.9779), conc_num_list=[16], conc_qps_list=[122.4794], conc_latency_p99_list=[np.float64(0.1534800931811332)], conc_latency_p95_list=[np.float64(0.1342107669916004)], conc_latency_avg_list=[np.float64(0.13012447999361362)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], 
st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]), label=ResultLabel.NORMAL (interface.py:180) -2026-06-09 11:37:03,891 | INFO |Task summary: run_id=38274, task_label=38274539699b459baa5d743642157fef (models.py:478) -2026-06-09 11:37:03,891 | INFO |DB | db_label case label | load_dur qps latency(p99) latency(p95) recall max_load_count | label (models.py:478) -2026-06-09 11:37:03,891 | INFO |---- | -------------- --------------------------------------------- -------------------------------- | ----------- ----------- --------------- --------------- ------------- -------------- | ----- (models.py:478) -2026-06-09 11:37:03,891 | INFO |Zvec | 16c64g-v0.1 Search Performance Test (1M Dataset, 768 Dim) 38274539699b459baa5d743642157fef | 0.0 122.4794 0.0902 0.0849 0.9695 0 | :) (models.py:478) -2026-06-09 11:37:03,891 | INFO |write results to disk /root/code/VectorDBBench/vectordb_bench/results/Zvec/result_20260609_38274539699b459baa5d743642157fef_zvec.json (models.py:315) -2026-06-09 11:37:03,891 | INFO |Success to finish task: label=38274539699b459baa5d743642157fef, run_id=38274539699b459baa5d743642157fef (interface.py:219) -2026-06-09 13:30:19,806 | INFO |Task: -TaskConfig(db=, db_config=ZvecConfig(db_label='16c64g-v0.1', version='', note='', path='/root/code/VectorDBBench/db/cohere-1m-exp'), db_case_config=ZvecHNSWIndexConfig(metric_type=None, M=15, ef_construction=500, ef_search=180, quantize_type='int8', is_using_refiner=False, enable_rotate=True), case_config=CaseConfig(case_id=, custom_case={}, k=100, concurrency_search_config=ConcurrencySearchConfig(num_concurrency=[16], concurrency_duration=30, concurrency_timeout=3600)), stages=['search_serial', 'search_concurrent'], load_concurrency=0) - (cli.py:659) -2026-06-09 13:30:19,806 | INFO |generated uuid for the tasks: d98092fa3b88400ab07ede7482192284 (interface.py:73) -2026-06-09 
13:30:19,834 | INFO | DB | CaseType Dataset Filter | task_label (task_runner.py:411) -2026-06-09 13:30:19,834 | INFO | ----------- | ------------ -------------------- ------- | ------- (task_runner.py:411) -2026-06-09 13:30:19,834 | INFO | Zvec-16c64g-v0.1 | Performance Cohere-MEDIUM-1M 0.0 | d98092fa3b88400ab07ede7482192284 (task_runner.py:411) -2026-06-09 13:30:19,834 | INFO |task submitted: id=d98092fa3b88400ab07ede7482192284, d98092fa3b88400ab07ede7482192284, case number: 1 (interface.py:248) -2026-06-09 13:30:20,378 | INFO |[1/1] start case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, drop_old=False (interface.py:178) -2026-06-09 13:30:20,379 | INFO |Starting run (task_runner.py:149) -2026-06-09 13:30:20,406 | INFO |Search config: {} (zvec.py:58) -2026-06-09 13:30:21,725 | INFO |Read the entire file into memory: test.parquet (dataset.py:396) -2026-06-09 13:30:21,762 | INFO |Read the entire file into memory: neighbors.parquet (dataset.py:396) -2026-06-09 13:30:21,809 | INFO |Start performance case (task_runner.py:194) -2026-06-09 13:30:22,370 | INFO |Start search 30s in concurrency 16, filters: type= filter_rate=0.0 gt_file_name='neighbors.parquet' (mp_runner.py:129) -2026-06-09 13:30:32,408 | INFO |Syncing all process and start concurrency search, concurrency=16 (mp_runner.py:136) -2026-06-09 13:31:02,526 | INFO |SpawnProcess-1:17 search 30s: actual_dur=30.0343s, count=233, qps in this process: 7.7578 (mp_runner.py:101) -2026-06-09 13:31:02,536 | INFO |SpawnProcess-1:11 search 30s: actual_dur=30.0289s, count=232, qps in this process: 7.7259 (mp_runner.py:101) -2026-06-09 13:31:02,536 | INFO |SpawnProcess-1:9 search 30s: actual_dur=30.0297s, count=233, qps in this process: 7.759 (mp_runner.py:101) -2026-06-09 13:31:02,536 | INFO |SpawnProcess-1:16 search 30s: actual_dur=30.0532s, count=232, qps in this process: 7.7196 
(mp_runner.py:101) -2026-06-09 13:31:02,543 | INFO |SpawnProcess-1:6 search 30s: actual_dur=30.0533s, count=231, qps in this process: 7.6863 (mp_runner.py:101) -2026-06-09 13:31:02,543 | INFO |SpawnProcess-1:3 search 30s: actual_dur=30.0535s, count=231, qps in this process: 7.6863 (mp_runner.py:101) -2026-06-09 13:31:02,561 | INFO |SpawnProcess-1:14 search 30s: actual_dur=30.024s, count=232, qps in this process: 7.7272 (mp_runner.py:101) -2026-06-09 13:31:02,563 | INFO |SpawnProcess-1:10 search 30s: actual_dur=30.0738s, count=233, qps in this process: 7.7476 (mp_runner.py:101) -2026-06-09 13:31:02,572 | INFO |SpawnProcess-1:13 search 30s: actual_dur=30.0603s, count=233, qps in this process: 7.7511 (mp_runner.py:101) -2026-06-09 13:31:02,572 | INFO |SpawnProcess-1:5 search 30s: actual_dur=30.0773s, count=234, qps in this process: 7.78 (mp_runner.py:101) -2026-06-09 13:31:02,573 | INFO |SpawnProcess-1:2 search 30s: actual_dur=30.0848s, count=233, qps in this process: 7.7448 (mp_runner.py:101) -2026-06-09 13:31:02,573 | INFO |SpawnProcess-1:4 search 30s: actual_dur=30.0807s, count=233, qps in this process: 7.7458 (mp_runner.py:101) -2026-06-09 13:31:02,574 | INFO |SpawnProcess-1:8 search 30s: actual_dur=30.0949s, count=233, qps in this process: 7.7422 (mp_runner.py:101) -2026-06-09 13:31:02,574 | INFO |SpawnProcess-1:15 search 30s: actual_dur=30.0926s, count=233, qps in this process: 7.7428 (mp_runner.py:101) -2026-06-09 13:31:02,643 | INFO |SpawnProcess-1:7 search 30s: actual_dur=30.1135s, count=232, qps in this process: 7.7042 (mp_runner.py:101) -2026-06-09 13:31:02,648 | INFO |SpawnProcess-1:12 search 30s: actual_dur=30.1185s, count=232, qps in this process: 7.7029 (mp_runner.py:101) -2026-06-09 13:31:02,650 | INFO |End search in concurrency 16: dur=30.241203671321273s, total_count=3720, qps=123.011 (mp_runner.py:152) -2026-06-09 13:31:02,998 | INFO |Update largest qps with concurrency 16: current max_qps=123.011 (mp_runner.py:156) -2026-06-09 13:31:02,998 | INFO 
|SpawnProcess-1 start serial search (serial_runner.py:217) -2026-06-09 13:31:03,635 | INFO |SpawnProcess-1:18 start search the entire test_data to get recall and latency (serial_runner.py:158) -2026-06-09 13:32:27,471 | INFO |SpawnProcess-1:18 search entire test_data: cost=83.4878s, queries=1000, avg_recall=0.9695, avg_ndcg=0.9779, avg_latency=0.0835, p99=0.085, p95=0.0845 (serial_runner.py:198) -2026-06-09 13:32:27,596 | INFO |Performance case got result: Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=123.011, serial_latency_p99=np.float64(0.085), serial_latency_p95=np.float64(0.0845), recall=np.float64(0.9695), ndcg=np.float64(0.9779), conc_num_list=[16], conc_qps_list=[123.011], conc_latency_p99_list=[np.float64(0.14689669567625965)], conc_latency_p95_list=[np.float64(0.13342893943190576)], conc_latency_avg_list=[np.float64(0.1292198499749785)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]) (task_runner.py:232) -2026-06-09 13:32:27,596 | INFO |[1/1] finish case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, result=Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=123.011, serial_latency_p99=np.float64(0.085), serial_latency_p95=np.float64(0.0845), recall=np.float64(0.9695), ndcg=np.float64(0.9779), conc_num_list=[16], conc_qps_list=[123.011], conc_latency_p99_list=[np.float64(0.14689669567625965)], conc_latency_p95_list=[np.float64(0.13342893943190576)], conc_latency_avg_list=[np.float64(0.1292198499749785)], 
st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]), label=ResultLabel.NORMAL (interface.py:180) -2026-06-09 13:32:27,597 | INFO |Task summary: run_id=d9809, task_label=d98092fa3b88400ab07ede7482192284 (models.py:478) -2026-06-09 13:32:27,597 | INFO |DB | db_label case label | load_dur qps latency(p99) latency(p95) recall max_load_count | label (models.py:478) -2026-06-09 13:32:27,597 | INFO |---- | -------------- --------------------------------------------- -------------------------------- | ----------- ---------- --------------- --------------- ------------- -------------- | ----- (models.py:478) -2026-06-09 13:32:27,597 | INFO |Zvec | 16c64g-v0.1 Search Performance Test (1M Dataset, 768 Dim) d98092fa3b88400ab07ede7482192284 | 0.0 123.011 0.085 0.0845 0.9695 0 | :) (models.py:478) -2026-06-09 13:32:27,597 | INFO |write results to disk /root/code/VectorDBBench/vectordb_bench/results/Zvec/result_20260609_d98092fa3b88400ab07ede7482192284_zvec.json (models.py:315) -2026-06-09 13:32:27,597 | INFO |Success to finish task: label=d98092fa3b88400ab07ede7482192284, run_id=d98092fa3b88400ab07ede7482192284 (interface.py:219) -2026-06-09 14:44:02,774 | INFO |Task: -TaskConfig(db=, db_config=ZvecConfig(db_label='16c64g-v0.1', version='', note='', path='/root/code/VectorDBBench/db/cohere-1m-exp'), db_case_config=ZvecHNSWIndexConfig(metric_type=None, M=15, ef_construction=500, ef_search=180, quantize_type='int8', is_using_refiner=False, enable_rotate=True), case_config=CaseConfig(case_id=, custom_case={}, k=100, concurrency_search_config=ConcurrencySearchConfig(num_concurrency=[16], concurrency_duration=30, concurrency_timeout=3600)), 
stages=['drop_old', 'load', 'search_serial', 'search_concurrent'], load_concurrency=0) - (cli.py:659) -2026-06-09 14:44:02,774 | INFO |generated uuid for the tasks: 602b05d4b2b04d68ac77fa1311ecb8a1 (interface.py:73) -2026-06-09 14:44:02,802 | INFO | DB | CaseType Dataset Filter | task_label (task_runner.py:411) -2026-06-09 14:44:02,802 | INFO | ----------- | ------------ -------------------- ------- | ------- (task_runner.py:411) -2026-06-09 14:44:02,802 | INFO | Zvec-16c64g-v0.1 | Performance Cohere-MEDIUM-1M 0.0 | 602b05d4b2b04d68ac77fa1311ecb8a1 (task_runner.py:411) -2026-06-09 14:44:02,802 | INFO |task submitted: id=602b05d4b2b04d68ac77fa1311ecb8a1, 602b05d4b2b04d68ac77fa1311ecb8a1, case number: 1 (interface.py:248) -2026-06-09 14:44:03,344 | INFO |[1/1] start case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, drop_old=True (interface.py:178) -2026-06-09 14:44:03,345 | INFO |Starting run (task_runner.py:149) -2026-06-09 14:44:03,371 | INFO |Search config: {} (zvec.py:58) -2026-06-09 14:44:05,173 | INFO |Read the entire file into memory: test.parquet (dataset.py:396) -2026-06-09 14:44:05,210 | INFO |Read the entire file into memory: neighbors.parquet (dataset.py:396) -2026-06-09 14:44:05,259 | INFO |Start performance case (task_runner.py:194) -2026-06-09 14:44:05,911 | INFO |(SpawnProcess-1:1) Start concurrent insert, batch_size=100, max_workers=4 (concurrent_runner.py:187) -2026-06-09 14:44:05,911 | INFO |Get iterator for shuffle_train.parquet (dataset.py:428) -2026-06-09 14:44:33,099 | INFO |(SpawnProcess-1:1) Finish concurrent insert, count=1000000, dur=27.19s (concurrent_runner.py:208) -2026-06-09 14:46:29,510 | INFO |Finish loading the entire dataset into VectorDB, insert_duration=33.62117011426017, optimize_duration=109.80431694490835 load_duration(insert + optimize) = 143.4255 (task_runner.py:204) -2026-06-09 
14:46:30,070 | INFO |Start search 30s in concurrency 16, filters: type= filter_rate=0.0 gt_file_name='neighbors.parquet' (mp_runner.py:129) -2026-06-09 14:46:40,094 | INFO |Syncing all process and start concurrency search, concurrency=16 (mp_runner.py:136) -2026-06-09 14:47:10,175 | INFO |SpawnProcess-1:17 search 30s: actual_dur=30.0219s, count=25457, qps in this process: 847.9477 (mp_runner.py:101) -2026-06-09 14:47:10,175 | INFO |SpawnProcess-1:19 search 30s: actual_dur=30.0268s, count=26710, qps in this process: 889.5387 (mp_runner.py:101) -2026-06-09 14:47:10,176 | INFO |SpawnProcess-1:18 search 30s: actual_dur=30.0279s, count=26538, qps in this process: 883.7781 (mp_runner.py:101) -2026-06-09 14:47:10,176 | INFO |SpawnProcess-1:12 search 30s: actual_dur=30.0301s, count=26581, qps in this process: 885.1452 (mp_runner.py:101) -2026-06-09 14:47:10,176 | INFO |SpawnProcess-1:15 search 30s: actual_dur=30.0277s, count=26796, qps in this process: 892.376 (mp_runner.py:101) -2026-06-09 14:47:10,176 | INFO |SpawnProcess-1:14 search 30s: actual_dur=30.022s, count=26131, qps in this process: 870.395 (mp_runner.py:101) -2026-06-09 14:47:10,176 | INFO |SpawnProcess-1:5 search 30s: actual_dur=30.0192s, count=26042, qps in this process: 867.5115 (mp_runner.py:101) -2026-06-09 14:47:10,175 | INFO |SpawnProcess-1:11 search 30s: actual_dur=30.0187s, count=26256, qps in this process: 874.6548 (mp_runner.py:101) -2026-06-09 14:47:10,176 | INFO |SpawnProcess-1:4 search 30s: actual_dur=30.0303s, count=26661, qps in this process: 887.8033 (mp_runner.py:101) -2026-06-09 14:47:10,177 | INFO |SpawnProcess-1:10 search 30s: actual_dur=30.0208s, count=26373, qps in this process: 878.4909 (mp_runner.py:101) -2026-06-09 14:47:10,176 | INFO |SpawnProcess-1:6 search 30s: actual_dur=30.0265s, count=26415, qps in this process: 879.7229 (mp_runner.py:101) -2026-06-09 14:47:10,177 | INFO |SpawnProcess-1:8 search 30s: actual_dur=30.0203s, count=26138, qps in this process: 870.6775 
(mp_runner.py:101) -2026-06-09 14:47:10,178 | INFO |SpawnProcess-1:13 search 30s: actual_dur=30.0185s, count=26339, qps in this process: 877.4256 (mp_runner.py:101) -2026-06-09 14:47:10,179 | INFO |SpawnProcess-1:7 search 30s: actual_dur=30.0158s, count=26613, qps in this process: 886.633 (mp_runner.py:101) -2026-06-09 14:47:10,181 | INFO |SpawnProcess-1:9 search 30s: actual_dur=30.0149s, count=26753, qps in this process: 891.324 (mp_runner.py:101) -2026-06-09 14:47:10,187 | INFO |SpawnProcess-1:16 search 30s: actual_dur=30.0168s, count=25916, qps in this process: 863.3832 (mp_runner.py:101) -2026-06-09 14:47:10,241 | INFO |End search in concurrency 16: dur=30.14720565499738s, total_count=421719, qps=13988.6597 (mp_runner.py:152) -2026-06-09 14:47:10,587 | INFO |Update largest qps with concurrency 16: current max_qps=13988.6597 (mp_runner.py:156) -2026-06-09 14:47:10,591 | INFO |SpawnProcess-1 start serial search (serial_runner.py:217) -2026-06-09 14:47:11,232 | INFO |SpawnProcess-1:20 start search the entire test_data to get recall and latency (serial_runner.py:158) -2026-06-09 14:47:12,265 | INFO |SpawnProcess-1:20 search entire test_data: cost=0.7515s, queries=1000, avg_recall=0.9397, avg_ndcg=0.9491, avg_latency=0.0008, p99=0.0014, p95=0.0008 (serial_runner.py:198) -2026-06-09 14:47:12,389 | INFO |Performance case got result: Metric(max_load_count=0, insert_duration=33.6212, optimize_duration=109.8043, load_duration=143.4255, qps=13988.6597, serial_latency_p99=np.float64(0.0014), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9397), ndcg=np.float64(0.9491), conc_num_list=[16], conc_qps_list=[13988.6597], conc_latency_p99_list=[np.float64(0.0022724022064358014)], conc_latency_p95_list=[np.float64(0.001238895207643509)], conc_latency_avg_list=[np.float64(0.001135618022694545)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], 
st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]) (task_runner.py:232) -2026-06-09 14:47:12,390 | INFO |[1/1] finish case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, result=Metric(max_load_count=0, insert_duration=33.6212, optimize_duration=109.8043, load_duration=143.4255, qps=13988.6597, serial_latency_p99=np.float64(0.0014), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9397), ndcg=np.float64(0.9491), conc_num_list=[16], conc_qps_list=[13988.6597], conc_latency_p99_list=[np.float64(0.0022724022064358014)], conc_latency_p95_list=[np.float64(0.001238895207643509)], conc_latency_avg_list=[np.float64(0.001135618022694545)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]), label=ResultLabel.NORMAL (interface.py:180) -2026-06-09 14:47:12,390 | INFO |Task summary: run_id=602b0, task_label=602b05d4b2b04d68ac77fa1311ecb8a1 (models.py:478) -2026-06-09 14:47:12,390 | INFO |DB | db_label case label | load_dur qps latency(p99) latency(p95) recall max_load_count | label (models.py:478) -2026-06-09 14:47:12,390 | INFO |---- | -------------- --------------------------------------------- -------------------------------- | ----------- ------------- --------------- --------------- ------------- -------------- | ----- (models.py:478) -2026-06-09 14:47:12,390 | INFO |Zvec | 16c64g-v0.1 Search Performance Test (1M Dataset, 768 Dim) 
602b05d4b2b04d68ac77fa1311ecb8a1 | 143.4255 13988.6597 0.0014 0.0008 0.9397 0 | :) (models.py:478) -2026-06-09 14:47:12,390 | INFO |write results to disk /root/code/VectorDBBench/vectordb_bench/results/Zvec/result_20260609_602b05d4b2b04d68ac77fa1311ecb8a1_zvec.json (models.py:315) -2026-06-09 14:47:12,390 | INFO |Success to finish task: label=602b05d4b2b04d68ac77fa1311ecb8a1, run_id=602b05d4b2b04d68ac77fa1311ecb8a1 (interface.py:219) -2026-06-09 14:51:42,424 | INFO |Task: -TaskConfig(db=, db_config=ZvecConfig(db_label='16c64g-v0.1', version='', note='', path='/root/code/VectorDBBench/db/cohere-1m'), db_case_config=ZvecHNSWIndexConfig(metric_type=None, M=15, ef_construction=500, ef_search=180, quantize_type='int8', is_using_refiner=False, enable_rotate=False), case_config=CaseConfig(case_id=, custom_case={}, k=100, concurrency_search_config=ConcurrencySearchConfig(num_concurrency=[16], concurrency_duration=30, concurrency_timeout=3600)), stages=['search_serial', 'search_concurrent'], load_concurrency=0) - (cli.py:659) -2026-06-09 14:51:42,424 | INFO |generated uuid for the tasks: d802e43419c4461c97e75a0aacd207cb (interface.py:73) -2026-06-09 14:51:42,452 | INFO | DB | CaseType Dataset Filter | task_label (task_runner.py:411) -2026-06-09 14:51:42,452 | INFO | ----------- | ------------ -------------------- ------- | ------- (task_runner.py:411) -2026-06-09 14:51:42,452 | INFO | Zvec-16c64g-v0.1 | Performance Cohere-MEDIUM-1M 0.0 | d802e43419c4461c97e75a0aacd207cb (task_runner.py:411) -2026-06-09 14:51:42,452 | INFO |task submitted: id=d802e43419c4461c97e75a0aacd207cb, d802e43419c4461c97e75a0aacd207cb, case number: 1 (interface.py:248) -2026-06-09 14:51:43,001 | INFO |[1/1] start case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, drop_old=False (interface.py:178) -2026-06-09 14:51:43,001 | INFO |Starting run (task_runner.py:149) 
-2026-06-09 14:51:43,029 | INFO |Search config: {} (zvec.py:58) -2026-06-09 14:51:44,295 | INFO |Read the entire file into memory: test.parquet (dataset.py:396) -2026-06-09 14:51:44,332 | INFO |Read the entire file into memory: neighbors.parquet (dataset.py:396) -2026-06-09 14:51:44,381 | INFO |Start performance case (task_runner.py:194) -2026-06-09 14:51:44,928 | INFO |Start search 30s in concurrency 16, filters: type= filter_rate=0.0 gt_file_name='neighbors.parquet' (mp_runner.py:129) -2026-06-09 14:51:54,965 | INFO |Syncing all process and start concurrency search, concurrency=16 (mp_runner.py:136) -2026-06-09 14:52:25,060 | INFO |SpawnProcess-1:12 search 30s: actual_dur=30.0302s, count=26187, qps in this process: 872.0222 (mp_runner.py:101) -2026-06-09 14:52:25,060 | INFO |SpawnProcess-1:9 search 30s: actual_dur=30.0337s, count=26145, qps in this process: 870.5221 (mp_runner.py:101) -2026-06-09 14:52:25,060 | INFO |SpawnProcess-1:10 search 30s: actual_dur=30.0268s, count=27097, qps in this process: 902.4272 (mp_runner.py:101) -2026-06-09 14:52:25,060 | INFO |SpawnProcess-1:3 search 30s: actual_dur=30.0349s, count=26504, qps in this process: 882.4401 (mp_runner.py:101) -2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:13 search 30s: actual_dur=30.032s, count=26064, qps in this process: 867.8743 (mp_runner.py:101) -2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:11 search 30s: actual_dur=30.0363s, count=25922, qps in this process: 863.0224 (mp_runner.py:101) -2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:7 search 30s: actual_dur=30.039s, count=26177, qps in this process: 871.4338 (mp_runner.py:101) -2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:6 search 30s: actual_dur=30.0356s, count=26352, qps in this process: 877.3589 (mp_runner.py:101) -2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:14 search 30s: actual_dur=30.0397s, count=26370, qps in this process: 877.8383 (mp_runner.py:101) -2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:17 search 30s: 
actual_dur=30.0374s, count=26668, qps in this process: 887.8265 (mp_runner.py:101) -2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:5 search 30s: actual_dur=30.0367s, count=26174, qps in this process: 871.4007 (mp_runner.py:101) -2026-06-09 14:52:25,061 | INFO |SpawnProcess-1:16 search 30s: actual_dur=30.041s, count=26194, qps in this process: 871.9417 (mp_runner.py:101) -2026-06-09 14:52:25,062 | INFO |SpawnProcess-1:8 search 30s: actual_dur=30.0352s, count=26298, qps in this process: 875.5727 (mp_runner.py:101) -2026-06-09 14:52:25,062 | INFO |SpawnProcess-1:4 search 30s: actual_dur=30.035s, count=26270, qps in this process: 874.6462 (mp_runner.py:101) -2026-06-09 14:52:25,062 | INFO |SpawnProcess-1:2 search 30s: actual_dur=30.0416s, count=26479, qps in this process: 881.4111 (mp_runner.py:101) -2026-06-09 14:52:25,062 | INFO |SpawnProcess-1:15 search 30s: actual_dur=30.041s, count=26340, qps in this process: 876.8017 (mp_runner.py:101) -2026-06-09 14:52:25,126 | INFO |End search in concurrency 16: dur=30.160955973900855s, total_count=421241, qps=13966.434 (mp_runner.py:152) -2026-06-09 14:52:25,497 | INFO |Update largest qps with concurrency 16: current max_qps=13966.434 (mp_runner.py:156) -2026-06-09 14:52:25,500 | INFO |SpawnProcess-1 start serial search (serial_runner.py:217) -2026-06-09 14:52:26,132 | INFO |SpawnProcess-1:18 start search the entire test_data to get recall and latency (serial_runner.py:158) -2026-06-09 14:52:27,149 | INFO |SpawnProcess-1:18 search entire test_data: cost=0.7386s, queries=1000, avg_recall=0.9285, avg_ndcg=0.9405, avg_latency=0.0007, p99=0.0017, p95=0.0008 (serial_runner.py:198) -2026-06-09 14:52:27,269 | INFO |Performance case got result: Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=13966.434, serial_latency_p99=np.float64(0.0017), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9285), ndcg=np.float64(0.9405), conc_num_list=[16], conc_qps_list=[13966.434], 
conc_latency_p99_list=[np.float64(0.0023291470482945417)], conc_latency_p95_list=[np.float64(0.001239514909684658)], conc_latency_avg_list=[np.float64(0.0011369578863421963)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]) (task_runner.py:232) -2026-06-09 14:52:27,270 | INFO |[1/1] finish case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, result=Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=13966.434, serial_latency_p99=np.float64(0.0017), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9285), ndcg=np.float64(0.9405), conc_num_list=[16], conc_qps_list=[13966.434], conc_latency_p99_list=[np.float64(0.0023291470482945417)], conc_latency_p95_list=[np.float64(0.001239514909684658)], conc_latency_avg_list=[np.float64(0.0011369578863421963)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]), label=ResultLabel.NORMAL (interface.py:180) -2026-06-09 14:52:27,270 | INFO |Task summary: run_id=d802e, task_label=d802e43419c4461c97e75a0aacd207cb (models.py:478) -2026-06-09 14:52:27,270 | INFO |DB | db_label case label | load_dur qps latency(p99) latency(p95) recall max_load_count | label (models.py:478) -2026-06-09 14:52:27,270 | 
INFO |---- | -------------- --------------------------------------------- -------------------------------- | ----------- ------------ --------------- --------------- ------------- -------------- | ----- (models.py:478) -2026-06-09 14:52:27,270 | INFO |Zvec | 16c64g-v0.1 Search Performance Test (1M Dataset, 768 Dim) d802e43419c4461c97e75a0aacd207cb | 0.0 13966.434 0.0017 0.0008 0.9285 0 | :) (models.py:478) -2026-06-09 14:52:27,270 | INFO |write results to disk /root/code/VectorDBBench/vectordb_bench/results/Zvec/result_20260609_d802e43419c4461c97e75a0aacd207cb_zvec.json (models.py:315) -2026-06-09 14:52:27,270 | INFO |Success to finish task: label=d802e43419c4461c97e75a0aacd207cb, run_id=d802e43419c4461c97e75a0aacd207cb (interface.py:219) -2026-06-09 14:54:01,400 | INFO |Task: -TaskConfig(db=, db_config=ZvecConfig(db_label='16c64g-v0.1', version='', note='', path='/root/code/VectorDBBench/db/cohere-1m'), db_case_config=ZvecHNSWIndexConfig(metric_type=None, M=15, ef_construction=500, ef_search=180, quantize_type='int8', is_using_refiner=False, enable_rotate=False), case_config=CaseConfig(case_id=, custom_case={}, k=100, concurrency_search_config=ConcurrencySearchConfig(num_concurrency=[16], concurrency_duration=30, concurrency_timeout=3600)), stages=['search_serial', 'search_concurrent'], load_concurrency=0) - (cli.py:659) -2026-06-09 14:54:01,400 | INFO |generated uuid for the tasks: e6ff48f902df4da487e0b7a350dce2bb (interface.py:73) -2026-06-09 14:54:01,463 | INFO | DB | CaseType Dataset Filter | task_label (task_runner.py:411) -2026-06-09 14:54:01,463 | INFO | ----------- | ------------ -------------------- ------- | ------- (task_runner.py:411) -2026-06-09 14:54:01,463 | INFO | Zvec-16c64g-v0.1 | Performance Cohere-MEDIUM-1M 0.0 | e6ff48f902df4da487e0b7a350dce2bb (task_runner.py:411) -2026-06-09 14:54:01,463 | INFO |task submitted: id=e6ff48f902df4da487e0b7a350dce2bb, e6ff48f902df4da487e0b7a350dce2bb, case number: 1 (interface.py:248) -2026-06-09 14:54:02,023 | 
INFO |[1/1] start case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, drop_old=False (interface.py:178) -2026-06-09 14:54:02,023 | INFO |Starting run (task_runner.py:149) -2026-06-09 14:54:02,049 | INFO |Search config: {} (zvec.py:58) -2026-06-09 14:54:02,049 | WARNING |[1/1] case {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'} failed to run, reason=__init__(): incompatible constructor arguments. The following argument types are supported: - 1. _zvec.param.HnswIndexParam(metric_type: _zvec.typing.MetricType = , m: typing.SupportsInt | typing.SupportsIndex = 50, ef_construction: typing.SupportsInt | typing.SupportsIndex = 500, quantize_type: _zvec.typing.QuantizeType = , use_contiguous_memory: bool = False) - -Invoked with: kwargs: metric_type=, m=15, ef_construction=500, quantize_type=, enable_rotate=False (interface.py:200) -2026-06-09 14:54:02,050 | INFO |Task summary: run_id=e6ff4, task_label=e6ff48f902df4da487e0b7a350dce2bb (models.py:478) -2026-06-09 14:54:02,050 | INFO |DB | db_label case label | load_dur qps latency(p99) latency(p95) recall max_load_count | label (models.py:478) -2026-06-09 14:54:02,050 | INFO |---- | -------------- --------------------------------------------- -------------------------------- | ----------- ---------- --------------- --------------- ------------- -------------- | ----- (models.py:478) -2026-06-09 14:54:02,050 | INFO |Zvec | 16c64g-v0.1 Search Performance Test (1M Dataset, 768 Dim) e6ff48f902df4da487e0b7a350dce2bb | 0.0 0.0 0.0 0.0 0.0 0 | x (models.py:478) -2026-06-09 14:54:02,050 | INFO |write results to disk /root/code/VectorDBBench/vectordb_bench/results/Zvec/result_20260609_e6ff48f902df4da487e0b7a350dce2bb_zvec.json (models.py:315) -2026-06-09 
14:54:02,050 | INFO |Success to finish task: label=e6ff48f902df4da487e0b7a350dce2bb, run_id=e6ff48f902df4da487e0b7a350dce2bb (interface.py:219) -2026-06-09 14:56:41,670 | INFO |Task: -TaskConfig(db=, db_config=ZvecConfig(db_label='16c64g-v0.1', version='', note='', path='/root/code/VectorDBBench/db/cohere-1m'), db_case_config=ZvecHNSWIndexConfig(metric_type=None, M=15, ef_construction=500, ef_search=180, quantize_type='int8', is_using_refiner=False, enable_rotate=False), case_config=CaseConfig(case_id=, custom_case={}, k=100, concurrency_search_config=ConcurrencySearchConfig(num_concurrency=[16], concurrency_duration=30, concurrency_timeout=3600)), stages=['search_serial', 'search_concurrent'], load_concurrency=0) - (cli.py:659) -2026-06-09 14:56:41,670 | INFO |generated uuid for the tasks: d15c6088018a44188967789891ac1acf (interface.py:73) -2026-06-09 14:56:41,699 | INFO | DB | CaseType Dataset Filter | task_label (task_runner.py:411) -2026-06-09 14:56:41,699 | INFO | ----------- | ------------ -------------------- ------- | ------- (task_runner.py:411) -2026-06-09 14:56:41,699 | INFO | Zvec-16c64g-v0.1 | Performance Cohere-MEDIUM-1M 0.0 | d15c6088018a44188967789891ac1acf (task_runner.py:411) -2026-06-09 14:56:41,699 | INFO |task submitted: id=d15c6088018a44188967789891ac1acf, d15c6088018a44188967789891ac1acf, case number: 1 (interface.py:248) -2026-06-09 14:56:42,244 | INFO |[1/1] start case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, drop_old=False (interface.py:178) -2026-06-09 14:56:42,244 | INFO |Starting run (task_runner.py:149) -2026-06-09 14:56:42,271 | INFO |Search config: {} (zvec.py:58) -2026-06-09 14:56:43,535 | INFO |Read the entire file into memory: test.parquet (dataset.py:396) -2026-06-09 14:56:43,571 | INFO |Read the entire file into memory: neighbors.parquet (dataset.py:396) -2026-06-09 14:56:43,620 | INFO 
|Start performance case (task_runner.py:194) -2026-06-09 14:56:44,166 | INFO |Start search 30s in concurrency 16, filters: type= filter_rate=0.0 gt_file_name='neighbors.parquet' (mp_runner.py:129) -2026-06-09 14:56:54,201 | INFO |Syncing all process and start concurrency search, concurrency=16 (mp_runner.py:136) -2026-06-09 14:57:24,290 | INFO |SpawnProcess-1:17 search 30s: actual_dur=30.0277s, count=26591, qps in this process: 885.549 (mp_runner.py:101) -2026-06-09 14:57:24,290 | INFO |SpawnProcess-1:16 search 30s: actual_dur=30.0308s, count=26392, qps in this process: 878.8311 (mp_runner.py:101) -2026-06-09 14:57:24,290 | INFO |SpawnProcess-1:15 search 30s: actual_dur=30.0376s, count=26429, qps in this process: 879.8639 (mp_runner.py:101) -2026-06-09 14:57:24,290 | INFO |SpawnProcess-1:13 search 30s: actual_dur=30.0259s, count=26041, qps in this process: 867.2846 (mp_runner.py:101) -2026-06-09 14:57:24,290 | INFO |SpawnProcess-1:10 search 30s: actual_dur=30.0327s, count=26277, qps in this process: 874.9463 (mp_runner.py:101) -2026-06-09 14:57:24,290 | INFO |SpawnProcess-1:12 search 30s: actual_dur=30.0355s, count=26648, qps in this process: 887.2168 (mp_runner.py:101) -2026-06-09 14:57:24,290 | INFO |SpawnProcess-1:7 search 30s: actual_dur=30.0227s, count=26570, qps in this process: 884.997 (mp_runner.py:101) -2026-06-09 14:57:24,291 | INFO |SpawnProcess-1:11 search 30s: actual_dur=30.0203s, count=26348, qps in this process: 877.6728 (mp_runner.py:101) -2026-06-09 14:57:24,291 | INFO |SpawnProcess-1:5 search 30s: actual_dur=30.0397s, count=26510, qps in this process: 882.4988 (mp_runner.py:101) -2026-06-09 14:57:24,291 | INFO |SpawnProcess-1:3 search 30s: actual_dur=30.0361s, count=26048, qps in this process: 867.2231 (mp_runner.py:101) -2026-06-09 14:57:24,291 | INFO |SpawnProcess-1:4 search 30s: actual_dur=30.0223s, count=26209, qps in this process: 872.9844 (mp_runner.py:101) -2026-06-09 14:57:24,291 | INFO |SpawnProcess-1:6 search 30s: actual_dur=30.0202s, 
count=25908, qps in this process: 863.0189 (mp_runner.py:101) -2026-06-09 14:57:24,291 | INFO |SpawnProcess-1:9 search 30s: actual_dur=30.0184s, count=25627, qps in this process: 853.7097 (mp_runner.py:101) -2026-06-09 14:57:24,293 | INFO |SpawnProcess-1:14 search 30s: actual_dur=30.0199s, count=26950, qps in this process: 897.7378 (mp_runner.py:101) -2026-06-09 14:57:24,293 | INFO |SpawnProcess-1:2 search 30s: actual_dur=30.0197s, count=26408, qps in this process: 879.689 (mp_runner.py:101) -2026-06-09 14:57:24,302 | INFO |SpawnProcess-1:8 search 30s: actual_dur=30.0233s, count=27081, qps in this process: 901.9994 (mp_runner.py:101) -2026-06-09 14:57:24,355 | INFO |End search in concurrency 16: dur=30.154128178954124s, total_count=422037, qps=13995.9941 (mp_runner.py:152) -2026-06-09 14:57:24,752 | INFO |Update largest qps with concurrency 16: current max_qps=13995.9941 (mp_runner.py:156) -2026-06-09 14:57:24,755 | INFO |SpawnProcess-1 start serial search (serial_runner.py:217) -2026-06-09 14:57:25,398 | INFO |SpawnProcess-1:18 start search the entire test_data to get recall and latency (serial_runner.py:158) -2026-06-09 14:57:26,433 | INFO |SpawnProcess-1:18 search entire test_data: cost=0.7526s, queries=1000, avg_recall=0.9285, avg_ndcg=0.9405, avg_latency=0.0008, p99=0.0018, p95=0.0008 (serial_runner.py:198) -2026-06-09 14:57:26,557 | INFO |Performance case got result: Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=13995.9941, serial_latency_p99=np.float64(0.0018), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9285), ndcg=np.float64(0.9405), conc_num_list=[16], conc_qps_list=[13995.9941], conc_latency_p99_list=[np.float64(0.0022443212196230893)], conc_latency_p95_list=[np.float64(0.0012407672591507434)], conc_latency_avg_list=[np.float64(0.0011347746009090012)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], 
st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]) (task_runner.py:232) -2026-06-09 14:57:26,557 | INFO |[1/1] finish case: {'label': , 'name': 'Search Performance Test (1M Dataset, 768 Dim)', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, result=Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=13995.9941, serial_latency_p99=np.float64(0.0018), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9285), ndcg=np.float64(0.9405), conc_num_list=[16], conc_qps_list=[13995.9941], conc_latency_p99_list=[np.float64(0.0022443212196230893)], conc_latency_p95_list=[np.float64(0.0012407672591507434)], conc_latency_avg_list=[np.float64(0.0011347746009090012)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]), label=ResultLabel.NORMAL (interface.py:180) -2026-06-09 14:57:26,557 | INFO |Task summary: run_id=d15c6, task_label=d15c6088018a44188967789891ac1acf (models.py:478) -2026-06-09 14:57:26,557 | INFO |DB | db_label case label | load_dur qps latency(p99) latency(p95) recall max_load_count | label (models.py:478) -2026-06-09 14:57:26,557 | INFO |---- | -------------- --------------------------------------------- -------------------------------- | ----------- ------------- --------------- --------------- ------------- -------------- | ----- (models.py:478) -2026-06-09 14:57:26,557 | INFO |Zvec | 16c64g-v0.1 Search Performance Test (1M Dataset, 
768 Dim) d15c6088018a44188967789891ac1acf | 0.0 13995.9941 0.0018 0.0008 0.9285 0 | :) (models.py:478) -2026-06-09 14:57:26,558 | INFO |write results to disk /root/code/VectorDBBench/vectordb_bench/results/Zvec/result_20260609_d15c6088018a44188967789891ac1acf_zvec.json (models.py:315) -2026-06-09 14:57:26,558 | INFO |Success to finish task: label=d15c6088018a44188967789891ac1acf, run_id=d15c6088018a44188967789891ac1acf (interface.py:219) -2026-06-09 14:59:33,023 | INFO |Task: -TaskConfig(db=, db_config=ZvecConfig(db_label='16c64g-v0.1', version='', note='', path='/root/code/VectorDBBench/db/cohere-1m-cos2l2'), db_case_config=ZvecHNSWIndexConfig(metric_type=None, M=15, ef_construction=500, ef_search=180, quantize_type='int8', is_using_refiner=False, enable_rotate=False), case_config=CaseConfig(case_id=, custom_case={'name': 'Cohere1M-cos2l2', 'description': 'This is a customized dataset.', 'load_timeout': 36000, 'optimize_timeout': 36000, 'dataset_config': {'name': 'Cohere1M-cos2l2', 'dir': '/root/code/VectorDBBench/datasets/cohere-1m-cos2l2', 'size': '1000000', 'dim': '768', 'metric_type': 'L2', 'file_count': '1', 'use_shuffled': False, 'with_gt': True}}, k=100, concurrency_search_config=ConcurrencySearchConfig(num_concurrency=[16], concurrency_duration=30, concurrency_timeout=3600)), stages=['search_serial', 'search_concurrent'], load_concurrency=0) - (cli.py:659) -2026-06-09 14:59:33,023 | INFO |generated uuid for the tasks: 4cc4609939544e4ba8d162ec00835a51 (interface.py:73) -2026-06-09 14:59:33,051 | INFO | DB | CaseType Dataset Filter | task_label (task_runner.py:411) -2026-06-09 14:59:33,051 | INFO | ----------- | ------------ -------------------- ------- | ------- (task_runner.py:411) -2026-06-09 14:59:33,051 | INFO | Zvec-16c64g-v0.1 | Performance Cohere1M-cos2l2-Custom-1M 0.0 | 4cc4609939544e4ba8d162ec00835a51 (task_runner.py:411) -2026-06-09 14:59:33,051 | INFO |task submitted: id=4cc4609939544e4ba8d162ec00835a51, 4cc4609939544e4ba8d162ec00835a51, case 
number: 1 (interface.py:248) -2026-06-09 14:59:33,600 | INFO |[1/1] start case: {'label': , 'name': 'Cohere1M-cos2l2', 'dataset': {'data': {'name': 'Cohere1M-cos2l2', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, drop_old=False (interface.py:178) -2026-06-09 14:59:33,600 | INFO |Starting run (task_runner.py:149) -2026-06-09 14:59:33,627 | INFO |Search config: {} (zvec.py:58) -2026-06-09 14:59:33,820 | INFO |Read the entire file into memory: test.parquet (dataset.py:396) -2026-06-09 14:59:33,857 | INFO |Read the entire file into memory: neighbors.parquet (dataset.py:396) -2026-06-09 14:59:33,904 | INFO |Start performance case (task_runner.py:194) -2026-06-09 14:59:34,452 | INFO |Start search 30s in concurrency 16, filters: type= filter_rate=0.0 gt_file_name='neighbors.parquet' (mp_runner.py:129) -2026-06-09 14:59:44,486 | INFO |Syncing all process and start concurrency search, concurrency=16 (mp_runner.py:136) -2026-06-09 15:00:14,576 | INFO |SpawnProcess-1:15 search 30s: actual_dur=30.0329s, count=25112, qps in this process: 836.1497 (mp_runner.py:101) -2026-06-09 15:00:14,576 | INFO |SpawnProcess-1:8 search 30s: actual_dur=30.0366s, count=25472, qps in this process: 848.0321 (mp_runner.py:101) -2026-06-09 15:00:14,576 | INFO |SpawnProcess-1:9 search 30s: actual_dur=30.0269s, count=25685, qps in this process: 855.3997 (mp_runner.py:101) -2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:5 search 30s: actual_dur=30.0257s, count=24790, qps in this process: 825.626 (mp_runner.py:101) -2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:10 search 30s: actual_dur=30.0275s, count=24552, qps in this process: 817.6505 (mp_runner.py:101) -2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:16 search 30s: actual_dur=30.0326s, count=25096, qps in this process: 835.6253 (mp_runner.py:101) -2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:12 search 30s: actual_dur=30.0429s, count=25224, qps in this process: 839.5994 (mp_runner.py:101) -2026-06-09 15:00:14,577 | 
INFO |SpawnProcess-1:13 search 30s: actual_dur=30.0397s, count=25566, qps in this process: 851.0737 (mp_runner.py:101) -2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:6 search 30s: actual_dur=30.0284s, count=25365, qps in this process: 844.7004 (mp_runner.py:101) -2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:17 search 30s: actual_dur=30.0343s, count=25512, qps in this process: 849.4288 (mp_runner.py:101) -2026-06-09 15:00:14,578 | INFO |SpawnProcess-1:3 search 30s: actual_dur=30.0363s, count=25159, qps in this process: 837.6198 (mp_runner.py:101) -2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:2 search 30s: actual_dur=30.038s, count=25444, qps in this process: 847.0604 (mp_runner.py:101) -2026-06-09 15:00:14,578 | INFO |SpawnProcess-1:11 search 30s: actual_dur=30.0297s, count=24896, qps in this process: 829.0459 (mp_runner.py:101) -2026-06-09 15:00:14,578 | INFO |SpawnProcess-1:7 search 30s: actual_dur=30.0366s, count=25161, qps in this process: 837.678 (mp_runner.py:101) -2026-06-09 15:00:14,578 | INFO |SpawnProcess-1:14 search 30s: actual_dur=30.0442s, count=25354, qps in this process: 843.89 (mp_runner.py:101) -2026-06-09 15:00:14,577 | INFO |SpawnProcess-1:4 search 30s: actual_dur=30.0216s, count=25171, qps in this process: 838.4297 (mp_runner.py:101) -2026-06-09 15:00:14,639 | INFO |End search in concurrency 16: dur=30.152784225996584s, total_count=403559, qps=13383.8055 (mp_runner.py:152) -2026-06-09 15:00:14,988 | INFO |Update largest qps with concurrency 16: current max_qps=13383.8055 (mp_runner.py:156) -2026-06-09 15:00:14,991 | INFO |SpawnProcess-1 start serial search (serial_runner.py:217) -2026-06-09 15:00:15,620 | INFO |SpawnProcess-1:18 start search the entire test_data to get recall and latency (serial_runner.py:158) -2026-06-09 15:00:16,653 | INFO |SpawnProcess-1:18 search entire test_data: cost=0.7505s, queries=1000, avg_recall=0.9428, avg_ndcg=0.9512, avg_latency=0.0008, p99=0.0016, p95=0.0008 (serial_runner.py:198) -2026-06-09 15:00:16,772 | 
INFO |Performance case got result: Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=13383.8055, serial_latency_p99=np.float64(0.0016), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9428), ndcg=np.float64(0.9512), conc_num_list=[16], conc_qps_list=[13383.8055], conc_latency_p99_list=[np.float64(0.002614401644095773)], conc_latency_p95_list=[np.float64(0.0012562056072056293)], conc_latency_avg_list=[np.float64(0.001186844353695998)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]) (task_runner.py:232) -2026-06-09 15:00:16,773 | INFO |[1/1] finish case: {'label': , 'name': 'Cohere1M-cos2l2', 'dataset': {'data': {'name': 'Cohere1M-cos2l2', 'size': 1000000, 'dim': 768, 'metric_type': }}, 'db': 'Zvec-16c64g-v0.1'}, result=Metric(max_load_count=0, insert_duration=0.0, optimize_duration=0.0, load_duration=0.0, qps=13383.8055, serial_latency_p99=np.float64(0.0016), serial_latency_p95=np.float64(0.0008), recall=np.float64(0.9428), ndcg=np.float64(0.9512), conc_num_list=[16], conc_qps_list=[13383.8055], conc_latency_p99_list=[np.float64(0.002614401644095773)], conc_latency_p95_list=[np.float64(0.0012562056072056293)], conc_latency_avg_list=[np.float64(0.001186844353695998)], st_ideal_insert_duration=0, st_search_stage_list=[], st_search_time_list=[], st_max_qps_list_list=[], st_recall_list=[], st_ndcg_list=[], st_serial_latency_p99_list=[], st_serial_latency_p95_list=[], st_conc_failed_rate_list=[], st_conc_num_list_list=[], st_conc_qps_list_list=[], st_conc_latency_p99_list_list=[], st_conc_latency_p95_list_list=[], st_conc_latency_avg_list_list=[]), label=ResultLabel.NORMAL 
(interface.py:180) -2026-06-09 15:00:16,773 | INFO |Task summary: run_id=4cc46, task_label=4cc4609939544e4ba8d162ec00835a51 (models.py:478) -2026-06-09 15:00:16,773 | INFO |DB | db_label case label | load_dur qps latency(p99) latency(p95) recall max_load_count | label (models.py:478) -2026-06-09 15:00:16,773 | INFO |---- | -------------- --------------- -------------------------------- | ----------- ------------- --------------- --------------- ------------- -------------- | ----- (models.py:478) -2026-06-09 15:00:16,773 | INFO |Zvec | 16c64g-v0.1 Cohere1M-cos2l2 4cc4609939544e4ba8d162ec00835a51 | 0.0 13383.8055 0.0016 0.0008 0.9428 0 | :) (models.py:478) -2026-06-09 15:00:16,773 | INFO |write results to disk /root/code/VectorDBBench/vectordb_bench/results/Zvec/result_20260609_4cc4609939544e4ba8d162ec00835a51_zvec.json (models.py:315) -2026-06-09 15:00:16,774 | INFO |Success to finish task: label=4cc4609939544e4ba8d162ec00835a51, run_id=4cc4609939544e4ba8d162ec00835a51 (interface.py:219) From 95490cf9c538bae5e1f79fd35873284464892467 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 9 Jun 2026 20:35:38 +0800 Subject: [PATCH 14/38] add int8 rotate --- src/binding/python/model/param/python_param.cc | 2 +- src/core/CMakeLists.txt | 4 ++-- src/core/quantizer/CMakeLists.txt | 4 +--- src/core/quantizer/cosine_converter.cc | 2 +- src/core/quantizer/cosine_reformer.cc | 2 +- src/core/quantizer/integer_quantizer_converter.cc | 2 +- src/core/quantizer/integer_quantizer_reformer.cc | 2 +- src/core/quantizer/{record_rotater.cc => record_rotator.cc} | 2 +- src/core/quantizer/{record_rotater.h => record_rotator.h} | 0 9 files changed, 9 insertions(+), 11 deletions(-) rename src/core/quantizer/{record_rotater.cc => record_rotator.cc} (99%) rename src/core/quantizer/{record_rotater.h => record_rotator.h} (100%) diff --git a/src/binding/python/model/param/python_param.cc b/src/binding/python/model/param/python_param.cc index 8f0fd763a..84201c300 100644 --- 
a/src/binding/python/model/param/python_param.cc +++ b/src/binding/python/model/param/python_param.cc @@ -421,7 +421,7 @@ encapsulates its construction hyperparameters. {'metric_type': 'IP', 'm': 16, 'ef_construction': 200, 'quantize_type': 'INT8', 'use_contiguous_memory': True} )pbdoc"); hnsw_params - .def(py::init(), + .def(py::init(), // Added a new parameter; refactored to QuantizerParam in future py::arg("metric_type") = MetricType::IP, py::arg("m") = core_interface::kDefaultHnswNeighborCnt, py::arg("ef_construction") = diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index e0b9870ad..282c2eb56 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -13,10 +13,10 @@ if(RABITQ_SUPPORTED AND AUTO_DETECT_ARCH) set(HNSW_RABITQ_FILES_FULL ${HNSW_RABITQ_FILES}) list(TRANSFORM HNSW_RABITQ_FILES_FULL PREPEND "algorithm/hnsw_rabitq/") - # record_rotater.cc includes rabitqlib's rotator.hpp which uses AVX2 + # record_rotator.cc includes rabitqlib's rotator.hpp which uses AVX2 # intrinsics in flip_sign() and kacs_walk(), so it also needs the # RABITQ_ARCH_FLAG at compile time. - list(APPEND HNSW_RABITQ_FILES_FULL "quantizer/record_rotater.cc") + list(APPEND HNSW_RABITQ_FILES_FULL "quantizer/record_rotator.cc") foreach(FILE ${HNSW_RABITQ_FILES_FULL}) set_source_files_properties( diff --git a/src/core/quantizer/CMakeLists.txt b/src/core/quantizer/CMakeLists.txt index 0d9569551..37c6c2c87 100644 --- a/src/core/quantizer/CMakeLists.txt +++ b/src/core/quantizer/CMakeLists.txt @@ -6,11 +6,9 @@ if(NOT APPLE) "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") endif() -# record_rotater.cc includes rabitqlib's rotator.hpp which uses AVX2 -# intrinsics (flip_sign, kacs_walk), so it needs the AVX2 compile flag. 
if(RABITQ_SUPPORTED AND RABITQ_ARCH_FLAG) set_source_files_properties( - record_rotater.cc + record_rotator.cc PROPERTIES COMPILE_FLAGS "${RABITQ_ARCH_FLAG}" ) diff --git a/src/core/quantizer/cosine_converter.cc b/src/core/quantizer/cosine_converter.cc index 251f61684..41e5000fc 100644 --- a/src/core/quantizer/cosine_converter.cc +++ b/src/core/quantizer/cosine_converter.cc @@ -19,7 +19,7 @@ #include #include #include "record_quantizer.h" -#include "record_rotater.h" +#include "record_rotator.h" #include "../metric/metric_params.h" namespace zvec { diff --git a/src/core/quantizer/cosine_reformer.cc b/src/core/quantizer/cosine_reformer.cc index 50d4a3a80..ea67282a5 100644 --- a/src/core/quantizer/cosine_reformer.cc +++ b/src/core/quantizer/cosine_reformer.cc @@ -18,7 +18,7 @@ #include #include #include "record_quantizer.h" -#include "record_rotater.h" +#include "record_rotator.h" namespace zvec { namespace core { diff --git a/src/core/quantizer/integer_quantizer_converter.cc b/src/core/quantizer/integer_quantizer_converter.cc index 4643aea68..2c7b00ca4 100644 --- a/src/core/quantizer/integer_quantizer_converter.cc +++ b/src/core/quantizer/integer_quantizer_converter.cc @@ -19,7 +19,7 @@ #include #include #include "record_quantizer.h" -#include "record_rotater.h" +#include "record_rotator.h" #include "../metric/metric_params.h" namespace zvec { diff --git a/src/core/quantizer/integer_quantizer_reformer.cc b/src/core/quantizer/integer_quantizer_reformer.cc index 40b4de989..88b285ef8 100644 --- a/src/core/quantizer/integer_quantizer_reformer.cc +++ b/src/core/quantizer/integer_quantizer_reformer.cc @@ -19,7 +19,7 @@ #include #include #include "record_quantizer.h" -#include "record_rotater.h" +#include "record_rotator.h" namespace zvec { namespace core { diff --git a/src/core/quantizer/record_rotater.cc b/src/core/quantizer/record_rotator.cc similarity index 99% rename from src/core/quantizer/record_rotater.cc rename to src/core/quantizer/record_rotator.cc index 
e9c12fb78..8d553b879 100644 --- a/src/core/quantizer/record_rotater.cc +++ b/src/core/quantizer/record_rotator.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "record_rotater.h" +#include "record_rotator.h" #include #include #include diff --git a/src/core/quantizer/record_rotater.h b/src/core/quantizer/record_rotator.h similarity index 100% rename from src/core/quantizer/record_rotater.h rename to src/core/quantizer/record_rotator.h From 4d3a4b3f371a344964cf8e846f3f2f47b5ba8954 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Wed, 10 Jun 2026 15:25:11 +0800 Subject: [PATCH 15/38] common param --- .../python/model/param/python_param.cc | 91 +++++++++++++----- src/core/interface/index.cc | 2 +- src/core/interface/index_param.cc | 4 + .../column/vector_column/engine_helper.hpp | 3 +- src/db/index/common/proto_converter.cc | 24 ++++- src/db/proto/zvec.proto | 10 +- src/include/zvec/core/interface/index_param.h | 7 +- .../core/interface/index_param_builders.h | 2 +- src/include/zvec/db/index_params.h | 94 ++++++++++++------- 9 files changed, 161 insertions(+), 76 deletions(-) diff --git a/src/binding/python/model/param/python_param.cc b/src/binding/python/model/param/python_param.cc index 8d30d0619..4d91b986a 100644 --- a/src/binding/python/model/param/python_param.cc +++ b/src/binding/python/model/param/python_param.cc @@ -363,6 +363,14 @@ Encapsulates common settings for all vector index types. return self.quantize_type(); }, "QuantizeType: Vector quantization type (e.g., FP16, INT8).") + .def_property_readonly( + "enable_rotate", + [](const VectorIndexParams &self) -> bool { + return self.enable_rotate(); + }, + "bool: Whether to apply random rotation before INT8 quantization " + "to reduce quantization error. Only effective with " + "quantize_type=INT8. 
Defaults to False.") .def( "to_dict", [](const VectorIndexParams &self) -> py::dict { @@ -371,6 +379,7 @@ Encapsulates common settings for all vector index types. dict["metric_type"] = metric_type_to_string(self.metric_type()); dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); + dict["enable_rotate"] = self.enable_rotate(); return dict; }, "Convert to dictionary with all fields") @@ -440,11 +449,6 @@ encapsulates its construction hyperparameters. "bool: Whether to allocate a single contiguous memory arena for " "all HNSW graph nodes. Improves cache locality and search " "throughput at the cost of peak memory usage. Defaults to False.") - .def_property_readonly( - "enable_rotate", &HnswIndexParams::enable_rotate, - "bool: Whether to apply random rotation before INT8 quantization " - "to reduce quantization error. Only effective with " - "quantize_type=INT8. Defaults to False.") .def( "to_dict", [](const HnswIndexParams &self) -> py::dict { @@ -639,7 +643,7 @@ its construction hyperparameters. )pbdoc"); vamana_params .def(py::init(), + QuantizeType, bool>(), py::arg("metric_type") = MetricType::IP, py::arg("max_degree") = core_interface::kDefaultVamanaMaxDegree, py::arg("search_list_size") = @@ -649,7 +653,8 @@ its construction hyperparameters. core_interface::kDefaultVamanaSaturateGraph, py::arg("use_contiguous_memory") = false, py::arg("use_id_map") = false, - py::arg("quantize_type") = QuantizeType::UNDEFINED) + py::arg("quantize_type") = QuantizeType::UNDEFINED, + py::arg("enable_rotate") = false) .def_property_readonly( "max_degree", &VamanaIndexParams::max_degree, "int: Maximum out-degree (R) of every node in the Vamana graph.") @@ -685,6 +690,7 @@ its construction hyperparameters. 
dict["use_id_map"] = self.use_id_map(); dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); + dict["enable_rotate"] = self.enable_rotate(); return dict; }, "Convert to dictionary with all fields") @@ -707,7 +713,9 @@ its construction hyperparameters. ", \"use_id_map\":" + std::string(self.use_id_map() ? "true" : "false") + ", \"quantize_type\":\"" + - quantize_type_to_string(self.quantize_type()) + "\"}"; + quantize_type_to_string(self.quantize_type()) + + "\", \"enable_rotate\":" + + std::string(self.enable_rotate() ? "true" : "false") + "}"; }) .def(py::pickle( [](const VamanaIndexParams &self) { @@ -715,15 +723,18 @@ its construction hyperparameters. self.search_list_size(), self.alpha(), self.saturate_graph(), self.use_contiguous_memory(), - self.use_id_map(), self.quantize_type()); + self.use_id_map(), self.quantize_type(), + self.enable_rotate()); }, [](py::tuple t) { - if (t.size() != 8) + if (t.size() != 8 && t.size() != 9) throw std::runtime_error("Invalid state for VamanaIndexParams"); + bool enable_rotate = t.size() >= 9 ? t[8].cast() : false; return std::make_shared( t[0].cast(), t[1].cast(), t[2].cast(), t[3].cast(), t[4].cast(), t[5].cast(), - t[6].cast(), t[7].cast()); + t[6].cast(), t[7].cast(), + enable_rotate); })); // FlatIndexParams @@ -753,9 +764,10 @@ suitable for small to medium datasets or as a baseline. {'metric_type': 'L2', 'quantize_type': 'FP16'} )pbdoc"); flat_params - .def(py::init(), + .def(py::init(), py::arg("metric_type") = MetricType::IP, py::arg("quantize_type") = QuantizeType::UNDEFINED, + py::arg("enable_rotate") = false, R"pbdoc( Constructs a FlatIndexParam instance. @@ -763,6 +775,9 @@ Constructs a FlatIndexParam instance. metric_type (MetricType, optional): Distance metric. Defaults to MetricType.IP. quantize_type (QuantizeType, optional): Vector quantization type. Defaults to QuantizeType.UNDEFINED (no quantization). 
+ enable_rotate (bool, optional): Whether to apply random rotation before + INT8 quantization. Only effective with quantize_type=INT8. + Defaults to False. )pbdoc") .def( "to_dict", @@ -771,6 +786,7 @@ Constructs a FlatIndexParam instance. dict["metric_type"] = metric_type_to_string(self.metric_type()); dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); + dict["enable_rotate"] = self.enable_rotate(); return dict; }, "Convert to dictionary with all fields") @@ -780,17 +796,22 @@ Constructs a FlatIndexParam instance. "\"metric_type\":" + metric_type_to_string(self.metric_type()) + ", \"quantize_type\":" + - quantize_type_to_string(self.quantize_type()) + "}"; + quantize_type_to_string(self.quantize_type()) + + ", \"enable_rotate\":" + + (self.enable_rotate() ? "true" : "false") + "}"; }) .def(py::pickle( [](const FlatIndexParams &self) { - return py::make_tuple(self.metric_type(), self.quantize_type()); + return py::make_tuple(self.metric_type(), self.quantize_type(), + self.enable_rotate()); }, [](py::tuple t) { - if (t.size() != 2) + if (t.size() != 2 && t.size() != 3) throw std::runtime_error("Invalid state for FlatIndexParams"); + bool enable_rotate = t.size() >= 3 ? t[2].cast() : false; return std::make_shared(t[0].cast(), - t[1].cast()); + t[1].cast(), + enable_rotate); })); // IVFIndexParams @@ -827,10 +848,11 @@ and accuracy. 100 )pbdoc"); ivf_params - .def(py::init(), + .def(py::init(), py::arg("metric_type") = MetricType::IP, py::arg("n_list") = 10, py::arg("n_iters") = 10, py::arg("use_soar") = false, py::arg("quantize_type") = QuantizeType::UNDEFINED, + py::arg("enable_rotate") = false, R"pbdoc( Constructs an IVFIndexParam instance. @@ -843,6 +865,9 @@ Constructs an IVFIndexParam instance. use_soar (bool, optional): Enable SOAR optimization. Defaults to False. quantize_type (QuantizeType, optional): Vector quantization type. Defaults to QuantizeType.UNDEFINED. 
+ enable_rotate (bool, optional): Whether to apply random rotation before + INT8 quantization. Only effective with quantize_type=INT8. + Defaults to False. )pbdoc") .def_property_readonly("n_list", &IVFIndexParams::n_list, "int: Number of inverted lists.") @@ -862,6 +887,7 @@ Constructs an IVFIndexParam instance. dict["use_soar"] = self.use_soar(); dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); + dict["enable_rotate"] = self.enable_rotate(); return dict; }, "Convert to dictionary with all fields") @@ -874,20 +900,24 @@ Constructs an IVFIndexParam instance. ", \"n_iters\":" + std::to_string(self.n_iters()) + ", \"use_soar\":" + std::to_string(self.use_soar()) + ", \"quantize_type\":" + - quantize_type_to_string(self.quantize_type()) + "}"; + quantize_type_to_string(self.quantize_type()) + + ", \"enable_rotate\":" + + (self.enable_rotate() ? "true" : "false") + "}"; }) .def(py::pickle( [](const IVFIndexParams &self) { return py::make_tuple(self.metric_type(), self.n_list(), self.n_iters(), self.use_soar(), - self.quantize_type()); + self.quantize_type(), self.enable_rotate()); }, [](py::tuple t) { - if (t.size() != 5) + if (t.size() != 5 && t.size() != 6) throw std::runtime_error("Invalid state for IVFIndexParams"); + bool enable_rotate = t.size() >= 6 ? t[5].cast() : false; return std::make_shared( t[0].cast(), t[1].cast(), t[2].cast(), - t[3].cast(), t[4].cast()); + t[3].cast(), t[4].cast(), + enable_rotate); })); // DiskAnnIndexParams @@ -927,10 +957,11 @@ only compressed vector will be loaded into memory. By this way, search memory at 100 )pbdoc"); diskann_params - .def(py::init(), + .def(py::init(), py::arg("metric_type") = MetricType::IP, py::arg("max_degree") = 100, py::arg("list_size") = 50, py::arg("pq_chunk_num") = 0, py::arg("quantize_type") = QuantizeType::UNDEFINED, + py::arg("enable_rotate") = false, R"pbdoc( Constructs an DiskAnnIndexParams instance. @@ -945,6 +976,9 @@ Constructs an DiskAnnIndexParams instance. 
Clamped to [1, 1024]. Defaults to 0. quantize_type (QuantizeType, optional): Vector quantization type. Defaults to QuantizeType.UNDEFINED. + enable_rotate (bool, optional): Whether to apply random rotation before + INT8 quantization. Only effective with quantize_type=INT8. + Defaults to False. )pbdoc") .def_property_readonly("max_degree", &DiskAnnIndexParams::max_degree, "int: max node degree.") @@ -967,6 +1001,7 @@ Constructs an DiskAnnIndexParams instance. dict["pq_chunk_num"] = self.pq_chunk_num(); dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); + dict["enable_rotate"] = self.enable_rotate(); return dict; }, "Convert to dictionary with all fields") @@ -980,20 +1015,24 @@ Constructs an DiskAnnIndexParams instance. ", \"list_size\":" + std::to_string(self.list_size()) + ", \"pq_chunk_num\":" + std::to_string(self.pq_chunk_num()) + ", \"quantize_type\":" + - quantize_type_to_string(self.quantize_type()) + "}"; + quantize_type_to_string(self.quantize_type()) + + ", \"enable_rotate\":" + + (self.enable_rotate() ? "true" : "false") + "}"; }) .def(py::pickle( [](const DiskAnnIndexParams &self) { return py::make_tuple(self.metric_type(), self.max_degree(), self.list_size(), self.pq_chunk_num(), - self.quantize_type()); + self.quantize_type(), self.enable_rotate()); }, [](py::tuple t) { - if (t.size() != 5) + if (t.size() != 5 && t.size() != 6) throw std::runtime_error("Invalid state for DiskAnnIndexParams"); + bool enable_rotate = t.size() >= 6 ? 
t[5].cast() : false; return std::make_shared( t[0].cast(), t[1].cast(), t[2].cast(), - t[3].cast(), t[4].cast()); + t[3].cast(), t[4].cast(), + enable_rotate); })); } diff --git a/src/core/interface/index.cc b/src/core/interface/index.cc index ec8513844..90ab019cf 100644 --- a/src/core/interface/index.cc +++ b/src/core/interface/index.cc @@ -182,7 +182,7 @@ int Index::CreateAndInitConverterReformer(const QuantizerParam ¶m, } // Pass enable_rotate to converter_params (only effective for INT8) - if (index_param.enable_rotate) { + if (param.enable_rotate) { if (param.type == QuantizerType::kInt8) { if (index_param.metric_type == MetricType::kCosine) { converter_params.set("cosine.converter.enable_rotate", true); diff --git a/src/core/interface/index_param.cc b/src/core/interface/index_param.cc index f8371e65a..a3656dc56 100644 --- a/src/core/interface/index_param.cc +++ b/src/core/interface/index_param.cc @@ -247,12 +247,16 @@ ailego::JsonObject QuantizerParam::SerializeToJsonObject( json_obj.set("type", zvec::ailego::JsonValue(magic_enum::enum_name(type).data())); } + if (!omit_empty_value || enable_rotate) { + json_obj.set("enable_rotate", ailego::JsonValue(enable_rotate)); + } return json_obj; } bool QuantizerParam::DeserializeFromJsonObject( const ailego::JsonObject &json_obj) { DESERIALIZE_ENUM_FIELD(json_obj, type, QuantizerType); + DESERIALIZE_VALUE_FIELD(json_obj, enable_rotate); return true; } diff --git a/src/db/index/column/vector_column/engine_helper.hpp b/src/db/index/column/vector_column/engine_helper.hpp index 6843b0bf9..0eefef1b2 100644 --- a/src/db/index/column/vector_column/engine_helper.hpp +++ b/src/db/index/column/vector_column/engine_helper.hpp @@ -336,6 +336,7 @@ class ProximaEngineHelper { return tl::make_unexpected( Status::InvalidArgument("unsupported quantize type")); } + index_param_builder->WithEnableRotate(db_index_params->enable_rotate()); return index_param_builder; } @@ -380,8 +381,6 @@ class ProximaEngineHelper { 
db_index_params->ef_construction()); index_param_builder->WithUseContiguousMemory( db_index_params->use_contiguous_memory()); - index_param_builder->WithEnableRotate( - db_index_params->enable_rotate()); return index_param_builder->Build(); } diff --git a/src/db/index/common/proto_converter.cc b/src/db/index/common/proto_converter.cc index f39d005ab..10b43dbcc 100644 --- a/src/db/index/common/proto_converter.cc +++ b/src/db/index/common/proto_converter.cc @@ -18,12 +18,16 @@ namespace zvec { HnswIndexParams::OPtr ProtoConverter::FromPb( const proto::HnswIndexParams ¶ms_pb) { + // OR merge: support both base.enable_rotate (new) and hnsw.enable_rotate + // (deprecated, for backward compat with old serialized data) + bool enable_rotate = + params_pb.base().enable_rotate() || params_pb.enable_rotate(); auto params = std::make_shared( MetricTypeCodeBook::Get(params_pb.base().metric_type()), params_pb.m(), params_pb.ef_construction(), QuantizeTypeCodeBook::Get(params_pb.base().quantize_type()), params_pb.use_contiguous_memory(), - params_pb.enable_rotate()); + enable_rotate); return params; } @@ -34,9 +38,11 @@ proto::HnswIndexParams ProtoConverter::ToPb(const HnswIndexParams *params) { MetricTypeCodeBook::Get(params->metric_type())); params_pb.mutable_base()->set_quantize_type( QuantizeTypeCodeBook::Get(params->quantize_type())); + params_pb.mutable_base()->set_enable_rotate(params->enable_rotate()); params_pb.set_ef_construction(params->ef_construction()); params_pb.set_m(params->m()); params_pb.set_use_contiguous_memory(params->use_contiguous_memory()); + // Also write to deprecated field for backward compat with old readers params_pb.set_enable_rotate(params->enable_rotate()); return params_pb; } @@ -72,7 +78,8 @@ FlatIndexParams::OPtr ProtoConverter::FromPb( const proto::FlatIndexParams ¶ms_pb) { return std::make_shared( MetricTypeCodeBook::Get(params_pb.base().metric_type()), - QuantizeTypeCodeBook::Get(params_pb.base().quantize_type())); + 
QuantizeTypeCodeBook::Get(params_pb.base().quantize_type()), + params_pb.base().enable_rotate()); } proto::FlatIndexParams ProtoConverter::ToPb(const FlatIndexParams *params) { @@ -81,6 +88,7 @@ proto::FlatIndexParams ProtoConverter::ToPb(const FlatIndexParams *params) { MetricTypeCodeBook::Get(params->metric_type())); params_pb.mutable_base()->set_quantize_type( QuantizeTypeCodeBook::Get(params->quantize_type())); + params_pb.mutable_base()->set_enable_rotate(params->enable_rotate()); return params_pb; } @@ -90,7 +98,8 @@ IVFIndexParams::OPtr ProtoConverter::FromPb( return std::make_shared( MetricTypeCodeBook::Get(params_pb.base().metric_type()), params_pb.n_list(), params_pb.n_iters(), params_pb.use_soar(), - QuantizeTypeCodeBook::Get(params_pb.base().quantize_type())); + QuantizeTypeCodeBook::Get(params_pb.base().quantize_type()), + params_pb.base().enable_rotate()); } proto::IVFIndexParams ProtoConverter::ToPb(const IVFIndexParams *params) { @@ -99,6 +108,7 @@ proto::IVFIndexParams ProtoConverter::ToPb(const IVFIndexParams *params) { MetricTypeCodeBook::Get(params->metric_type())); params_pb.mutable_base()->set_quantize_type( QuantizeTypeCodeBook::Get(params->quantize_type())); + params_pb.mutable_base()->set_enable_rotate(params->enable_rotate()); params_pb.set_n_list(params->n_list()); params_pb.set_n_iters(params->n_iters()); params_pb.set_use_soar(params->use_soar()); @@ -113,7 +123,8 @@ VamanaIndexParams::OPtr ProtoConverter::FromPb( params_pb.max_degree(), params_pb.search_list_size(), params_pb.alpha(), params_pb.saturate_graph(), params_pb.use_contiguous_memory(), params_pb.use_id_map(), - QuantizeTypeCodeBook::Get(params_pb.base().quantize_type())); + QuantizeTypeCodeBook::Get(params_pb.base().quantize_type()), + params_pb.base().enable_rotate()); } proto::VamanaIndexParams ProtoConverter::ToPb(const VamanaIndexParams *params) { @@ -122,6 +133,7 @@ proto::VamanaIndexParams ProtoConverter::ToPb(const VamanaIndexParams *params) { 
MetricTypeCodeBook::Get(params->metric_type())); params_pb.mutable_base()->set_quantize_type( QuantizeTypeCodeBook::Get(params->quantize_type())); + params_pb.mutable_base()->set_enable_rotate(params->enable_rotate()); params_pb.set_max_degree(params->max_degree()); params_pb.set_search_list_size(params->search_list_size()); params_pb.set_alpha(params->alpha()); @@ -152,7 +164,8 @@ DiskAnnIndexParams::OPtr ProtoConverter::FromPb( return std::make_shared( MetricTypeCodeBook::Get(params_pb.base().metric_type()), params_pb.max_degree(), params_pb.list_size(), params_pb.pq_chunk_num(), - QuantizeTypeCodeBook::Get(params_pb.base().quantize_type())); + QuantizeTypeCodeBook::Get(params_pb.base().quantize_type()), + params_pb.base().enable_rotate()); } proto::DiskAnnIndexParams ProtoConverter::ToPb( @@ -162,6 +175,7 @@ proto::DiskAnnIndexParams ProtoConverter::ToPb( MetricTypeCodeBook::Get(params->metric_type())); params_pb.mutable_base()->set_quantize_type( QuantizeTypeCodeBook::Get(params->quantize_type())); + params_pb.mutable_base()->set_enable_rotate(params->enable_rotate()); params_pb.set_max_degree(params->max_degree()); params_pb.set_list_size(params->list_size()); params_pb.set_pq_chunk_num(params->pq_chunk_num()); diff --git a/src/db/proto/zvec.proto b/src/db/proto/zvec.proto index dc423372c..a3df7bd93 100644 --- a/src/db/proto/zvec.proto +++ b/src/db/proto/zvec.proto @@ -90,6 +90,10 @@ message InvertIndexParams { message BaseIndexParams { MetricType metric_type = 1; QuantizeType quantize_type = 2; + // When enabled, vectors are rotated before INT8 quantization to reduce + // quantization error. Only effective with quantize_type=INT8. + // Shared by all vector index types. + bool enable_rotate = 3; }; message HnswIndexParams { @@ -100,9 +104,9 @@ message HnswIndexParams { // arena for all graph nodes, which improves cache locality / search // throughput at the cost of peak memory usage. Defaults to false. 
bool use_contiguous_memory = 4; - // When enabled, vectors are rotated before INT8 quantization to reduce - // quantization error. Only effective with quantize_type=INT8. - bool enable_rotate = 5; + // Deprecated: use BaseIndexParams.enable_rotate instead. + // Kept for backward compatibility with old serialized data. + bool enable_rotate = 5 [deprecated = true]; } message HnswRabitqIndexParams { diff --git a/src/include/zvec/core/interface/index_param.h b/src/include/zvec/core/interface/index_param.h index ac15f55fb..7bdc2cef1 100644 --- a/src/include/zvec/core/interface/index_param.h +++ b/src/include/zvec/core/interface/index_param.h @@ -116,12 +116,14 @@ struct QuantizerParam : public SerializableBase { QuantizerType type = QuantizerType::kNone; int num_subquantizers = 8; // M int num_bits = 8; // bits per subquantizer + bool enable_rotate = false; // rotate vectors before quantization to reduce error // Constructors // QuantizerParam() = default; QuantizerParam(QuantizerType t = QuantizerType::kNone, int subquantizers = 8, - int bits = 8) - : type(t), num_subquantizers(subquantizers), num_bits(bits) {} + int bits = 8, bool rotate = false) + : type(t), num_subquantizers(subquantizers), num_bits(bits), + enable_rotate(rotate) {} protected: @@ -245,7 +247,6 @@ class BaseIndexParam : public SerializableBase { bool is_huge_page = false; DataType data_type = DataType::DT_UNDEFINED; bool use_id_map = true; - bool enable_rotate = false; // IndexMeta meta; ailego::Params params; diff --git a/src/include/zvec/core/interface/index_param_builders.h b/src/include/zvec/core/interface/index_param_builders.h index e43408ee8..a0d49808a 100644 --- a/src/include/zvec/core/interface/index_param_builders.h +++ b/src/include/zvec/core/interface/index_param_builders.h @@ -89,7 +89,7 @@ class BaseIndexParamBuilder { // : public } ActualIndexParamBuilderType &WithEnableRotate(bool enable_rotate) { - param->enable_rotate = enable_rotate; + param->quantizer_param.enable_rotate = 
enable_rotate; return static_cast(*this); } diff --git a/src/include/zvec/db/index_params.h b/src/include/zvec/db/index_params.h index b55efeb8e..e2bc5966a 100644 --- a/src/include/zvec/db/index_params.h +++ b/src/include/zvec/db/index_params.h @@ -124,10 +124,12 @@ class InvertIndexParams : public IndexParams { class VectorIndexParams : public IndexParams { public: VectorIndexParams(IndexType type, MetricType metric_type, - QuantizeType quantize_type = QuantizeType::UNDEFINED) + QuantizeType quantize_type = QuantizeType::UNDEFINED, + bool enable_rotate = false) : IndexParams(type), metric_type_(metric_type), - quantize_type_(quantize_type) {} + quantize_type_(quantize_type), + enable_rotate_(enable_rotate) {} ~VectorIndexParams() override = default; @@ -151,9 +153,20 @@ class VectorIndexParams : public IndexParams { quantize_type_ = quantize_type; } + bool enable_rotate() const { + return enable_rotate_; + } + + void set_enable_rotate(bool enable_rotate) { + enable_rotate_ = enable_rotate; + } + protected: MetricType metric_type_; QuantizeType quantize_type_; + // When enabled, vectors are rotated before INT8 quantization to reduce + // quantization error. Only effective with quantize_type=INT8. 
+ bool enable_rotate_{false}; }; /* @@ -167,11 +180,11 @@ class HnswIndexParams : public VectorIndexParams { QuantizeType quantize_type = QuantizeType::UNDEFINED, bool use_contiguous_memory = false, bool enable_rotate = false) - : VectorIndexParams(IndexType::HNSW, metric_type, quantize_type), + : VectorIndexParams(IndexType::HNSW, metric_type, quantize_type, + enable_rotate), m_(m), ef_construction_(ef_construction), - use_contiguous_memory_(use_contiguous_memory), - enable_rotate_(enable_rotate) {} + use_contiguous_memory_(use_contiguous_memory) {} using OPtr = std::shared_ptr; @@ -230,13 +243,6 @@ class HnswIndexParams : public VectorIndexParams { return use_contiguous_memory_; } - void set_enable_rotate(bool enable_rotate) { - enable_rotate_ = enable_rotate; - } - bool enable_rotate() const { - return enable_rotate_; - } - protected: int m_; int ef_construction_; @@ -245,9 +251,6 @@ class HnswIndexParams : public VectorIndexParams { // the cost of peak memory usage. Defaults to false for backward // compatibility. bool use_contiguous_memory_{false}; - // When enabled, vectors are rotated before INT8 quantization to reduce - // quantization error. Only effective with quantize_type=INT8. 
- bool enable_rotate_{false}; }; class HnswRabitqIndexParams : public VectorIndexParams { @@ -365,21 +368,25 @@ class HnswRabitqIndexParams : public VectorIndexParams { class FlatIndexParams : public VectorIndexParams { public: FlatIndexParams(MetricType metric_type, - QuantizeType quantize_type = QuantizeType::UNDEFINED) - : VectorIndexParams(IndexType::FLAT, metric_type, quantize_type) {} + QuantizeType quantize_type = QuantizeType::UNDEFINED, + bool enable_rotate = false) + : VectorIndexParams(IndexType::FLAT, metric_type, quantize_type, + enable_rotate) {} using OPtr = std::shared_ptr; public: Ptr clone() const override { - return std::make_shared(metric_type_, quantize_type_); + return std::make_shared(metric_type_, quantize_type_, + enable_rotate_); } std::string to_string() const override { auto base_str = vector_index_params_to_string("FlatIndexParams", metric_type_, quantize_type_); std::ostringstream oss; - oss << base_str << "}"; + oss << base_str + << ",enable_rotate:" << (enable_rotate_ ? 
"true" : "false") << "}"; return oss.str(); } @@ -388,7 +395,9 @@ class FlatIndexParams : public VectorIndexParams { metric_type() == static_cast(other).metric_type() && quantize_type() == - static_cast(other).quantize_type(); + static_cast(other).quantize_type() && + enable_rotate_ == + static_cast(other).enable_rotate(); } }; @@ -408,8 +417,10 @@ class IVFIndexParams : public VectorIndexParams { public: IVFIndexParams(MetricType metric_type, int n_list = 1024, int n_iters = 10, bool use_soar = false, - QuantizeType quantize_type = QuantizeType::UNDEFINED) - : VectorIndexParams(IndexType::IVF, metric_type, quantize_type), + QuantizeType quantize_type = QuantizeType::UNDEFINED, + bool enable_rotate = false) + : VectorIndexParams(IndexType::IVF, metric_type, quantize_type, + enable_rotate), n_list_(n_list), n_iters_(n_iters), use_soar_(use_soar) {} @@ -419,14 +430,16 @@ class IVFIndexParams : public VectorIndexParams { public: Ptr clone() const override { return std::make_shared(metric_type_, n_list_, n_iters_, - use_soar_, quantize_type_); + use_soar_, quantize_type_, + enable_rotate_); } std::string to_string() const override { auto base_str = vector_index_params_to_string("IVFIndexParams", metric_type_, quantize_type_); std::ostringstream oss; - oss << base_str << ",n_list:" << n_list_ << ",n_iters:" << n_iters_ << "}"; + oss << base_str << ",n_list:" << n_list_ << ",n_iters:" << n_iters_ + << ",enable_rotate:" << (enable_rotate_ ? 
"true" : "false") << "}"; return oss.str(); } @@ -462,7 +475,9 @@ class IVFIndexParams : public VectorIndexParams { n_iters_ == static_cast(other).n_iters_ && use_soar_ == static_cast(other).use_soar_ && quantize_type() == - static_cast(other).quantize_type(); + static_cast(other).quantize_type() && + enable_rotate_ == + static_cast(other).enable_rotate_; } private: @@ -475,8 +490,10 @@ class DiskAnnIndexParams : public VectorIndexParams { public: DiskAnnIndexParams(MetricType metric_type, int max_degree = 100, int list_size = 50, int pq_chunk_num = 0, - QuantizeType quantize_type = QuantizeType::UNDEFINED) - : VectorIndexParams(IndexType::DISKANN, metric_type, quantize_type), + QuantizeType quantize_type = QuantizeType::UNDEFINED, + bool enable_rotate = false) + : VectorIndexParams(IndexType::DISKANN, metric_type, quantize_type, + enable_rotate), max_degree_{max_degree}, list_size_{list_size}, pq_chunk_num_{pq_chunk_num} {} @@ -486,7 +503,8 @@ class DiskAnnIndexParams : public VectorIndexParams { public: Ptr clone() const override { return std::make_shared( - metric_type_, max_degree_, list_size_, pq_chunk_num_, quantize_type_); + metric_type_, max_degree_, list_size_, pq_chunk_num_, quantize_type_, + enable_rotate_); } std::string to_string() const override { @@ -495,7 +513,7 @@ class DiskAnnIndexParams : public VectorIndexParams { std::ostringstream oss; oss << base_str << ",max_degree:" << max_degree_ << ",list_size:" << list_size_ << ", pq_chunk_num:" << pq_chunk_num_ - << "}"; + << ",enable_rotate:" << (enable_rotate_ ? 
"true" : "false") << "}"; return oss.str(); } @@ -534,7 +552,9 @@ class DiskAnnIndexParams : public VectorIndexParams { pq_chunk_num_ == static_cast(other).pq_chunk_num_ && quantize_type() == - static_cast(other).quantize_type(); + static_cast(other).quantize_type() && + enable_rotate_ == + static_cast(other).enable_rotate_; } private: @@ -555,8 +575,10 @@ class VamanaIndexParams : public VectorIndexParams { float alpha = core_interface::kDefaultVamanaAlpha, bool saturate_graph = core_interface::kDefaultVamanaSaturateGraph, bool use_contiguous_memory = false, bool use_id_map = false, - QuantizeType quantize_type = QuantizeType::UNDEFINED) - : VectorIndexParams(IndexType::VAMANA, metric_type, quantize_type), + QuantizeType quantize_type = QuantizeType::UNDEFINED, + bool enable_rotate = false) + : VectorIndexParams(IndexType::VAMANA, metric_type, quantize_type, + enable_rotate), max_degree_(max_degree), search_list_size_(search_list_size), alpha_(alpha), @@ -570,7 +592,7 @@ class VamanaIndexParams : public VectorIndexParams { Ptr clone() const override { return std::make_shared( metric_type_, max_degree_, search_list_size_, alpha_, saturate_graph_, - use_contiguous_memory_, use_id_map_, quantize_type_); + use_contiguous_memory_, use_id_map_, quantize_type_, enable_rotate_); } std::string to_string() const override { @@ -582,7 +604,8 @@ class VamanaIndexParams : public VectorIndexParams { << ",saturate_graph:" << (saturate_graph_ ? "true" : "false") << ",use_contiguous_memory:" << (use_contiguous_memory_ ? "true" : "false") - << ",use_id_map:" << (use_id_map_ ? "true" : "false") << "}"; + << ",use_id_map:" << (use_id_map_ ? "true" : "false") + << ",enable_rotate:" << (enable_rotate_ ? 
"true" : "false") << "}"; return oss.str(); } @@ -597,7 +620,8 @@ class VamanaIndexParams : public VectorIndexParams { search_list_size_ == rhs.search_list_size_ && alpha_ == rhs.alpha_ && saturate_graph_ == rhs.saturate_graph_ && use_contiguous_memory_ == rhs.use_contiguous_memory_ && - use_id_map_ == rhs.use_id_map_; + use_id_map_ == rhs.use_id_map_ && + enable_rotate_ == rhs.enable_rotate_; } int max_degree() const { From 016f47412a46df8368e7f6bec7a063573636a676 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Thu, 11 Jun 2026 10:07:37 +0800 Subject: [PATCH 16/38] tmp --- src/core/algorithm/ivf/ivf_entity.cc | 45 ++++++++++++++++++ src/core/algorithm/ivf/ivf_entity.h | 3 ++ src/core/algorithm/ivf/ivf_params.h | 2 + src/core/algorithm/ivf/ivf_searcher.cc | 7 +++ src/core/algorithm/ivf/ivf_searcher_context.h | 31 +++++++++--- src/core/algorithm/ivf/ivf_streamer.cc | 7 +++ src/core/interface/indexes/ivf_index.cc | 25 ++++++++-- src/core/quantizer/cosine_converter.cc | 7 ++- .../quantizer/integer_quantizer_converter.cc | 9 +++- src/core/quantizer/record_rotator.cc | 47 +++++++++++++++++++ src/core/quantizer/record_rotator.h | 7 +++ 11 files changed, 175 insertions(+), 15 deletions(-) diff --git a/src/core/algorithm/ivf/ivf_entity.cc b/src/core/algorithm/ivf/ivf_entity.cc index 6dccc2b2c..39c9826f0 100644 --- a/src/core/algorithm/ivf/ivf_entity.cc +++ b/src/core/algorithm/ivf/ivf_entity.cc @@ -71,6 +71,16 @@ int IVFEntity::IVFReformerWrapper::init(const IndexMeta &imeta) { return 0; } +//! Load reformer state (e.g. rotation matrix) from storage +int IVFEntity::IVFReformerWrapper::load(const IndexStorage::Pointer &storage) { + if (!reformer_) { + return 0; + } + int ret = reformer_->load(storage); + ivf_check_with_msg(ret, "Failed to load reformer state"); + return 0; +} + //! 
Update the params, Called by gpu searcher only int IVFEntity::IVFReformerWrapper::update(const IndexMeta &meta) { auto &name = meta.reformer_name(); @@ -503,6 +513,12 @@ int IVFEntity::load(const IndexStorage::Pointer &container) { //! Load the remaining segments container_ = container; + + //! Load reformer state (e.g. rotation matrix) from the main container, + //! which holds the rotator segment dumped at build time. + ret = reformer_.load(container); + ivf_check_error_code(ret); + size_t expect_size = header_.inverted_body_size; inverted_ = load_segment(IVF_INVERTED_BODY_SEG_ID, expect_size); if (!inverted_) { @@ -581,6 +597,35 @@ int IVFEntity::load(const IndexStorage::Pointer &container) { header_.total_vector_count, header_.inverted_list_count, meta_.element_size(), meta_.metric_name().c_str(), meta_.reformer_name().c_str()); + { + size_t nlist = header_.inverted_list_count; + size_t maxc = 0, minc = SIZE_MAX, nonzero = 0; + size_t top1 = 0, top2 = 0, top3 = 0; + for (size_t i = 0; i < nlist; ++i) { + auto m = this->inverted_list_meta(i); + size_t c = m ? m->vector_count : 0; + if (c > 0) ++nonzero; + if (c > maxc) maxc = c; + if (c < minc) minc = c; + if (c > top1) { + top3 = top2; + top2 = top1; + top1 = c; + } else if (c > top2) { + top3 = top2; + top2 = c; + } else if (c > top3) { + top3 = c; + } + } + double total = static_cast(header_.total_vector_count); + LOG_INFO( + "[IVF_DIST_DBG] nlist=%zu total=%u nonzero=%zu max=%zu min=%zu " + "avg=%.1f top3=[%zu,%zu,%zu] top1_ratio=%.2f%%", + nlist, header_.total_vector_count, nonzero, maxc, + (minc == SIZE_MAX ? 0 : minc), total / static_cast(nlist), + top1, top2, top3, 100.0 * static_cast(top1) / total); + } return 0; } diff --git a/src/core/algorithm/ivf/ivf_entity.h b/src/core/algorithm/ivf/ivf_entity.h index e6fd4b6c4..e0265b6eb 100644 --- a/src/core/algorithm/ivf/ivf_entity.h +++ b/src/core/algorithm/ivf/ivf_entity.h @@ -267,6 +267,9 @@ class IVFEntity { //! 
Initialize int init(const IndexMeta &imeta); + //! Load reformer state (e.g. rotation matrix) from storage + int load(const IndexStorage::Pointer &storage); + //! Update int update(const IndexMeta &meta); diff --git a/src/core/algorithm/ivf/ivf_params.h b/src/core/algorithm/ivf/ivf_params.h index a33a7aa50..6cd66b474 100644 --- a/src/core/algorithm/ivf/ivf_params.h +++ b/src/core/algorithm/ivf/ivf_params.h @@ -62,6 +62,8 @@ static const std::string PARAM_IVF_BUILDER_BLOCK_VECTOR_COUNT( // searcher params static const std::string PARAM_IVF_SEARCHER_SCAN_RATIO( "proxima.ivf.searcher.scan_ratio"); +static const std::string PARAM_IVF_SEARCHER_NPROBE( + "proxima.ivf.searcher.nprobe"); static const std::string PARAM_IVF_SEARCHER_BRUTE_FORCE_THRESHOLD( "proxima.ivf.searcher.brute_force_threshold"); static const std::string PARAM_IVF_SEARCHER_OPTIMIZER( diff --git a/src/core/algorithm/ivf/ivf_searcher.cc b/src/core/algorithm/ivf/ivf_searcher.cc index 972fc8680..047046701 100644 --- a/src/core/algorithm/ivf/ivf_searcher.cc +++ b/src/core/algorithm/ivf/ivf_searcher.cc @@ -86,6 +86,13 @@ int IVFSearcher::load(IndexStorage::Pointer container, } auto reformer = centroid_index_->reformer(); + if (reformer) { + //! The centroid index is loaded from the centroid sub-segment which does + //! not contain the rotator segment. Load the reformer state (e.g. rotation + //! matrix) from the main container instead. + ret = reformer->load(container); + ivf_check_error_code(ret); + } params_.set(PARAM_IVF_SEARCHER_CONVERTER_REFORMER, reformer); //! 
load iverted index diff --git a/src/core/algorithm/ivf/ivf_searcher_context.h b/src/core/algorithm/ivf/ivf_searcher_context.h index d9ccc45c1..22dd273fe 100644 --- a/src/core/algorithm/ivf/ivf_searcher_context.h +++ b/src/core/algorithm/ivf/ivf_searcher_context.h @@ -62,19 +62,35 @@ class IVFSearcherContext : public IndexSearcher::Context { params.get(PARAM_IVF_SEARCHER_BRUTE_FORCE_THRESHOLD, &bruteforce_threshold_); params.get(PARAM_IVF_SEARCHER_SCAN_RATIO, &scan_ratio_); + params.get(PARAM_IVF_SEARCHER_NPROBE, &nprobe_); if (scan_ratio_ <= 0.0) { LOG_ERROR("Invalid params %s=%f", PARAM_IVF_SEARCHER_SCAN_RATIO.c_str(), scan_ratio_); return IndexError_InvalidArgument; } - size_t topk_val = - std::max(static_cast( - std::round(entity_->inverted_list_count() * scan_ratio_)), - 1u); - centroid_searcher_ctx_->set_topk(topk_val); - max_scan_count_ = - static_cast(std::ceil(entity_->vector_count() * scan_ratio_)); + size_t nlist = entity_->inverted_list_count(); + size_t topk_val; + if (nprobe_ > 0) { + //! nprobe explicitly controls how many inverted lists (centroids) to + //! probe. Do not let max_scan_count_ cut off the probed lists. 
+ topk_val = std::min(static_cast(nprobe_), nlist); + topk_val = std::max(topk_val, static_cast(1)); + max_scan_count_ = static_cast(entity_->vector_count()); + } else { + topk_val = std::max( + static_cast(std::round(nlist * scan_ratio_)), 1u); + max_scan_count_ = static_cast( + std::ceil(entity_->vector_count() * scan_ratio_)); + } + centroid_searcher_ctx_->set_topk(static_cast(topk_val)); max_scan_count_ = std::max(bruteforce_threshold_, max_scan_count_); + static thread_local int kNprobeDbgCnt = 0; + if (kNprobeDbgCnt++ < 6) { + LOG_INFO( + "[NPROBE_DBG] nprobe_=%d scan_ratio_=%f nlist=%zu topk_val=%zu " + "max_scan_count_=%u", + nprobe_, scan_ratio_, nlist, topk_val, max_scan_count_); + } return 0; } @@ -215,6 +231,7 @@ class IVFSearcherContext : public IndexSearcher::Context { uint32_t topk_{0}; uint32_t magic_{0}; float scan_ratio_{kDefaultScanRatio}; + int nprobe_{0}; uint32_t max_scan_count_{0}; uint32_t bruteforce_threshold_{kDefaultBfThreshold}; }; diff --git a/src/core/algorithm/ivf/ivf_streamer.cc b/src/core/algorithm/ivf/ivf_streamer.cc index a2c924141..e42728e9a 100644 --- a/src/core/algorithm/ivf/ivf_streamer.cc +++ b/src/core/algorithm/ivf/ivf_streamer.cc @@ -86,6 +86,13 @@ int IVFStreamer::open(IndexStorage::Pointer storage) { } auto reformer = centroid_index_->reformer(); + if (reformer) { + //! The centroid index is loaded from the centroid sub-segment which does + //! not contain the rotator segment. Load the reformer state (e.g. rotation + //! matrix) from the main storage instead. + ret = reformer->load(storage); + ivf_check_error_code(ret); + } params_.set(PARAM_IVF_SEARCHER_CONVERTER_REFORMER, reformer); //! 
load iverted index diff --git a/src/core/interface/indexes/ivf_index.cc b/src/core/interface/indexes/ivf_index.cc index d85acce62..6bd793b2a 100644 --- a/src/core/interface/indexes/ivf_index.cc +++ b/src/core/interface/indexes/ivf_index.cc @@ -121,6 +121,10 @@ int IVFIndex::Open(const std::string &file_path, LOG_ERROR("Failed to open streamer, path: %s", file_path_.c_str()); return core::IndexError_Runtime; } + // Load reformer data from storage (e.g., rotation matrix for INT8+rotate) + if (reformer_ != nullptr) { + reformer_->load(storage_); + } is_trained_ = true; } is_open_ = true; @@ -164,6 +168,10 @@ int IVFIndex::Train() { dumper->create(file_path_); builder_->dump(dumper); + // Dump converter state (e.g., rotator for INT8+rotate) to dumper + if (converter_) { + converter_->dump(dumper); + } dumper->close(); int ret = storage_->open(file_path_, false); if (ret != 0) { @@ -175,6 +183,10 @@ int IVFIndex::Train() { LOG_ERROR("Failed to open streamer, path: %s", file_path_.c_str()); return core::IndexError_Runtime; } + // Load reformer data from storage (e.g., rotation matrix) + if (reformer_ != nullptr) { + reformer_->load(storage_); + } is_trained_ = true; return 0; } @@ -209,11 +221,8 @@ int IVFIndex::_prepare_for_search( } if (ivf_search_param->nprobe > 0) { - // TODO: 1. sparse; 2. 
default ef ailego::Params params; - // need fix - params.set(core::PARAM_IVF_BUILDER_CENTROID_COUNT, - ivf_search_param->nprobe); + params.set(core::PARAM_IVF_SEARCHER_NPROBE, ivf_search_param->nprobe); context->update(params); } return 0; @@ -229,6 +238,10 @@ int IVFIndex::Merge(const std::vector &indexes, dumper->create(file_path_); builder_->dump(dumper); + // Dump converter state (e.g., rotator for INT8+rotate) to dumper + if (converter_) { + converter_->dump(dumper); + } dumper->close(); int ret = storage_->open(file_path_, false); if (ret != 0) { @@ -240,6 +253,10 @@ int IVFIndex::Merge(const std::vector &indexes, LOG_ERROR("Failed to open streamer, path: %s", file_path_.c_str()); return core::IndexError_Runtime; } + // Load reformer data from storage (e.g., rotation matrix) + if (reformer_ != nullptr) { + reformer_->load(storage_); + } is_trained_ = true; return 0; } diff --git a/src/core/quantizer/cosine_converter.cc b/src/core/quantizer/cosine_converter.cc index 41e5000fc..c9b8cb7d1 100644 --- a/src/core/quantizer/cosine_converter.cc +++ b/src/core/quantizer/cosine_converter.cc @@ -375,8 +375,11 @@ class CosineConverter : public IndexConverter { return 0; } - //! Dump index into storage - int dump(const IndexDumper::Pointer & /*dumper*/) override { + //! Dump index into storage (writes rotator segment if present) + int dump(const IndexDumper::Pointer &dumper) override { + if (rotator_) { + return rotator_->dump(dumper); + } return 0; } diff --git a/src/core/quantizer/integer_quantizer_converter.cc b/src/core/quantizer/integer_quantizer_converter.cc index 2c7b00ca4..adbdab52b 100644 --- a/src/core/quantizer/integer_quantizer_converter.cc +++ b/src/core/quantizer/integer_quantizer_converter.cc @@ -453,8 +453,13 @@ class IntegerStreamingConverter : public IndexConverter { return 0; } - //! Dump index into storage (no-op: DumpPath removed, use dump_to_storage instead) - int dump(const IndexDumper::Pointer & /*dumper*/) override { return 0; } + //! 
Dump index into storage (writes rotator segment if rotate enabled) + int dump(const IndexDumper::Pointer &dumper) override { + if (enable_rotate_ && rotator_) { + return rotator_->dump(dumper); + } + return 0; + } //! Dump converter state to IndexStorage for streaming build int dump_to_storage(const IndexStorage::Pointer &storage) override { diff --git a/src/core/quantizer/record_rotator.cc b/src/core/quantizer/record_rotator.cc index 8d553b879..3bddc97f6 100644 --- a/src/core/quantizer/record_rotator.cc +++ b/src/core/quantizer/record_rotator.cc @@ -140,6 +140,53 @@ int RecordRotator::dump(const IndexStorage::Pointer &storage, return 0; } +int RecordRotator::dump(const IndexDumper::Pointer &dumper, + const std::string &seg_id) const { + if (!dumper) { + LOG_ERROR("RecordRotator::dump(dumper): null dumper"); + return IndexError_InvalidArgument; + } + if (!impl_->rotator) { + LOG_ERROR("RecordRotator::dump(dumper): rotator not initialized"); + return IndexError_NoReady; + } + + // Serialize: [Header: type|origin_dim|padded_dim] [rabitqlib blob] + const size_t blob_size = impl_->rotator->dump_bytes(); + const size_t data_size = Impl::kHeaderSize + blob_size; + const size_t total_size = (data_size + 0x1F) & (~0x1F); + + std::vector buffer(total_size, 0); + Impl::Header header; + header.type = static_cast(impl_->type); + header.origin_dim = static_cast(impl_->dimension); + header.padded_dim = static_cast(impl_->padded_dim); + std::memcpy(buffer.data(), &header, Impl::kHeaderSize); + impl_->rotator->save(buffer.data() + Impl::kHeaderSize); + + const uint32_t crc = ailego::Crc32c::Hash(buffer.data(), data_size, 0); + const size_t padding_size = total_size - data_size; + + // Write data + padding to dumper + if (dumper->write(buffer.data(), total_size) != total_size) { + LOG_ERROR("RecordRotator::dump(dumper): write failed, seg=%s", seg_id.c_str()); + return IndexError_WriteData; + } + + // Register segment + int ret = dumper->append(seg_id, data_size, padding_size, crc); 
+ if (ret != 0) { + LOG_ERROR("RecordRotator::dump(dumper): append failed, seg=%s, ret=%d", + seg_id.c_str(), ret); + return ret; + } + + LOG_DEBUG( + "RecordRotator::dump(dumper) done: seg=%s, data_size=%zu, padding=%zu", + seg_id.c_str(), data_size, padding_size); + return 0; +} + int RecordRotator::open(IndexStorage::Pointer storage, const std::string &seg_id) { if (!storage) { diff --git a/src/core/quantizer/record_rotator.h b/src/core/quantizer/record_rotator.h index d187e2528..9e7a19811 100644 --- a/src/core/quantizer/record_rotator.h +++ b/src/core/quantizer/record_rotator.h @@ -17,6 +17,7 @@ #include #include #include +#include "zvec/core/framework/index_dumper.h" #include "zvec/core/framework/index_storage.h" namespace zvec { @@ -76,6 +77,12 @@ class RecordRotator { int dump(const IndexStorage::Pointer &storage, const std::string &seg_id = RECORD_ROTATOR_SEG_ID) const; + //! Dump the rotator to an IndexDumper as a named segment. + //! Format: [Header: type(1B)|origin_dim(4B)|padded_dim(4B)] [rabitqlib blob] + //! Appends padding for 32-byte alignment. + int dump(const IndexDumper::Pointer &dumper, + const std::string &seg_id = RECORD_ROTATOR_SEG_ID) const; + //! Open the rotator from an IndexStorage segment (self-describing, no init needed). //! Parses header to get type/dimension/padded_dim, then reconstructs the rotator. 
int open(IndexStorage::Pointer storage, From 1b9e3f20614b801afbf31500f4cfb10e78d6ad8e Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Thu, 11 Jun 2026 13:26:31 +0800 Subject: [PATCH 17/38] debug --- src/core/algorithm/ivf/ivf_entity.cc | 29 ------------------- src/core/algorithm/ivf/ivf_searcher_context.h | 7 ----- 2 files changed, 36 deletions(-) diff --git a/src/core/algorithm/ivf/ivf_entity.cc b/src/core/algorithm/ivf/ivf_entity.cc index 39c9826f0..decc86d22 100644 --- a/src/core/algorithm/ivf/ivf_entity.cc +++ b/src/core/algorithm/ivf/ivf_entity.cc @@ -597,35 +597,6 @@ int IVFEntity::load(const IndexStorage::Pointer &container) { header_.total_vector_count, header_.inverted_list_count, meta_.element_size(), meta_.metric_name().c_str(), meta_.reformer_name().c_str()); - { - size_t nlist = header_.inverted_list_count; - size_t maxc = 0, minc = SIZE_MAX, nonzero = 0; - size_t top1 = 0, top2 = 0, top3 = 0; - for (size_t i = 0; i < nlist; ++i) { - auto m = this->inverted_list_meta(i); - size_t c = m ? m->vector_count : 0; - if (c > 0) ++nonzero; - if (c > maxc) maxc = c; - if (c < minc) minc = c; - if (c > top1) { - top3 = top2; - top2 = top1; - top1 = c; - } else if (c > top2) { - top3 = top2; - top2 = c; - } else if (c > top3) { - top3 = c; - } - } - double total = static_cast(header_.total_vector_count); - LOG_INFO( - "[IVF_DIST_DBG] nlist=%zu total=%u nonzero=%zu max=%zu min=%zu " - "avg=%.1f top3=[%zu,%zu,%zu] top1_ratio=%.2f%%", - nlist, header_.total_vector_count, nonzero, maxc, - (minc == SIZE_MAX ? 
0 : minc), total / static_cast(nlist), - top1, top2, top3, 100.0 * static_cast(top1) / total); - } return 0; } diff --git a/src/core/algorithm/ivf/ivf_searcher_context.h b/src/core/algorithm/ivf/ivf_searcher_context.h index 22dd273fe..a0a941e5e 100644 --- a/src/core/algorithm/ivf/ivf_searcher_context.h +++ b/src/core/algorithm/ivf/ivf_searcher_context.h @@ -84,13 +84,6 @@ class IVFSearcherContext : public IndexSearcher::Context { } centroid_searcher_ctx_->set_topk(static_cast(topk_val)); max_scan_count_ = std::max(bruteforce_threshold_, max_scan_count_); - static thread_local int kNprobeDbgCnt = 0; - if (kNprobeDbgCnt++ < 6) { - LOG_INFO( - "[NPROBE_DBG] nprobe_=%d scan_ratio_=%f nlist=%zu topk_val=%zu " - "max_scan_count_=%u", - nprobe_, scan_ratio_, nlist, topk_val, max_scan_count_); - } return 0; } From d0fea3b8885ad7a1091a8fbdf53934f0fefbfd18 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Thu, 11 Jun 2026 17:49:27 +0800 Subject: [PATCH 18/38] tmp --- python/zvec/model/param/__init__.pyi | 7 +++++++ src/db/index/segment/segment.cc | 6 ++++-- src/include/zvec/db/index_params.h | 5 +++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/python/zvec/model/param/__init__.pyi b/python/zvec/model/param/__init__.pyi index f4ff22aa4..46c0dda87 100644 --- a/python/zvec/model/param/__init__.pyi +++ b/python/zvec/model/param/__init__.pyi @@ -145,6 +145,9 @@ class FlatIndexParam(VectorIndexParam): quantize_type (QuantizeType): Optional quantization type for vector compression (e.g., FP16, INT8). Use ``QuantizeType.UNDEFINED`` to disable quantization. Default is ``QuantizeType.UNDEFINED``. + enable_rotate (bool): Whether to apply random rotation before INT8 + quantization. Only effective with quantize_type=INT8. + Default is ``False``. 
Examples: >>> from zvec.typing import MetricType, QuantizeType @@ -161,6 +164,7 @@ class FlatIndexParam(VectorIndexParam): self, metric_type: _zvec.typing.MetricType = ..., quantize_type: _zvec.typing.QuantizeType = ..., + enable_rotate: bool = ..., ) -> None: """ Constructs a FlatIndexParam instance. @@ -169,6 +173,9 @@ class FlatIndexParam(VectorIndexParam): metric_type (MetricType, optional): Distance metric. Defaults to MetricType.IP. quantize_type (QuantizeType, optional): Vector quantization type. Defaults to QuantizeType.UNDEFINED (no quantization). + enable_rotate (bool, optional): Whether to apply random rotation before + INT8 quantization. Only effective with quantize_type=INT8. + Defaults to False. """ def __repr__(self) -> str: ... diff --git a/src/db/index/segment/segment.cc b/src/db/index/segment/segment.cc index 2e334c3fc..58173b577 100644 --- a/src/db/index/segment/segment.cc +++ b/src/db/index/segment/segment.cc @@ -4028,7 +4028,8 @@ Status SegmentImpl::load_vector_index_blocks() { if (!segment_meta_->vector_indexed(column)) { new_field_params.set_index_params(MakeDefaultQuantVectorIndexParams( vector_index_params->metric_type(), - vector_index_params->quantize_type())); + vector_index_params->quantize_type(), + vector_index_params->enable_rotate())); } } @@ -4163,7 +4164,8 @@ Status SegmentImpl::init_memory_components() { block_id = allocate_block_id(); FieldSchema normal_quant_field(*field); normal_quant_field.set_index_params(MakeDefaultQuantVectorIndexParams( - index_params->metric_type(), index_params->quantize_type())); + index_params->metric_type(), index_params->quantize_type(), + index_params->enable_rotate())); auto quant_vector_indexer = create_vector_indexer( field->name(), normal_quant_field, block_id, true); diff --git a/src/include/zvec/db/index_params.h b/src/include/zvec/db/index_params.h index e2bc5966a..1223105df 100644 --- a/src/include/zvec/db/index_params.h +++ b/src/include/zvec/db/index_params.h @@ -409,8 +409,9 @@ inline 
FlatIndexParams MakeDefaultVectorIndexParams(MetricType metric_type) { } inline FlatIndexParams MakeDefaultQuantVectorIndexParams( - MetricType metric_type, QuantizeType quantize_type) { - return FlatIndexParams(metric_type, quantize_type); + MetricType metric_type, QuantizeType quantize_type, + bool enable_rotate = false) { + return FlatIndexParams(metric_type, quantize_type, enable_rotate); } class IVFIndexParams : public VectorIndexParams { From b49ea9d732dc8e9a1cf8ade65b3a1f1f89ad8e09 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Thu, 11 Jun 2026 20:23:26 +0800 Subject: [PATCH 19/38] unrotate --- src/core/quantizer/cosine_reformer.cc | 17 ++ .../quantizer/integer_quantizer_reformer.cc | 20 +- src/core/quantizer/record_rotator.cc | 76 +++++++ src/core/quantizer/record_rotator.h | 15 ++ test_unrotate.py | 209 ++++++++++++++++++ 5 files changed, 329 insertions(+), 8 deletions(-) create mode 100644 test_unrotate.py diff --git a/src/core/quantizer/cosine_reformer.cc b/src/core/quantizer/cosine_reformer.cc index ea67282a5..442e08e34 100644 --- a/src/core/quantizer/cosine_reformer.cc +++ b/src/core/quantizer/cosine_reformer.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include +#include #include #include #include @@ -225,6 +226,11 @@ class CosineReformer : public IndexReformer { NORM_SIZE, NORM_SIZE); + // For FP32 input type, rotation may have been applied during transform. + // For FP16 input type, rotation was NOT applied — skip inverse rotation. 
+ const bool need_inv_rotate = + (type == IndexMeta::DataType::DT_FP32 && enable_rotate_ && rotator_); + if (type == IndexMeta::DataType::DT_FP32) { if (dst_type_ != IndexMeta::DataType::DT_FP32) { return IndexError_Unsupported; @@ -234,6 +240,11 @@ class CosineReformer : public IndexReformer { const float *in_buf = reinterpret_cast(in); this->denormalize(in_buf, out_buf, qmeta, norm); + if (need_inv_rotate) { + std::vector tmp(dimension); + rotator_->unrotate(out_buf, tmp.data()); + std::memcpy(out_buf, tmp.data(), dimension * sizeof(float)); + } } else if (type == IndexMeta::DataType::DT_FP16) { if (dst_type_ != IndexMeta::DataType::DT_FP16) { return IndexError_Unsupported; @@ -249,6 +260,7 @@ class CosineReformer : public IndexReformer { RecordQuantizer::unquantize_record(in, dimension, dst_type_, out_buf); this->denormalize(out_buf, out_buf, qmeta, norm); + // FP16 type path: no rotation was applied, skip inverse } else { ailego::Float16 *out_buf = reinterpret_cast(&(*out)[0]); @@ -267,6 +279,11 @@ class CosineReformer : public IndexReformer { RecordQuantizer::unquantize_record(in, dimension, dst_type_, out_buf); this->denormalize(out_buf, out_buf, qmeta, norm); + if (need_inv_rotate) { + std::vector tmp(dimension); + rotator_->unrotate(out_buf, tmp.data()); + std::memcpy(out_buf, tmp.data(), dimension * sizeof(float)); + } } return 0; diff --git a/src/core/quantizer/integer_quantizer_reformer.cc b/src/core/quantizer/integer_quantizer_reformer.cc index 88b285ef8..498522a4e 100644 --- a/src/core/quantizer/integer_quantizer_reformer.cc +++ b/src/core/quantizer/integer_quantizer_reformer.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include "record_quantizer.h" #include "record_rotator.h" @@ -500,21 +501,24 @@ class IntegerStreamingReformer : public IndexReformer { int revert(const void *in, const IndexQueryMeta &qmeta, std::string *out) const override { - if (enable_rotate_) { - LOG_ERROR("Unsupported revert for rotated value"); - return 
IndexError_Unsupported; - } if (enable_normalize_) { LOG_ERROR("Unsupported revert for normalized value"); - return IndexError_Unsupported; } - out->resize((qmeta.dimension() - extra_dimension_) * sizeof(float)); + const size_t origin_dim = qmeta.dimension() - extra_dimension_; + out->resize(origin_dim * sizeof(float)); float *out_buf = reinterpret_cast(out->data()); - RecordQuantizer::unquantize_record(in, qmeta.dimension() - extra_dimension_, - data_type_, out_buf); + if (enable_rotate_ && rotator_) { + // First unquantize into a temporary buffer, then inverse rotate + std::vector unq_buf(origin_dim); + RecordQuantizer::unquantize_record(in, origin_dim, data_type_, + unq_buf.data()); + rotator_->unrotate(unq_buf.data(), out_buf); + } else { + RecordQuantizer::unquantize_record(in, origin_dim, data_type_, out_buf); + } return 0; } diff --git a/src/core/quantizer/record_rotator.cc b/src/core/quantizer/record_rotator.cc index 3bddc97f6..ca353b850 100644 --- a/src/core/quantizer/record_rotator.cc +++ b/src/core/quantizer/record_rotator.cc @@ -37,6 +37,10 @@ struct RecordRotator::Impl { size_t padded_dim{0}; RecordRotatorType type{RecordRotatorType::FhtKac}; std::unique_ptr> rotator; + //! Inverse rotation matrix, column-major: dim x padded_dim + //! Element [col][row] = inv_matrix_[col * dimension + row] + //! 
where col in [0, padded_dim), row in [0, dimension) + std::vector inv_matrix; static rabitqlib::RotatorType to_rabitq(RecordRotatorType t) { return t == RecordRotatorType::Matrix @@ -65,6 +69,8 @@ void RecordRotator::init(size_t dimension, size_t padded_dim, impl_->type = rotator_type; impl_->rotator.reset(rabitqlib::choose_rotator( dimension, Impl::to_rabitq(rotator_type), padded_dim)); + // Build inverse rotation data for unrotate support + build_inverse(); } void RecordRotator::rotate(const float *in, float *out) const { @@ -77,6 +83,68 @@ std::vector RecordRotator::rotate(const float *in) const { return out; } +void RecordRotator::build_inverse() { + if (!impl_->rotator) { + LOG_ERROR("RecordRotator::build_inverse: rotator not initialized"); + return; + } + + const size_t dim = impl_->dimension; + const size_t pdim = impl_->padded_dim; + + // Allocate column-major storage: padded_dim columns, each dim floats + impl_->inv_matrix.resize(pdim * dim, 0.0f); + + // Compute rotation matrix by rotating each standard basis vector e_i. + // R * e_i = i-th column of R, which we store as inv_matrix[i * dim + j]. + std::vector basis(dim, 0.0f); + std::vector rotated(pdim, 0.0f); + + for (size_t i = 0; i < pdim; ++i) { + std::fill(basis.begin(), basis.end(), 0.0f); + if (i < dim) { + basis[i] = 1.0f; + } + impl_->rotator->rotate(basis.data(), rotated.data()); + // Store as column i of the rotation matrix + for (size_t j = 0; j < dim; ++j) { + impl_->inv_matrix[i * dim + j] = rotated[j]; + } + } + + LOG_DEBUG("RecordRotator::build_inverse done: dim=%zu, padded_dim=%zu", + dim, pdim); +} + +void RecordRotator::unrotate(const float *in, float *out) const { + if (impl_->inv_matrix.empty()) { + LOG_ERROR("RecordRotator::unrotate: build_inverse() not called"); + return; + } + + const size_t dim = impl_->dimension; + const size_t pdim = impl_->padded_dim; + + // Compute x = R^T * y, where y is the dim-dimensional input (padded with zeros). 
+ // x[j] = sum_{i=0}^{pdim-1} R[j][i] * y[i] + // = sum_{i=0}^{dim-1} inv_matrix_[i * dim + j] * in[i] + // (since y[i] = 0 for i >= dim) + std::vector tmp(dim, 0.0f); + for (size_t i = 0; i < dim; ++i) { + const float yi = in[i]; + for (size_t j = 0; j < dim; ++j) { + tmp[j] += impl_->inv_matrix[i * dim + j] * yi; + } + } + std::memcpy(out, tmp.data(), dim * sizeof(float)); +} + +std::vector RecordRotator::unrotate(const float *in) const { + std::vector out(impl_->dimension); + unrotate(in, out.data()); + return out; +} + size_t RecordRotator::dump_bytes() const { return Impl::kHeaderSize + impl_->rotator->dump_bytes(); } @@ -245,6 +313,10 @@ int RecordRotator::open(IndexStorage::Pointer storage, "RecordRotator::open done: seg=%s, dim=%zu, padded_dim=%zu, " "data_size=%zu", seg_id.c_str(), impl_->dimension, impl_->padded_dim, data_size); + + // Build inverse rotation data for unrotate support + build_inverse(); + return 0; } @@ -269,6 +341,10 @@ int RecordRotator::load(const float *matrix, size_t dimension, LOG_DEBUG("RecordRotator::load done: dim=%zu, padded_dim=%zu", dimension, padded_dim); + + // Build inverse rotation data for unrotate support + build_inverse(); + return 0; } diff --git a/src/core/quantizer/record_rotator.h b/src/core/quantizer/record_rotator.h index 9e7a19811..83e9aa7e9 100644 --- a/src/core/quantizer/record_rotator.h +++ b/src/core/quantizer/record_rotator.h @@ -69,6 +69,21 @@ class RecordRotator { //! @return vector of size padded_dim containing rotated result std::vector rotate(const float *in) const; + //! Inverse-rotate a single vector (from rotated space back to original) + //! @param in input vector of size >= dimension (rotated, truncated) + //! @param out output buffer of size >= dimension (original space) + void unrotate(const float *in, float *out) const; + + //! Inverse-rotate a single vector into a managed buffer + //! @param in input vector of size >= dimension (rotated, truncated) + //! 
@return vector of size dimension containing inverse-rotated result + std::vector unrotate(const float *in) const; + + //! Prepare internal data structures for inverse rotation. + //! Computes the rotation matrix by rotating basis vectors. + //! Must be called after init() or open() before using unrotate(). + void build_inverse(); + //! Return the serialized size of the rotator in bytes (header + blob) size_t dump_bytes() const; diff --git a/test_unrotate.py b/test_unrotate.py new file mode 100644 index 000000000..d1260f414 --- /dev/null +++ b/test_unrotate.py @@ -0,0 +1,209 @@ +"""Test inverse rotation: verify that vectors can be recovered after +rotation + quantization via the revert path.""" +import os +import shutil +import tempfile +import numpy as np +import zvec +from zvec import ( + Collection, + CollectionOption, + DataType, + Doc, + FieldSchema, + FlatIndexParam, + MetricType, + VectorSchema, + OptimizeOption, +) +from zvec.typing import QuantizeType + + +def test_inverse_rotation_int8(): + """Test inverse rotation with INT8 streaming quantizer + rotation.""" + dim = 128 + n_docs = 10 + + # Create temp dir for the collection + tmpdir = tempfile.mkdtemp(prefix="zvec_test_inv_rotate_") + coll_path = os.path.join(tmpdir, "collection") + try: + schema = zvec.CollectionSchema( + name="test_inv_rotate", + fields=[FieldSchema("id", DataType.INT64, nullable=False)], + vectors=[ + VectorSchema( + "embedding", + DataType.VECTOR_FP32, + dimension=dim, + index_param=FlatIndexParam( + metric_type=MetricType.IP, + quantize_type=QuantizeType.INT8, + enable_rotate=True, + ), + ), + ], + ) + + collection = zvec.create_and_open( + path=coll_path, + schema=schema, + option=CollectionOption(read_only=False), + ) + + # Generate random vectors + np.random.seed(42) + docs = [] + original_vecs = {} + for i in range(n_docs): + vec = np.random.randn(dim).astype(np.float32) + vec = vec / np.linalg.norm(vec) # Normalize for IP + original_vecs[str(i)] = vec + docs.append( + Doc( + 
id=str(i), + fields={"id": i}, + vectors={"embedding": vec.tolist()}, + ) + ) + + # Insert + for doc in docs: + result = collection.insert(doc) + assert result.ok(), f"Insert failed: {result.code()}" + + collection.flush() + + # Optimize to trigger reformer build + collection.optimize(option=OptimizeOption()) + import time + time.sleep(2) # Wait for optimization to complete + + # Fetch vectors back (triggers revert path) + ids = [str(i) for i in range(n_docs)] + fetched = collection.fetch(ids=ids) + + assert len(fetched) == n_docs, f"Expected {n_docs} docs, got {len(fetched)}" + + # Compare fetched vectors with originals + max_error = 0.0 + avg_error = 0.0 + print("\nDiagnostic: first fetched vector vs original:") + for i, doc_id in enumerate(ids): + assert doc_id in fetched, f"Doc {doc_id} not found in fetched results" + fetched_vec = np.array(fetched[doc_id].vector("embedding"), dtype=np.float32) + original_vec = original_vecs[doc_id] + if i == 0: + print(f" fetched[:8] = {fetched_vec[:8]}") + print(f" original[:8] = {original_vec[:8]}") + print(f" fetched shape: {fetched_vec.shape}, original shape: {original_vec.shape}") + error = np.max(np.abs(fetched_vec - original_vec)) + avg_error += np.mean(np.abs(fetched_vec - original_vec)) + max_error = max(max_error, error) + + avg_error /= n_docs + + print(f"\n=== INT8 + Rotate Inverse Rotation Test ===") + print(f"Max absolute error: {max_error:.6f}") + print(f"Avg absolute error: {avg_error:.6f}") + print(f"Number of docs: {n_docs}") + + # The error should be bounded (INT8 quantization introduces some loss) + # With rotation, the error should still be reasonable + assert max_error < 0.5, f"Max error {max_error} too large!" 
+ print("PASSED!") + + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + +def test_inverse_rotation_cosine(): + """Test inverse rotation with COSINE metric + INT8 quantizer + rotation.""" + dim = 128 + n_docs = 10 + + tmpdir = tempfile.mkdtemp(prefix="zvec_test_inv_rotate_cosine_") + coll_path = os.path.join(tmpdir, "collection") + try: + schema = zvec.CollectionSchema( + name="test_inv_rotate_cosine", + fields=[FieldSchema("id", DataType.INT64, nullable=False)], + vectors=[ + VectorSchema( + "embedding", + DataType.VECTOR_FP32, + dimension=dim, + index_param=FlatIndexParam( + metric_type=MetricType.COSINE, + quantize_type=QuantizeType.INT8, + enable_rotate=True, + ), + ), + ], + ) + + collection = zvec.create_and_open( + path=coll_path, + schema=schema, + option=CollectionOption(read_only=False), + ) + + # Generate random vectors + np.random.seed(42) + docs = [] + original_vecs = {} + for i in range(n_docs): + vec = np.random.randn(dim).astype(np.float32) + original_vecs[str(i)] = vec + docs.append( + Doc( + id=str(i), + fields={"id": i}, + vectors={"embedding": vec.tolist()}, + ) + ) + + for doc in docs: + result = collection.insert(doc) + assert result.ok(), f"Insert failed: {result.code()}" + + collection.flush() + collection.optimize(option=OptimizeOption()) + import time + time.sleep(2) + + ids = [str(i) for i in range(n_docs)] + fetched = collection.fetch(ids=ids) + + assert len(fetched) == n_docs, f"Expected {n_docs} docs, got {len(fetched)}" + + max_error = 0.0 + avg_error = 0.0 + for doc_id in ids: + assert doc_id in fetched, f"Doc {doc_id} not found" + fetched_vec = np.array(fetched[doc_id].vector("embedding"), dtype=np.float32) + original_vec = original_vecs[doc_id] + # Normalize both for comparison (COSINE metric normalizes) + fetched_norm = fetched_vec / (np.linalg.norm(fetched_vec) + 1e-8) + original_norm = original_vec / (np.linalg.norm(original_vec) + 1e-8) + error = np.max(np.abs(fetched_norm - original_norm)) + avg_error += 
np.mean(np.abs(fetched_norm - original_norm)) + max_error = max(max_error, error) + + avg_error /= n_docs + + print(f"\n=== COSINE + INT8 + Rotate Inverse Rotation Test ===") + print(f"Max absolute error (normalized): {max_error:.6f}") + print(f"Avg absolute error (normalized): {avg_error:.6f}") + + assert max_error < 0.5, f"Max error {max_error} too large!" + print("PASSED!") + + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + +if __name__ == "__main__": + test_inverse_rotation_int8() + test_inverse_rotation_cosine() + print("\n=== All tests passed! ===") From 848677d3f5dc546e0c00759358278ab67dde98a7 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Thu, 11 Jun 2026 21:03:44 +0800 Subject: [PATCH 20/38] update int8 rotate --- python/zvec/__init__.py | 2 + python/zvec/__init__.pyi | 2 + python/zvec/model/param/__init__.py | 2 + python/zvec/model/param/__init__.pyi | 67 ++++- .../python/model/param/python_param.cc | 232 +++++++++++++----- src/db/index/common/proto_converter.cc | 36 +-- src/db/index/segment/segment.cc | 4 +- src/db/proto/zvec.proto | 17 +- src/include/zvec/db/index_params.h | 120 ++++++--- test_unrotate.py | 209 ---------------- 10 files changed, 352 insertions(+), 339 deletions(-) delete mode 100644 test_unrotate.py diff --git a/python/zvec/__init__.py b/python/zvec/__init__.py index 5fdf9732c..929c75fa2 100644 --- a/python/zvec/__init__.py +++ b/python/zvec/__init__.py @@ -108,6 +108,7 @@ IVFIndexParam, IVFQueryParam, OptimizeOption, + QuantizerParam, VamanaIndexParam, VamanaQueryParam, ) @@ -171,6 +172,7 @@ "HnswQueryParam", "HnswRabitqQueryParam", "IVFQueryParam", + "QuantizerParam", "VamanaIndexParam", "VamanaQueryParam", # Extensions diff --git a/python/zvec/__init__.pyi b/python/zvec/__init__.pyi index dd468cae1..7177a5ec0 100644 --- a/python/zvec/__init__.pyi +++ b/python/zvec/__init__.pyi @@ -28,6 +28,7 @@ from .model.param import ( IVFIndexParam, IVFQueryParam, OptimizeOption, + QuantizerParam, VamanaIndexParam, 
VamanaQueryParam, ) @@ -74,6 +75,7 @@ __all__: list = [ "MetricType", "OptimizeOption", "QuantizeType", + "QuantizerParam", "Query", "ReRanker", "RrfReRanker", diff --git a/python/zvec/model/param/__init__.py b/python/zvec/model/param/__init__.py index 43fc1ddce..084a79d00 100644 --- a/python/zvec/model/param/__init__.py +++ b/python/zvec/model/param/__init__.py @@ -31,6 +31,7 @@ IVFIndexParam, IVFQueryParam, OptimizeOption, + QuantizerParam, VamanaIndexParam, VamanaQueryParam, ) @@ -53,6 +54,7 @@ "IndexOption", "InvertIndexParam", "OptimizeOption", + "QuantizerParam", "VamanaIndexParam", "VamanaQueryParam", ] diff --git a/python/zvec/model/param/__init__.pyi b/python/zvec/model/param/__init__.pyi index 46c0dda87..700500d50 100644 --- a/python/zvec/model/param/__init__.pyi +++ b/python/zvec/model/param/__init__.pyi @@ -24,6 +24,7 @@ __all__: list[str] = [ "IndexParam", "InvertIndexParam", "OptimizeOption", + "QuantizerParam", "QueryParam", "SegmentOption", "VectorIndexParam", @@ -145,9 +146,8 @@ class FlatIndexParam(VectorIndexParam): quantize_type (QuantizeType): Optional quantization type for vector compression (e.g., FP16, INT8). Use ``QuantizeType.UNDEFINED`` to disable quantization. Default is ``QuantizeType.UNDEFINED``. - enable_rotate (bool): Whether to apply random rotation before INT8 - quantization. Only effective with quantize_type=INT8. - Default is ``False``. + quantizer_param (QuantizerParam): Quantizer configuration (e.g., enable_rotate). + Default is ``QuantizerParam()``. Examples: >>> from zvec.typing import MetricType, QuantizeType @@ -164,7 +164,7 @@ class FlatIndexParam(VectorIndexParam): self, metric_type: _zvec.typing.MetricType = ..., quantize_type: _zvec.typing.QuantizeType = ..., - enable_rotate: bool = ..., + quantizer_param: QuantizerParam = ..., ) -> None: """ Constructs a FlatIndexParam instance. @@ -173,9 +173,8 @@ class FlatIndexParam(VectorIndexParam): metric_type (MetricType, optional): Distance metric. Defaults to MetricType.IP. 
quantize_type (QuantizeType, optional): Vector quantization type. Defaults to QuantizeType.UNDEFINED (no quantization). - enable_rotate (bool, optional): Whether to apply random rotation before - INT8 quantization. Only effective with quantize_type=INT8. - Defaults to False. + quantizer_param (QuantizerParam, optional): Quantizer configuration. + Defaults to QuantizerParam(). """ def __repr__(self) -> str: ... @@ -231,6 +230,7 @@ class HnswIndexParam(VectorIndexParam): ef_construction: typing.SupportsInt = 500, quantize_type: _zvec.typing.QuantizeType = ..., use_contiguous_memory: bool = False, + quantizer_param: QuantizerParam = ..., ) -> None: ... def __repr__(self) -> str: ... def __setstate__(self, arg0: tuple) -> None: ... @@ -479,6 +479,7 @@ class IVFIndexParam(VectorIndexParam): n_iters: typing.SupportsInt = 10, use_soar: bool = False, quantize_type: _zvec.typing.QuantizeType = ..., + quantizer_param: QuantizerParam = ..., ) -> None: """ Constructs an IVFIndexParam instance. @@ -492,6 +493,8 @@ class IVFIndexParam(VectorIndexParam): use_soar (bool, optional): Enable SOAR optimization. Defaults to False. quantize_type (QuantizeType, optional): Vector quantization type. Defaults to QuantizeType.UNDEFINED. + quantizer_param (QuantizerParam, optional): Quantizer configuration. + Defaults to QuantizerParam(). """ def __repr__(self) -> str: ... @@ -791,6 +794,49 @@ class SegmentOption: bool: Whether the segment is read-only. """ +class QuantizerParam: + """ + + Parameters for quantizer configuration. + + Encapsulates quantization-related settings such as enable_rotate. + Designed for future extensibility. + + Attributes: + enable_rotate (bool): Whether to apply random rotation before INT8 + quantization to reduce quantization error. + Only effective with quantize_type=INT8. Defaults to False. + + Examples: + >>> qp = QuantizerParam(enable_rotate=True) + >>> print(qp.enable_rotate) + True + """ + + def __getstate__(self) -> tuple: ... 
+ def __init__(self, enable_rotate: bool = False) -> None: + """ + Constructs a QuantizerParam instance. + + Args: + enable_rotate (bool, optional): Whether to apply random rotation + before INT8 quantization. Defaults to False. + """ + + def __repr__(self) -> str: ... + def __setstate__(self, arg0: tuple) -> None: ... + def __eq__(self, arg0: typing.Any) -> bool: ... + def to_dict(self) -> dict: + """ + Convert to dictionary with all fields + """ + + @property + def enable_rotate(self) -> bool: + """ + bool: Whether random rotation is enabled before INT8 quantization. + """ + class VectorIndexParam(IndexParam): """ @@ -802,6 +848,7 @@ class VectorIndexParam(IndexParam): type (IndexType): The specific vector index type (e.g., HNSW, FLAT). metric_type (MetricType): Distance metric used for similarity search. quantize_type (QuantizeType): Optional vector quantization type. + quantizer_param (QuantizerParam): Quantizer configuration (e.g., enable_rotate). """ def __getstate__(self) -> tuple: ... @@ -823,6 +870,12 @@ class VectorIndexParam(IndexParam): QuantizeType: Vector quantization type (e.g., FP16, INT8). """ + @property + def quantizer_param(self) -> QuantizerParam: + """ + QuantizerParam: Quantizer configuration including enable_rotate. + """ + class _SearchQuery: field_name: str filter: str diff --git a/src/binding/python/model/param/python_param.cc b/src/binding/python/model/param/python_param.cc index 4d91b986a..a5cad0d7b 100644 --- a/src/binding/python/model/param/python_param.cc +++ b/src/binding/python/model/param/python_param.cc @@ -338,6 +338,62 @@ Constructs an FtsIndexParam instance. t[2].cast()); })); + // binding QuantizerParam + py::class_> quantizer_param( + m, "QuantizerParam", R"pbdoc( +Parameters for quantizer configuration. + +Encapsulates quantization-related settings such as enable_rotate. +Designed for future extensibility. 
+ +Attributes: + enable_rotate (bool): Whether to apply random rotation before INT8 + quantization to reduce quantization error. + Only effective with quantize_type=INT8. Defaults to False. + +Examples: + >>> qp = QuantizerParam(enable_rotate=True) + >>> print(qp.enable_rotate) + True +)pbdoc"); + quantizer_param + .def(py::init(), py::arg("enable_rotate") = false) + .def_property_readonly( + "enable_rotate", + [](const QuantizerParam &self) -> bool { + return self.enable_rotate(); + }, + "bool: Whether random rotation is enabled before INT8 quantization.") + .def( + "to_dict", + [](const QuantizerParam &self) -> py::dict { + py::dict dict; + dict["enable_rotate"] = self.enable_rotate(); + return dict; + }, + "Convert to dictionary with all fields") + .def("__repr__", + [](const QuantizerParam &self) -> std::string { + return "{\"enable_rotate\":" + + std::string(self.enable_rotate() ? "true" : "false") + "}"; + }) + .def( + "__eq__", + [](const QuantizerParam &self, const py::object &other) { + if (!py::isinstance(other)) return false; + return self == other.cast(); + }, + py::is_operator()) + .def(py::pickle( + [](const QuantizerParam &self) { + return py::make_tuple(self.enable_rotate()); + }, + [](py::tuple t) { + if (t.size() != 1) + throw std::runtime_error("Invalid state for QuantizerParam"); + return std::make_shared(t[0].cast()); + })); + // binding base vector index params py::class_> vector_params(m, "VectorIndexParam", R"pbdoc( @@ -349,6 +405,7 @@ Encapsulates common settings for all vector index types. type (IndexType): The specific vector index type (e.g., HNSW, FLAT). metric_type (MetricType): Distance metric used for similarity search. quantize_type (QuantizeType): Optional vector quantization type. + quantizer_param (QuantizerParam): Quantizer configuration (e.g., enable_rotate). )pbdoc"); vector_params .def_property_readonly( @@ -364,13 +421,11 @@ Encapsulates common settings for all vector index types. 
}, "QuantizeType: Vector quantization type (e.g., FP16, INT8).") .def_property_readonly( - "enable_rotate", - [](const VectorIndexParams &self) -> bool { - return self.enable_rotate(); + "quantizer_param", + [](const VectorIndexParams &self) -> QuantizerParam { + return self.quantizer_param(); }, - "bool: Whether to apply random rotation before INT8 quantization " - "to reduce quantization error. Only effective with " - "quantize_type=INT8. Defaults to False.") + "QuantizerParam: Quantizer configuration including enable_rotate.") .def( "to_dict", [](const VectorIndexParams &self) -> py::dict { @@ -379,7 +434,9 @@ Encapsulates common settings for all vector index types. dict["metric_type"] = metric_type_to_string(self.metric_type()); dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); - dict["enable_rotate"] = self.enable_rotate(); + py::dict qp_dict; + qp_dict["enable_rotate"] = self.quantizer_param().enable_rotate(); + dict["quantizer_param"] = qp_dict; return dict; }, "Convert to dictionary with all fields") @@ -391,7 +448,7 @@ Encapsulates common settings for all vector index types. [](py::tuple t) { // __setstate__ if (t.size() != 3) throw std::runtime_error("Invalid state for VectorIndexParams"); - // 基类,不能直接实例化,用于子类 + // Base class, cannot instantiate directly, used by subclasses return std::shared_ptr(); })); @@ -430,14 +487,20 @@ encapsulates its construction hyperparameters. 
{'metric_type': 'IP', 'm': 16, 'ef_construction': 200, 'quantize_type': 'INT8', 'use_contiguous_memory': True} )pbdoc"); hnsw_params - .def(py::init(), // Added a new parameter; refactored to QuantizerParam in future + .def(py::init([](MetricType metric_type, int m, int ef_construction, + QuantizeType quantize_type, bool use_contiguous_memory, + QuantizerParam quantizer_param) { + return std::make_shared( + metric_type, m, ef_construction, quantize_type, + use_contiguous_memory, quantizer_param); + }), py::arg("metric_type") = MetricType::IP, py::arg("m") = core_interface::kDefaultHnswNeighborCnt, py::arg("ef_construction") = core_interface::kDefaultHnswEfConstruction, py::arg("quantize_type") = QuantizeType::UNDEFINED, py::arg("use_contiguous_memory") = false, - py::arg("enable_rotate") = false) + py::arg("quantizer_param") = QuantizerParam()) .def_property_readonly( "m", &HnswIndexParams::m, "int: Maximum number of neighbors per node in upper layers.") @@ -460,7 +523,10 @@ encapsulates its construction hyperparameters. dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); dict["use_contiguous_memory"] = self.use_contiguous_memory(); - dict["enable_rotate"] = self.enable_rotate(); + py::dict qp_dict; + qp_dict["enable_rotate"] = + self.quantizer_param().enable_rotate(); + dict["quantizer_param"] = qp_dict; return dict; }, "Convert to dictionary with all fields") @@ -476,24 +542,27 @@ encapsulates its construction hyperparameters. quantize_type_to_string(self.quantize_type()) + ", \"use_contiguous_memory\":" + (self.use_contiguous_memory() ? "true" : "false") + - ", \"enable_rotate\":" + - (self.enable_rotate() ? "true" : "false") + "}"; + ", \"quantizer_param\":{" + + "\"enable_rotate\":" + + (self.quantizer_param().enable_rotate() ? 
"true" + : "false") + + "}}"; }) .def(py::pickle( [](const HnswIndexParams &self) { return py::make_tuple(self.metric_type(), self.m(), self.ef_construction(), self.quantize_type(), self.use_contiguous_memory(), - self.enable_rotate()); + self.quantizer_param().enable_rotate()); }, [](py::tuple t) { if (t.size() != 5 && t.size() != 6) throw std::runtime_error("Invalid state for HnswIndexParams"); - bool enable_rotate = t.size() >= 6 ? t[5].cast() : false; + QuantizerParam qp(t.size() >= 6 ? t[5].cast() : false); return std::make_shared( t[0].cast(), t[1].cast(), t[2].cast(), t[3].cast(), t[4].cast(), - enable_rotate); + qp); })); // binding hnsw rabitq index params @@ -642,8 +711,16 @@ its construction hyperparameters. ... ) )pbdoc"); vamana_params - .def(py::init(), + .def(py::init([](MetricType metric_type, int max_degree, + int search_list_size, float alpha, bool saturate_graph, + bool use_contiguous_memory, bool use_id_map, + QuantizeType quantize_type, + QuantizerParam quantizer_param) { + return std::make_shared( + metric_type, max_degree, search_list_size, alpha, + saturate_graph, use_contiguous_memory, use_id_map, + quantize_type, quantizer_param); + }), py::arg("metric_type") = MetricType::IP, py::arg("max_degree") = core_interface::kDefaultVamanaMaxDegree, py::arg("search_list_size") = @@ -654,7 +731,7 @@ its construction hyperparameters. py::arg("use_contiguous_memory") = false, py::arg("use_id_map") = false, py::arg("quantize_type") = QuantizeType::UNDEFINED, - py::arg("enable_rotate") = false) + py::arg("quantizer_param") = QuantizerParam()) .def_property_readonly( "max_degree", &VamanaIndexParams::max_degree, "int: Maximum out-degree (R) of every node in the Vamana graph.") @@ -690,7 +767,10 @@ its construction hyperparameters. 
dict["use_id_map"] = self.use_id_map(); dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); - dict["enable_rotate"] = self.enable_rotate(); + py::dict qp_dict; + qp_dict["enable_rotate"] = + self.quantizer_param().enable_rotate(); + dict["quantizer_param"] = qp_dict; return dict; }, "Convert to dictionary with all fields") @@ -714,8 +794,11 @@ its construction hyperparameters. std::string(self.use_id_map() ? "true" : "false") + ", \"quantize_type\":\"" + quantize_type_to_string(self.quantize_type()) + - "\", \"enable_rotate\":" + - std::string(self.enable_rotate() ? "true" : "false") + "}"; + "\", \"quantizer_param\":{" + + "\"enable_rotate\":" + + (self.quantizer_param().enable_rotate() ? "true" + : "false") + + "}}"; }) .def(py::pickle( [](const VamanaIndexParams &self) { @@ -724,17 +807,17 @@ its construction hyperparameters. self.saturate_graph(), self.use_contiguous_memory(), self.use_id_map(), self.quantize_type(), - self.enable_rotate()); + self.quantizer_param().enable_rotate()); }, [](py::tuple t) { if (t.size() != 8 && t.size() != 9) throw std::runtime_error("Invalid state for VamanaIndexParams"); - bool enable_rotate = t.size() >= 9 ? t[8].cast() : false; + QuantizerParam qp(t.size() >= 9 ? t[8].cast() : false); return std::make_shared( t[0].cast(), t[1].cast(), t[2].cast(), t[3].cast(), t[4].cast(), t[5].cast(), t[6].cast(), t[7].cast(), - enable_rotate); + qp); })); // FlatIndexParams @@ -764,10 +847,14 @@ suitable for small to medium datasets or as a baseline. 
{'metric_type': 'L2', 'quantize_type': 'FP16'} )pbdoc"); flat_params - .def(py::init(), + .def(py::init([](MetricType metric_type, QuantizeType quantize_type, + QuantizerParam quantizer_param) { + return std::make_shared( + metric_type, quantize_type, quantizer_param); + }), py::arg("metric_type") = MetricType::IP, py::arg("quantize_type") = QuantizeType::UNDEFINED, - py::arg("enable_rotate") = false, + py::arg("quantizer_param") = QuantizerParam(), R"pbdoc( Constructs a FlatIndexParam instance. @@ -775,9 +862,8 @@ Constructs a FlatIndexParam instance. metric_type (MetricType, optional): Distance metric. Defaults to MetricType.IP. quantize_type (QuantizeType, optional): Vector quantization type. Defaults to QuantizeType.UNDEFINED (no quantization). - enable_rotate (bool, optional): Whether to apply random rotation before - INT8 quantization. Only effective with quantize_type=INT8. - Defaults to False. + quantizer_param (QuantizerParam, optional): Quantizer configuration. + Defaults to QuantizerParam(). )pbdoc") .def( "to_dict", @@ -786,7 +872,10 @@ Constructs a FlatIndexParam instance. dict["metric_type"] = metric_type_to_string(self.metric_type()); dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); - dict["enable_rotate"] = self.enable_rotate(); + py::dict qp_dict; + qp_dict["enable_rotate"] = + self.quantizer_param().enable_rotate(); + dict["quantizer_param"] = qp_dict; return dict; }, "Convert to dictionary with all fields") @@ -797,21 +886,24 @@ Constructs a FlatIndexParam instance. metric_type_to_string(self.metric_type()) + ", \"quantize_type\":" + quantize_type_to_string(self.quantize_type()) + - ", \"enable_rotate\":" + - (self.enable_rotate() ? "true" : "false") + "}"; + ", \"quantizer_param\":{" + + "\"enable_rotate\":" + + (self.quantizer_param().enable_rotate() ? 
"true" + : "false") + + "}}"; }) .def(py::pickle( [](const FlatIndexParams &self) { return py::make_tuple(self.metric_type(), self.quantize_type(), - self.enable_rotate()); + self.quantizer_param().enable_rotate()); }, [](py::tuple t) { if (t.size() != 2 && t.size() != 3) throw std::runtime_error("Invalid state for FlatIndexParams"); - bool enable_rotate = t.size() >= 3 ? t[2].cast() : false; + QuantizerParam qp(t.size() >= 3 ? t[2].cast() : false); return std::make_shared(t[0].cast(), t[1].cast(), - enable_rotate); + qp); })); // IVFIndexParams @@ -848,11 +940,17 @@ and accuracy. 100 )pbdoc"); ivf_params - .def(py::init(), + .def(py::init([](MetricType metric_type, int n_list, int n_iters, + bool use_soar, QuantizeType quantize_type, + QuantizerParam quantizer_param) { + return std::make_shared( + metric_type, n_list, n_iters, use_soar, quantize_type, + quantizer_param); + }), py::arg("metric_type") = MetricType::IP, py::arg("n_list") = 10, py::arg("n_iters") = 10, py::arg("use_soar") = false, py::arg("quantize_type") = QuantizeType::UNDEFINED, - py::arg("enable_rotate") = false, + py::arg("quantizer_param") = QuantizerParam(), R"pbdoc( Constructs an IVFIndexParam instance. @@ -865,9 +963,8 @@ Constructs an IVFIndexParam instance. use_soar (bool, optional): Enable SOAR optimization. Defaults to False. quantize_type (QuantizeType, optional): Vector quantization type. Defaults to QuantizeType.UNDEFINED. - enable_rotate (bool, optional): Whether to apply random rotation before - INT8 quantization. Only effective with quantize_type=INT8. - Defaults to False. + quantizer_param (QuantizerParam, optional): Quantizer configuration. + Defaults to QuantizerParam(). )pbdoc") .def_property_readonly("n_list", &IVFIndexParams::n_list, "int: Number of inverted lists.") @@ -887,7 +984,10 @@ Constructs an IVFIndexParam instance. 
dict["use_soar"] = self.use_soar(); dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); - dict["enable_rotate"] = self.enable_rotate(); + py::dict qp_dict; + qp_dict["enable_rotate"] = + self.quantizer_param().enable_rotate(); + dict["quantizer_param"] = qp_dict; return dict; }, "Convert to dictionary with all fields") @@ -901,23 +1001,27 @@ Constructs an IVFIndexParam instance. ", \"use_soar\":" + std::to_string(self.use_soar()) + ", \"quantize_type\":" + quantize_type_to_string(self.quantize_type()) + - ", \"enable_rotate\":" + - (self.enable_rotate() ? "true" : "false") + "}"; + ", \"quantizer_param\":{" + + "\"enable_rotate\":" + + (self.quantizer_param().enable_rotate() ? "true" + : "false") + + "}}"; }) .def(py::pickle( [](const IVFIndexParams &self) { return py::make_tuple(self.metric_type(), self.n_list(), self.n_iters(), self.use_soar(), - self.quantize_type(), self.enable_rotate()); + self.quantize_type(), + self.quantizer_param().enable_rotate()); }, [](py::tuple t) { if (t.size() != 5 && t.size() != 6) throw std::runtime_error("Invalid state for IVFIndexParams"); - bool enable_rotate = t.size() >= 6 ? t[5].cast() : false; + QuantizerParam qp(t.size() >= 6 ? t[5].cast() : false); return std::make_shared( t[0].cast(), t[1].cast(), t[2].cast(), t[3].cast(), t[4].cast(), - enable_rotate); + qp); })); // DiskAnnIndexParams @@ -957,11 +1061,17 @@ only compressed vector will be loaded into memory. 
By this way, search memory at 100 )pbdoc"); diskann_params - .def(py::init(), + .def(py::init([](MetricType metric_type, int max_degree, int list_size, + int pq_chunk_num, QuantizeType quantize_type, + QuantizerParam quantizer_param) { + return std::make_shared( + metric_type, max_degree, list_size, pq_chunk_num, + quantize_type, quantizer_param); + }), py::arg("metric_type") = MetricType::IP, py::arg("max_degree") = 100, py::arg("list_size") = 50, py::arg("pq_chunk_num") = 0, py::arg("quantize_type") = QuantizeType::UNDEFINED, - py::arg("enable_rotate") = false, + py::arg("quantizer_param") = QuantizerParam(), R"pbdoc( Constructs an DiskAnnIndexParams instance. @@ -976,9 +1086,8 @@ Constructs an DiskAnnIndexParams instance. Clamped to [1, 1024]. Defaults to 0. quantize_type (QuantizeType, optional): Vector quantization type. Defaults to QuantizeType.UNDEFINED. - enable_rotate (bool, optional): Whether to apply random rotation before - INT8 quantization. Only effective with quantize_type=INT8. - Defaults to False. + quantizer_param (QuantizerParam, optional): Quantizer configuration. + Defaults to QuantizerParam(). )pbdoc") .def_property_readonly("max_degree", &DiskAnnIndexParams::max_degree, "int: max node degree.") @@ -1001,7 +1110,10 @@ Constructs an DiskAnnIndexParams instance. dict["pq_chunk_num"] = self.pq_chunk_num(); dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); - dict["enable_rotate"] = self.enable_rotate(); + py::dict qp_dict; + qp_dict["enable_rotate"] = + self.quantizer_param().enable_rotate(); + dict["quantizer_param"] = qp_dict; return dict; }, "Convert to dictionary with all fields") @@ -1016,23 +1128,27 @@ Constructs an DiskAnnIndexParams instance. ", \"pq_chunk_num\":" + std::to_string(self.pq_chunk_num()) + ", \"quantize_type\":" + quantize_type_to_string(self.quantize_type()) + - ", \"enable_rotate\":" + - (self.enable_rotate() ? 
"true" : "false") + "}"; + ", \"quantizer_param\":{" + + "\"enable_rotate\":" + + (self.quantizer_param().enable_rotate() ? "true" + : "false") + + "}}"; }) .def(py::pickle( [](const DiskAnnIndexParams &self) { return py::make_tuple(self.metric_type(), self.max_degree(), self.list_size(), self.pq_chunk_num(), - self.quantize_type(), self.enable_rotate()); + self.quantize_type(), + self.quantizer_param().enable_rotate()); }, [](py::tuple t) { if (t.size() != 5 && t.size() != 6) throw std::runtime_error("Invalid state for DiskAnnIndexParams"); - bool enable_rotate = t.size() >= 6 ? t[5].cast() : false; + QuantizerParam qp(t.size() >= 6 ? t[5].cast() : false); return std::make_shared( t[0].cast(), t[1].cast(), t[2].cast(), t[3].cast(), t[4].cast(), - enable_rotate); + qp); })); } diff --git a/src/db/index/common/proto_converter.cc b/src/db/index/common/proto_converter.cc index 10b43dbcc..ce32e42b8 100644 --- a/src/db/index/common/proto_converter.cc +++ b/src/db/index/common/proto_converter.cc @@ -18,16 +18,13 @@ namespace zvec { HnswIndexParams::OPtr ProtoConverter::FromPb( const proto::HnswIndexParams ¶ms_pb) { - // OR merge: support both base.enable_rotate (new) and hnsw.enable_rotate - // (deprecated, for backward compat with old serialized data) - bool enable_rotate = - params_pb.base().enable_rotate() || params_pb.enable_rotate(); + bool enable_rotate = params_pb.base().quantizer_param().enable_rotate(); auto params = std::make_shared( MetricTypeCodeBook::Get(params_pb.base().metric_type()), params_pb.m(), params_pb.ef_construction(), QuantizeTypeCodeBook::Get(params_pb.base().quantize_type()), params_pb.use_contiguous_memory(), - enable_rotate); + QuantizerParam(enable_rotate)); return params; } @@ -38,12 +35,11 @@ proto::HnswIndexParams ProtoConverter::ToPb(const HnswIndexParams *params) { MetricTypeCodeBook::Get(params->metric_type())); params_pb.mutable_base()->set_quantize_type( QuantizeTypeCodeBook::Get(params->quantize_type())); - 
params_pb.mutable_base()->set_enable_rotate(params->enable_rotate()); + params_pb.mutable_base()->mutable_quantizer_param()->set_enable_rotate( + params->quantizer_param().enable_rotate()); params_pb.set_ef_construction(params->ef_construction()); params_pb.set_m(params->m()); params_pb.set_use_contiguous_memory(params->use_contiguous_memory()); - // Also write to deprecated field for backward compat with old readers - params_pb.set_enable_rotate(params->enable_rotate()); return params_pb; } @@ -76,10 +72,11 @@ proto::HnswRabitqIndexParams ProtoConverter::ToPb( // FlatIndexParams FlatIndexParams::OPtr ProtoConverter::FromPb( const proto::FlatIndexParams ¶ms_pb) { + bool enable_rotate = params_pb.base().quantizer_param().enable_rotate(); return std::make_shared( MetricTypeCodeBook::Get(params_pb.base().metric_type()), QuantizeTypeCodeBook::Get(params_pb.base().quantize_type()), - params_pb.base().enable_rotate()); + QuantizerParam(enable_rotate)); } proto::FlatIndexParams ProtoConverter::ToPb(const FlatIndexParams *params) { @@ -88,18 +85,20 @@ proto::FlatIndexParams ProtoConverter::ToPb(const FlatIndexParams *params) { MetricTypeCodeBook::Get(params->metric_type())); params_pb.mutable_base()->set_quantize_type( QuantizeTypeCodeBook::Get(params->quantize_type())); - params_pb.mutable_base()->set_enable_rotate(params->enable_rotate()); + params_pb.mutable_base()->mutable_quantizer_param()->set_enable_rotate( + params->quantizer_param().enable_rotate()); return params_pb; } // IVFIndexParams IVFIndexParams::OPtr ProtoConverter::FromPb( const proto::IVFIndexParams ¶ms_pb) { + bool enable_rotate = params_pb.base().quantizer_param().enable_rotate(); return std::make_shared( MetricTypeCodeBook::Get(params_pb.base().metric_type()), params_pb.n_list(), params_pb.n_iters(), params_pb.use_soar(), QuantizeTypeCodeBook::Get(params_pb.base().quantize_type()), - params_pb.base().enable_rotate()); + QuantizerParam(enable_rotate)); } proto::IVFIndexParams ProtoConverter::ToPb(const 
IVFIndexParams *params) { @@ -108,7 +107,8 @@ proto::IVFIndexParams ProtoConverter::ToPb(const IVFIndexParams *params) { MetricTypeCodeBook::Get(params->metric_type())); params_pb.mutable_base()->set_quantize_type( QuantizeTypeCodeBook::Get(params->quantize_type())); - params_pb.mutable_base()->set_enable_rotate(params->enable_rotate()); + params_pb.mutable_base()->mutable_quantizer_param()->set_enable_rotate( + params->quantizer_param().enable_rotate()); params_pb.set_n_list(params->n_list()); params_pb.set_n_iters(params->n_iters()); params_pb.set_use_soar(params->use_soar()); @@ -118,13 +118,14 @@ proto::IVFIndexParams ProtoConverter::ToPb(const IVFIndexParams *params) { // VamanaIndexParams VamanaIndexParams::OPtr ProtoConverter::FromPb( const proto::VamanaIndexParams ¶ms_pb) { + bool enable_rotate = params_pb.base().quantizer_param().enable_rotate(); return std::make_shared( MetricTypeCodeBook::Get(params_pb.base().metric_type()), params_pb.max_degree(), params_pb.search_list_size(), params_pb.alpha(), params_pb.saturate_graph(), params_pb.use_contiguous_memory(), params_pb.use_id_map(), QuantizeTypeCodeBook::Get(params_pb.base().quantize_type()), - params_pb.base().enable_rotate()); + QuantizerParam(enable_rotate)); } proto::VamanaIndexParams ProtoConverter::ToPb(const VamanaIndexParams *params) { @@ -133,7 +134,8 @@ proto::VamanaIndexParams ProtoConverter::ToPb(const VamanaIndexParams *params) { MetricTypeCodeBook::Get(params->metric_type())); params_pb.mutable_base()->set_quantize_type( QuantizeTypeCodeBook::Get(params->quantize_type())); - params_pb.mutable_base()->set_enable_rotate(params->enable_rotate()); + params_pb.mutable_base()->mutable_quantizer_param()->set_enable_rotate( + params->quantizer_param().enable_rotate()); params_pb.set_max_degree(params->max_degree()); params_pb.set_search_list_size(params->search_list_size()); params_pb.set_alpha(params->alpha()); @@ -161,11 +163,12 @@ proto::InvertIndexParams ProtoConverter::ToPb(const 
InvertIndexParams *params) { // DiskAnnIndexParams DiskAnnIndexParams::OPtr ProtoConverter::FromPb( const proto::DiskAnnIndexParams ¶ms_pb) { + bool enable_rotate = params_pb.base().quantizer_param().enable_rotate(); return std::make_shared( MetricTypeCodeBook::Get(params_pb.base().metric_type()), params_pb.max_degree(), params_pb.list_size(), params_pb.pq_chunk_num(), QuantizeTypeCodeBook::Get(params_pb.base().quantize_type()), - params_pb.base().enable_rotate()); + QuantizerParam(enable_rotate)); } proto::DiskAnnIndexParams ProtoConverter::ToPb( @@ -175,7 +178,8 @@ proto::DiskAnnIndexParams ProtoConverter::ToPb( MetricTypeCodeBook::Get(params->metric_type())); params_pb.mutable_base()->set_quantize_type( QuantizeTypeCodeBook::Get(params->quantize_type())); - params_pb.mutable_base()->set_enable_rotate(params->enable_rotate()); + params_pb.mutable_base()->mutable_quantizer_param()->set_enable_rotate( + params->quantizer_param().enable_rotate()); params_pb.set_max_degree(params->max_degree()); params_pb.set_list_size(params->list_size()); params_pb.set_pq_chunk_num(params->pq_chunk_num()); diff --git a/src/db/index/segment/segment.cc b/src/db/index/segment/segment.cc index 58173b577..9b613ae97 100644 --- a/src/db/index/segment/segment.cc +++ b/src/db/index/segment/segment.cc @@ -4029,7 +4029,7 @@ Status SegmentImpl::load_vector_index_blocks() { new_field_params.set_index_params(MakeDefaultQuantVectorIndexParams( vector_index_params->metric_type(), vector_index_params->quantize_type(), - vector_index_params->enable_rotate())); + vector_index_params->quantizer_param())); } } @@ -4165,7 +4165,7 @@ Status SegmentImpl::init_memory_components() { FieldSchema normal_quant_field(*field); normal_quant_field.set_index_params(MakeDefaultQuantVectorIndexParams( index_params->metric_type(), index_params->quantize_type(), - index_params->enable_rotate())); + index_params->quantizer_param())); auto quant_vector_indexer = create_vector_indexer( field->name(), normal_quant_field, 
block_id, true); diff --git a/src/db/proto/zvec.proto b/src/db/proto/zvec.proto index a3df7bd93..f2c18f5ad 100644 --- a/src/db/proto/zvec.proto +++ b/src/db/proto/zvec.proto @@ -87,13 +87,19 @@ message InvertIndexParams { bool enable_range_optimization = 1; }; +// Quantizer-related parameters for vector indexes. +// Designed for future extensibility. +message QuantizerParam { + // When enabled, vectors are rotated before INT8 quantization to reduce + // quantization error. Only effective with quantize_type=INT8. + bool enable_rotate = 1; +}; + message BaseIndexParams { MetricType metric_type = 1; QuantizeType quantize_type = 2; - // When enabled, vectors are rotated before INT8 quantization to reduce - // quantization error. Only effective with quantize_type=INT8. - // Shared by all vector index types. - bool enable_rotate = 3; + // Quantizer parameters (enable_rotate, etc.) + QuantizerParam quantizer_param = 4; }; message HnswIndexParams { @@ -104,9 +110,6 @@ message HnswIndexParams { // arena for all graph nodes, which improves cache locality / search // throughput at the cost of peak memory usage. Defaults to false. bool use_contiguous_memory = 4; - // Deprecated: use BaseIndexParams.enable_rotate instead. - // Kept for backward compatibility with old serialized data. - bool enable_rotate = 5 [deprecated = true]; } message HnswRabitqIndexParams { diff --git a/src/include/zvec/db/index_params.h b/src/include/zvec/db/index_params.h index 1223105df..6c0fbcb10 100644 --- a/src/include/zvec/db/index_params.h +++ b/src/include/zvec/db/index_params.h @@ -118,6 +118,39 @@ class InvertIndexParams : public IndexParams { bool enable_extended_wildcard_{false}; }; +/* + * Quantizer parameters for vector indexes. + * Encapsulates quantization-related settings such as enable_rotate. + * Designed for future extensibility (e.g., num_bits, calibration_size). 
+ */ +class QuantizerParam { + public: + QuantizerParam() = default; + explicit QuantizerParam(bool enable_rotate) + : enable_rotate_(enable_rotate) {} + + bool enable_rotate() const { + return enable_rotate_; + } + + void set_enable_rotate(bool v) { + enable_rotate_ = v; + } + + bool operator==(const QuantizerParam &other) const { + return enable_rotate_ == other.enable_rotate_; + } + + bool operator!=(const QuantizerParam &other) const { + return !(*this == other); + } + + private: + // When enabled, vectors are rotated before INT8 quantization to reduce + // quantization error. Only effective with quantize_type=INT8. + bool enable_rotate_{false}; +}; + /* * Column index params */ @@ -125,11 +158,11 @@ class VectorIndexParams : public IndexParams { public: VectorIndexParams(IndexType type, MetricType metric_type, QuantizeType quantize_type = QuantizeType::UNDEFINED, - bool enable_rotate = false) + QuantizerParam quantizer_param = {}) : IndexParams(type), metric_type_(metric_type), quantize_type_(quantize_type), - enable_rotate_(enable_rotate) {} + quantizer_param_(quantizer_param) {} ~VectorIndexParams() override = default; @@ -153,20 +186,23 @@ class VectorIndexParams : public IndexParams { quantize_type_ = quantize_type; } - bool enable_rotate() const { - return enable_rotate_; + const QuantizerParam &quantizer_param() const { + return quantizer_param_; + } + + void set_quantizer_param(const QuantizerParam &quantizer_param) { + quantizer_param_ = quantizer_param; } - void set_enable_rotate(bool enable_rotate) { - enable_rotate_ = enable_rotate; + // Convenience getter for internal use (engine_helper, segment, etc.) + bool enable_rotate() const { + return quantizer_param_.enable_rotate(); } protected: MetricType metric_type_; QuantizeType quantize_type_; - // When enabled, vectors are rotated before INT8 quantization to reduce - // quantization error. Only effective with quantize_type=INT8. 
- bool enable_rotate_{false}; + QuantizerParam quantizer_param_; }; /* @@ -179,9 +215,9 @@ class HnswIndexParams : public VectorIndexParams { int ef_construction = core_interface::kDefaultHnswEfConstruction, QuantizeType quantize_type = QuantizeType::UNDEFINED, bool use_contiguous_memory = false, - bool enable_rotate = false) + QuantizerParam quantizer_param = {}) : VectorIndexParams(IndexType::HNSW, metric_type, quantize_type, - enable_rotate), + quantizer_param), m_(m), ef_construction_(ef_construction), use_contiguous_memory_(use_contiguous_memory) {} @@ -193,7 +229,7 @@ class HnswIndexParams : public VectorIndexParams { return std::make_shared(metric_type_, m_, ef_construction_, quantize_type_, use_contiguous_memory_, - enable_rotate_); + quantizer_param_); } std::string to_string() const override { @@ -204,7 +240,7 @@ class HnswIndexParams : public VectorIndexParams { << ",use_contiguous_memory:" << (use_contiguous_memory_ ? "true" : "false") << ",enable_rotate:" - << (enable_rotate_ ? "true" : "false") << "}"; + << (quantizer_param_.enable_rotate() ? 
"true" : "false") << "}"; return oss.str(); } @@ -219,8 +255,8 @@ class HnswIndexParams : public VectorIndexParams { static_cast(other).quantize_type() && use_contiguous_memory_ == static_cast(other) .use_contiguous_memory_ && - enable_rotate_ == static_cast(other) - .enable_rotate_; + quantizer_param_ == static_cast(other) + .quantizer_param_; } void set_m(int m) { @@ -369,16 +405,16 @@ class FlatIndexParams : public VectorIndexParams { public: FlatIndexParams(MetricType metric_type, QuantizeType quantize_type = QuantizeType::UNDEFINED, - bool enable_rotate = false) + QuantizerParam quantizer_param = {}) : VectorIndexParams(IndexType::FLAT, metric_type, quantize_type, - enable_rotate) {} + quantizer_param) {} using OPtr = std::shared_ptr; public: Ptr clone() const override { return std::make_shared(metric_type_, quantize_type_, - enable_rotate_); + quantizer_param_); } std::string to_string() const override { @@ -386,7 +422,8 @@ class FlatIndexParams : public VectorIndexParams { metric_type_, quantize_type_); std::ostringstream oss; oss << base_str - << ",enable_rotate:" << (enable_rotate_ ? "true" : "false") << "}"; + << ",enable_rotate:" + << (quantizer_param_.enable_rotate() ? 
"true" : "false") << "}"; return oss.str(); } @@ -396,8 +433,8 @@ class FlatIndexParams : public VectorIndexParams { static_cast(other).metric_type() && quantize_type() == static_cast(other).quantize_type() && - enable_rotate_ == - static_cast(other).enable_rotate(); + quantizer_param() == + static_cast(other).quantizer_param(); } }; @@ -410,8 +447,8 @@ inline FlatIndexParams MakeDefaultVectorIndexParams(MetricType metric_type) { inline FlatIndexParams MakeDefaultQuantVectorIndexParams( MetricType metric_type, QuantizeType quantize_type, - bool enable_rotate = false) { - return FlatIndexParams(metric_type, quantize_type, enable_rotate); + QuantizerParam quantizer_param = {}) { + return FlatIndexParams(metric_type, quantize_type, quantizer_param); } class IVFIndexParams : public VectorIndexParams { @@ -419,9 +456,9 @@ class IVFIndexParams : public VectorIndexParams { IVFIndexParams(MetricType metric_type, int n_list = 1024, int n_iters = 10, bool use_soar = false, QuantizeType quantize_type = QuantizeType::UNDEFINED, - bool enable_rotate = false) + QuantizerParam quantizer_param = {}) : VectorIndexParams(IndexType::IVF, metric_type, quantize_type, - enable_rotate), + quantizer_param), n_list_(n_list), n_iters_(n_iters), use_soar_(use_soar) {} @@ -432,7 +469,7 @@ class IVFIndexParams : public VectorIndexParams { Ptr clone() const override { return std::make_shared(metric_type_, n_list_, n_iters_, use_soar_, quantize_type_, - enable_rotate_); + quantizer_param_); } std::string to_string() const override { @@ -440,7 +477,8 @@ class IVFIndexParams : public VectorIndexParams { metric_type_, quantize_type_); std::ostringstream oss; oss << base_str << ",n_list:" << n_list_ << ",n_iters:" << n_iters_ - << ",enable_rotate:" << (enable_rotate_ ? "true" : "false") << "}"; + << ",enable_rotate:" + << (quantizer_param_.enable_rotate() ? 
"true" : "false") << "}"; return oss.str(); } @@ -477,8 +515,8 @@ class IVFIndexParams : public VectorIndexParams { use_soar_ == static_cast(other).use_soar_ && quantize_type() == static_cast(other).quantize_type() && - enable_rotate_ == - static_cast(other).enable_rotate_; + quantizer_param_ == + static_cast(other).quantizer_param_; } private: @@ -492,9 +530,9 @@ class DiskAnnIndexParams : public VectorIndexParams { DiskAnnIndexParams(MetricType metric_type, int max_degree = 100, int list_size = 50, int pq_chunk_num = 0, QuantizeType quantize_type = QuantizeType::UNDEFINED, - bool enable_rotate = false) + QuantizerParam quantizer_param = {}) : VectorIndexParams(IndexType::DISKANN, metric_type, quantize_type, - enable_rotate), + quantizer_param), max_degree_{max_degree}, list_size_{list_size}, pq_chunk_num_{pq_chunk_num} {} @@ -505,7 +543,7 @@ class DiskAnnIndexParams : public VectorIndexParams { Ptr clone() const override { return std::make_shared( metric_type_, max_degree_, list_size_, pq_chunk_num_, quantize_type_, - enable_rotate_); + quantizer_param_); } std::string to_string() const override { @@ -514,7 +552,8 @@ class DiskAnnIndexParams : public VectorIndexParams { std::ostringstream oss; oss << base_str << ",max_degree:" << max_degree_ << ",list_size:" << list_size_ << ", pq_chunk_num:" << pq_chunk_num_ - << ",enable_rotate:" << (enable_rotate_ ? "true" : "false") << "}"; + << ",enable_rotate:" + << (quantizer_param_.enable_rotate() ? 
"true" : "false") << "}"; return oss.str(); } @@ -554,8 +593,8 @@ class DiskAnnIndexParams : public VectorIndexParams { static_cast(other).pq_chunk_num_ && quantize_type() == static_cast(other).quantize_type() && - enable_rotate_ == - static_cast(other).enable_rotate_; + quantizer_param_ == + static_cast(other).quantizer_param_; } private: @@ -577,9 +616,9 @@ class VamanaIndexParams : public VectorIndexParams { bool saturate_graph = core_interface::kDefaultVamanaSaturateGraph, bool use_contiguous_memory = false, bool use_id_map = false, QuantizeType quantize_type = QuantizeType::UNDEFINED, - bool enable_rotate = false) + QuantizerParam quantizer_param = {}) : VectorIndexParams(IndexType::VAMANA, metric_type, quantize_type, - enable_rotate), + quantizer_param), max_degree_(max_degree), search_list_size_(search_list_size), alpha_(alpha), @@ -593,7 +632,7 @@ class VamanaIndexParams : public VectorIndexParams { Ptr clone() const override { return std::make_shared( metric_type_, max_degree_, search_list_size_, alpha_, saturate_graph_, - use_contiguous_memory_, use_id_map_, quantize_type_, enable_rotate_); + use_contiguous_memory_, use_id_map_, quantize_type_, quantizer_param_); } std::string to_string() const override { @@ -606,7 +645,8 @@ class VamanaIndexParams : public VectorIndexParams { << ",use_contiguous_memory:" << (use_contiguous_memory_ ? "true" : "false") << ",use_id_map:" << (use_id_map_ ? "true" : "false") - << ",enable_rotate:" << (enable_rotate_ ? "true" : "false") << "}"; + << ",enable_rotate:" + << (quantizer_param_.enable_rotate() ? 
"true" : "false") << "}"; return oss.str(); } @@ -622,7 +662,7 @@ class VamanaIndexParams : public VectorIndexParams { saturate_graph_ == rhs.saturate_graph_ && use_contiguous_memory_ == rhs.use_contiguous_memory_ && use_id_map_ == rhs.use_id_map_ && - enable_rotate_ == rhs.enable_rotate_; + quantizer_param_ == rhs.quantizer_param_; } int max_degree() const { diff --git a/test_unrotate.py b/test_unrotate.py deleted file mode 100644 index d1260f414..000000000 --- a/test_unrotate.py +++ /dev/null @@ -1,209 +0,0 @@ -"""Test inverse rotation: verify that vectors can be recovered after -rotation + quantization via the revert path.""" -import os -import shutil -import tempfile -import numpy as np -import zvec -from zvec import ( - Collection, - CollectionOption, - DataType, - Doc, - FieldSchema, - FlatIndexParam, - MetricType, - VectorSchema, - OptimizeOption, -) -from zvec.typing import QuantizeType - - -def test_inverse_rotation_int8(): - """Test inverse rotation with INT8 streaming quantizer + rotation.""" - dim = 128 - n_docs = 10 - - # Create temp dir for the collection - tmpdir = tempfile.mkdtemp(prefix="zvec_test_inv_rotate_") - coll_path = os.path.join(tmpdir, "collection") - try: - schema = zvec.CollectionSchema( - name="test_inv_rotate", - fields=[FieldSchema("id", DataType.INT64, nullable=False)], - vectors=[ - VectorSchema( - "embedding", - DataType.VECTOR_FP32, - dimension=dim, - index_param=FlatIndexParam( - metric_type=MetricType.IP, - quantize_type=QuantizeType.INT8, - enable_rotate=True, - ), - ), - ], - ) - - collection = zvec.create_and_open( - path=coll_path, - schema=schema, - option=CollectionOption(read_only=False), - ) - - # Generate random vectors - np.random.seed(42) - docs = [] - original_vecs = {} - for i in range(n_docs): - vec = np.random.randn(dim).astype(np.float32) - vec = vec / np.linalg.norm(vec) # Normalize for IP - original_vecs[str(i)] = vec - docs.append( - Doc( - id=str(i), - fields={"id": i}, - vectors={"embedding": vec.tolist()}, 
- ) - ) - - # Insert - for doc in docs: - result = collection.insert(doc) - assert result.ok(), f"Insert failed: {result.code()}" - - collection.flush() - - # Optimize to trigger reformer build - collection.optimize(option=OptimizeOption()) - import time - time.sleep(2) # Wait for optimization to complete - - # Fetch vectors back (triggers revert path) - ids = [str(i) for i in range(n_docs)] - fetched = collection.fetch(ids=ids) - - assert len(fetched) == n_docs, f"Expected {n_docs} docs, got {len(fetched)}" - - # Compare fetched vectors with originals - max_error = 0.0 - avg_error = 0.0 - print("\nDiagnostic: first fetched vector vs original:") - for i, doc_id in enumerate(ids): - assert doc_id in fetched, f"Doc {doc_id} not found in fetched results" - fetched_vec = np.array(fetched[doc_id].vector("embedding"), dtype=np.float32) - original_vec = original_vecs[doc_id] - if i == 0: - print(f" fetched[:8] = {fetched_vec[:8]}") - print(f" original[:8] = {original_vec[:8]}") - print(f" fetched shape: {fetched_vec.shape}, original shape: {original_vec.shape}") - error = np.max(np.abs(fetched_vec - original_vec)) - avg_error += np.mean(np.abs(fetched_vec - original_vec)) - max_error = max(max_error, error) - - avg_error /= n_docs - - print(f"\n=== INT8 + Rotate Inverse Rotation Test ===") - print(f"Max absolute error: {max_error:.6f}") - print(f"Avg absolute error: {avg_error:.6f}") - print(f"Number of docs: {n_docs}") - - # The error should be bounded (INT8 quantization introduces some loss) - # With rotation, the error should still be reasonable - assert max_error < 0.5, f"Max error {max_error} too large!" 
- print("PASSED!") - - finally: - shutil.rmtree(tmpdir, ignore_errors=True) - - -def test_inverse_rotation_cosine(): - """Test inverse rotation with COSINE metric + INT8 quantizer + rotation.""" - dim = 128 - n_docs = 10 - - tmpdir = tempfile.mkdtemp(prefix="zvec_test_inv_rotate_cosine_") - coll_path = os.path.join(tmpdir, "collection") - try: - schema = zvec.CollectionSchema( - name="test_inv_rotate_cosine", - fields=[FieldSchema("id", DataType.INT64, nullable=False)], - vectors=[ - VectorSchema( - "embedding", - DataType.VECTOR_FP32, - dimension=dim, - index_param=FlatIndexParam( - metric_type=MetricType.COSINE, - quantize_type=QuantizeType.INT8, - enable_rotate=True, - ), - ), - ], - ) - - collection = zvec.create_and_open( - path=coll_path, - schema=schema, - option=CollectionOption(read_only=False), - ) - - # Generate random vectors - np.random.seed(42) - docs = [] - original_vecs = {} - for i in range(n_docs): - vec = np.random.randn(dim).astype(np.float32) - original_vecs[str(i)] = vec - docs.append( - Doc( - id=str(i), - fields={"id": i}, - vectors={"embedding": vec.tolist()}, - ) - ) - - for doc in docs: - result = collection.insert(doc) - assert result.ok(), f"Insert failed: {result.code()}" - - collection.flush() - collection.optimize(option=OptimizeOption()) - import time - time.sleep(2) - - ids = [str(i) for i in range(n_docs)] - fetched = collection.fetch(ids=ids) - - assert len(fetched) == n_docs, f"Expected {n_docs} docs, got {len(fetched)}" - - max_error = 0.0 - avg_error = 0.0 - for doc_id in ids: - assert doc_id in fetched, f"Doc {doc_id} not found" - fetched_vec = np.array(fetched[doc_id].vector("embedding"), dtype=np.float32) - original_vec = original_vecs[doc_id] - # Normalize both for comparison (COSINE metric normalizes) - fetched_norm = fetched_vec / (np.linalg.norm(fetched_vec) + 1e-8) - original_norm = original_vec / (np.linalg.norm(original_vec) + 1e-8) - error = np.max(np.abs(fetched_norm - original_norm)) - avg_error += 
np.mean(np.abs(fetched_norm - original_norm)) - max_error = max(max_error, error) - - avg_error /= n_docs - - print(f"\n=== COSINE + INT8 + Rotate Inverse Rotation Test ===") - print(f"Max absolute error (normalized): {max_error:.6f}") - print(f"Avg absolute error (normalized): {avg_error:.6f}") - - assert max_error < 0.5, f"Max error {max_error} too large!" - print("PASSED!") - - finally: - shutil.rmtree(tmpdir, ignore_errors=True) - - -if __name__ == "__main__": - test_inverse_rotation_int8() - test_inverse_rotation_cosine() - print("\n=== All tests passed! ===") From 51806dd96db63c54620287da08798fdc13b4977d Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Mon, 15 Jun 2026 18:04:33 +0800 Subject: [PATCH 21/38] add c_api --- .gitignore | 1 + examples/c/index_example.c | 5 ++ src/binding/c/c_api.cc | 54 ++++++++++++++++++ .../quantizer/integer_quantizer_converter.cc | 2 +- .../quantizer/integer_quantizer_reformer.cc | 8 +-- src/include/zvec/c_api.h | 25 +++++++++ tests/c/c_api_test.c | 56 +++++++++++++++++++ 7 files changed, 146 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 53fb938be..f2039e4a7 100644 --- a/.gitignore +++ b/.gitignore @@ -56,4 +56,5 @@ allure-* doc/ config/ examples/python/ +examples/c_api/ logs/ \ No newline at end of file diff --git a/examples/c/index_example.c b/examples/c/index_example.c index 403c0bef9..e281a7368 100644 --- a/examples/c/index_example.c +++ b/examples/c/index_example.c @@ -85,6 +85,11 @@ int main() { zvec_index_params_set_metric_type(hnsw_params_fast, ZVEC_METRIC_TYPE_L2); zvec_index_params_set_hnsw_params(hnsw_params_fast, 16, 100); + // Demonstrate INT8 quantization with random rotation preprocessing + // (enable_rotate rotates vectors before INT8 quantization to reduce error) + zvec_index_params_set_quantize_type(hnsw_params_fast, ZVEC_QUANTIZE_TYPE_INT8); + zvec_index_params_set_quantizer_enable_rotate(hnsw_params_fast, true); + zvec_index_params_t *hnsw_params_balanced = 
zvec_index_params_create(ZVEC_INDEX_TYPE_HNSW); if (!hnsw_params_balanced) { diff --git a/src/binding/c/c_api.cc b/src/binding/c/c_api.cc index 2736169be..56e916512 100644 --- a/src/binding/c/c_api.cc +++ b/src/binding/c/c_api.cc @@ -1464,6 +1464,60 @@ zvec_quantize_type_t zvec_index_params_get_quantize_type( return ZVEC_QUANTIZE_TYPE_UNDEFINED; } +/** + * @brief Set enable_rotate for quantizer parameters + * @param params Index parameters (must be vector index type) + * @param enable_rotate Whether to enable random rotation before quantization + * @return ZVEC_OK on success, error code on failure + */ +zvec_error_code_t zvec_index_params_set_quantizer_enable_rotate( + zvec_index_params_t *params, bool enable_rotate) { + if (!params) { + SET_LAST_ERROR(ZVEC_ERROR_INVALID_ARGUMENT, + "Index params pointer cannot be null"); + return ZVEC_ERROR_INVALID_ARGUMENT; + } + auto *cpp_params = reinterpret_cast(params); + + if (!cpp_params->is_vector_index_type()) { + SET_LAST_ERROR(ZVEC_ERROR_INVALID_ARGUMENT, + "Index params is not a vector index type"); + return ZVEC_ERROR_INVALID_ARGUMENT; + } + auto *vec_params = dynamic_cast(cpp_params); + if (!vec_params) { + SET_LAST_ERROR(ZVEC_ERROR_INVALID_ARGUMENT, + "Failed to cast to VectorIndexParams"); + return ZVEC_ERROR_INVALID_ARGUMENT; + } + zvec::QuantizerParam qp = vec_params->quantizer_param(); + qp.set_enable_rotate(enable_rotate); + vec_params->set_quantizer_param(qp); + return ZVEC_OK; +} + +/** + * @brief Get enable_rotate setting from quantizer parameters + * @param params Index parameters + * @return true if rotation is enabled, false otherwise + */ +bool zvec_index_params_get_quantizer_enable_rotate( + const zvec_index_params_t *params) { + if (!params) { + return false; + } + auto *cpp_params = reinterpret_cast(params); + + if (cpp_params->is_vector_index_type()) { + auto *vec_params = + dynamic_cast(cpp_params); + if (vec_params) { + return vec_params->quantizer_param().enable_rotate(); + } + } + return false; 
+} + /** * @brief Get index type from index parameters * @param params Index parameters diff --git a/src/core/quantizer/integer_quantizer_converter.cc b/src/core/quantizer/integer_quantizer_converter.cc index adbdab52b..f5d3db650 100644 --- a/src/core/quantizer/integer_quantizer_converter.cc +++ b/src/core/quantizer/integer_quantizer_converter.cc @@ -502,7 +502,7 @@ class IntegerStreamingConverter : public IndexConverter { IndexHolder::Iterator::Pointer &&iter) : owner_(owner), buffer_(owner->element_size(), 0), - normalize_buffer_(owner->front_->element_size(), 0), + normalize_buffer_(owner->padded_dim() * sizeof(float), 0), rotate_buffer_(owner->padded_dim() * sizeof(float), 0), front_iter_(std::move(iter)) { this->encode_record(); diff --git a/src/core/quantizer/integer_quantizer_reformer.cc b/src/core/quantizer/integer_quantizer_reformer.cc index 498522a4e..fc636f78b 100644 --- a/src/core/quantizer/integer_quantizer_reformer.cc +++ b/src/core/quantizer/integer_quantizer_reformer.cc @@ -349,7 +349,7 @@ class IntegerStreamingReformer : public IndexReformer { const float *vec = reinterpret_cast(query); std::unique_ptr rotate_buffer; if (enable_rotate_ && rotator_) { - rotate_buffer.reset(new float[qmeta.dimension()]); + rotate_buffer.reset(new float[rotator_->padded_dim()]); rotator_->rotate(vec, rotate_buffer.get()); vec = rotate_buffer.get(); } @@ -381,7 +381,7 @@ class IntegerStreamingReformer : public IndexReformer { std::unique_ptr rotate_buffer; std::unique_ptr normalized; if (enable_rotate_ && rotator_) { - rotate_buffer.reset(new float[qmeta.dimension()]); + rotate_buffer.reset(new float[rotator_->padded_dim()]); } if (enable_normalize_) { normalized.reset(new float[qmeta.dimension()]); @@ -422,7 +422,7 @@ class IntegerStreamingReformer : public IndexReformer { const float *vec = reinterpret_cast(record); std::unique_ptr rotate_buffer; if (enable_rotate_ && rotator_) { - rotate_buffer.reset(new float[rmeta.dimension()]); + rotate_buffer.reset(new 
float[rotator_->padded_dim()]); rotator_->rotate(vec, rotate_buffer.get()); vec = rotate_buffer.get(); } @@ -455,7 +455,7 @@ class IntegerStreamingReformer : public IndexReformer { std::unique_ptr rotate_buffer; std::unique_ptr normalized; if (enable_rotate_ && rotator_) { - rotate_buffer.reset(new float[rmeta.dimension()]); + rotate_buffer.reset(new float[rotator_->padded_dim()]); } if (enable_normalize_) { normalized.reset(new float[rmeta.dimension()]); diff --git a/src/include/zvec/c_api.h b/src/include/zvec/c_api.h index 500265fa2..ad54afb9d 100644 --- a/src/include/zvec/c_api.h +++ b/src/include/zvec/c_api.h @@ -960,6 +960,31 @@ ZVEC_EXPORT zvec_error_code_t ZVEC_CALL zvec_index_params_set_quantize_type( ZVEC_EXPORT zvec_quantize_type_t ZVEC_CALL zvec_index_params_get_quantize_type(const zvec_index_params_t *params); +/** + * @brief Set enable_rotate for quantizer (only effective with INT8 quantize + * type) + * + * When enabled, vectors are randomly rotated before INT8 quantization to + * reduce quantization error. The rotation matrix is stored with the index + * and automatically applied to query vectors at search time. 
+ * + * @param params Index parameters (must be vector index type) + * @param enable_rotate Whether to enable random rotation before quantization + * @return ZVEC_OK on success, error code on failure + */ +ZVEC_EXPORT zvec_error_code_t ZVEC_CALL +zvec_index_params_set_quantizer_enable_rotate(zvec_index_params_t *params, + bool enable_rotate); + +/** + * @brief Get enable_rotate setting from quantizer parameters + * @param params Index parameters (must not be NULL) + * @return true if rotation is enabled, false otherwise (default) + */ +ZVEC_EXPORT bool ZVEC_CALL +zvec_index_params_get_quantizer_enable_rotate( + const zvec_index_params_t *params); + /** * @brief Set HNSW specific parameters * @param params Index parameters (must be HNSW type) diff --git a/tests/c/c_api_test.c b/tests/c/c_api_test.c index 5a0c3a144..af5cd4220 100644 --- a/tests/c/c_api_test.c +++ b/tests/c/c_api_test.c @@ -3491,6 +3491,61 @@ void test_index_params_functions(void) { TEST_END(); } +void test_quantizer_enable_rotate(void) { + TEST_START(); + + // Test 1: set enable_rotate=true on HNSW params and verify + zvec_index_params_t *hnsw_params = + zvec_index_params_create(ZVEC_INDEX_TYPE_HNSW); + TEST_ASSERT(hnsw_params != NULL); + + // Default should be false + TEST_ASSERT(zvec_index_params_get_quantizer_enable_rotate(hnsw_params) == + false); + + // Set to true and verify + zvec_error_code_t err = + zvec_index_params_set_quantizer_enable_rotate(hnsw_params, true); + TEST_ASSERT(err == ZVEC_OK); + TEST_ASSERT(zvec_index_params_get_quantizer_enable_rotate(hnsw_params) == + true); + + // Set back to false and verify + err = zvec_index_params_set_quantizer_enable_rotate(hnsw_params, false); + TEST_ASSERT(err == ZVEC_OK); + TEST_ASSERT(zvec_index_params_get_quantizer_enable_rotate(hnsw_params) == + false); + + zvec_index_params_destroy(hnsw_params); + + // Test 2: set enable_rotate on FLAT index params (also a vector index) + zvec_index_params_t *flat_params = + 
zvec_index_params_create(ZVEC_INDEX_TYPE_FLAT); + TEST_ASSERT(flat_params != NULL); + err = zvec_index_params_set_quantizer_enable_rotate(flat_params, true); + TEST_ASSERT(err == ZVEC_OK); + TEST_ASSERT(zvec_index_params_get_quantizer_enable_rotate(flat_params) == + true); + zvec_index_params_destroy(flat_params); + + // Test 3: set enable_rotate on non-vector index (INVERT) should fail + zvec_index_params_t *invert_params = + zvec_index_params_create(ZVEC_INDEX_TYPE_INVERT); + TEST_ASSERT(invert_params != NULL); + err = zvec_index_params_set_quantizer_enable_rotate(invert_params, true); + TEST_ASSERT(err != ZVEC_OK); + zvec_index_params_destroy(invert_params); + + // Test 4: NULL params should return false for getter + TEST_ASSERT(zvec_index_params_get_quantizer_enable_rotate(NULL) == false); + + // Test 5: NULL params should return error for setter + err = zvec_index_params_set_quantizer_enable_rotate(NULL, true); + TEST_ASSERT(err != ZVEC_OK); + + TEST_END(); +} + void test_index_params_api_functions(void) { TEST_START(); @@ -5878,6 +5933,7 @@ int main(void) { // Index tests test_index_params(); test_index_params_functions(); + test_quantizer_enable_rotate(); test_index_params_api_functions(); test_index_creation_and_management(); From a534d18fc0691c55ac0cbaa346450b28de3ba7c8 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Mon, 15 Jun 2026 18:18:17 +0800 Subject: [PATCH 22/38] c_api ut --- tests/core/interface/index_interface_test.cc | 82 +++++++++++++++++ .../index/common/db_proto_converter_test.cc | 77 ++++++++++++++++ tests/db/index/common/index_params_test.cc | 92 +++++++++++++++++++ 3 files changed, 251 insertions(+) diff --git a/tests/core/interface/index_interface_test.cc b/tests/core/interface/index_interface_test.cc index 44d906139..d3d4cee68 100644 --- a/tests/core/interface/index_interface_test.cc +++ b/tests/core/interface/index_interface_test.cc @@ -1856,6 +1856,88 @@ TEST(IndexInterface, ContiguousMemoryEndToEnd) { .build()); } 
+TEST(IndexInterface, QuantizerParamEnableRotateSerialization) { + constexpr uint32_t kDimension = 64; + + // Test 1: HNSW with enable_rotate=true via builder + { + auto param = HNSWIndexParamBuilder() + .WithMetricType(MetricType::kCosine) + .WithDataType(DataType::DT_FP32) + .WithDimension(kDimension) + .WithIsSparse(false) + .WithEFConstruction(100) + .WithEnableRotate(true) + .Build(); + ASSERT_NE(nullptr, param.get()); + EXPECT_TRUE(param->quantizer_param.enable_rotate); + + // Serialize to JSON and verify enable_rotate is present + std::string json = param->SerializeToJson(); + EXPECT_TRUE(json.find("\"enable_rotate\":true") != std::string::npos) + << "JSON: " << json; + + // Deserialize and verify + auto restored = IndexFactory::DeserializeIndexParamFromJson(json); + ASSERT_NE(nullptr, restored.get()); + + auto *restored_hnsw = dynamic_cast(restored.get()); + ASSERT_NE(nullptr, restored_hnsw); + EXPECT_TRUE(restored_hnsw->quantizer_param.enable_rotate); + + // Roundtrip consistency + EXPECT_EQ(restored->SerializeToJson(), param->SerializeToJson()); + } + + // Test 2: Flat with enable_rotate=true via WithQuantizerParam + { + QuantizerParam qp(QuantizerType::kNone, 8, 8, true); + EXPECT_TRUE(qp.enable_rotate); + + auto param = FlatIndexParamBuilder() + .WithMetricType(MetricType::kCosine) + .WithDataType(DataType::DT_FP32) + .WithDimension(kDimension) + .WithIsSparse(false) + .WithQuantizerParam(qp) + .Build(); + ASSERT_NE(nullptr, param.get()); + EXPECT_TRUE(param->quantizer_param.enable_rotate); + + std::string json = param->SerializeToJson(); + EXPECT_TRUE(json.find("\"enable_rotate\":true") != std::string::npos); + + auto restored = IndexFactory::DeserializeIndexParamFromJson(json); + ASSERT_NE(nullptr, restored.get()); + + auto *restored_flat = dynamic_cast(restored.get()); + ASSERT_NE(nullptr, restored_flat); + EXPECT_TRUE(restored_flat->quantizer_param.enable_rotate); + } + + // Test 3: enable_rotate=false should be omitted when omit_empty_value=true + 
{ + auto param = HNSWIndexParamBuilder() + .WithMetricType(MetricType::kInnerProduct) + .WithDataType(DataType::DT_FP32) + .WithDimension(kDimension) + .WithIsSparse(false) + .WithEFConstruction(100) + .WithEnableRotate(false) + .Build(); + + std::string json_omit = param->SerializeToJson(true); + // enable_rotate=false should be omitted + EXPECT_TRUE(json_omit.find("enable_rotate") == std::string::npos) + << "Omitted JSON: " << json_omit; + + // But present in full serialization + std::string json_full = param->SerializeToJson(false); + EXPECT_TRUE(json_full.find("enable_rotate") != std::string::npos) + << "Full JSON: " << json_full; + } +} + #if defined(__GNUC__) || defined(__GNUG__) #pragma GCC diagnostic pop #endif \ No newline at end of file diff --git a/tests/db/index/common/db_proto_converter_test.cc b/tests/db/index/common/db_proto_converter_test.cc index dff93e9dd..9c71c3c89 100644 --- a/tests/db/index/common/db_proto_converter_test.cc +++ b/tests/db/index/common/db_proto_converter_test.cc @@ -470,4 +470,81 @@ TEST(ConverterTest, SegmentMetaWithEmptyFields) { EXPECT_EQ(pb_result.persisted_blocks_size(), 0); EXPECT_FALSE(pb_result.has_writing_forward_block()); EXPECT_EQ(pb_result.indexed_vector_fields_size(), 0); +} + +// ==================== enable_rotate roundtrip tests ==================== + +TEST(ConverterTest, HnswIndexParamsWithEnableRotate) { + // C++ -> PB -> C++ roundtrip with enable_rotate = true + HnswIndexParams original(MetricType::COSINE, 16, 200, QuantizeType::INT8, + false, QuantizerParam(true)); + EXPECT_TRUE(original.quantizer_param().enable_rotate()); + + auto pb = ProtoConverter::ToPb(&original); + EXPECT_TRUE(pb.base().quantizer_param().enable_rotate()); + + auto restored = ProtoConverter::FromPb(pb); + ASSERT_NE(restored, nullptr); + EXPECT_TRUE(restored->quantizer_param().enable_rotate()); + EXPECT_TRUE(restored->enable_rotate()); // convenience getter + EXPECT_EQ(restored->metric_type(), MetricType::COSINE); + 
EXPECT_EQ(restored->m(), 16); + EXPECT_EQ(restored->ef_construction(), 200); + EXPECT_EQ(restored->quantize_type(), QuantizeType::INT8); + + // C++ -> PB -> C++ roundtrip with enable_rotate = false + HnswIndexParams original_no_rot(MetricType::L2, 32, 100, QuantizeType::FP16); + auto pb2 = ProtoConverter::ToPb(&original_no_rot); + EXPECT_FALSE(pb2.base().quantizer_param().enable_rotate()); + auto restored2 = ProtoConverter::FromPb(pb2); + ASSERT_NE(restored2, nullptr); + EXPECT_FALSE(restored2->quantizer_param().enable_rotate()); +} + +TEST(ConverterTest, FlatIndexParamsWithEnableRotate) { + FlatIndexParams original(MetricType::IP, QuantizeType::INT8, + QuantizerParam(true)); + EXPECT_TRUE(original.quantizer_param().enable_rotate()); + + auto pb = ProtoConverter::ToPb(&original); + EXPECT_TRUE(pb.base().quantizer_param().enable_rotate()); + + auto restored = ProtoConverter::FromPb(pb); + ASSERT_NE(restored, nullptr); + EXPECT_TRUE(restored->quantizer_param().enable_rotate()); + EXPECT_EQ(restored->metric_type(), MetricType::IP); + EXPECT_EQ(restored->quantize_type(), QuantizeType::INT8); + + // enable_rotate = false + FlatIndexParams original_no_rot(MetricType::L2, QuantizeType::FP16); + auto pb2 = ProtoConverter::ToPb(&original_no_rot); + EXPECT_FALSE(pb2.base().quantizer_param().enable_rotate()); + auto restored2 = ProtoConverter::FromPb(pb2); + EXPECT_FALSE(restored2->quantizer_param().enable_rotate()); +} + +TEST(ConverterTest, IVFIndexParamsWithEnableRotate) { + IVFIndexParams original(MetricType::COSINE, 256, 20, true, QuantizeType::INT8, + QuantizerParam(true)); + EXPECT_TRUE(original.quantizer_param().enable_rotate()); + + auto pb = ProtoConverter::ToPb(&original); + EXPECT_TRUE(pb.base().quantizer_param().enable_rotate()); + + auto restored = ProtoConverter::FromPb(pb); + ASSERT_NE(restored, nullptr); + EXPECT_TRUE(restored->quantizer_param().enable_rotate()); + EXPECT_EQ(restored->metric_type(), MetricType::COSINE); + EXPECT_EQ(restored->n_list(), 256); + 
EXPECT_EQ(restored->n_iters(), 20); + EXPECT_TRUE(restored->use_soar()); + EXPECT_EQ(restored->quantize_type(), QuantizeType::INT8); + + // enable_rotate = false + IVFIndexParams original_no_rot(MetricType::L2, 128, 10, false, + QuantizeType::FP16); + auto pb2 = ProtoConverter::ToPb(&original_no_rot); + EXPECT_FALSE(pb2.base().quantizer_param().enable_rotate()); + auto restored2 = ProtoConverter::FromPb(pb2); + EXPECT_FALSE(restored2->quantizer_param().enable_rotate()); } \ No newline at end of file diff --git a/tests/db/index/common/index_params_test.cc b/tests/db/index/common/index_params_test.cc index af67e7398..d5a85aeb9 100644 --- a/tests/db/index/common/index_params_test.cc +++ b/tests/db/index/common/index_params_test.cc @@ -186,4 +186,96 @@ TEST(IndexParamsTest, DynamicPointerCast) { IndexParams &base_ref = *base_ptr; auto &hnsw_ref = dynamic_cast(base_ref); EXPECT_EQ(hnsw_ref.type(), IndexType::HNSW); +} + +// ==================== QuantizerParam tests ==================== + +TEST(IndexParamsTest, QuantizerParamBasic) { + // Default constructor: enable_rotate should be false + QuantizerParam qp_default; + EXPECT_FALSE(qp_default.enable_rotate()); + + // Constructor with true + QuantizerParam qp_true(true); + EXPECT_TRUE(qp_true.enable_rotate()); + + // Constructor with false + QuantizerParam qp_false(false); + EXPECT_FALSE(qp_false.enable_rotate()); + + // Setter + qp_default.set_enable_rotate(true); + EXPECT_TRUE(qp_default.enable_rotate()); + qp_default.set_enable_rotate(false); + EXPECT_FALSE(qp_default.enable_rotate()); + + // Equality + EXPECT_TRUE(qp_true == QuantizerParam(true)); + EXPECT_TRUE(qp_false == QuantizerParam(false)); + EXPECT_FALSE(qp_true == qp_false); + + // Inequality + EXPECT_TRUE(qp_true != qp_false); + EXPECT_FALSE(qp_true != QuantizerParam(true)); +} + +TEST(IndexParamsTest, QuantizerParamWithVectorIndex) { + // HnswIndexParams + { + HnswIndexParams params(MetricType::COSINE, 16, 100, QuantizeType::INT8); + 
EXPECT_FALSE(params.quantizer_param().enable_rotate()); + EXPECT_FALSE(params.enable_rotate()); // convenience getter + + params.set_quantizer_param(QuantizerParam(true)); + EXPECT_TRUE(params.quantizer_param().enable_rotate()); + EXPECT_TRUE(params.enable_rotate()); + + // Clone preserves quantizer_param + auto cloned = params.clone(); + auto *cloned_hnsw = dynamic_cast(cloned.get()); + ASSERT_NE(cloned_hnsw, nullptr); + EXPECT_TRUE(cloned_hnsw->quantizer_param().enable_rotate()); + EXPECT_TRUE(*cloned == params); + + // Equality: different enable_rotate -> not equal + HnswIndexParams params2(MetricType::COSINE, 16, 100, QuantizeType::INT8); + params2.set_quantizer_param(QuantizerParam(false)); + EXPECT_FALSE(params == params2); + } + + // FlatIndexParams + { + FlatIndexParams params(MetricType::L2, QuantizeType::INT8); + EXPECT_FALSE(params.quantizer_param().enable_rotate()); + + params.set_quantizer_param(QuantizerParam(true)); + EXPECT_TRUE(params.quantizer_param().enable_rotate()); + EXPECT_TRUE(params.enable_rotate()); + + auto cloned = params.clone(); + auto *cloned_flat = dynamic_cast(cloned.get()); + ASSERT_NE(cloned_flat, nullptr); + EXPECT_TRUE(cloned_flat->quantizer_param().enable_rotate()); + + FlatIndexParams params2(MetricType::L2, QuantizeType::INT8); + EXPECT_FALSE(params == params2); + } + + // IVFIndexParams + { + IVFIndexParams params(MetricType::IP, 128, 10, false, QuantizeType::INT8); + EXPECT_FALSE(params.quantizer_param().enable_rotate()); + + params.set_quantizer_param(QuantizerParam(true)); + EXPECT_TRUE(params.quantizer_param().enable_rotate()); + EXPECT_TRUE(params.enable_rotate()); + + auto cloned = params.clone(); + auto *cloned_ivf = dynamic_cast(cloned.get()); + ASSERT_NE(cloned_ivf, nullptr); + EXPECT_TRUE(cloned_ivf->quantizer_param().enable_rotate()); + + IVFIndexParams params2(MetricType::IP, 128, 10, false, QuantizeType::INT8); + EXPECT_FALSE(params == params2); + } } \ No newline at end of file From 
6a94a68c123415a0cf185bd7eece666fb7a735e2 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Mon, 15 Jun 2026 19:55:30 +0800 Subject: [PATCH 23/38] SSE --- src/core/CMakeLists.txt | 25 +- src/core/quantizer/CMakeLists.txt | 12 +- src/core/quantizer/record_rotator.cc | 611 +- src/core/quantizer/record_rotator.h | 10 +- src/core/utility/fht_avx.hpp | 19698 +++++++++++++++++++++++++ 5 files changed, 20280 insertions(+), 76 deletions(-) create mode 100644 src/core/utility/fht_avx.hpp diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 33dac1197..cac3d1840 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -13,11 +13,6 @@ if(RABITQ_SUPPORTED AND AUTO_DETECT_ARCH) set(HNSW_RABITQ_FILES_FULL ${HNSW_RABITQ_FILES}) list(TRANSFORM HNSW_RABITQ_FILES_FULL PREPEND "algorithm/hnsw_rabitq/") - # record_rotator.cc includes rabitqlib's rotator.hpp which uses AVX2 - # intrinsics in flip_sign() and kacs_walk(), so it also needs the - # RABITQ_ARCH_FLAG at compile time. - list(APPEND HNSW_RABITQ_FILES_FULL "quantizer/record_rotator.cc") - foreach(FILE ${HNSW_RABITQ_FILES_FULL}) set_source_files_properties( ${FILE} @@ -48,6 +43,26 @@ if(NOT ANDROID AND AUTO_DETECT_ARCH) endif() endif() +# quantizer/record_rotator.cc uses FFHT AVX inline assembly guarded by +# __AVX2__/__AVX512F__. zvec_core glob-collects this source, so per-file +# AVX2 flags must be set here as well (in addition to the core_quantizer +# target in quantizer/CMakeLists.txt). Without this the zvec_core copy +# would compile without AVX2 and the fast path would never be emitted. 
+if(NOT ANDROID AND AUTO_DETECT_ARCH) + if(HOST_ARCH MATCHES "^(x86|x64)$") + setup_compiler_march_for_x86( + _ROTATOR_MARCH_SSE _ROTATOR_MARCH_AVX2 + _ROTATOR_MARCH_AVX512 _ROTATOR_MARCH_AVX512FP16) + if(_ROTATOR_MARCH_AVX2) + set_source_files_properties( + quantizer/record_rotator.cc + PROPERTIES + COMPILE_FLAGS "${_ROTATOR_MARCH_AVX2}" + ) + endif() + endif() +endif() + cc_directory(framework) cc_directory(algorithm) cc_directory(metric) diff --git a/src/core/quantizer/CMakeLists.txt b/src/core/quantizer/CMakeLists.txt index a7c7c2732..e8514e1d0 100644 --- a/src/core/quantizer/CMakeLists.txt +++ b/src/core/quantizer/CMakeLists.txt @@ -6,20 +6,28 @@ if(NOT APPLE) "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") endif() +# x86: use AVX2/AVX512 arch flag from RABITQ detection if(RABITQ_SUPPORTED AND RABITQ_ARCH_FLAG) set_source_files_properties( record_rotator.cc PROPERTIES COMPILE_FLAGS "${RABITQ_ARCH_FLAG}" ) +# ARM aarch64: use armv8-a to enable NEON intrinsics +elseif(HOST_ARCH MATCHES "^(arm|arm64)$" AND NOT MSVC) + set_source_files_properties( + record_rotator.cc + PROPERTIES + COMPILE_FLAGS "-march=armv8-a" + ) endif() cc_library( NAME core_quantizer STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc - LIBS zvec_ailego core_framework rabitqlib - LIBS zvec_ailego zvec_turbo core_framework rabitqlib + LIBS zvec_ailego core_framework + LIBS zvec_ailego zvec_turbo core_framework INCS . ${PROJECT_ROOT_DIR}/src/core LDFLAGS "${CORE_QUANTIZER_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" diff --git a/src/core/quantizer/record_rotator.cc b/src/core/quantizer/record_rotator.cc index ca353b850..7db436222 100644 --- a/src/core/quantizer/record_rotator.cc +++ b/src/core/quantizer/record_rotator.cc @@ -13,8 +13,28 @@ // limitations under the License. 
#include "record_rotator.h" + +#include +#include #include -#include +#include +#include +#include + +#if defined(__AVX2__) || defined(__AVX512F__) +#include +// FFHT (Fastest Fast Hadamard Transform) — hand-tuned AVX inline assembly +// from https://github.com/FALCONN-LIB/FFHT, originally bundled in rabitqlib. +// Provides fht_float(buf, log_n) with per-size helper_float_N specialisations. +#include "utility/fht_avx.hpp" +#elif defined(__SSE2__) +#include +#endif + +#if defined(__ARM_NEON) && defined(__aarch64__) +#include +#endif + #include #include "zvec/core/framework/index_error.h" #include "zvec/core/framework/index_logger.h" @@ -22,39 +42,465 @@ namespace zvec { namespace core { -// All rabitqlib types are confined to this translation unit via pimpl. +namespace { + +// ============================================================================ +// Scalar / SIMD helper functions for rotation +// ============================================================================ + +//! Compute floor(log2(n)) for power-of-2 n. +inline int ilog2(size_t n) { + int r = 0; + while (n > 1) { n >>= 1; ++r; } + return r; +} + +//! In-place Fast Hadamard Transform on a power-of-2 length array. +//! Uses FFHT hand-tuned AVX assembly when available; generic scalar loop +//! otherwise (ARM NEON / SSE2 / pure scalar). +void fht_inplace(float *data, size_t n) { +#if defined(__AVX2__) || defined(__AVX512F__) + fht_float(data, ilog2(n)); +#else + for (size_t len = 1; len < n; len <<= 1) { + for (size_t i = 0; i < n; i += len << 1) { + for (size_t j = i; j < i + len; ++j) { + float u = data[j]; + float v = data[j + len]; + data[j] = u + v; + data[j + len] = u - v; + } + } + } +#endif +} + +//! Flip the sign of elements based on a packed bit-array. 
void flip_sign(const uint8_t *flip, float *data, size_t dim) {
  // `flip` is a packed bit array: bit (i % 8) of byte (i / 8) controls
  // element i. A set bit negates data[i] by toggling the IEEE-754 sign bit;
  // a clear bit leaves the element unchanged.
  //
  // Every SIMD branch below consumes whole chunks only and advances `i`.
  // The scalar loop at the bottom finishes the remainder, so `dim` does not
  // have to be a multiple of the vector width and nothing beyond
  // data[dim - 1] / flip[(dim - 1) / 8] is ever touched.
  size_t i = 0;
#if defined(__AVX512F__) && defined(__AVX512DQ__)
  // 64 floats per iteration: one 64-bit mask word drives four 16-lane XORs.
  constexpr size_t kChunk = 64;
  const __m512 sign_flip =
      _mm512_castsi512_ps(_mm512_set1_epi32(static_cast<int>(0x80000000u)));
  for (; i + kChunk <= dim; i += kChunk) {
    uint64_t mask_bits;
    std::memcpy(&mask_bits, &flip[i / 8], sizeof(mask_bits));
    const __mmask16 m0 =
        _cvtu32_mask16(static_cast<unsigned int>(mask_bits & 0xFFFF));
    const __mmask16 m1 =
        _cvtu32_mask16(static_cast<unsigned int>((mask_bits >> 16) & 0xFFFF));
    const __mmask16 m2 =
        _cvtu32_mask16(static_cast<unsigned int>((mask_bits >> 32) & 0xFFFF));
    const __mmask16 m3 =
        _cvtu32_mask16(static_cast<unsigned int>((mask_bits >> 48) & 0xFFFF));
    __m512 v0 = _mm512_loadu_ps(&data[i]);
    v0 = _mm512_mask_xor_ps(v0, m0, v0, sign_flip);
    _mm512_storeu_ps(&data[i], v0);
    __m512 v1 = _mm512_loadu_ps(&data[i + 16]);
    v1 = _mm512_mask_xor_ps(v1, m1, v1, sign_flip);
    _mm512_storeu_ps(&data[i + 16], v1);
    __m512 v2 = _mm512_loadu_ps(&data[i + 32]);
    v2 = _mm512_mask_xor_ps(v2, m2, v2, sign_flip);
    _mm512_storeu_ps(&data[i + 32], v2);
    __m512 v3 = _mm512_loadu_ps(&data[i + 48]);
    v3 = _mm512_mask_xor_ps(v3, m3, v3, sign_flip);
    _mm512_storeu_ps(&data[i + 48], v3);
  }
#elif defined(__AVX2__)
  // 32 floats per iteration: each mask byte is broadcast and tested against
  // one distinct bit per lane to build an 8-lane sign mask.
  constexpr size_t kChunk = 32;
  const __m256i bit_select =
      _mm256_setr_epi32(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
  const __m256 sign_flip =
      _mm256_castsi256_ps(_mm256_set1_epi32(static_cast<int>(0x80000000u)));
  for (; i + kChunk <= dim; i += kChunk) {
    uint32_t mask_bits;
    std::memcpy(&mask_bits, &flip[i / 8], sizeof(mask_bits));
    for (int b = 0; b < 4; ++b) {
      const __m256i mb =
          _mm256_set1_epi32(static_cast<int>((mask_bits >> (b * 8)) & 0xFF));
      const __m256i test = _mm256_and_si256(mb, bit_select);
      const __m256i cmp = _mm256_cmpeq_epi32(test, bit_select);
      const __m256 xor_mask =
          _mm256_and_ps(_mm256_castsi256_ps(cmp), sign_flip);
      __m256 v = _mm256_loadu_ps(&data[i + b * 8]);
      v = _mm256_xor_ps(v, xor_mask);
      _mm256_storeu_ps(&data[i + b * 8], v);
    }
  }
#elif defined(__ARM_NEON) && defined(__aarch64__)
  // 4 floats per iteration. `i` is always a multiple of 4 here, so the four
  // bits needed sit entirely inside byte i / 8 (low or high nibble); a
  // single-byte read is sufficient and never runs past the mask.
  for (; i + 4 <= dim; i += 4) {
    const uint32_t nibble = static_cast<uint32_t>(flip[i / 8]) >> (i % 8);
    const uint32_t lanes[4] = {nibble & 1u, (nibble >> 1) & 1u,
                               (nibble >> 2) & 1u, (nibble >> 3) & 1u};
    // Move each 0/1 flag into the sign-bit position.
    const uint32x4_t sign_mask = vshlq_n_u32(vld1q_u32(lanes), 31);
    float32x4_t v = vld1q_f32(&data[i]);
    v = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v), sign_mask));
    vst1q_f32(&data[i], v);
  }
#elif defined(__SSE2__)
  // 4 floats per iteration, same nibble reasoning as the NEON branch.
  // The 0/1 flags are shifted into the sign bit with _mm_slli_epi32, which
  // is plain SSE2 (a 32-bit lane multiply would require SSE4.1).
  for (; i + 4 <= dim; i += 4) {
    const int nibble = static_cast<int>(flip[i / 8]) >> (i % 8);
    const __m128i bit_mask =
        _mm_set_epi32((nibble >> 3) & 1, (nibble >> 2) & 1, (nibble >> 1) & 1,
                      nibble & 1);
    const __m128i sign_mask = _mm_slli_epi32(bit_mask, 31);
    __m128 v = _mm_loadu_ps(&data[i]);
    v = _mm_xor_ps(v, _mm_castsi128_ps(sign_mask));
    _mm_storeu_ps(&data[i], v);
  }
#endif
  // Scalar tail; also the complete implementation when no SIMD is available.
  for (; i < dim; ++i) {
    if (flip[i / 8] & (1u << (i % 8))) {
      data[i] = -data[i];
    }
  }
}

//! Kac random walk: butterfly add/sub between first and second halves.
//! @param data  buffer of at least `len` floats, transformed in place
//! @param len   number of elements; only len / 2 pairs (i, i + len / 2) are
//!              formed, so for odd `len` the last element is left untouched
void kacs_walk(float *data, size_t len) {
  const size_t half = len / 2;
  // SIMD branches handle whole vectors only; the scalar loop below finishes
  // the rest. Requiring i + W <= half also guarantees the two loaded ranges
  // [i, i + W) and [i + half, i + half + W) never overlap or leave the buffer.
  size_t i = 0;
#if defined(__AVX512F__)
  for (; i + 16 <= half; i += 16) {
    const __m512 x = _mm512_loadu_ps(&data[i]);
    const __m512 y = _mm512_loadu_ps(&data[i + half]);
    _mm512_storeu_ps(&data[i], _mm512_add_ps(x, y));
    _mm512_storeu_ps(&data[i + half], _mm512_sub_ps(x, y));
  }
#elif defined(__AVX2__)
  for (; i + 8 <= half; i += 8) {
    const __m256 x = _mm256_loadu_ps(&data[i]);
    const __m256 y = _mm256_loadu_ps(&data[i + half]);
    _mm256_storeu_ps(&data[i], _mm256_add_ps(x, y));
    _mm256_storeu_ps(&data[i + half], _mm256_sub_ps(x, y));
  }
#elif defined(__ARM_NEON) && defined(__aarch64__)
  for (; i + 4 <= half; i += 4) {
    const float32x4_t x = vld1q_f32(&data[i]);
    const float32x4_t y = vld1q_f32(&data[i + half]);
    vst1q_f32(&data[i], vaddq_f32(x, y));
    vst1q_f32(&data[i + half], vsubq_f32(x, y));
  }
#elif defined(__SSE2__)
  for (; i + 4 <= half; i += 4) {
    const __m128 x = _mm_loadu_ps(&data[i]);
    const __m128 y = _mm_loadu_ps(&data[i + half]);
    _mm_storeu_ps(&data[i], _mm_add_ps(x, y));
    _mm_storeu_ps(&data[i + half], _mm_sub_ps(x, y));
  }
#endif
  // Scalar tail; also the complete implementation when no SIMD is available.
  for (; i < half; ++i) {
    const float x = data[i];
    const float y = data[i + half];
    data[i] = x + y;
    data[i + half] = x - y;
  }
}

//! Scale each element by a constant factor.
//! @param data    buffer of at least `n` floats, scaled in place
//! @param n       number of elements to scale
//! @param factor  multiplier applied to every element
void vec_rescale(float *data, size_t n, float factor) {
  for (size_t i = 0; i < n; ++i) {
    data[i] *= factor;
  }
}

//! Largest power-of-2 not exceeding n.
//! Returns 1 for n == 0 (there is no power of two <= 0; 1 is the loop's
//! starting value and is kept so existing callers see the same result).
size_t floor_pow2(size_t n) {
  size_t p = 1;
  // Compare against n / 2 instead of shifting p first: `p << 1` would wrap
  // to 0 for n > SIZE_MAX / 2 and the loop would never terminate.
  while (p <= n / 2) p <<= 1;
  return p;
}

//! Read a little-endian uint32 from raw bytes.
//! @param p  pointer to at least 4 readable bytes
//! Each byte goes through uint8_t first so that a negative `char` is not
//! sign-extended into the upper bits of the result.
uint32_t read_u32_le(const char *p) {
  return static_cast<uint32_t>(static_cast<uint8_t>(p[0])) |
         (static_cast<uint32_t>(static_cast<uint8_t>(p[1])) << 8) |
         (static_cast<uint32_t>(static_cast<uint8_t>(p[2])) << 16) |
         (static_cast<uint32_t>(static_cast<uint8_t>(p[3])) << 24);
}

//! Write a uint32 in little-endian to raw bytes.
+void write_u32_le(char *p, uint32_t v) { + p[0] = static_cast(v & 0xFF); + p[1] = static_cast((v >> 8) & 0xFF); + p[2] = static_cast((v >> 16) & 0xFF); + p[3] = static_cast((v >> 24) & 0xFF); +} + +// ============================================================================ +// FhtKacRotatorImpl - O(d log d) FHT-based Kac random rotation +// ============================================================================ + +struct FhtKacRotatorImpl { + std::vector flip; + size_t trunc_dim{0}; + float fac{0}; + + static constexpr size_t kByteLen = 8; + + void init(size_t /*dim*/, size_t padded_dim) { + flip.resize(4 * padded_dim / kByteLen); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dist(0, 255); + for (auto &b : flip) b = static_cast(dist(gen)); + + // Log SIMD path for debugging + const char *simd = +#if defined(__AVX512F__) && defined(__AVX512DQ__) + "AVX512F+DQ" +#elif defined(__AVX2__) + "AVX2" +#elif defined(__ARM_NEON) && defined(__aarch64__) + "ARM-NEON" +#elif defined(__SSE2__) + "SSE2" +#else + "Scalar" +#endif + ; + const char *fht = +#if defined(__AVX2__) || defined(__AVX512F__) + "FFHT-AVX" +#else + "Generic" +#endif + ; + LOG_WARN("RecordRotator[FhtKac] SIMD=%s, FHT=%s, padded_dim=%zu", + simd, fht, padded_dim); + } + + void rotate(const float *in, float *out, size_t dim, + size_t padded_dim) const { + std::memcpy(out, in, sizeof(float) * dim); + std::fill(out + dim, out + padded_dim, 0.0f); + + if (trunc_dim == padded_dim) { + // Exact power-of-2: 4 rounds of (flip -> FHT -> rescale) + flip_sign(flip.data(), out, padded_dim); + fht_inplace(out, trunc_dim); + vec_rescale(out, trunc_dim, fac); + + flip_sign(flip.data() + padded_dim / kByteLen, out, padded_dim); + fht_inplace(out, trunc_dim); + vec_rescale(out, trunc_dim, fac); + + flip_sign(flip.data() + 2 * padded_dim / kByteLen, out, padded_dim); + fht_inplace(out, trunc_dim); + vec_rescale(out, trunc_dim, fac); + + flip_sign(flip.data() + 3 * padded_dim / 
kByteLen, out, padded_dim); + fht_inplace(out, trunc_dim); + vec_rescale(out, trunc_dim, fac); + + return; + } + + // Non-power-of-2: 4 rounds with kacs_walk reduction. + // FHT always operates on trunc_dim (largest power-of-2 <= dim), + // matching the original rabitqlib behavior. + size_t start = padded_dim - trunc_dim; + float *trunc_ptr = out + start; + + // Round 1: FHT on [0, trunc_dim) + flip_sign(flip.data(), out, padded_dim); + fht_inplace(out, trunc_dim); + vec_rescale(out, trunc_dim, fac); + kacs_walk(out, padded_dim); + + // Round 2: FHT on [start, start + trunc_dim) + flip_sign(flip.data() + padded_dim / kByteLen, out, padded_dim); + fht_inplace(trunc_ptr, trunc_dim); + vec_rescale(trunc_ptr, trunc_dim, fac); + kacs_walk(out, padded_dim); + + // Round 3: FHT on [0, trunc_dim) + flip_sign(flip.data() + 2 * padded_dim / kByteLen, out, padded_dim); + fht_inplace(out, trunc_dim); + vec_rescale(out, trunc_dim, fac); + kacs_walk(out, padded_dim); + + // Round 4: FHT on [start, start + trunc_dim) + flip_sign(flip.data() + 3 * padded_dim / kByteLen, out, padded_dim); + fht_inplace(trunc_ptr, trunc_dim); + vec_rescale(trunc_ptr, trunc_dim, fac); + kacs_walk(out, padded_dim); + + // Final rescale: combine the 4 kacs_walk reductions + vec_rescale(out, padded_dim, 0.25f); + } + + void save(char *data) const { + std::memcpy(data, flip.data(), flip.size()); + } + + void load(const char *data) { + std::memcpy(flip.data(), data, flip.size()); + } + + size_t dump_bytes() const { return flip.size(); } +}; + +// ============================================================================ +// MatrixRotatorImpl - O(d^2) random orthogonal matrix rotation +// ============================================================================ + +struct MatrixRotatorImpl { + std::vector matrix; // dim x padded_dim, row-major + + void init(size_t dim, size_t padded_dim) { + LOG_WARN("RecordRotator[Matrix] dim=%zu, padded_dim=%zu", dim, padded_dim); + std::random_device rd; + 
std::mt19937 gen(rd()); + std::normal_distribution normal(0.0f, 1.0f); + + // Generate padded_dim random Gaussian vectors of length padded_dim + std::vector q(padded_dim * padded_dim); + for (auto &v : q) v = normal(gen); + + // Modified Gram-Schmidt orthogonalization + for (size_t i = 0; i < padded_dim; ++i) { + float *qi = &q[i * padded_dim]; + + // Subtract projections onto all previous basis vectors + for (size_t j = 0; j < i; ++j) { + const float *qj = &q[j * padded_dim]; + float dot = 0.0f; + for (size_t k = 0; k < padded_dim; ++k) dot += qi[k] * qj[k]; + for (size_t k = 0; k < padded_dim; ++k) qi[k] -= dot * qj[k]; + } + + // Normalize + float norm = 0.0f; + for (size_t k = 0; k < padded_dim; ++k) norm += qi[k] * qi[k]; + norm = std::sqrt(norm); + + if (norm < 1e-10f) { + // Degenerate vector: re-randomize and re-orthogonalize + for (size_t k = 0; k < padded_dim; ++k) qi[k] = normal(gen); + for (size_t j = 0; j < i; ++j) { + const float *qj = &q[j * padded_dim]; + float dot = 0.0f; + for (size_t k = 0; k < padded_dim; ++k) dot += qi[k] * qj[k]; + for (size_t k = 0; k < padded_dim; ++k) qi[k] -= dot * qj[k]; + } + norm = 0.0f; + for (size_t k = 0; k < padded_dim; ++k) norm += qi[k] * qi[k]; + norm = std::sqrt(norm); + } + for (size_t k = 0; k < padded_dim; ++k) qi[k] /= norm; + } + + // Keep only the first dim rows (the rest are zero-padded in input) + matrix.resize(dim * padded_dim); + std::memcpy(matrix.data(), q.data(), dim * padded_dim * sizeof(float)); + } + + void rotate(const float *in, float *out, size_t dim, + size_t padded_dim) const { + for (size_t i = 0; i < padded_dim; ++i) { + float sum = 0.0f; + for (size_t j = 0; j < dim; ++j) { + sum += matrix[j * padded_dim + i] * in[j]; + } + out[i] = sum; + } + } + + void save(char *data) const { + std::memcpy(data, matrix.data(), matrix.size() * sizeof(float)); + } + + void load(const char *data) { + std::memcpy(matrix.data(), data, matrix.size() * sizeof(float)); + } + + size_t dump_bytes() const { 
return matrix.size() * sizeof(float); } +}; + +} // anonymous namespace + +// ============================================================================ +// RecordRotator::Impl +// ============================================================================ + struct RecordRotator::Impl { - //! Self-describing header prepended to the rabitqlib blob on dump + //! Header layout must match the original struct on x86_64: + //! type(1B) + padding(3B) + origin_dim(4B) + padded_dim(4B) = 12B + //! This preserves backward compatibility with existing serialized data. + static constexpr size_t kHeaderSize = 12; + struct Header { uint8_t type; uint32_t origin_dim; uint32_t padded_dim; - }; - static constexpr size_t kHeaderSize = sizeof(Header); // 9 bytes + void write_to(char *buf) const { + std::memset(buf, 0, kHeaderSize); // zero-fill padding + buf[0] = static_cast(type); + write_u32_le(buf + 4, origin_dim); + write_u32_le(buf + 8, padded_dim); + } + + void read_from(const char *buf) { + type = static_cast(buf[0]); + origin_dim = read_u32_le(buf + 4); + padded_dim = read_u32_le(buf + 8); + } + }; size_t dimension{0}; size_t padded_dim{0}; RecordRotatorType type{RecordRotatorType::FhtKac}; - std::unique_ptr> rotator; - //! Inverse rotation matrix, column-major: dim x padded_dim - //! Element [col][row] = inv_matrix_[col * dimension + row] - //! where col in [0, padded_dim), row in [0, dimension) + + std::unique_ptr fht_impl; + std::unique_ptr mat_impl; + + //! Inverse rotation matrix, column-major: padded_dim columns x dimension rows std::vector inv_matrix; - static rabitqlib::RotatorType to_rabitq(RecordRotatorType t) { - return t == RecordRotatorType::Matrix - ? 
rabitqlib::RotatorType::MatrixRotator - : rabitqlib::RotatorType::FhtKacRotator; + void do_rotate(const float *in, float *out) const { + if (fht_impl) { + fht_impl->rotate(in, out, dimension, padded_dim); + } else { + mat_impl->rotate(in, out, dimension, padded_dim); + } + } + + size_t blob_bytes() const { + if (fht_impl) return fht_impl->dump_bytes(); + return mat_impl->dump_bytes(); + } + + void save_blob(char *data) const { + if (fht_impl) { + fht_impl->save(data); + } else { + mat_impl->save(data); + } } - static RecordRotatorType from_rabitq(uint8_t t) { - return t == static_cast(RecordRotatorType::Matrix) - ? RecordRotatorType::Matrix - : RecordRotatorType::FhtKac; + void load_blob(const char *data) { + if (fht_impl) { + fht_impl->load(data); + } else { + mat_impl->load(data); + } } }; +// ============================================================================ +// RecordRotator public methods +// ============================================================================ + RecordRotator::RecordRotator() : impl_(std::make_unique()) {} RecordRotator::~RecordRotator() = default; @@ -67,24 +513,34 @@ void RecordRotator::init(size_t dimension, size_t padded_dim, impl_->dimension = dimension; impl_->padded_dim = padded_dim; impl_->type = rotator_type; - impl_->rotator.reset(rabitqlib::choose_rotator( - dimension, Impl::to_rabitq(rotator_type), padded_dim)); + + if (rotator_type == RecordRotatorType::FhtKac) { + impl_->fht_impl = std::make_unique(); + impl_->fht_impl->trunc_dim = floor_pow2(dimension); + impl_->fht_impl->fac = + 1.0f / std::sqrt(static_cast(impl_->fht_impl->trunc_dim)); + impl_->fht_impl->init(dimension, padded_dim); + } else { + impl_->mat_impl = std::make_unique(); + impl_->mat_impl->init(dimension, padded_dim); + } + // Build inverse rotation data for unrotate support build_inverse(); } void RecordRotator::rotate(const float *in, float *out) const { - impl_->rotator->rotate(in, out); + impl_->do_rotate(in, out); } std::vector 
RecordRotator::rotate(const float *in) const { std::vector out(impl_->padded_dim); - impl_->rotator->rotate(in, out.data()); + impl_->do_rotate(in, out.data()); return out; } void RecordRotator::build_inverse() { - if (!impl_->rotator) { + if (!impl_->fht_impl && !impl_->mat_impl) { LOG_ERROR("RecordRotator::build_inverse: rotator not initialized"); return; } @@ -96,7 +552,7 @@ void RecordRotator::build_inverse() { impl_->inv_matrix.resize(pdim * dim, 0.0f); // Compute rotation matrix by rotating each standard basis vector e_i. - // R * e_i = i-th column of R, which we store as inv_matrix[i * dim + j]. + // R * e_i = i-th column of R, stored as inv_matrix[i * dim + j]. std::vector basis(dim, 0.0f); std::vector rotated(pdim, 0.0f); @@ -105,15 +561,14 @@ void RecordRotator::build_inverse() { if (i < dim) { basis[i] = 1.0f; } - impl_->rotator->rotate(basis.data(), rotated.data()); - // Store as column i of the rotation matrix + impl_->do_rotate(basis.data(), rotated.data()); for (size_t j = 0; j < dim; ++j) { impl_->inv_matrix[i * dim + j] = rotated[j]; } } - LOG_DEBUG("RecordRotator::build_inverse done: dim=%zu, padded_dim=%zu", - dim, pdim); + LOG_DEBUG("RecordRotator::build_inverse done: dim=%zu, padded_dim=%zu", dim, + pdim); } void RecordRotator::unrotate(const float *in, float *out) const { @@ -123,12 +578,9 @@ void RecordRotator::unrotate(const float *in, float *out) const { } const size_t dim = impl_->dimension; - const size_t pdim = impl_->padded_dim; - // Compute x = R^T * y, where y is the dim-dimensional input (padded with zeros). - // x[j] = sum_{i=0}^{pdim-1} R[j][i] * y[i] - // = sum_{i=0}^{dim-1} inv_matrix_[i * dim + j] * in[i] - // (since y[i] = 0 for i >= dim) + // Compute x = R^T * y, where y is the dim-dimensional input. 
+ // x[j] = sum_{i=0}^{dim-1} inv_matrix[i * dim + j] * in[i] std::vector tmp(dim, 0.0f); for (size_t i = 0; i < dim; ++i) { const float yi = in[i]; @@ -146,7 +598,7 @@ std::vector RecordRotator::unrotate(const float *in) const { } size_t RecordRotator::dump_bytes() const { - return Impl::kHeaderSize + impl_->rotator->dump_bytes(); + return Impl::kHeaderSize + impl_->blob_bytes(); } int RecordRotator::dump(const IndexStorage::Pointer &storage, @@ -155,7 +607,7 @@ int RecordRotator::dump(const IndexStorage::Pointer &storage, LOG_ERROR("RecordRotator::dump(storage): null storage"); return IndexError_InvalidArgument; } - if (!impl_->rotator) { + if (!impl_->fht_impl && !impl_->mat_impl) { LOG_ERROR("RecordRotator::dump(storage): rotator not initialized"); return IndexError_NoReady; } @@ -164,8 +616,8 @@ int RecordRotator::dump(const IndexStorage::Pointer &storage, return (size + 0x1F) & (~0x1F); }; - // Serialize: [Header: type|origin_dim|padded_dim] [rabitqlib blob] - const size_t blob_size = impl_->rotator->dump_bytes(); + // Serialize: [Header: type|origin_dim|padded_dim] [rotation blob] + const size_t blob_size = impl_->blob_bytes(); const size_t data_size = Impl::kHeaderSize + blob_size; const size_t total_size = align_size(data_size); std::vector buffer(data_size); @@ -174,8 +626,8 @@ int RecordRotator::dump(const IndexStorage::Pointer &storage, header.type = static_cast(impl_->type); header.origin_dim = static_cast(impl_->dimension); header.padded_dim = static_cast(impl_->padded_dim); - std::memcpy(buffer.data(), &header, Impl::kHeaderSize); - impl_->rotator->save(buffer.data() + Impl::kHeaderSize); + header.write_to(buffer.data()); + impl_->save_blob(buffer.data() + Impl::kHeaderSize); // Append segment to storage int ret = storage->append(seg_id, total_size); @@ -214,13 +666,13 @@ int RecordRotator::dump(const IndexDumper::Pointer &dumper, LOG_ERROR("RecordRotator::dump(dumper): null dumper"); return IndexError_InvalidArgument; } - if (!impl_->rotator) { + if 
(!impl_->fht_impl && !impl_->mat_impl) { LOG_ERROR("RecordRotator::dump(dumper): rotator not initialized"); return IndexError_NoReady; } - // Serialize: [Header: type|origin_dim|padded_dim] [rabitqlib blob] - const size_t blob_size = impl_->rotator->dump_bytes(); + // Serialize: [Header: type|origin_dim|padded_dim] [rotation blob] + const size_t blob_size = impl_->blob_bytes(); const size_t data_size = Impl::kHeaderSize + blob_size; const size_t total_size = (data_size + 0x1F) & (~0x1F); @@ -229,15 +681,16 @@ int RecordRotator::dump(const IndexDumper::Pointer &dumper, header.type = static_cast(impl_->type); header.origin_dim = static_cast(impl_->dimension); header.padded_dim = static_cast(impl_->padded_dim); - std::memcpy(buffer.data(), &header, Impl::kHeaderSize); - impl_->rotator->save(buffer.data() + Impl::kHeaderSize); + header.write_to(buffer.data()); + impl_->save_blob(buffer.data() + Impl::kHeaderSize); const uint32_t crc = ailego::Crc32c::Hash(buffer.data(), data_size, 0); const size_t padding_size = total_size - data_size; // Write data + padding to dumper if (dumper->write(buffer.data(), total_size) != total_size) { - LOG_ERROR("RecordRotator::dump(dumper): write failed, seg=%s", seg_id.c_str()); + LOG_ERROR("RecordRotator::dump(dumper): write failed, seg=%s", + seg_id.c_str()); return IndexError_WriteData; } @@ -298,22 +751,58 @@ int RecordRotator::open(IndexStorage::Pointer storage, // Parse self-describing header const char *raw = reinterpret_cast(block.data()); Impl::Header header; - std::memcpy(&header, raw, Impl::kHeaderSize); + header.read_from(raw); - impl_->type = Impl::from_rabitq(header.type); + impl_->type = static_cast(header.type); impl_->dimension = static_cast(header.origin_dim); impl_->padded_dim = static_cast(header.padded_dim); // Reconstruct the rotator from header info and load blob - impl_->rotator.reset(rabitqlib::choose_rotator( - impl_->dimension, Impl::to_rabitq(impl_->type), impl_->padded_dim)); - impl_->rotator->load(raw + 
Impl::kHeaderSize); + if (impl_->type == RecordRotatorType::FhtKac) { + impl_->fht_impl = std::make_unique(); + impl_->fht_impl->flip.resize(4 * impl_->padded_dim / + FhtKacRotatorImpl::kByteLen); + impl_->fht_impl->trunc_dim = floor_pow2(impl_->dimension); + impl_->fht_impl->fac = + 1.0f / std::sqrt(static_cast(impl_->fht_impl->trunc_dim)); + impl_->fht_impl->load(raw + Impl::kHeaderSize); + } else { + impl_->mat_impl = std::make_unique(); + impl_->mat_impl->matrix.resize(impl_->dimension * impl_->padded_dim); + impl_->mat_impl->load(raw + Impl::kHeaderSize); + } LOG_DEBUG( "RecordRotator::open done: seg=%s, dim=%zu, padded_dim=%zu, " "data_size=%zu", seg_id.c_str(), impl_->dimension, impl_->padded_dim, data_size); + // Log SIMD path (same format as init, for open/load path) + const char *simd = +#if defined(__AVX512F__) && defined(__AVX512DQ__) + "AVX512F+DQ" +#elif defined(__AVX2__) + "AVX2" +#elif defined(__ARM_NEON) && defined(__aarch64__) + "ARM-NEON" +#elif defined(__SSE2__) + "SSE2" +#else + "Scalar" +#endif + ; + const char *fht = +#if defined(__AVX2__) || defined(__AVX512F__) + "FFHT-AVX" +#else + "Generic" +#endif + ; + const char *type_name = (impl_->type == RecordRotatorType::FhtKac) + ? 
"FhtKac" : "Matrix"; + LOG_WARN("RecordRotator::open [%s] SIMD=%s, FHT=%s, dim=%zu, padded_dim=%zu", + type_name, simd, fht, impl_->dimension, impl_->padded_dim); + // Build inverse rotation data for unrotate support build_inverse(); @@ -335,12 +824,12 @@ int RecordRotator::load(const float *matrix, size_t dimension, impl_->dimension = dimension; impl_->padded_dim = padded_dim; impl_->type = RecordRotatorType::Matrix; - impl_->rotator.reset(rabitqlib::choose_rotator( - dimension, rabitqlib::RotatorType::MatrixRotator, padded_dim)); - impl_->rotator->load(reinterpret_cast(matrix)); + impl_->mat_impl = std::make_unique(); + impl_->mat_impl->matrix.resize(dimension * padded_dim); + impl_->mat_impl->load(reinterpret_cast(matrix)); - LOG_DEBUG("RecordRotator::load done: dim=%zu, padded_dim=%zu", - dimension, padded_dim); + LOG_DEBUG("RecordRotator::load done: dim=%zu, padded_dim=%zu", dimension, + padded_dim); // Build inverse rotation data for unrotate support build_inverse(); @@ -348,20 +837,14 @@ int RecordRotator::load(const float *matrix, size_t dimension, return 0; } -size_t RecordRotator::dimension() const { - return impl_->dimension; -} +size_t RecordRotator::dimension() const { return impl_->dimension; } -size_t RecordRotator::padded_dim() const { - return impl_->padded_dim; -} +size_t RecordRotator::padded_dim() const { return impl_->padded_dim; } -RecordRotatorType RecordRotator::rotator_type() const { - return impl_->type; -} +RecordRotatorType RecordRotator::rotator_type() const { return impl_->type; } bool RecordRotator::initialized() const { - return impl_->rotator != nullptr; + return impl_->fht_impl != nullptr || impl_->mat_impl != nullptr; } } // namespace core diff --git a/src/core/quantizer/record_rotator.h b/src/core/quantizer/record_rotator.h index 83e9aa7e9..e2c9440af 100644 --- a/src/core/quantizer/record_rotator.h +++ b/src/core/quantizer/record_rotator.h @@ -32,13 +32,13 @@ enum class RecordRotatorType : uint8_t { Matrix = 1, //!< O(d^2) 
explicit random matrix rotation }; -/*! RecordRotator wraps rabitqlib::Rotator for per-vector rotation. +/*! RecordRotator provides per-vector rotation without external dependencies. * - * All rabitqlib types are hidden behind a pimpl to avoid leaking - * rabitqlib headers to consumers of this class. + * All rotation algorithms are implemented inline (FHT-based Kac walk and + * explicit random matrix), so no rabitqlib headers are required. * * Provides O(d log d) fast rotation (FHT-based Kac random rotation), - * as well as serialization (save/load) of the rotation matrix. + * as well as serialization (save/load) of the rotation parameters. * Used by IntegerStreamingConverter/Reformer when enable_rotate is true. */ class RecordRotator { @@ -93,7 +93,7 @@ class RecordRotator { const std::string &seg_id = RECORD_ROTATOR_SEG_ID) const; //! Dump the rotator to an IndexDumper as a named segment. - //! Format: [Header: type(1B)|origin_dim(4B)|padded_dim(4B)] [rabitqlib blob] + //! Format: [Header: type(1B)|pad(3B)|origin_dim(4B)|padded_dim(4B)] [rotation blob] //! Appends padding for 32-byte alignment. 
int dump(const IndexDumper::Pointer &dumper, const std::string &seg_id = RECORD_ROTATOR_SEG_ID) const; diff --git a/src/core/utility/fht_avx.hpp b/src/core/utility/fht_avx.hpp new file mode 100644 index 000000000..310b6f96f --- /dev/null +++ b/src/core/utility/fht_avx.hpp @@ -0,0 +1,19698 @@ +// https://github.com/FALCONN-LIB/FFHT + +// The MIT License (MIT) + +// Copyright (c) 2015 Alexandr Andoni, Piotr Indyk, Thijs Laarhoven, +// Ilya Razenshteyn, Ludwig Schmidt + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ + +#pragma once + +inline void helper_float_1(float *buf); +inline void helper_float_1(float *buf) { + for (int j = 0; j < 2; j += 2) { + for (int k = 0; k < 1; ++k) { + float u = buf[j + k]; + float v = buf[j + k + 1]; + buf[j + k] = u + v; + buf[j + k + 1] = u - v; + } + } +} +inline void helper_float_2(float *buf); +inline void helper_float_2(float *buf) { + for (int j = 0; j < 4; j += 2) { + for (int k = 0; k < 1; ++k) { + float u = buf[j + k]; + float v = buf[j + k + 1]; + buf[j + k] = u + v; + buf[j + k + 1] = u - v; + } + } + for (int j = 0; j < 4; j += 4) { + for (int k = 0; k < 2; ++k) { + float u = buf[j + k]; + float v = buf[j + k + 2]; + buf[j + k] = u + v; + buf[j + k + 2] = u - v; + } + } +} +inline void helper_float_3(float *buf); +inline void helper_float_3(float *buf) { + for (int j = 0; j < 8; j += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vmovups %%ymm0, (%0)\n" + :: "r"(buf + j) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } +} +inline void helper_float_4(float *buf); +inline void helper_float_4(float *buf) { + for (int j = 0; j < 16; j += 16) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + 
"vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +inline void helper_float_5(float *buf); +inline void helper_float_5(float *buf) { + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + 
"vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, 
%%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +inline void helper_float_6(float *buf); +inline void helper_float_6(float *buf) { + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps 
$160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps 
%%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, 
%%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +inline void helper_float_7_recursive(float *buf, int depth); +inline void helper_float_7_recursive(float 
*buf, int depth) { + if (depth == 7) { + for (int j = 0; j < 128; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps 
%%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, 
%%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps 
// -----------------------------------------------------------------------------
// Shared AVX building blocks for helper_float_8 / helper_float_9 /
// helper_float_10.
//
// Every routine below works in place on unaligned float data (all memory
// accesses use vmovups) and requires a CPU with AVX. Each butterfly maps a
// pair (a, b) to (a + b, a - b) with the sum stored at the lower address; no
// normalisation factor is applied.
// -----------------------------------------------------------------------------

// In-place, unnormalised 64-point Walsh-Hadamard transform of p[0..63].
//
// The 64 floats are loaded into ymm0..ymm7 (8 floats each). Three stages run
// inside each register (butterfly distance 1, 2, 4), then three stages run
// across registers (distance 8, 16, 32). Results are written back to the same
// 64 floats.
inline void helper_float_wht64(float *p) {
  __asm__ volatile (
    "vmovups (%0), %%ymm0\n"
    "vmovups (%1), %%ymm1\n"
    "vmovups (%2), %%ymm2\n"
    "vmovups (%3), %%ymm3\n"
    "vmovups (%4), %%ymm4\n"
    "vmovups (%5), %%ymm5\n"
    "vmovups (%6), %%ymm6\n"
    "vmovups (%7), %%ymm7\n"
    // Distance 1: duplicate even / odd elements, negate the odd copy, and let
    // vaddsubps produce (a0 + a1, a0 - a1, a2 + a3, a2 - a3, ...).
    "vpermilps $160, %%ymm0, %%ymm8\n"
    "vpermilps $245, %%ymm0, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
    "vpermilps $160, %%ymm1, %%ymm8\n"
    "vpermilps $245, %%ymm1, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
    "vpermilps $160, %%ymm2, %%ymm8\n"
    "vpermilps $245, %%ymm2, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
    "vpermilps $160, %%ymm3, %%ymm8\n"
    "vpermilps $245, %%ymm3, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
    "vpermilps $160, %%ymm4, %%ymm8\n"
    "vpermilps $245, %%ymm4, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
    "vpermilps $160, %%ymm5, %%ymm8\n"
    "vpermilps $245, %%ymm5, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
    "vpermilps $160, %%ymm6, %%ymm8\n"
    "vpermilps $245, %%ymm6, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
    "vpermilps $160, %%ymm7, %%ymm8\n"
    "vpermilps $245, %%ymm7, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
    // Distance 2: pair up 64-bit halves of each 128-bit lane; the blend picks
    // the negated copy for the upper half so a single add yields sum / diff.
    "vpermilps $68, %%ymm0, %%ymm8\n"
    "vpermilps $238, %%ymm0, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
    "vaddps %%ymm8, %%ymm12, %%ymm0\n"
    "vpermilps $68, %%ymm1, %%ymm8\n"
    "vpermilps $238, %%ymm1, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
    "vaddps %%ymm8, %%ymm12, %%ymm1\n"
    "vpermilps $68, %%ymm2, %%ymm8\n"
    "vpermilps $238, %%ymm2, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
    "vaddps %%ymm8, %%ymm12, %%ymm2\n"
    "vpermilps $68, %%ymm3, %%ymm8\n"
    "vpermilps $238, %%ymm3, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
    "vaddps %%ymm8, %%ymm12, %%ymm3\n"
    "vpermilps $68, %%ymm4, %%ymm8\n"
    "vpermilps $238, %%ymm4, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
    "vaddps %%ymm8, %%ymm12, %%ymm4\n"
    "vpermilps $68, %%ymm5, %%ymm8\n"
    "vpermilps $238, %%ymm5, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
    "vaddps %%ymm8, %%ymm12, %%ymm5\n"
    "vpermilps $68, %%ymm6, %%ymm8\n"
    "vpermilps $238, %%ymm6, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
    "vaddps %%ymm8, %%ymm12, %%ymm6\n"
    "vpermilps $68, %%ymm7, %%ymm8\n"
    "vpermilps $238, %%ymm7, %%ymm9\n"
    "vxorps %%ymm10, %%ymm10, %%ymm10\n"
    "vsubps %%ymm9, %%ymm10, %%ymm11\n"
    "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
    "vaddps %%ymm8, %%ymm12, %%ymm7\n"
    // Distance 4: combine the low and high 128-bit lanes of each register:
    // (lo, lo) + (hi, -hi) = (lo + hi, lo - hi).
    "vxorps %%ymm8, %%ymm8, %%ymm8\n"
    "vsubps %%ymm0, %%ymm8, %%ymm9\n"
    "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
    "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
    "vaddps %%ymm10, %%ymm11, %%ymm0\n"
    "vxorps %%ymm8, %%ymm8, %%ymm8\n"
    "vsubps %%ymm1, %%ymm8, %%ymm9\n"
    "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
    "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
    "vaddps %%ymm10, %%ymm11, %%ymm1\n"
    "vxorps %%ymm8, %%ymm8, %%ymm8\n"
    "vsubps %%ymm2, %%ymm8, %%ymm9\n"
    "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
    "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
    "vaddps %%ymm10, %%ymm11, %%ymm2\n"
    "vxorps %%ymm8, %%ymm8, %%ymm8\n"
    "vsubps %%ymm3, %%ymm8, %%ymm9\n"
    "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
    "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
    "vaddps %%ymm10, %%ymm11, %%ymm3\n"
    "vxorps %%ymm8, %%ymm8, %%ymm8\n"
    "vsubps %%ymm4, %%ymm8, %%ymm9\n"
    "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
    "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
    "vaddps %%ymm10, %%ymm11, %%ymm4\n"
    "vxorps %%ymm8, %%ymm8, %%ymm8\n"
    "vsubps %%ymm5, %%ymm8, %%ymm9\n"
    "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
    "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
    "vaddps %%ymm10, %%ymm11, %%ymm5\n"
    "vxorps %%ymm8, %%ymm8, %%ymm8\n"
    "vsubps %%ymm6, %%ymm8, %%ymm9\n"
    "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
    "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
    "vaddps %%ymm10, %%ymm11, %%ymm6\n"
    "vxorps %%ymm8, %%ymm8, %%ymm8\n"
    "vsubps %%ymm7, %%ymm8, %%ymm9\n"
    "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
    "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
    "vaddps %%ymm10, %%ymm11, %%ymm7\n"
    // Distance 8, 16, 32: butterflies between whole registers.
    "vaddps %%ymm1, %%ymm0, %%ymm8\n"
    "vsubps %%ymm1, %%ymm0, %%ymm9\n"
    "vaddps %%ymm3, %%ymm2, %%ymm10\n"
    "vsubps %%ymm3, %%ymm2, %%ymm11\n"
    "vaddps %%ymm5, %%ymm4, %%ymm12\n"
    "vsubps %%ymm5, %%ymm4, %%ymm13\n"
    "vaddps %%ymm7, %%ymm6, %%ymm14\n"
    "vsubps %%ymm7, %%ymm6, %%ymm15\n"
    "vaddps %%ymm10, %%ymm8, %%ymm0\n"
    "vsubps %%ymm10, %%ymm8, %%ymm2\n"
    "vaddps %%ymm11, %%ymm9, %%ymm1\n"
    "vsubps %%ymm11, %%ymm9, %%ymm3\n"
    "vaddps %%ymm14, %%ymm12, %%ymm4\n"
    "vsubps %%ymm14, %%ymm12, %%ymm6\n"
    "vaddps %%ymm15, %%ymm13, %%ymm5\n"
    "vsubps %%ymm15, %%ymm13, %%ymm7\n"
    "vaddps %%ymm4, %%ymm0, %%ymm8\n"
    "vsubps %%ymm4, %%ymm0, %%ymm12\n"
    "vaddps %%ymm5, %%ymm1, %%ymm9\n"
    "vsubps %%ymm5, %%ymm1, %%ymm13\n"
    "vaddps %%ymm6, %%ymm2, %%ymm10\n"
    "vsubps %%ymm6, %%ymm2, %%ymm14\n"
    "vaddps %%ymm7, %%ymm3, %%ymm11\n"
    "vsubps %%ymm7, %%ymm3, %%ymm15\n"
    "vmovups %%ymm8, (%0)\n"
    "vmovups %%ymm9, (%1)\n"
    "vmovups %%ymm10, (%2)\n"
    "vmovups %%ymm11, (%3)\n"
    "vmovups %%ymm12, (%4)\n"
    "vmovups %%ymm13, (%5)\n"
    "vmovups %%ymm14, (%6)\n"
    "vmovups %%ymm15, (%7)\n"
    :: "r"(p + 0), "r"(p + 8), "r"(p + 16), "r"(p + 24), "r"(p + 32), "r"(p + 40), "r"(p + 48), "r"(p + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
  );
}

// Three butterfly stages (radix-8) over eight 8-float vectors located at
// p, p + stride, ..., p + 7 * stride. Reads and writes 8 floats at each of
// those eight addresses.
inline void helper_float_radix8(float *p, int stride) {
  __asm__ volatile (
    "vmovups (%0), %%ymm0\n"
    "vmovups (%1), %%ymm1\n"
    "vmovups (%2), %%ymm2\n"
    "vmovups (%3), %%ymm3\n"
    "vmovups (%4), %%ymm4\n"
    "vmovups (%5), %%ymm5\n"
    "vmovups (%6), %%ymm6\n"
    "vmovups (%7), %%ymm7\n"
    "vaddps %%ymm1, %%ymm0, %%ymm8\n"
    "vsubps %%ymm1, %%ymm0, %%ymm9\n"
    "vaddps %%ymm3, %%ymm2, %%ymm10\n"
    "vsubps %%ymm3, %%ymm2, %%ymm11\n"
    "vaddps %%ymm5, %%ymm4, %%ymm12\n"
    "vsubps %%ymm5, %%ymm4, %%ymm13\n"
    "vaddps %%ymm7, %%ymm6, %%ymm14\n"
    "vsubps %%ymm7, %%ymm6, %%ymm15\n"
    "vaddps %%ymm10, %%ymm8, %%ymm0\n"
    "vsubps %%ymm10, %%ymm8, %%ymm2\n"
    "vaddps %%ymm11, %%ymm9, %%ymm1\n"
    "vsubps %%ymm11, %%ymm9, %%ymm3\n"
    "vaddps %%ymm14, %%ymm12, %%ymm4\n"
    "vsubps %%ymm14, %%ymm12, %%ymm6\n"
    "vaddps %%ymm15, %%ymm13, %%ymm5\n"
    "vsubps %%ymm15, %%ymm13, %%ymm7\n"
    "vaddps %%ymm4, %%ymm0, %%ymm8\n"
    "vsubps %%ymm4, %%ymm0, %%ymm12\n"
    "vaddps %%ymm5, %%ymm1, %%ymm9\n"
    "vsubps %%ymm5, %%ymm1, %%ymm13\n"
    "vaddps %%ymm6, %%ymm2, %%ymm10\n"
    "vsubps %%ymm6, %%ymm2, %%ymm14\n"
    "vaddps %%ymm7, %%ymm3, %%ymm11\n"
    "vsubps %%ymm7, %%ymm3, %%ymm15\n"
    "vmovups %%ymm8, (%0)\n"
    "vmovups %%ymm9, (%1)\n"
    "vmovups %%ymm10, (%2)\n"
    "vmovups %%ymm11, (%3)\n"
    "vmovups %%ymm12, (%4)\n"
    "vmovups %%ymm13, (%5)\n"
    "vmovups %%ymm14, (%6)\n"
    "vmovups %%ymm15, (%7)\n"
    :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride), "r"(p + 4 * stride), "r"(p + 5 * stride), "r"(p + 6 * stride), "r"(p + 7 * stride) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
  );
}

// Two butterfly stages (radix-4) over four 8-float vectors located at
// p, p + stride, p + 2 * stride and p + 3 * stride.
inline void helper_float_radix4(float *p, int stride) {
  __asm__ volatile (
    "vmovups (%0), %%ymm0\n"
    "vmovups (%1), %%ymm1\n"
    "vmovups (%2), %%ymm2\n"
    "vmovups (%3), %%ymm3\n"
    "vaddps %%ymm1, %%ymm0, %%ymm8\n"
    "vsubps %%ymm1, %%ymm0, %%ymm9\n"
    "vaddps %%ymm3, %%ymm2, %%ymm10\n"
    "vsubps %%ymm3, %%ymm2, %%ymm11\n"
    "vaddps %%ymm10, %%ymm8, %%ymm0\n"
    "vsubps %%ymm10, %%ymm8, %%ymm2\n"
    "vaddps %%ymm11, %%ymm9, %%ymm1\n"
    "vsubps %%ymm11, %%ymm9, %%ymm3\n"
    "vmovups %%ymm0, (%0)\n"
    "vmovups %%ymm1, (%1)\n"
    "vmovups %%ymm2, (%2)\n"
    "vmovups %%ymm3, (%3)\n"
    :: "r"(p), "r"(p + stride), "r"(p + 2 * stride), "r"(p + 3 * stride) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
  );
}

// One butterfly stage (radix-2) between the 8-float vectors at lo and hi:
// lo[i] <- lo[i] + hi[i], hi[i] <- lo[i] - hi[i] for i in 0..7.
inline void helper_float_radix2(float *lo, float *hi) {
  __asm__ volatile (
    "vmovups (%0), %%ymm0\n"
    "vmovups (%1), %%ymm1\n"
    "vaddps %%ymm1, %%ymm0, %%ymm8\n"
    "vsubps %%ymm1, %%ymm0, %%ymm9\n"
    "vmovups %%ymm8, (%0)\n"
    "vmovups %%ymm9, (%1)\n"
    :: "r"(lo), "r"(hi) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
  );
}

// Redundant redeclaration, in the same style as the declarations this file
// places before every definition. The definition itself is expected to sit
// directly above this point in the file.
inline void helper_float_7_recursive(float *buf, int depth);

// Entry point for the 2^7 = 128 float case; forwards to the recursive
// implementation with depth 7.
inline void helper_float_7(float *buf);
inline void helper_float_7(float *buf) {
  helper_float_7_recursive(buf, 7);
}

// In-place transform of 2^depth floats starting at buf.
//
// Only depth 6 (64 floats) and depth 8 (256 floats) are implemented; any other
// depth returns without touching buf.
inline void helper_float_8_recursive(float *buf, int depth);
inline void helper_float_8_recursive(float *buf, int depth) {
  if (depth == 6) {
    helper_float_wht64(buf);
    return;
  }
  if (depth == 8) {
    // Transform the four 64-float quarters independently ...
    for (int quarter = 0; quarter < 256; quarter += 64) {
      helper_float_8_recursive(buf + quarter, 6);
    }
    // ... then merge them with two butterfly stages at stride 64.
    for (int k = 0; k < 64; k += 8) {
      helper_float_radix4(buf + k, 64);
    }
    return;
  }
}

// In-place transform of buf[0..255].
inline void helper_float_8(float *buf);
inline void helper_float_8(float *buf) {
  helper_float_8_recursive(buf, 8);
}

// In-place transform of buf[0..511]: eight independent 64-float transforms,
// merged by three butterfly stages at stride 64.
inline void helper_float_9(float *buf);
inline void helper_float_9(float *buf) {
  for (int j = 0; j < 512; j += 64) {
    helper_float_wht64(buf + j);
  }
  for (int k = 0; k < 64; k += 8) {
    helper_float_radix8(buf + k, 64);
  }
}

// In-place transform of buf[0..1023].
//
// Only depth 10 is implemented (the function does not actually recurse); any
// other depth returns without touching buf.
inline void helper_float_10_recursive(float *buf, int depth);
inline void helper_float_10_recursive(float *buf, int depth) {
  if (depth == 10) {
    // Sixteen independent 64-float transforms.
    for (int j = 0; j < 1024; j += 64) {
      helper_float_wht64(buf + j);
    }
    // Three stages at stride 64 inside each 512-float half.
    for (int j = 0; j < 1024; j += 512) {
      for (int k = 0; k < 64; k += 8) {
        helper_float_radix8(buf + j + k, 64);
      }
    }
    // Final stage combines the two halves.
    for (int k = 0; k < 512; k += 8) {
      helper_float_radix2(buf + k, buf + k + 512);
    }
    return;
  }
}

// In-place transform of buf[0..1023].
inline void helper_float_10(float *buf);
inline void helper_float_10(float *buf) {
  helper_float_10_recursive(buf, 10);
}
%%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, 
%%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, 
%%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, 
%%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + 
"vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_11(float *buf); +inline void helper_float_11(float *buf) { + helper_float_11_recursive(buf, 11); +} +inline void helper_float_12(float *buf); +inline void helper_float_12(float *buf) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + 
"vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + 
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, 
%%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), 
"r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", 
"%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +inline void helper_float_13_recursive(float *buf, int depth); +inline 
void helper_float_13_recursive(float *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps 
%%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, 
%%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps 
%%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + 
"vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 13) { + helper_float_13_recursive(buf + 0, 11); + helper_float_13_recursive(buf + 2048, 11); + helper_float_13_recursive(buf + 4096, 11); + helper_float_13_recursive(buf + 6144, 11); + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 2048; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" 
+ "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_13(float *buf); +inline void helper_float_13(float *buf) { + helper_float_13_recursive(buf, 13); +} +inline void helper_float_14_recursive(float *buf, int depth); +inline void helper_float_14_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, 
%%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, 
%%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, 
%%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups 
(%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, 
%%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_float_14_recursive(buf + 0, 12); + helper_float_14_recursive(buf + 4096, 12); + helper_float_14_recursive(buf + 8192, 12); + helper_float_14_recursive(buf + 12288, 12); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" 
+ "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_14(float *buf); +inline void helper_float_14(float *buf) { + helper_float_14_recursive(buf, 14); +} +inline void helper_float_15_recursive(float *buf, int depth); +inline void helper_float_15_recursive(float *buf, int depth) { + if (depth == 13) { + for (int j = 0; j < 8192; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, 
%%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + 
"vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + 
"vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, 
%%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps 
%%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_15_recursive(buf + 0, 13); + helper_float_15_recursive(buf + 8192, 13); + helper_float_15_recursive(buf + 16384, 13); + helper_float_15_recursive(buf + 24576, 13); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 8192; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), 
%%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_15(float *buf); +inline void helper_float_15(float *buf) { + helper_float_15_recursive(buf, 15); +} +inline void helper_float_16_recursive(float *buf, int depth); +inline void helper_float_16_recursive(float *buf, int depth) { + if (depth == 13) { + for (int j = 0; j < 8192; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, 
%%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, 
%%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + 
"vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), 
%%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, 
%%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_float_16_recursive(buf + 0, 13); + 
/* Remaining depth-13 sub-transforms of the depth-16 stage: one call per 8192-float chunk at offsets 8192 through 57344. */helper_float_16_recursive(buf + 8192, 13); + helper_float_16_recursive(buf + 16384, 13); + helper_float_16_recursive(buf + 24576, 13); + helper_float_16_recursive(buf + 32768, 13); + helper_float_16_recursive(buf + 40960, 13); + helper_float_16_recursive(buf + 49152, 13); + helper_float_16_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) {/* Single iteration of j; for each k the eight chunks are merged by three add/sub levels (chunk strides 8192, 16384, 32768) and written back in place. */ + for (int k = 0; k < 8192; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", 
"%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_16(float *buf); +inline void helper_float_16(float *buf) { + helper_float_16_recursive(buf, 16); +} +inline void helper_float_17_recursive(float *buf, int depth); +inline void helper_float_17_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + 
"vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, 
%%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + 
"vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, 
%%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps 
%%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_17_recursive(buf + 0, 12); + helper_float_17_recursive(buf + 4096, 12); + helper_float_17_recursive(buf + 8192, 12); + helper_float_17_recursive(buf + 12288, 12); + helper_float_17_recursive(buf + 16384, 12); + helper_float_17_recursive(buf + 20480, 12); + helper_float_17_recursive(buf + 24576, 12); + helper_float_17_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, 
%%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_float_17_recursive(buf + 0, 15); + helper_float_17_recursive(buf + 32768, 15); + helper_float_17_recursive(buf + 65536, 15); + helper_float_17_recursive(buf + 98304, 15); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", 
"%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_17(float *buf); +inline void helper_float_17(float *buf) { + helper_float_17_recursive(buf, 17); +} +inline void helper_float_18_recursive(float *buf, int depth); +inline void helper_float_18_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + 
"vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, 
%%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + 
"vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, 
%%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps 
%%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_18_recursive(buf + 0, 12); + helper_float_18_recursive(buf + 4096, 12); + helper_float_18_recursive(buf + 8192, 12); + helper_float_18_recursive(buf + 12288, 12); + helper_float_18_recursive(buf + 16384, 12); + helper_float_18_recursive(buf + 20480, 12); + helper_float_18_recursive(buf + 24576, 12); + helper_float_18_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, 
%%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_18_recursive(buf + 0, 15); + helper_float_18_recursive(buf + 32768, 15); + helper_float_18_recursive(buf + 65536, 15); + helper_float_18_recursive(buf + 98304, 15); + helper_float_18_recursive(buf + 131072, 15); + helper_float_18_recursive(buf + 163840, 15); + helper_float_18_recursive(buf + 196608, 15); + helper_float_18_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, 
%%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_18(float *buf); +inline void helper_float_18(float *buf) { + helper_float_18_recursive(buf, 18); +} +inline void helper_float_19_recursive(float *buf, int depth); +inline void helper_float_19_recursive(float *buf, int depth) { + if (depth == 13) { + for (int j = 0; j < 8192; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, 
%%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, 
%%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, 
%%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k 
+ 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", 
"%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 
4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_float_19_recursive(buf + 0, 13); + helper_float_19_recursive(buf + 8192, 13); + helper_float_19_recursive(buf + 16384, 13); + helper_float_19_recursive(buf + 24576, 13); + helper_float_19_recursive(buf + 32768, 13); + helper_float_19_recursive(buf + 40960, 13); + helper_float_19_recursive(buf + 49152, 13); + helper_float_19_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, 
%%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_float_19_recursive(buf + 0, 16); + helper_float_19_recursive(buf + 65536, 16); + helper_float_19_recursive(buf + 131072, 16); + helper_float_19_recursive(buf + 196608, 16); + helper_float_19_recursive(buf + 262144, 16); + helper_float_19_recursive(buf + 327680, 16); + helper_float_19_recursive(buf + 393216, 16); + helper_float_19_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, 
%%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_19(float *buf); +inline void helper_float_19(float *buf) { + helper_float_19_recursive(buf, 19); +} +inline void helper_float_20_recursive(float *buf, int depth); +inline void helper_float_20_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, 
%%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps 
$238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, 
%%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 
512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + 
"vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_20_recursive(buf + 0, 12); + helper_float_20_recursive(buf + 4096, 12); + helper_float_20_recursive(buf + 8192, 12); + helper_float_20_recursive(buf + 12288, 12); + helper_float_20_recursive(buf + 16384, 12); + helper_float_20_recursive(buf + 20480, 12); + helper_float_20_recursive(buf + 24576, 12); + helper_float_20_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 
32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_20_recursive(buf + 0, 15); + helper_float_20_recursive(buf + 32768, 15); + helper_float_20_recursive(buf + 65536, 15); + helper_float_20_recursive(buf + 98304, 15); + 
helper_float_20_recursive(buf + 131072, 15); + helper_float_20_recursive(buf + 163840, 15); + helper_float_20_recursive(buf + 196608, 15); + helper_float_20_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } 
+ } + return; + } + if (depth == 20) { + helper_float_20_recursive(buf + 0, 18); + helper_float_20_recursive(buf + 262144, 18); + helper_float_20_recursive(buf + 524288, 18); + helper_float_20_recursive(buf + 786432, 18); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_20(float *buf); +inline void helper_float_20(float *buf) { + helper_float_20_recursive(buf, 20); +} +inline void helper_float_21_recursive(float *buf, int depth); +inline void helper_float_21_recursive(float *buf, int depth) { + if (depth == 9) { + for (int j = 0; j < 512; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps 
%%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + 
"vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, 
%%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), 
"r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); 
+ } + } + return; + } + if (depth == 12) { + helper_float_21_recursive(buf + 0, 9); + helper_float_21_recursive(buf + 512, 9); + helper_float_21_recursive(buf + 1024, 9); + helper_float_21_recursive(buf + 1536, 9); + helper_float_21_recursive(buf + 2048, 9); + helper_float_21_recursive(buf + 2560, 9); + helper_float_21_recursive(buf + 3072, 9); + helper_float_21_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf 
+ j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_21_recursive(buf + 0, 12); + helper_float_21_recursive(buf + 4096, 12); + helper_float_21_recursive(buf + 8192, 12); + helper_float_21_recursive(buf + 12288, 12); + helper_float_21_recursive(buf + 16384, 12); + helper_float_21_recursive(buf + 20480, 12); + helper_float_21_recursive(buf + 24576, 12); + helper_float_21_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups 
%%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_21_recursive(buf + 0, 15); + helper_float_21_recursive(buf + 32768, 15); + helper_float_21_recursive(buf + 65536, 15); + helper_float_21_recursive(buf + 98304, 15); + helper_float_21_recursive(buf + 131072, 15); + helper_float_21_recursive(buf + 163840, 15); + helper_float_21_recursive(buf + 196608, 15); + helper_float_21_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, 
%%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_21_recursive(buf + 0, 18); + helper_float_21_recursive(buf + 262144, 18); + helper_float_21_recursive(buf + 524288, 18); + helper_float_21_recursive(buf + 786432, 18); + helper_float_21_recursive(buf + 1048576, 18); + helper_float_21_recursive(buf + 1310720, 18); + helper_float_21_recursive(buf + 1572864, 18); + helper_float_21_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps 
%%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_21(float *buf); +inline void helper_float_21(float *buf) { + helper_float_21_recursive(buf, 21); +} +inline void helper_float_22_recursive(float *buf, int depth); +inline void helper_float_22_recursive(float *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + 
"vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + 
"vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, 
%%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 512) { + for (int k = 0; k < 
64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, 
%%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_float_22_recursive(buf + 0, 11); + helper_float_22_recursive(buf + 2048, 11); + helper_float_22_recursive(buf + 4096, 11); + helper_float_22_recursive(buf + 6144, 11); + helper_float_22_recursive(buf + 8192, 11); + helper_float_22_recursive(buf + 10240, 11); + helper_float_22_recursive(buf + 12288, 11); + helper_float_22_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + 
"vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_float_22_recursive(buf + 0, 14); + helper_float_22_recursive(buf + 16384, 14); + helper_float_22_recursive(buf + 32768, 14); + helper_float_22_recursive(buf + 49152, 14); + helper_float_22_recursive(buf + 65536, 14); + helper_float_22_recursive(buf + 81920, 14); + helper_float_22_recursive(buf + 98304, 14); + helper_float_22_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps 
%%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_float_22_recursive(buf + 0, 17); + helper_float_22_recursive(buf + 131072, 17); + helper_float_22_recursive(buf + 262144, 17); + helper_float_22_recursive(buf + 393216, 17); + helper_float_22_recursive(buf + 524288, 17); + helper_float_22_recursive(buf + 655360, 17); + helper_float_22_recursive(buf + 786432, 17); + helper_float_22_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + 
"vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 22) { + helper_float_22_recursive(buf + 0, 20); + helper_float_22_recursive(buf + 1048576, 20); + helper_float_22_recursive(buf + 2097152, 20); + helper_float_22_recursive(buf + 3145728, 20); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 1048576; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, 
%%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_22(float *buf); +inline void helper_float_22(float *buf) { + helper_float_22_recursive(buf, 22); +} +inline void helper_float_23_recursive(float *buf, int depth); +inline void helper_float_23_recursive(float *buf, int depth) { + if (depth == 9) { + for (int j = 0; j < 512; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + 
"vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, 
%%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, 
%%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps 
%%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_float_23_recursive(buf + 0, 9); + helper_float_23_recursive(buf + 512, 9); + helper_float_23_recursive(buf + 1024, 9); + helper_float_23_recursive(buf + 1536, 9); + helper_float_23_recursive(buf + 2048, 9); + helper_float_23_recursive(buf + 2560, 9); + helper_float_23_recursive(buf + 3072, 9); + helper_float_23_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, 
%%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_23_recursive(buf + 0, 12); + helper_float_23_recursive(buf + 4096, 12); + helper_float_23_recursive(buf + 8192, 12); + helper_float_23_recursive(buf + 12288, 12); + helper_float_23_recursive(buf + 16384, 12); + helper_float_23_recursive(buf + 20480, 12); + helper_float_23_recursive(buf + 24576, 12); + helper_float_23_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + 
"vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_23_recursive(buf + 0, 15); + helper_float_23_recursive(buf + 32768, 15); + helper_float_23_recursive(buf + 65536, 15); + helper_float_23_recursive(buf + 98304, 15); + helper_float_23_recursive(buf + 131072, 15); + helper_float_23_recursive(buf + 163840, 15); + 
helper_float_23_recursive(buf + 196608, 15); + helper_float_23_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_23_recursive(buf + 0, 18); + 
helper_float_23_recursive(buf + 262144, 18); + helper_float_23_recursive(buf + 524288, 18); + helper_float_23_recursive(buf + 786432, 18); + helper_float_23_recursive(buf + 1048576, 18); + helper_float_23_recursive(buf + 1310720, 18); + helper_float_23_recursive(buf + 1572864, 18); + helper_float_23_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", 
"%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_float_23_recursive(buf + 0, 21); + helper_float_23_recursive(buf + 2097152, 21); + helper_float_23_recursive(buf + 4194304, 21); + helper_float_23_recursive(buf + 6291456, 21); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 2097152; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_23(float *buf); +inline void helper_float_23(float *buf) { + helper_float_23_recursive(buf, 23); +} +inline void helper_float_24_recursive(float *buf, int depth); +inline void helper_float_24_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, 
%%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, 
%%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, 
%%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups 
%%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : 
"%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 
15) { + helper_float_24_recursive(buf + 0, 12); + helper_float_24_recursive(buf + 4096, 12); + helper_float_24_recursive(buf + 8192, 12); + helper_float_24_recursive(buf + 12288, 12); + helper_float_24_recursive(buf + 16384, 12); + helper_float_24_recursive(buf + 20480, 12); + helper_float_24_recursive(buf + 24576, 12); + helper_float_24_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 
28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_24_recursive(buf + 0, 15); + helper_float_24_recursive(buf + 32768, 15); + helper_float_24_recursive(buf + 65536, 15); + helper_float_24_recursive(buf + 98304, 15); + helper_float_24_recursive(buf + 131072, 15); + helper_float_24_recursive(buf + 163840, 15); + helper_float_24_recursive(buf + 196608, 15); + helper_float_24_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups 
%%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_24_recursive(buf + 0, 18); + helper_float_24_recursive(buf + 262144, 18); + helper_float_24_recursive(buf + 524288, 18); + helper_float_24_recursive(buf + 786432, 18); + helper_float_24_recursive(buf + 1048576, 18); + helper_float_24_recursive(buf + 1310720, 18); + helper_float_24_recursive(buf + 1572864, 18); + helper_float_24_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps 
%%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_float_24_recursive(buf + 0, 21); + helper_float_24_recursive(buf + 2097152, 21); + helper_float_24_recursive(buf + 4194304, 21); + helper_float_24_recursive(buf + 6291456, 21); + helper_float_24_recursive(buf + 8388608, 21); + helper_float_24_recursive(buf + 10485760, 21); + helper_float_24_recursive(buf + 12582912, 21); + helper_float_24_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, 
%%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_24(float *buf); +inline void helper_float_24(float *buf) { + helper_float_24_recursive(buf, 24); +} +inline void helper_float_25_recursive(float *buf, int depth); +inline void helper_float_25_recursive(float *buf, int depth) { + if (depth == 7) { + for (int j = 0; j < 128; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + 
"vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" 
+ "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, 
%%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 
128; j += 128) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 10) { + helper_float_25_recursive(buf + 0, 7); + helper_float_25_recursive(buf + 128, 7); + helper_float_25_recursive(buf + 256, 7); + helper_float_25_recursive(buf + 384, 7); + helper_float_25_recursive(buf + 512, 7); + helper_float_25_recursive(buf + 640, 7); + helper_float_25_recursive(buf + 768, 7); + helper_float_25_recursive(buf + 896, 7); + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 128; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps 
%%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 13) { + helper_float_25_recursive(buf + 0, 10); + helper_float_25_recursive(buf + 1024, 10); + helper_float_25_recursive(buf + 2048, 10); + helper_float_25_recursive(buf + 3072, 10); + helper_float_25_recursive(buf + 4096, 10); + helper_float_25_recursive(buf + 5120, 10); + helper_float_25_recursive(buf + 6144, 10); + helper_float_25_recursive(buf + 7168, 10); + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 1024; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + 
"vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_float_25_recursive(buf + 0, 13); + helper_float_25_recursive(buf + 8192, 13); + helper_float_25_recursive(buf + 16384, 13); + helper_float_25_recursive(buf + 24576, 13); + helper_float_25_recursive(buf + 32768, 13); + helper_float_25_recursive(buf + 40960, 13); + helper_float_25_recursive(buf + 49152, 13); + helper_float_25_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, 
%%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_float_25_recursive(buf + 0, 16); + helper_float_25_recursive(buf + 65536, 16); + helper_float_25_recursive(buf + 131072, 16); + helper_float_25_recursive(buf + 196608, 16); + helper_float_25_recursive(buf + 262144, 16); + helper_float_25_recursive(buf + 327680, 16); + helper_float_25_recursive(buf + 393216, 16); + helper_float_25_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, 
%%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 22) { + helper_float_25_recursive(buf + 0, 19); + helper_float_25_recursive(buf + 524288, 19); + helper_float_25_recursive(buf + 1048576, 19); + helper_float_25_recursive(buf + 1572864, 19); + helper_float_25_recursive(buf + 2097152, 19); + helper_float_25_recursive(buf + 2621440, 19); + helper_float_25_recursive(buf + 3145728, 19); + helper_float_25_recursive(buf + 3670016, 19); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 524288; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), 
%%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 25) { + helper_float_25_recursive(buf + 0, 22); + helper_float_25_recursive(buf + 4194304, 22); + helper_float_25_recursive(buf + 8388608, 22); + helper_float_25_recursive(buf + 12582912, 22); + helper_float_25_recursive(buf + 16777216, 22); + helper_float_25_recursive(buf + 20971520, 22); + helper_float_25_recursive(buf + 25165824, 22); + helper_float_25_recursive(buf + 29360128, 22); + for (int j = 0; j < 33554432; j += 
33554432) { + for (int k = 0; k < 4194304; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_25(float *buf); +inline void helper_float_25(float *buf) { + helper_float_25_recursive(buf, 25); +} +inline void helper_float_26_recursive(float *buf, int 
depth); +inline void helper_float_26_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, 
%%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, 
%%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, 
%%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps 
%%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, 
(%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_26_recursive(buf + 0, 12); + helper_float_26_recursive(buf + 4096, 12); + helper_float_26_recursive(buf + 8192, 12); + helper_float_26_recursive(buf + 12288, 12); + helper_float_26_recursive(buf + 16384, 12); + helper_float_26_recursive(buf + 20480, 12); + helper_float_26_recursive(buf + 24576, 12); + helper_float_26_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps 
%%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_26_recursive(buf + 0, 15); + helper_float_26_recursive(buf + 32768, 15); + helper_float_26_recursive(buf + 65536, 15); + helper_float_26_recursive(buf + 98304, 15); + helper_float_26_recursive(buf + 131072, 15); + helper_float_26_recursive(buf + 163840, 15); + helper_float_26_recursive(buf + 196608, 15); + helper_float_26_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps 
%%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_26_recursive(buf + 0, 18); + helper_float_26_recursive(buf + 262144, 18); + helper_float_26_recursive(buf + 524288, 18); + helper_float_26_recursive(buf + 786432, 18); + helper_float_26_recursive(buf + 1048576, 18); + helper_float_26_recursive(buf + 1310720, 18); + helper_float_26_recursive(buf + 1572864, 18); + helper_float_26_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + 
"vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_float_26_recursive(buf + 0, 21); + helper_float_26_recursive(buf + 2097152, 21); + helper_float_26_recursive(buf + 4194304, 21); + helper_float_26_recursive(buf + 6291456, 21); + helper_float_26_recursive(buf + 8388608, 21); + helper_float_26_recursive(buf + 10485760, 21); + helper_float_26_recursive(buf + 12582912, 21); + helper_float_26_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps 
%%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 26) { + helper_float_26_recursive(buf + 0, 24); + helper_float_26_recursive(buf + 16777216, 24); + helper_float_26_recursive(buf + 33554432, 24); + helper_float_26_recursive(buf + 50331648, 24); + for (int j = 0; j < 67108864; j += 67108864) { + for (int k = 0; k < 16777216; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, 
%%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +/* helper_float_26: entry point for a buffer of 2^26 floats. It only forwards buf to helper_float_26_recursive with depth 26; the declaration is immediately followed by the definition. */ inline void helper_float_26(float *buf); +inline void helper_float_26(float *buf) { + helper_float_26_recursive(buf, 26); +} +/* helper_float_27_recursive: in-place butterfly pass over buf, selected by depth. depth == 12 is the base case and makes three passes over 4096 floats: in-register shuffles plus add/sub inside each 64-float group, then 8-way add/sub at strides 64 and 512. depth == 15, 18, 21, 24 and 27 each recurse on eight sub-blocks of 2^(depth-3) floats and then merge them with an 8-way add/sub at that stride. Any other depth falls through without touching buf. All memory accesses use vmovups (the unaligned move). NOTE(review): this looks like a generated fast Walsh-Hadamard transform kernel -- confirm against the generator. */ inline void helper_float_27_recursive(float *buf, int depth); +inline void helper_float_27_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, 
%%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, 
%%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, 
%%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, 
%%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps 
%%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_27_recursive(buf + 0, 12); + helper_float_27_recursive(buf + 4096, 12); + helper_float_27_recursive(buf + 8192, 12); + helper_float_27_recursive(buf + 12288, 12); + helper_float_27_recursive(buf + 16384, 12); + helper_float_27_recursive(buf + 20480, 12); + helper_float_27_recursive(buf + 24576, 12); + helper_float_27_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, 
%%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_27_recursive(buf + 0, 15); + helper_float_27_recursive(buf + 32768, 15); + helper_float_27_recursive(buf + 65536, 15); + helper_float_27_recursive(buf + 98304, 15); + helper_float_27_recursive(buf + 131072, 15); + helper_float_27_recursive(buf + 163840, 15); + helper_float_27_recursive(buf + 196608, 15); + helper_float_27_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" 
+ "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_27_recursive(buf + 0, 18); + helper_float_27_recursive(buf + 262144, 18); + helper_float_27_recursive(buf + 524288, 18); + helper_float_27_recursive(buf + 786432, 18); + helper_float_27_recursive(buf + 1048576, 18); + helper_float_27_recursive(buf + 1310720, 18); 
+ helper_float_27_recursive(buf + 1572864, 18); + helper_float_27_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_float_27_recursive(buf + 0, 21); 
+ helper_float_27_recursive(buf + 2097152, 21); + helper_float_27_recursive(buf + 4194304, 21); + helper_float_27_recursive(buf + 6291456, 21); + helper_float_27_recursive(buf + 8388608, 21); + helper_float_27_recursive(buf + 10485760, 21); + helper_float_27_recursive(buf + 12582912, 21); + helper_float_27_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 
14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + /* depth 27: run the depth-24 pass on each of the eight 2^24-float sub-blocks, then merge them with three add/sub stages across the eight streams. */ if (depth == 27) { + helper_float_27_recursive(buf + 0, 24); + helper_float_27_recursive(buf + 16777216, 24); + helper_float_27_recursive(buf + 33554432, 24); + helper_float_27_recursive(buf + 50331648, 24); + helper_float_27_recursive(buf + 67108864, 24); + helper_float_27_recursive(buf + 83886080, 24); + helper_float_27_recursive(buf + 100663296, 24); + helper_float_27_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + 
"vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +/* helper_float_27: entry point for a buffer of 2^27 floats. It only forwards buf to helper_float_27_recursive with depth 27; the declaration is immediately followed by the definition. */ inline void helper_float_27(float *buf); +inline void helper_float_27(float *buf) { + helper_float_27_recursive(buf, 27); +} +inline void helper_float_28_recursive(float *buf, int depth); +inline void helper_float_28_recursive(float *buf, int depth) { + if (depth == 7) { + for (int j = 0; j < 128; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, 
%%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + 
"vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + 
"vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 128; j += 128) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", 
"%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 10) { + helper_float_28_recursive(buf + 0, 7); + helper_float_28_recursive(buf + 128, 7); + helper_float_28_recursive(buf + 256, 7); + helper_float_28_recursive(buf + 384, 7); + helper_float_28_recursive(buf + 512, 7); + helper_float_28_recursive(buf + 640, 7); + helper_float_28_recursive(buf + 768, 7); + helper_float_28_recursive(buf + 896, 7); + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 128; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 
640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 13) { + helper_float_28_recursive(buf + 0, 10); + helper_float_28_recursive(buf + 1024, 10); + helper_float_28_recursive(buf + 2048, 10); + helper_float_28_recursive(buf + 3072, 10); + helper_float_28_recursive(buf + 4096, 10); + helper_float_28_recursive(buf + 5120, 10); + helper_float_28_recursive(buf + 6144, 10); + helper_float_28_recursive(buf + 7168, 10); + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 1024; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups 
%%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_float_28_recursive(buf + 0, 13); + helper_float_28_recursive(buf + 8192, 13); + helper_float_28_recursive(buf + 16384, 13); + helper_float_28_recursive(buf + 24576, 13); + helper_float_28_recursive(buf + 32768, 13); + helper_float_28_recursive(buf + 40960, 13); + helper_float_28_recursive(buf + 49152, 13); + helper_float_28_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps 
%%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_float_28_recursive(buf + 0, 16); + helper_float_28_recursive(buf + 65536, 16); + helper_float_28_recursive(buf + 131072, 16); + helper_float_28_recursive(buf + 196608, 16); + helper_float_28_recursive(buf + 262144, 16); + helper_float_28_recursive(buf + 327680, 16); + helper_float_28_recursive(buf + 393216, 16); + helper_float_28_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps 
%%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 22) { + helper_float_28_recursive(buf + 0, 19); + helper_float_28_recursive(buf + 524288, 19); + helper_float_28_recursive(buf + 1048576, 19); + helper_float_28_recursive(buf + 1572864, 19); + helper_float_28_recursive(buf + 2097152, 19); + helper_float_28_recursive(buf + 2621440, 19); + helper_float_28_recursive(buf + 3145728, 19); + helper_float_28_recursive(buf + 3670016, 19); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 524288; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" 
+ "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 25) { + helper_float_28_recursive(buf + 0, 22); + helper_float_28_recursive(buf + 4194304, 22); + helper_float_28_recursive(buf + 8388608, 22); + helper_float_28_recursive(buf + 12582912, 22); + helper_float_28_recursive(buf + 16777216, 22); + helper_float_28_recursive(buf + 20971520, 22); + helper_float_28_recursive(buf + 25165824, 22); + helper_float_28_recursive(buf + 29360128, 22); + for (int j = 0; j < 33554432; j += 33554432) { + for (int k = 0; k < 4194304; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + 
"vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 28) { + helper_float_28_recursive(buf + 0, 25); + helper_float_28_recursive(buf + 33554432, 25); + helper_float_28_recursive(buf + 67108864, 25); + helper_float_28_recursive(buf + 100663296, 25); + helper_float_28_recursive(buf + 134217728, 25); + helper_float_28_recursive(buf + 167772160, 25); + helper_float_28_recursive(buf + 201326592, 25); + helper_float_28_recursive(buf + 234881024, 25); + for (int j = 0; j < 268435456; j += 268435456) { + for (int k = 0; k < 33554432; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), 
%%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 33554432), "r"(buf + j + k + 67108864), "r"(buf + j + k + 100663296), "r"(buf + j + k + 134217728), "r"(buf + j + k + 167772160), "r"(buf + j + k + 201326592), "r"(buf + j + k + 234881024) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_28(float *buf); +inline void helper_float_28(float *buf) { + helper_float_28_recursive(buf, 28); +} +inline void helper_float_29_recursive(float *buf, int depth); +inline void helper_float_29_recursive(float *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 64) { + for (int k = 0; k < 8; k += 8) 
{ + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" 
+ "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps $238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + 
"vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, 
%%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + 
"vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 
1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_29_recursive(buf + 0, 12); + helper_float_29_recursive(buf + 4096, 12); + helper_float_29_recursive(buf + 8192, 12); + helper_float_29_recursive(buf + 12288, 12); + helper_float_29_recursive(buf + 16384, 12); + helper_float_29_recursive(buf + 20480, 12); + helper_float_29_recursive(buf + 24576, 12); + helper_float_29_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + 
"vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_29_recursive(buf + 0, 15); + helper_float_29_recursive(buf + 32768, 15); + helper_float_29_recursive(buf + 65536, 15); + helper_float_29_recursive(buf + 98304, 15); + helper_float_29_recursive(buf + 131072, 15); + helper_float_29_recursive(buf + 163840, 15); + helper_float_29_recursive(buf + 196608, 15); + helper_float_29_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, 
%%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_29_recursive(buf + 0, 18); + helper_float_29_recursive(buf + 262144, 18); + helper_float_29_recursive(buf + 524288, 18); + helper_float_29_recursive(buf + 786432, 18); + helper_float_29_recursive(buf + 1048576, 18); + helper_float_29_recursive(buf + 1310720, 18); + helper_float_29_recursive(buf + 1572864, 18); + helper_float_29_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps 
%%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_float_29_recursive(buf + 0, 21); + helper_float_29_recursive(buf + 2097152, 21); + helper_float_29_recursive(buf + 4194304, 21); + helper_float_29_recursive(buf + 6291456, 21); + helper_float_29_recursive(buf + 8388608, 21); + helper_float_29_recursive(buf + 10485760, 21); + helper_float_29_recursive(buf + 12582912, 21); + helper_float_29_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, 
%%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_float_29_recursive(buf + 0, 24); + helper_float_29_recursive(buf + 16777216, 24); + helper_float_29_recursive(buf + 33554432, 24); + helper_float_29_recursive(buf + 50331648, 24); + helper_float_29_recursive(buf + 67108864, 24); + helper_float_29_recursive(buf + 83886080, 24); + helper_float_29_recursive(buf + 100663296, 24); + helper_float_29_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps 
%%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 29) { + helper_float_29_recursive(buf + 0, 27); + helper_float_29_recursive(buf + 134217728, 27); + helper_float_29_recursive(buf + 268435456, 27); + helper_float_29_recursive(buf + 402653184, 27); + for (int j = 0; j < 536870912; j += 536870912) { + for (int k = 0; k < 134217728; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + 
"vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vmovups %%ymm0, (%0)\n" + "vmovups %%ymm1, (%1)\n" + "vmovups %%ymm2, (%2)\n" + "vmovups %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_29(float *buf); +inline void helper_float_29(float *buf) { + helper_float_29_recursive(buf, 29); +} +inline void helper_float_30_recursive(float *buf, int depth); +inline void helper_float_30_recursive(float *buf, int depth) { + if (depth == 6) { + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 8; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vpermilps $160, %%ymm0, %%ymm8\n" + "vpermilps $245, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" + "vpermilps $160, %%ymm1, %%ymm8\n" + "vpermilps $245, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" + "vpermilps $160, %%ymm2, %%ymm8\n" + "vpermilps $245, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" + "vpermilps $160, %%ymm3, %%ymm8\n" + "vpermilps $245, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, 
%%ymm8, %%ymm3\n" + "vpermilps $160, %%ymm4, %%ymm8\n" + "vpermilps $245, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" + "vpermilps $160, %%ymm5, %%ymm8\n" + "vpermilps $245, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" + "vpermilps $160, %%ymm6, %%ymm8\n" + "vpermilps $245, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" + "vpermilps $160, %%ymm7, %%ymm8\n" + "vpermilps $245, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" + "vpermilps $68, %%ymm0, %%ymm8\n" + "vpermilps $238, %%ymm0, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm0\n" + "vpermilps $68, %%ymm1, %%ymm8\n" + "vpermilps $238, %%ymm1, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm1\n" + "vpermilps $68, %%ymm2, %%ymm8\n" + "vpermilps $238, %%ymm2, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm2\n" + "vpermilps $68, %%ymm3, %%ymm8\n" + "vpermilps $238, %%ymm3, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm3\n" + "vpermilps $68, %%ymm4, %%ymm8\n" + "vpermilps $238, %%ymm4, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm4\n" + "vpermilps $68, %%ymm5, %%ymm8\n" + "vpermilps 
$238, %%ymm5, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm5\n" + "vpermilps $68, %%ymm6, %%ymm8\n" + "vpermilps $238, %%ymm6, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm6\n" + "vpermilps $68, %%ymm7, %%ymm8\n" + "vpermilps $238, %%ymm7, %%ymm9\n" + "vxorps %%ymm10, %%ymm10, %%ymm10\n" + "vsubps %%ymm9, %%ymm10, %%ymm11\n" + "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" + "vaddps %%ymm8, %%ymm12, %%ymm7\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm0, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm0\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm1, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm1\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm2, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm2\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm3, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm3\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm4, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm4\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm5, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm5\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm6, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" + "vperm2f128 $49, %%ymm9, 
%%ymm6, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm6\n" + "vxorps %%ymm8, %%ymm8, %%ymm8\n" + "vsubps %%ymm7, %%ymm8, %%ymm9\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" + "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" + "vaddps %%ymm10, %%ymm11, %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 9) { + helper_float_30_recursive(buf + 0, 6); + helper_float_30_recursive(buf + 64, 6); + helper_float_30_recursive(buf + 128, 6); + helper_float_30_recursive(buf + 192, 6); + helper_float_30_recursive(buf + 256, 6); + 
helper_float_30_recursive(buf + 320, 6); + helper_float_30_recursive(buf + 384, 6); + helper_float_30_recursive(buf + 448, 6); + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 64; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_float_30_recursive(buf + 0, 9); + 
helper_float_30_recursive(buf + 512, 9); + helper_float_30_recursive(buf + 1024, 9); + helper_float_30_recursive(buf + 1536, 9); + helper_float_30_recursive(buf + 2048, 9); + helper_float_30_recursive(buf + 2560, 9); + helper_float_30_recursive(buf + 3072, 9); + helper_float_30_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", 
"%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_float_30_recursive(buf + 0, 12); + helper_float_30_recursive(buf + 4096, 12); + helper_float_30_recursive(buf + 8192, 12); + helper_float_30_recursive(buf + 12288, 12); + helper_float_30_recursive(buf + 16384, 12); + helper_float_30_recursive(buf + 20480, 12); + helper_float_30_recursive(buf + 24576, 12); + helper_float_30_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + 
k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_float_30_recursive(buf + 0, 15); + helper_float_30_recursive(buf + 32768, 15); + helper_float_30_recursive(buf + 65536, 15); + helper_float_30_recursive(buf + 98304, 15); + helper_float_30_recursive(buf + 131072, 15); + helper_float_30_recursive(buf + 163840, 15); + helper_float_30_recursive(buf + 196608, 15); + helper_float_30_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, 
(%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_float_30_recursive(buf + 0, 18); + helper_float_30_recursive(buf + 262144, 18); + helper_float_30_recursive(buf + 524288, 18); + helper_float_30_recursive(buf + 786432, 18); + helper_float_30_recursive(buf + 1048576, 18); + helper_float_30_recursive(buf + 1310720, 18); + helper_float_30_recursive(buf + 1572864, 18); + helper_float_30_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, 
%%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_float_30_recursive(buf + 0, 21); + helper_float_30_recursive(buf + 2097152, 21); + helper_float_30_recursive(buf + 4194304, 21); + helper_float_30_recursive(buf + 6291456, 21); + helper_float_30_recursive(buf + 8388608, 21); + helper_float_30_recursive(buf + 10485760, 21); + helper_float_30_recursive(buf + 12582912, 21); + helper_float_30_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + 
"vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_float_30_recursive(buf + 0, 24); + helper_float_30_recursive(buf + 16777216, 24); + helper_float_30_recursive(buf + 33554432, 24); + helper_float_30_recursive(buf + 50331648, 24); + helper_float_30_recursive(buf + 67108864, 24); + helper_float_30_recursive(buf + 83886080, 24); + helper_float_30_recursive(buf + 100663296, 24); + helper_float_30_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), %%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" 
+ "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 30) { + helper_float_30_recursive(buf + 0, 27); + helper_float_30_recursive(buf + 134217728, 27); + helper_float_30_recursive(buf + 268435456, 27); + helper_float_30_recursive(buf + 402653184, 27); + helper_float_30_recursive(buf + 536870912, 27); + helper_float_30_recursive(buf + 671088640, 27); + helper_float_30_recursive(buf + 805306368, 27); + helper_float_30_recursive(buf + 939524096, 27); + for (int j = 0; j < 1073741824; j += 1073741824) { + for (int k = 0; k < 134217728; k += 8) { + __asm__ volatile ( + "vmovups (%0), %%ymm0\n" + "vmovups (%1), %%ymm1\n" + "vmovups (%2), %%ymm2\n" + "vmovups (%3), %%ymm3\n" + "vmovups (%4), %%ymm4\n" + "vmovups (%5), 
%%ymm5\n" + "vmovups (%6), %%ymm6\n" + "vmovups (%7), %%ymm7\n" + "vaddps %%ymm1, %%ymm0, %%ymm8\n" + "vsubps %%ymm1, %%ymm0, %%ymm9\n" + "vaddps %%ymm3, %%ymm2, %%ymm10\n" + "vsubps %%ymm3, %%ymm2, %%ymm11\n" + "vaddps %%ymm5, %%ymm4, %%ymm12\n" + "vsubps %%ymm5, %%ymm4, %%ymm13\n" + "vaddps %%ymm7, %%ymm6, %%ymm14\n" + "vsubps %%ymm7, %%ymm6, %%ymm15\n" + "vaddps %%ymm10, %%ymm8, %%ymm0\n" + "vsubps %%ymm10, %%ymm8, %%ymm2\n" + "vaddps %%ymm11, %%ymm9, %%ymm1\n" + "vsubps %%ymm11, %%ymm9, %%ymm3\n" + "vaddps %%ymm14, %%ymm12, %%ymm4\n" + "vsubps %%ymm14, %%ymm12, %%ymm6\n" + "vaddps %%ymm15, %%ymm13, %%ymm5\n" + "vsubps %%ymm15, %%ymm13, %%ymm7\n" + "vaddps %%ymm4, %%ymm0, %%ymm8\n" + "vsubps %%ymm4, %%ymm0, %%ymm12\n" + "vaddps %%ymm5, %%ymm1, %%ymm9\n" + "vsubps %%ymm5, %%ymm1, %%ymm13\n" + "vaddps %%ymm6, %%ymm2, %%ymm10\n" + "vsubps %%ymm6, %%ymm2, %%ymm14\n" + "vaddps %%ymm7, %%ymm3, %%ymm11\n" + "vsubps %%ymm7, %%ymm3, %%ymm15\n" + "vmovups %%ymm8, (%0)\n" + "vmovups %%ymm9, (%1)\n" + "vmovups %%ymm10, (%2)\n" + "vmovups %%ymm11, (%3)\n" + "vmovups %%ymm12, (%4)\n" + "vmovups %%ymm13, (%5)\n" + "vmovups %%ymm14, (%6)\n" + "vmovups %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), "r"(buf + j + k + 939524096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_float_30(float *buf); +inline void helper_float_30(float *buf) { + helper_float_30_recursive(buf, 30); +} +inline int fht_float(float *buf, int log_n) { + if (log_n == 0) { + return 0; + } + if (log_n == 1) { + helper_float_1(buf); + return 0; + } + if (log_n == 2) { + helper_float_2(buf); + return 0; + } + if (log_n == 3) { + helper_float_3(buf); + return 0; + } + if (log_n 
== 4) { + helper_float_4(buf); + return 0; + } + if (log_n == 5) { + helper_float_5(buf); + return 0; + } + if (log_n == 6) { + helper_float_6(buf); + return 0; + } + if (log_n == 7) { + helper_float_7(buf); + return 0; + } + if (log_n == 8) { + helper_float_8(buf); + return 0; + } + if (log_n == 9) { + helper_float_9(buf); + return 0; + } + if (log_n == 10) { + helper_float_10(buf); + return 0; + } + if (log_n == 11) { + helper_float_11(buf); + return 0; + } + if (log_n == 12) { + helper_float_12(buf); + return 0; + } + if (log_n == 13) { + helper_float_13(buf); + return 0; + } + if (log_n == 14) { + helper_float_14(buf); + return 0; + } + if (log_n == 15) { + helper_float_15(buf); + return 0; + } + if (log_n == 16) { + helper_float_16(buf); + return 0; + } + if (log_n == 17) { + helper_float_17(buf); + return 0; + } + if (log_n == 18) { + helper_float_18(buf); + return 0; + } + if (log_n == 19) { + helper_float_19(buf); + return 0; + } + if (log_n == 20) { + helper_float_20(buf); + return 0; + } + if (log_n == 21) { + helper_float_21(buf); + return 0; + } + if (log_n == 22) { + helper_float_22(buf); + return 0; + } + if (log_n == 23) { + helper_float_23(buf); + return 0; + } + if (log_n == 24) { + helper_float_24(buf); + return 0; + } + if (log_n == 25) { + helper_float_25(buf); + return 0; + } + if (log_n == 26) { + helper_float_26(buf); + return 0; + } + if (log_n == 27) { + helper_float_27(buf); + return 0; + } + if (log_n == 28) { + helper_float_28(buf); + return 0; + } + if (log_n == 29) { + helper_float_29(buf); + return 0; + } + if (log_n == 30) { + helper_float_30(buf); + return 0; + } + return 1; +} +inline void helper_double_1(double *buf); +inline void helper_double_1(double *buf) { + for (int j = 0; j < 2; j += 2) { + for (int k = 0; k < 1; ++k) { + double u = buf[j + k]; + double v = buf[j + k + 1]; + buf[j + k] = u + v; + buf[j + k + 1] = u - v; + } + } +} +inline void helper_double_2(double *buf); +inline void helper_double_2(double *buf) { + for 
(int j = 0; j < 4; j += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vmovupd %%ymm0, (%0)\n" + :: "r"(buf + j) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } +} +inline void helper_double_3(double *buf); +inline void helper_double_3(double *buf) { + for (int j = 0; j < 8; j += 8) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", 
"%ymm14", "%ymm15", "memory" + ); + } + } +} +inline void helper_double_4_recursive(double *buf, int depth); +inline void helper_double_4_recursive(double *buf, int depth) { + if (depth == 4) { + for (int j = 0; j < 16; j += 16) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd 
%%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_4(double *buf); +inline void helper_double_4(double *buf) { + helper_double_4_recursive(buf, 4); +} +inline void helper_double_5(double *buf); +inline void helper_double_5(double *buf) { + for (int j = 0; j < 32; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd 
%%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + 
"vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +inline void helper_double_6(double *buf); +inline void helper_double_6(double *buf) { + for (int j = 0; j < 64; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd 
(%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, 
%%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, 
%%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 64; j += 64) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +inline void helper_double_7(double *buf); +inline void helper_double_7(double *buf) { + for (int j = 0; j < 128; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, 
%%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + 
"vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + 
k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 128; j += 128) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +inline void helper_double_8(double *buf); +inline void helper_double_8(double *buf) { + for (int j = 0; j < 256; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd 
%%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, 
%%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", 
"%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +inline void helper_double_9(double *buf); +inline void helper_double_9(double *buf) { + for (int j = 0; j < 512; j += 
32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 
$0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd 
%%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + 
"vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +inline void helper_double_10(double *buf); +inline void helper_double_10(double *buf) { + for (int j = 0; j < 1024; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, 
%%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + 
"vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", 
"%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 1024) { + for (int k = 
0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +/* helper_double_11: in-place, unscaled 2048-point (2^11) Hadamard butterfly transform of buf[0..2047]. Runs three passes of AVX inline assembly that together apply 11 add/subtract levels (5, then 3, then 3); every load and store is an unaligned vmovupd. */ inline void helper_double_11(double *buf); +inline void helper_double_11(double *buf) { + /* Pass 1: within each block of 32 doubles, five butterfly levels at distances 1, 2, 4, 8 and 16, all held in ymm registers. The inner k loop runs exactly once (k is always 0). */ for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0,
%%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd 
%%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + 
"vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + 
"vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } +} +/* helper_double_12_recursive: when depth is 11, transforms buf[0..2047] in place with a three-pass 2048-point butterfly kernel; when depth is 12, transforms both 2048-double halves of buf[0..4095] and then merges them with one more butterfly level. Any other depth falls through both tests and leaves buf untouched. */ inline void helper_double_12_recursive(double *buf, int depth); +inline void helper_double_12_recursive(double *buf, int depth) { + /* Base case: 2048-point kernel. The first pass below handles distances 1 to 16 inside each block of 32 doubles. */ if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd
%%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + 
"vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, 
(%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), 
"r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", 
"%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + /* Top level of the 4096-point transform: run the 2048-point base case on each half, then merge the halves. */ if (depth == 12) { + helper_double_12_recursive(buf + 0, 11); + helper_double_12_recursive(buf + 2048, 11); + /* Final butterfly level at distance 2048: each pair buf[i], buf[i + 2048] is replaced by its sum and its difference, four doubles per iteration. The outer j loop runs exactly once. */ for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +/* helper_double_12: in-place, unscaled 4096-point (2^12) transform of buf[0..4095]; simply enters the recursive helper at depth 12. */ inline void helper_double_12(double *buf); +inline void helper_double_12(double *buf) { + helper_double_12_recursive(buf, 12); +} +/* helper_double_13_recursive: when depth is 11, transforms buf[0..2047] in place with a three-pass 2048-point butterfly kernel; when depth is 13, transforms the four 2048-double quarters of buf[0..8191] and then merges them with two more butterfly levels (distances 2048 and 4096). Any other depth leaves buf untouched. */ inline void helper_double_13_recursive(double *buf, int depth); +inline void helper_double_13_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd
%%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd 
%%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + 
"vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + /* Pass 3 of the 2048-point kernel: three butterfly levels at distances 256, 512 and 1024, combining eight rows that are 256 doubles apart. The outer j loop runs exactly once. */ for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7),
%%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + /* Top level of the 8192-point transform: run the 2048-point base case on each of the four quarters, then merge them. */ if (depth == 13) { + helper_double_13_recursive(buf + 0, 11); + helper_double_13_recursive(buf + 2048, 11); + helper_double_13_recursive(buf + 4096, 11); + helper_double_13_recursive(buf + 6144, 11); + /* Two final butterfly levels (distances 2048 and 4096) fused into one sweep over the rows at offsets 0, 2048, 4096 and 6144. The outer j loop runs exactly once. */ for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd
%%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +/* helper_double_13: in-place, unscaled 8192-point (2^13) transform of buf[0..8191]; simply enters the recursive helper at depth 13. */ inline void helper_double_13(double *buf); +inline void helper_double_13(double *buf) { + helper_double_13_recursive(buf, 13); +} +inline void helper_double_14_recursive(double *buf, int depth); +inline void helper_double_14_recursive(double *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" +
"vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, 
%%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), 
%%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, 
%%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_14_recursive(buf + 0, 12); + 
helper_double_14_recursive(buf + 4096, 12); + helper_double_14_recursive(buf + 8192, 12); + helper_double_14_recursive(buf + 12288, 12); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_14(double *buf); +inline void helper_double_14(double *buf) { + helper_double_14_recursive(buf, 14); +} +inline void helper_double_15_recursive(double *buf, int depth); +inline void helper_double_15_recursive(double *buf, int depth) { + if (depth == 12) { + for (int j = 0; j < 4096; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, 
%%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + 
"vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + 
k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", 
"memory" + ); + } + } + for (int j = 0; j < 4096; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, 
%%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_double_15_recursive(buf + 0, 12); + helper_double_15_recursive(buf + 4096, 12); + helper_double_15_recursive(buf + 8192, 12); + helper_double_15_recursive(buf + 12288, 12); + helper_double_15_recursive(buf + 16384, 12); + helper_double_15_recursive(buf + 20480, 12); + helper_double_15_recursive(buf + 24576, 12); + helper_double_15_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + 
"vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_15(double *buf); +inline void helper_double_15(double *buf) { + helper_double_15_recursive(buf, 15); +} +inline void helper_double_16_recursive(double *buf, int depth); +inline void helper_double_16_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, 
%%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 
$49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" 
+ "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd 
%%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_16_recursive(buf + 0, 11); + helper_double_16_recursive(buf + 2048, 11); + helper_double_16_recursive(buf + 4096, 11); + helper_double_16_recursive(buf + 6144, 11); + helper_double_16_recursive(buf + 8192, 11); + helper_double_16_recursive(buf + 10240, 11); + helper_double_16_recursive(buf + 12288, 11); + helper_double_16_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + 
"vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_double_16_recursive(buf + 0, 14); + helper_double_16_recursive(buf + 16384, 14); + helper_double_16_recursive(buf + 32768, 14); + helper_double_16_recursive(buf + 49152, 14); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, 
%%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_16(double *buf); +inline void helper_double_16(double *buf) { + helper_double_16_recursive(buf, 16); +} +inline void helper_double_17_recursive(double *buf, int depth); +inline void helper_double_17_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, 
%%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, 
%%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), 
%%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + 
"vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_17_recursive(buf + 0, 11); + helper_double_17_recursive(buf + 2048, 11); + helper_double_17_recursive(buf + 4096, 11); + helper_double_17_recursive(buf + 6144, 11); + helper_double_17_recursive(buf + 8192, 11); + helper_double_17_recursive(buf + 10240, 11); + helper_double_17_recursive(buf + 12288, 11); + helper_double_17_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), 
%%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_17_recursive(buf + 0, 14); + helper_double_17_recursive(buf + 16384, 14); + helper_double_17_recursive(buf + 32768, 14); + helper_double_17_recursive(buf + 49152, 14); + helper_double_17_recursive(buf + 65536, 14); + helper_double_17_recursive(buf + 81920, 14); + 
helper_double_17_recursive(buf + 98304, 14); + helper_double_17_recursive(buf + 114688, 14); /* Combine pass of the depth-17 case: the eight 16384-element sub-blocks processed by the calls above are merged. Each asm statement loads four doubles from the same offset k in every sub-block, applies three rounds of vaddpd/vsubpd, and stores the results back in place. The j loop runs exactly once. */ + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_17(double *buf); /* helper_double_17: entry point that runs helper_double_17_recursive(buf, 17). That call reads and writes buf[0..131071], so buf must hold at least 131072 doubles. */ +inline void
helper_double_17(double *buf) { + helper_double_17_recursive(buf, 17); +} +inline void helper_double_18_recursive(double *buf, int depth); /* helper_double_18_recursive: in-place butterfly kernel on an array of doubles, written as AVX inline assembly. buf is the start of the block to process; it is loaded and stored with unaligned vmovupd. depth selects the stage: 12 processes buf[0..4095] directly with four groups of loops; 15 runs the depth-12 case on eight consecutive 4096-element sub-blocks and then combines them; 18 runs the depth-15 case on eight consecutive 32768-element sub-blocks and then combines them, touching buf[0..262143]. Any other depth matches no branch and leaves buf unchanged. NOTE(review): the add/subtract pattern looks like an unnormalized fast Walsh-Hadamard transform of 2^depth values emitted by a code generator - confirm. */ +inline void helper_double_18_recursive(double *buf, int depth) { + if (depth == 12) { /* Base case: the 4096 doubles are processed in place. */ + for (int j = 0; j < 4096; j += 32) { /* First pass: one asm statement per 32 consecutive doubles (eight ymm registers of four doubles each); the k loop below runs exactly once. */ + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( /* The vpermilpd / vaddsubpd groups turn each adjacent pair (a, b) into (a + b, a - b); the vperm2f128 / vaddpd groups do the same between the two 128-bit halves of a register; three vaddpd/vsubpd rounds then combine the eight registers. */ + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, 
%%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd 
%%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 256) { /* Second pass: inside each 256-element block, three vaddpd/vsubpd rounds combine the eight 32-element groups at offsets 0, 32, ..., 224. */ + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" +
"vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 2048) { /* Third pass: inside each 2048-element half (j = 0 and j = 2048), three vaddpd/vsubpd rounds combine the eight 256-element groups at offsets 0, 256, ..., 1792. */ + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, 
%%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 4096; j += 4096) { /* Fourth pass: a single add/subtract round between the two 2048-element halves; the j loop runs exactly once. Only ymm0, ymm1, ymm8 and ymm9 are used here, although the clobber list still names ymm0-ymm15. */ + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { /* Depth 15: run the depth-12 case on eight consecutive 4096-element sub-blocks, then merge them with three vaddpd/vsubpd rounds across the sub-blocks. */ + helper_double_18_recursive(buf + 0, 12); + helper_double_18_recursive(buf + 4096, 12); + helper_double_18_recursive(buf + 8192, 12); + helper_double_18_recursive(buf + 12288, 12); + helper_double_18_recursive(buf + 16384, 12); + helper_double_18_recursive(buf + 20480, 12); + helper_double_18_recursive(buf + 24576, 12); + helper_double_18_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, 
%%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { /* Depth 18: run the depth-15 case on eight consecutive 32768-element sub-blocks, then merge them with three vaddpd/vsubpd rounds across the sub-blocks. */ + helper_double_18_recursive(buf + 0, 15); + helper_double_18_recursive(buf + 32768, 15); + helper_double_18_recursive(buf + 65536, 15); + helper_double_18_recursive(buf + 98304, 15); + helper_double_18_recursive(buf + 131072, 15); + helper_double_18_recursive(buf + 163840, 15); + helper_double_18_recursive(buf + 196608, 15); + helper_double_18_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), 
%%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_18(double *buf); /* helper_double_18: entry point that runs helper_double_18_recursive(buf, 18). That call reads and writes buf[0..262143], so buf must hold at least 262144 doubles. */ +inline void helper_double_18(double *buf) { + helper_double_18_recursive(buf, 18); +} +inline void helper_double_19_recursive(double *buf, int depth); +inline void helper_double_19_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j 
+= 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 
$0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd 
%%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + 
"vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" 
+ "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_19_recursive(buf + 0, 11); + helper_double_19_recursive(buf + 2048, 11); + helper_double_19_recursive(buf + 4096, 11); + helper_double_19_recursive(buf + 6144, 11); + helper_double_19_recursive(buf + 8192, 11); + helper_double_19_recursive(buf + 10240, 11); + helper_double_19_recursive(buf + 12288, 11); + helper_double_19_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + 
"vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_19_recursive(buf + 0, 14); + helper_double_19_recursive(buf + 16384, 14); + helper_double_19_recursive(buf + 32768, 14); + helper_double_19_recursive(buf + 49152, 14); + helper_double_19_recursive(buf + 65536, 14); + helper_double_19_recursive(buf + 81920, 14); + helper_double_19_recursive(buf + 98304, 14); + helper_double_19_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + 
"vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + helper_double_19_recursive(buf + 0, 17); + helper_double_19_recursive(buf + 131072, 17); + helper_double_19_recursive(buf + 262144, 17); + helper_double_19_recursive(buf + 393216, 17); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", 
"memory" + ); + } + } + return; + } +} +inline void helper_double_19(double *buf); +inline void helper_double_19(double *buf) { + helper_double_19_recursive(buf, 19); +} +inline void helper_double_20_recursive(double *buf, int depth); +inline void helper_double_20_recursive(double *buf, int depth) { + if (depth == 9) { + for (int j = 0; j < 512; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, 
%%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" 
+ "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, 
%%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_double_20_recursive(buf + 0, 9); + helper_double_20_recursive(buf + 512, 9); + helper_double_20_recursive(buf + 1024, 9); + helper_double_20_recursive(buf + 1536, 9); + helper_double_20_recursive(buf + 2048, 9); + helper_double_20_recursive(buf + 2560, 9); + helper_double_20_recursive(buf + 3072, 9); + helper_double_20_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 4) { + __asm__ volatile ( + 
"vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_double_20_recursive(buf + 0, 12); + helper_double_20_recursive(buf + 4096, 12); + helper_double_20_recursive(buf + 8192, 12); + helper_double_20_recursive(buf + 12288, 12); + helper_double_20_recursive(buf + 16384, 12); + helper_double_20_recursive(buf + 
20480, 12); + helper_double_20_recursive(buf + 24576, 12); + helper_double_20_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_double_20_recursive(buf + 0, 15); + 
helper_double_20_recursive(buf + 32768, 15); + helper_double_20_recursive(buf + 65536, 15); + helper_double_20_recursive(buf + 98304, 15); + helper_double_20_recursive(buf + 131072, 15); + helper_double_20_recursive(buf + 163840, 15); + helper_double_20_recursive(buf + 196608, 15); + helper_double_20_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", 
"%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_20_recursive(buf + 0, 18); + helper_double_20_recursive(buf + 262144, 18); + helper_double_20_recursive(buf + 524288, 18); + helper_double_20_recursive(buf + 786432, 18); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_20(double *buf); +inline void helper_double_20(double *buf) { + helper_double_20_recursive(buf, 20); +} +inline void helper_double_21_recursive(double *buf, int depth); +inline void helper_double_21_recursive(double *buf, int depth) { + if (depth == 7) { + for (int j = 0; j < 128; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + 
"vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + 
"vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, 
(%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 128; j += 128) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 10) { + helper_double_21_recursive(buf + 0, 7); + helper_double_21_recursive(buf + 128, 7); + helper_double_21_recursive(buf + 256, 7); + helper_double_21_recursive(buf + 384, 7); + helper_double_21_recursive(buf + 512, 7); + helper_double_21_recursive(buf + 640, 7); + helper_double_21_recursive(buf + 768, 7); + helper_double_21_recursive(buf + 896, 7); + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 128; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" 
+ "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 13) { + helper_double_21_recursive(buf + 0, 10); + helper_double_21_recursive(buf + 1024, 10); + helper_double_21_recursive(buf + 2048, 10); + helper_double_21_recursive(buf + 3072, 10); + helper_double_21_recursive(buf + 4096, 10); + helper_double_21_recursive(buf + 5120, 10); + helper_double_21_recursive(buf + 6144, 10); + helper_double_21_recursive(buf + 7168, 10); + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 
1024; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_double_21_recursive(buf + 0, 13); + helper_double_21_recursive(buf + 8192, 13); + helper_double_21_recursive(buf + 16384, 13); + helper_double_21_recursive(buf + 24576, 13); + helper_double_21_recursive(buf + 32768, 
13); + helper_double_21_recursive(buf + 40960, 13); + helper_double_21_recursive(buf + 49152, 13); + helper_double_21_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + 
helper_double_21_recursive(buf + 0, 16); + helper_double_21_recursive(buf + 65536, 16); + helper_double_21_recursive(buf + 131072, 16); + helper_double_21_recursive(buf + 196608, 16); + helper_double_21_recursive(buf + 262144, 16); + helper_double_21_recursive(buf + 327680, 16); + helper_double_21_recursive(buf + 393216, 16); + helper_double_21_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), 
"r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_double_21_recursive(buf + 0, 19); + helper_double_21_recursive(buf + 524288, 19); + helper_double_21_recursive(buf + 1048576, 19); + helper_double_21_recursive(buf + 1572864, 19); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 524288; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_21(double *buf); +inline void helper_double_21(double *buf) { + helper_double_21_recursive(buf, 21); +} +inline void helper_double_22_recursive(double *buf, int depth); +inline void helper_double_22_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, 
%%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, 
%%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + 
"vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), 
"r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", 
"%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_22_recursive(buf + 0, 11); + helper_double_22_recursive(buf + 2048, 11); + helper_double_22_recursive(buf + 4096, 11); + helper_double_22_recursive(buf + 6144, 11); + helper_double_22_recursive(buf + 8192, 11); + helper_double_22_recursive(buf + 10240, 11); + helper_double_22_recursive(buf + 12288, 11); + helper_double_22_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 
2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_22_recursive(buf + 0, 14); + helper_double_22_recursive(buf + 16384, 14); + helper_double_22_recursive(buf + 32768, 14); + helper_double_22_recursive(buf + 49152, 14); + helper_double_22_recursive(buf + 65536, 14); + helper_double_22_recursive(buf + 81920, 14); + helper_double_22_recursive(buf + 98304, 14); + helper_double_22_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, 
(%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_22_recursive(buf + 0, 17); + helper_double_22_recursive(buf + 131072, 17); + helper_double_22_recursive(buf + 262144, 17); + helper_double_22_recursive(buf + 393216, 17); + helper_double_22_recursive(buf + 524288, 17); + helper_double_22_recursive(buf + 655360, 17); + helper_double_22_recursive(buf + 786432, 17); + helper_double_22_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" 
+ "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 22) { /* Top level: process the four 1048576-double quarters at depth 20, then combine them with one pass of two add/sub stages over four 4-double vectors (only ymm0-ymm3 and ymm8-ymm11 are actually used here, although all sixteen registers are listed as clobbered). */+ helper_double_22_recursive(buf + 0, 20); + helper_double_22_recursive(buf + 1048576, 20); + helper_double_22_recursive(buf + 2097152, 20); + helper_double_22_recursive(buf + 3145728, 20); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} /* helper_double_22: entry point for a buffer of 2^22 = 4194304 doubles. It forwards to helper_double_22_recursive(buf, 22); the depth-22 pass loads and stores buf[0 .. 4194303], so buf is transformed in place and must hold at least that many doubles. The declaration directly before the definition is redundant but harmless. */+inline void helper_double_22(double *buf); +inline void 
helper_double_22(double *buf) { + helper_double_22_recursive(buf, 22); +} /* helper_double_23_recursive: in-place add/subtract butterfly passes over the 2^depth doubles starting at buf. Handled depth values are 11, 14, 17, 20 and 23; any other value matches no branch and the call returns without touching buf. Depth 11 is the base case (2048 doubles, three passes of inline AVX assembly). Every larger depth recurses on eight sub-blocks three levels smaller and then runs one combining pass over eight 4-double vectors. All loads and stores use vmovupd (unaligned 256-bit moves), and every asm statement lists ymm0-ymm15 plus a memory clobber. No scaling is applied anywhere in this function. NOTE(review): the add/sub-only butterflies look like an unnormalized Walsh-Hadamard transform - confirm against the generator of this file. */+inline void helper_double_23_recursive(double *buf, int depth); +inline void helper_double_23_recursive(double *buf, int depth) { + if (depth == 11) { /* Base case, first pass: each iteration of j covers 32 consecutive doubles held in eight 4-double vectors (the k loop runs exactly once). */+ for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, 
%%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd 
%%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + 
"vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, 
%%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_23_recursive(buf + 0, 11); + helper_double_23_recursive(buf + 2048, 11); + helper_double_23_recursive(buf + 4096, 11); + helper_double_23_recursive(buf + 6144, 11); + helper_double_23_recursive(buf + 8192, 11); + helper_double_23_recursive(buf + 10240, 11); + helper_double_23_recursive(buf + 12288, 11); + helper_double_23_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + 
"vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_23_recursive(buf + 0, 14); + helper_double_23_recursive(buf + 16384, 14); + helper_double_23_recursive(buf + 32768, 14); + helper_double_23_recursive(buf + 49152, 14); + helper_double_23_recursive(buf + 65536, 14); + helper_double_23_recursive(buf + 81920, 14); + helper_double_23_recursive(buf + 98304, 14); + helper_double_23_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd 
%%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_23_recursive(buf + 0, 17); + helper_double_23_recursive(buf + 131072, 17); + helper_double_23_recursive(buf + 262144, 17); + helper_double_23_recursive(buf + 393216, 17); + helper_double_23_recursive(buf + 524288, 17); + helper_double_23_recursive(buf + 655360, 17); + helper_double_23_recursive(buf + 786432, 17); + helper_double_23_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" 
+ "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_double_23_recursive(buf + 0, 20); + helper_double_23_recursive(buf + 1048576, 20); + helper_double_23_recursive(buf + 2097152, 20); + helper_double_23_recursive(buf + 3145728, 20); + helper_double_23_recursive(buf + 4194304, 20); + helper_double_23_recursive(buf + 5242880, 20); + helper_double_23_recursive(buf + 6291456, 20); + helper_double_23_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), 
%%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_23(double *buf); +inline void helper_double_23(double *buf) { + helper_double_23_recursive(buf, 23); +} +inline void helper_double_24_recursive(double *buf, int depth); +inline void helper_double_24_recursive(double *buf, int depth) { + if (depth == 10) { + for (int j = 0; j < 1024; j += 32) { + for 
(int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, 
%%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, 
%%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd 
%%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 1024; j += 1024) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 13) { + helper_double_24_recursive(buf + 0, 10); + helper_double_24_recursive(buf + 1024, 10); + helper_double_24_recursive(buf + 2048, 10); + helper_double_24_recursive(buf + 3072, 10); + helper_double_24_recursive(buf + 4096, 10); + helper_double_24_recursive(buf + 5120, 10); + helper_double_24_recursive(buf + 6144, 10); + helper_double_24_recursive(buf + 7168, 10); + for (int j = 0; j < 8192; j += 8192) { + for (int k = 0; k < 
1024; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 16) { + helper_double_24_recursive(buf + 0, 13); + helper_double_24_recursive(buf + 8192, 13); + helper_double_24_recursive(buf + 16384, 13); + helper_double_24_recursive(buf + 24576, 13); + helper_double_24_recursive(buf + 32768, 
13); + helper_double_24_recursive(buf + 40960, 13); + helper_double_24_recursive(buf + 49152, 13); + helper_double_24_recursive(buf + 57344, 13); + for (int j = 0; j < 65536; j += 65536) { + for (int k = 0; k < 8192; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 19) { + 
helper_double_24_recursive(buf + 0, 16); + helper_double_24_recursive(buf + 65536, 16); + helper_double_24_recursive(buf + 131072, 16); + helper_double_24_recursive(buf + 196608, 16); + helper_double_24_recursive(buf + 262144, 16); + helper_double_24_recursive(buf + 327680, 16); + helper_double_24_recursive(buf + 393216, 16); + helper_double_24_recursive(buf + 458752, 16); + for (int j = 0; j < 524288; j += 524288) { + for (int k = 0; k < 65536; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), 
"r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 22) { + helper_double_24_recursive(buf + 0, 19); + helper_double_24_recursive(buf + 524288, 19); + helper_double_24_recursive(buf + 1048576, 19); + helper_double_24_recursive(buf + 1572864, 19); + helper_double_24_recursive(buf + 2097152, 19); + helper_double_24_recursive(buf + 2621440, 19); + helper_double_24_recursive(buf + 3145728, 19); + helper_double_24_recursive(buf + 3670016, 19); + for (int j = 0; j < 4194304; j += 4194304) { + for (int k = 0; k < 524288; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + 
"vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_double_24_recursive(buf + 0, 22); + helper_double_24_recursive(buf + 4194304, 22); + helper_double_24_recursive(buf + 8388608, 22); + helper_double_24_recursive(buf + 12582912, 22); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 4194304; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_24(double *buf); +inline void helper_double_24(double *buf) { + helper_double_24_recursive(buf, 24); +} +inline void helper_double_25_recursive(double *buf, int depth); +inline void helper_double_25_recursive(double *buf, int depth) { + if (depth == 8) { + for (int j = 0; j < 256; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), 
%%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd 
%%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd 
%%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 256; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + 
"vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 11) { + helper_double_25_recursive(buf + 0, 8); + helper_double_25_recursive(buf + 256, 8); + helper_double_25_recursive(buf + 512, 8); + helper_double_25_recursive(buf + 768, 8); + helper_double_25_recursive(buf + 1024, 8); + helper_double_25_recursive(buf + 1280, 8); + helper_double_25_recursive(buf + 1536, 8); + helper_double_25_recursive(buf + 1792, 8); + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + 
"vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_25_recursive(buf + 0, 11); + helper_double_25_recursive(buf + 2048, 11); + helper_double_25_recursive(buf + 4096, 11); + helper_double_25_recursive(buf + 6144, 11); + helper_double_25_recursive(buf + 8192, 11); + helper_double_25_recursive(buf + 10240, 11); + helper_double_25_recursive(buf + 12288, 11); + helper_double_25_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, 
%%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_25_recursive(buf + 0, 14); + helper_double_25_recursive(buf + 16384, 14); + helper_double_25_recursive(buf + 32768, 14); + helper_double_25_recursive(buf + 49152, 14); + helper_double_25_recursive(buf + 65536, 14); + helper_double_25_recursive(buf + 81920, 14); + helper_double_25_recursive(buf + 98304, 14); + helper_double_25_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, 
%%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_25_recursive(buf + 0, 17); + helper_double_25_recursive(buf + 131072, 17); + helper_double_25_recursive(buf + 262144, 17); + helper_double_25_recursive(buf + 393216, 17); + helper_double_25_recursive(buf + 524288, 17); + helper_double_25_recursive(buf + 655360, 17); + helper_double_25_recursive(buf + 786432, 17); + helper_double_25_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + 
"vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_double_25_recursive(buf + 0, 20); + helper_double_25_recursive(buf + 1048576, 20); + helper_double_25_recursive(buf + 2097152, 20); + helper_double_25_recursive(buf + 3145728, 20); + helper_double_25_recursive(buf + 4194304, 20); + helper_double_25_recursive(buf + 5242880, 20); + helper_double_25_recursive(buf + 6291456, 20); + helper_double_25_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + 
"vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 25) { + helper_double_25_recursive(buf + 0, 23); + helper_double_25_recursive(buf + 8388608, 23); + helper_double_25_recursive(buf + 16777216, 23); + helper_double_25_recursive(buf + 25165824, 23); + for (int j = 0; j < 33554432; j += 33554432) { + 
for (int k = 0; k < 8388608; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_25(double *buf); /* helper_double_25: entry point for the 2^25-element case; runs helper_double_25_recursive at depth 25, whose depth-25 branch rewrites buf[0 .. 33554432) in place. */ +inline void helper_double_25(double *buf) { + helper_double_25_recursive(buf, 25); +} +inline void helper_double_26_recursive(double *buf, int depth); /* helper_double_26_recursive: in-place, recursive add/sub butterfly network over 2^depth doubles starting at buf, written as AVX inline assembly (ymm registers, so x86 with AVX only). Handles depth 11 (base case) and depths 14, 17, 20, 23, 26 (each recursing to depth - 3); any other depth falls through and leaves buf untouched. All loads and stores are unaligned (vmovupd), and every asm statement declares ymm0-ymm15 and memory as clobbered. NOTE(review): the sum/difference pattern looks like a generated, unnormalized fast Walsh-Hadamard transform - confirm against the generator. */ +inline void helper_double_26_recursive(double *buf, int depth) { + /* Base case: one 2048-double tile, processed in three passes. */ if (depth == 11) { + /* Pass 1: each iteration loads 32 consecutive doubles into eight ymm registers, combines lanes inside every register (vpermilpd/vaddsubpd, then vperm2f128), and finishes with three add/sub levels across the eight registers. The k loop runs exactly once. */ for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" +
"vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128
$0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11",
"%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + /* Pass 2: eight-way add/sub across vectors 32 doubles apart, one 256-double group per j. */ for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + /* Pass 3: same eight-way add/sub at stride 256, covering the whole 2048-double tile. */ for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1),
%%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + /* depth 14: transform eight 2048-double sub-blocks at depth 11, then merge them with one eight-way add/sub pass at stride 2048. */ if (depth == 14) { + helper_double_26_recursive(buf + 0, 11); + helper_double_26_recursive(buf + 2048, 11); + helper_double_26_recursive(buf + 4096, 11); + helper_double_26_recursive(buf + 6144, 11); + helper_double_26_recursive(buf + 8192, 11); + helper_double_26_recursive(buf + 10240, 11); + helper_double_26_recursive(buf +
12288, 11); + helper_double_26_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + /* depth 17: eight depth-14 sub-blocks of 16384 doubles, merged at stride 16384. */ if (depth == 17) { + helper_double_26_recursive(buf + 0, 14); + helper_double_26_recursive(buf + 16384, 14); +
helper_double_26_recursive(buf + 32768, 14); + helper_double_26_recursive(buf + 49152, 14); + helper_double_26_recursive(buf + 65536, 14); + helper_double_26_recursive(buf + 81920, 14); + helper_double_26_recursive(buf + 98304, 14); + helper_double_26_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7",
"%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + /* depth 20: eight depth-17 sub-blocks of 131072 doubles, merged at stride 131072. */ if (depth == 20) { + helper_double_26_recursive(buf + 0, 17); + helper_double_26_recursive(buf + 131072, 17); + helper_double_26_recursive(buf + 262144, 17); + helper_double_26_recursive(buf + 393216, 17); + helper_double_26_recursive(buf + 524288, 17); + helper_double_26_recursive(buf + 655360, 17); + helper_double_26_recursive(buf + 786432, 17); + helper_double_26_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k +
131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + /* depth 23: eight depth-20 sub-blocks of 1048576 doubles, merged at stride 1048576. */ if (depth == 23) { + helper_double_26_recursive(buf + 0, 20); + helper_double_26_recursive(buf + 1048576, 20); + helper_double_26_recursive(buf + 2097152, 20); + helper_double_26_recursive(buf + 3145728, 20); + helper_double_26_recursive(buf + 4194304, 20); + helper_double_26_recursive(buf + 5242880, 20); + helper_double_26_recursive(buf + 6291456, 20); + helper_double_26_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8,
(%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + /* depth 26: eight depth-23 sub-blocks of 8388608 doubles, merged at stride 8388608; this is the level helper_double_26 enters at. */ if (depth == 26) { + helper_double_26_recursive(buf + 0, 23); + helper_double_26_recursive(buf + 8388608, 23); + helper_double_26_recursive(buf + 16777216, 23); + helper_double_26_recursive(buf + 25165824, 23); + helper_double_26_recursive(buf + 33554432, 23); + helper_double_26_recursive(buf + 41943040, 23); + helper_double_26_recursive(buf + 50331648, 23); + helper_double_26_recursive(buf + 58720256, 23); + for (int j = 0; j < 67108864; j += 67108864) { + for (int k = 0; k < 8388608; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd
%%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_26(double *buf); /* helper_double_26: entry point for the 2^26-element case; runs helper_double_26_recursive at depth 26, which rewrites buf[0 .. 67108864) in place. */ +inline void helper_double_26(double *buf) { + helper_double_26_recursive(buf, 26); +} +inline void helper_double_27_recursive(double *buf, int depth); +inline void helper_double_27_recursive(double *buf, int depth) { + if (depth == 9) { + for (int j = 0; j < 512; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10,
%%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + 
"vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", 
"%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), 
%%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_double_27_recursive(buf + 0, 9); + helper_double_27_recursive(buf + 512, 9); + helper_double_27_recursive(buf + 1024, 9); + helper_double_27_recursive(buf + 1536, 9); + helper_double_27_recursive(buf + 2048, 9); + helper_double_27_recursive(buf + 2560, 9); + helper_double_27_recursive(buf + 3072, 9); + helper_double_27_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, 
(%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_double_27_recursive(buf + 0, 12); + helper_double_27_recursive(buf + 4096, 12); + helper_double_27_recursive(buf + 8192, 12); + helper_double_27_recursive(buf + 12288, 12); + helper_double_27_recursive(buf + 16384, 12); + helper_double_27_recursive(buf + 20480, 12); + helper_double_27_recursive(buf + 24576, 12); + helper_double_27_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, 
%%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_double_27_recursive(buf + 0, 15); + helper_double_27_recursive(buf + 32768, 15); + helper_double_27_recursive(buf + 65536, 15); + helper_double_27_recursive(buf + 98304, 15); + helper_double_27_recursive(buf + 131072, 15); + helper_double_27_recursive(buf + 163840, 15); + helper_double_27_recursive(buf + 196608, 15); + helper_double_27_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, 
%%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_double_27_recursive(buf + 0, 18); + helper_double_27_recursive(buf + 262144, 18); + helper_double_27_recursive(buf + 524288, 18); + helper_double_27_recursive(buf + 786432, 18); + helper_double_27_recursive(buf + 1048576, 18); + helper_double_27_recursive(buf + 1310720, 18); + helper_double_27_recursive(buf + 1572864, 18); + helper_double_27_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + 
"vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_double_27_recursive(buf + 0, 21); + helper_double_27_recursive(buf + 2097152, 21); + helper_double_27_recursive(buf + 4194304, 21); + helper_double_27_recursive(buf + 6291456, 21); + helper_double_27_recursive(buf + 8388608, 21); + helper_double_27_recursive(buf + 10485760, 21); + helper_double_27_recursive(buf + 12582912, 21); + helper_double_27_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + 
"vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_double_27_recursive(buf + 0, 24); + helper_double_27_recursive(buf + 16777216, 24); + helper_double_27_recursive(buf + 33554432, 24); + helper_double_27_recursive(buf + 50331648, 24); + helper_double_27_recursive(buf + 67108864, 24); + helper_double_27_recursive(buf + 83886080, 24); + helper_double_27_recursive(buf + 100663296, 24); + helper_double_27_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int 
k = 0; k < 16777216; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_27(double *buf); +inline void helper_double_27(double *buf) { + helper_double_27_recursive(buf, 27); +} +inline void helper_double_28_recursive(double *buf, int depth); 
+inline void helper_double_28_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, 
%%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd 
%%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + 
"vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" 
+ "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_28_recursive(buf + 0, 11); + helper_double_28_recursive(buf + 2048, 11); + helper_double_28_recursive(buf + 4096, 11); + helper_double_28_recursive(buf + 6144, 11); + helper_double_28_recursive(buf + 8192, 11); + helper_double_28_recursive(buf + 10240, 11); + helper_double_28_recursive(buf + 12288, 11); + helper_double_28_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + 
"vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + helper_double_28_recursive(buf + 0, 14); + helper_double_28_recursive(buf + 16384, 14); + helper_double_28_recursive(buf + 32768, 14); + helper_double_28_recursive(buf + 49152, 14); + helper_double_28_recursive(buf + 65536, 14); + helper_double_28_recursive(buf + 81920, 14); + helper_double_28_recursive(buf + 98304, 14); + helper_double_28_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + 
"vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_28_recursive(buf + 0, 17); + helper_double_28_recursive(buf + 131072, 17); + helper_double_28_recursive(buf + 262144, 17); + helper_double_28_recursive(buf + 393216, 17); + helper_double_28_recursive(buf + 524288, 17); + helper_double_28_recursive(buf + 655360, 17); + helper_double_28_recursive(buf + 786432, 17); + helper_double_28_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, 
%%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_double_28_recursive(buf + 0, 20); + helper_double_28_recursive(buf + 1048576, 20); + helper_double_28_recursive(buf + 2097152, 20); + helper_double_28_recursive(buf + 3145728, 20); + helper_double_28_recursive(buf + 4194304, 20); + helper_double_28_recursive(buf + 5242880, 20); + helper_double_28_recursive(buf + 6291456, 20); + helper_double_28_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" 
+ "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 26) { + helper_double_28_recursive(buf + 0, 23); + helper_double_28_recursive(buf + 8388608, 23); + helper_double_28_recursive(buf + 16777216, 23); + helper_double_28_recursive(buf + 25165824, 23); + helper_double_28_recursive(buf + 33554432, 23); + helper_double_28_recursive(buf + 41943040, 23); + helper_double_28_recursive(buf + 50331648, 23); + helper_double_28_recursive(buf + 58720256, 23); + for (int j = 0; j < 67108864; j += 67108864) { + for (int k = 0; 
k < 8388608; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 28) { + helper_double_28_recursive(buf + 0, 26); + helper_double_28_recursive(buf + 67108864, 26); + helper_double_28_recursive(buf + 134217728, 26); + helper_double_28_recursive(buf + 201326592, 
26); + for (int j = 0; j < 268435456; j += 268435456) { + for (int k = 0; k < 67108864; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vmovupd %%ymm0, (%0)\n" + "vmovupd %%ymm1, (%1)\n" + "vmovupd %%ymm2, (%2)\n" + "vmovupd %%ymm3, (%3)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 67108864), "r"(buf + j + k + 134217728), "r"(buf + j + k + 201326592) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_28(double *buf); +inline void helper_double_28(double *buf) { + helper_double_28_recursive(buf, 28); +} +inline void helper_double_29_recursive(double *buf, int depth); +inline void helper_double_29_recursive(double *buf, int depth) { + if (depth == 11) { + for (int j = 0; j < 2048; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, 
%%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, 
%%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", 
"%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 2048; j += 2048) { + for (int k = 0; k < 256; k += 4) { + 
__asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 14) { + helper_double_29_recursive(buf + 0, 11); + helper_double_29_recursive(buf + 2048, 11); + helper_double_29_recursive(buf + 4096, 11); + helper_double_29_recursive(buf + 6144, 11); + helper_double_29_recursive(buf + 8192, 11); + 
helper_double_29_recursive(buf + 10240, 11); + helper_double_29_recursive(buf + 12288, 11); + helper_double_29_recursive(buf + 14336, 11); + for (int j = 0; j < 16384; j += 16384) { + for (int k = 0; k < 2048; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 17) { + 
helper_double_29_recursive(buf + 0, 14); + helper_double_29_recursive(buf + 16384, 14); + helper_double_29_recursive(buf + 32768, 14); + helper_double_29_recursive(buf + 49152, 14); + helper_double_29_recursive(buf + 65536, 14); + helper_double_29_recursive(buf + 81920, 14); + helper_double_29_recursive(buf + 98304, 14); + helper_double_29_recursive(buf + 114688, 14); + for (int j = 0; j < 131072; j += 131072) { + for (int k = 0; k < 16384; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + 
k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 20) { + helper_double_29_recursive(buf + 0, 17); + helper_double_29_recursive(buf + 131072, 17); + helper_double_29_recursive(buf + 262144, 17); + helper_double_29_recursive(buf + 393216, 17); + helper_double_29_recursive(buf + 524288, 17); + helper_double_29_recursive(buf + 655360, 17); + helper_double_29_recursive(buf + 786432, 17); + helper_double_29_recursive(buf + 917504, 17); + for (int j = 0; j < 1048576; j += 1048576) { + for (int k = 0; k < 131072; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, 
(%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 23) { + helper_double_29_recursive(buf + 0, 20); + helper_double_29_recursive(buf + 1048576, 20); + helper_double_29_recursive(buf + 2097152, 20); + helper_double_29_recursive(buf + 3145728, 20); + helper_double_29_recursive(buf + 4194304, 20); + helper_double_29_recursive(buf + 5242880, 20); + helper_double_29_recursive(buf + 6291456, 20); + helper_double_29_recursive(buf + 7340032, 20); + for (int j = 0; j < 8388608; j += 8388608) { + for (int k = 0; k < 1048576; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd 
%%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 26) { + helper_double_29_recursive(buf + 0, 23); + helper_double_29_recursive(buf + 8388608, 23); + helper_double_29_recursive(buf + 16777216, 23); + helper_double_29_recursive(buf + 25165824, 23); + helper_double_29_recursive(buf + 33554432, 23); + helper_double_29_recursive(buf + 41943040, 23); + helper_double_29_recursive(buf + 50331648, 23); + helper_double_29_recursive(buf + 58720256, 23); + for (int j = 0; j < 67108864; j += 67108864) { + for (int k = 0; k < 8388608; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + 
"vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 29) { + helper_double_29_recursive(buf + 0, 26); + helper_double_29_recursive(buf + 67108864, 26); + helper_double_29_recursive(buf + 134217728, 26); + helper_double_29_recursive(buf + 201326592, 26); + helper_double_29_recursive(buf + 268435456, 26); + helper_double_29_recursive(buf + 335544320, 26); + helper_double_29_recursive(buf + 402653184, 26); + helper_double_29_recursive(buf + 469762048, 26); + for (int j = 0; j < 536870912; j += 536870912) { + for (int k = 0; k < 67108864; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, 
%%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 67108864), "r"(buf + j + k + 134217728), "r"(buf + j + k + 201326592), "r"(buf + j + k + 268435456), "r"(buf + j + k + 335544320), "r"(buf + j + k + 402653184), "r"(buf + j + k + 469762048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_29(double *buf); +inline void helper_double_29(double *buf) { + helper_double_29_recursive(buf, 29); +} +inline void helper_double_30_recursive(double *buf, int depth); +inline void helper_double_30_recursive(double *buf, int depth) { + if (depth == 9) { + for (int j = 0; j < 512; j += 32) { + for (int k = 0; k < 4; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vpermilpd $0, %%ymm0, %%ymm8\n" + "vpermilpd $15, %%ymm0, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, 
%%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" + "vpermilpd $0, %%ymm1, %%ymm8\n" + "vpermilpd $15, %%ymm1, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" + "vpermilpd $0, %%ymm2, %%ymm8\n" + "vpermilpd $15, %%ymm2, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" + "vpermilpd $0, %%ymm3, %%ymm8\n" + "vpermilpd $15, %%ymm3, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" + "vpermilpd $0, %%ymm4, %%ymm8\n" + "vpermilpd $15, %%ymm4, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" + "vpermilpd $0, %%ymm5, %%ymm8\n" + "vpermilpd $15, %%ymm5, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" + "vpermilpd $0, %%ymm6, %%ymm8\n" + "vpermilpd $15, %%ymm6, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" + "vpermilpd $0, %%ymm7, %%ymm8\n" + "vpermilpd $15, %%ymm7, %%ymm9\n" + "vxorpd %%ymm10, %%ymm10, %%ymm10\n" + "vsubpd %%ymm9, %%ymm10, %%ymm11\n" + "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" + "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm0, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm0\n" + "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm1, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm1\n" + "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm2, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm2\n" + "vperm2f128 $0, %%ymm3, 
%%ymm3, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm3, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm3\n" + "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm4, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm4\n" + "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm5, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm5\n" + "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm6, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm6\n" + "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" + "vxorpd %%ymm9, %%ymm9, %%ymm9\n" + "vsubpd %%ymm7, %%ymm9, %%ymm10\n" + "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" + "vaddpd %%ymm11, %%ymm8, %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, 
(%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 256) { + for (int k = 0; k < 32; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), 
"r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + for (int j = 0; j < 512; j += 512) { + for (int k = 0; k < 256; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 12) { + helper_double_30_recursive(buf + 0, 9); + helper_double_30_recursive(buf + 512, 9); + helper_double_30_recursive(buf + 1024, 9); + helper_double_30_recursive(buf + 1536, 9); + helper_double_30_recursive(buf + 2048, 9); + helper_double_30_recursive(buf + 2560, 9); + helper_double_30_recursive(buf + 3072, 9); + helper_double_30_recursive(buf + 3584, 9); + for (int j = 0; j < 4096; j += 4096) { + for (int k = 0; k < 512; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, 
%%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 15) { + helper_double_30_recursive(buf + 0, 12); + helper_double_30_recursive(buf + 4096, 12); + helper_double_30_recursive(buf + 8192, 12); + helper_double_30_recursive(buf + 12288, 12); + helper_double_30_recursive(buf + 16384, 12); + helper_double_30_recursive(buf + 20480, 12); + helper_double_30_recursive(buf + 24576, 12); + helper_double_30_recursive(buf + 28672, 12); + for (int j = 0; j < 32768; j += 32768) { + for (int k = 0; k < 4096; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + 
"vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 18) { + helper_double_30_recursive(buf + 0, 15); + helper_double_30_recursive(buf + 32768, 15); + helper_double_30_recursive(buf + 65536, 15); + helper_double_30_recursive(buf + 98304, 15); + helper_double_30_recursive(buf + 131072, 15); + helper_double_30_recursive(buf + 163840, 15); + helper_double_30_recursive(buf + 196608, 15); + helper_double_30_recursive(buf + 229376, 15); + for (int j = 0; j < 262144; j += 262144) { + for (int k = 0; k < 32768; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + 
"vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 21) { + helper_double_30_recursive(buf + 0, 18); + helper_double_30_recursive(buf + 262144, 18); + helper_double_30_recursive(buf + 524288, 18); + helper_double_30_recursive(buf + 786432, 18); + helper_double_30_recursive(buf + 1048576, 18); + helper_double_30_recursive(buf + 1310720, 18); + helper_double_30_recursive(buf + 1572864, 18); + helper_double_30_recursive(buf + 1835008, 18); + for (int j = 0; j < 2097152; j += 2097152) { + for (int k = 0; k < 262144; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + 
"vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 24) { + helper_double_30_recursive(buf + 0, 21); + helper_double_30_recursive(buf + 2097152, 21); + helper_double_30_recursive(buf + 4194304, 21); + helper_double_30_recursive(buf + 6291456, 21); + helper_double_30_recursive(buf + 8388608, 21); + helper_double_30_recursive(buf + 10485760, 21); + helper_double_30_recursive(buf + 
12582912, 21); + helper_double_30_recursive(buf + 14680064, 21); + for (int j = 0; j < 16777216; j += 16777216) { + for (int k = 0; k < 2097152; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 27) { + helper_double_30_recursive(buf + 0, 24); + 
helper_double_30_recursive(buf + 16777216, 24); + helper_double_30_recursive(buf + 33554432, 24); + helper_double_30_recursive(buf + 50331648, 24); + helper_double_30_recursive(buf + 67108864, 24); + helper_double_30_recursive(buf + 83886080, 24); + helper_double_30_recursive(buf + 100663296, 24); + helper_double_30_recursive(buf + 117440512, 24); + for (int j = 0; j < 134217728; j += 134217728) { + for (int k = 0; k < 16777216; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd %%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf 
+ j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } + if (depth == 30) { + helper_double_30_recursive(buf + 0, 27); + helper_double_30_recursive(buf + 134217728, 27); + helper_double_30_recursive(buf + 268435456, 27); + helper_double_30_recursive(buf + 402653184, 27); + helper_double_30_recursive(buf + 536870912, 27); + helper_double_30_recursive(buf + 671088640, 27); + helper_double_30_recursive(buf + 805306368, 27); + helper_double_30_recursive(buf + 939524096, 27); + for (int j = 0; j < 1073741824; j += 1073741824) { + for (int k = 0; k < 134217728; k += 4) { + __asm__ volatile ( + "vmovupd (%0), %%ymm0\n" + "vmovupd (%1), %%ymm1\n" + "vmovupd (%2), %%ymm2\n" + "vmovupd (%3), %%ymm3\n" + "vmovupd (%4), %%ymm4\n" + "vmovupd (%5), %%ymm5\n" + "vmovupd (%6), %%ymm6\n" + "vmovupd (%7), %%ymm7\n" + "vaddpd %%ymm1, %%ymm0, %%ymm8\n" + "vsubpd %%ymm1, %%ymm0, %%ymm9\n" + "vaddpd %%ymm3, %%ymm2, %%ymm10\n" + "vsubpd %%ymm3, %%ymm2, %%ymm11\n" + "vaddpd %%ymm5, %%ymm4, %%ymm12\n" + "vsubpd %%ymm5, %%ymm4, %%ymm13\n" + "vaddpd %%ymm7, %%ymm6, %%ymm14\n" + "vsubpd %%ymm7, %%ymm6, %%ymm15\n" + "vaddpd %%ymm10, %%ymm8, %%ymm0\n" + "vsubpd %%ymm10, %%ymm8, %%ymm2\n" + "vaddpd %%ymm11, %%ymm9, %%ymm1\n" + "vsubpd %%ymm11, %%ymm9, %%ymm3\n" + "vaddpd %%ymm14, %%ymm12, %%ymm4\n" + "vsubpd %%ymm14, %%ymm12, %%ymm6\n" + "vaddpd %%ymm15, %%ymm13, %%ymm5\n" + "vsubpd %%ymm15, %%ymm13, %%ymm7\n" + "vaddpd %%ymm4, %%ymm0, %%ymm8\n" + "vsubpd %%ymm4, %%ymm0, %%ymm12\n" + "vaddpd %%ymm5, %%ymm1, %%ymm9\n" + "vsubpd %%ymm5, %%ymm1, %%ymm13\n" + "vaddpd %%ymm6, %%ymm2, %%ymm10\n" + "vsubpd %%ymm6, %%ymm2, %%ymm14\n" + "vaddpd %%ymm7, %%ymm3, %%ymm11\n" + "vsubpd %%ymm7, %%ymm3, %%ymm15\n" + "vmovupd %%ymm8, (%0)\n" + "vmovupd %%ymm9, (%1)\n" + "vmovupd %%ymm10, (%2)\n" + "vmovupd %%ymm11, (%3)\n" + "vmovupd %%ymm12, (%4)\n" + "vmovupd 
%%ymm13, (%5)\n" + "vmovupd %%ymm14, (%6)\n" + "vmovupd %%ymm15, (%7)\n" + :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), "r"(buf + j + k + 939524096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" + ); + } + } + return; + } +} +inline void helper_double_30(double *buf); +inline void helper_double_30(double *buf) { + helper_double_30_recursive(buf, 30); +} +inline int fht_double(double *buf, int log_n) { + if (log_n == 0) { + return 0; + } + if (log_n == 1) { + helper_double_1(buf); + return 0; + } + if (log_n == 2) { + helper_double_2(buf); + return 0; + } + if (log_n == 3) { + helper_double_3(buf); + return 0; + } + if (log_n == 4) { + helper_double_4(buf); + return 0; + } + if (log_n == 5) { + helper_double_5(buf); + return 0; + } + if (log_n == 6) { + helper_double_6(buf); + return 0; + } + if (log_n == 7) { + helper_double_7(buf); + return 0; + } + if (log_n == 8) { + helper_double_8(buf); + return 0; + } + if (log_n == 9) { + helper_double_9(buf); + return 0; + } + if (log_n == 10) { + helper_double_10(buf); + return 0; + } + if (log_n == 11) { + helper_double_11(buf); + return 0; + } + if (log_n == 12) { + helper_double_12(buf); + return 0; + } + if (log_n == 13) { + helper_double_13(buf); + return 0; + } + if (log_n == 14) { + helper_double_14(buf); + return 0; + } + if (log_n == 15) { + helper_double_15(buf); + return 0; + } + if (log_n == 16) { + helper_double_16(buf); + return 0; + } + if (log_n == 17) { + helper_double_17(buf); + return 0; + } + if (log_n == 18) { + helper_double_18(buf); + return 0; + } + if (log_n == 19) { + helper_double_19(buf); + return 0; + } + if (log_n == 20) { + helper_double_20(buf); + return 0; + } + if (log_n == 21) { + helper_double_21(buf); + return 0; + } 
+ if (log_n == 22) { + helper_double_22(buf); + return 0; + } + if (log_n == 23) { + helper_double_23(buf); + return 0; + } + if (log_n == 24) { + helper_double_24(buf); + return 0; + } + if (log_n == 25) { + helper_double_25(buf); + return 0; + } + if (log_n == 26) { + helper_double_26(buf); + return 0; + } + if (log_n == 27) { + helper_double_27(buf); + return 0; + } + if (log_n == 28) { + helper_double_28(buf); + return 0; + } + if (log_n == 29) { + helper_double_29(buf); + return 0; + } + if (log_n == 30) { + helper_double_30(buf); + return 0; + } + return 1; +} From b5ffbc5a9f49926b1f39327de46a3d20258ea17f Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Mon, 15 Jun 2026 21:03:16 +0800 Subject: [PATCH 24/38] update c_api --- src/core/quantizer/CMakeLists.txt | 2 +- src/core/quantizer/record_rotator.cc | 56 +- src/core/utility/fht_avx.hpp | 19698 ------------------------- 3 files changed, 3 insertions(+), 19753 deletions(-) delete mode 100644 src/core/utility/fht_avx.hpp diff --git a/src/core/quantizer/CMakeLists.txt b/src/core/quantizer/CMakeLists.txt index e8514e1d0..4c1558735 100644 --- a/src/core/quantizer/CMakeLists.txt +++ b/src/core/quantizer/CMakeLists.txt @@ -27,7 +27,7 @@ cc_library( STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc LIBS zvec_ailego core_framework - LIBS zvec_ailego zvec_turbo core_framework + LIBS zvec_ailego zvec_turbo core_framework rabitqlib INCS . ${PROJECT_ROOT_DIR}/src/core LDFLAGS "${CORE_QUANTIZER_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" diff --git a/src/core/quantizer/record_rotator.cc b/src/core/quantizer/record_rotator.cc index 7db436222..c888a2276 100644 --- a/src/core/quantizer/record_rotator.cc +++ b/src/core/quantizer/record_rotator.cc @@ -26,7 +26,7 @@ // FFHT (Fastest Fast Hadamard Transform) — hand-tuned AVX inline assembly // from https://github.com/FALCONN-LIB/FFHT, originally bundled in rabitqlib. // Provides fht_float(buf, log_n) with per-size helper_float_N specialisations. 
-#include "utility/fht_avx.hpp" +#include "rabitqlib/utils/fht_avx.hpp" #elif defined(__SSE2__) #include #endif @@ -140,7 +140,6 @@ void flip_sign(const uint8_t *flip, float *data, size_t dim) { #elif defined(__SSE2__) // 128-bit SSE2: process 4 floats per iteration. // Load 2 bytes (16 bits) to safely handle cross-byte boundaries. - const __m128i sign_bit = _mm_set1_epi32(static_cast(0x80000000u)); for (size_t i = 0; i < dim; i += 4) { uint16_t bits16; std::memcpy(&bits16, &flip[i / 8], sizeof(bits16)); @@ -150,7 +149,7 @@ void flip_sign(const uint8_t *flip, float *data, size_t dim) { uint32_t b2 = (bits16 >> 2) & 1u; uint32_t b3 = (bits16 >> 3) & 1u; __m128i bit_mask = _mm_set_epi32(b3, b2, b1, b0); - __m128i sign_mask = _mm_mullo_epi32(bit_mask, sign_bit); + __m128i sign_mask = _mm_slli_epi32(bit_mask, 31); __m128 v = _mm_loadu_ps(&data[i]); v = _mm_xor_ps(v, _mm_castsi128_ps(sign_mask)); _mm_storeu_ps(&data[i], v); @@ -252,30 +251,6 @@ struct FhtKacRotatorImpl { std::mt19937 gen(rd()); std::uniform_int_distribution dist(0, 255); for (auto &b : flip) b = static_cast(dist(gen)); - - // Log SIMD path for debugging - const char *simd = -#if defined(__AVX512F__) && defined(__AVX512DQ__) - "AVX512F+DQ" -#elif defined(__AVX2__) - "AVX2" -#elif defined(__ARM_NEON) && defined(__aarch64__) - "ARM-NEON" -#elif defined(__SSE2__) - "SSE2" -#else - "Scalar" -#endif - ; - const char *fht = -#if defined(__AVX2__) || defined(__AVX512F__) - "FFHT-AVX" -#else - "Generic" -#endif - ; - LOG_WARN("RecordRotator[FhtKac] SIMD=%s, FHT=%s, padded_dim=%zu", - simd, fht, padded_dim); } void rotate(const float *in, float *out, size_t dim, @@ -357,7 +332,6 @@ struct MatrixRotatorImpl { std::vector matrix; // dim x padded_dim, row-major void init(size_t dim, size_t padded_dim) { - LOG_WARN("RecordRotator[Matrix] dim=%zu, padded_dim=%zu", dim, padded_dim); std::random_device rd; std::mt19937 gen(rd()); std::normal_distribution normal(0.0f, 1.0f); @@ -777,32 +751,6 @@ int 
RecordRotator::open(IndexStorage::Pointer storage, "data_size=%zu", seg_id.c_str(), impl_->dimension, impl_->padded_dim, data_size); - // Log SIMD path (same format as init, for open/load path) - const char *simd = -#if defined(__AVX512F__) && defined(__AVX512DQ__) - "AVX512F+DQ" -#elif defined(__AVX2__) - "AVX2" -#elif defined(__ARM_NEON) && defined(__aarch64__) - "ARM-NEON" -#elif defined(__SSE2__) - "SSE2" -#else - "Scalar" -#endif - ; - const char *fht = -#if defined(__AVX2__) || defined(__AVX512F__) - "FFHT-AVX" -#else - "Generic" -#endif - ; - const char *type_name = (impl_->type == RecordRotatorType::FhtKac) - ? "FhtKac" : "Matrix"; - LOG_WARN("RecordRotator::open [%s] SIMD=%s, FHT=%s, dim=%zu, padded_dim=%zu", - type_name, simd, fht, impl_->dimension, impl_->padded_dim); - // Build inverse rotation data for unrotate support build_inverse(); diff --git a/src/core/utility/fht_avx.hpp b/src/core/utility/fht_avx.hpp deleted file mode 100644 index 310b6f96f..000000000 --- a/src/core/utility/fht_avx.hpp +++ /dev/null @@ -1,19698 +0,0 @@ -// https://github.com/FALCONN-LIB/FFHT - -// The MIT License (MIT) - -// Copyright (c) 2015 Alexandr Andoni, Piotr Indyk, Thijs Laarhoven, -// Ilya Razenshteyn, Ludwig Schmidt - -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - - -#pragma once - -inline void helper_float_1(float *buf); -inline void helper_float_1(float *buf) { - for (int j = 0; j < 2; j += 2) { - for (int k = 0; k < 1; ++k) { - float u = buf[j + k]; - float v = buf[j + k + 1]; - buf[j + k] = u + v; - buf[j + k + 1] = u - v; - } - } -} -inline void helper_float_2(float *buf); -inline void helper_float_2(float *buf) { - for (int j = 0; j < 4; j += 2) { - for (int k = 0; k < 1; ++k) { - float u = buf[j + k]; - float v = buf[j + k + 1]; - buf[j + k] = u + v; - buf[j + k + 1] = u - v; - } - } - for (int j = 0; j < 4; j += 4) { - for (int k = 0; k < 2; ++k) { - float u = buf[j + k]; - float v = buf[j + k + 2]; - buf[j + k] = u + v; - buf[j + k + 2] = u - v; - } - } -} -inline void helper_float_3(float *buf); -inline void helper_float_3(float *buf) { - for (int j = 0; j < 8; j += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" 
- "vmovups %%ymm0, (%0)\n" - :: "r"(buf + j) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } -} -inline void helper_float_4(float *buf); -inline void helper_float_4(float *buf) { - for (int j = 0; j < 16; j += 16) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", 
"%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } -} -inline void helper_float_5(float *buf); -inline void helper_float_5(float *buf) { - for (int j = 0; j < 32; j += 32) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - 
"vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vmovups %%ymm0, (%0)\n" - "vmovups %%ymm1, (%1)\n" - "vmovups %%ymm2, (%2)\n" - "vmovups %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } -} -inline void helper_float_6(float *buf); -inline void helper_float_6(float *buf) { - for (int j = 0; j < 64; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, 
%%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, 
%%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, 
%%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - 
"vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } -} -inline void helper_float_7_recursive(float *buf, int depth); -inline void helper_float_7_recursive(float *buf, int depth) { - if (depth == 7) { - for (int j = 0; j < 128; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - 
"vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, 
%%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, 
%%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 128; j += 128) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_7(float *buf); -inline void helper_float_7(float *buf) { - helper_float_7_recursive(buf, 7); -} -inline void 
helper_float_8_recursive(float *buf, int depth); -inline void helper_float_8_recursive(float *buf, int depth) { - if (depth == 6) { - for (int j = 0; j < 64; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, 
%%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, 
%%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, 
%%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 8) { - helper_float_8_recursive(buf + 0, 6); - helper_float_8_recursive(buf + 64, 6); - helper_float_8_recursive(buf + 128, 6); - helper_float_8_recursive(buf + 192, 6); - for (int j = 0; j < 256; j += 256) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vmovups %%ymm0, (%0)\n" - "vmovups %%ymm1, (%1)\n" - "vmovups %%ymm2, (%2)\n" - "vmovups %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - 
); - } - } - return; - } -} -inline void helper_float_8(float *buf); -inline void helper_float_8(float *buf) { - helper_float_8_recursive(buf, 8); -} -inline void helper_float_9(float *buf); -inline void helper_float_9(float *buf) { - for (int j = 0; j < 512; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - 
"vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, 
%%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - 
"vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 512; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, 
%%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } -} -inline void helper_float_10_recursive(float *buf, int depth); -inline void helper_float_10_recursive(float *buf, int depth) { - if (depth == 10) { - for (int j = 0; j < 1024; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, 
%%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, 
%%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, 
%%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 1024; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - 
"vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 1024; j += 1024) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", 
"%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_10(float *buf); -inline void helper_float_10(float *buf) { - helper_float_10_recursive(buf, 10); -} -inline void helper_float_11_recursive(float *buf, int depth); -inline void helper_float_11_recursive(float *buf, int depth) { - if (depth == 11) { - for (int j = 0; j < 2048; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, 
%%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, 
%%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, 
%%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, 
%%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vmovups %%ymm0, (%0)\n" - "vmovups %%ymm1, (%1)\n" - "vmovups %%ymm2, (%2)\n" - "vmovups %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_11(float *buf); -inline void helper_float_11(float *buf) { - helper_float_11_recursive(buf, 11); -} -inline void 
helper_float_12(float *buf); -inline void helper_float_12(float *buf) { - for (int j = 0; j < 4096; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps 
%%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, 
%%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps 
%%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - 
"vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, 
(%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } -} -inline void helper_float_13_recursive(float *buf, int depth); -inline void helper_float_13_recursive(float *buf, int depth) { - if (depth == 11) { - for (int j = 0; j < 2048; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - 
"vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, 
%%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, 
%%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, 
%%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vmovups %%ymm0, (%0)\n" - "vmovups %%ymm1, (%1)\n" - "vmovups %%ymm2, (%2)\n" - "vmovups %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", 
"%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 13) { - helper_float_13_recursive(buf + 0, 11); - helper_float_13_recursive(buf + 2048, 11); - helper_float_13_recursive(buf + 4096, 11); - helper_float_13_recursive(buf + 6144, 11); - for (int j = 0; j < 8192; j += 8192) { - for (int k = 0; k < 2048; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vmovups %%ymm0, (%0)\n" - "vmovups %%ymm1, (%1)\n" - "vmovups %%ymm2, (%2)\n" - "vmovups %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_13(float *buf); -inline void helper_float_13(float *buf) { - helper_float_13_recursive(buf, 13); -} -inline void helper_float_14_recursive(float *buf, int depth); -inline void helper_float_14_recursive(float *buf, int depth) { - if (depth == 12) { - for (int j = 0; j < 4096; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, 
%%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, 
%%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps 
%%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), 
"r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", 
"%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 14) { - helper_float_14_recursive(buf + 0, 12); - helper_float_14_recursive(buf + 4096, 12); - 
helper_float_14_recursive(buf + 8192, 12); - helper_float_14_recursive(buf + 12288, 12); - for (int j = 0; j < 16384; j += 16384) { - for (int k = 0; k < 4096; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vmovups %%ymm0, (%0)\n" - "vmovups %%ymm1, (%1)\n" - "vmovups %%ymm2, (%2)\n" - "vmovups %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_14(float *buf); -inline void helper_float_14(float *buf) { - helper_float_14_recursive(buf, 14); -} -inline void helper_float_15_recursive(float *buf, int depth); -inline void helper_float_15_recursive(float *buf, int depth) { - if (depth == 13) { - for (int j = 0; j < 8192; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - 
"vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, 
%%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps 
%%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", 
"%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 8192; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 8192; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups 
(%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 8192; j += 8192) { - for (int k = 0; k < 4096; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : 
"%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_float_15_recursive(buf + 0, 13); - helper_float_15_recursive(buf + 8192, 13); - helper_float_15_recursive(buf + 16384, 13); - helper_float_15_recursive(buf + 24576, 13); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 8192; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vmovups %%ymm0, (%0)\n" - "vmovups %%ymm1, (%1)\n" - "vmovups %%ymm2, (%2)\n" - "vmovups %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_15(float *buf); -inline void helper_float_15(float *buf) { - helper_float_15_recursive(buf, 15); -} -inline void helper_float_16_recursive(float *buf, int depth); -inline void helper_float_16_recursive(float *buf, int depth) { - if (depth == 13) { - for (int j = 0; j < 8192; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps 
%%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps 
%%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - 
"vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - 
:: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 8192; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", 
"%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 8192; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 8192; j += 8192) { - for 
(int k = 0; k < 4096; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 16) { - helper_float_16_recursive(buf + 0, 13); - helper_float_16_recursive(buf + 8192, 13); - helper_float_16_recursive(buf + 16384, 13); - helper_float_16_recursive(buf + 24576, 13); - helper_float_16_recursive(buf + 32768, 13); - helper_float_16_recursive(buf + 40960, 13); - helper_float_16_recursive(buf + 49152, 13); - helper_float_16_recursive(buf + 57344, 13); - for (int j = 0; j < 65536; j += 65536) { - for (int k = 0; k < 8192; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps 
%%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_16(float *buf); -inline void helper_float_16(float *buf) { - helper_float_16_recursive(buf, 16); -} -inline void helper_float_17_recursive(float *buf, int depth); -inline void helper_float_17_recursive(float *buf, int depth) { - if (depth == 12) { - for (int j = 0; j < 4096; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - 
"vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, 
%%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - 
"vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups 
(%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, 
%%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_float_17_recursive(buf + 0, 12); - helper_float_17_recursive(buf + 4096, 12); - helper_float_17_recursive(buf + 8192, 12); - helper_float_17_recursive(buf + 12288, 12); - helper_float_17_recursive(buf + 16384, 12); - helper_float_17_recursive(buf + 20480, 12); - helper_float_17_recursive(buf + 24576, 12); - helper_float_17_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), 
%%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 17) { - helper_float_17_recursive(buf + 0, 15); - helper_float_17_recursive(buf + 32768, 15); - helper_float_17_recursive(buf + 65536, 15); - helper_float_17_recursive(buf + 98304, 15); - for (int j = 0; j < 131072; j += 131072) { - for (int k = 0; k < 32768; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vaddps %%ymm1, 
%%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vmovups %%ymm0, (%0)\n" - "vmovups %%ymm1, (%1)\n" - "vmovups %%ymm2, (%2)\n" - "vmovups %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_17(float *buf); -inline void helper_float_17(float *buf) { - helper_float_17_recursive(buf, 17); -} -inline void helper_float_18_recursive(float *buf, int depth); -inline void helper_float_18_recursive(float *buf, int depth) { - if (depth == 12) { - for (int j = 0; j < 4096; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - 
"vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, 
%%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - 
"vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups 
(%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, 
%%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_float_18_recursive(buf + 0, 12); - helper_float_18_recursive(buf + 4096, 12); - helper_float_18_recursive(buf + 8192, 12); - helper_float_18_recursive(buf + 12288, 12); - helper_float_18_recursive(buf + 16384, 12); - helper_float_18_recursive(buf + 20480, 12); - helper_float_18_recursive(buf + 24576, 12); - helper_float_18_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), 
%%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 18) { - helper_float_18_recursive(buf + 0, 15); - helper_float_18_recursive(buf + 32768, 15); - helper_float_18_recursive(buf + 65536, 15); - helper_float_18_recursive(buf + 98304, 15); - helper_float_18_recursive(buf + 131072, 15); - helper_float_18_recursive(buf + 163840, 15); - helper_float_18_recursive(buf + 196608, 15); - helper_float_18_recursive(buf + 229376, 15); - for (int j = 0; j < 262144; j += 262144) 
{ - for (int k = 0; k < 32768; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_18(float *buf); -inline void helper_float_18(float *buf) { - helper_float_18_recursive(buf, 18); -} -inline void helper_float_19_recursive(float *buf, int depth); -inline void 
helper_float_19_recursive(float *buf, int depth) { - if (depth == 13) { - for (int j = 0; j < 8192; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps 
%%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, 
%%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps 
%%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 8192; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - 
"vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 8192; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, 
(%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 8192; j += 8192) { - for (int k = 0; k < 4096; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 16) { - helper_float_19_recursive(buf + 0, 13); - helper_float_19_recursive(buf + 8192, 13); - helper_float_19_recursive(buf + 16384, 13); - helper_float_19_recursive(buf + 24576, 13); - helper_float_19_recursive(buf + 32768, 13); - helper_float_19_recursive(buf + 40960, 13); - helper_float_19_recursive(buf + 49152, 13); - helper_float_19_recursive(buf + 57344, 13); - for (int j = 0; j < 65536; j += 65536) { - for (int k = 0; k < 8192; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, 
%%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 19) { - helper_float_19_recursive(buf + 0, 16); - helper_float_19_recursive(buf + 65536, 16); - helper_float_19_recursive(buf + 131072, 16); - helper_float_19_recursive(buf + 196608, 16); - helper_float_19_recursive(buf + 262144, 16); - helper_float_19_recursive(buf + 327680, 16); - helper_float_19_recursive(buf + 393216, 16); - helper_float_19_recursive(buf + 458752, 16); - for (int j = 0; j < 524288; j += 524288) { - for (int k = 0; k < 65536; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, 
%%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_19(float *buf); -inline void helper_float_19(float *buf) { - helper_float_19_recursive(buf, 19); -} -inline void helper_float_20_recursive(float *buf, int depth); -inline void helper_float_20_recursive(float *buf, int depth) { - if (depth == 12) { - for (int j = 0; j < 4096; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), 
%%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, 
%%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, 
%%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - 
"vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), 
"r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", 
"%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_float_20_recursive(buf + 0, 12); - helper_float_20_recursive(buf + 4096, 12); - helper_float_20_recursive(buf + 8192, 12); - helper_float_20_recursive(buf + 12288, 12); - helper_float_20_recursive(buf + 16384, 12); - helper_float_20_recursive(buf + 20480, 12); - helper_float_20_recursive(buf + 24576, 12); - helper_float_20_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 
8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 18) { - helper_float_20_recursive(buf + 0, 15); - helper_float_20_recursive(buf + 32768, 15); - helper_float_20_recursive(buf + 65536, 15); - helper_float_20_recursive(buf + 98304, 15); - helper_float_20_recursive(buf + 131072, 15); - helper_float_20_recursive(buf + 163840, 15); - helper_float_20_recursive(buf + 196608, 15); - helper_float_20_recursive(buf + 229376, 15); - for (int j = 0; j < 262144; j += 262144) { - for (int k = 0; k < 32768; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, 
(%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 20) { - helper_float_20_recursive(buf + 0, 18); - helper_float_20_recursive(buf + 262144, 18); - helper_float_20_recursive(buf + 524288, 18); - helper_float_20_recursive(buf + 786432, 18); - for (int j = 0; j < 1048576; j += 1048576) { - for (int k = 0; k < 262144; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vmovups %%ymm0, (%0)\n" - "vmovups %%ymm1, (%1)\n" - "vmovups %%ymm2, (%2)\n" - "vmovups %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_20(float *buf); -inline void helper_float_20(float *buf) { - helper_float_20_recursive(buf, 20); -} -inline void helper_float_21_recursive(float *buf, int depth); -inline void helper_float_21_recursive(float *buf, int depth) { - if (depth == 9) { - for (int j = 0; j < 512; j += 64) { - for (int k = 0; k 
< 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, 
%%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, 
%%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, 
%%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 512; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - 
"vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 12) { - helper_float_21_recursive(buf + 0, 9); - helper_float_21_recursive(buf + 512, 9); - helper_float_21_recursive(buf + 1024, 9); - helper_float_21_recursive(buf + 1536, 9); - helper_float_21_recursive(buf + 2048, 9); - helper_float_21_recursive(buf + 2560, 9); - helper_float_21_recursive(buf + 3072, 9); - helper_float_21_recursive(buf + 3584, 9); - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, 
%%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_float_21_recursive(buf + 0, 12); - helper_float_21_recursive(buf + 4096, 12); - helper_float_21_recursive(buf + 8192, 12); - helper_float_21_recursive(buf + 12288, 12); - helper_float_21_recursive(buf + 16384, 12); - helper_float_21_recursive(buf + 20480, 12); - helper_float_21_recursive(buf + 24576, 12); - helper_float_21_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps 
%%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 18) { - helper_float_21_recursive(buf + 0, 15); - helper_float_21_recursive(buf + 32768, 15); - helper_float_21_recursive(buf + 65536, 15); - helper_float_21_recursive(buf + 98304, 15); - helper_float_21_recursive(buf + 131072, 15); - helper_float_21_recursive(buf + 163840, 15); - helper_float_21_recursive(buf + 196608, 15); - helper_float_21_recursive(buf + 229376, 15); - for (int j = 0; j < 262144; j += 262144) { - for (int k = 0; k < 32768; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, 
%%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 21) { - helper_float_21_recursive(buf + 0, 18); - helper_float_21_recursive(buf + 262144, 18); - helper_float_21_recursive(buf + 524288, 18); - helper_float_21_recursive(buf + 786432, 18); - helper_float_21_recursive(buf + 1048576, 18); - helper_float_21_recursive(buf + 1310720, 18); - helper_float_21_recursive(buf + 1572864, 18); - helper_float_21_recursive(buf + 1835008, 18); - for (int j = 0; j < 2097152; j += 2097152) { - for (int k = 0; k < 262144; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps 
%%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_21(float *buf); -inline void helper_float_21(float *buf) { - helper_float_21_recursive(buf, 21); -} -inline void helper_float_22_recursive(float *buf, int depth); -inline void helper_float_22_recursive(float *buf, int depth) { - if (depth == 11) { - for (int j = 0; j < 2048; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps 
$160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, 
%%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps 
%%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - 
"vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf 
+ j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vmovups %%ymm0, (%0)\n" - "vmovups %%ymm1, (%1)\n" - "vmovups %%ymm2, (%2)\n" - "vmovups %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 14) { - helper_float_22_recursive(buf + 0, 11); - helper_float_22_recursive(buf + 2048, 11); - helper_float_22_recursive(buf + 4096, 11); - helper_float_22_recursive(buf + 6144, 11); - helper_float_22_recursive(buf + 8192, 11); - helper_float_22_recursive(buf + 10240, 11); - helper_float_22_recursive(buf + 12288, 11); - helper_float_22_recursive(buf + 14336, 11); - for (int j = 0; j < 16384; j += 16384) { - for (int k = 0; k < 2048; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, 
%%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 17) { - helper_float_22_recursive(buf + 0, 14); - helper_float_22_recursive(buf + 16384, 14); - helper_float_22_recursive(buf + 32768, 14); - helper_float_22_recursive(buf + 49152, 14); - helper_float_22_recursive(buf + 65536, 14); - helper_float_22_recursive(buf + 81920, 14); - helper_float_22_recursive(buf + 98304, 14); - helper_float_22_recursive(buf + 114688, 14); - for (int j = 0; j < 131072; j += 131072) { - for (int k = 0; k < 16384; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - 
"vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 20) { - helper_float_22_recursive(buf + 0, 17); - helper_float_22_recursive(buf + 131072, 17); - helper_float_22_recursive(buf + 262144, 17); - helper_float_22_recursive(buf + 393216, 17); - helper_float_22_recursive(buf + 524288, 17); - helper_float_22_recursive(buf + 655360, 17); - helper_float_22_recursive(buf + 786432, 17); - helper_float_22_recursive(buf + 917504, 17); - for (int j = 0; j < 
1048576; j += 1048576) { - for (int k = 0; k < 131072; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 22) { - helper_float_22_recursive(buf + 0, 20); - helper_float_22_recursive(buf + 1048576, 20); - helper_float_22_recursive(buf + 2097152, 20); - 
helper_float_22_recursive(buf + 3145728, 20); - for (int j = 0; j < 4194304; j += 4194304) { - for (int k = 0; k < 1048576; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vmovups %%ymm0, (%0)\n" - "vmovups %%ymm1, (%1)\n" - "vmovups %%ymm2, (%2)\n" - "vmovups %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_22(float *buf); -inline void helper_float_22(float *buf) { - helper_float_22_recursive(buf, 22); -} -inline void helper_float_23_recursive(float *buf, int depth); -inline void helper_float_23_recursive(float *buf, int depth) { - if (depth == 9) { - for (int j = 0; j < 512; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, 
%%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, 
%%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - 
"vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", 
"%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 512; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 12) { - helper_float_23_recursive(buf + 0, 9); - helper_float_23_recursive(buf + 512, 9); - helper_float_23_recursive(buf + 1024, 9); - 
helper_float_23_recursive(buf + 1536, 9); - helper_float_23_recursive(buf + 2048, 9); - helper_float_23_recursive(buf + 2560, 9); - helper_float_23_recursive(buf + 3072, 9); - helper_float_23_recursive(buf + 3584, 9); - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", 
"memory" - ); - } - } - return; - } - if (depth == 15) { - helper_float_23_recursive(buf + 0, 12); - helper_float_23_recursive(buf + 4096, 12); - helper_float_23_recursive(buf + 8192, 12); - helper_float_23_recursive(buf + 12288, 12); - helper_float_23_recursive(buf + 16384, 12); - helper_float_23_recursive(buf + 20480, 12); - helper_float_23_recursive(buf + 24576, 12); - helper_float_23_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 
20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 18) { - helper_float_23_recursive(buf + 0, 15); - helper_float_23_recursive(buf + 32768, 15); - helper_float_23_recursive(buf + 65536, 15); - helper_float_23_recursive(buf + 98304, 15); - helper_float_23_recursive(buf + 131072, 15); - helper_float_23_recursive(buf + 163840, 15); - helper_float_23_recursive(buf + 196608, 15); - helper_float_23_recursive(buf + 229376, 15); - for (int j = 0; j < 262144; j += 262144) { - for (int k = 0; k < 32768; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups 
%%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 21) { - helper_float_23_recursive(buf + 0, 18); - helper_float_23_recursive(buf + 262144, 18); - helper_float_23_recursive(buf + 524288, 18); - helper_float_23_recursive(buf + 786432, 18); - helper_float_23_recursive(buf + 1048576, 18); - helper_float_23_recursive(buf + 1310720, 18); - helper_float_23_recursive(buf + 1572864, 18); - helper_float_23_recursive(buf + 1835008, 18); - for (int j = 0; j < 2097152; j += 2097152) { - for (int k = 0; k < 262144; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, 
%%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 23) { - helper_float_23_recursive(buf + 0, 21); - helper_float_23_recursive(buf + 2097152, 21); - helper_float_23_recursive(buf + 4194304, 21); - helper_float_23_recursive(buf + 6291456, 21); - for (int j = 0; j < 8388608; j += 8388608) { - for (int k = 0; k < 2097152; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vmovups %%ymm0, (%0)\n" - "vmovups %%ymm1, (%1)\n" - "vmovups %%ymm2, (%2)\n" - "vmovups %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_23(float *buf); -inline void helper_float_23(float *buf) { - helper_float_23_recursive(buf, 23); -} -inline void 
helper_float_24_recursive(float *buf, int depth); -inline void helper_float_24_recursive(float *buf, int depth) { - if (depth == 12) { - for (int j = 0; j < 4096; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, 
%%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, 
%%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, 
%%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps 
%%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups 
%%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_float_24_recursive(buf + 0, 12); - helper_float_24_recursive(buf + 4096, 12); - helper_float_24_recursive(buf + 8192, 12); - helper_float_24_recursive(buf + 12288, 12); - helper_float_24_recursive(buf + 16384, 12); - helper_float_24_recursive(buf + 20480, 12); - helper_float_24_recursive(buf + 24576, 12); - helper_float_24_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps 
%%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 18) { - helper_float_24_recursive(buf + 0, 15); - helper_float_24_recursive(buf + 32768, 15); - helper_float_24_recursive(buf + 65536, 15); - helper_float_24_recursive(buf + 98304, 15); - helper_float_24_recursive(buf + 131072, 15); - helper_float_24_recursive(buf + 163840, 15); - helper_float_24_recursive(buf + 196608, 15); - helper_float_24_recursive(buf + 229376, 15); - for (int j = 0; j < 262144; j += 262144) { - for (int k = 0; k < 32768; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps 
%%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 21) { - helper_float_24_recursive(buf + 0, 18); - helper_float_24_recursive(buf + 262144, 18); - helper_float_24_recursive(buf + 524288, 18); - helper_float_24_recursive(buf + 786432, 18); - helper_float_24_recursive(buf + 1048576, 18); - helper_float_24_recursive(buf + 1310720, 18); - helper_float_24_recursive(buf + 1572864, 18); - helper_float_24_recursive(buf + 1835008, 18); - for (int j = 0; j < 2097152; j += 2097152) { - for (int k = 0; k < 262144; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - 
"vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 24) { - helper_float_24_recursive(buf + 0, 21); - helper_float_24_recursive(buf + 2097152, 21); - helper_float_24_recursive(buf + 4194304, 21); - helper_float_24_recursive(buf + 6291456, 21); - helper_float_24_recursive(buf + 8388608, 21); - helper_float_24_recursive(buf + 10485760, 21); - helper_float_24_recursive(buf + 12582912, 21); - helper_float_24_recursive(buf + 14680064, 21); - for (int j = 0; j < 16777216; j += 16777216) { - for (int k = 0; k < 2097152; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps 
%%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_24(float *buf); -inline void helper_float_24(float *buf) { - helper_float_24_recursive(buf, 24); -} -inline void helper_float_25_recursive(float *buf, int depth); -inline void helper_float_25_recursive(float *buf, int depth) { - if (depth == 7) { - for (int j = 0; j < 128; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), 
%%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, 
%%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 
$49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - 
"vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 128; j += 128) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 10) { - helper_float_25_recursive(buf + 0, 7); - helper_float_25_recursive(buf + 128, 7); - helper_float_25_recursive(buf + 256, 7); - helper_float_25_recursive(buf + 384, 7); - helper_float_25_recursive(buf + 512, 7); - helper_float_25_recursive(buf + 640, 7); - helper_float_25_recursive(buf + 768, 7); - helper_float_25_recursive(buf + 896, 7); - for (int j = 0; j < 1024; j += 1024) { - for (int k = 0; k < 128; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, 
%%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 13) { - helper_float_25_recursive(buf + 0, 10); - helper_float_25_recursive(buf + 1024, 10); - helper_float_25_recursive(buf + 2048, 10); - helper_float_25_recursive(buf + 3072, 10); - helper_float_25_recursive(buf + 4096, 10); - helper_float_25_recursive(buf + 5120, 10); - helper_float_25_recursive(buf + 6144, 10); - helper_float_25_recursive(buf + 7168, 10); - for (int j = 0; j < 8192; j += 8192) { - for (int k = 0; k < 1024; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, 
%%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 16) { - helper_float_25_recursive(buf + 0, 13); - helper_float_25_recursive(buf + 8192, 13); - helper_float_25_recursive(buf + 16384, 13); - helper_float_25_recursive(buf + 24576, 13); - helper_float_25_recursive(buf + 32768, 13); - helper_float_25_recursive(buf + 40960, 13); - helper_float_25_recursive(buf + 49152, 13); - helper_float_25_recursive(buf + 57344, 13); - for (int j = 0; j < 65536; j += 65536) { - for (int k = 0; k < 8192; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups 
(%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 19) { - helper_float_25_recursive(buf + 0, 16); - helper_float_25_recursive(buf + 65536, 16); - helper_float_25_recursive(buf + 131072, 16); - helper_float_25_recursive(buf + 196608, 16); - helper_float_25_recursive(buf + 262144, 16); - helper_float_25_recursive(buf + 327680, 16); - helper_float_25_recursive(buf + 393216, 16); 
- helper_float_25_recursive(buf + 458752, 16); - for (int j = 0; j < 524288; j += 524288) { - for (int k = 0; k < 65536; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 22) { - helper_float_25_recursive(buf + 0, 19); - helper_float_25_recursive(buf + 524288, 19); - 
helper_float_25_recursive(buf + 1048576, 19); - helper_float_25_recursive(buf + 1572864, 19); - helper_float_25_recursive(buf + 2097152, 19); - helper_float_25_recursive(buf + 2621440, 19); - helper_float_25_recursive(buf + 3145728, 19); - helper_float_25_recursive(buf + 3670016, 19); - for (int j = 0; j < 4194304; j += 4194304) { - for (int k = 0; k < 524288; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", 
"%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 25) { - helper_float_25_recursive(buf + 0, 22); - helper_float_25_recursive(buf + 4194304, 22); - helper_float_25_recursive(buf + 8388608, 22); - helper_float_25_recursive(buf + 12582912, 22); - helper_float_25_recursive(buf + 16777216, 22); - helper_float_25_recursive(buf + 20971520, 22); - helper_float_25_recursive(buf + 25165824, 22); - helper_float_25_recursive(buf + 29360128, 22); - for (int j = 0; j < 33554432; j += 33554432) { - for (int k = 0; k < 4194304; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), 
"r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_25(float *buf); -inline void helper_float_25(float *buf) { - helper_float_25_recursive(buf, 25); -} -inline void helper_float_26_recursive(float *buf, int depth); -inline void helper_float_26_recursive(float *buf, int depth) { - if (depth == 12) { - for (int j = 0; j < 4096; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, 
%%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, 
%%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, 
%%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps 
%%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, 
%%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_float_26_recursive(buf + 0, 12); - helper_float_26_recursive(buf + 4096, 12); - helper_float_26_recursive(buf + 8192, 12); - helper_float_26_recursive(buf + 12288, 12); - helper_float_26_recursive(buf + 16384, 12); - helper_float_26_recursive(buf + 20480, 12); - helper_float_26_recursive(buf + 24576, 12); - helper_float_26_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps 
%%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 18) { - helper_float_26_recursive(buf + 0, 15); - helper_float_26_recursive(buf + 32768, 15); - helper_float_26_recursive(buf + 65536, 15); - helper_float_26_recursive(buf + 98304, 15); - helper_float_26_recursive(buf + 131072, 15); - helper_float_26_recursive(buf + 163840, 15); - helper_float_26_recursive(buf + 196608, 15); - helper_float_26_recursive(buf + 229376, 15); - for (int j = 0; j < 262144; j += 262144) { - for (int k = 0; k < 32768; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps 
%%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 21) { - helper_float_26_recursive(buf + 0, 18); - helper_float_26_recursive(buf + 262144, 18); - helper_float_26_recursive(buf + 524288, 18); - helper_float_26_recursive(buf + 786432, 18); - helper_float_26_recursive(buf + 1048576, 18); - helper_float_26_recursive(buf + 1310720, 18); - helper_float_26_recursive(buf + 1572864, 18); - helper_float_26_recursive(buf + 1835008, 18); - for (int j = 0; j < 2097152; j += 2097152) { - for (int k = 0; k < 262144; k += 8) { - __asm__ 
volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 24) { - helper_float_26_recursive(buf + 0, 21); - helper_float_26_recursive(buf + 2097152, 21); - helper_float_26_recursive(buf + 4194304, 21); - helper_float_26_recursive(buf + 6291456, 21); - helper_float_26_recursive(buf + 8388608, 21); 
- helper_float_26_recursive(buf + 10485760, 21); - helper_float_26_recursive(buf + 12582912, 21); - helper_float_26_recursive(buf + 14680064, 21); - for (int j = 0; j < 16777216; j += 16777216) { - for (int k = 0; k < 2097152; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - 
if (depth == 26) { - helper_float_26_recursive(buf + 0, 24); - helper_float_26_recursive(buf + 16777216, 24); - helper_float_26_recursive(buf + 33554432, 24); - helper_float_26_recursive(buf + 50331648, 24); - for (int j = 0; j < 67108864; j += 67108864) { - for (int k = 0; k < 16777216; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vmovups %%ymm0, (%0)\n" - "vmovups %%ymm1, (%1)\n" - "vmovups %%ymm2, (%2)\n" - "vmovups %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_26(float *buf); -inline void helper_float_26(float *buf) { - helper_float_26_recursive(buf, 26); -} -inline void helper_float_27_recursive(float *buf, int depth); -inline void helper_float_27_recursive(float *buf, int depth) { - if (depth == 12) { - for (int j = 0; j < 4096; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps 
%%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - 
"vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, 
%%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), 
"r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - 
); - } - } - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_float_27_recursive(buf + 0, 12); - helper_float_27_recursive(buf + 4096, 12); - helper_float_27_recursive(buf + 8192, 12); - 
helper_float_27_recursive(buf + 12288, 12); - helper_float_27_recursive(buf + 16384, 12); - helper_float_27_recursive(buf + 20480, 12); - helper_float_27_recursive(buf + 24576, 12); - helper_float_27_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", 
"%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 18) { - helper_float_27_recursive(buf + 0, 15); - helper_float_27_recursive(buf + 32768, 15); - helper_float_27_recursive(buf + 65536, 15); - helper_float_27_recursive(buf + 98304, 15); - helper_float_27_recursive(buf + 131072, 15); - helper_float_27_recursive(buf + 163840, 15); - helper_float_27_recursive(buf + 196608, 15); - helper_float_27_recursive(buf + 229376, 15); - for (int j = 0; j < 262144; j += 262144) { - for (int k = 0; k < 32768; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 
131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 21) { - helper_float_27_recursive(buf + 0, 18); - helper_float_27_recursive(buf + 262144, 18); - helper_float_27_recursive(buf + 524288, 18); - helper_float_27_recursive(buf + 786432, 18); - helper_float_27_recursive(buf + 1048576, 18); - helper_float_27_recursive(buf + 1310720, 18); - helper_float_27_recursive(buf + 1572864, 18); - helper_float_27_recursive(buf + 1835008, 18); - for (int j = 0; j < 2097152; j += 2097152) { - for (int k = 0; k < 262144; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - 
"vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 24) { - helper_float_27_recursive(buf + 0, 21); - helper_float_27_recursive(buf + 2097152, 21); - helper_float_27_recursive(buf + 4194304, 21); - helper_float_27_recursive(buf + 6291456, 21); - helper_float_27_recursive(buf + 8388608, 21); - helper_float_27_recursive(buf + 10485760, 21); - helper_float_27_recursive(buf + 12582912, 21); - helper_float_27_recursive(buf + 14680064, 21); - for (int j = 0; j < 16777216; j += 16777216) { - for (int k = 0; k < 2097152; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps 
%%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 27) { - helper_float_27_recursive(buf + 0, 24); - helper_float_27_recursive(buf + 16777216, 24); - helper_float_27_recursive(buf + 33554432, 24); - helper_float_27_recursive(buf + 50331648, 24); - helper_float_27_recursive(buf + 67108864, 24); - helper_float_27_recursive(buf + 83886080, 24); - helper_float_27_recursive(buf + 100663296, 24); - helper_float_27_recursive(buf + 117440512, 24); - for (int j = 0; j < 134217728; j += 134217728) { - for (int k = 0; k < 16777216; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - 
"vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_27(float *buf); -inline void helper_float_27(float *buf) { - helper_float_27_recursive(buf, 27); -} -inline void helper_float_28_recursive(float *buf, int depth); -inline void helper_float_28_recursive(float *buf, int depth) { - if (depth == 7) { - for (int j = 0; j < 128; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, 
%%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, 
%%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, 
%%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", 
"%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 128; j += 128) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 10) { - helper_float_28_recursive(buf + 0, 7); - helper_float_28_recursive(buf + 128, 7); - helper_float_28_recursive(buf + 256, 7); - helper_float_28_recursive(buf + 384, 7); - helper_float_28_recursive(buf + 512, 7); - helper_float_28_recursive(buf + 640, 7); - helper_float_28_recursive(buf + 768, 7); - helper_float_28_recursive(buf + 896, 7); - for (int j = 0; j < 1024; j += 1024) { - for (int k = 0; k < 128; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps 
%%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 13) { - helper_float_28_recursive(buf + 0, 10); - helper_float_28_recursive(buf + 1024, 10); - helper_float_28_recursive(buf + 2048, 10); - helper_float_28_recursive(buf + 3072, 10); - helper_float_28_recursive(buf + 4096, 10); - helper_float_28_recursive(buf + 5120, 10); - helper_float_28_recursive(buf + 6144, 10); - helper_float_28_recursive(buf + 7168, 10); - for (int j = 0; j < 8192; j += 8192) { - for (int k = 0; k < 1024; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps 
%%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 16) { - helper_float_28_recursive(buf + 0, 13); - helper_float_28_recursive(buf + 8192, 13); - helper_float_28_recursive(buf + 16384, 13); - helper_float_28_recursive(buf + 24576, 13); - helper_float_28_recursive(buf + 32768, 13); - helper_float_28_recursive(buf + 40960, 13); - helper_float_28_recursive(buf + 49152, 13); - helper_float_28_recursive(buf + 57344, 13); - for (int j = 0; j < 65536; j += 65536) { - for (int k = 0; k < 8192; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, 
%%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 19) { - helper_float_28_recursive(buf + 0, 16); - helper_float_28_recursive(buf + 65536, 16); - helper_float_28_recursive(buf + 131072, 16); - helper_float_28_recursive(buf + 196608, 16); - helper_float_28_recursive(buf + 262144, 16); - helper_float_28_recursive(buf + 327680, 16); - helper_float_28_recursive(buf + 393216, 16); - helper_float_28_recursive(buf + 458752, 16); - for (int j = 0; j < 524288; j += 524288) { - for (int k = 0; k < 65536; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, 
%%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 22) { - helper_float_28_recursive(buf + 0, 19); - helper_float_28_recursive(buf + 524288, 19); - helper_float_28_recursive(buf + 1048576, 19); - helper_float_28_recursive(buf + 1572864, 19); - helper_float_28_recursive(buf + 2097152, 19); - helper_float_28_recursive(buf + 2621440, 19); - helper_float_28_recursive(buf + 3145728, 19); - helper_float_28_recursive(buf + 3670016, 19); - for (int j = 0; j < 4194304; j += 4194304) { - for (int k = 0; k < 524288; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), 
%%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 25) { - helper_float_28_recursive(buf + 0, 22); - helper_float_28_recursive(buf + 4194304, 22); - helper_float_28_recursive(buf + 8388608, 22); - helper_float_28_recursive(buf + 12582912, 22); - helper_float_28_recursive(buf + 16777216, 22); - helper_float_28_recursive(buf + 20971520, 22); - 
helper_float_28_recursive(buf + 25165824, 22); - helper_float_28_recursive(buf + 29360128, 22); - for (int j = 0; j < 33554432; j += 33554432) { - for (int k = 0; k < 4194304; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 28) { - 
helper_float_28_recursive(buf + 0, 25); - helper_float_28_recursive(buf + 33554432, 25); - helper_float_28_recursive(buf + 67108864, 25); - helper_float_28_recursive(buf + 100663296, 25); - helper_float_28_recursive(buf + 134217728, 25); - helper_float_28_recursive(buf + 167772160, 25); - helper_float_28_recursive(buf + 201326592, 25); - helper_float_28_recursive(buf + 234881024, 25); - for (int j = 0; j < 268435456; j += 268435456) { - for (int k = 0; k < 33554432; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 33554432), "r"(buf + j + k + 67108864), "r"(buf + j + k + 100663296), "r"(buf + j + k + 134217728), "r"(buf + j + k + 
167772160), "r"(buf + j + k + 201326592), "r"(buf + j + k + 234881024) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_28(float *buf); -inline void helper_float_28(float *buf) { - helper_float_28_recursive(buf, 28); -} -inline void helper_float_29_recursive(float *buf, int depth); -inline void helper_float_29_recursive(float *buf, int depth) { - if (depth == 12) { - for (int j = 0; j < 4096; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - 
"vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, 
%%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, 
%%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, 
%%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - 
"vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_float_29_recursive(buf + 0, 12); - helper_float_29_recursive(buf + 4096, 12); - helper_float_29_recursive(buf + 8192, 12); - helper_float_29_recursive(buf + 12288, 12); - helper_float_29_recursive(buf + 16384, 12); - helper_float_29_recursive(buf + 20480, 12); - helper_float_29_recursive(buf + 24576, 12); - helper_float_29_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, 
%%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 18) { - helper_float_29_recursive(buf + 0, 15); - helper_float_29_recursive(buf + 32768, 15); - helper_float_29_recursive(buf + 65536, 15); - helper_float_29_recursive(buf + 98304, 15); - helper_float_29_recursive(buf + 131072, 15); - helper_float_29_recursive(buf + 163840, 15); - helper_float_29_recursive(buf + 196608, 15); - helper_float_29_recursive(buf + 229376, 15); - for (int j = 0; j < 262144; j += 262144) { - for (int k = 0; k < 32768; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, 
%%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 21) { - helper_float_29_recursive(buf + 0, 18); - helper_float_29_recursive(buf + 262144, 18); - helper_float_29_recursive(buf + 524288, 18); - helper_float_29_recursive(buf + 786432, 18); - helper_float_29_recursive(buf + 1048576, 18); - helper_float_29_recursive(buf + 1310720, 18); - helper_float_29_recursive(buf + 1572864, 18); - helper_float_29_recursive(buf + 1835008, 18); - for (int j = 0; j < 2097152; j += 2097152) { - for (int k = 0; k < 262144; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups 
(%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 24) { - helper_float_29_recursive(buf + 0, 21); - helper_float_29_recursive(buf + 2097152, 21); - helper_float_29_recursive(buf + 4194304, 21); - helper_float_29_recursive(buf + 6291456, 21); - helper_float_29_recursive(buf + 8388608, 21); - helper_float_29_recursive(buf + 10485760, 21); - helper_float_29_recursive(buf + 12582912, 21); - helper_float_29_recursive(buf 
+ 14680064, 21); - for (int j = 0; j < 16777216; j += 16777216) { - for (int k = 0; k < 2097152; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 27) { - helper_float_29_recursive(buf + 0, 24); - helper_float_29_recursive(buf + 16777216, 24); - 
helper_float_29_recursive(buf + 33554432, 24); - helper_float_29_recursive(buf + 50331648, 24); - helper_float_29_recursive(buf + 67108864, 24); - helper_float_29_recursive(buf + 83886080, 24); - helper_float_29_recursive(buf + 100663296, 24); - helper_float_29_recursive(buf + 117440512, 24); - for (int j = 0; j < 134217728; j += 134217728) { - for (int k = 0; k < 16777216; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", 
"%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 29) { - helper_float_29_recursive(buf + 0, 27); - helper_float_29_recursive(buf + 134217728, 27); - helper_float_29_recursive(buf + 268435456, 27); - helper_float_29_recursive(buf + 402653184, 27); - for (int j = 0; j < 536870912; j += 536870912) { - for (int k = 0; k < 134217728; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vmovups %%ymm0, (%0)\n" - "vmovups %%ymm1, (%1)\n" - "vmovups %%ymm2, (%2)\n" - "vmovups %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_29(float *buf); -inline void helper_float_29(float *buf) { - helper_float_29_recursive(buf, 29); -} -inline void helper_float_30_recursive(float *buf, int depth); -inline void helper_float_30_recursive(float *buf, int depth) { - if (depth == 6) { - for (int j = 0; j < 64; j += 64) { - for (int k = 0; k < 8; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vpermilps $160, %%ymm0, %%ymm8\n" - "vpermilps $245, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" 
- "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm0\n" - "vpermilps $160, %%ymm1, %%ymm8\n" - "vpermilps $245, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm1\n" - "vpermilps $160, %%ymm2, %%ymm8\n" - "vpermilps $245, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm2\n" - "vpermilps $160, %%ymm3, %%ymm8\n" - "vpermilps $245, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm3\n" - "vpermilps $160, %%ymm4, %%ymm8\n" - "vpermilps $245, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm4\n" - "vpermilps $160, %%ymm5, %%ymm8\n" - "vpermilps $245, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm5\n" - "vpermilps $160, %%ymm6, %%ymm8\n" - "vpermilps $245, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm6\n" - "vpermilps $160, %%ymm7, %%ymm8\n" - "vpermilps $245, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubps %%ymm11, %%ymm8, %%ymm7\n" - "vpermilps $68, %%ymm0, %%ymm8\n" - "vpermilps $238, %%ymm0, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm0\n" - "vpermilps $68, %%ymm1, %%ymm8\n" - "vpermilps $238, %%ymm1, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm1\n" - "vpermilps $68, %%ymm2, %%ymm8\n" - "vpermilps $238, %%ymm2, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - 
"vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm2\n" - "vpermilps $68, %%ymm3, %%ymm8\n" - "vpermilps $238, %%ymm3, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm3\n" - "vpermilps $68, %%ymm4, %%ymm8\n" - "vpermilps $238, %%ymm4, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm4\n" - "vpermilps $68, %%ymm5, %%ymm8\n" - "vpermilps $238, %%ymm5, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm5\n" - "vpermilps $68, %%ymm6, %%ymm8\n" - "vpermilps $238, %%ymm6, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm6\n" - "vpermilps $68, %%ymm7, %%ymm8\n" - "vpermilps $238, %%ymm7, %%ymm9\n" - "vxorps %%ymm10, %%ymm10, %%ymm10\n" - "vsubps %%ymm9, %%ymm10, %%ymm11\n" - "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n" - "vaddps %%ymm8, %%ymm12, %%ymm7\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm0, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm0\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm1, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm1\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm2, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm2\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm3, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm3, %%ymm3, 
%%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm3\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm4, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm4\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm5, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm5\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm6, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm6\n" - "vxorps %%ymm8, %%ymm8, %%ymm8\n" - "vsubps %%ymm7, %%ymm8, %%ymm9\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n" - "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n" - "vaddps %%ymm10, %%ymm11, %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups 
%%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 9) { - helper_float_30_recursive(buf + 0, 6); - helper_float_30_recursive(buf + 64, 6); - helper_float_30_recursive(buf + 128, 6); - helper_float_30_recursive(buf + 192, 6); - helper_float_30_recursive(buf + 256, 6); - helper_float_30_recursive(buf + 320, 6); - helper_float_30_recursive(buf + 384, 6); - helper_float_30_recursive(buf + 448, 6); - for (int j = 0; j < 512; j += 512) { - for (int k = 0; k < 64; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, 
(%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 12) { - helper_float_30_recursive(buf + 0, 9); - helper_float_30_recursive(buf + 512, 9); - helper_float_30_recursive(buf + 1024, 9); - helper_float_30_recursive(buf + 1536, 9); - helper_float_30_recursive(buf + 2048, 9); - helper_float_30_recursive(buf + 2560, 9); - helper_float_30_recursive(buf + 3072, 9); - helper_float_30_recursive(buf + 3584, 9); - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, 
%%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_float_30_recursive(buf + 0, 12); - helper_float_30_recursive(buf + 4096, 12); - helper_float_30_recursive(buf + 8192, 12); - helper_float_30_recursive(buf + 12288, 12); - helper_float_30_recursive(buf + 16384, 12); - helper_float_30_recursive(buf + 20480, 12); - helper_float_30_recursive(buf + 24576, 12); - helper_float_30_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - 
"vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 18) { - helper_float_30_recursive(buf + 0, 15); - helper_float_30_recursive(buf + 32768, 15); - helper_float_30_recursive(buf + 65536, 15); - helper_float_30_recursive(buf + 98304, 15); - helper_float_30_recursive(buf + 131072, 15); - helper_float_30_recursive(buf + 163840, 15); - helper_float_30_recursive(buf + 196608, 15); - helper_float_30_recursive(buf + 229376, 15); - for (int j = 0; j < 262144; j += 262144) { - for (int k = 0; k < 32768; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps 
%%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 21) { - helper_float_30_recursive(buf + 0, 18); - helper_float_30_recursive(buf + 262144, 18); - helper_float_30_recursive(buf + 524288, 18); - helper_float_30_recursive(buf + 786432, 18); - helper_float_30_recursive(buf + 1048576, 18); - helper_float_30_recursive(buf + 1310720, 18); - helper_float_30_recursive(buf + 1572864, 18); - helper_float_30_recursive(buf + 1835008, 18); - for (int j = 0; j < 2097152; j += 2097152) { - for (int k = 0; k < 262144; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - 
"vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 24) { - helper_float_30_recursive(buf + 0, 21); - helper_float_30_recursive(buf + 2097152, 21); - helper_float_30_recursive(buf + 4194304, 21); - helper_float_30_recursive(buf + 6291456, 21); - helper_float_30_recursive(buf + 8388608, 21); - helper_float_30_recursive(buf + 10485760, 21); - helper_float_30_recursive(buf + 12582912, 21); - helper_float_30_recursive(buf + 14680064, 21); - for (int j = 0; j < 16777216; j += 16777216) { - for (int k = 0; k < 2097152; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" 
- "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 27) { - helper_float_30_recursive(buf + 0, 24); - helper_float_30_recursive(buf + 16777216, 24); - helper_float_30_recursive(buf + 33554432, 24); - helper_float_30_recursive(buf + 50331648, 24); - helper_float_30_recursive(buf + 67108864, 24); - helper_float_30_recursive(buf + 83886080, 24); - helper_float_30_recursive(buf + 
100663296, 24); - helper_float_30_recursive(buf + 117440512, 24); - for (int j = 0; j < 134217728; j += 134217728) { - for (int k = 0; k < 16777216; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 30) { - helper_float_30_recursive(buf + 0, 27); - 
helper_float_30_recursive(buf + 134217728, 27); - helper_float_30_recursive(buf + 268435456, 27); - helper_float_30_recursive(buf + 402653184, 27); - helper_float_30_recursive(buf + 536870912, 27); - helper_float_30_recursive(buf + 671088640, 27); - helper_float_30_recursive(buf + 805306368, 27); - helper_float_30_recursive(buf + 939524096, 27); - for (int j = 0; j < 1073741824; j += 1073741824) { - for (int k = 0; k < 134217728; k += 8) { - __asm__ volatile ( - "vmovups (%0), %%ymm0\n" - "vmovups (%1), %%ymm1\n" - "vmovups (%2), %%ymm2\n" - "vmovups (%3), %%ymm3\n" - "vmovups (%4), %%ymm4\n" - "vmovups (%5), %%ymm5\n" - "vmovups (%6), %%ymm6\n" - "vmovups (%7), %%ymm7\n" - "vaddps %%ymm1, %%ymm0, %%ymm8\n" - "vsubps %%ymm1, %%ymm0, %%ymm9\n" - "vaddps %%ymm3, %%ymm2, %%ymm10\n" - "vsubps %%ymm3, %%ymm2, %%ymm11\n" - "vaddps %%ymm5, %%ymm4, %%ymm12\n" - "vsubps %%ymm5, %%ymm4, %%ymm13\n" - "vaddps %%ymm7, %%ymm6, %%ymm14\n" - "vsubps %%ymm7, %%ymm6, %%ymm15\n" - "vaddps %%ymm10, %%ymm8, %%ymm0\n" - "vsubps %%ymm10, %%ymm8, %%ymm2\n" - "vaddps %%ymm11, %%ymm9, %%ymm1\n" - "vsubps %%ymm11, %%ymm9, %%ymm3\n" - "vaddps %%ymm14, %%ymm12, %%ymm4\n" - "vsubps %%ymm14, %%ymm12, %%ymm6\n" - "vaddps %%ymm15, %%ymm13, %%ymm5\n" - "vsubps %%ymm15, %%ymm13, %%ymm7\n" - "vaddps %%ymm4, %%ymm0, %%ymm8\n" - "vsubps %%ymm4, %%ymm0, %%ymm12\n" - "vaddps %%ymm5, %%ymm1, %%ymm9\n" - "vsubps %%ymm5, %%ymm1, %%ymm13\n" - "vaddps %%ymm6, %%ymm2, %%ymm10\n" - "vsubps %%ymm6, %%ymm2, %%ymm14\n" - "vaddps %%ymm7, %%ymm3, %%ymm11\n" - "vsubps %%ymm7, %%ymm3, %%ymm15\n" - "vmovups %%ymm8, (%0)\n" - "vmovups %%ymm9, (%1)\n" - "vmovups %%ymm10, (%2)\n" - "vmovups %%ymm11, (%3)\n" - "vmovups %%ymm12, (%4)\n" - "vmovups %%ymm13, (%5)\n" - "vmovups %%ymm14, (%6)\n" - "vmovups %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), 
"r"(buf + j + k + 939524096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_float_30(float *buf); -inline void helper_float_30(float *buf) { - helper_float_30_recursive(buf, 30); -} -inline int fht_float(float *buf, int log_n) { - if (log_n == 0) { - return 0; - } - if (log_n == 1) { - helper_float_1(buf); - return 0; - } - if (log_n == 2) { - helper_float_2(buf); - return 0; - } - if (log_n == 3) { - helper_float_3(buf); - return 0; - } - if (log_n == 4) { - helper_float_4(buf); - return 0; - } - if (log_n == 5) { - helper_float_5(buf); - return 0; - } - if (log_n == 6) { - helper_float_6(buf); - return 0; - } - if (log_n == 7) { - helper_float_7(buf); - return 0; - } - if (log_n == 8) { - helper_float_8(buf); - return 0; - } - if (log_n == 9) { - helper_float_9(buf); - return 0; - } - if (log_n == 10) { - helper_float_10(buf); - return 0; - } - if (log_n == 11) { - helper_float_11(buf); - return 0; - } - if (log_n == 12) { - helper_float_12(buf); - return 0; - } - if (log_n == 13) { - helper_float_13(buf); - return 0; - } - if (log_n == 14) { - helper_float_14(buf); - return 0; - } - if (log_n == 15) { - helper_float_15(buf); - return 0; - } - if (log_n == 16) { - helper_float_16(buf); - return 0; - } - if (log_n == 17) { - helper_float_17(buf); - return 0; - } - if (log_n == 18) { - helper_float_18(buf); - return 0; - } - if (log_n == 19) { - helper_float_19(buf); - return 0; - } - if (log_n == 20) { - helper_float_20(buf); - return 0; - } - if (log_n == 21) { - helper_float_21(buf); - return 0; - } - if (log_n == 22) { - helper_float_22(buf); - return 0; - } - if (log_n == 23) { - helper_float_23(buf); - return 0; - } - if (log_n == 24) { - helper_float_24(buf); - return 0; - } - if (log_n == 25) { - helper_float_25(buf); - return 0; - } - if (log_n == 26) { - helper_float_26(buf); - return 0; - } - 
if (log_n == 27) { - helper_float_27(buf); - return 0; - } - if (log_n == 28) { - helper_float_28(buf); - return 0; - } - if (log_n == 29) { - helper_float_29(buf); - return 0; - } - if (log_n == 30) { - helper_float_30(buf); - return 0; - } - return 1; -} -inline void helper_double_1(double *buf); -inline void helper_double_1(double *buf) { - for (int j = 0; j < 2; j += 2) { - for (int k = 0; k < 1; ++k) { - double u = buf[j + k]; - double v = buf[j + k + 1]; - buf[j + k] = u + v; - buf[j + k + 1] = u - v; - } - } -} -inline void helper_double_2(double *buf); -inline void helper_double_2(double *buf) { - for (int j = 0; j < 4; j += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vmovupd %%ymm0, (%0)\n" - :: "r"(buf + j) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } -} -inline void helper_double_3(double *buf); -inline void helper_double_3(double *buf) { - for (int j = 0; j < 8; j += 8) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd 
%%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } -} -inline void helper_double_4_recursive(double *buf, int depth); -inline void helper_double_4_recursive(double *buf, int depth) { - if (depth == 4) { - for (int j = 0; j < 16; j += 16) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, 
%%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_4(double *buf); -inline void helper_double_4(double *buf) { - helper_double_4_recursive(buf, 4); -} -inline void helper_double_5(double *buf); -inline void helper_double_5(double *buf) { - for (int j = 0; j < 32; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, 
%%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, 
%%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd 
%%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } -} -inline void helper_double_6(double *buf); -inline void helper_double_6(double *buf) { - for (int j = 0; j < 64; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, 
%%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, 
%%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 64; j += 64) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } -} -inline void helper_double_7(double *buf); -inline void helper_double_7(double *buf) { - for (int j = 0; j < 128; j += 32) { - 
for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, 
%%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd 
%%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 128; j += 128) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } -} -inline void helper_double_8(double *buf); -inline void helper_double_8(double *buf) { - for (int j = 0; j < 256; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), 
%%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, 
%%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, 
%%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 256; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - 
"vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } -} -inline void helper_double_9(double *buf); -inline void helper_double_9(double *buf) { - for (int j = 0; j < 512; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, 
%%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, 
%%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 512; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, 
%%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 512; j += 512) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } -} -inline void helper_double_10(double *buf); -inline void helper_double_10(double *buf) { - for (int j = 0; j < 1024; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( 
- "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, 
%%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, 
%%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 1024; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, 
%%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 1024; j += 1024) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } -} -inline void helper_double_11(double *buf); -inline void helper_double_11(double *buf) { - for (int j = 0; j < 2048; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" 
- "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - 
"vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, 
(%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), 
"r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", 
"%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } -} -inline void helper_double_12_recursive(double *buf, int depth); -inline void helper_double_12_recursive(double *buf, int depth) { - if (depth == 11) { - for (int j = 0; j < 2048; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, 
%%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd 
%%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - 
"vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, 
%%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 12) { - helper_double_12_recursive(buf + 0, 11); - helper_double_12_recursive(buf + 2048, 11); - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 2048; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_12(double *buf); -inline void helper_double_12(double *buf) { - helper_double_12_recursive(buf, 12); -} -inline void helper_double_13_recursive(double *buf, int depth); -inline void helper_double_13_recursive(double *buf, int depth) { - if (depth == 11) { - for (int j = 0; j < 2048; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - 
"vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 
$49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - 
"vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 
0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", 
"%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 13) { - helper_double_13_recursive(buf + 0, 11); - helper_double_13_recursive(buf + 2048, 11); - helper_double_13_recursive(buf + 4096, 11); - helper_double_13_recursive(buf + 6144, 11); - for (int j = 0; j < 8192; j += 8192) { - for (int k = 0; k < 2048; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_13(double *buf); -inline void helper_double_13(double *buf) { - helper_double_13_recursive(buf, 13); -} -inline void helper_double_14_recursive(double *buf, int depth); -inline void helper_double_14_recursive(double *buf, int depth) { - if (depth == 12) { - for (int j = 0; j < 4096; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, 
%%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, 
%%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, 
(%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), 
"r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", 
"%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 2048; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 14) { - helper_double_14_recursive(buf + 0, 12); - helper_double_14_recursive(buf + 4096, 12); - helper_double_14_recursive(buf + 8192, 12); - helper_double_14_recursive(buf + 12288, 12); - for (int j = 0; j < 16384; j += 16384) { - for (int k = 0; k < 4096; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_14(double *buf); -inline void helper_double_14(double *buf) { - helper_double_14_recursive(buf, 14); -} -inline void helper_double_15_recursive(double *buf, int depth); -inline void helper_double_15_recursive(double *buf, int depth) { - if (depth == 12) { - for (int j = 0; j < 4096; j += 32) { - for 
(int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, 
%%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, 
%%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd 
%%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - 
"vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 2048; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_double_15_recursive(buf + 0, 12); - helper_double_15_recursive(buf + 4096, 12); - helper_double_15_recursive(buf + 8192, 12); - helper_double_15_recursive(buf + 12288, 12); - helper_double_15_recursive(buf + 16384, 12); - helper_double_15_recursive(buf + 20480, 12); - helper_double_15_recursive(buf + 24576, 12); - helper_double_15_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, 
%%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_15(double *buf); -inline void helper_double_15(double *buf) { - helper_double_15_recursive(buf, 15); -} -inline void helper_double_16_recursive(double *buf, int depth); -inline void helper_double_16_recursive(double *buf, int depth) { - if (depth == 11) { - for (int j = 0; j < 2048; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, 
%%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, 
%%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd 
%%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : 
"%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 
14) { - helper_double_16_recursive(buf + 0, 11); - helper_double_16_recursive(buf + 2048, 11); - helper_double_16_recursive(buf + 4096, 11); - helper_double_16_recursive(buf + 6144, 11); - helper_double_16_recursive(buf + 8192, 11); - helper_double_16_recursive(buf + 10240, 11); - helper_double_16_recursive(buf + 12288, 11); - helper_double_16_recursive(buf + 14336, 11); - for (int j = 0; j < 16384; j += 16384) { - for (int k = 0; k < 2048; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 
14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 16) { - helper_double_16_recursive(buf + 0, 14); - helper_double_16_recursive(buf + 16384, 14); - helper_double_16_recursive(buf + 32768, 14); - helper_double_16_recursive(buf + 49152, 14); - for (int j = 0; j < 65536; j += 65536) { - for (int k = 0; k < 16384; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_16(double *buf); -inline void helper_double_16(double *buf) { - helper_double_16_recursive(buf, 16); -} -inline void helper_double_17_recursive(double *buf, int depth); -inline void helper_double_17_recursive(double *buf, int depth) { - if (depth == 11) { - for (int j = 0; j < 2048; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, 
%%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, 
%%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - 
"vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), 
"r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", 
"%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 14) { - helper_double_17_recursive(buf + 0, 11); - helper_double_17_recursive(buf + 2048, 11); - helper_double_17_recursive(buf + 4096, 11); - helper_double_17_recursive(buf + 6144, 11); - helper_double_17_recursive(buf + 8192, 11); - helper_double_17_recursive(buf + 10240, 11); - helper_double_17_recursive(buf + 12288, 11); - helper_double_17_recursive(buf + 14336, 11); - for (int j = 0; j < 16384; j += 16384) { - for (int k = 0; k < 2048; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), 
"r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 17) { - helper_double_17_recursive(buf + 0, 14); - helper_double_17_recursive(buf + 16384, 14); - helper_double_17_recursive(buf + 32768, 14); - helper_double_17_recursive(buf + 49152, 14); - helper_double_17_recursive(buf + 65536, 14); - helper_double_17_recursive(buf + 81920, 14); - helper_double_17_recursive(buf + 98304, 14); - helper_double_17_recursive(buf + 114688, 14); - for (int j = 0; j < 131072; j += 131072) { - for (int k = 0; k < 16384; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" 
- "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_17(double *buf); -inline void helper_double_17(double *buf) { - helper_double_17_recursive(buf, 17); -} -inline void helper_double_18_recursive(double *buf, int depth); -inline void helper_double_18_recursive(double *buf, int depth) { - if (depth == 12) { - for (int j = 0; j < 4096; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - 
"vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, 
%%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), 
%%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, 
%%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 2048; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_double_18_recursive(buf + 0, 12); - helper_double_18_recursive(buf + 4096, 12); - helper_double_18_recursive(buf + 8192, 12); - 
helper_double_18_recursive(buf + 12288, 12); - helper_double_18_recursive(buf + 16384, 12); - helper_double_18_recursive(buf + 20480, 12); - helper_double_18_recursive(buf + 24576, 12); - helper_double_18_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", 
"%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 18) { - helper_double_18_recursive(buf + 0, 15); - helper_double_18_recursive(buf + 32768, 15); - helper_double_18_recursive(buf + 65536, 15); - helper_double_18_recursive(buf + 98304, 15); - helper_double_18_recursive(buf + 131072, 15); - helper_double_18_recursive(buf + 163840, 15); - helper_double_18_recursive(buf + 196608, 15); - helper_double_18_recursive(buf + 229376, 15); - for (int j = 0; j < 262144; j += 262144) { - for (int k = 0; k < 32768; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + 
j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_18(double *buf); -inline void helper_double_18(double *buf) { - helper_double_18_recursive(buf, 18); -} -inline void helper_double_19_recursive(double *buf, int depth); -inline void helper_double_19_recursive(double *buf, int depth) { - if (depth == 11) { - for (int j = 0; j < 2048; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, 
%%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - 
"vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" 
- "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, 
%%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 14) { - helper_double_19_recursive(buf + 0, 11); - helper_double_19_recursive(buf + 2048, 11); - helper_double_19_recursive(buf + 4096, 11); - helper_double_19_recursive(buf + 6144, 11); - helper_double_19_recursive(buf + 8192, 11); - helper_double_19_recursive(buf + 10240, 11); - helper_double_19_recursive(buf + 12288, 11); - helper_double_19_recursive(buf + 14336, 11); - for (int j = 0; j < 16384; j += 16384) { - for (int k = 0; k < 2048; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" 
- "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 17) { - helper_double_19_recursive(buf + 0, 14); - helper_double_19_recursive(buf + 16384, 14); - helper_double_19_recursive(buf + 32768, 14); - helper_double_19_recursive(buf + 49152, 14); - helper_double_19_recursive(buf + 65536, 14); - helper_double_19_recursive(buf + 81920, 14); - helper_double_19_recursive(buf + 98304, 14); - helper_double_19_recursive(buf + 114688, 14); - for (int j = 0; j < 131072; j += 131072) { - for (int k = 0; k < 16384; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - 
"vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 19) { - helper_double_19_recursive(buf + 0, 17); - helper_double_19_recursive(buf + 131072, 17); - helper_double_19_recursive(buf + 262144, 17); - helper_double_19_recursive(buf + 393216, 17); - for (int j = 0; j < 524288; j += 524288) { - for (int k = 0; k < 131072; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" 
- "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_19(double *buf); -inline void helper_double_19(double *buf) { - helper_double_19_recursive(buf, 19); -} -inline void helper_double_20_recursive(double *buf, int depth); -inline void helper_double_20_recursive(double *buf, int depth) { - if (depth == 9) { - for (int j = 0; j < 512; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, 
%%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, 
%%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 512; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd 
(%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 512; j += 512) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", 
"%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 12) { - helper_double_20_recursive(buf + 0, 9); - helper_double_20_recursive(buf + 512, 9); - helper_double_20_recursive(buf + 1024, 9); - helper_double_20_recursive(buf + 1536, 9); - helper_double_20_recursive(buf + 2048, 9); - helper_double_20_recursive(buf + 2560, 9); - helper_double_20_recursive(buf + 3072, 9); - helper_double_20_recursive(buf + 3584, 9); - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf 
+ j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_double_20_recursive(buf + 0, 12); - helper_double_20_recursive(buf + 4096, 12); - helper_double_20_recursive(buf + 8192, 12); - helper_double_20_recursive(buf + 12288, 12); - helper_double_20_recursive(buf + 16384, 12); - helper_double_20_recursive(buf + 20480, 12); - helper_double_20_recursive(buf + 24576, 12); - helper_double_20_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd 
%%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 18) { - helper_double_20_recursive(buf + 0, 15); - helper_double_20_recursive(buf + 32768, 15); - helper_double_20_recursive(buf + 65536, 15); - helper_double_20_recursive(buf + 98304, 15); - helper_double_20_recursive(buf + 131072, 15); - helper_double_20_recursive(buf + 163840, 15); - helper_double_20_recursive(buf + 196608, 15); - helper_double_20_recursive(buf + 229376, 15); - for (int j = 0; j < 262144; j += 262144) { - for (int k = 0; k < 32768; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, 
%%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 20) { - helper_double_20_recursive(buf + 0, 18); - helper_double_20_recursive(buf + 262144, 18); - helper_double_20_recursive(buf + 524288, 18); - helper_double_20_recursive(buf + 786432, 18); - for (int j = 0; j < 1048576; j += 1048576) { - for (int k = 0; k < 262144; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_20(double *buf); -inline void helper_double_20(double *buf) { - helper_double_20_recursive(buf, 
20); -} -inline void helper_double_21_recursive(double *buf, int depth); -inline void helper_double_21_recursive(double *buf, int depth) { - if (depth == 7) { - for (int j = 0; j < 128; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, 
%%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd 
%%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 128; j += 128) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 10) { - helper_double_21_recursive(buf + 
0, 7); - helper_double_21_recursive(buf + 128, 7); - helper_double_21_recursive(buf + 256, 7); - helper_double_21_recursive(buf + 384, 7); - helper_double_21_recursive(buf + 512, 7); - helper_double_21_recursive(buf + 640, 7); - helper_double_21_recursive(buf + 768, 7); - helper_double_21_recursive(buf + 896, 7); - for (int j = 0; j < 1024; j += 1024) { - for (int k = 0; k < 128; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", 
"%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 13) { - helper_double_21_recursive(buf + 0, 10); - helper_double_21_recursive(buf + 1024, 10); - helper_double_21_recursive(buf + 2048, 10); - helper_double_21_recursive(buf + 3072, 10); - helper_double_21_recursive(buf + 4096, 10); - helper_double_21_recursive(buf + 5120, 10); - helper_double_21_recursive(buf + 6144, 10); - helper_double_21_recursive(buf + 7168, 10); - for (int j = 0; j < 8192; j += 8192) { - for (int k = 0; k < 1024; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j 
+ k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 16) { - helper_double_21_recursive(buf + 0, 13); - helper_double_21_recursive(buf + 8192, 13); - helper_double_21_recursive(buf + 16384, 13); - helper_double_21_recursive(buf + 24576, 13); - helper_double_21_recursive(buf + 32768, 13); - helper_double_21_recursive(buf + 40960, 13); - helper_double_21_recursive(buf + 49152, 13); - helper_double_21_recursive(buf + 57344, 13); - for (int j = 0; j < 65536; j += 65536) { - for (int k = 0; k < 8192; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, 
(%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 19) { - helper_double_21_recursive(buf + 0, 16); - helper_double_21_recursive(buf + 65536, 16); - helper_double_21_recursive(buf + 131072, 16); - helper_double_21_recursive(buf + 196608, 16); - helper_double_21_recursive(buf + 262144, 16); - helper_double_21_recursive(buf + 327680, 16); - helper_double_21_recursive(buf + 393216, 16); - helper_double_21_recursive(buf + 458752, 16); - for (int j = 0; j < 524288; j += 524288) { - for (int k = 0; k < 65536; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, 
%%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 21) { - helper_double_21_recursive(buf + 0, 19); - helper_double_21_recursive(buf + 524288, 19); - helper_double_21_recursive(buf + 1048576, 19); - helper_double_21_recursive(buf + 1572864, 19); - for (int j = 0; j < 2097152; j += 2097152) { - for (int k = 0; k < 524288; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_21(double *buf); -inline void helper_double_21(double *buf) { 
- helper_double_21_recursive(buf, 21); -} -inline void helper_double_22_recursive(double *buf, int depth); -inline void helper_double_22_recursive(double *buf, int depth) { - if (depth == 11) { - for (int j = 0; j < 2048; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, 
%%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd 
%%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - 
"vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, 
%%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 14) { - helper_double_22_recursive(buf + 0, 11); - helper_double_22_recursive(buf + 2048, 11); - helper_double_22_recursive(buf + 4096, 11); - helper_double_22_recursive(buf + 6144, 11); - helper_double_22_recursive(buf + 8192, 11); - helper_double_22_recursive(buf + 10240, 11); - helper_double_22_recursive(buf + 12288, 11); - helper_double_22_recursive(buf + 14336, 11); - for (int j = 0; j < 16384; j += 16384) { - for (int k = 0; k < 2048; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - 
"vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 17) { - helper_double_22_recursive(buf + 0, 14); - helper_double_22_recursive(buf + 16384, 14); - helper_double_22_recursive(buf + 32768, 14); - helper_double_22_recursive(buf + 49152, 14); - helper_double_22_recursive(buf + 65536, 14); - helper_double_22_recursive(buf + 81920, 14); - helper_double_22_recursive(buf + 98304, 14); - helper_double_22_recursive(buf + 114688, 14); - for (int j = 0; j < 131072; j += 131072) { - for (int k = 0; k < 16384; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd 
%%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 20) { - helper_double_22_recursive(buf + 0, 17); - helper_double_22_recursive(buf + 131072, 17); - helper_double_22_recursive(buf + 262144, 17); - helper_double_22_recursive(buf + 393216, 17); - helper_double_22_recursive(buf + 524288, 17); - helper_double_22_recursive(buf + 655360, 17); - helper_double_22_recursive(buf + 786432, 17); - helper_double_22_recursive(buf + 917504, 17); - for (int j = 0; j < 1048576; j += 1048576) { - for (int k = 0; k < 131072; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" 
- "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 22) { - helper_double_22_recursive(buf + 0, 20); - helper_double_22_recursive(buf + 1048576, 20); - helper_double_22_recursive(buf + 2097152, 20); - helper_double_22_recursive(buf + 3145728, 20); - for (int j = 0; j < 4194304; j += 4194304) { - for (int k = 0; k < 1048576; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, 
%%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_22(double *buf); -inline void helper_double_22(double *buf) { - helper_double_22_recursive(buf, 22); -} -inline void helper_double_23_recursive(double *buf, int depth); -inline void helper_double_23_recursive(double *buf, int depth) { - if (depth == 11) { - for (int j = 0; j < 2048; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - 
"vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, 
%%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - 
"vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, 
%%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 14) { - helper_double_23_recursive(buf + 0, 11); - helper_double_23_recursive(buf + 2048, 11); - helper_double_23_recursive(buf + 4096, 11); - helper_double_23_recursive(buf + 6144, 11); - helper_double_23_recursive(buf + 8192, 11); - helper_double_23_recursive(buf + 10240, 11); - helper_double_23_recursive(buf + 12288, 11); - helper_double_23_recursive(buf + 14336, 11); - for (int j = 0; j < 16384; j += 16384) { - for (int k = 0; k < 2048; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - 
"vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 17) { - helper_double_23_recursive(buf + 0, 14); - helper_double_23_recursive(buf + 16384, 14); - helper_double_23_recursive(buf + 32768, 14); - helper_double_23_recursive(buf + 49152, 14); - helper_double_23_recursive(buf + 65536, 14); - helper_double_23_recursive(buf + 81920, 14); - helper_double_23_recursive(buf + 98304, 14); - helper_double_23_recursive(buf + 114688, 14); - for (int j = 0; j < 131072; j += 131072) { - for (int k = 0; k < 16384; k += 4) { - __asm__ volatile ( 
- "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 20) { - helper_double_23_recursive(buf + 0, 17); - helper_double_23_recursive(buf + 131072, 17); - helper_double_23_recursive(buf + 262144, 17); - helper_double_23_recursive(buf + 393216, 17); - helper_double_23_recursive(buf + 524288, 17); - 
helper_double_23_recursive(buf + 655360, 17); - helper_double_23_recursive(buf + 786432, 17); - helper_double_23_recursive(buf + 917504, 17); - for (int j = 0; j < 1048576; j += 1048576) { - for (int k = 0; k < 131072; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 23) { 
- helper_double_23_recursive(buf + 0, 20); - helper_double_23_recursive(buf + 1048576, 20); - helper_double_23_recursive(buf + 2097152, 20); - helper_double_23_recursive(buf + 3145728, 20); - helper_double_23_recursive(buf + 4194304, 20); - helper_double_23_recursive(buf + 5242880, 20); - helper_double_23_recursive(buf + 6291456, 20); - helper_double_23_recursive(buf + 7340032, 20); - for (int j = 0; j < 8388608; j += 8388608) { - for (int k = 0; k < 1048576; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j 
+ k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_23(double *buf); -inline void helper_double_23(double *buf) { - helper_double_23_recursive(buf, 23); -} -inline void helper_double_24_recursive(double *buf, int depth); -inline void helper_double_24_recursive(double *buf, int depth) { - if (depth == 10) { - for (int j = 0; j < 1024; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - 
"vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd 
%%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 1024; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - 
"vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 1024; j += 1024) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", 
"%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 13) { - helper_double_24_recursive(buf + 0, 10); - helper_double_24_recursive(buf + 1024, 10); - helper_double_24_recursive(buf + 2048, 10); - helper_double_24_recursive(buf + 3072, 10); - helper_double_24_recursive(buf + 4096, 10); - helper_double_24_recursive(buf + 5120, 10); - helper_double_24_recursive(buf + 6144, 10); - helper_double_24_recursive(buf + 7168, 10); - for (int j = 0; j < 8192; j += 8192) { - for (int k = 0; k < 1024; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 
2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 16) { - helper_double_24_recursive(buf + 0, 13); - helper_double_24_recursive(buf + 8192, 13); - helper_double_24_recursive(buf + 16384, 13); - helper_double_24_recursive(buf + 24576, 13); - helper_double_24_recursive(buf + 32768, 13); - helper_double_24_recursive(buf + 40960, 13); - helper_double_24_recursive(buf + 49152, 13); - helper_double_24_recursive(buf + 57344, 13); - for (int j = 0; j < 65536; j += 65536) { - for (int k = 0; k < 8192; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" 
- "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 19) { - helper_double_24_recursive(buf + 0, 16); - helper_double_24_recursive(buf + 65536, 16); - helper_double_24_recursive(buf + 131072, 16); - helper_double_24_recursive(buf + 196608, 16); - helper_double_24_recursive(buf + 262144, 16); - helper_double_24_recursive(buf + 327680, 16); - helper_double_24_recursive(buf + 393216, 16); - helper_double_24_recursive(buf + 458752, 16); - for (int j = 0; j < 524288; j += 524288) { - for (int k = 0; k < 65536; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - 
"vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 22) { - helper_double_24_recursive(buf + 0, 19); - helper_double_24_recursive(buf + 524288, 19); - helper_double_24_recursive(buf + 1048576, 19); - helper_double_24_recursive(buf + 1572864, 19); - helper_double_24_recursive(buf + 2097152, 19); - helper_double_24_recursive(buf + 2621440, 19); - helper_double_24_recursive(buf + 3145728, 19); - helper_double_24_recursive(buf + 3670016, 19); - for (int j = 0; j < 4194304; j += 4194304) { - for (int k = 0; k < 524288; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, 
%%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 24) { - helper_double_24_recursive(buf + 0, 22); - helper_double_24_recursive(buf + 4194304, 22); - helper_double_24_recursive(buf + 8388608, 22); - helper_double_24_recursive(buf + 12582912, 22); - for (int j = 0; j < 16777216; j += 16777216) { - for (int k = 0; k < 4194304; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912) : "%ymm0", "%ymm1", "%ymm2", 
"%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_24(double *buf); -inline void helper_double_24(double *buf) { - helper_double_24_recursive(buf, 24); -} -inline void helper_double_25_recursive(double *buf, int depth); -inline void helper_double_25_recursive(double *buf, int depth) { - if (depth == 8) { - for (int j = 0; j < 256; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - 
"vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, 
%%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 256; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd 
%%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 11) { - helper_double_25_recursive(buf + 0, 8); - helper_double_25_recursive(buf + 256, 8); - helper_double_25_recursive(buf + 512, 8); - helper_double_25_recursive(buf + 768, 8); - helper_double_25_recursive(buf + 1024, 8); - helper_double_25_recursive(buf + 1280, 8); - helper_double_25_recursive(buf + 1536, 8); - helper_double_25_recursive(buf + 1792, 8); - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd 
%%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 14) { - helper_double_25_recursive(buf + 0, 11); - helper_double_25_recursive(buf + 2048, 11); - helper_double_25_recursive(buf + 4096, 11); - helper_double_25_recursive(buf + 6144, 11); - helper_double_25_recursive(buf + 8192, 11); - helper_double_25_recursive(buf + 10240, 11); - helper_double_25_recursive(buf + 12288, 11); - helper_double_25_recursive(buf + 14336, 11); - for (int j = 0; j < 16384; j += 16384) { - for (int k = 0; k < 2048; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - 
"vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 17) { - helper_double_25_recursive(buf + 0, 14); - helper_double_25_recursive(buf + 16384, 14); - helper_double_25_recursive(buf + 32768, 14); - helper_double_25_recursive(buf + 49152, 14); - helper_double_25_recursive(buf + 65536, 14); - helper_double_25_recursive(buf + 81920, 14); - helper_double_25_recursive(buf + 98304, 14); - helper_double_25_recursive(buf + 114688, 14); - for (int j = 0; j < 131072; j += 131072) { - for 
(int k = 0; k < 16384; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 20) { - helper_double_25_recursive(buf + 0, 17); - helper_double_25_recursive(buf + 131072, 17); - helper_double_25_recursive(buf + 262144, 17); - helper_double_25_recursive(buf + 393216, 17); - 
helper_double_25_recursive(buf + 524288, 17); - helper_double_25_recursive(buf + 655360, 17); - helper_double_25_recursive(buf + 786432, 17); - helper_double_25_recursive(buf + 917504, 17); - for (int j = 0; j < 1048576; j += 1048576) { - for (int k = 0; k < 131072; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" 
- ); - } - } - return; - } - if (depth == 23) { - helper_double_25_recursive(buf + 0, 20); - helper_double_25_recursive(buf + 1048576, 20); - helper_double_25_recursive(buf + 2097152, 20); - helper_double_25_recursive(buf + 3145728, 20); - helper_double_25_recursive(buf + 4194304, 20); - helper_double_25_recursive(buf + 5242880, 20); - helper_double_25_recursive(buf + 6291456, 20); - helper_double_25_recursive(buf + 7340032, 20); - for (int j = 0; j < 8388608; j += 8388608) { - for (int k = 0; k < 1048576; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 
4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 25) { - helper_double_25_recursive(buf + 0, 23); - helper_double_25_recursive(buf + 8388608, 23); - helper_double_25_recursive(buf + 16777216, 23); - helper_double_25_recursive(buf + 25165824, 23); - for (int j = 0; j < 33554432; j += 33554432) { - for (int k = 0; k < 8388608; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_25(double *buf); -inline void helper_double_25(double *buf) { - helper_double_25_recursive(buf, 25); -} -inline void helper_double_26_recursive(double *buf, int depth); -inline void helper_double_26_recursive(double *buf, int depth) { - if (depth == 11) { - for (int j = 0; j < 2048; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd 
(%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, 
%%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd 
%%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, 
(%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), 
"r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 14) { - helper_double_26_recursive(buf + 0, 11); - helper_double_26_recursive(buf + 2048, 11); - helper_double_26_recursive(buf + 4096, 11); - helper_double_26_recursive(buf + 6144, 11); - helper_double_26_recursive(buf + 8192, 11); - helper_double_26_recursive(buf + 10240, 11); - helper_double_26_recursive(buf + 12288, 11); - helper_double_26_recursive(buf + 14336, 11); - for (int j = 0; j < 16384; j += 16384) { - for (int k = 0; k < 2048; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, 
(%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 17) { - helper_double_26_recursive(buf + 0, 14); - helper_double_26_recursive(buf + 16384, 14); - helper_double_26_recursive(buf + 32768, 14); - helper_double_26_recursive(buf + 49152, 14); - helper_double_26_recursive(buf + 65536, 14); - helper_double_26_recursive(buf + 81920, 14); - helper_double_26_recursive(buf + 98304, 14); - helper_double_26_recursive(buf + 114688, 14); - for (int j = 0; j < 131072; j += 131072) { - for (int k = 0; k < 16384; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - 
"vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 20) { - helper_double_26_recursive(buf + 0, 17); - helper_double_26_recursive(buf + 131072, 17); - helper_double_26_recursive(buf + 262144, 17); - helper_double_26_recursive(buf + 393216, 17); - helper_double_26_recursive(buf + 524288, 17); - helper_double_26_recursive(buf + 655360, 17); - helper_double_26_recursive(buf + 786432, 17); - helper_double_26_recursive(buf + 917504, 17); - for (int j = 0; j < 1048576; j += 1048576) { - for (int k = 0; k < 131072; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, 
%%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 23) { - helper_double_26_recursive(buf + 0, 20); - helper_double_26_recursive(buf + 1048576, 20); - helper_double_26_recursive(buf + 2097152, 20); - helper_double_26_recursive(buf + 3145728, 20); - helper_double_26_recursive(buf + 4194304, 20); - helper_double_26_recursive(buf + 5242880, 20); - helper_double_26_recursive(buf + 6291456, 20); - helper_double_26_recursive(buf + 7340032, 20); - for (int j = 0; j < 8388608; j += 8388608) { - for (int k = 0; k < 1048576; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - 
"vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 26) { - helper_double_26_recursive(buf + 0, 23); - helper_double_26_recursive(buf + 8388608, 23); - helper_double_26_recursive(buf + 16777216, 23); - helper_double_26_recursive(buf + 25165824, 23); - helper_double_26_recursive(buf + 33554432, 23); - helper_double_26_recursive(buf + 41943040, 23); - helper_double_26_recursive(buf + 50331648, 23); - helper_double_26_recursive(buf + 58720256, 23); - for (int j = 0; j < 67108864; j += 67108864) { - for (int k = 0; k < 8388608; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" 
- "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_26(double *buf); -inline void helper_double_26(double *buf) { - helper_double_26_recursive(buf, 26); -} -inline void helper_double_27_recursive(double *buf, int depth); -inline void helper_double_27_recursive(double *buf, int depth) { - if (depth == 9) { - for (int j = 0; j < 512; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd 
(%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, 
%%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd 
%%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 512; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, 
(%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 512; j += 512) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 12) { - helper_double_27_recursive(buf + 0, 9); - helper_double_27_recursive(buf + 512, 9); - helper_double_27_recursive(buf + 1024, 9); - helper_double_27_recursive(buf + 1536, 9); - helper_double_27_recursive(buf + 2048, 9); - helper_double_27_recursive(buf + 2560, 9); - helper_double_27_recursive(buf + 3072, 9); - helper_double_27_recursive(buf + 3584, 9); - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, 
%%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_double_27_recursive(buf + 0, 12); - helper_double_27_recursive(buf + 4096, 12); - helper_double_27_recursive(buf + 8192, 12); - helper_double_27_recursive(buf + 12288, 12); - helper_double_27_recursive(buf + 16384, 12); - helper_double_27_recursive(buf + 20480, 12); - helper_double_27_recursive(buf + 24576, 12); - helper_double_27_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, 
%%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 18) { - helper_double_27_recursive(buf + 0, 15); - helper_double_27_recursive(buf + 32768, 15); - helper_double_27_recursive(buf + 65536, 15); - helper_double_27_recursive(buf + 98304, 15); - helper_double_27_recursive(buf + 131072, 15); - helper_double_27_recursive(buf + 163840, 15); - helper_double_27_recursive(buf + 196608, 15); - helper_double_27_recursive(buf + 229376, 15); - for (int j = 0; j < 262144; j += 262144) { - for (int k = 0; k < 32768; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - 
"vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 21) { - helper_double_27_recursive(buf + 0, 18); - helper_double_27_recursive(buf + 262144, 18); - helper_double_27_recursive(buf + 524288, 18); - helper_double_27_recursive(buf + 786432, 18); - helper_double_27_recursive(buf + 1048576, 18); - helper_double_27_recursive(buf + 1310720, 18); - helper_double_27_recursive(buf + 1572864, 18); - helper_double_27_recursive(buf + 1835008, 18); - for (int j = 0; j < 2097152; j 
+= 2097152) { - for (int k = 0; k < 262144; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 24) { - helper_double_27_recursive(buf + 0, 21); - helper_double_27_recursive(buf + 2097152, 21); - helper_double_27_recursive(buf + 4194304, 21); - 
helper_double_27_recursive(buf + 6291456, 21); - helper_double_27_recursive(buf + 8388608, 21); - helper_double_27_recursive(buf + 10485760, 21); - helper_double_27_recursive(buf + 12582912, 21); - helper_double_27_recursive(buf + 14680064, 21); - for (int j = 0; j < 16777216; j += 16777216) { - for (int k = 0; k < 2097152; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", 
"%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 27) { - helper_double_27_recursive(buf + 0, 24); - helper_double_27_recursive(buf + 16777216, 24); - helper_double_27_recursive(buf + 33554432, 24); - helper_double_27_recursive(buf + 50331648, 24); - helper_double_27_recursive(buf + 67108864, 24); - helper_double_27_recursive(buf + 83886080, 24); - helper_double_27_recursive(buf + 100663296, 24); - helper_double_27_recursive(buf + 117440512, 24); - for (int j = 0; j < 134217728; j += 134217728) { - for (int k = 0; k < 16777216; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 
16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_27(double *buf); -inline void helper_double_27(double *buf) { - helper_double_27_recursive(buf, 27); -} -inline void helper_double_28_recursive(double *buf, int depth); -inline void helper_double_28_recursive(double *buf, int depth) { - if (depth == 11) { - for (int j = 0; j < 2048; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd 
%%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, 
%%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd 
%%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, 
%%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 14) { - helper_double_28_recursive(buf + 0, 11); - helper_double_28_recursive(buf + 2048, 11); - helper_double_28_recursive(buf + 4096, 11); - helper_double_28_recursive(buf + 6144, 11); - helper_double_28_recursive(buf + 8192, 11); - helper_double_28_recursive(buf + 10240, 11); - helper_double_28_recursive(buf + 12288, 11); - helper_double_28_recursive(buf + 14336, 11); - for (int j = 0; j < 16384; j += 16384) { - for (int k = 0; k < 2048; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - 
"vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 17) { - helper_double_28_recursive(buf + 0, 14); - helper_double_28_recursive(buf + 16384, 14); - helper_double_28_recursive(buf + 32768, 14); - helper_double_28_recursive(buf + 49152, 14); - helper_double_28_recursive(buf + 65536, 14); - helper_double_28_recursive(buf + 81920, 14); - helper_double_28_recursive(buf + 98304, 14); - helper_double_28_recursive(buf + 114688, 14); - for (int j = 0; j < 131072; j += 131072) { - for (int k = 0; k < 16384; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - 
"vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 20) { - helper_double_28_recursive(buf + 0, 17); - helper_double_28_recursive(buf + 131072, 17); - helper_double_28_recursive(buf + 262144, 17); - helper_double_28_recursive(buf + 393216, 17); - helper_double_28_recursive(buf + 524288, 17); - helper_double_28_recursive(buf + 655360, 17); - helper_double_28_recursive(buf + 786432, 17); - helper_double_28_recursive(buf + 
917504, 17); - for (int j = 0; j < 1048576; j += 1048576) { - for (int k = 0; k < 131072; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 23) { - helper_double_28_recursive(buf + 0, 20); - helper_double_28_recursive(buf + 1048576, 20); - helper_double_28_recursive(buf + 
2097152, 20); - helper_double_28_recursive(buf + 3145728, 20); - helper_double_28_recursive(buf + 4194304, 20); - helper_double_28_recursive(buf + 5242880, 20); - helper_double_28_recursive(buf + 6291456, 20); - helper_double_28_recursive(buf + 7340032, 20); - for (int j = 0; j < 8388608; j += 8388608) { - for (int k = 0; k < 1048576; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", 
"%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 26) { - helper_double_28_recursive(buf + 0, 23); - helper_double_28_recursive(buf + 8388608, 23); - helper_double_28_recursive(buf + 16777216, 23); - helper_double_28_recursive(buf + 25165824, 23); - helper_double_28_recursive(buf + 33554432, 23); - helper_double_28_recursive(buf + 41943040, 23); - helper_double_28_recursive(buf + 50331648, 23); - helper_double_28_recursive(buf + 58720256, 23); - for (int j = 0; j < 67108864; j += 67108864) { - for (int k = 0; k < 8388608; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 
8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 28) { - helper_double_28_recursive(buf + 0, 26); - helper_double_28_recursive(buf + 67108864, 26); - helper_double_28_recursive(buf + 134217728, 26); - helper_double_28_recursive(buf + 201326592, 26); - for (int j = 0; j < 268435456; j += 268435456) { - for (int k = 0; k < 67108864; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vmovupd %%ymm0, (%0)\n" - "vmovupd %%ymm1, (%1)\n" - "vmovupd %%ymm2, (%2)\n" - "vmovupd %%ymm3, (%3)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 67108864), "r"(buf + j + k + 134217728), "r"(buf + j + k + 201326592) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_28(double *buf); -inline void helper_double_28(double *buf) { - helper_double_28_recursive(buf, 28); -} -inline void helper_double_29_recursive(double *buf, int depth); -inline void helper_double_29_recursive(double *buf, int depth) { - if (depth == 11) { - for (int j = 0; j < 2048; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), 
%%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd 
%%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, 
%%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd 
%%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 2048; j += 2048) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), 
"r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 14) { - helper_double_29_recursive(buf + 0, 11); - helper_double_29_recursive(buf + 2048, 11); - helper_double_29_recursive(buf + 4096, 11); - helper_double_29_recursive(buf + 6144, 11); - helper_double_29_recursive(buf + 8192, 11); - helper_double_29_recursive(buf + 10240, 11); - helper_double_29_recursive(buf + 12288, 11); - helper_double_29_recursive(buf + 14336, 11); - for (int j = 0; j < 16384; j += 16384) { - for (int k = 0; k < 2048; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd 
%%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 17) { - helper_double_29_recursive(buf + 0, 14); - helper_double_29_recursive(buf + 16384, 14); - helper_double_29_recursive(buf + 32768, 14); - helper_double_29_recursive(buf + 49152, 14); - helper_double_29_recursive(buf + 65536, 14); - helper_double_29_recursive(buf + 81920, 14); - helper_double_29_recursive(buf + 98304, 14); - helper_double_29_recursive(buf + 114688, 14); - for (int j = 0; j < 131072; j += 131072) { - for (int k = 0; k < 16384; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, 
%%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 20) { - helper_double_29_recursive(buf + 0, 17); - helper_double_29_recursive(buf + 131072, 17); - helper_double_29_recursive(buf + 262144, 17); - helper_double_29_recursive(buf + 393216, 17); - helper_double_29_recursive(buf + 524288, 17); - helper_double_29_recursive(buf + 655360, 17); - helper_double_29_recursive(buf + 786432, 17); - helper_double_29_recursive(buf + 917504, 17); - for (int j = 0; j < 1048576; j += 1048576) { - for (int k = 0; k < 131072; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - 
"vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 23) { - helper_double_29_recursive(buf + 0, 20); - helper_double_29_recursive(buf + 1048576, 20); - helper_double_29_recursive(buf + 2097152, 20); - helper_double_29_recursive(buf + 3145728, 20); - helper_double_29_recursive(buf + 4194304, 20); - helper_double_29_recursive(buf + 5242880, 20); - helper_double_29_recursive(buf + 6291456, 20); - helper_double_29_recursive(buf + 7340032, 20); - for (int j = 0; j < 8388608; j += 8388608) { - for (int k = 0; k < 1048576; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, 
%%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 26) { - helper_double_29_recursive(buf + 0, 23); - helper_double_29_recursive(buf + 8388608, 23); - helper_double_29_recursive(buf + 16777216, 23); - helper_double_29_recursive(buf + 25165824, 23); - helper_double_29_recursive(buf + 33554432, 23); - helper_double_29_recursive(buf + 41943040, 23); - helper_double_29_recursive(buf + 50331648, 23); - helper_double_29_recursive(buf + 58720256, 23); - for (int j = 0; j < 67108864; j += 67108864) { - for (int k = 0; k < 8388608; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd 
%%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 29) { - helper_double_29_recursive(buf + 0, 26); - helper_double_29_recursive(buf + 67108864, 26); - helper_double_29_recursive(buf + 134217728, 26); - helper_double_29_recursive(buf + 201326592, 26); - helper_double_29_recursive(buf + 268435456, 26); - helper_double_29_recursive(buf + 335544320, 26); - helper_double_29_recursive(buf + 402653184, 26); - helper_double_29_recursive(buf + 469762048, 26); - for (int j = 0; j < 536870912; j += 536870912) { - for (int k 
= 0; k < 67108864; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 67108864), "r"(buf + j + k + 134217728), "r"(buf + j + k + 201326592), "r"(buf + j + k + 268435456), "r"(buf + j + k + 335544320), "r"(buf + j + k + 402653184), "r"(buf + j + k + 469762048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_29(double *buf); -inline void helper_double_29(double *buf) { - helper_double_29_recursive(buf, 29); -} -inline void helper_double_30_recursive(double *buf, int depth); 
-inline void helper_double_30_recursive(double *buf, int depth) { - if (depth == 9) { - for (int j = 0; j < 512; j += 32) { - for (int k = 0; k < 4; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vpermilpd $0, %%ymm0, %%ymm8\n" - "vpermilpd $15, %%ymm0, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n" - "vpermilpd $0, %%ymm1, %%ymm8\n" - "vpermilpd $15, %%ymm1, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n" - "vpermilpd $0, %%ymm2, %%ymm8\n" - "vpermilpd $15, %%ymm2, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n" - "vpermilpd $0, %%ymm3, %%ymm8\n" - "vpermilpd $15, %%ymm3, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n" - "vpermilpd $0, %%ymm4, %%ymm8\n" - "vpermilpd $15, %%ymm4, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n" - "vpermilpd $0, %%ymm5, %%ymm8\n" - "vpermilpd $15, %%ymm5, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n" - "vpermilpd $0, %%ymm6, %%ymm8\n" - "vpermilpd $15, %%ymm6, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n" - "vpermilpd $0, %%ymm7, %%ymm8\n" - "vpermilpd $15, %%ymm7, %%ymm9\n" - "vxorpd %%ymm10, %%ymm10, %%ymm10\n" - "vsubpd %%ymm9, %%ymm10, %%ymm11\n" - "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n" - "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm0, 
%%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm0\n" - "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm1, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm1\n" - "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm2, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm2\n" - "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm3, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm3\n" - "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm4, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm4\n" - "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm5, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm5\n" - "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm6, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm6\n" - "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n" - "vxorpd %%ymm9, %%ymm9, %%ymm9\n" - "vsubpd %%ymm7, %%ymm9, %%ymm10\n" - "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n" - "vaddpd %%ymm11, %%ymm8, %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd 
%%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 512; j += 256) { - for (int k = 0; k < 32; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - 
"vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - for (int j = 0; j < 512; j += 512) { - for (int k = 0; k < 256; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 12) { - helper_double_30_recursive(buf + 0, 9); - helper_double_30_recursive(buf + 512, 9); - helper_double_30_recursive(buf + 1024, 9); - helper_double_30_recursive(buf + 1536, 9); - helper_double_30_recursive(buf + 2048, 9); - helper_double_30_recursive(buf + 2560, 9); - helper_double_30_recursive(buf + 3072, 9); - helper_double_30_recursive(buf + 3584, 9); - for (int j = 0; j < 4096; j += 4096) { - for (int k = 0; k < 512; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, 
%%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 15) { - helper_double_30_recursive(buf + 0, 12); - helper_double_30_recursive(buf + 4096, 12); - helper_double_30_recursive(buf + 8192, 12); - helper_double_30_recursive(buf + 12288, 12); - helper_double_30_recursive(buf + 16384, 12); - helper_double_30_recursive(buf + 20480, 12); - helper_double_30_recursive(buf + 24576, 12); - helper_double_30_recursive(buf + 28672, 12); - for (int j = 0; j < 32768; j += 32768) { - for (int k = 0; k < 4096; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" 
- "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 18) { - helper_double_30_recursive(buf + 0, 15); - helper_double_30_recursive(buf + 32768, 15); - helper_double_30_recursive(buf + 65536, 15); - helper_double_30_recursive(buf + 98304, 15); - helper_double_30_recursive(buf + 131072, 15); - helper_double_30_recursive(buf + 163840, 15); - 
helper_double_30_recursive(buf + 196608, 15); - helper_double_30_recursive(buf + 229376, 15); - for (int j = 0; j < 262144; j += 262144) { - for (int k = 0; k < 32768; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 21) { - helper_double_30_recursive(buf + 0, 18); - 
helper_double_30_recursive(buf + 262144, 18); - helper_double_30_recursive(buf + 524288, 18); - helper_double_30_recursive(buf + 786432, 18); - helper_double_30_recursive(buf + 1048576, 18); - helper_double_30_recursive(buf + 1310720, 18); - helper_double_30_recursive(buf + 1572864, 18); - helper_double_30_recursive(buf + 1835008, 18); - for (int j = 0; j < 2097152; j += 2097152) { - for (int k = 0; k < 262144; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : 
"%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 24) { - helper_double_30_recursive(buf + 0, 21); - helper_double_30_recursive(buf + 2097152, 21); - helper_double_30_recursive(buf + 4194304, 21); - helper_double_30_recursive(buf + 6291456, 21); - helper_double_30_recursive(buf + 8388608, 21); - helper_double_30_recursive(buf + 10485760, 21); - helper_double_30_recursive(buf + 12582912, 21); - helper_double_30_recursive(buf + 14680064, 21); - for (int j = 0; j < 16777216; j += 16777216) { - for (int k = 0; k < 2097152; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, 
(%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 27) { - helper_double_30_recursive(buf + 0, 24); - helper_double_30_recursive(buf + 16777216, 24); - helper_double_30_recursive(buf + 33554432, 24); - helper_double_30_recursive(buf + 50331648, 24); - helper_double_30_recursive(buf + 67108864, 24); - helper_double_30_recursive(buf + 83886080, 24); - helper_double_30_recursive(buf + 100663296, 24); - helper_double_30_recursive(buf + 117440512, 24); - for (int j = 0; j < 134217728; j += 134217728) { - for (int k = 0; k < 16777216; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - "vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, 
%%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } - if (depth == 30) { - helper_double_30_recursive(buf + 0, 27); - helper_double_30_recursive(buf + 134217728, 27); - helper_double_30_recursive(buf + 268435456, 27); - helper_double_30_recursive(buf + 402653184, 27); - helper_double_30_recursive(buf + 536870912, 27); - helper_double_30_recursive(buf + 671088640, 27); - helper_double_30_recursive(buf + 805306368, 27); - helper_double_30_recursive(buf + 939524096, 27); - for (int j = 0; j < 1073741824; j += 1073741824) { - for (int k = 0; k < 134217728; k += 4) { - __asm__ volatile ( - "vmovupd (%0), %%ymm0\n" - "vmovupd (%1), %%ymm1\n" - "vmovupd (%2), %%ymm2\n" - "vmovupd (%3), %%ymm3\n" - "vmovupd (%4), %%ymm4\n" - "vmovupd (%5), %%ymm5\n" - "vmovupd (%6), %%ymm6\n" - "vmovupd (%7), %%ymm7\n" - "vaddpd %%ymm1, %%ymm0, %%ymm8\n" - "vsubpd %%ymm1, %%ymm0, %%ymm9\n" - "vaddpd %%ymm3, %%ymm2, %%ymm10\n" - "vsubpd %%ymm3, %%ymm2, %%ymm11\n" - "vaddpd %%ymm5, %%ymm4, %%ymm12\n" - "vsubpd %%ymm5, %%ymm4, %%ymm13\n" - "vaddpd %%ymm7, %%ymm6, %%ymm14\n" - "vsubpd %%ymm7, %%ymm6, %%ymm15\n" - "vaddpd %%ymm10, %%ymm8, %%ymm0\n" - "vsubpd %%ymm10, %%ymm8, %%ymm2\n" - "vaddpd %%ymm11, %%ymm9, %%ymm1\n" - "vsubpd %%ymm11, %%ymm9, %%ymm3\n" - "vaddpd %%ymm14, %%ymm12, %%ymm4\n" - "vsubpd %%ymm14, %%ymm12, %%ymm6\n" - 
"vaddpd %%ymm15, %%ymm13, %%ymm5\n" - "vsubpd %%ymm15, %%ymm13, %%ymm7\n" - "vaddpd %%ymm4, %%ymm0, %%ymm8\n" - "vsubpd %%ymm4, %%ymm0, %%ymm12\n" - "vaddpd %%ymm5, %%ymm1, %%ymm9\n" - "vsubpd %%ymm5, %%ymm1, %%ymm13\n" - "vaddpd %%ymm6, %%ymm2, %%ymm10\n" - "vsubpd %%ymm6, %%ymm2, %%ymm14\n" - "vaddpd %%ymm7, %%ymm3, %%ymm11\n" - "vsubpd %%ymm7, %%ymm3, %%ymm15\n" - "vmovupd %%ymm8, (%0)\n" - "vmovupd %%ymm9, (%1)\n" - "vmovupd %%ymm10, (%2)\n" - "vmovupd %%ymm11, (%3)\n" - "vmovupd %%ymm12, (%4)\n" - "vmovupd %%ymm13, (%5)\n" - "vmovupd %%ymm14, (%6)\n" - "vmovupd %%ymm15, (%7)\n" - :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), "r"(buf + j + k + 939524096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory" - ); - } - } - return; - } -} -inline void helper_double_30(double *buf); -inline void helper_double_30(double *buf) { - helper_double_30_recursive(buf, 30); -} -inline int fht_double(double *buf, int log_n) { - if (log_n == 0) { - return 0; - } - if (log_n == 1) { - helper_double_1(buf); - return 0; - } - if (log_n == 2) { - helper_double_2(buf); - return 0; - } - if (log_n == 3) { - helper_double_3(buf); - return 0; - } - if (log_n == 4) { - helper_double_4(buf); - return 0; - } - if (log_n == 5) { - helper_double_5(buf); - return 0; - } - if (log_n == 6) { - helper_double_6(buf); - return 0; - } - if (log_n == 7) { - helper_double_7(buf); - return 0; - } - if (log_n == 8) { - helper_double_8(buf); - return 0; - } - if (log_n == 9) { - helper_double_9(buf); - return 0; - } - if (log_n == 10) { - helper_double_10(buf); - return 0; - } - if (log_n == 11) { - helper_double_11(buf); - return 0; - } - if (log_n == 12) { - helper_double_12(buf); - return 0; - } - if (log_n == 13) { - 
helper_double_13(buf); - return 0; - } - if (log_n == 14) { - helper_double_14(buf); - return 0; - } - if (log_n == 15) { - helper_double_15(buf); - return 0; - } - if (log_n == 16) { - helper_double_16(buf); - return 0; - } - if (log_n == 17) { - helper_double_17(buf); - return 0; - } - if (log_n == 18) { - helper_double_18(buf); - return 0; - } - if (log_n == 19) { - helper_double_19(buf); - return 0; - } - if (log_n == 20) { - helper_double_20(buf); - return 0; - } - if (log_n == 21) { - helper_double_21(buf); - return 0; - } - if (log_n == 22) { - helper_double_22(buf); - return 0; - } - if (log_n == 23) { - helper_double_23(buf); - return 0; - } - if (log_n == 24) { - helper_double_24(buf); - return 0; - } - if (log_n == 25) { - helper_double_25(buf); - return 0; - } - if (log_n == 26) { - helper_double_26(buf); - return 0; - } - if (log_n == 27) { - helper_double_27(buf); - return 0; - } - if (log_n == 28) { - helper_double_28(buf); - return 0; - } - if (log_n == 29) { - helper_double_29(buf); - return 0; - } - if (log_n == 30) { - helper_double_30(buf); - return 0; - } - return 1; -} From ec5b382f1a273b54f2a460615571858176ba254c Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 16 Jun 2026 10:00:11 +0800 Subject: [PATCH 25/38] Fix clang-format violations --- src/core/algorithm/ivf/ivf_searcher_context.h | 4 +-- src/core/interface/index.cc | 6 ++-- src/core/quantizer/cosine_converter.cc | 18 +++++----- src/core/quantizer/cosine_reformer.cc | 8 ++--- .../quantizer/integer_quantizer_converter.cc | 16 ++++----- .../quantizer/integer_quantizer_reformer.cc | 10 +++--- src/core/quantizer/record_rotator.cc | 34 +++++++++++++------ src/core/quantizer/record_rotator.h | 8 +++-- src/db/index/common/proto_converter.cc | 3 +- 9 files changed, 60 insertions(+), 47 deletions(-) diff --git a/src/core/algorithm/ivf/ivf_searcher_context.h b/src/core/algorithm/ivf/ivf_searcher_context.h index a0a941e5e..dbd2b7ae1 100644 --- 
a/src/core/algorithm/ivf/ivf_searcher_context.h +++ b/src/core/algorithm/ivf/ivf_searcher_context.h @@ -77,8 +77,8 @@ class IVFSearcherContext : public IndexSearcher::Context { topk_val = std::max(topk_val, static_cast(1)); max_scan_count_ = static_cast(entity_->vector_count()); } else { - topk_val = std::max( - static_cast(std::round(nlist * scan_ratio_)), 1u); + topk_val = + std::max(static_cast(std::round(nlist * scan_ratio_)), 1u); max_scan_count_ = static_cast( std::ceil(entity_->vector_count() * scan_ratio_)); } diff --git a/src/core/interface/index.cc b/src/core/interface/index.cc index 90ab019cf..96d811058 100644 --- a/src/core/interface/index.cc +++ b/src/core/interface/index.cc @@ -187,8 +187,7 @@ int Index::CreateAndInitConverterReformer(const QuantizerParam ¶m, if (index_param.metric_type == MetricType::kCosine) { converter_params.set("cosine.converter.enable_rotate", true); } else { - converter_params.set("integer_streaming.converter.enable_rotate", - true); + converter_params.set("integer_streaming.converter.enable_rotate", true); } } else { LOG_WARN( @@ -341,7 +340,8 @@ int Index::Open(const std::string &file_path, StorageOptions storage_options) { // converter/reformer/metric are created in IndexFactory::CreateIndex // TODO: init - // Load reformer data from storage (e.g., rotation matrix for IntegerStreaming) + // Load reformer data from storage (e.g., rotation matrix for + // IntegerStreaming) if (reformer_ != nullptr) { // When building a new index, dump converter state (e.g., rotator) to // storage so the reformer can load it. This is needed for diff --git a/src/core/quantizer/cosine_converter.cc b/src/core/quantizer/cosine_converter.cc index c9b8cb7d1..0112537b8 100644 --- a/src/core/quantizer/cosine_converter.cc +++ b/src/core/quantizer/cosine_converter.cc @@ -131,15 +131,15 @@ class CosineConverterHolder : public IndexHolder { } float norm = 0.0f; - ailego::Normalizer::L2( - const_cast(vec), - owner_->rotator_ ? 
owner_->rotator_->padded_dim() - : original_dimension_, - &norm); + ailego::Normalizer::L2(const_cast(vec), + owner_->rotator_ + ? owner_->rotator_->padded_dim() + : original_dimension_, + &norm); if (type_ == IndexMeta::DataType::DT_FP32) { - ::memcpy(reinterpret_cast(&normalize_buffer_[0]), - vec, original_dimension_ * sizeof(float)); + ::memcpy(reinterpret_cast(&normalize_buffer_[0]), vec, + original_dimension_ * sizeof(float)); ::memcpy(reinterpret_cast(&normalize_buffer_[0]) + original_dimension_, &norm, NORM_SIZE); @@ -153,8 +153,8 @@ class CosineConverterHolder : public IndexHolder { &norm, NORM_SIZE); } else if (type_ == IndexMeta::DataType::DT_INT4 || type_ == IndexMeta::DataType::DT_INT8) { - RecordQuantizer::quantize_record( - vec, original_dimension_, type_, false, &buffer_[0]); + RecordQuantizer::quantize_record(vec, original_dimension_, type_, + false, &buffer_[0]); ::memcpy(reinterpret_cast(&buffer_[0]) + element_size - NORM_SIZE, diff --git a/src/core/quantizer/cosine_reformer.cc b/src/core/quantizer/cosine_reformer.cc index 442e08e34..8e89fbcd1 100644 --- a/src/core/quantizer/cosine_reformer.cc +++ b/src/core/quantizer/cosine_reformer.cc @@ -124,8 +124,8 @@ class CosineReformer : public IndexReformer { float *buf = reinterpret_cast(&normalized_buffer[0]); if (enable_rotate_ && rotator_) { // Already rotated, normalize the rotated vector - ailego::Normalizer::L2(const_cast(vec), origin_dimension, - &norm); + ailego::Normalizer::L2(const_cast(vec), + origin_dimension, &norm); } else { ailego::Normalizer::L2(buf, origin_dimension, &norm); vec = buf; @@ -140,8 +140,8 @@ class CosineReformer : public IndexReformer { ometa->element_size() - NORM_SIZE); } else if (dst_type_ == IndexMeta::DataType::DT_FP16) { RecordQuantizer::quantize_record(const_cast(vec), - qmeta.dimension(), dst_type_, - false, &(*out)[0]); + qmeta.dimension(), dst_type_, false, + &(*out)[0]); } else if (dst_type_ == IndexMeta::DataType::DT_INT4 || dst_type_ == 
IndexMeta::DataType::DT_INT8) { RecordQuantizer::quantize_record(vec, qmeta.dimension(), dst_type_, diff --git a/src/core/quantizer/integer_quantizer_converter.cc b/src/core/quantizer/integer_quantizer_converter.cc index f5d3db650..a660bc1af 100644 --- a/src/core/quantizer/integer_quantizer_converter.cc +++ b/src/core/quantizer/integer_quantizer_converter.cc @@ -402,9 +402,10 @@ class IntegerStreamingConverter : public IndexConverter { padded_dim = ((dim + 63) / 64) * 64; rotator_ = std::make_shared(); rotator_->init(dim, padded_dim); - LOG_DEBUG("IntegerStreamingConverter: rotation enabled, dim=%zu, " - "padded_dim=%zu", - dim, padded_dim); + LOG_DEBUG( + "IntegerStreamingConverter: rotation enabled, dim=%zu, " + "padded_dim=%zu", + dim, padded_dim); } if (data_type_ == IndexMeta::DataType::DT_INT8) { @@ -547,16 +548,15 @@ class IntegerStreamingConverter : public IndexConverter { } if (owner_->enable_normalize_) { float norm = 0.0; - memcpy((void *)normalize_buffer_.data(), vec, - pdim * sizeof(float)); + memcpy((void *)normalize_buffer_.data(), vec, pdim * sizeof(float)); ailego::Normalizer::L2((float *)normalize_buffer_.data(), pdim, &norm); vec = (float *)normalize_buffer_.data(); } - RecordQuantizer::quantize_record( - vec, pdim, owner_->data_type(), - owner_->is_euclidean_, buffer_.data()); + RecordQuantizer::quantize_record(vec, pdim, owner_->data_type(), + owner_->is_euclidean_, + buffer_.data()); } } diff --git a/src/core/quantizer/integer_quantizer_reformer.cc b/src/core/quantizer/integer_quantizer_reformer.cc index fc636f78b..b9c9e8278 100644 --- a/src/core/quantizer/integer_quantizer_reformer.cc +++ b/src/core/quantizer/integer_quantizer_reformer.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include #include #include #include #include #include -#include #include "record_quantizer.h" #include "record_rotator.h" @@ -301,16 +301,16 @@ class IntegerStreamingReformer : public IndexReformer { //! Auto-detects rotation by checking for rotator segment in storage. //! No need for enable_rotate in search config. int load(IndexStorage::Pointer storage) override { - // If config explicitly enables rotate but rotator not yet loaded, try storage - // If config doesn't enable rotate, still try storage (auto-detect) + // If config explicitly enables rotate but rotator not yet loaded, try + // storage If config doesn't enable rotate, still try storage (auto-detect) if (enable_rotate_ || storage->get(RECORD_ROTATOR_SEG_ID)) { rotator_ = std::make_shared(); int ret = rotator_->open(storage); if (ret != 0) { if (enable_rotate_) { // Config said enable_rotate but storage has no rotator — error - LOG_ERROR( - "IntegerStreamingReformer: load rotator failed, ret=%d", ret); + LOG_ERROR("IntegerStreamingReformer: load rotator failed, ret=%d", + ret); rotator_.reset(); return ret; } diff --git a/src/core/quantizer/record_rotator.cc b/src/core/quantizer/record_rotator.cc index c888a2276..67ace63f6 100644 --- a/src/core/quantizer/record_rotator.cc +++ b/src/core/quantizer/record_rotator.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "record_rotator.h" - #include #include #include @@ -51,7 +50,10 @@ namespace { //! Compute floor(log2(n)) for power-of-2 n. 
inline int ilog2(size_t n) { int r = 0; - while (n > 1) { n >>= 1; ++r; } + while (n > 1) { + n >>= 1; + ++r; + } return r; } @@ -133,8 +135,7 @@ void flip_sign(const uint8_t *flip, float *data, size_t dim) { uint32x4_t bit_mask = {b0, b1, b2, b3}; uint32x4_t sign_mask = vmulq_u32(bit_mask, sign_bit); float32x4_t v = vld1q_f32(&data[i]); - v = vreinterpretq_f32_u32( - veorq_u32(vreinterpretq_u32_f32(v), sign_mask)); + v = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v), sign_mask)); vst1q_f32(&data[i], v); } #elif defined(__SSE2__) @@ -321,7 +322,9 @@ struct FhtKacRotatorImpl { std::memcpy(flip.data(), data, flip.size()); } - size_t dump_bytes() const { return flip.size(); } + size_t dump_bytes() const { + return flip.size(); + } }; // ============================================================================ @@ -397,7 +400,9 @@ struct MatrixRotatorImpl { std::memcpy(matrix.data(), data, matrix.size() * sizeof(float)); } - size_t dump_bytes() const { return matrix.size() * sizeof(float); } + size_t dump_bytes() const { + return matrix.size() * sizeof(float); + } }; } // anonymous namespace @@ -606,8 +611,9 @@ int RecordRotator::dump(const IndexStorage::Pointer &storage, // Append segment to storage int ret = storage->append(seg_id, total_size); if (ret != 0) { - LOG_ERROR("RecordRotator::dump(storage): append segment '%s' failed, ret=%d", - seg_id.c_str(), ret); + LOG_ERROR( + "RecordRotator::dump(storage): append segment '%s' failed, ret=%d", + seg_id.c_str(), ret); return ret; } @@ -785,11 +791,17 @@ int RecordRotator::load(const float *matrix, size_t dimension, return 0; } -size_t RecordRotator::dimension() const { return impl_->dimension; } +size_t RecordRotator::dimension() const { + return impl_->dimension; +} -size_t RecordRotator::padded_dim() const { return impl_->padded_dim; } +size_t RecordRotator::padded_dim() const { + return impl_->padded_dim; +} -RecordRotatorType RecordRotator::rotator_type() const { return impl_->type; } 
+RecordRotatorType RecordRotator::rotator_type() const { + return impl_->type; +} bool RecordRotator::initialized() const { return impl_->fht_impl != nullptr || impl_->mat_impl != nullptr; diff --git a/src/core/quantizer/record_rotator.h b/src/core/quantizer/record_rotator.h index e2c9440af..4db73b06e 100644 --- a/src/core/quantizer/record_rotator.h +++ b/src/core/quantizer/record_rotator.h @@ -76,7 +76,8 @@ class RecordRotator { //! Inverse-rotate a single vector into a managed buffer //! @param in input vector of size >= dimension (rotated, truncated) - //! @return vector of size dimension containing inverse-rotated result + //! @return vector of size dimension containing inverse-rotated + //! result std::vector unrotate(const float *in) const; //! Prepare internal data structures for inverse rotation. @@ -98,8 +99,9 @@ class RecordRotator { int dump(const IndexDumper::Pointer &dumper, const std::string &seg_id = RECORD_ROTATOR_SEG_ID) const; - //! Open the rotator from an IndexStorage segment (self-describing, no init needed). - //! Parses header to get type/dimension/padded_dim, then reconstructs the rotator. + //! Open the rotator from an IndexStorage segment (self-describing, no init + //! needed). Parses header to get type/dimension/padded_dim, then reconstructs + //! the rotator. 
int open(IndexStorage::Pointer storage, const std::string &seg_id = RECORD_ROTATOR_SEG_ID); diff --git a/src/db/index/common/proto_converter.cc b/src/db/index/common/proto_converter.cc index ce32e42b8..80b4c61ca 100644 --- a/src/db/index/common/proto_converter.cc +++ b/src/db/index/common/proto_converter.cc @@ -23,8 +23,7 @@ HnswIndexParams::OPtr ProtoConverter::FromPb( MetricTypeCodeBook::Get(params_pb.base().metric_type()), params_pb.m(), params_pb.ef_construction(), QuantizeTypeCodeBook::Get(params_pb.base().quantize_type()), - params_pb.use_contiguous_memory(), - QuantizerParam(enable_rotate)); + params_pb.use_contiguous_memory(), QuantizerParam(enable_rotate)); return params; } From ee40d457bd9e9c8f142a76d0f335213e374c2e94 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 16 Jun 2026 10:24:55 +0800 Subject: [PATCH 26/38] clang-format check --- .../python/model/param/python_param.cc | 192 ++++++++---------- src/include/zvec/c_api.h | 3 +- src/include/zvec/core/interface/index_param.h | 7 +- src/include/zvec/db/index_params.h | 23 +-- tools/core/local_builder.cc | 15 +- 5 files changed, 108 insertions(+), 132 deletions(-) diff --git a/src/binding/python/model/param/python_param.cc b/src/binding/python/model/param/python_param.cc index b0ec0a46b..80dc203af 100644 --- a/src/binding/python/model/param/python_param.cc +++ b/src/binding/python/model/param/python_param.cc @@ -356,8 +356,7 @@ Designed for future extensibility. >>> print(qp.enable_rotate) True )pbdoc"); - quantizer_param - .def(py::init(), py::arg("enable_rotate") = false) + quantizer_param.def(py::init(), py::arg("enable_rotate") = false) .def_property_readonly( "enable_rotate", [](const QuantizerParam &self) -> bool { @@ -524,30 +523,28 @@ encapsulates its construction hyperparameters. 
quantize_type_to_string(self.quantize_type()); dict["use_contiguous_memory"] = self.use_contiguous_memory(); py::dict qp_dict; - qp_dict["enable_rotate"] = - self.quantizer_param().enable_rotate(); + qp_dict["enable_rotate"] = self.quantizer_param().enable_rotate(); dict["quantizer_param"] = qp_dict; return dict; }, "Convert to dictionary with all fields") - .def("__repr__", - [](const HnswIndexParams &self) -> std::string { - return "{" - "\"metric_type\":" + - metric_type_to_string(self.metric_type()) + - ", \"m\":" + std::to_string(self.m()) + - ", \"ef_construction\":" + - std::to_string(self.ef_construction()) + - ", \"quantize_type\":" + - quantize_type_to_string(self.quantize_type()) + - ", \"use_contiguous_memory\":" + - (self.use_contiguous_memory() ? "true" : "false") + - ", \"quantizer_param\":{" + - "\"enable_rotate\":" + - (self.quantizer_param().enable_rotate() ? "true" - : "false") + - "}}"; - }) + .def( + "__repr__", + [](const HnswIndexParams &self) -> std::string { + return "{" + "\"metric_type\":" + + metric_type_to_string(self.metric_type()) + + ", \"m\":" + std::to_string(self.m()) + + ", \"ef_construction\":" + + std::to_string(self.ef_construction()) + + ", \"quantize_type\":" + + quantize_type_to_string(self.quantize_type()) + + ", \"use_contiguous_memory\":" + + (self.use_contiguous_memory() ? "true" : "false") + + ", \"quantizer_param\":{" + "\"enable_rotate\":" + + (self.quantizer_param().enable_rotate() ? "true" : "false") + + "}}"; + }) .def(py::pickle( [](const HnswIndexParams &self) { return py::make_tuple(self.metric_type(), self.m(), @@ -561,8 +558,7 @@ encapsulates its construction hyperparameters. QuantizerParam qp(t.size() >= 6 ? t[5].cast() : false); return std::make_shared( t[0].cast(), t[1].cast(), t[2].cast(), - t[3].cast(), t[4].cast(), - qp); + t[3].cast(), t[4].cast(), qp); })); // binding hnsw rabitq index params @@ -768,46 +764,43 @@ its construction hyperparameters. 
dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); py::dict qp_dict; - qp_dict["enable_rotate"] = - self.quantizer_param().enable_rotate(); + qp_dict["enable_rotate"] = self.quantizer_param().enable_rotate(); dict["quantizer_param"] = qp_dict; return dict; }, "Convert to dictionary with all fields") - .def("__repr__", - [](const VamanaIndexParams &self) -> std::string { - return "{" - "\"type\":\"" + - index_type_to_string(self.type()) + - "\", \"metric_type\":\"" + - metric_type_to_string(self.metric_type()) + - "\", \"max_degree\":" + std::to_string(self.max_degree()) + - ", \"search_list_size\":" + - std::to_string(self.search_list_size()) + - ", \"alpha\":" + std::to_string(self.alpha()) + - ", \"saturate_graph\":" + - std::string(self.saturate_graph() ? "true" : "false") + - ", \"use_contiguous_memory\":" + - std::string(self.use_contiguous_memory() ? "true" - : "false") + - ", \"use_id_map\":" + - std::string(self.use_id_map() ? "true" : "false") + - ", \"quantize_type\":\"" + - quantize_type_to_string(self.quantize_type()) + - "\", \"quantizer_param\":{" + - "\"enable_rotate\":" + - (self.quantizer_param().enable_rotate() ? "true" + .def( + "__repr__", + [](const VamanaIndexParams &self) -> std::string { + return "{" + "\"type\":\"" + + index_type_to_string(self.type()) + + "\", \"metric_type\":\"" + + metric_type_to_string(self.metric_type()) + + "\", \"max_degree\":" + std::to_string(self.max_degree()) + + ", \"search_list_size\":" + + std::to_string(self.search_list_size()) + + ", \"alpha\":" + std::to_string(self.alpha()) + + ", \"saturate_graph\":" + + std::string(self.saturate_graph() ? "true" : "false") + + ", \"use_contiguous_memory\":" + + std::string(self.use_contiguous_memory() ? "true" : "false") + - "}}"; - }) + ", \"use_id_map\":" + + std::string(self.use_id_map() ? 
"true" : "false") + + ", \"quantize_type\":\"" + + quantize_type_to_string(self.quantize_type()) + + "\", \"quantizer_param\":{" + "\"enable_rotate\":" + + (self.quantizer_param().enable_rotate() ? "true" : "false") + + "}}"; + }) .def(py::pickle( [](const VamanaIndexParams &self) { - return py::make_tuple(self.metric_type(), self.max_degree(), - self.search_list_size(), self.alpha(), - self.saturate_graph(), - self.use_contiguous_memory(), - self.use_id_map(), self.quantize_type(), - self.quantizer_param().enable_rotate()); + return py::make_tuple( + self.metric_type(), self.max_degree(), self.search_list_size(), + self.alpha(), self.saturate_graph(), + self.use_contiguous_memory(), self.use_id_map(), + self.quantize_type(), self.quantizer_param().enable_rotate()); }, [](py::tuple t) { if (t.size() != 8 && t.size() != 9) @@ -816,8 +809,7 @@ its construction hyperparameters. return std::make_shared( t[0].cast(), t[1].cast(), t[2].cast(), t[3].cast(), t[4].cast(), t[5].cast(), - t[6].cast(), t[7].cast(), - qp); + t[6].cast(), t[7].cast(), qp); })); // FlatIndexParams @@ -873,25 +865,23 @@ Constructs a FlatIndexParam instance. dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); py::dict qp_dict; - qp_dict["enable_rotate"] = - self.quantizer_param().enable_rotate(); + qp_dict["enable_rotate"] = self.quantizer_param().enable_rotate(); dict["quantizer_param"] = qp_dict; return dict; }, "Convert to dictionary with all fields") - .def("__repr__", - [](const FlatIndexParams &self) -> std::string { - return "{" - "\"metric_type\":" + - metric_type_to_string(self.metric_type()) + - ", \"quantize_type\":" + - quantize_type_to_string(self.quantize_type()) + - ", \"quantizer_param\":{" + - "\"enable_rotate\":" + - (self.quantizer_param().enable_rotate() ? 
"true" - : "false") + - "}}"; - }) + .def( + "__repr__", + [](const FlatIndexParams &self) -> std::string { + return "{" + "\"metric_type\":" + + metric_type_to_string(self.metric_type()) + + ", \"quantize_type\":" + + quantize_type_to_string(self.quantize_type()) + + ", \"quantizer_param\":{" + "\"enable_rotate\":" + + (self.quantizer_param().enable_rotate() ? "true" : "false") + + "}}"; + }) .def(py::pickle( [](const FlatIndexParams &self) { return py::make_tuple(self.metric_type(), self.quantize_type(), @@ -901,9 +891,8 @@ Constructs a FlatIndexParam instance. if (t.size() != 2 && t.size() != 3) throw std::runtime_error("Invalid state for FlatIndexParams"); QuantizerParam qp(t.size() >= 3 ? t[2].cast() : false); - return std::make_shared(t[0].cast(), - t[1].cast(), - qp); + return std::make_shared( + t[0].cast(), t[1].cast(), qp); })); // IVFIndexParams @@ -985,28 +974,26 @@ Constructs an IVFIndexParam instance. dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); py::dict qp_dict; - qp_dict["enable_rotate"] = - self.quantizer_param().enable_rotate(); + qp_dict["enable_rotate"] = self.quantizer_param().enable_rotate(); dict["quantizer_param"] = qp_dict; return dict; }, "Convert to dictionary with all fields") - .def("__repr__", - [](const IVFIndexParams &self) { - return "{" - "\"metric_type\":" + - metric_type_to_string(self.metric_type()) + - ", \"n_list\":" + std::to_string(self.n_list()) + - ", \"n_iters\":" + std::to_string(self.n_iters()) + - ", \"use_soar\":" + std::to_string(self.use_soar()) + - ", \"quantize_type\":" + - quantize_type_to_string(self.quantize_type()) + - ", \"quantizer_param\":{" + - "\"enable_rotate\":" + - (self.quantizer_param().enable_rotate() ? 
"true" - : "false") + - "}}"; - }) + .def( + "__repr__", + [](const IVFIndexParams &self) { + return "{" + "\"metric_type\":" + + metric_type_to_string(self.metric_type()) + + ", \"n_list\":" + std::to_string(self.n_list()) + + ", \"n_iters\":" + std::to_string(self.n_iters()) + + ", \"use_soar\":" + std::to_string(self.use_soar()) + + ", \"quantize_type\":" + + quantize_type_to_string(self.quantize_type()) + + ", \"quantizer_param\":{" + "\"enable_rotate\":" + + (self.quantizer_param().enable_rotate() ? "true" : "false") + + "}}"; + }) .def(py::pickle( [](const IVFIndexParams &self) { return py::make_tuple(self.metric_type(), self.n_list(), @@ -1020,8 +1007,7 @@ Constructs an IVFIndexParam instance. QuantizerParam qp(t.size() >= 6 ? t[5].cast() : false); return std::make_shared( t[0].cast(), t[1].cast(), t[2].cast(), - t[3].cast(), t[4].cast(), - qp); + t[3].cast(), t[4].cast(), qp); })); // DiskAnnIndexParams @@ -1111,8 +1097,7 @@ Constructs an DiskAnnIndexParams instance. dict["quantize_type"] = quantize_type_to_string(self.quantize_type()); py::dict qp_dict; - qp_dict["enable_rotate"] = - self.quantizer_param().enable_rotate(); + qp_dict["enable_rotate"] = self.quantizer_param().enable_rotate(); dict["quantizer_param"] = qp_dict; return dict; }, @@ -1128,10 +1113,8 @@ Constructs an DiskAnnIndexParams instance. ", \"pq_chunk_num\":" + std::to_string(self.pq_chunk_num()) + ", \"quantize_type\":" + quantize_type_to_string(self.quantize_type()) + - ", \"quantizer_param\":{" + - "\"enable_rotate\":" + - (self.quantizer_param().enable_rotate() ? "true" - : "false") + + ", \"quantizer_param\":{" + "\"enable_rotate\":" + + (self.quantizer_param().enable_rotate() ? "true" : "false") + "}}"; }) .def(py::pickle( @@ -1147,8 +1130,7 @@ Constructs an DiskAnnIndexParams instance. QuantizerParam qp(t.size() >= 6 ? 
t[5].cast() : false); return std::make_shared( t[0].cast(), t[1].cast(), t[2].cast(), - t[3].cast(), t[4].cast(), - qp); + t[3].cast(), t[4].cast(), qp); })); } diff --git a/src/include/zvec/c_api.h b/src/include/zvec/c_api.h index 3101f3fa7..3f3e38638 100644 --- a/src/include/zvec/c_api.h +++ b/src/include/zvec/c_api.h @@ -982,8 +982,7 @@ zvec_index_params_set_quantizer_enable_rotate(zvec_index_params_t *params, * @param params Index parameters (must not be NULL) * @return true if rotation is enabled, false otherwise (default) */ -ZVEC_EXPORT bool ZVEC_CALL -zvec_index_params_get_quantizer_enable_rotate( +ZVEC_EXPORT bool ZVEC_CALL zvec_index_params_get_quantizer_enable_rotate( const zvec_index_params_t *params); /** diff --git a/src/include/zvec/core/interface/index_param.h b/src/include/zvec/core/interface/index_param.h index 5f2848253..84eab85c8 100644 --- a/src/include/zvec/core/interface/index_param.h +++ b/src/include/zvec/core/interface/index_param.h @@ -116,13 +116,16 @@ struct QuantizerParam : public SerializableBase { QuantizerType type = QuantizerType::kNone; int num_subquantizers = 8; // M int num_bits = 8; // bits per subquantizer - bool enable_rotate = false; // rotate vectors before quantization to reduce error + bool enable_rotate = + false; // rotate vectors before quantization to reduce error // Constructors // QuantizerParam() = default; QuantizerParam(QuantizerType t = QuantizerType::kNone, int subquantizers = 8, int bits = 8, bool rotate = false) - : type(t), num_subquantizers(subquantizers), num_bits(bits), + : type(t), + num_subquantizers(subquantizers), + num_bits(bits), enable_rotate(rotate) {} diff --git a/src/include/zvec/db/index_params.h b/src/include/zvec/db/index_params.h index 6c0fbcb10..a4c2654d9 100644 --- a/src/include/zvec/db/index_params.h +++ b/src/include/zvec/db/index_params.h @@ -126,8 +126,7 @@ class InvertIndexParams : public IndexParams { class QuantizerParam { public: QuantizerParam() = default; - explicit 
QuantizerParam(bool enable_rotate) - : enable_rotate_(enable_rotate) {} + explicit QuantizerParam(bool enable_rotate) : enable_rotate_(enable_rotate) {} bool enable_rotate() const { return enable_rotate_; @@ -214,8 +213,7 @@ class HnswIndexParams : public VectorIndexParams { MetricType metric_type, int m = core_interface::kDefaultHnswNeighborCnt, int ef_construction = core_interface::kDefaultHnswEfConstruction, QuantizeType quantize_type = QuantizeType::UNDEFINED, - bool use_contiguous_memory = false, - QuantizerParam quantizer_param = {}) + bool use_contiguous_memory = false, QuantizerParam quantizer_param = {}) : VectorIndexParams(IndexType::HNSW, metric_type, quantize_type, quantizer_param), m_(m), @@ -226,10 +224,9 @@ class HnswIndexParams : public VectorIndexParams { public: Ptr clone() const override { - return std::make_shared(metric_type_, m_, ef_construction_, - quantize_type_, - use_contiguous_memory_, - quantizer_param_); + return std::make_shared( + metric_type_, m_, ef_construction_, quantize_type_, + use_contiguous_memory_, quantizer_param_); } std::string to_string() const override { @@ -238,8 +235,7 @@ class HnswIndexParams : public VectorIndexParams { std::ostringstream oss; oss << base_str << ",m:" << m_ << ",ef_construction:" << ef_construction_ << ",use_contiguous_memory:" - << (use_contiguous_memory_ ? "true" : "false") - << ",enable_rotate:" + << (use_contiguous_memory_ ? "true" : "false") << ",enable_rotate:" << (quantizer_param_.enable_rotate() ? 
"true" : "false") << "}"; return oss.str(); } @@ -255,8 +251,8 @@ class HnswIndexParams : public VectorIndexParams { static_cast(other).quantize_type() && use_contiguous_memory_ == static_cast(other) .use_contiguous_memory_ && - quantizer_param_ == static_cast(other) - .quantizer_param_; + quantizer_param_ == + static_cast(other).quantizer_param_; } void set_m(int m) { @@ -421,8 +417,7 @@ class FlatIndexParams : public VectorIndexParams { auto base_str = vector_index_params_to_string("FlatIndexParams", metric_type_, quantize_type_); std::ostringstream oss; - oss << base_str - << ",enable_rotate:" + oss << base_str << ",enable_rotate:" << (quantizer_param_.enable_rotate() ? "true" : "false") << "}"; return oss.str(); } diff --git a/tools/core/local_builder.cc b/tools/core/local_builder.cc index f53e3ec8a..7d3b7bf0a 100644 --- a/tools/core/local_builder.cc +++ b/tools/core/local_builder.cc @@ -913,9 +913,8 @@ int do_build(YAML::Node &config_root, YAML::Node &config_common) { IndexConverter::Pointer build_converter; - IndexHolder::Pointer cv_build_holder = - convert_holder(converter_name, converter_params, build_holder, meta, - &build_converter); + IndexHolder::Pointer cv_build_holder = convert_holder( + converter_name, converter_params, build_holder, meta, &build_converter); if (!cv_build_holder) { LOG_ERROR("Convert holder failed."); return -1; @@ -1006,9 +1005,8 @@ int do_build(YAML::Node &config_root, YAML::Node &config_common) { // support fp16 convert - IndexHolder::Pointer cv_train_holder = - convert_holder(converter_name, converter_params, train_holder, meta, - nullptr); + IndexHolder::Pointer cv_train_holder = convert_holder( + converter_name, converter_params, train_holder, meta, nullptr); if (!cv_train_holder) { LOG_ERROR("Convert train holder failed."); return -1; @@ -1064,9 +1062,8 @@ int do_build(YAML::Node &config_root, YAML::Node &config_common) { if (!metric_name.empty()) { train_holder->set_metric(metric_name, metric_params); } - IndexHolder::Pointer 
cv_train_holder = - convert_holder(converter_name, converter_params, train_holder, meta, - nullptr); + IndexHolder::Pointer cv_train_holder = convert_holder( + converter_name, converter_params, train_holder, meta, nullptr); if (!cv_train_holder) { LOG_ERROR("Convert train holder failed."); return -1; From 7acc196c063738c8a03808328d48f4a365d51a7b Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 16 Jun 2026 13:52:32 +0800 Subject: [PATCH 27/38] cancel padden_dim --- src/core/quantizer/cosine_converter.cc | 14 +- src/core/quantizer/cosine_reformer.cc | 9 +- .../quantizer/integer_quantizer_converter.cc | 34 +- .../quantizer/integer_quantizer_reformer.cc | 32 +- src/core/quantizer/record_rotator.cc | 392 ++++++++++-------- src/core/quantizer/record_rotator.h | 54 ++- 6 files changed, 281 insertions(+), 254 deletions(-) diff --git a/src/core/quantizer/cosine_converter.cc b/src/core/quantizer/cosine_converter.cc index 0112537b8..02fb3dddc 100644 --- a/src/core/quantizer/cosine_converter.cc +++ b/src/core/quantizer/cosine_converter.cc @@ -58,7 +58,7 @@ class CosineConverterHolder : public IndexHolder { // Allocate rotate buffer if owner has a rotator if (owner_->rotator_) { - rotate_buffer_.resize(owner_->rotator_->padded_dim()); + rotate_buffer_.resize(owner_->rotator_->dimension()); } } @@ -132,9 +132,7 @@ class CosineConverterHolder : public IndexHolder { float norm = 0.0f; ailego::Normalizer::L2(const_cast(vec), - owner_->rotator_ - ? 
owner_->rotator_->padded_dim() - : original_dimension_, + original_dimension_, &norm); if (type_ == IndexMeta::DataType::DT_FP32) { @@ -294,14 +292,12 @@ class CosineConverter : public IndexConverter { reformer_params.set(COSINE_REFORMER_ENABLE_ROTATE, true); } - // Compute padded dimension and create rotator if rotation is enabled + // Create rotator if rotation is enabled if (enable_rotate_) { size_t dim = index_meta.dimension(); - size_t padded_dim = ((dim + 63) / 64) * 64; rotator_ = std::make_shared(); - rotator_->init(dim, padded_dim); - LOG_DEBUG("CosineConverter: rotation enabled, dim=%zu, padded_dim=%zu", - dim, padded_dim); + rotator_->init(dim); + LOG_DEBUG("CosineConverter: rotation enabled, dim=%zu", dim); } if (dst_type_ == IndexMeta::DataType::DT_INT8) { diff --git a/src/core/quantizer/cosine_reformer.cc b/src/core/quantizer/cosine_reformer.cc index 8e89fbcd1..b495fc944 100644 --- a/src/core/quantizer/cosine_reformer.cc +++ b/src/core/quantizer/cosine_reformer.cc @@ -71,9 +71,8 @@ class CosineReformer : public IndexReformer { } else { enable_rotate_ = true; LOG_DEBUG( - "CosineReformer: rotator auto-loaded, origin_dim=%zu, " - "padded_dim=%zu", - rotator_->dimension(), rotator_->padded_dim()); + "CosineReformer: rotator auto-loaded, dim=%zu", + rotator_->dimension()); } } return 0; @@ -111,10 +110,10 @@ class CosineReformer : public IndexReformer { // Apply rotation if enabled std::unique_ptr rotate_buffer; if (enable_rotate_ && rotator_) { - rotate_buffer.reset(new float[rotator_->padded_dim()]); + rotate_buffer.reset(new float[rotator_->dimension()]); rotator_->rotate(vec, rotate_buffer.get()); vec = rotate_buffer.get(); - origin_dimension = rotator_->padded_dim(); + // rotation preserves dimension: origin_dimension stays qmeta.dimension() } // Normalize (L2) diff --git a/src/core/quantizer/integer_quantizer_converter.cc b/src/core/quantizer/integer_quantizer_converter.cc index a660bc1af..242074b02 100644 --- 
a/src/core/quantizer/integer_quantizer_converter.cc +++ b/src/core/quantizer/integer_quantizer_converter.cc @@ -395,17 +395,13 @@ class IntegerStreamingConverter : public IndexConverter { reformer_params.set(INTEGER_STREAMING_REFORMER_IS_EUCLIDEAN, true); } - // Compute padded dimension and create rotator if rotation is enabled - size_t padded_dim = index_meta.dimension(); + // Create rotator if rotation is enabled if (enable_rotate_) { - size_t dim = index_meta.dimension(); - padded_dim = ((dim + 63) / 64) * 64; rotator_ = std::make_shared(); - rotator_->init(dim, padded_dim); + rotator_->init(index_meta.dimension()); LOG_DEBUG( - "IntegerStreamingConverter: rotation enabled, dim=%zu, " - "padded_dim=%zu", - dim, padded_dim); + "IntegerStreamingConverter: rotation enabled, dim=%zu", + static_cast(index_meta.dimension())); } if (data_type_ == IndexMeta::DataType::DT_INT8) { @@ -426,7 +422,7 @@ class IntegerStreamingConverter : public IndexConverter { metric_params.set(QUANTIZED_INTEGER_METRIC_ORIGIN_METRIC_PARAMS, index_meta.metric_params()); meta_.set_metric("QuantizedInteger", 0, metric_params); - meta_.set_meta(data_type_, padded_dim + ExtraDimension(data_type_)); + meta_.set_meta(data_type_, meta_.dimension() + ExtraDimension(data_type_)); return 0; } @@ -503,8 +499,8 @@ class IntegerStreamingConverter : public IndexConverter { IndexHolder::Iterator::Pointer &&iter) : owner_(owner), buffer_(owner->element_size(), 0), - normalize_buffer_(owner->padded_dim() * sizeof(float), 0), - rotate_buffer_(owner->padded_dim() * sizeof(float), 0), + normalize_buffer_(owner->dimension_ * sizeof(float), 0), + rotate_buffer_(owner->dimension_ * sizeof(float), 0), front_iter_(std::move(iter)) { this->encode_record(); } @@ -539,7 +535,7 @@ class IntegerStreamingConverter : public IndexConverter { if (front_iter_->is_valid()) { const float *vec = reinterpret_cast(front_iter_->data()); - size_t pdim = owner_->padded_dim(); + size_t dim = owner_->dimension_; if (owner_->rotator_) { 
float *rotate_buf = reinterpret_cast(rotate_buffer_.data()); @@ -548,13 +544,13 @@ class IntegerStreamingConverter : public IndexConverter { } if (owner_->enable_normalize_) { float norm = 0.0; - memcpy((void *)normalize_buffer_.data(), vec, pdim * sizeof(float)); + memcpy((void *)normalize_buffer_.data(), vec, dim * sizeof(float)); ailego::Normalizer::L2((float *)normalize_buffer_.data(), - pdim, &norm); + dim, &norm); vec = (float *)normalize_buffer_.data(); } - RecordQuantizer::quantize_record(vec, pdim, owner_->data_type(), + RecordQuantizer::quantize_record(vec, dim, owner_->data_type(), owner_->is_euclidean_, buffer_.data()); } @@ -580,12 +576,6 @@ class IntegerStreamingConverter : public IndexConverter { is_euclidean_(is_euclidean), rotator_(std::move(rotator)) {} - //! Retrieve padded dimension - size_t padded_dim(void) const { - return rotator_ ? rotator_->padded_dim() - : static_cast(dimension_); - } - //! Retrieve count of elements in holder (-1 indicates unknown) size_t count(void) const override { return front_->count(); @@ -593,7 +583,7 @@ class IntegerStreamingConverter : public IndexConverter { //! Retrieve dimension size_t dimension(void) const override { - return padded_dim() + ExtraDimension(data_type_); + return dimension_ + ExtraDimension(data_type_); } //! 
Retrieve type information diff --git a/src/core/quantizer/integer_quantizer_reformer.cc b/src/core/quantizer/integer_quantizer_reformer.cc index b9c9e8278..b77b98aa0 100644 --- a/src/core/quantizer/integer_quantizer_reformer.cc +++ b/src/core/quantizer/integer_quantizer_reformer.cc @@ -319,9 +319,8 @@ class IntegerStreamingReformer : public IndexReformer { } else { enable_rotate_ = true; LOG_DEBUG( - "IntegerStreamingReformer: rotator auto-loaded, origin_dim=%zu, " - "padded_dim=%zu", - rotator_->dimension(), rotator_->padded_dim()); + "IntegerStreamingReformer: rotator auto-loaded, dim=%zu", + rotator_->dimension()); } } return 0; @@ -349,7 +348,7 @@ class IntegerStreamingReformer : public IndexReformer { const float *vec = reinterpret_cast(query); std::unique_ptr rotate_buffer; if (enable_rotate_ && rotator_) { - rotate_buffer.reset(new float[rotator_->padded_dim()]); + rotate_buffer.reset(new float[rotator_->dimension()]); rotator_->rotate(vec, rotate_buffer.get()); vec = rotate_buffer.get(); } @@ -381,7 +380,7 @@ class IntegerStreamingReformer : public IndexReformer { std::unique_ptr rotate_buffer; std::unique_ptr normalized; if (enable_rotate_ && rotator_) { - rotate_buffer.reset(new float[rotator_->padded_dim()]); + rotate_buffer.reset(new float[rotator_->dimension()]); } if (enable_normalize_) { normalized.reset(new float[qmeta.dimension()]); @@ -422,7 +421,7 @@ class IntegerStreamingReformer : public IndexReformer { const float *vec = reinterpret_cast(record); std::unique_ptr rotate_buffer; if (enable_rotate_ && rotator_) { - rotate_buffer.reset(new float[rotator_->padded_dim()]); + rotate_buffer.reset(new float[rotator_->dimension()]); rotator_->rotate(vec, rotate_buffer.get()); vec = rotate_buffer.get(); } @@ -455,7 +454,7 @@ class IntegerStreamingReformer : public IndexReformer { std::unique_ptr rotate_buffer; std::unique_ptr normalized; if (enable_rotate_ && rotator_) { - rotate_buffer.reset(new float[rotator_->padded_dim()]); + rotate_buffer.reset(new 
float[rotator_->dimension()]); } if (enable_normalize_) { normalized.reset(new float[rmeta.dimension()]); @@ -506,18 +505,23 @@ class IntegerStreamingReformer : public IndexReformer { return IndexError_Unsupported; } - const size_t origin_dim = qmeta.dimension() - extra_dimension_; - out->resize(origin_dim * sizeof(float)); - float *out_buf = reinterpret_cast(out->data()); + const size_t stored_dim = qmeta.dimension() - extra_dimension_; if (enable_rotate_ && rotator_) { - // First unquantize into a temporary buffer, then inverse rotate - std::vector unq_buf(origin_dim); - RecordQuantizer::unquantize_record(in, origin_dim, data_type_, + // Unquantize to stored_dim floats, then inverse rotate to dim floats + const size_t dim = rotator_->dimension(); + out->resize(dim * sizeof(float)); + float *out_buf = reinterpret_cast(out->data()); + + std::vector unq_buf(stored_dim); + RecordQuantizer::unquantize_record(in, stored_dim, data_type_, unq_buf.data()); rotator_->unrotate(unq_buf.data(), out_buf); } else { - RecordQuantizer::unquantize_record(in, origin_dim, data_type_, out_buf); + // No rotation: stored dim == original dim + out->resize(stored_dim * sizeof(float)); + float *out_buf = reinterpret_cast(out->data()); + RecordQuantizer::unquantize_record(in, stored_dim, data_type_, out_buf); } return 0; diff --git a/src/core/quantizer/record_rotator.cc b/src/core/quantizer/record_rotator.cc index 67ace63f6..4b89e9c45 100644 --- a/src/core/quantizer/record_rotator.cc +++ b/src/core/quantizer/record_rotator.cc @@ -20,6 +20,13 @@ #include #include +// Eigen headers from rabitqlib — used by MatrixRotator for numerically stable +// HouseholderQR orthogonalisation and vectorised matrix multiplication. 
+#include "rabitqlib/defines.hpp" +#include "rabitqlib/utils/space.hpp" +#include +#include + #if defined(__AVX2__) || defined(__AVX512F__) #include // FFHT (Fastest Fast Hadamard Transform) — hand-tuned AVX inline assembly @@ -205,6 +212,54 @@ void kacs_walk(float *data, size_t len) { #endif } +//! Inverse Kac walk: undo butterfly add/sub with 0.5 factor. +//! If forward maps (x,y) -> (x+y, x-y), inverse maps (a,b) -> ((a+b)/2, (a-b)/2). +void inv_kacs_walk(float *data, size_t len) { + size_t half = len / 2; +#if defined(__AVX512F__) + const __m512 half_fac = _mm512_set1_ps(0.5f); + for (size_t i = 0; i < half; i += 16) { + __m512 a = _mm512_loadu_ps(&data[i]); + __m512 b = _mm512_loadu_ps(&data[i + half]); + _mm512_storeu_ps(&data[i], _mm512_mul_ps(_mm512_add_ps(a, b), half_fac)); + _mm512_storeu_ps(&data[i + half], + _mm512_mul_ps(_mm512_sub_ps(a, b), half_fac)); + } +#elif defined(__AVX2__) + const __m256 half_fac = _mm256_set1_ps(0.5f); + for (size_t i = 0; i < half; i += 8) { + __m256 a = _mm256_loadu_ps(&data[i]); + __m256 b = _mm256_loadu_ps(&data[i + half]); + _mm256_storeu_ps(&data[i], _mm256_mul_ps(_mm256_add_ps(a, b), half_fac)); + _mm256_storeu_ps(&data[i + half], + _mm256_mul_ps(_mm256_sub_ps(a, b), half_fac)); + } +#elif defined(__ARM_NEON) && defined(__aarch64__) + const float32x4_t half_fac = vdupq_n_f32(0.5f); + for (size_t i = 0; i < half; i += 4) { + float32x4_t a = vld1q_f32(&data[i]); + float32x4_t b = vld1q_f32(&data[i + half]); + vst1q_f32(&data[i], vmulq_f32(vaddq_f32(a, b), half_fac)); + vst1q_f32(&data[i + half], vmulq_f32(vsubq_f32(a, b), half_fac)); + } +#elif defined(__SSE2__) + const __m128 half_fac = _mm_set1_ps(0.5f); + for (size_t i = 0; i < half; i += 4) { + __m128 a = _mm_loadu_ps(&data[i]); + __m128 b = _mm_loadu_ps(&data[i + half]); + _mm_storeu_ps(&data[i], _mm_mul_ps(_mm_add_ps(a, b), half_fac)); + _mm_storeu_ps(&data[i + half], _mm_mul_ps(_mm_sub_ps(a, b), half_fac)); + } +#else + for (size_t i = 0; i < half; ++i) { + float 
a = data[i]; + float b = data[i + half]; + data[i] = (a + b) * 0.5f; + data[i + half] = (a - b) * 0.5f; + } +#endif +} + //! Scale each element by a constant factor. void vec_rescale(float *data, size_t n, float factor) { for (size_t i = 0; i < n; ++i) { @@ -237,6 +292,11 @@ void write_u32_le(char *p, uint32_t v) { // ============================================================================ // FhtKacRotatorImpl - O(d log d) FHT-based Kac random rotation +// +// Requires dimension % 64 == 0 for SIMD flip-sign correctness. +// When dimension is also a power of 2, uses 4 rounds of (flip -> FHT -> rescale). +// When dimension is 64-aligned but NOT a power of 2 (e.g. 192, 320), +// uses kacs_walk reduction to handle the non-power-of-2 case. // ============================================================================ struct FhtKacRotatorImpl { @@ -246,72 +306,69 @@ struct FhtKacRotatorImpl { static constexpr size_t kByteLen = 8; - void init(size_t /*dim*/, size_t padded_dim) { - flip.resize(4 * padded_dim / kByteLen); + void init(size_t dim) { + flip.resize(4 * dim / kByteLen); std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution dist(0, 255); for (auto &b : flip) b = static_cast(dist(gen)); } - void rotate(const float *in, float *out, size_t dim, - size_t padded_dim) const { + void rotate(const float *in, float *out, size_t dim) const { std::memcpy(out, in, sizeof(float) * dim); - std::fill(out + dim, out + padded_dim, 0.0f); - if (trunc_dim == padded_dim) { + if (trunc_dim == dim) { // Exact power-of-2: 4 rounds of (flip -> FHT -> rescale) - flip_sign(flip.data(), out, padded_dim); + flip_sign(flip.data(), out, dim); fht_inplace(out, trunc_dim); vec_rescale(out, trunc_dim, fac); - flip_sign(flip.data() + padded_dim / kByteLen, out, padded_dim); + flip_sign(flip.data() + dim / kByteLen, out, dim); fht_inplace(out, trunc_dim); vec_rescale(out, trunc_dim, fac); - flip_sign(flip.data() + 2 * padded_dim / kByteLen, out, padded_dim); + 
flip_sign(flip.data() + 2 * dim / kByteLen, out, dim); fht_inplace(out, trunc_dim); vec_rescale(out, trunc_dim, fac); - flip_sign(flip.data() + 3 * padded_dim / kByteLen, out, padded_dim); + flip_sign(flip.data() + 3 * dim / kByteLen, out, dim); fht_inplace(out, trunc_dim); vec_rescale(out, trunc_dim, fac); return; } - // Non-power-of-2: 4 rounds with kacs_walk reduction. - // FHT always operates on trunc_dim (largest power-of-2 <= dim), - // matching the original rabitqlib behavior. - size_t start = padded_dim - trunc_dim; + // Non-power-of-2 (64-aligned, e.g. 192, 320): 4 rounds with kacs_walk + // reduction. FHT always operates on trunc_dim (largest power-of-2 <= dim). + size_t start = dim - trunc_dim; float *trunc_ptr = out + start; // Round 1: FHT on [0, trunc_dim) - flip_sign(flip.data(), out, padded_dim); + flip_sign(flip.data(), out, dim); fht_inplace(out, trunc_dim); vec_rescale(out, trunc_dim, fac); - kacs_walk(out, padded_dim); + kacs_walk(out, dim); // Round 2: FHT on [start, start + trunc_dim) - flip_sign(flip.data() + padded_dim / kByteLen, out, padded_dim); + flip_sign(flip.data() + dim / kByteLen, out, dim); fht_inplace(trunc_ptr, trunc_dim); vec_rescale(trunc_ptr, trunc_dim, fac); - kacs_walk(out, padded_dim); + kacs_walk(out, dim); // Round 3: FHT on [0, trunc_dim) - flip_sign(flip.data() + 2 * padded_dim / kByteLen, out, padded_dim); + flip_sign(flip.data() + 2 * dim / kByteLen, out, dim); fht_inplace(out, trunc_dim); vec_rescale(out, trunc_dim, fac); - kacs_walk(out, padded_dim); + kacs_walk(out, dim); // Round 4: FHT on [start, start + trunc_dim) - flip_sign(flip.data() + 3 * padded_dim / kByteLen, out, padded_dim); + flip_sign(flip.data() + 3 * dim / kByteLen, out, dim); fht_inplace(trunc_ptr, trunc_dim); vec_rescale(trunc_ptr, trunc_dim, fac); - kacs_walk(out, padded_dim); + kacs_walk(out, dim); // Final rescale: combine the 4 kacs_walk reductions - vec_rescale(out, padded_dim, 0.25f); + vec_rescale(out, dim, 0.25f); } void save(char *data) 
const { @@ -325,71 +382,93 @@ struct FhtKacRotatorImpl { size_t dump_bytes() const { return flip.size(); } + + void unrotate(const float *in, float *out, size_t dim) const { + // Copy input into working buffer + std::vector data(in, in + dim); + + if (trunc_dim == dim) { + // Exact power-of-2: reverse 4 rounds in reverse order. + // Forward per round: flip -> fht -> rescale(fac) + // Reverse per round: rescale(1/fac) -> inv_fht -> flip + // Combined: fht + rescale(1/sqrt(trunc_dim)) + flip + const float inv_fac = 1.0f / std::sqrt(static_cast(trunc_dim)); + for (int round = 3; round >= 0; --round) { + fht_inplace(data.data(), trunc_dim); + vec_rescale(data.data(), trunc_dim, inv_fac); + flip_sign(flip.data() + round * dim / kByteLen, data.data(), dim); + } + std::memcpy(out, data.data(), dim * sizeof(float)); + return; + } + + // Non-power-of-2: undo final rescale(0.25) first + vec_rescale(data.data(), dim, 4.0f); + + // Reverse 4 rounds in reverse order. + // Forward round: flip -> fht -> rescale(fac) -> kacs_walk + // Reverse: inv_kacs_walk -> rescale(1/fac) -> inv_fht -> flip + // Combined inv_fht: fht + rescale(1/sqrt(trunc_dim)) + const float inv_fac = 1.0f / std::sqrt(static_cast(trunc_dim)); + size_t start = dim - trunc_dim; + float *trunc_ptr = data.data() + start; + + // Undo Round 4 (FHT on [start, start+trunc_dim)) + inv_kacs_walk(data.data(), dim); + fht_inplace(trunc_ptr, trunc_dim); + vec_rescale(trunc_ptr, trunc_dim, inv_fac); + flip_sign(flip.data() + 3 * dim / kByteLen, data.data(), dim); + + // Undo Round 3 (FHT on [0, trunc_dim)) + inv_kacs_walk(data.data(), dim); + fht_inplace(data.data(), trunc_dim); + vec_rescale(data.data(), trunc_dim, inv_fac); + flip_sign(flip.data() + 2 * dim / kByteLen, data.data(), dim); + + // Undo Round 2 (FHT on [start, start+trunc_dim)) + inv_kacs_walk(data.data(), dim); + fht_inplace(trunc_ptr, trunc_dim); + vec_rescale(trunc_ptr, trunc_dim, inv_fac); + flip_sign(flip.data() + dim / kByteLen, data.data(), dim); + + 
// Undo Round 1 (FHT on [0, trunc_dim)) + inv_kacs_walk(data.data(), dim); + fht_inplace(data.data(), trunc_dim); + vec_rescale(data.data(), trunc_dim, inv_fac); + flip_sign(flip.data(), data.data(), dim); + + std::memcpy(out, data.data(), dim * sizeof(float)); + } }; // ============================================================================ // MatrixRotatorImpl - O(d^2) random orthogonal matrix rotation +// +// No alignment requirement on dimension. Uses a dim x dim square orthogonal +// matrix generated via Householder QR on a random Gaussian matrix. // ============================================================================ struct MatrixRotatorImpl { - std::vector matrix; // dim x padded_dim, row-major + std::vector matrix; // dim x dim, row-major - void init(size_t dim, size_t padded_dim) { - std::random_device rd; - std::mt19937 gen(rd()); - std::normal_distribution normal(0.0f, 1.0f); - - // Generate padded_dim random Gaussian vectors of length padded_dim - std::vector q(padded_dim * padded_dim); - for (auto &v : q) v = normal(gen); - - // Modified Gram-Schmidt orthogonalization - for (size_t i = 0; i < padded_dim; ++i) { - float *qi = &q[i * padded_dim]; - - // Subtract projections onto all previous basis vectors - for (size_t j = 0; j < i; ++j) { - const float *qj = &q[j * padded_dim]; - float dot = 0.0f; - for (size_t k = 0; k < padded_dim; ++k) dot += qi[k] * qj[k]; - for (size_t k = 0; k < padded_dim; ++k) qi[k] -= dot * qj[k]; - } + void init(size_t dim) { + // Generate dim x dim random Gaussian matrix + rabitqlib::RowMajorMatrix rand_mat = + rabitqlib::random_gaussian_matrix(dim, dim); - // Normalize - float norm = 0.0f; - for (size_t k = 0; k < padded_dim; ++k) norm += qi[k] * qi[k]; - norm = std::sqrt(norm); - - if (norm < 1e-10f) { - // Degenerate vector: re-randomize and re-orthogonalize - for (size_t k = 0; k < padded_dim; ++k) qi[k] = normal(gen); - for (size_t j = 0; j < i; ++j) { - const float *qj = &q[j * padded_dim]; - float dot = 
0.0f; - for (size_t k = 0; k < padded_dim; ++k) dot += qi[k] * qj[k]; - for (size_t k = 0; k < padded_dim; ++k) qi[k] -= dot * qj[k]; - } - norm = 0.0f; - for (size_t k = 0; k < padded_dim; ++k) norm += qi[k] * qi[k]; - norm = std::sqrt(norm); - } - for (size_t k = 0; k < padded_dim; ++k) qi[k] /= norm; - } + // Householder QR: numerically stable orthogonalisation + Eigen::HouseholderQR> qr(rand_mat); + rabitqlib::RowMajorMatrix q_inv = qr.householderQ().transpose(); - // Keep only the first dim rows (the rest are zero-padded in input) - matrix.resize(dim * padded_dim); - std::memcpy(matrix.data(), q.data(), dim * padded_dim * sizeof(float)); + matrix.resize(dim * dim); + std::memcpy(matrix.data(), &q_inv(0, 0), sizeof(float) * dim * dim); } - void rotate(const float *in, float *out, size_t dim, - size_t padded_dim) const { - for (size_t i = 0; i < padded_dim; ++i) { - float sum = 0.0f; - for (size_t j = 0; j < dim; ++j) { - sum += matrix[j * padded_dim + i] * in[j]; - } - out[i] = sum; - } + void rotate(const float *in, float *out, size_t dim) const { + // v (1 x dim) * M (dim x dim) -> rv (1 x dim) + rabitqlib::ConstRowMajorMatrixMap v(in, 1, dim); + rabitqlib::RowMajorMatrixMap rv(out, 1, dim); + rv = v * rabitqlib::ConstRowMajorMatrixMap(matrix.data(), dim, dim); } void save(char *data) const { @@ -403,6 +482,14 @@ struct MatrixRotatorImpl { size_t dump_bytes() const { return matrix.size() * sizeof(float); } + + //! Inverse rotate using M^T (transpose of the dim x dim orthogonal matrix). 
+ void unrotate(const float *in, float *out, size_t dim) const { + // M^T (dim x dim) * in (dim x 1) -> out (dim x 1) + rabitqlib::ConstRowMajorMatrixMap v(in, dim, 1); + rabitqlib::RowMajorMatrixMap rv(out, dim, 1); + rv = rabitqlib::ConstRowMajorMatrixMap(matrix.data(), dim, dim).transpose() * v; + } }; } // anonymous namespace @@ -412,45 +499,49 @@ struct MatrixRotatorImpl { // ============================================================================ struct RecordRotator::Impl { - //! Header layout must match the original struct on x86_64: - //! type(1B) + padding(3B) + origin_dim(4B) + padded_dim(4B) = 12B - //! This preserves backward compatibility with existing serialized data. + //! Header layout (12 bytes, backward-compatible with older serialised data): + //! type(1B) + padding(3B) + origin_dim(4B) + reserved(4B) = 12B + //! The reserved field previously stored padded_dim; it now mirrors origin_dim. static constexpr size_t kHeaderSize = 12; struct Header { uint8_t type; uint32_t origin_dim; - uint32_t padded_dim; + uint32_t reserved; // backward-compat placeholder (was padded_dim) void write_to(char *buf) const { std::memset(buf, 0, kHeaderSize); // zero-fill padding buf[0] = static_cast(type); write_u32_le(buf + 4, origin_dim); - write_u32_le(buf + 8, padded_dim); + write_u32_le(buf + 8, reserved); } void read_from(const char *buf) { type = static_cast(buf[0]); origin_dim = read_u32_le(buf + 4); - padded_dim = read_u32_le(buf + 8); + // reserved (buf+8) is intentionally ignored for forward compatibility } }; size_t dimension{0}; - size_t padded_dim{0}; RecordRotatorType type{RecordRotatorType::FhtKac}; std::unique_ptr fht_impl; std::unique_ptr mat_impl; - //! 
Inverse rotation matrix, column-major: padded_dim columns x dimension rows - std::vector inv_matrix; - void do_rotate(const float *in, float *out) const { if (fht_impl) { - fht_impl->rotate(in, out, dimension, padded_dim); + fht_impl->rotate(in, out, dimension); + } else { + mat_impl->rotate(in, out, dimension); + } + } + + void do_unrotate(const float *in, float *out) const { + if (fht_impl) { + fht_impl->unrotate(in, out, dimension); } else { - mat_impl->rotate(in, out, dimension, padded_dim); + mat_impl->unrotate(in, out, dimension); } } @@ -487,25 +578,34 @@ RecordRotator::~RecordRotator() = default; RecordRotator::RecordRotator(RecordRotator &&) noexcept = default; RecordRotator &RecordRotator::operator=(RecordRotator &&) noexcept = default; -void RecordRotator::init(size_t dimension, size_t padded_dim, - RecordRotatorType rotator_type) { +void RecordRotator::init(size_t dimension, RecordRotatorType rotator_type) { impl_->dimension = dimension; - impl_->padded_dim = padded_dim; - impl_->type = rotator_type; - if (rotator_type == RecordRotatorType::FhtKac) { + // Auto-select implementation based on dimension alignment when FhtKac + // is requested. FhtKac requires the dimension to be a multiple of 64 + // for SIMD flip-sign and FHT correctness. When the dimension is not + // 64-aligned we transparently fall back to the O(d^2) Matrix rotator. 
+ bool use_fht = (rotator_type == RecordRotatorType::FhtKac) && + (dimension % 64 == 0); + + if (use_fht) { + impl_->type = RecordRotatorType::FhtKac; impl_->fht_impl = std::make_unique(); impl_->fht_impl->trunc_dim = floor_pow2(dimension); impl_->fht_impl->fac = 1.0f / std::sqrt(static_cast(impl_->fht_impl->trunc_dim)); - impl_->fht_impl->init(dimension, padded_dim); + impl_->fht_impl->init(dimension); } else { + if (rotator_type == RecordRotatorType::FhtKac) { + LOG_DEBUG( + "RecordRotator::init: dimension %zu is not 64-aligned, " + "falling back from FhtKac to Matrix rotator", + dimension); + } + impl_->type = RecordRotatorType::Matrix; impl_->mat_impl = std::make_unique(); - impl_->mat_impl->init(dimension, padded_dim); + impl_->mat_impl->init(dimension); } - - // Build inverse rotation data for unrotate support - build_inverse(); } void RecordRotator::rotate(const float *in, float *out) const { @@ -513,61 +613,17 @@ void RecordRotator::rotate(const float *in, float *out) const { } std::vector RecordRotator::rotate(const float *in) const { - std::vector out(impl_->padded_dim); + std::vector out(impl_->dimension); impl_->do_rotate(in, out.data()); return out; } -void RecordRotator::build_inverse() { - if (!impl_->fht_impl && !impl_->mat_impl) { - LOG_ERROR("RecordRotator::build_inverse: rotator not initialized"); - return; - } - - const size_t dim = impl_->dimension; - const size_t pdim = impl_->padded_dim; - - // Allocate column-major storage: padded_dim columns, each dim floats - impl_->inv_matrix.resize(pdim * dim, 0.0f); - - // Compute rotation matrix by rotating each standard basis vector e_i. - // R * e_i = i-th column of R, stored as inv_matrix[i * dim + j]. 
- std::vector basis(dim, 0.0f); - std::vector rotated(pdim, 0.0f); - - for (size_t i = 0; i < pdim; ++i) { - std::fill(basis.begin(), basis.end(), 0.0f); - if (i < dim) { - basis[i] = 1.0f; - } - impl_->do_rotate(basis.data(), rotated.data()); - for (size_t j = 0; j < dim; ++j) { - impl_->inv_matrix[i * dim + j] = rotated[j]; - } - } - - LOG_DEBUG("RecordRotator::build_inverse done: dim=%zu, padded_dim=%zu", dim, - pdim); -} - void RecordRotator::unrotate(const float *in, float *out) const { - if (impl_->inv_matrix.empty()) { - LOG_ERROR("RecordRotator::unrotate: build_inverse() not called"); + if (!impl_->fht_impl && !impl_->mat_impl) { + LOG_ERROR("RecordRotator::unrotate: rotator not initialized"); return; } - - const size_t dim = impl_->dimension; - - // Compute x = R^T * y, where y is the dim-dimensional input. - // x[j] = sum_{i=0}^{dim-1} inv_matrix[i * dim + j] * in[i] - std::vector tmp(dim, 0.0f); - for (size_t i = 0; i < dim; ++i) { - const float yi = in[i]; - for (size_t j = 0; j < dim; ++j) { - tmp[j] += impl_->inv_matrix[i * dim + j] * yi; - } - } - std::memcpy(out, tmp.data(), dim * sizeof(float)); + impl_->do_unrotate(in, out); } std::vector RecordRotator::unrotate(const float *in) const { @@ -595,7 +651,7 @@ int RecordRotator::dump(const IndexStorage::Pointer &storage, return (size + 0x1F) & (~0x1F); }; - // Serialize: [Header: type|origin_dim|padded_dim] [rotation blob] + // Serialize: [Header: type|origin_dim|reserved] [rotation blob] const size_t blob_size = impl_->blob_bytes(); const size_t data_size = Impl::kHeaderSize + blob_size; const size_t total_size = align_size(data_size); @@ -604,7 +660,7 @@ int RecordRotator::dump(const IndexStorage::Pointer &storage, Impl::Header header; header.type = static_cast(impl_->type); header.origin_dim = static_cast(impl_->dimension); - header.padded_dim = static_cast(impl_->padded_dim); + header.reserved = static_cast(impl_->dimension); // backward compat header.write_to(buffer.data()); 
impl_->save_blob(buffer.data() + Impl::kHeaderSize); @@ -651,7 +707,7 @@ int RecordRotator::dump(const IndexDumper::Pointer &dumper, return IndexError_NoReady; } - // Serialize: [Header: type|origin_dim|padded_dim] [rotation blob] + // Serialize: [Header: type|origin_dim|reserved] [rotation blob] const size_t blob_size = impl_->blob_bytes(); const size_t data_size = Impl::kHeaderSize + blob_size; const size_t total_size = (data_size + 0x1F) & (~0x1F); @@ -660,7 +716,7 @@ int RecordRotator::dump(const IndexDumper::Pointer &dumper, Impl::Header header; header.type = static_cast(impl_->type); header.origin_dim = static_cast(impl_->dimension); - header.padded_dim = static_cast(impl_->padded_dim); + header.reserved = static_cast(impl_->dimension); // backward compat header.write_to(buffer.data()); impl_->save_blob(buffer.data() + Impl::kHeaderSize); @@ -728,19 +784,18 @@ int RecordRotator::open(IndexStorage::Pointer storage, } } - // Parse self-describing header + // Parse self-describing header (reserved field is ignored) const char *raw = reinterpret_cast(block.data()); Impl::Header header; header.read_from(raw); impl_->type = static_cast(header.type); impl_->dimension = static_cast(header.origin_dim); - impl_->padded_dim = static_cast(header.padded_dim); // Reconstruct the rotator from header info and load blob if (impl_->type == RecordRotatorType::FhtKac) { impl_->fht_impl = std::make_unique(); - impl_->fht_impl->flip.resize(4 * impl_->padded_dim / + impl_->fht_impl->flip.resize(4 * impl_->dimension / FhtKacRotatorImpl::kByteLen); impl_->fht_impl->trunc_dim = floor_pow2(impl_->dimension); impl_->fht_impl->fac = @@ -748,45 +803,34 @@ int RecordRotator::open(IndexStorage::Pointer storage, impl_->fht_impl->load(raw + Impl::kHeaderSize); } else { impl_->mat_impl = std::make_unique(); - impl_->mat_impl->matrix.resize(impl_->dimension * impl_->padded_dim); + impl_->mat_impl->matrix.resize(impl_->dimension * impl_->dimension); impl_->mat_impl->load(raw + 
Impl::kHeaderSize); } LOG_DEBUG( - "RecordRotator::open done: seg=%s, dim=%zu, padded_dim=%zu, " - "data_size=%zu", - seg_id.c_str(), impl_->dimension, impl_->padded_dim, data_size); - - // Build inverse rotation data for unrotate support - build_inverse(); + "RecordRotator::open done: seg=%s, dim=%zu, data_size=%zu", + seg_id.c_str(), impl_->dimension, data_size); return 0; } -int RecordRotator::load(const float *matrix, size_t dimension, - size_t padded_dim) { +int RecordRotator::load(const float *matrix, size_t dimension) { if (!matrix) { LOG_ERROR("RecordRotator::load: null matrix"); return IndexError_InvalidArgument; } - if (dimension == 0 || padded_dim == 0) { - LOG_ERROR("RecordRotator::load: invalid dims %zu x %zu", dimension, - padded_dim); + if (dimension == 0) { + LOG_ERROR("RecordRotator::load: invalid dim %zu", dimension); return IndexError_InvalidArgument; } impl_->dimension = dimension; - impl_->padded_dim = padded_dim; impl_->type = RecordRotatorType::Matrix; impl_->mat_impl = std::make_unique(); - impl_->mat_impl->matrix.resize(dimension * padded_dim); + impl_->mat_impl->matrix.resize(dimension * dimension); impl_->mat_impl->load(reinterpret_cast(matrix)); - LOG_DEBUG("RecordRotator::load done: dim=%zu, padded_dim=%zu", dimension, - padded_dim); - - // Build inverse rotation data for unrotate support - build_inverse(); + LOG_DEBUG("RecordRotator::load done: dim=%zu", dimension); return 0; } @@ -795,10 +839,6 @@ size_t RecordRotator::dimension() const { return impl_->dimension; } -size_t RecordRotator::padded_dim() const { - return impl_->padded_dim; -} - RecordRotatorType RecordRotator::rotator_type() const { return impl_->type; } diff --git a/src/core/quantizer/record_rotator.h b/src/core/quantizer/record_rotator.h index 4db73b06e..e9423ee47 100644 --- a/src/core/quantizer/record_rotator.h +++ b/src/core/quantizer/record_rotator.h @@ -37,9 +37,14 @@ enum class RecordRotatorType : uint8_t { * All rotation algorithms are implemented inline 
(FHT-based Kac walk and * explicit random matrix), so no rabitqlib headers are required. * - * Provides O(d log d) fast rotation (FHT-based Kac random rotation), - * as well as serialization (save/load) of the rotation parameters. - * Used by IntegerStreamingConverter/Reformer when enable_rotate is true. + * Auto-selects the rotation algorithm based on dimension alignment: + * - dimension % 64 == 0 -> FhtKac (O(d log d), requires 64-alignment) + * - otherwise -> Matrix (O(d^2), no alignment requirement) + * + * Rotation preserves dimension: output size == input size (no padding). + * + * Used by IntegerStreamingConverter/Reformer and CosineConverter/Reformer + * when enable_rotate is true. */ class RecordRotator { public: @@ -52,39 +57,36 @@ class RecordRotator { RecordRotator(const RecordRotator &) = delete; RecordRotator &operator=(const RecordRotator &) = delete; - //! Initialize the rotator - //! @param dimension original vector dimension - //! @param padded_dim padded dimension (rounded up for SIMD alignment) - //! @param rotator_type rotation algorithm (default: FhtKac) - void init(size_t dimension, size_t padded_dim, + //! Initialize the rotator. + //! Auto-selects FhtKac when dimension is 64-aligned, else falls back to + //! Matrix. The @p rotator_type parameter can force Matrix explicitly. + //! @param dimension vector dimension (input and output size) + //! @param rotator_type rotation algorithm (default: FhtKac, auto-degrades + //! to Matrix when dimension is not 64-aligned) + void init(size_t dimension, RecordRotatorType rotator_type = RecordRotatorType::FhtKac); //! Rotate a single vector //! @param in input vector of size >= dimension - //! @param out output buffer of size >= padded_dim + //! @param out output buffer of size >= dimension void rotate(const float *in, float *out) const; //! Rotate a single vector into a managed buffer //! @param in input vector of size >= dimension - //! @return vector of size padded_dim containing rotated result + //! 
@return vector of size dimension containing rotated result std::vector rotate(const float *in) const; //! Inverse-rotate a single vector (from rotated space back to original) - //! @param in input vector of size >= dimension (rotated, truncated) + //! @param in input vector of size >= dimension (rotated vector) //! @param out output buffer of size >= dimension (original space) void unrotate(const float *in, float *out) const; //! Inverse-rotate a single vector into a managed buffer - //! @param in input vector of size >= dimension (rotated, truncated) + //! @param in input vector of size >= dimension (rotated vector) //! @return vector of size dimension containing inverse-rotated //! result std::vector unrotate(const float *in) const; - //! Prepare internal data structures for inverse rotation. - //! Computes the rotation matrix by rotating basis vectors. - //! Must be called after init() or open() before using unrotate(). - void build_inverse(); - //! Return the serialized size of the rotator in bytes (header + blob) size_t dump_bytes() const; @@ -94,30 +96,26 @@ class RecordRotator { const std::string &seg_id = RECORD_ROTATOR_SEG_ID) const; //! Dump the rotator to an IndexDumper as a named segment. - //! Format: [Header: type(1B)|origin_dim(4B)|padded_dim(4B)] [rotation blob] + //! Format: [Header: type(1B)|origin_dim(4B)|reserved(4B)] [rotation blob] //! Appends padding for 32-byte alignment. int dump(const IndexDumper::Pointer &dumper, const std::string &seg_id = RECORD_ROTATOR_SEG_ID) const; //! Open the rotator from an IndexStorage segment (self-describing, no init - //! needed). Parses header to get type/dimension/padded_dim, then reconstructs - //! the rotator. + //! needed). Parses header to get type/dimension, then reconstructs the + //! rotator. int open(IndexStorage::Pointer storage, const std::string &seg_id = RECORD_ROTATOR_SEG_ID); //! Load a user-specified rotation matrix. //! Always uses MatrixRotator internally. - //! 
@param matrix row-major matrix of shape dimension x padded_dim - //! @param dimension original vector dimension - //! @param padded_dim padded dimension (must be multiple of 64) - int load(const float *matrix, size_t dimension, size_t padded_dim); + //! @param matrix row-major square matrix of shape dimension x dimension + //! @param dimension vector dimension + int load(const float *matrix, size_t dimension); - //! Return the original dimension + //! Return the vector dimension size_t dimension() const; - //! Return the padded dimension - size_t padded_dim() const; - //! Return the rotator type RecordRotatorType rotator_type() const; From bd782f908a1086a541e136bde7d9b0891a7de6ee Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 16 Jun 2026 14:39:46 +0800 Subject: [PATCH 28/38] tmp --- src/core/quantizer/cosine_reformer.cc | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/core/quantizer/cosine_reformer.cc b/src/core/quantizer/cosine_reformer.cc index b495fc944..fa8c2727f 100644 --- a/src/core/quantizer/cosine_reformer.cc +++ b/src/core/quantizer/cosine_reformer.cc @@ -106,25 +106,21 @@ class CosineReformer : public IndexReformer { size_t origin_dimension = qmeta.dimension(); const float *vec = reinterpret_cast(query); - - // Apply rotation if enabled - std::unique_ptr rotate_buffer; - if (enable_rotate_ && rotator_) { - rotate_buffer.reset(new float[rotator_->dimension()]); - rotator_->rotate(vec, rotate_buffer.get()); - vec = rotate_buffer.get(); - // rotation preserves dimension: origin_dimension stays qmeta.dimension() - } - - // Normalize (L2) float norm = 0.0f; + + // Fast path: no rotation — matches main branch behavior exactly std::string normalized_buffer(reinterpret_cast(query), qmeta.element_size()); float *buf = reinterpret_cast(&normalized_buffer[0]); + if (enable_rotate_ && rotator_) { - // Already rotated, normalize the rotated vector - ailego::Normalizer::L2(const_cast(vec), - 
origin_dimension, &norm); + // Rotate then normalize the rotated vector + std::vector rotate_buffer(rotator_->dimension()); + rotator_->rotate(vec, rotate_buffer.data()); + std::memcpy(buf, rotate_buffer.data(), + origin_dimension * sizeof(float)); + ailego::Normalizer::L2(buf, origin_dimension, &norm); + vec = buf; } else { ailego::Normalizer::L2(buf, origin_dimension, &norm); vec = buf; From 12e88c6b9f144359b474a81d17db2fd4b4b249a2 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 16 Jun 2026 15:23:23 +0800 Subject: [PATCH 29/38] Remove redundancy --- src/core/quantizer/cosine_converter.cc | 3 --- src/core/quantizer/cosine_reformer.cc | 3 +-- src/core/quantizer/integer_quantizer_converter.cc | 3 --- src/core/quantizer/integer_quantizer_reformer.cc | 1 - src/core/quantizer/quantizer_params.h | 4 ---- src/core/quantizer/record_rotator.h | 2 +- 6 files changed, 2 insertions(+), 14 deletions(-) diff --git a/src/core/quantizer/cosine_converter.cc b/src/core/quantizer/cosine_converter.cc index 02fb3dddc..4656ad029 100644 --- a/src/core/quantizer/cosine_converter.cc +++ b/src/core/quantizer/cosine_converter.cc @@ -288,9 +288,6 @@ class CosineConverter : public IndexConverter { params.get(COSINE_CONVERTER_ENABLE_ROTATE, &enable_rotate_); ailego::Params reformer_params; - if (enable_rotate_) { - reformer_params.set(COSINE_REFORMER_ENABLE_ROTATE, true); - } // Create rotator if rotation is enabled if (enable_rotate_) { diff --git a/src/core/quantizer/cosine_reformer.cc b/src/core/quantizer/cosine_reformer.cc index fa8c2727f..39102493e 100644 --- a/src/core/quantizer/cosine_reformer.cc +++ b/src/core/quantizer/cosine_reformer.cc @@ -45,8 +45,7 @@ class CosineReformer : public IndexReformer { dst_type_(IndexMeta::DataType::DT_UNDEFINED) {} //! 
Initialize Reformer - int init(const ailego::Params ¶ms) override { - params.get(COSINE_REFORMER_ENABLE_ROTATE, &enable_rotate_); + int init(const ailego::Params & /*params*/) override { return 0; } diff --git a/src/core/quantizer/integer_quantizer_converter.cc b/src/core/quantizer/integer_quantizer_converter.cc index 242074b02..bf1976520 100644 --- a/src/core/quantizer/integer_quantizer_converter.cc +++ b/src/core/quantizer/integer_quantizer_converter.cc @@ -384,9 +384,6 @@ class IntegerStreamingConverter : public IndexConverter { if (enable_normalize_) { reformer_params.set(INTEGER_STREAMING_REFORMER_ENABLE_NORMALIZE, true); } - if (enable_rotate_) { - reformer_params.set(INTEGER_STREAMING_REFORMER_ENABLE_ROTATE, true); - } is_euclidean_ = index_meta.metric_name() == "MipsSquaredEuclidean" || index_meta.metric_name() == "SquaredEuclidean" || diff --git a/src/core/quantizer/integer_quantizer_reformer.cc b/src/core/quantizer/integer_quantizer_reformer.cc index b77b98aa0..4dbe41b0d 100644 --- a/src/core/quantizer/integer_quantizer_reformer.cc +++ b/src/core/quantizer/integer_quantizer_reformer.cc @@ -288,7 +288,6 @@ class IntegerStreamingReformer : public IndexReformer { int init(const ailego::Params ¶ms) override { params.get(INTEGER_STREAMING_REFORMER_ENABLE_NORMALIZE, &enable_normalize_); params.get(INTEGER_STREAMING_REFORMER_IS_EUCLIDEAN, &is_euclidean_); - params.get(INTEGER_STREAMING_REFORMER_ENABLE_ROTATE, &enable_rotate_); return 0; } diff --git a/src/core/quantizer/quantizer_params.h b/src/core/quantizer/quantizer_params.h index 99377108b..d56c8591d 100644 --- a/src/core/quantizer/quantizer_params.h +++ b/src/core/quantizer/quantizer_params.h @@ -106,8 +106,6 @@ static const std::string COSINE_CONVERTER_ENABLE_ROTATE = //! CosineReformer static const std::string COSINE_REFORMER_FORCED_HALF_FLOAT = "cosine.reformer.forced_half_float"; -static const std::string COSINE_REFORMER_ENABLE_ROTATE = - "cosine.reformer.enable_rotate"; //! 
IntegerStreamingConverter static const std::string INTEGER_STREAMING_CONVERTER_ENABLE_NORMALIZE = @@ -120,8 +118,6 @@ static const std::string INTEGER_STREAMING_REFORMER_ENABLE_NORMALIZE = "integer_streaming.reformer.enable_normalize"; static const std::string INTEGER_STREAMING_REFORMER_IS_EUCLIDEAN = "integer_streaming.reformer.is_euclidean"; -static const std::string INTEGER_STREAMING_REFORMER_ENABLE_ROTATE = - "integer_streaming.reformer.enable_rotate"; //! UniformInt8StreamingConverter / Reformer static const std::string UNIFORM_INT8_REFORMER_SCALE = diff --git a/src/core/quantizer/record_rotator.h b/src/core/quantizer/record_rotator.h index e9423ee47..98734e7dc 100644 --- a/src/core/quantizer/record_rotator.h +++ b/src/core/quantizer/record_rotator.h @@ -24,7 +24,7 @@ namespace zvec { namespace core { //! Segment ID used when dumping/loading the rotator data -inline const std::string RECORD_ROTATOR_SEG_ID{"integer_streaming.rotator"}; +inline const std::string RECORD_ROTATOR_SEG_ID{"enable_rotate"}; //! Rotator type exposed without rabitqlib dependency enum class RecordRotatorType : uint8_t { From f5ff7841e3e3bb0109a4646e272d76fd53ef97cd Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 16 Jun 2026 15:30:27 +0800 Subject: [PATCH 30/38] macOS compatible --- src/core/quantizer/record_rotator.cc | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/core/quantizer/record_rotator.cc b/src/core/quantizer/record_rotator.cc index 4b89e9c45..bd5a2c74f 100644 --- a/src/core/quantizer/record_rotator.cc +++ b/src/core/quantizer/record_rotator.cc @@ -54,22 +54,15 @@ namespace { // Scalar / SIMD helper functions for rotation // ============================================================================ -//! Compute floor(log2(n)) for power-of-2 n. -inline int ilog2(size_t n) { - int r = 0; - while (n > 1) { - n >>= 1; - ++r; - } - return r; -} - //! In-place Fast Hadamard Transform on a power-of-2 length array. //! 
Uses FFHT hand-tuned AVX assembly when available; generic scalar loop //! otherwise (ARM NEON / SSE2 / pure scalar). void fht_inplace(float *data, size_t n) { #if defined(__AVX2__) || defined(__AVX512F__) - fht_float(data, ilog2(n)); + // Compute floor(log2(n)) for power-of-2 n. + int log_n = 0; + for (size_t v = n; v > 1; v >>= 1) ++log_n; + fht_float(data, log_n); #else for (size_t len = 1; len < n; len <<= 1) { for (size_t i = 0; i < n; i += len << 1) { From 879261abe1d1750cdd00e23f2dca4ed18df405b0 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 16 Jun 2026 15:34:17 +0800 Subject: [PATCH 31/38] macOS compatible --- src/core/quantizer/cosine_converter.cc | 3 +-- src/core/quantizer/cosine_reformer.cc | 5 ++-- .../quantizer/integer_quantizer_converter.cc | 5 ++-- .../quantizer/integer_quantizer_reformer.cc | 5 ++-- src/core/quantizer/record_rotator.cc | 27 ++++++++++--------- 5 files changed, 22 insertions(+), 23 deletions(-) diff --git a/src/core/quantizer/cosine_converter.cc b/src/core/quantizer/cosine_converter.cc index 4656ad029..df5231534 100644 --- a/src/core/quantizer/cosine_converter.cc +++ b/src/core/quantizer/cosine_converter.cc @@ -132,8 +132,7 @@ class CosineConverterHolder : public IndexHolder { float norm = 0.0f; ailego::Normalizer::L2(const_cast(vec), - original_dimension_, - &norm); + original_dimension_, &norm); if (type_ == IndexMeta::DataType::DT_FP32) { ::memcpy(reinterpret_cast(&normalize_buffer_[0]), vec, diff --git a/src/core/quantizer/cosine_reformer.cc b/src/core/quantizer/cosine_reformer.cc index 39102493e..b01cc8d71 100644 --- a/src/core/quantizer/cosine_reformer.cc +++ b/src/core/quantizer/cosine_reformer.cc @@ -69,9 +69,8 @@ class CosineReformer : public IndexReformer { rotator_.reset(); } else { enable_rotate_ = true; - LOG_DEBUG( - "CosineReformer: rotator auto-loaded, dim=%zu", - rotator_->dimension()); + LOG_DEBUG("CosineReformer: rotator auto-loaded, dim=%zu", + rotator_->dimension()); } } return 0; diff --git 
a/src/core/quantizer/integer_quantizer_converter.cc b/src/core/quantizer/integer_quantizer_converter.cc index bf1976520..b67914162 100644 --- a/src/core/quantizer/integer_quantizer_converter.cc +++ b/src/core/quantizer/integer_quantizer_converter.cc @@ -396,9 +396,8 @@ class IntegerStreamingConverter : public IndexConverter { if (enable_rotate_) { rotator_ = std::make_shared(); rotator_->init(index_meta.dimension()); - LOG_DEBUG( - "IntegerStreamingConverter: rotation enabled, dim=%zu", - static_cast(index_meta.dimension())); + LOG_DEBUG("IntegerStreamingConverter: rotation enabled, dim=%zu", + static_cast(index_meta.dimension())); } if (data_type_ == IndexMeta::DataType::DT_INT8) { diff --git a/src/core/quantizer/integer_quantizer_reformer.cc b/src/core/quantizer/integer_quantizer_reformer.cc index 4dbe41b0d..ea69e4644 100644 --- a/src/core/quantizer/integer_quantizer_reformer.cc +++ b/src/core/quantizer/integer_quantizer_reformer.cc @@ -317,9 +317,8 @@ class IntegerStreamingReformer : public IndexReformer { rotator_.reset(); } else { enable_rotate_ = true; - LOG_DEBUG( - "IntegerStreamingReformer: rotator auto-loaded, dim=%zu", - rotator_->dimension()); + LOG_DEBUG("IntegerStreamingReformer: rotator auto-loaded, dim=%zu", + rotator_->dimension()); } } return 0; diff --git a/src/core/quantizer/record_rotator.cc b/src/core/quantizer/record_rotator.cc index bd5a2c74f..8096828af 100644 --- a/src/core/quantizer/record_rotator.cc +++ b/src/core/quantizer/record_rotator.cc @@ -22,10 +22,10 @@ // Eigen headers from rabitqlib — used by MatrixRotator for numerically stable // HouseholderQR orthogonalisation and vectorised matrix multiplication. -#include "rabitqlib/defines.hpp" -#include "rabitqlib/utils/space.hpp" #include #include +#include "rabitqlib/defines.hpp" +#include "rabitqlib/utils/space.hpp" #if defined(__AVX2__) || defined(__AVX512F__) #include @@ -206,7 +206,8 @@ void kacs_walk(float *data, size_t len) { } //! 
Inverse Kac walk: undo butterfly add/sub with 0.5 factor. -//! If forward maps (x,y) -> (x+y, x-y), inverse maps (a,b) -> ((a+b)/2, (a-b)/2). +//! If forward maps (x,y) -> (x+y, x-y), inverse maps (a,b) -> ((a+b)/2, +//! (a-b)/2). void inv_kacs_walk(float *data, size_t len) { size_t half = len / 2; #if defined(__AVX512F__) @@ -287,8 +288,8 @@ void write_u32_le(char *p, uint32_t v) { // FhtKacRotatorImpl - O(d log d) FHT-based Kac random rotation // // Requires dimension % 64 == 0 for SIMD flip-sign correctness. -// When dimension is also a power of 2, uses 4 rounds of (flip -> FHT -> rescale). -// When dimension is 64-aligned but NOT a power of 2 (e.g. 192, 320), +// When dimension is also a power of 2, uses 4 rounds of (flip -> FHT -> +// rescale). When dimension is 64-aligned but NOT a power of 2 (e.g. 192, 320), // uses kacs_walk reduction to handle the non-power-of-2 case. // ============================================================================ @@ -481,7 +482,9 @@ struct MatrixRotatorImpl { // M^T (dim x dim) * in (dim x 1) -> out (dim x 1) rabitqlib::ConstRowMajorMatrixMap v(in, dim, 1); rabitqlib::RowMajorMatrixMap rv(out, dim, 1); - rv = rabitqlib::ConstRowMajorMatrixMap(matrix.data(), dim, dim).transpose() * v; + rv = rabitqlib::ConstRowMajorMatrixMap(matrix.data(), dim, dim) + .transpose() * + v; } }; @@ -494,7 +497,8 @@ struct MatrixRotatorImpl { struct RecordRotator::Impl { //! Header layout (12 bytes, backward-compatible with older serialised data): //! type(1B) + padding(3B) + origin_dim(4B) + reserved(4B) = 12B - //! The reserved field previously stored padded_dim; it now mirrors origin_dim. + //! The reserved field previously stored padded_dim; it now mirrors + //! origin_dim. static constexpr size_t kHeaderSize = 12; struct Header { @@ -578,8 +582,8 @@ void RecordRotator::init(size_t dimension, RecordRotatorType rotator_type) { // is requested. FhtKac requires the dimension to be a multiple of 64 // for SIMD flip-sign and FHT correctness. 
When the dimension is not // 64-aligned we transparently fall back to the O(d^2) Matrix rotator. - bool use_fht = (rotator_type == RecordRotatorType::FhtKac) && - (dimension % 64 == 0); + bool use_fht = + (rotator_type == RecordRotatorType::FhtKac) && (dimension % 64 == 0); if (use_fht) { impl_->type = RecordRotatorType::FhtKac; @@ -800,9 +804,8 @@ int RecordRotator::open(IndexStorage::Pointer storage, impl_->mat_impl->load(raw + Impl::kHeaderSize); } - LOG_DEBUG( - "RecordRotator::open done: seg=%s, dim=%zu, data_size=%zu", - seg_id.c_str(), impl_->dimension, data_size); + LOG_DEBUG("RecordRotator::open done: seg=%s, dim=%zu, data_size=%zu", + seg_id.c_str(), impl_->dimension, data_size); return 0; } From 21cda6414b51803c1ed32891e251188c55713105 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Tue, 16 Jun 2026 21:17:01 +0800 Subject: [PATCH 32/38] unrotate debug --- src/core/quantizer/cosine_reformer.cc | 6 +- .../quantizer/integer_quantizer_reformer.cc | 24 ++++---- src/core/quantizer/record_rotator.cc | 11 ++-- .../integer_quantizer_reformer_test.cc | 55 +++++++++++++++++++ 4 files changed, 73 insertions(+), 23 deletions(-) diff --git a/src/core/quantizer/cosine_reformer.cc b/src/core/quantizer/cosine_reformer.cc index b01cc8d71..5670f1221 100644 --- a/src/core/quantizer/cosine_reformer.cc +++ b/src/core/quantizer/cosine_reformer.cc @@ -219,10 +219,10 @@ class CosineReformer : public IndexReformer { NORM_SIZE, NORM_SIZE); - // For FP32 input type, rotation may have been applied during transform. - // For FP16 input type, rotation was NOT applied — skip inverse rotation. + // Rotation was applied in transform() for all FP32-origin paths (FP32, + // INT8, INT4 stored types). FP16 input path never rotates. 
const bool need_inv_rotate = - (type == IndexMeta::DataType::DT_FP32 && enable_rotate_ && rotator_); + (type != IndexMeta::DataType::DT_FP16 && enable_rotate_ && rotator_); if (type == IndexMeta::DataType::DT_FP32) { if (dst_type_ != IndexMeta::DataType::DT_FP32) { diff --git a/src/core/quantizer/integer_quantizer_reformer.cc b/src/core/quantizer/integer_quantizer_reformer.cc index ea69e4644..eaa858a10 100644 --- a/src/core/quantizer/integer_quantizer_reformer.cc +++ b/src/core/quantizer/integer_quantizer_reformer.cc @@ -505,21 +505,17 @@ class IntegerStreamingReformer : public IndexReformer { const size_t stored_dim = qmeta.dimension() - extra_dimension_; + // Step 1: Unquantize into out buffer (stored_dim floats) + out->resize(stored_dim * sizeof(float)); + float *out_buf = reinterpret_cast(out->data()); + RecordQuantizer::unquantize_record(in, stored_dim, data_type_, out_buf); + + // Step 2: Inverse rotate in-place if rotation was applied if (enable_rotate_ && rotator_) { - // Unquantize to stored_dim floats, then inverse rotate to dim floats - const size_t dim = rotator_->dimension(); - out->resize(dim * sizeof(float)); - float *out_buf = reinterpret_cast(out->data()); - - std::vector unq_buf(stored_dim); - RecordQuantizer::unquantize_record(in, stored_dim, data_type_, - unq_buf.data()); - rotator_->unrotate(unq_buf.data(), out_buf); - } else { - // No rotation: stored dim == original dim - out->resize(stored_dim * sizeof(float)); - float *out_buf = reinterpret_cast(out->data()); - RecordQuantizer::unquantize_record(in, stored_dim, data_type_, out_buf); + std::vector tmp(rotator_->dimension()); + rotator_->unrotate(out_buf, tmp.data()); + out->assign(reinterpret_cast(tmp.data()), + tmp.size() * sizeof(float)); } return 0; diff --git a/src/core/quantizer/record_rotator.cc b/src/core/quantizer/record_rotator.cc index 8096828af..d161935dd 100644 --- a/src/core/quantizer/record_rotator.cc +++ b/src/core/quantizer/record_rotator.cc @@ -479,12 +479,11 @@ struct 
MatrixRotatorImpl { //! Inverse rotate using M^T (transpose of the dim x dim orthogonal matrix). void unrotate(const float *in, float *out, size_t dim) const { - // M^T (dim x dim) * in (dim x 1) -> out (dim x 1) - rabitqlib::ConstRowMajorMatrixMap v(in, dim, 1); - rabitqlib::RowMajorMatrixMap rv(out, dim, 1); - rv = rabitqlib::ConstRowMajorMatrixMap(matrix.data(), dim, dim) - .transpose() * - v; + // in (1 x dim) * M^T (dim x dim) -> out (1 x dim) + rabitqlib::ConstRowMajorMatrixMap v(in, 1, dim); + rabitqlib::RowMajorMatrixMap rv(out, 1, dim); + rv = v * rabitqlib::ConstRowMajorMatrixMap(matrix.data(), dim, dim) + .transpose(); } }; diff --git a/tests/core/quantizer/integer_quantizer_reformer_test.cc b/tests/core/quantizer/integer_quantizer_reformer_test.cc index 21967bb23..de2368729 100644 --- a/tests/core/quantizer/integer_quantizer_reformer_test.cc +++ b/tests/core/quantizer/integer_quantizer_reformer_test.cc @@ -12,10 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include #include +#include #include #include +#include "quantizer/record_rotator.h" #include "zvec/core/framework/index_factory.h" #include "zvec/core/framework/index_holder.h" @@ -821,3 +824,55 @@ TEST(IntegerReformer, Int4InitConverterWithTrainedParams) { EXPECT_EQ(buffer, buffer2); } } + +// Test FhtKac rotator (dim=64, 64-aligned) +TEST(RecordRotatorTest, RotateUnrotateFhtKac) { + const size_t dim = 64; + RecordRotator rotator; + rotator.init(dim); + EXPECT_EQ(rotator.rotator_type(), RecordRotatorType::FhtKac); + + std::mt19937 gen(42); + std::uniform_real_distribution dist(-10.0f, 10.0f); + + std::vector original(dim); + for (size_t j = 0; j < dim; ++j) original[j] = dist(gen); + + std::vector rotated(dim); + rotator.rotate(original.data(), rotated.data()); + + std::vector recovered(dim); + rotator.unrotate(rotated.data(), recovered.data()); + + float max_err = 0.0f; + for (size_t j = 0; j < dim; ++j) + max_err = std::max(max_err, std::abs(recovered[j] - original[j])); + std::cout << "FhtKac (dim=64) max error: " << max_err << std::endl; + EXPECT_LT(max_err, 1e-3f); +} + +// Test Matrix rotator (dim=16, not 64-aligned, auto-fallback) +TEST(RecordRotatorTest, RotateUnrotateMatrix) { + const size_t dim = 16; + RecordRotator rotator; + rotator.init(dim); + EXPECT_EQ(rotator.rotator_type(), RecordRotatorType::Matrix); + + std::mt19937 gen(42); + std::uniform_real_distribution dist(-10.0f, 10.0f); + + std::vector original(dim); + for (size_t j = 0; j < dim; ++j) original[j] = dist(gen); + + std::vector rotated(dim); + rotator.rotate(original.data(), rotated.data()); + + std::vector recovered(dim); + rotator.unrotate(rotated.data(), recovered.data()); + + float max_err = 0.0f; + for (size_t j = 0; j < dim; ++j) + max_err = std::max(max_err, std::abs(recovered[j] - original[j])); + std::cout << "Matrix (dim=16) max error: " << max_err << std::endl; + EXPECT_LT(max_err, 1e-3f); +} From b77eef70c2400856243d77c411f613df1e8b9b8f Mon Sep 17 00:00:00 2001 From: 
zzl <1581199236@qq.com> Date: Wed, 17 Jun 2026 10:50:19 +0800 Subject: [PATCH 33/38] remove cite of rabitq --- src/core/quantizer/record_rotator.cc | 58 ++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/src/core/quantizer/record_rotator.cc b/src/core/quantizer/record_rotator.cc index d161935dd..e513282f3 100644 --- a/src/core/quantizer/record_rotator.cc +++ b/src/core/quantizer/record_rotator.cc @@ -20,12 +20,12 @@ #include #include -// Eigen headers from rabitqlib — used by MatrixRotator for numerically stable -// HouseholderQR orthogonalisation and vectorised matrix multiplication. +// Eigen headers (bundled in rabitqlib/third) — used by MatrixRotator for +// numerically stable HouseholderQR orthogonalisation and matrix multiplication. +// We intentionally avoid rabitqlib/defines.hpp and rabitqlib/utils/space.hpp +// to prevent x86 intrinsics leakage on ARM64/AArch64 platforms. #include #include -#include "rabitqlib/defines.hpp" -#include "rabitqlib/utils/space.hpp" #if defined(__AVX2__) || defined(__AVX512F__) #include @@ -50,6 +50,32 @@ namespace core { namespace { +template +using RowMajorMatrix = + Eigen::Matrix; + +template +using RowMajorMatrixMap = Eigen::Map>; + +template +using ConstRowMajorMatrixMap = Eigen::Map>; + +template +RowMajorMatrix random_gaussian_matrix(size_t rows, size_t cols) { + RowMajorMatrix rand(rows, cols); + static std::random_device rd; + static std::mt19937 gen(rd()); + std::normal_distribution dist(0, 1); + + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < cols; ++j) { + rand(i, j) = dist(gen); + } + } + + return rand; +} + // ============================================================================ // Scalar / SIMD helper functions for rotation // ============================================================================ @@ -447,12 +473,11 @@ struct MatrixRotatorImpl { void init(size_t dim) { // Generate dim x dim random Gaussian matrix - rabitqlib::RowMajorMatrix 
rand_mat = - rabitqlib::random_gaussian_matrix(dim, dim); + RowMajorMatrix rand_mat = random_gaussian_matrix(dim, dim); // Householder QR: numerically stable orthogonalisation - Eigen::HouseholderQR> qr(rand_mat); - rabitqlib::RowMajorMatrix q_inv = qr.householderQ().transpose(); + Eigen::HouseholderQR> qr(rand_mat); + RowMajorMatrix q_inv = qr.householderQ().transpose(); matrix.resize(dim * dim); std::memcpy(matrix.data(), &q_inv(0, 0), sizeof(float) * dim * dim); @@ -460,9 +485,9 @@ struct MatrixRotatorImpl { void rotate(const float *in, float *out, size_t dim) const { // v (1 x dim) * M (dim x dim) -> rv (1 x dim) - rabitqlib::ConstRowMajorMatrixMap v(in, 1, dim); - rabitqlib::RowMajorMatrixMap rv(out, 1, dim); - rv = v * rabitqlib::ConstRowMajorMatrixMap(matrix.data(), dim, dim); + ConstRowMajorMatrixMap v(in, 1, dim); + RowMajorMatrixMap rv(out, 1, dim); + rv = v * ConstRowMajorMatrixMap(matrix.data(), dim, dim); } void save(char *data) const { @@ -480,10 +505,9 @@ struct MatrixRotatorImpl { //! Inverse rotate using M^T (transpose of the dim x dim orthogonal matrix). void unrotate(const float *in, float *out, size_t dim) const { // in (1 x dim) * M^T (dim x dim) -> out (1 x dim) - rabitqlib::ConstRowMajorMatrixMap v(in, 1, dim); - rabitqlib::RowMajorMatrixMap rv(out, 1, dim); - rv = v * rabitqlib::ConstRowMajorMatrixMap(matrix.data(), dim, dim) - .transpose(); + ConstRowMajorMatrixMap v(in, 1, dim); + RowMajorMatrixMap rv(out, 1, dim); + rv = v * ConstRowMajorMatrixMap(matrix.data(), dim, dim).transpose(); } }; @@ -506,8 +530,10 @@ struct RecordRotator::Impl { uint32_t reserved; // backward-compat placeholder (was padded_dim) void write_to(char *buf) const { - std::memset(buf, 0, kHeaderSize); // zero-fill padding + // Write fields individually (avoids GCC -Warray-bounds false positive + // on memset when inlined through vector::data() at -O3). 
buf[0] = static_cast(type); + buf[1] = buf[2] = buf[3] = 0; // padding write_u32_le(buf + 4, origin_dim); write_u32_le(buf + 8, reserved); } From 4557bb8725492decdfb4d129f88f35bc5a32336e Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Wed, 17 Jun 2026 13:00:54 +0800 Subject: [PATCH 34/38] MSVC --- src/core/quantizer/record_rotator.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/core/quantizer/record_rotator.cc b/src/core/quantizer/record_rotator.cc index e513282f3..ea03ea26c 100644 --- a/src/core/quantizer/record_rotator.cc +++ b/src/core/quantizer/record_rotator.cc @@ -32,7 +32,11 @@ // FFHT (Fastest Fast Hadamard Transform) — hand-tuned AVX inline assembly // from https://github.com/FALCONN-LIB/FFHT, originally bundled in rabitqlib. // Provides fht_float(buf, log_n) with per-size helper_float_N specialisations. +// NOTE: fht_avx.hpp uses GCC/Clang __asm__ volatile syntax; MSVC is +// unsupported and falls back to the scalar FHT implementation in fht_inplace(). +#if defined(__GNUC__) #include "rabitqlib/utils/fht_avx.hpp" +#endif #elif defined(__SSE2__) #include #endif @@ -84,7 +88,7 @@ RowMajorMatrix random_gaussian_matrix(size_t rows, size_t cols) { //! Uses FFHT hand-tuned AVX assembly when available; generic scalar loop //! otherwise (ARM NEON / SSE2 / pure scalar). void fht_inplace(float *data, size_t n) { -#if defined(__AVX2__) || defined(__AVX512F__) +#if (defined(__AVX2__) || defined(__AVX512F__)) && defined(__GNUC__) // Compute floor(log2(n)) for power-of-2 n. 
int log_n = 0; for (size_t v = n; v > 1; v >>= 1) ++log_n; From d1cea6a0505bb0a4b0d1cd3d8de1589fabd1e8bb Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Wed, 17 Jun 2026 15:10:14 +0800 Subject: [PATCH 35/38] merge main --- tests/core/interface/index_interface_test.cc | 444 ++++++++++++++++--- 1 file changed, 393 insertions(+), 51 deletions(-) diff --git a/tests/core/interface/index_interface_test.cc b/tests/core/interface/index_interface_test.cc index 6cc0d6f4d..5fcfe37e4 100644 --- a/tests/core/interface/index_interface_test.cc +++ b/tests/core/interface/index_interface_test.cc @@ -186,6 +186,267 @@ TEST(IndexInterface, General) { .build()); } +TEST(IndexInterface, CopyOnWrite) { + constexpr uint32_t kDimension = 64; + constexpr uint32_t kNumVectors = 50; + const std::string index_name{"test_cow.index"}; + + auto make_vec = [&](uint32_t seed) { + std::vector v(kDimension, 0.0f); + v[seed % kDimension] = 1.0f; + return v; + }; + + auto func = [&](const BaseIndexParam::Pointer ¶m, + const BaseIndexQueryParam::Pointer &query_param) { + zvec::test_util::RemoveTestFiles(index_name); + + // Phase 1: build the index with shared mmap (writeable shared mapping) + // since the COW mode isn't used as the initial ingest path here. + { + auto index = IndexFactory::CreateAndInitIndex(*param); + ASSERT_NE(nullptr, index); + ASSERT_EQ( + 0, index->Open(index_name, {StorageOptions::StorageType::kMMAP, + /*create_new=*/true, /*read_only=*/false, + /*copy_on_write=*/false})); + + std::vector> vecs; + vecs.reserve(kNumVectors); + for (uint32_t i = 0; i < kNumVectors; ++i) { + vecs.emplace_back(make_vec(i)); + VectorData vd; + vd.vector = DenseVector{vecs.back().data()}; + ASSERT_EQ(0, index->Add(vd, /*key=*/100 + i)); + } + ASSERT_EQ(0, index->Train()); + ASSERT_EQ(0, index->Close()); + } + + // Phase 2: reopen with COW mmap. Search and Fetch must succeed against + // the persisted file. 
+ { + auto index = IndexFactory::CreateAndInitIndex(*param); + ASSERT_NE(nullptr, index); + ASSERT_EQ( + 0, index->Open(index_name, {StorageOptions::StorageType::kMMAP, + /*create_new=*/false, /*read_only=*/true, + /*copy_on_write=*/true})); + + for (uint32_t i = 0; i < kNumVectors; ++i) { + auto target = make_vec(i); + VectorData query; + query.vector = DenseVector{target.data()}; + SearchResult result; + ASSERT_EQ(0, index->Search(query, query_param, &result)); + ASSERT_FALSE(result.doc_list_.empty()); + ASSERT_EQ(100u + i, result.doc_list_[0].key()); + + VectorDataBuffer fetched; + ASSERT_EQ(0, index->Fetch(100 + i, &fetched)); + auto *fetched_ptr = reinterpret_cast( + std::get(fetched.vector_buffer).data.data()); + ASSERT_FLOAT_EQ(1.0f, fetched_ptr[i % kDimension]); + } + ASSERT_EQ(0, index->Close()); + } + + // Phase 3: reopen with shared mmap to confirm the file is intact after + // the COW session. + { + auto index = IndexFactory::CreateAndInitIndex(*param); + ASSERT_NE(nullptr, index); + ASSERT_EQ( + 0, index->Open(index_name, {StorageOptions::StorageType::kMMAP, + /*create_new=*/false, /*read_only=*/true, + /*copy_on_write=*/false})); + + auto target = make_vec(13); + VectorData query; + query.vector = DenseVector{target.data()}; + SearchResult result; + ASSERT_EQ(0, index->Search(query, query_param, &result)); + ASSERT_FALSE(result.doc_list_.empty()); + ASSERT_EQ(113u, result.doc_list_[0].key()); + ASSERT_EQ(0, index->Close()); + } + + // Phase 4: repeated open/close under COW mmap must not lose entries. 
+ for (int cycle = 0; cycle < 3; ++cycle) { + auto index = IndexFactory::CreateAndInitIndex(*param); + ASSERT_NE(nullptr, index); + ASSERT_EQ( + 0, index->Open(index_name, {StorageOptions::StorageType::kMMAP, + /*create_new=*/false, /*read_only=*/true, + /*copy_on_write=*/true})); + uint32_t i = static_cast(cycle * 5 + 2); + auto target = make_vec(i); + VectorData query; + query.vector = DenseVector{target.data()}; + SearchResult result; + ASSERT_EQ(0, index->Search(query, query_param, &result)); + ASSERT_FALSE(result.doc_list_.empty()); + ASSERT_EQ(100u + i, result.doc_list_[0].key()); + ASSERT_EQ(0, index->Close()); + } + + // Phase 5: open in COW mmap (writable MAP_PRIVATE with forced flush). + // Without performing writes the close path still exercises the pwrite + // branch with no dirty pages, which must not corrupt the file. + { + auto index = IndexFactory::CreateAndInitIndex(*param); + ASSERT_NE(nullptr, index); + ASSERT_EQ( + 0, index->Open(index_name, {StorageOptions::StorageType::kMMAP, + /*create_new=*/false, /*read_only=*/true, + /*copy_on_write=*/true})); + + auto target = make_vec(21); + VectorData query; + query.vector = DenseVector{target.data()}; + SearchResult result; + ASSERT_EQ(0, index->Search(query, query_param, &result)); + ASSERT_FALSE(result.doc_list_.empty()); + ASSERT_EQ(121u, result.doc_list_[0].key()); + ASSERT_EQ(0, index->Close()); + } + + // Phase 6: reopen with shared mmap to confirm Phase 5's open/close left + // the file intact. 
+ { + auto index = IndexFactory::CreateAndInitIndex(*param); + ASSERT_NE(nullptr, index); + ASSERT_EQ( + 0, index->Open(index_name, {StorageOptions::StorageType::kMMAP, + /*create_new=*/false, /*read_only=*/true, + /*copy_on_write=*/false})); + for (uint32_t i = 0; i < kNumVectors; ++i) { + auto target = make_vec(i); + VectorData query; + query.vector = DenseVector{target.data()}; + SearchResult result; + ASSERT_EQ(0, index->Search(query, query_param, &result)); + ASSERT_FALSE(result.doc_list_.empty()); + ASSERT_EQ(100u + i, result.doc_list_[0].key()); + } + ASSERT_EQ(0, index->Close()); + } + + zvec::test_util::RemoveTestFiles(index_name); + }; + + func(FlatIndexParamBuilder() + .WithMetricType(MetricType::kInnerProduct) + .WithDataType(DataType::DT_FP32) + .WithDimension(kDimension) + .WithIsSparse(false) + .Build(), + FlatQueryParamBuilder().with_topk(5).with_fetch_vector(false).build()); + + func(HNSWIndexParamBuilder() + .WithMetricType(MetricType::kInnerProduct) + .WithDataType(DataType::DT_FP32) + .WithDimension(kDimension) + .WithIsSparse(false) + .WithEFConstruction(100) + .Build(), + HNSWQueryParamBuilder() + .with_topk(5) + .with_fetch_vector(false) + .with_ef_search(20) + .build()); + + func(VamanaIndexParamBuilder() + .WithMetricType(MetricType::kInnerProduct) + .WithDataType(DataType::DT_FP32) + .WithDimension(kDimension) + .WithIsSparse(false) + .WithMaxDegree(32) + .WithSearchListSize(64) + .WithAlpha(1.2f) + .Build(), + VamanaQueryParamBuilder() + .with_topk(5) + .with_fetch_vector(false) + .with_ef_search(32) + .build()); + + // Flat-only durability check for COW mmap: writes performed under + // MAP_PRIVATE must be pwrite-flushed back and visible after a shared-mmap + // reopen. Flat is used because Add/Flush against a previously-built file is + // straightforward to reason about for this storage layer. 
+ { + const std::string persist_index{"test_cow_persist.index"}; + zvec::test_util::RemoveTestFiles(persist_index); + auto persist_param = FlatIndexParamBuilder() + .WithMetricType(MetricType::kInnerProduct) + .WithDataType(DataType::DT_FP32) + .WithDimension(kDimension) + .WithIsSparse(false) + .Build(); + auto persist_query = + FlatQueryParamBuilder().with_topk(5).with_fetch_vector(false).build(); + + { + auto index = IndexFactory::CreateAndInitIndex(*persist_param); + ASSERT_NE(nullptr, index); + ASSERT_EQ(0, index->Open(persist_index, + {StorageOptions::StorageType::kMMAP, + /*create_new=*/true, /*read_only=*/false, + /*copy_on_write=*/false})); + auto v0 = make_vec(0); + VectorData vd; + vd.vector = DenseVector{v0.data()}; + ASSERT_EQ(0, index->Add(vd, /*key=*/500)); + ASSERT_EQ(0, index->Train()); + ASSERT_EQ(0, index->Close()); + } + + // Add a new vector through COW mmap and explicitly Flush so + // dirty private pages are written back to the file. + { + auto index = IndexFactory::CreateAndInitIndex(*persist_param); + ASSERT_NE(nullptr, index); + ASSERT_EQ(0, index->Open(persist_index, + {StorageOptions::StorageType::kMMAP, + /*create_new=*/false, /*read_only=*/false, + /*copy_on_write=*/true})); + auto v1 = make_vec(1); + VectorData vd; + vd.vector = DenseVector{v1.data()}; + ASSERT_EQ(0, index->Add(vd, /*key=*/501)); + ASSERT_EQ(0, index->Flush()); + ASSERT_EQ(0, index->Close()); + } + + // Reopen with shared mmap: the entry written in COW mode must be durable + // on disk. 
+ { + auto index = IndexFactory::CreateAndInitIndex(*persist_param); + ASSERT_NE(nullptr, index); + ASSERT_EQ(0, index->Open(persist_index, + {StorageOptions::StorageType::kMMAP, + /*create_new=*/false, /*read_only=*/true, + /*copy_on_write=*/false})); + auto target = make_vec(1); + VectorData query; + query.vector = DenseVector{target.data()}; + SearchResult result; + ASSERT_EQ(0, index->Search(query, persist_query, &result)); + ASSERT_FALSE(result.doc_list_.empty()); + ASSERT_EQ(501u, result.doc_list_[0].key()); + + VectorDataBuffer fetched; + ASSERT_EQ(0, index->Fetch(501, &fetched)); + auto *fetched_ptr = reinterpret_cast( + std::get(fetched.vector_buffer).data.data()); + ASSERT_FLOAT_EQ(1.0f, fetched_ptr[1 % kDimension]); + ASSERT_EQ(0, index->Close()); + } + zvec::test_util::RemoveTestFiles(persist_index); + } +} + TEST(IndexInterface, BufferGeneral) { zvec::ailego::MemoryLimitPool::get_instance().init(100 * 1024 * 1024); constexpr uint32_t kDimension = 64; @@ -1856,70 +2117,151 @@ TEST(IndexInterface, ContiguousMemoryEndToEnd) { .build()); } -<<<<<<< HEAD -TEST(IndexInterface, QuantizerParamEnableRotateSerialization) { +class TestVectorSource : public zvec::core::VectorSource { + public: + TestVectorSource(const float *base, uint32_t dim) : base_(base), dim_(dim) {} + + const void *get_vector(uint32_t node_id) const override { + return base_ + static_cast(node_id) * dim_; + } + + private: + const float *base_; + uint32_t dim_; +}; + +TEST(IndexInterface, ExternalVectorEndToEnd) { constexpr uint32_t kDimension = 64; + constexpr uint32_t kNumVectors = 100; + const std::string index_name{"test_external.index"}; + + std::vector all_vectors(kDimension * kNumVectors); + for (uint32_t i = 0; i < kNumVectors; ++i) { + for (uint32_t d = 0; d < kDimension; ++d) { + all_vectors[i * kDimension + d] = + static_cast(i * kDimension + d) * 0.01f; + } + } - // Test 1: HNSW with enable_rotate=true via builder - { - auto param = HNSWIndexParamBuilder() - 
.WithMetricType(MetricType::kCosine) - .WithDataType(DataType::DT_FP32) - .WithDimension(kDimension) - .WithIsSparse(false) - .WithEFConstruction(100) - .WithEnableRotate(true) - .Build(); - ASSERT_NE(nullptr, param.get()); - EXPECT_TRUE(param->quantizer_param.enable_rotate); + TestVectorSource source(all_vectors.data(), kDimension); + + zvec::test_util::RemoveTestFiles(index_name + "*"); - // Serialize to JSON and verify enable_rotate is present - std::string json = param->SerializeToJson(); - EXPECT_TRUE(json.find("\"enable_rotate\":true") != std::string::npos) - << "JSON: " << json; + auto param = HNSWIndexParamBuilder() + .WithMetricType(MetricType::kL2sq) + .WithDataType(DataType::DT_FP32) + .WithDimension(kDimension) + .WithIsSparse(false) + .WithEFConstruction(100) + .WithUseExternalVector(true) + .Build(); - // Deserialize and verify - auto restored = IndexFactory::DeserializeIndexParamFromJson(json); - ASSERT_NE(nullptr, restored.get()); + auto index = IndexFactory::CreateAndInitIndex(*param); + ASSERT_NE(nullptr, index); - auto *restored_hnsw = dynamic_cast(restored.get()); - ASSERT_NE(nullptr, restored_hnsw); - EXPECT_TRUE(restored_hnsw->quantizer_param.enable_rotate); + index->Open(index_name, {StorageOptions::StorageType::kMMAP, true}); - // Roundtrip consistency - EXPECT_EQ(restored->SerializeToJson(), param->SerializeToJson()); + for (uint32_t i = 0; i < kNumVectors; ++i) { + VectorData vector_data; + vector_data.vector = DenseVector{all_vectors.data() + i * kDimension}; + int ret = index->AddWithSource(vector_data, i, source); + ASSERT_EQ(0, ret) << "AddWithSource failed for doc_id=" << i; } - // Test 2: Flat with enable_rotate=true via WithQuantizerParam - { - QuantizerParam qp(QuantizerType::kNone, 8, 8, true); - EXPECT_TRUE(qp.enable_rotate); + auto query_param = HNSWQueryParamBuilder() + .with_topk(5) + .with_fetch_vector(false) + .with_ef_search(50) + .build(); + + VectorData query; + query.vector = DenseVector{all_vectors.data()}; + 
SearchResult result; + int ret = index->SearchWithSource(query, query_param, source, &result); + ASSERT_EQ(0, ret); + ASSERT_GE(result.doc_list_.size(), 1u); + ASSERT_EQ(0u, result.doc_list_[0].key()); + ASSERT_FLOAT_EQ(0.0f, result.doc_list_[0].score()); + + VectorData query2; + query2.vector = DenseVector{all_vectors.data() + 50 * kDimension}; + SearchResult result2; + ret = index->SearchWithSource(query2, query_param, source, &result2); + ASSERT_EQ(0, ret); + ASSERT_GE(result2.doc_list_.size(), 1u); + ASSERT_EQ(50u, result2.doc_list_[0].key()); + ASSERT_FLOAT_EQ(0.0f, result2.doc_list_[0].score()); + + index->Close(); + + auto index2 = IndexFactory::CreateAndInitIndex(*param); + ASSERT_NE(nullptr, index2); + index2->Open(index_name, {StorageOptions::StorageType::kMMAP, false}); + + SearchResult result3; + ret = index2->SearchWithSource(query, query_param, source, &result3); + ASSERT_EQ(0, ret); + ASSERT_GE(result3.doc_list_.size(), 1u); + ASSERT_EQ(0u, result3.doc_list_[0].key()); + ASSERT_FLOAT_EQ(0.0f, result3.doc_list_[0].score()); + + index2->Close(); + zvec::test_util::RemoveTestFiles(index_name + "*"); +} - auto param = FlatIndexParamBuilder() - .WithMetricType(MetricType::kCosine) - .WithDataType(DataType::DT_FP32) - .WithDimension(kDimension) - .WithIsSparse(false) - .WithQuantizerParam(qp) - .Build(); - ASSERT_NE(nullptr, param.get()); - EXPECT_TRUE(param->quantizer_param.enable_rotate); +TEST(IndexInterface, ExternalVectorInnerProduct) { + constexpr uint32_t kDimension = 16; + constexpr uint32_t kNumVectors = 10; + const std::string index_name{"test_external_ip.index"}; + + std::vector all_vectors(kDimension * kNumVectors, 0.0f); + for (uint32_t i = 0; i < kNumVectors; ++i) { + all_vectors[i * kDimension + i % kDimension] = static_cast(i + 1); + } - std::string json = param->SerializeToJson(); - EXPECT_TRUE(json.find("\"enable_rotate\":true") != std::string::npos); + TestVectorSource source(all_vectors.data(), kDimension); + + 
zvec::test_util::RemoveTestFiles(index_name + "*"); + + auto param = HNSWIndexParamBuilder() + .WithMetricType(MetricType::kInnerProduct) + .WithDataType(DataType::DT_FP32) + .WithDimension(kDimension) + .WithIsSparse(false) + .WithEFConstruction(100) + .WithUseExternalVector(true) + .Build(); - auto restored = IndexFactory::DeserializeIndexParamFromJson(json); - ASSERT_NE(nullptr, restored.get()); + auto index = IndexFactory::CreateAndInitIndex(*param); + ASSERT_NE(nullptr, index); + index->Open(index_name, {StorageOptions::StorageType::kMMAP, true}); - auto *restored_flat = dynamic_cast(restored.get()); - ASSERT_NE(nullptr, restored_flat); - EXPECT_TRUE(restored_flat->quantizer_param.enable_rotate); + for (uint32_t i = 0; i < kNumVectors; ++i) { + VectorData vector_data; + vector_data.vector = DenseVector{all_vectors.data() + i * kDimension}; + ASSERT_EQ(0, index->AddWithSource(vector_data, i, source)); } - // Test 3: enable_rotate=false should be omitted when omit_empty_value=true - { - auto param = HNSWIndexParamBuilder() -======= + std::vector query_vec(kDimension, 0.0f); + query_vec[0] = 1.0f; + VectorData query; + query.vector = DenseVector{query_vec.data()}; + + auto query_param = HNSWQueryParamBuilder() + .with_topk(1) + .with_fetch_vector(false) + .with_ef_search(50) + .build(); + + SearchResult result; + ASSERT_EQ(0, index->SearchWithSource(query, query_param, source, &result)); + ASSERT_EQ(1u, result.doc_list_.size()); + ASSERT_EQ(0u, result.doc_list_[0].key()); + ASSERT_FLOAT_EQ(1.0f, result.doc_list_[0].score()); + + index->Close(); + zvec::test_util::RemoveTestFiles(index_name + "*"); +} TEST(IndexInterface, IsDirty) { constexpr uint32_t kDimension = 16; const std::string index_name{"test_is_dirty.index"}; @@ -2040,4 +2382,4 @@ TEST(IndexInterface, IsDirtyBufferPool) { #if defined(__GNUC__) || defined(__GNUG__) #pragma GCC diagnostic pop -#endif \ No newline at end of file +#endif From 11cc02c62dbc73b2ecd9890df0c88d049f7b7c36 Mon Sep 17 00:00:00 2001 
From: zzl <1581199236@qq.com> Date: Thu, 18 Jun 2026 10:47:53 +0800 Subject: [PATCH 36/38] COMDAT section reduced in window --- src/core/quantizer/record_rotator.cc | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/core/quantizer/record_rotator.cc b/src/core/quantizer/record_rotator.cc index ea03ea26c..e27c33d5d 100644 --- a/src/core/quantizer/record_rotator.cc +++ b/src/core/quantizer/record_rotator.cc @@ -19,12 +19,7 @@ #include #include #include - -// Eigen headers (bundled in rabitqlib/third) — used by MatrixRotator for -// numerically stable HouseholderQR orthogonalisation and matrix multiplication. -// We intentionally avoid rabitqlib/defines.hpp and rabitqlib/utils/space.hpp -// to prevent x86 intrinsics leakage on ARM64/AArch64 platforms. -#include +#include #include #if defined(__AVX2__) || defined(__AVX512F__) @@ -32,8 +27,6 @@ // FFHT (Fastest Fast Hadamard Transform) — hand-tuned AVX inline assembly // from https://github.com/FALCONN-LIB/FFHT, originally bundled in rabitqlib. // Provides fht_float(buf, log_n) with per-size helper_float_N specialisations. -// NOTE: fht_avx.hpp uses GCC/Clang __asm__ volatile syntax; MSVC is -// unsupported and falls back to the scalar FHT implementation in fht_inplace(). 
#if defined(__GNUC__) #include "rabitqlib/utils/fht_avx.hpp" #endif From 899de170b9a934f45d08bb73e33bef6567ea3f73 Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Thu, 18 Jun 2026 14:24:29 +0800 Subject: [PATCH 37/38] fht fix --- src/core/quantizer/record_rotator.cc | 84 ++++++++++++++---- src/core/quantizer/record_rotator.h | 8 +- .../integer_quantizer_reformer_test.cc | 86 ++++++++++++++++++- 3 files changed, 152 insertions(+), 26 deletions(-) diff --git a/src/core/quantizer/record_rotator.cc b/src/core/quantizer/record_rotator.cc index e27c33d5d..d2d51fd7f 100644 --- a/src/core/quantizer/record_rotator.cc +++ b/src/core/quantizer/record_rotator.cc @@ -102,10 +102,18 @@ void fht_inplace(float *data, size_t n) { //! Flip the sign of elements based on a packed bit-array. void flip_sign(const uint8_t *flip, float *data, size_t dim) { +#if defined(__AVX512F__) && defined(__AVX512DQ__) + size_t simd_end = dim & ~63u; +#elif defined(__AVX2__) + size_t simd_end = dim & ~31u; +#else + size_t simd_end = dim; // SSE2/NEON/scalar: chunk divides 4, no tail +#endif + #if defined(__AVX512F__) && defined(__AVX512DQ__) constexpr size_t kChunk = 64; const __m512 sign_flip = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); - for (size_t i = 0; i < dim; i += kChunk) { + for (size_t i = 0; i < simd_end; i += kChunk) { uint64_t mask_bits; std::memcpy(&mask_bits, &flip[i / 8], sizeof(mask_bits)); const __mmask16 m0 = _cvtu32_mask16(mask_bits & 0xFFFF); @@ -130,7 +138,7 @@ void flip_sign(const uint8_t *flip, float *data, size_t dim) { const __m256i bit_select = _mm256_setr_epi32(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80); const __m256 sign_flip = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - for (size_t i = 0; i < dim; i += kChunk) { + for (size_t i = 0; i < simd_end; i += kChunk) { uint32_t mask_bits; std::memcpy(&mask_bits, &flip[i / 8], sizeof(mask_bits)); for (int b = 0; b < 4; ++b) { @@ -185,34 +193,50 @@ void flip_sign(const uint8_t *flip, float *data, 
size_t dim) { } } #endif + // Scalar tail: handle remaining elements when dim is not SIMD-aligned. + for (size_t i = simd_end; i < dim; ++i) { + if (flip[i / 8] & (1u << (i % 8))) { + data[i] = -data[i]; + } + } } //! Kac random walk: butterfly add/sub between first and second halves. void kacs_walk(float *data, size_t len) { size_t half = len / 2; #if defined(__AVX512F__) - for (size_t i = 0; i < half; i += 16) { + size_t half_end = half & ~15u; +#elif defined(__AVX2__) + size_t half_end = half & ~7u; +#elif defined(__SSE2__) || (defined(__ARM_NEON) && defined(__aarch64__)) + size_t half_end = half & ~3u; +#else + size_t half_end = half; +#endif + +#if defined(__AVX512F__) + for (size_t i = 0; i < half_end; i += 16) { __m512 x = _mm512_loadu_ps(&data[i]); __m512 y = _mm512_loadu_ps(&data[i + half]); _mm512_storeu_ps(&data[i], _mm512_add_ps(x, y)); _mm512_storeu_ps(&data[i + half], _mm512_sub_ps(x, y)); } #elif defined(__AVX2__) - for (size_t i = 0; i < half; i += 8) { + for (size_t i = 0; i < half_end; i += 8) { __m256 x = _mm256_loadu_ps(&data[i]); __m256 y = _mm256_loadu_ps(&data[i + half]); _mm256_storeu_ps(&data[i], _mm256_add_ps(x, y)); _mm256_storeu_ps(&data[i + half], _mm256_sub_ps(x, y)); } #elif defined(__ARM_NEON) && defined(__aarch64__) - for (size_t i = 0; i < half; i += 4) { + for (size_t i = 0; i < half_end; i += 4) { float32x4_t x = vld1q_f32(&data[i]); float32x4_t y = vld1q_f32(&data[i + half]); vst1q_f32(&data[i], vaddq_f32(x, y)); vst1q_f32(&data[i + half], vsubq_f32(x, y)); } #elif defined(__SSE2__) - for (size_t i = 0; i < half; i += 4) { + for (size_t i = 0; i < half_end; i += 4) { __m128 x = _mm_loadu_ps(&data[i]); __m128 y = _mm_loadu_ps(&data[i + half]); _mm_storeu_ps(&data[i], _mm_add_ps(x, y)); @@ -226,6 +250,13 @@ void kacs_walk(float *data, size_t len) { data[i + half] = x - y; } #endif + // Scalar tail: handle remaining pairs when half is not SIMD-aligned. 
+ for (size_t i = half_end; i < half; ++i) { + float x = data[i]; + float y = data[i + half]; + data[i] = x + y; + data[i + half] = x - y; + } } //! Inverse Kac walk: undo butterfly add/sub with 0.5 factor. @@ -233,9 +264,19 @@ void kacs_walk(float *data, size_t len) { //! (a-b)/2). void inv_kacs_walk(float *data, size_t len) { size_t half = len / 2; +#if defined(__AVX512F__) + size_t half_end = half & ~15u; +#elif defined(__AVX2__) + size_t half_end = half & ~7u; +#elif defined(__SSE2__) || (defined(__ARM_NEON) && defined(__aarch64__)) + size_t half_end = half & ~3u; +#else + size_t half_end = half; +#endif + #if defined(__AVX512F__) const __m512 half_fac = _mm512_set1_ps(0.5f); - for (size_t i = 0; i < half; i += 16) { + for (size_t i = 0; i < half_end; i += 16) { __m512 a = _mm512_loadu_ps(&data[i]); __m512 b = _mm512_loadu_ps(&data[i + half]); _mm512_storeu_ps(&data[i], _mm512_mul_ps(_mm512_add_ps(a, b), half_fac)); @@ -244,7 +285,7 @@ void inv_kacs_walk(float *data, size_t len) { } #elif defined(__AVX2__) const __m256 half_fac = _mm256_set1_ps(0.5f); - for (size_t i = 0; i < half; i += 8) { + for (size_t i = 0; i < half_end; i += 8) { __m256 a = _mm256_loadu_ps(&data[i]); __m256 b = _mm256_loadu_ps(&data[i + half]); _mm256_storeu_ps(&data[i], _mm256_mul_ps(_mm256_add_ps(a, b), half_fac)); @@ -253,7 +294,7 @@ void inv_kacs_walk(float *data, size_t len) { } #elif defined(__ARM_NEON) && defined(__aarch64__) const float32x4_t half_fac = vdupq_n_f32(0.5f); - for (size_t i = 0; i < half; i += 4) { + for (size_t i = 0; i < half_end; i += 4) { float32x4_t a = vld1q_f32(&data[i]); float32x4_t b = vld1q_f32(&data[i + half]); vst1q_f32(&data[i], vmulq_f32(vaddq_f32(a, b), half_fac)); @@ -261,7 +302,7 @@ void inv_kacs_walk(float *data, size_t len) { } #elif defined(__SSE2__) const __m128 half_fac = _mm_set1_ps(0.5f); - for (size_t i = 0; i < half; i += 4) { + for (size_t i = 0; i < half_end; i += 4) { __m128 a = _mm_loadu_ps(&data[i]); __m128 b = _mm_loadu_ps(&data[i + 
half]); _mm_storeu_ps(&data[i], _mm_mul_ps(_mm_add_ps(a, b), half_fac)); @@ -275,6 +316,13 @@ void inv_kacs_walk(float *data, size_t len) { data[i + half] = (a - b) * 0.5f; } #endif + // Scalar tail: handle remaining pairs when half is not SIMD-aligned. + for (size_t i = half_end; i < half; ++i) { + float a = data[i]; + float b = data[i + half]; + data[i] = (a + b) * 0.5f; + data[i + half] = (a - b) * 0.5f; + } } //! Scale each element by a constant factor. @@ -310,9 +358,9 @@ void write_u32_le(char *p, uint32_t v) { // ============================================================================ // FhtKacRotatorImpl - O(d log d) FHT-based Kac random rotation // -// Requires dimension % 64 == 0 for SIMD flip-sign correctness. -// When dimension is also a power of 2, uses 4 rounds of (flip -> FHT -> -// rescale). When dimension is 64-aligned but NOT a power of 2 (e.g. 192, 320), +// Requires dimension % 4 == 0 (scalar tails handle SIMD remainder). +// When dimension is a power of 2, uses 4 rounds of (flip -> FHT -> +// rescale). When dimension is NOT a power of 2 (e.g. 96, 100, 192), // uses kacs_walk reduction to handle the non-power-of-2 case. // ============================================================================ @@ -601,11 +649,11 @@ void RecordRotator::init(size_t dimension, RecordRotatorType rotator_type) { impl_->dimension = dimension; // Auto-select implementation based on dimension alignment when FhtKac - // is requested. FhtKac requires the dimension to be a multiple of 64 - // for SIMD flip-sign and FHT correctness. When the dimension is not - // 64-aligned we transparently fall back to the O(d^2) Matrix rotator. + // is requested. FhtKac requires the dimension to be a multiple of 4; + // scalar tails handle the SIMD remainder. When the dimension is not + // 4-aligned we transparently fall back to the O(d^2) Matrix rotator. 
bool use_fht = - (rotator_type == RecordRotatorType::FhtKac) && (dimension % 64 == 0); + (rotator_type == RecordRotatorType::FhtKac) && (dimension % 4 == 0); if (use_fht) { impl_->type = RecordRotatorType::FhtKac; @@ -617,7 +665,7 @@ void RecordRotator::init(size_t dimension, RecordRotatorType rotator_type) { } else { if (rotator_type == RecordRotatorType::FhtKac) { LOG_DEBUG( - "RecordRotator::init: dimension %zu is not 64-aligned, " + "RecordRotator::init: dimension %zu is not 4-aligned, " "falling back from FhtKac to Matrix rotator", dimension); } diff --git a/src/core/quantizer/record_rotator.h b/src/core/quantizer/record_rotator.h index 98734e7dc..cd60118ed 100644 --- a/src/core/quantizer/record_rotator.h +++ b/src/core/quantizer/record_rotator.h @@ -38,8 +38,8 @@ enum class RecordRotatorType : uint8_t { * explicit random matrix), so no rabitqlib headers are required. * * Auto-selects the rotation algorithm based on dimension alignment: - * - dimension % 64 == 0 -> FhtKac (O(d log d), requires 64-alignment) - * - otherwise -> Matrix (O(d^2), no alignment requirement) + * - dimension % 4 == 0 -> FhtKac (O(d log d), with scalar tails) + * - otherwise -> Matrix (O(d^2), no alignment requirement) * * Rotation preserves dimension: output size == input size (no padding). * @@ -58,11 +58,11 @@ class RecordRotator { RecordRotator &operator=(const RecordRotator &) = delete; //! Initialize the rotator. - //! Auto-selects FhtKac when dimension is 64-aligned, else falls back to + //! Auto-selects FhtKac when dimension is 4-aligned, else falls back to //! Matrix. The @p rotator_type parameter can force Matrix explicitly. //! @param dimension vector dimension (input and output size) //! @param rotator_type rotation algorithm (default: FhtKac, auto-degrades - //! to Matrix when dimension is not 64-aligned) + //! 
to Matrix when dimension is not 4-aligned) void init(size_t dimension, RecordRotatorType rotator_type = RecordRotatorType::FhtKac); diff --git a/tests/core/quantizer/integer_quantizer_reformer_test.cc b/tests/core/quantizer/integer_quantizer_reformer_test.cc index de2368729..104c3c28d 100644 --- a/tests/core/quantizer/integer_quantizer_reformer_test.cc +++ b/tests/core/quantizer/integer_quantizer_reformer_test.cc @@ -825,7 +825,7 @@ TEST(IntegerReformer, Int4InitConverterWithTrainedParams) { } } -// Test FhtKac rotator (dim=64, 64-aligned) +// Test FhtKac rotator (dim=64, power of 2, hot path) TEST(RecordRotatorTest, RotateUnrotateFhtKac) { const size_t dim = 64; RecordRotator rotator; @@ -851,9 +851,9 @@ TEST(RecordRotatorTest, RotateUnrotateFhtKac) { EXPECT_LT(max_err, 1e-3f); } -// Test Matrix rotator (dim=16, not 64-aligned, auto-fallback) +// Test Matrix rotator (dim=15, odd, not 4-aligned, auto-fallback) TEST(RecordRotatorTest, RotateUnrotateMatrix) { - const size_t dim = 16; + const size_t dim = 15; RecordRotator rotator; rotator.init(dim); EXPECT_EQ(rotator.rotator_type(), RecordRotatorType::Matrix); @@ -873,6 +873,84 @@ TEST(RecordRotatorTest, RotateUnrotateMatrix) { float max_err = 0.0f; for (size_t j = 0; j < dim; ++j) max_err = std::max(max_err, std::abs(recovered[j] - original[j])); - std::cout << "Matrix (dim=16) max error: " << max_err << std::endl; + std::cout << "Matrix (dim=15) max error: " << max_err << std::endl; + EXPECT_LT(max_err, 1e-3f); +} + +// Test FhtKac rotator (dim=100, 4-aligned but not 16/32/64-aligned) +TEST(RecordRotatorTest, RotateUnrotateFhtKac_Dim100) { + const size_t dim = 100; + RecordRotator rotator; + rotator.init(dim); + EXPECT_EQ(rotator.rotator_type(), RecordRotatorType::FhtKac); + + std::mt19937 gen(42); + std::uniform_real_distribution<float> dist(-10.0f, 10.0f); + + std::vector<float> original(dim); + for (size_t j = 0; j < dim; ++j) original[j] = dist(gen); + + std::vector<float> rotated(dim); + rotator.rotate(original.data(), 
rotated.data()); + + std::vector<float> recovered(dim); + rotator.unrotate(rotated.data(), recovered.data()); + + float max_err = 0.0f; + for (size_t j = 0; j < dim; ++j) + max_err = std::max(max_err, std::abs(recovered[j] - original[j])); + std::cout << "FhtKac (dim=100) max error: " << max_err << std::endl; + EXPECT_LT(max_err, 1e-3f); +} + +// Test FhtKac rotator (dim=200, 4-aligned, non-power-of-2 kacs_walk path) +TEST(RecordRotatorTest, RotateUnrotateFhtKac_Dim200) { + const size_t dim = 200; + RecordRotator rotator; + rotator.init(dim); + EXPECT_EQ(rotator.rotator_type(), RecordRotatorType::FhtKac); + + std::mt19937 gen(42); + std::uniform_real_distribution<float> dist(-10.0f, 10.0f); + + std::vector<float> original(dim); + for (size_t j = 0; j < dim; ++j) original[j] = dist(gen); + + std::vector<float> rotated(dim); + rotator.rotate(original.data(), rotated.data()); + + std::vector<float> recovered(dim); + rotator.unrotate(rotated.data(), recovered.data()); + + float max_err = 0.0f; + for (size_t j = 0; j < dim; ++j) + max_err = std::max(max_err, std::abs(recovered[j] - original[j])); + std::cout << "FhtKac (dim=200) max error: " << max_err << std::endl; + EXPECT_LT(max_err, 1e-3f); +} + +// Test FhtKac rotator (dim=96, 32-aligned but not 64-aligned, kacs_walk path) +TEST(RecordRotatorTest, RotateUnrotateFhtKac_Dim96) { + const size_t dim = 96; + RecordRotator rotator; + rotator.init(dim); + EXPECT_EQ(rotator.rotator_type(), RecordRotatorType::FhtKac); + + std::mt19937 gen(42); + std::uniform_real_distribution<float> dist(-10.0f, 10.0f); + + std::vector<float> original(dim); + for (size_t j = 0; j < dim; ++j) original[j] = dist(gen); + + std::vector<float> rotated(dim); + rotator.rotate(original.data(), rotated.data()); + + std::vector<float> recovered(dim); + rotator.unrotate(rotated.data(), recovered.data()); + + float max_err = 0.0f; + for (size_t j = 0; j < dim; ++j) + max_err = std::max(max_err, std::abs(recovered[j] - original[j])); + std::cout << "FhtKac (dim=96) max error: " << max_err << std::endl; 
EXPECT_LT(max_err, 1e-3f); } From 45af56590054573a491446688ed2eebd3658c5bb Mon Sep 17 00:00:00 2001 From: zzl <1581199236@qq.com> Date: Fri, 19 Jun 2026 10:58:45 +0800 Subject: [PATCH 38/38] MSVC --- src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5fc4e34ea..6196aedfa 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -64,6 +64,7 @@ function(zvec_add_all_in_one_shared TARGET_NAME OUTPUT_NAME) ${ZVEC_ALLIN_LIBS} Threads::Threads ) + target_link_options(${TARGET_NAME} PRIVATE /OPT:REF /OPT:ICF) elseif(APPLE) foreach(ZVEC_ALLIN_LIB ${ZVEC_ALLIN_LIBS}) list(APPEND ZVEC_ALLIN_WA_OPTIONS