From c0a62b69f503bb54df2e2b9fc3716dd88e549b58 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Thu, 4 Sep 2025 16:51:00 -0300 Subject: [PATCH 01/65] improved conan_settings --- scripts/configure_conan_profile.sh | 46 +++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/scripts/configure_conan_profile.sh b/scripts/configure_conan_profile.sh index d2ef139..3122d1b 100755 --- a/scripts/configure_conan_profile.sh +++ b/scripts/configure_conan_profile.sh @@ -1,15 +1,53 @@ #!/bin/bash set -e -mkdir -p /root/.conan2/profiles +TAG="[$(basename "${BASH_SOURCE[0]}")]" +LINE_BRK="\n\n" +SEGMENT="===========================================================\n" -cat > /root/.conan2/profiles/default < "$PROFILE_DIR/default" [settings] arch=x86_64 build_type=Release compiler=gcc -compiler.cppstd=20 +compiler.cppstd=17 compiler.libcxx=libstdc++11 -compiler.version=13 +compiler.version=11 os=Linux EOF + +printf "$LINE_BRK$SEGMENT" +#----------------------------------------- + +printf "$TAG Profile created in: $PROFILE_DIR/default\n" +printf "\nHard-check with: cat < $PROFILE_DIR/default\n\n" +printf "$SEGMENT" +printf "$SEGMENT" +printf "$SEGMENT" From d3f0502dfde20d2ad92ff670f5488e3b3704b8bb Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Thu, 4 Sep 2025 17:02:48 -0300 Subject: [PATCH 02/65] fix gitignore --- .gitignore | 58 ++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index af30c75..d95975f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,40 +1,39 @@ ### C++ ### # Prerequisites -*.d +**/*.d # Compiled Object files -*.slo -*.lo -*.o -*.obj +**/*.slo +**/*.lo +**/*.o +**/*.obj # Precompiled Headers -*.gch -*.pch +**/*.gch +**/*.pch # Compiled Dynamic libraries -*.so -*.dylib -*.dll +**/*.so +**/*.dylib +**/*.dll # Fortran module files -*.mod -*.smod +**/*.mod +**/*.smod # Compiled Static libraries -*.lai -*.la -*.a -*.lib +**/*.lai +**/*.la +**/*.a 
+**/*.lib # Executables -*.exe -*.out -*.app +**/*.exe +**/*.out +**/*.app ### Python ### -pycache/ -__pycache__/ +*pycache*/ .pytest_cache/ test_bindings.py .pyd @@ -46,20 +45,18 @@ CMakeFiles/ bkp/ build/ models/ -libs/libtorch + !libs/tokenizers-cpp !libs/openai-cpp libtorch*.zip -libtorch-cxx11-abi-shared* -libtorch-cxx11-abi-shared-with-deps-2.5.1+cpu.zip +libs/libtorch -models conan.lock libtorch/ extern/* -openai_api_key* +**/*openai_api_key* package/build/ package/dist/ @@ -67,10 +64,11 @@ packer package/*.egg-info .env -.venv/ -.venv*/ -.vscode/ -.vs/ +.venv/* +.venv*/* +.vscode/* +.vs/* +venv/* tests/ compile_commands.json From 647b63abeafc252ec50353413e837e5f028331cb Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Thu, 4 Sep 2025 17:14:14 -0300 Subject: [PATCH 03/65] fix a detail --- scripts/configure_conan_profile.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/configure_conan_profile.sh b/scripts/configure_conan_profile.sh index 3122d1b..da3aab6 100755 --- a/scripts/configure_conan_profile.sh +++ b/scripts/configure_conan_profile.sh @@ -21,7 +21,7 @@ printf "$LINE_BRK$SEGMENT" printf "$LINE_BRK$SEGMENT" printf "$TAG Finding\n" -PROFILE_DIR=$(find ~ -type d -wholename "*/.conan2/profiles" | head -n 1 || true) +PROFILE_DIR=$(find . 
-type d -wholename "*/.conan2/profiles" | head -n 1 || true) [ -z "$PROFILE_DIR" ] && PROFILE_DIR="$HOME/.conan2/profiles" && mkdir -p "$PROFILE_DIR" printf "$TAG Found at $PROFILE_DIR\n" @@ -37,9 +37,9 @@ cat << EOF > "$PROFILE_DIR/default" arch=x86_64 build_type=Release compiler=gcc -compiler.cppstd=17 +compiler.cppstd=20 compiler.libcxx=libstdc++11 -compiler.version=11 +compiler.version=13 os=Linux EOF From ef433ffce22eb5a88f45b8492c205e8a70c92a1f Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Fri, 12 Sep 2025 16:26:43 -0300 Subject: [PATCH 04/65] Move models to proper directory and update .gitignore to exclude future .onnx files --- .gitignore | 7 ++++++- {scripts => models}/hf_extract_model.py | 2 +- {scripts => models}/hf_model_to_onnx.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) rename {scripts => models}/hf_extract_model.py (90%) rename {scripts => models}/hf_model_to_onnx.py (94%) diff --git a/.gitignore b/.gitignore index d95975f..b1a0d3e 100644 --- a/.gitignore +++ b/.gitignore @@ -44,7 +44,12 @@ CMakeFiles/ bkp/ build/ -models/ + +# Ignore everything in the models folder +models/* +# But DO NOT ignore these two files +!models/hf_extract_model.py +!models/hf_model_to_onnx.py !libs/tokenizers-cpp !libs/openai-cpp diff --git a/scripts/hf_extract_model.py b/models/hf_extract_model.py similarity index 90% rename from scripts/hf_extract_model.py rename to models/hf_extract_model.py index 6af6afa..20178d3 100644 --- a/scripts/hf_extract_model.py +++ b/models/hf_extract_model.py @@ -13,7 +13,7 @@ model_name = args.model_name - dir_path= os.path.join(os.path.dirname(__file__), "..", "models", model_name) + dir_path= os.path.join(os.path.dirname(__file__), ".", "models", model_name) if not os.path.exists(dir_path): os.makedirs(dir_path) diff --git a/scripts/hf_model_to_onnx.py b/models/hf_model_to_onnx.py similarity index 94% rename from scripts/hf_model_to_onnx.py rename to models/hf_model_to_onnx.py index afacb4a..98bd98c 100644 --- 
a/scripts/hf_model_to_onnx.py +++ b/models/hf_model_to_onnx.py @@ -16,7 +16,7 @@ model_name = args.model_name config = AutoConfig.from_pretrained(model_name) label_map = config.id2label -dir_path= os.path.join(os.path.dirname(__file__), "..", "models", model_name) +dir_path= os.path.join(os.path.dirname(__file__), ".", "models", model_name) if not os.path.exists(dir_path): os.makedirs(dir_path) From 033a5a9b01139260f9b63f11e42509745bacbb11 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Fri, 12 Sep 2025 16:35:45 -0300 Subject: [PATCH 05/65] Refactored Docker image using manylinux for compatibility and simplified pip-based installations. Removed static git clone in favor of mirrored structure to make the environment more flexible and modular. --- Dockerfile | 64 ++++++++++---------------- scripts/install_python_dependencies.sh | 4 -- 2 files changed, 24 insertions(+), 44 deletions(-) delete mode 100755 scripts/install_python_dependencies.sh diff --git a/Dockerfile b/Dockerfile index 12abacb..88c701a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,48 +1,32 @@ +# Use the official manylinux image (compatible with Python packaging standards) +FROM quay.io/pypa/manylinux_2_28_x86_64 -FROM python:3.12-slim AS builder +# Add Python 3.12 binaries to PATH (provided by the manylinux image) +ENV PATH="/opt/python/cp312-cp312/bin:${PATH}" + +# Set working directory WORKDIR /app -# Install GCC 13 and other dependencies -RUN apt-get update && \ - apt-get install -y \ - gcc-13 \ - g++-13 \ - libstdc++-13-dev \ - git \ - curl \ - wget \ - cmake \ - nano \ - unzip \ - ninja-build \ - pkg-config \ - libffi-dev \ - libprotobuf-dev \ - protobuf-compiler \ - libgflags-dev \ - libssl-dev \ - sudo \ - build-essential \ - gnupg \ - && apt-get clean && rm -rf /var/lib/apt/lists/* && \ - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 100 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-13 100 - -COPY .git .git -COPY .gitmodules .gitmodules -COPY scripts/ 
./scripts/ -RUN chmod +x -R /app/scripts -RUN mkdir -p /app/libs/openai-cpp /app/libs/tokenizers-cpp - -# Install Rust +# Install GCC 13 and common development tools using YUM (not APT!) +RUN yum install -y \ + gcc gcc-c++ make cmake git curl wget \ + ninja-build libffi-devel openssl-devel \ + protobuf-devel gflags-devel zlib-devel \ + unzip nano \ + openblas-devel + +# Install Rust (for building Rust-based Python extensions, if needed) RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + +# Add Rust binaries to PATH ENV PATH="/root/.cargo/bin:${PATH}" -# Run your scripts -RUN /app/scripts/install_python_dependencies.sh -RUN /app/scripts/install_torch.sh -RUN /app/scripts/install_libs.sh -RUN /app/scripts/configure_conan_profile.sh +# Upgrade pip and install required Python packages globally +RUN python3 -m pip install --upgrade pip setuptools wheel && \ + pip install build conan cmake requests twine pybind11 numpy + +# Install common ML and ONNX tooling +RUN pip install torch transformers onnx onnxruntime optimum -# COPY . . 
+# Default shell for container CMD ["/bin/bash"] diff --git a/scripts/install_python_dependencies.sh b/scripts/install_python_dependencies.sh deleted file mode 100755 index 8da6c8b..0000000 --- a/scripts/install_python_dependencies.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -set -e - -pip install --upgrade pip setuptools wheel build conan cmake requests twine pybind11 numpy From 960b6184266ce312c87fa343a4950c2815f8fcc6 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Fri, 12 Sep 2025 16:38:02 -0300 Subject: [PATCH 06/65] Remove submodule installer script; use git clone --recursive instead --- scripts/install_libs.sh | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100755 scripts/install_libs.sh diff --git a/scripts/install_libs.sh b/scripts/install_libs.sh deleted file mode 100755 index d9caedc..0000000 --- a/scripts/install_libs.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -git submodule update --init --recursive --remote - -pushd libs/tokenizers-cpp - git checkout 4bb7533 - git submodule update --init --recursive --remote - pushd msgpack - git checkout 8c602e8 - popd - pushd sentencepiece - git checkout f2219b5 - popd -popd \ No newline at end of file From 39f389761401d816f8133540dc30bef5ee2e0391 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sat, 13 Sep 2025 08:46:27 -0300 Subject: [PATCH 07/65] Add faiss script --- scripts/install_faiss.sh | 126 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 scripts/install_faiss.sh diff --git a/scripts/install_faiss.sh b/scripts/install_faiss.sh new file mode 100644 index 0000000..bf1434c --- /dev/null +++ b/scripts/install_faiss.sh @@ -0,0 +1,126 @@ +#!/usr/bin/env bash + +# ============================================================================= +# FAISS CPU Installer Script (C++ only) +# ----------------------------------------------------------------------------- +# Works on Ubuntu/Debian (APT) and manylinux/CentOS-like (YUM) by auto-detecting +# the 
package manager. It installs build deps and builds FAISS (CPU-only) into +# ../libs/faiss relative to the current working directory. +# ----------------------------------------------------------------------------- +# Usage (optional): +# FAISS_TAG=v1.8.0 ./install_faiss_cpu.sh # pin to a tag/branch (default v1.8.0) +# ============================================================================= + +set -euo pipefail + +# ───────────────────────────────────────────────────────────────────────────── +# Elevation helper: use sudo only when needed and available +# ───────────────────────────────────────────────────────────────────────────── +SUDO="" +if [[ "$(id -u)" -ne 0 ]]; then + if command -v sudo >/dev/null 2>&1; then + SUDO="sudo" + else + echo "[!] Not running as root and 'sudo' is not available.\n Re-run as root or install sudo." >&2 + exit 1 + fi +fi + +# ───────────────────────────────────────────────────────────────────────────── +# Detect package manager +# ───────────────────────────────────────────────────────────────────────────── +PKG_MANAGER="" +if command -v apt-get >/dev/null 2>&1; then + PKG_MANAGER="apt" + echo "[pkg] Detected APT-based system (Ubuntu/Debian)" +elif command -v yum >/dev/null 2>&1; then + PKG_MANAGER="yum" + echo "[pkg] Detected YUM-based system (manylinux/CentOS-like)" +else + echo "[x] Unsupported system: neither apt-get nor yum found." >&2 + exit 1 +fi + +# ───────────────────────────────────────────────────────────────────────────── +# Install dependencies +# ───────────────────────────────────────────────────────────────────────────── +echo "[pkg] Installing build dependencies..." 
+if [[ "$PKG_MANAGER" == "apt" ]]; then + $SUDO apt-get update -y + $SUDO apt-get install -y \ + cmake \ + g++ \ + libopenblas-dev \ + libgflags-dev \ + build-essential \ + python3-dev \ + git \ + unzip \ + wget \ + pkg-config \ + ninja-build +else + # manylinux/CentOS-like + $SUDO yum install -y \ + gcc \ + gcc-c++ \ + make \ + cmake \ + git \ + curl \ + wget \ + ninja-build \ + libffi-devel \ + openssl-devel \ + protobuf-devel \ + gflags-devel \ + zlib-devel \ + unzip \ + openblas-devel \ + pkgconf-pkg-config +fi + +# ───────────────────────────────────────────────────────────────────────────── +# Prepare destination +# ───────────────────────────────────────────────────────────────────────────── +PROJ_DIR="$(pwd)" +FAISS_DIR="${PROJ_DIR}/../libs/faiss" +FAISS_TAG="${FAISS_TAG:-v1.8.0}" + +echo "[fs] Preparing ${FAISS_DIR} (fresh clone)" +rm -rf "$FAISS_DIR" +mkdir -p "$(dirname "$FAISS_DIR")" + +# ───────────────────────────────────────────────────────────────────────────── +# Clone & build (CPU-only) +# ───────────────────────────────────────────────────────────────────────────── +echo "[git] Cloning FAISS (${FAISS_TAG})..." +git clone --branch "$FAISS_TAG" --depth 1 https://github.com/facebookresearch/faiss.git "$FAISS_DIR" + +cd "$FAISS_DIR" + +# Prefer Ninja if available for faster builds +GEN_ARGS=() +if command -v ninja >/dev/null 2>&1; then + GEN_ARGS+=( -G Ninja ) +fi + +echo "[cmake] Configuring (CPU-only, Release)..." +cmake -B build "${GEN_ARGS[@]}" \ + -DFAISS_ENABLE_GPU=OFF \ + -DFAISS_ENABLE_PYTHON=OFF \ + -DFAISS_ENABLE_TESTS=OFF \ + -DCMAKE_BUILD_TYPE=Release + +echo "[cmake] Building..." 
+cmake --build build --config Release --parallel "$(nproc)" + +# ───────────────────────────────────────────────────────────────────────────── +# Locate artifacts +# ───────────────────────────────────────────────────────────────────────────── +FOUND_LIB="$(find "$FAISS_DIR/build/faiss" -maxdepth 1 -name 'libfaiss.*' -print -quit 2>/dev/null || true)" + +if [[ -n "${FOUND_LIB}" && -e "${FOUND_LIB}" ]]; then + echo "[ok] FAISS built successfully." + echo "[out] Headers : $FAISS_DIR/faiss/" + echo "[out] Library : $FOUND \ No newline at end of file From 1eb70c0063968eb9f11af813acb04c57ec68157c Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 13:10:17 -0300 Subject: [PATCH 08/65] lighter docker --- Dockerfile | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/Dockerfile b/Dockerfile index 88c701a..81f755b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,32 +1,32 @@ # Use the official manylinux image (compatible with Python packaging standards) FROM quay.io/pypa/manylinux_2_28_x86_64 -# Add Python 3.12 binaries to PATH (provided by the manylinux image) +# Add Python 3.12 binaries to PATH ENV PATH="/opt/python/cp312-cp312/bin:${PATH}" # Set working directory WORKDIR /app -# Install GCC 13 and common development tools using YUM (not APT!) 
+# Install development tools, Python deps, Rust, and cleanup to save space RUN yum install -y \ - gcc gcc-c++ make cmake git curl wget \ - ninja-build libffi-devel openssl-devel \ - protobuf-devel gflags-devel zlib-devel \ - unzip nano \ - openblas-devel - -# Install Rust (for building Rust-based Python extensions, if needed) -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - -# Add Rust binaries to PATH + gcc gcc-c++ make git curl wget \ + ninja-build libffi-devel openssl-devel \ + protobuf-devel gflags-devel zlib-devel \ + openblas-devel unzip\ + && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \ + && yum clean all \ + && rm -rf /var/cache/yum + +# Add Rust to PATH ENV PATH="/root/.cargo/bin:${PATH}" -# Upgrade pip and install required Python packages globally -RUN python3 -m pip install --upgrade pip setuptools wheel && \ - pip install build conan cmake requests twine pybind11 numpy - -# Install common ML and ONNX tooling -RUN pip install torch transformers onnx onnxruntime optimum +# Upgrade pip & install Python build tools and ML packages +RUN python3 -m pip install --upgrade pip setuptools wheel \ + && pip install --no-cache-dir \ + build conan cmake requests \ + pybind11 numpy \ + torch transformers \ + onnx onnxruntime optimum -# Default shell for container +# Set default shell CMD ["/bin/bash"] From ac337e9105006f1666111087cdb2e4303b61039d Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 13:44:13 -0300 Subject: [PATCH 09/65] moved build related files to src folder to reduce noise --- CMakeLists.txt => src/CMakeLists.txt | 0 conanfile.py => src/conanfile.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename CMakeLists.txt => src/CMakeLists.txt (100%) rename conanfile.py => src/conanfile.py (100%) diff --git a/CMakeLists.txt b/src/CMakeLists.txt similarity index 100% rename from CMakeLists.txt rename to src/CMakeLists.txt diff --git a/conanfile.py b/src/conanfile.py similarity 
index 100% rename from conanfile.py rename to src/conanfile.py From c5a3c7a3e52991f5e32542c93c9e866e1cac20d8 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 13:54:52 -0300 Subject: [PATCH 10/65] Improved the conan configuration script --- ...ure_conan_profile.sh => setting_conan_profile.sh} | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) rename scripts/{configure_conan_profile.sh => setting_conan_profile.sh} (83%) diff --git a/scripts/configure_conan_profile.sh b/scripts/setting_conan_profile.sh similarity index 83% rename from scripts/configure_conan_profile.sh rename to scripts/setting_conan_profile.sh index da3aab6..2398db8 100755 --- a/scripts/configure_conan_profile.sh +++ b/scripts/setting_conan_profile.sh @@ -32,6 +32,8 @@ printf "$LINE_BRK$SEGMENT" printf "$LINE_BRK$SEGMENT" printf "$TAG Writing default profile\n" + +# New Setup (old had compiler.cppstd=17 and compiler.version=11) cat << EOF > "$PROFILE_DIR/default" [settings] arch=x86_64 @@ -47,7 +49,15 @@ printf "$LINE_BRK$SEGMENT" #----------------------------------------- printf "$TAG Profile created in: $PROFILE_DIR/default\n" -printf "\nHard-check with: cat < $PROFILE_DIR/default\n\n" +printf "$TAG Checking: cat < $PROFILE_DIR/default $LINE_BRK" + +cat < $PROFILE_DIR/default +printf "$LINE_BRK" + +printf "$SEGMENT\n" + +printf "\nHard-check with: cat < $PROFILE_DIR/default$LINE_BRK" + printf "$SEGMENT" printf "$SEGMENT" printf "$SEGMENT" From 033d99e57a9f83d7c5364771991de8199aab8a1a Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 14:16:37 -0300 Subject: [PATCH 11/65] leaving the .*ignore cleaner --- .dockerignore | 90 ++++++++++++++++----------------------------------- .gitignore | 19 ++++------- 2 files changed, 35 insertions(+), 74 deletions(-) diff --git a/.dockerignore b/.dockerignore index e89e21a..c368843 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,62 +1,28 @@ -build -CMakeUserPresets.json -bkp -CMakeLists.txt.user -conan.lock 
-libs*/libtorch -libs*/libtorch - Copy -libs*/libtorch new c11 -models/ -libtorch-cxx11-abi-shared* -openai_api_key* -libtorch-cxx11-abi-shared-with-deps-2.5.1+cpu.zip -libtorch-cxx11-abi-shared-with-deps-2.5.1+cu124.zip -*/libtorch*.zip -*/openai-cpp/* -*/tokenizers-cpp/* -*/faiss/* -venv - -*/purecpp_chuncks_clean/purecpp_chuncks_clean/*.so -*/purecpp_chuncks_clean2/purecpp_chuncks_clean/*.so -*/purecpp_chunks_clean/purecpp_chunks_clean/*.so -*/purecpp_chunks_clean2/purecpp_chunks_clean/*.so -*/purecpp_extract/purecpp_extract/*.so -*/purecpp_embed/purecpp_embed/*.so -*/purecpp_meta/purecpp_meta/*.so -*/purecpp_meta/purecpp_meta_t/*.so -*/purecpp_libs/purecpp_libs/*.so - -*/*/*.egg-info -*/*/build/* -*/*/dist/* - -"testes wsl"/*.so -"testes many linux"/*.so -testes/*.so -testes/"modulos old"/*.so -testes/"modulos pip"/*.so - -./*.so -./*/*.so -./*/*/*.so -./*/*/*/*.so -./*/*/*/*/*.so -*.so -*/*.so -*/*/*.so -*/*/*/*.so -*/*/*/*/*.so -*.whl -*/*.whl -*/*/*.whl -*/*/*/*.whl -*/*/*/*/*.whl -*/libtorch - -*/build/* -*/conan.lock -*/CMakeUserPresets.json -*/Tests/* - -./*.zip \ No newline at end of file +**/build +**/conan.lock +**/CMakeUserPresets.json +bkp/ + +**/*openai_api_key* + +**/venv + +**/*.egg-info +**/dist/* + +**/*.so +**/.whl + +**/Tests/* + +libs/libtorch +libtorch*.zip + +# Ignore everything in the models folder +models/* +# But DO NOT ignore these two files +!models/hf_extract_model.py +!models/hf_model_to_onnx.py + +**/*.pdf +**/*.docx \ No newline at end of file diff --git a/.gitignore b/.gitignore index b1a0d3e..853d1ca 100644 --- a/.gitignore +++ b/.gitignore @@ -43,7 +43,7 @@ CMakeUserPresets.json CMakeFiles/ bkp/ -build/ +**/build/ # Ignore everything in the models folder models/* @@ -51,9 +51,6 @@ models/* !models/hf_extract_model.py !models/hf_model_to_onnx.py -!libs/tokenizers-cpp -!libs/openai-cpp - libtorch*.zip libs/libtorch @@ -63,21 +60,19 @@ extern/* **/*openai_api_key* -package/build/ -package/dist/ +**/build/* +**/dist/* packer 
-package/*.egg-info +**/*.egg-info .env .venv/* .venv*/* .vscode/* .vs/* -venv/* +venv -tests/ compile_commands.json .cache/ -*.pdf -*.docx -test* \ No newline at end of file +**/*.pdf +**/*.docx \ No newline at end of file From 24e676909b4eb566ce365ce9090e4669e50f30a0 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 14:17:38 -0300 Subject: [PATCH 12/65] Changing the location in CMakeLists.txt --- src/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7dcd48c..88bffb0 100755 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -76,6 +76,7 @@ set(CMAKE_BUILD_RPATH "\$ORIGIN/" "\$ORIGIN/purecpp/d_libs/libtorch/cpu/lib" ) + set(CMAKE_INSTALL_RPATH "${PYTHON_SITE_PACKAGES}/*/d_libs/libtorch/cpu/lib" "\$ORIGIN/purecpp.libs" @@ -88,6 +89,7 @@ set(CMAKE_INSTALL_RPATH "\$ORIGIN/" "\$ORIGIN/purecpp/d_libs/libtorch/cpu/lib" ) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--enable-new-dtags") set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) @@ -95,7 +97,7 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) file(GLOB_RECURSE VDB_SRCS ${CMAKE_SOURCE_DIR}/components/VectorDatabase/src/*.cpp) set(RagPUREAI_BINDING_SRCS - ${CMAKE_SOURCE_DIR}/src/binding.cpp + ${CMAKE_SOURCE_DIR}/binding.cpp ${CMAKE_SOURCE_DIR}/components/VectorDatabase/python/binding_vectordb.cpp ) From 879ccd818ac261fca4d8007e2fceb8a09e03f4de Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 14:54:56 -0300 Subject: [PATCH 13/65] Update: .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 853d1ca..7bb6f8c 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,7 @@ models/* libtorch*.zip libs/libtorch +libs/faiss/ conan.lock libtorch/ From 6354072228a7ba2223a77e6d3d853b98867c456d Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 15:22:45 -0300 Subject: [PATCH 14/65] Removing pip install from the dockerfile to reduce the final image by 7 GB 
--- Dockerfile | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 81f755b..dacb5d3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,12 +21,13 @@ RUN yum install -y \ ENV PATH="/root/.cargo/bin:${PATH}" # Upgrade pip & install Python build tools and ML packages -RUN python3 -m pip install --upgrade pip setuptools wheel \ - && pip install --no-cache-dir \ - build conan cmake requests \ - pybind11 numpy \ - torch transformers \ - onnx onnxruntime optimum +# RUN python3 -m pip install --upgrade pip setuptools wheel \ +# && pip install --no-cache-dir \ +# build conan cmake requests \ +# pybind11 numpy \ +# torch transformers \ +# onnx onnxruntime optimum # Set default shell CMD ["/bin/bash"] + From 3f7c3fa94e1d202d2319aa615597c9cb04c3115b Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 15:52:42 -0300 Subject: [PATCH 15/65] put back the Old Setup on (New was set to compiler.cppstd=20 and compiler.version=13. But was resulting in issues.) --- build.sh | 4 +++- scripts/setting_conan_profile.sh | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/build.sh b/build.sh index d69686f..dad9dba 100755 --- a/build.sh +++ b/build.sh @@ -2,7 +2,8 @@ set -e set -x -sed -i s/compiler.version=.*/compiler.version=11/g ~/.conan2/profiles/default +# sed -i s/compiler.version=.*/compiler.version=11/g ~/.conan2/profiles/default + conan install . 
--build=missing cmake \ @@ -11,5 +12,6 @@ cmake \ -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \ -DSPM_USE_BUILTIN_PROTOBUF=OFF \ -G "Unix Makefiles" + cmake --build --preset conan-release --parallel $(nproc) --target RagPUREAI -- diff --git a/scripts/setting_conan_profile.sh b/scripts/setting_conan_profile.sh index 2398db8..140cc46 100755 --- a/scripts/setting_conan_profile.sh +++ b/scripts/setting_conan_profile.sh @@ -33,15 +33,15 @@ printf "$LINE_BRK$SEGMENT" printf "$TAG Writing default profile\n" -# New Setup (old had compiler.cppstd=17 and compiler.version=11) +# Old Setup (New was set to compiler.cppstd=20 and compiler.version=13. But was resulting in issues.) cat << EOF > "$PROFILE_DIR/default" [settings] arch=x86_64 build_type=Release compiler=gcc -compiler.cppstd=20 +compiler.cppstd=17 compiler.libcxx=libstdc++11 -compiler.version=13 +compiler.version=11 os=Linux EOF From c176193efefaaeb11b2d702c2413efc5f278fef2 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 16:10:51 -0300 Subject: [PATCH 16/65] quick fixes --- Dockerfile | 2 +- scripts/setting_conan_profile.sh | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index dacb5d3..040fc74 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ FROM quay.io/pypa/manylinux_2_28_x86_64 ENV PATH="/opt/python/cp312-cp312/bin:${PATH}" # Set working directory -WORKDIR /app +WORKDIR /home # Install development tools, Python deps, Rust, and cleanup to save space RUN yum install -y \ diff --git a/scripts/setting_conan_profile.sh b/scripts/setting_conan_profile.sh index 140cc46..ba63e12 100755 --- a/scripts/setting_conan_profile.sh +++ b/scripts/setting_conan_profile.sh @@ -14,7 +14,6 @@ printf " $TAG" printf "$LINE_BRK$SEGMENT" printf "$TAG conan profile detect --force\n" conan profile detect --force -printf "$LINE_BRK$SEGMENT" #----------------------------------------- #----------------------------------------- @@ -25,12 +24,12 @@ PROFILE_DIR=$(find . 
-type d -wholename "*/.conan2/profiles" | head -n 1 || true [ -z "$PROFILE_DIR" ] && PROFILE_DIR="$HOME/.conan2/profiles" && mkdir -p "$PROFILE_DIR" printf "$TAG Found at $PROFILE_DIR\n" -printf "$LINE_BRK$SEGMENT" + #----------------------------------------- #----------------------------------------- printf "$LINE_BRK$SEGMENT" -printf "$TAG Writing default profile\n" +printf "$TAG Writing default profile$LINE_BRK" # Old Setup (New was set to compiler.cppstd=20 and compiler.version=13. But was resulting in issues.) @@ -58,6 +57,5 @@ printf "$SEGMENT\n" printf "\nHard-check with: cat < $PROFILE_DIR/default$LINE_BRK" -printf "$SEGMENT" -printf "$SEGMENT" -printf "$SEGMENT" +printf "$SEGMENT$SEGMENT$SEGMENT" +printf "\n\n\n\n\n" \ No newline at end of file From 87cd6e0c0dde04f0dbaad79037ae1e59a051a82a Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 16:23:46 -0300 Subject: [PATCH 17/65] Adding logging to scripts --- scripts/install_faiss.sh | 26 +++++++++++++++++++++++++- scripts/install_torch.sh | 27 ++++++++++++++++++++++++++- scripts/setting_conan_profile.sh | 18 ++++++++++++------ 3 files changed, 63 insertions(+), 8 deletions(-) diff --git a/scripts/install_faiss.sh b/scripts/install_faiss.sh index bf1434c..8f12674 100644 --- a/scripts/install_faiss.sh +++ b/scripts/install_faiss.sh @@ -11,8 +11,21 @@ # FAISS_TAG=v1.8.0 ./install_faiss_cpu.sh # pin to a tag/branch (default v1.8.0) # ============================================================================= +#----------------------------------------- +#================= LOGGIN ================ +#----------------------------------------- set -euo pipefail +TAG="[$(basename "${BASH_SOURCE[0]}")]" +LINE_BRK="\n\n" +SEGMENT="===========================================================\n" + +printf "$SEGMENT$SEGMENT$SEGMENT" +printf " $TAG$LINE_BRK" +printf "$SEGMENT" +printf "$LINE_BRK" +#----------------------------------------- + # 
───────────────────────────────────────────────────────────────────────────── # Elevation helper: use sudo only when needed and available # ───────────────────────────────────────────────────────────────────────────── @@ -123,4 +136,15 @@ FOUND_LIB="$(find "$FAISS_DIR/build/faiss" -maxdepth 1 -name 'libfaiss.*' -print if [[ -n "${FOUND_LIB}" && -e "${FOUND_LIB}" ]]; then echo "[ok] FAISS built successfully." echo "[out] Headers : $FAISS_DIR/faiss/" - echo "[out] Library : $FOUND \ No newline at end of file + echo "[out] Library : $FOUND" + + +#----------------------------------------- + +#----------------------------------------- +#================= ENDING ================ +#----------------------------------------- +printf "$SEGMENT$SEGMENT$SEGMENT" +printf "\n\n\n\n\n". +#----------------------------------------- + diff --git a/scripts/install_torch.sh b/scripts/install_torch.sh index 2295712..7ae34fe 100755 --- a/scripts/install_torch.sh +++ b/scripts/install_torch.sh @@ -1,5 +1,22 @@ #!/bin/bash +set -e +#----------------------------------------- +#================= LOGGIN ================ +#----------------------------------------- +set -euo pipefail + +TAG="[$(basename "${BASH_SOURCE[0]}")]" +LINE_BRK="\n\n" +SEGMENT="===========================================================\n" + +printf "$SEGMENT$SEGMENT$SEGMENT" +printf " $TAG$LINE_BRK" +printf "$SEGMENT" +printf "$LINE_BRK" +#----------------------------------------- + +#----------------------------------------- ZIP=libtorch-cxx11-abi-shared-with-deps-2.5.0+cpu.zip URL=https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.5.0%2Bcpu.zip @@ -7,4 +24,12 @@ rm -rf ${ZIP} ./libs/libtorch/cpu wget ${URL} -O ${ZIP} mkdir -p ./libs/libtorch unzip ${ZIP} -d ./libs/libtorch -mv ./libs/libtorch/libtorch ./libs/libtorch/cpu \ No newline at end of file +mv ./libs/libtorch/libtorch ./libs/libtorch/cpu +#----------------------------------------- + +#----------------------------------------- 
+#================= ENDING ================ +#----------------------------------------- +printf "$SEGMENT$SEGMENT$SEGMENT" +printf "\n\n\n\n\n". +#----------------------------------------- \ No newline at end of file diff --git a/scripts/setting_conan_profile.sh b/scripts/setting_conan_profile.sh index ba63e12..b18dfad 100755 --- a/scripts/setting_conan_profile.sh +++ b/scripts/setting_conan_profile.sh @@ -1,17 +1,19 @@ #!/bin/bash set -e +#----------------------------------------- +#================= LOGGIN ================ +#----------------------------------------- TAG="[$(basename "${BASH_SOURCE[0]}")]" LINE_BRK="\n\n" SEGMENT="===========================================================\n" -printf "$SEGMENT" -printf "$SEGMENT" -printf "$SEGMENT" -printf " $TAG" +printf "$SEGMENT$SEGMENT$SEGMENT" +printf " $TAG$LINE_BRK" +#----------------------------------------- #----------------------------------------- -printf "$LINE_BRK$SEGMENT" +printf "$SEGMENT" printf "$TAG conan profile detect --force\n" conan profile detect --force #----------------------------------------- @@ -57,5 +59,9 @@ printf "$SEGMENT\n" printf "\nHard-check with: cat < $PROFILE_DIR/default$LINE_BRK" +#----------------------------------------- +#================= ENDING ================ +#----------------------------------------- printf "$SEGMENT$SEGMENT$SEGMENT" -printf "\n\n\n\n\n" \ No newline at end of file +printf "\n\n\n\n\n". 
+#----------------------------------------- \ No newline at end of file From bac3af5e4e4c65f68a0a7d87a30bdfd347efa8ce Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 16:35:45 -0300 Subject: [PATCH 18/65] Improved naming for better clarity and faster tab-completion --- scripts/{install_faiss.sh => faiss_installer.sh} | 0 scripts/{install_torch.sh => torch_installer.sh} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename scripts/{install_faiss.sh => faiss_installer.sh} (100%) mode change 100644 => 100755 rename scripts/{install_torch.sh => torch_installer.sh} (100%) diff --git a/scripts/install_faiss.sh b/scripts/faiss_installer.sh old mode 100644 new mode 100755 similarity index 100% rename from scripts/install_faiss.sh rename to scripts/faiss_installer.sh diff --git a/scripts/install_torch.sh b/scripts/torch_installer.sh similarity index 100% rename from scripts/install_torch.sh rename to scripts/torch_installer.sh From 5b9e4d4286f46a86fc6b904880d0d7375871a9b1 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 17:00:19 -0300 Subject: [PATCH 19/65] Faiss_installer fix and del unecessary stuff from dockerfile --- Dockerfile | 8 --- scripts/faiss_installer.sh | 99 +++++++++++--------------------------- 2 files changed, 29 insertions(+), 78 deletions(-) diff --git a/Dockerfile b/Dockerfile index 040fc74..11e1ca7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,14 +20,6 @@ RUN yum install -y \ # Add Rust to PATH ENV PATH="/root/.cargo/bin:${PATH}" -# Upgrade pip & install Python build tools and ML packages -# RUN python3 -m pip install --upgrade pip setuptools wheel \ -# && pip install --no-cache-dir \ -# build conan cmake requests \ -# pybind11 numpy \ -# torch transformers \ -# onnx onnxruntime optimum - # Set default shell CMD ["/bin/bash"] diff --git a/scripts/faiss_installer.sh b/scripts/faiss_installer.sh index 8f12674..89b1897 100755 --- a/scripts/faiss_installer.sh +++ b/scripts/faiss_installer.sh @@ -1,16 +1,5 @@ 
#!/usr/bin/env bash -# ============================================================================= -# FAISS CPU Installer Script (C++ only) -# ----------------------------------------------------------------------------- -# Works on Ubuntu/Debian (APT) and manylinux/CentOS-like (YUM) by auto-detecting -# the package manager. It installs build deps and builds FAISS (CPU-only) into -# ../libs/faiss relative to the current working directory. -# ----------------------------------------------------------------------------- -# Usage (optional): -# FAISS_TAG=v1.8.0 ./install_faiss_cpu.sh # pin to a tag/branch (default v1.8.0) -# ============================================================================= - #----------------------------------------- #================= LOGGIN ================ #----------------------------------------- @@ -26,22 +15,20 @@ printf "$SEGMENT" printf "$LINE_BRK" #----------------------------------------- -# ───────────────────────────────────────────────────────────────────────────── -# Elevation helper: use sudo only when needed and available -# ───────────────────────────────────────────────────────────────────────────── + +# sudo se necessário SUDO="" if [[ "$(id -u)" -ne 0 ]]; then if command -v sudo >/dev/null 2>&1; then SUDO="sudo" else - echo "[!] Not running as root and 'sudo' is not available.\n Re-run as root or install sudo." >&2 + echo "[!] Not running as root and 'sudo' is not available. + Re-run as root or install sudo." 
>&2 exit 1 fi fi -# ───────────────────────────────────────────────────────────────────────────── -# Detect package manager -# ───────────────────────────────────────────────────────────────────────────── +# Detecta gerenciador de pacotes PKG_MANAGER="" if command -v apt-get >/dev/null 2>&1; then PKG_MANAGER="apt" @@ -54,48 +41,21 @@ else exit 1 fi -# ───────────────────────────────────────────────────────────────────────────── -# Install dependencies -# ───────────────────────────────────────────────────────────────────────────── +# Dependências echo "[pkg] Installing build dependencies..." if [[ "$PKG_MANAGER" == "apt" ]]; then $SUDO apt-get update -y $SUDO apt-get install -y \ - cmake \ - g++ \ - libopenblas-dev \ - libgflags-dev \ - build-essential \ - python3-dev \ - git \ - unzip \ - wget \ - pkg-config \ - ninja-build + cmake g++ libopenblas-dev libgflags-dev build-essential \ + python3-dev git unzip wget pkg-config ninja-build else - # manylinux/CentOS-like $SUDO yum install -y \ - gcc \ - gcc-c++ \ - make \ - cmake \ - git \ - curl \ - wget \ - ninja-build \ - libffi-devel \ - openssl-devel \ - protobuf-devel \ - gflags-devel \ - zlib-devel \ - unzip \ - openblas-devel \ - pkgconf-pkg-config + gcc gcc-c++ make cmake git curl wget ninja-build \ + libffi-devel openssl-devel protobuf-devel gflags-devel \ + zlib-devel unzip openblas-devel pkgconf-pkg-config fi -# ───────────────────────────────────────────────────────────────────────────── -# Prepare destination -# ───────────────────────────────────────────────────────────────────────────── +# Pastas / TAG PROJ_DIR="$(pwd)" FAISS_DIR="${PROJ_DIR}/../libs/faiss" FAISS_TAG="${FAISS_TAG:-v1.8.0}" @@ -104,15 +64,14 @@ echo "[fs] Preparing ${FAISS_DIR} (fresh clone)" rm -rf "$FAISS_DIR" mkdir -p "$(dirname "$FAISS_DIR")" -# ───────────────────────────────────────────────────────────────────────────── -# Clone & build (CPU-only) -# ───────────────────────────────────────────────────────────────────────────── +# 
Clone echo "[git] Cloning FAISS (${FAISS_TAG})..." -git clone --branch "$FAISS_TAG" --depth 1 https://github.com/facebookresearch/faiss.git "$FAISS_DIR" +git clone --branch "$FAISS_TAG" --single-branch --depth 1 \ + https://github.com/facebookresearch/faiss.git "$FAISS_DIR" cd "$FAISS_DIR" -# Prefer Ninja if available for faster builds +# Ninja se disponível GEN_ARGS=() if command -v ninja >/dev/null 2>&1; then GEN_ARGS+=( -G Ninja ) @@ -122,29 +81,29 @@ echo "[cmake] Configuring (CPU-only, Release)..." cmake -B build "${GEN_ARGS[@]}" \ -DFAISS_ENABLE_GPU=OFF \ -DFAISS_ENABLE_PYTHON=OFF \ - -DFAISS_ENABLE_TESTS=OFF \ - -DCMAKE_BUILD_TYPE=Release + -DBUILD_TESTING=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_POLICY_DEFAULT_CMP0135=NEW -echo "[cmake] Building..." -cmake --build build --config Release --parallel "$(nproc)" +echo "[cmake] Building (target: faiss)..." +cmake --build build --target faiss --config Release --parallel "$(nproc)" -# ───────────────────────────────────────────────────────────────────────────── -# Locate artifacts -# ───────────────────────────────────────────────────────────────────────────── +# Localiza artefatos FOUND_LIB="$(find "$FAISS_DIR/build/faiss" -maxdepth 1 -name 'libfaiss.*' -print -quit 2>/dev/null || true)" if [[ -n "${FOUND_LIB}" && -e "${FOUND_LIB}" ]]; then echo "[ok] FAISS built successfully." echo "[out] Headers : $FAISS_DIR/faiss/" - echo "[out] Library : $FOUND" - - -#----------------------------------------- + echo "[out] Library : $FOUND_LIB" +else + echo "[x] Build finished but libfaiss was not found under build/faiss/" >&2 + exit 2 +fi #----------------------------------------- #================= ENDING ================ #----------------------------------------- printf "$SEGMENT$SEGMENT$SEGMENT" -printf "\n\n\n\n\n". 
+printf "\n\n\n\n\n" #----------------------------------------- - From 120f8d6fdaa8d8c7f35f975c0a6ae5528616c73d Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 17:11:19 -0300 Subject: [PATCH 20/65] Faiss installer --- scripts/faiss_installer.sh | 61 +++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/scripts/faiss_installer.sh b/scripts/faiss_installer.sh index 89b1897..99576cf 100755 --- a/scripts/faiss_installer.sh +++ b/scripts/faiss_installer.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash #----------------------------------------- -#================= LOGGIN ================ +#================= LOGGING =============== #----------------------------------------- set -euo pipefail @@ -15,7 +15,6 @@ printf "$SEGMENT" printf "$LINE_BRK" #----------------------------------------- - # sudo se necessário SUDO="" if [[ "$(id -u)" -ne 0 ]]; then @@ -47,12 +46,12 @@ if [[ "$PKG_MANAGER" == "apt" ]]; then $SUDO apt-get update -y $SUDO apt-get install -y \ cmake g++ libopenblas-dev libgflags-dev build-essential \ - python3-dev git unzip wget pkg-config ninja-build + python3-dev git unzip wget pkg-config ninja-build binutils else $SUDO yum install -y \ gcc gcc-c++ make cmake git curl wget ninja-build \ libffi-devel openssl-devel protobuf-devel gflags-devel \ - zlib-devel unzip openblas-devel pkgconf-pkg-config + zlib-devel unzip openblas-devel pkgconf-pkg-config binutils fi # Pastas / TAG @@ -77,33 +76,75 @@ if command -v ninja >/dev/null 2>&1; then GEN_ARGS+=( -G Ninja ) fi +# Toggles de build/instalação +BUILD_SHARED="${BUILD_SHARED:-OFF}" # export BUILD_SHARED=ON para .so +DO_INSTALL="${DO_INSTALL:-ON}" # export DO_INSTALL=OFF para pular install +INSTALL_PREFIX="${INSTALL_PREFIX:-${FAISS_DIR}/_install}" + +# Paralelismo com fallback +JOBS="$( (command -v nproc >/dev/null && nproc) || getconf _NPROCESSORS_ONLN || echo 2 )" + echo "[cmake] Configuring (CPU-only, Release)..." 
cmake -B build "${GEN_ARGS[@]}" \ + -DBUILD_SHARED_LIBS="${BUILD_SHARED}" \ -DFAISS_ENABLE_GPU=OFF \ -DFAISS_ENABLE_PYTHON=OFF \ -DBUILD_TESTING=OFF \ -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ -DCMAKE_CXX_STANDARD=17 \ - -DCMAKE_POLICY_DEFAULT_CMP0135=NEW + -DCMAKE_POLICY_DEFAULT_CMP0135=NEW \ + -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" -echo "[cmake] Building (target: faiss)..." -cmake --build build --target faiss --config Release --parallel "$(nproc)" +echo "[cmake] Building (target: faiss) with ${JOBS} jobs..." +cmake --build build --target faiss --config Release --parallel "${JOBS}" + +# Instalação opcional (gera include/ e lib/ e config do package) +if [[ "${DO_INSTALL}" == "ON" ]]; then + echo "[cmake] Installing to ${INSTALL_PREFIX}..." + cmake --install build --component faiss 2>/dev/null || cmake --install build +fi # Localiza artefatos -FOUND_LIB="$(find "$FAISS_DIR/build/faiss" -maxdepth 1 -name 'libfaiss.*' -print -quit 2>/dev/null || true)" +if [[ "${BUILD_SHARED}" == "ON" ]]; then + PREFERRED="libfaiss.so" + FALLBACK="libfaiss.a" +else + PREFERRED="libfaiss.a" + FALLBACK="libfaiss.so" +fi + +FOUND_LIB="$(find "$FAISS_DIR/build/faiss" -maxdepth 1 -name "${PREFERRED}" -print -quit 2>/dev/null || true)" +if [[ -z "${FOUND_LIB}" ]]; then + FOUND_LIB="$(find "$FAISS_DIR/build/faiss" -maxdepth 1 -name "${FALLBACK}" -print -quit 2>/dev/null || true)" +fi if [[ -n "${FOUND_LIB}" && -e "${FOUND_LIB}" ]]; then echo "[ok] FAISS built successfully." 
echo "[out] Headers : $FAISS_DIR/faiss/" echo "[out] Library : $FOUND_LIB" + if [[ "${DO_INSTALL}" == "ON" ]]; then + echo "[out] Install : ${INSTALL_PREFIX}" + echo " include : ${INSTALL_PREFIX}/include" + echo " lib : ${INSTALL_PREFIX}/lib" + fi else echo "[x] Build finished but libfaiss was not found under build/faiss/" >&2 exit 2 fi +# Checagens pós-build úteis +if command -v nm >/dev/null 2>&1; then + echo "[check] nm symbols (grep faiss::IndexFlat...):" + nm -C "${FOUND_LIB}" | grep -E 'faiss::IndexFlat' | head || true +fi +if [[ "${FOUND_LIB##*.}" == "so" ]] && command -v ldd >/dev/null 2>&1; then + echo "[check] ldd on shared library:" + ldd "${FOUND_LIB}" || true +fi + #----------------------------------------- #================= ENDING ================ #----------------------------------------- printf "$SEGMENT$SEGMENT$SEGMENT" -printf "\n\n\n\n\n" -#----------------------------------------- +printf "\n" From 73715fe53895a8162229a3dea5f7962e03be9250 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 17:15:30 -0300 Subject: [PATCH 21/65] translation of messages into English --- scripts/faiss_installer.sh | 47 +++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/scripts/faiss_installer.sh b/scripts/faiss_installer.sh index 99576cf..ce35028 100755 --- a/scripts/faiss_installer.sh +++ b/scripts/faiss_installer.sh @@ -1,5 +1,17 @@ #!/usr/bin/env bash + +# ============================================================================= +# FAISS CPU Installer Script (C++ only) +# ----------------------------------------------------------------------------- +# Works on Ubuntu/Debian (APT) and manylinux/CentOS-like (YUM) by auto-detecting +# the package manager. It installs build deps and builds FAISS (CPU-only) into +# ../libs/faiss relative to the current working directory. 
+# ----------------------------------------------------------------------------- +# Usage (optional): +# FAISS_TAG=v1.8.0 ./install_faiss_cpu.sh # pin to a tag/branch (default v1.8.0) +# ============================================================================= + #----------------------------------------- #================= LOGGING =============== #----------------------------------------- @@ -15,7 +27,10 @@ printf "$SEGMENT" printf "$LINE_BRK" #----------------------------------------- -# sudo se necessário + +# ───────────────────────────────────────────────────────────────────────────── +# Elevation helper: use sudo only when needed and available +# ───────────────────────────────────────────────────────────────────────────── SUDO="" if [[ "$(id -u)" -ne 0 ]]; then if command -v sudo >/dev/null 2>&1; then @@ -27,7 +42,9 @@ if [[ "$(id -u)" -ne 0 ]]; then fi fi -# Detecta gerenciador de pacotes +# ───────────────────────────────────────────────────────────────────────────── +# Detect package manager +# ───────────────────────────────────────────────────────────────────────────── PKG_MANAGER="" if command -v apt-get >/dev/null 2>&1; then PKG_MANAGER="apt" @@ -40,7 +57,9 @@ else exit 1 fi -# Dependências +# ───────────────────────────────────────────────────────────────────────────── +# Install dependencies +# ───────────────────────────────────────────────────────────────────────────── echo "[pkg] Installing build dependencies..." 
if [[ "$PKG_MANAGER" == "apt" ]]; then $SUDO apt-get update -y @@ -54,7 +73,9 @@ else zlib-devel unzip openblas-devel pkgconf-pkg-config binutils fi -# Pastas / TAG +# ───────────────────────────────────────────────────────────────────────────── +# Prepare destination +# ───────────────────────────────────────────────────────────────────────────── PROJ_DIR="$(pwd)" FAISS_DIR="${PROJ_DIR}/../libs/faiss" FAISS_TAG="${FAISS_TAG:-v1.8.0}" @@ -63,25 +84,27 @@ echo "[fs] Preparing ${FAISS_DIR} (fresh clone)" rm -rf "$FAISS_DIR" mkdir -p "$(dirname "$FAISS_DIR")" -# Clone +# ───────────────────────────────────────────────────────────────────────────── +# Clone & build (CPU-only) +# ───────────────────────────────────────────────────────────────────────────── echo "[git] Cloning FAISS (${FAISS_TAG})..." git clone --branch "$FAISS_TAG" --single-branch --depth 1 \ https://github.com/facebookresearch/faiss.git "$FAISS_DIR" cd "$FAISS_DIR" -# Ninja se disponível +# Prefer Ninja if available for faster builds GEN_ARGS=() if command -v ninja >/dev/null 2>&1; then GEN_ARGS+=( -G Ninja ) fi -# Toggles de build/instalação +# Build/Install Toggles BUILD_SHARED="${BUILD_SHARED:-OFF}" # export BUILD_SHARED=ON para .so DO_INSTALL="${DO_INSTALL:-ON}" # export DO_INSTALL=OFF para pular install INSTALL_PREFIX="${INSTALL_PREFIX:-${FAISS_DIR}/_install}" -# Paralelismo com fallback +# Parallelism with fallback JOBS="$( (command -v nproc >/dev/null && nproc) || getconf _NPROCESSORS_ONLN || echo 2 )" echo "[cmake] Configuring (CPU-only, Release)..." @@ -99,13 +122,15 @@ cmake -B build "${GEN_ARGS[@]}" \ echo "[cmake] Building (target: faiss) with ${JOBS} jobs..." cmake --build build --target faiss --config Release --parallel "${JOBS}" -# Instalação opcional (gera include/ e lib/ e config do package) +# Optional installation (generates include/ and lib/ and package config) if [[ "${DO_INSTALL}" == "ON" ]]; then echo "[cmake] Installing to ${INSTALL_PREFIX}..." 
cmake --install build --component faiss 2>/dev/null || cmake --install build fi -# Localiza artefatos +# ───────────────────────────────────────────────────────────────────────────── +# Locate artifacts +# ───────────────────────────────────────────────────────────────────────────── if [[ "${BUILD_SHARED}" == "ON" ]]; then PREFERRED="libfaiss.so" FALLBACK="libfaiss.a" @@ -133,7 +158,7 @@ else exit 2 fi -# Checagens pós-build úteis +# Useful post-build checks if command -v nm >/dev/null 2>&1; then echo "[check] nm symbols (grep faiss::IndexFlat...):" nm -C "${FOUND_LIB}" | grep -E 'faiss::IndexFlat' | head || true From 0d670dff657532946e2c99d1b4e1e9df8fdce19f Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 17:19:31 -0300 Subject: [PATCH 22/65] Fixing Build Script --- build.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/build.sh b/build.sh index dad9dba..5ee4df1 100755 --- a/build.sh +++ b/build.sh @@ -4,7 +4,11 @@ set -x # sed -i s/compiler.version=.*/compiler.version=11/g ~/.conan2/profiles/default -conan install . 
--build=missing +rm -fr build conan.lock + +conan lock create ./src --build=missing +conan install ./src --build=missing + cmake \ --preset conan-release \ From e4e8f0e6107dc210867c2717b37c01159c50308d Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 17:41:43 -0300 Subject: [PATCH 23/65] Removing remaining residue from torch installation --- scripts/torch_installer.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/torch_installer.sh b/scripts/torch_installer.sh index 7ae34fe..30e65ec 100755 --- a/scripts/torch_installer.sh +++ b/scripts/torch_installer.sh @@ -25,6 +25,7 @@ wget ${URL} -O ${ZIP} mkdir -p ./libs/libtorch unzip ${ZIP} -d ./libs/libtorch mv ./libs/libtorch/libtorch ./libs/libtorch/cpu +rm -f *.zip #----------------------------------------- #----------------------------------------- From 6d34bc89139d13ec4cc6eb3a6e0a38f076201fde Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 17:52:12 -0300 Subject: [PATCH 24/65] Creating a wrapper script to orchestrate and streamline environment setup through modular installers. 
--- scripts/env_config.sh | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100755 scripts/env_config.sh diff --git a/scripts/env_config.sh b/scripts/env_config.sh new file mode 100755 index 0000000..66520d1 --- /dev/null +++ b/scripts/env_config.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +set -e +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +#----------------------------------------- +#================= LOGGING =============== +#----------------------------------------- +set -euo pipefail + +TAG="[$(basename "${BASH_SOURCE[0]}")]" +LINE_BRK="\n\n" +SEGMENT="===========================================================\n" + +printf "$SEGMENT$SEGMENT$SEGMENT" +printf " $TAG$LINE_BRK" +printf "$SEGMENT" +printf "$LINE_BRK" +#----------------------------------------- + + +pip install build conan cmake requests pybind11 + +"$SCRIPT_DIR/setting_conan_profile.sh" + + +"$SCRIPT_DIR/torch_installer.sh" + + +"$SCRIPT_DIR/faiss_installer.sh" + + + +#----------------------------------------- +#================= ENDING ================ +#----------------------------------------- +printf "$SEGMENT$SEGMENT$SEGMENT" +printf "\n\n\n\n\n". +#----------------------------------------- \ No newline at end of file From 518f1169b9e63922ce86a2c2330528618a8efd30 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 18:00:48 -0300 Subject: [PATCH 25/65] Adding Sandbox environment --- Sandbox/README.md | 11 +++++++++++ scripts/env_config.sh | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) create mode 100644 Sandbox/README.md diff --git a/Sandbox/README.md b/Sandbox/README.md new file mode 100644 index 0000000..77b0cf0 --- /dev/null +++ b/Sandbox/README.md @@ -0,0 +1,11 @@ + +# Sandbox Repository + +This repository is a **sandbox environment** for testing and prototyping. 
+ +**Every time it is compiled (e.g., using the scripts `./build`), the outputs are **redirected** to the `Sandbox/` directory.** + +* Here, the `.so` files (shared libraries) will be available + for experimentation, prototyping, and testing purposes. + +* The `Resources/` folder contains a collection of publicly accessible files in various formats (.doc, .pdf, .txt), used to test the functionalities of the different loaders. These materials are assumed to be free of patent restrictions or in the public domain. \ No newline at end of file diff --git a/scripts/env_config.sh b/scripts/env_config.sh index 66520d1..76015c8 100755 --- a/scripts/env_config.sh +++ b/scripts/env_config.sh @@ -33,6 +33,6 @@ pip install build conan cmake requests pybind11 #----------------------------------------- #================= ENDING ================ #----------------------------------------- -printf "$SEGMENT$SEGMENT$SEGMENT" -printf "\n\n\n\n\n". +printf " END\n" +printf "$SEGMENT$SEGMENT$SEGMENT\n". #----------------------------------------- \ No newline at end of file From 14ecea4492256c1a4d5451f9cbdb74afbbc3a17f Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 18:15:46 -0300 Subject: [PATCH 26/65] Adding handle to send .so to Sandbox environment --- build.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index 5ee4df1..3478c2f 100755 --- a/build.sh +++ b/build.sh @@ -2,8 +2,6 @@ set -e set -x -# sed -i s/compiler.version=.*/compiler.version=11/g ~/.conan2/profiles/default - rm -fr build conan.lock conan lock create ./src --build=missing @@ -19,3 +17,6 @@ cmake \ cmake --build --preset conan-release --parallel $(nproc) --target RagPUREAI -- +rm -f ./Sandbox/*.so +# cp ./src/build/Release/$MOD*.so ./Sandbox/ + From 978f0625ca25149ab5268f1b44d8bb1c974081d3 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 18:15:57 -0300 Subject: [PATCH 27/65] Adding FAISSVectorSearch --- 
.../FAISSVectorSearch/FAISSVectorSearch.cpp | 137 ++++++++++++++++++ .../FAISSVectorSearch/FAISSVectorSearch.h | 73 ++++++++++ 2 files changed, 210 insertions(+) create mode 100644 components/FAISSVectorSearch/FAISSVectorSearch.cpp create mode 100644 components/FAISSVectorSearch/FAISSVectorSearch.h diff --git a/components/FAISSVectorSearch/FAISSVectorSearch.cpp b/components/FAISSVectorSearch/FAISSVectorSearch.cpp new file mode 100644 index 0000000..8664b6d --- /dev/null +++ b/components/FAISSVectorSearch/FAISSVectorSearch.cpp @@ -0,0 +1,137 @@ +#include + +std::optional +FAISSVectorSearch::PureL2(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k) { + Chunk::ChunkQuery cq(query, {}, &chunks, pos); + size_t nq, d, ndb; + std::tie(nq, d, ndb) = cq.getPar(); + + if (k > ndb) { + throw std::invalid_argument("k > base vector"); + } + + faiss::IndexFlatL2 index(d); + + const Chunk::vdb_data* vdb = cq.getVDB(); + if (!vdb) { + throw std::runtime_error("vdb_data is null. Cannot proceed."); + } + + const float* xb = vdb->getVDpointer(); + if (!xb) { + throw std::runtime_error("Empty vector database. 
Cannot proceed."); + } + + index.add(ndb, xb); + + auto emb_query = cq.getEmbedQuery(); + if (emb_query.size() != d) { + throw std::runtime_error("Embedding dimension mismatch."); + } + + std::vector I(k); + std::vector D(k); + index.search(nq, emb_query.data(), k, D.data(), I.data()); + + if (D.size() > 0) { + std::cout << "Nearest index: " << I[0] << std::endl; + std::cout << "Distance: " << D[0] << std::endl; + return FAISSVectorSearch::PureResult{I, D}; + } + return {}; +} + +std::optional +FAISSVectorSearch::PureIP(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k) { + Chunk::ChunkQuery cq(query, {}, &chunks, pos); + size_t nq, d, ndb; + std::tie(nq, d, ndb) = cq.getPar(); + + if (k > ndb) { + throw std::invalid_argument("k > base vector"); + } + + faiss::IndexFlatIP index(d); + + const Chunk::vdb_data* vdb = cq.getVDB(); + if (!vdb) { + throw std::runtime_error("vdb_data is null. Cannot proceed."); + } + + const float* xb = vdb->getVDpointer(); + if (!xb) { + throw std::runtime_error("Empty vector database. Cannot proceed."); + } + + index.add(ndb, xb); + + auto emb_query = cq.getEmbedQuery(); + if (emb_query.size() != d) { + throw std::runtime_error("Embedding dimension mismatch."); + } + + std::vector I(k); + std::vector D(k); + index.search(nq, emb_query.data(), k, D.data(), I.data()); + + if (D.size() > 0) { + std::cout << "Most similar index: " << I[0] << std::endl; + std::cout << "Similarity score: " << D[0] << std::endl; + return FAISSVectorSearch::PureResult{I, D}; + } + return {}; +} + +std::optional +FAISSVectorSearch::PureCosine(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k) { + Chunk::ChunkQuery cq(query, {}, &chunks, pos); + size_t nq, d, ndb; + std::tie(nq, d, ndb) = cq.getPar(); + + if (k > ndb) { + throw std::invalid_argument("k > base vector"); + } + + const Chunk::vdb_data* vdb = cq.getVDB(); + if (!vdb) { + throw std::runtime_error("vdb_data is null. 
Cannot proceed."); + } + + std::vector base = vdb->flatVD; + for (size_t i = 0; i < ndb; ++i) { + normalize_vector(&base[i * d], d); + } + + faiss::IndexFlatIP index(d); + index.add(ndb, base.data()); + + auto emb_query = cq.getEmbedQuery(); + if (emb_query.size() != d) { + throw std::runtime_error("Embedding dimension mismatch."); + } + + std::vector normalized_query = emb_query; + normalize_vector(normalized_query.data(), d); + + std::vector I(k); + std::vector D(k); + index.search(nq, normalized_query.data(), k, D.data(), I.data()); + + if (D.size() > 0) { + return FAISSVectorSearch::PureResult{I, D}; + } + return {}; +} + +void FAISSVectorSearch::normalize_vector(float* vec, size_t d) { + float norm = 0.0f; + for (size_t i = 0; i < d; ++i) { + norm += vec[i] * vec[i]; + } + norm = std::sqrt(norm); + if (norm > 0.0f) { + for (size_t i = 0; i < d; ++i) { + vec[i] /= norm; + } + } +} \ No newline at end of file diff --git a/components/FAISSVectorSearch/FAISSVectorSearch.h b/components/FAISSVectorSearch/FAISSVectorSearch.h new file mode 100644 index 0000000..9298c8e --- /dev/null +++ b/components/FAISSVectorSearch/FAISSVectorSearch.h @@ -0,0 +1,73 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "RagException.h" +#include "StringUtils.h" +#include "CommonStructs.h" +#include "EmbeddingOpenAI.h" +#include "Chunk/ChunkCommons/ChunkCommons.h" +#include "Chunk/ChunkDefault/ChunkDefault.h" +#include "Chunk/ChunkQuery/ChunkQuery.h" + +namespace FAISSVectorSearch { + + // struct vdb_data { + // std::vector flatVD; + // std::string vendor; + // std::string model; + // size_t dim = 0; + // size_t n = 0; + + // inline const std::tuple getPar(void) const { + // return {n, dim}; + // } + + // inline std::pair getEmbPar(void) const { + // return {vendor, model}; + // } + + // inline const float* getVDpointer(void) const { + // if (flatVD.empty()) { 
+ // std::cout << "[Info] Empty Vector Data Base\n"; + // return {}; + // } + // return flatVD.data(); + // } + // }; + + struct PureResult { + std::vector indices; + std::vector distances; + }; + + // L2 distance (Euclidean) + std::optional + PureL2(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k); + + // Inner Product (dot product) + std::optional + PureIP(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k); + + // Cosine similarity (requires normalization) + std::optional + PureCosine(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k); + + // Utility: in-place L2 normalization + void normalize_vector(float* vec, size_t d); + +} \ No newline at end of file From bc919926eafd7c20841c348c72ce2bf662428a67 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 19:08:54 -0300 Subject: [PATCH 28/65] Fixing build.sh and adding logging --- build.sh | 68 +++++++++++++++++++++++++++++++++++++++++-- scripts/env_config.sh | 2 +- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/build.sh b/build.sh index 3478c2f..a73c819 100755 --- a/build.sh +++ b/build.sh @@ -1,12 +1,54 @@ #!/bin/bash set -e set -x +set -euo pipefail -rm -fr build conan.lock +#----------------------------------------- +#================= LOGGING =============== +#----------------------------------------- +TAG="[$(basename "${BASH_SOURCE[0]}")]" +LINE_BRK="\n\n" +SEGMENT="===========================================================\n" +#----------------------------------------- + +#----------------------------------------- +printf "$SEGMENT$SEGMENT$SEGMENT" +printf " Begin $TAG$LINE_BRK" +printf "$SEGMENT" +printf "$LINE_BRK" +#----------------------------------------- + +# ───────────────────────────────────────────────────────────────────────────── +# Conan +# ───────────────────────────────────────────────────────────────────────────── +#----------------------------------------- +printf " Begin [CONAN]$LINE_BRK" +printf 
"$SEGMENT" +printf "$LINE_BRK" +#----------------------------------------- + +rm -fr ./src/build ./src/conan.lock conan lock create ./src --build=missing conan install ./src --build=missing +#----------------------------------------- +#================= ENDING ================ +#----------------------------------------- +printf "$SEGMENT" +printf " Finish [CONAN]\n" +printf "$SEGMENT$SEGMENT$SEGMENT\n" +#----------------------------------------- + +# ───────────────────────────────────────────────────────────────────────────── +# Build +# ───────────────────────────────────────────────────────────────────────────── +#----------------------------------------- +printf " Begin [Build]$LINE_BRK" +printf "$SEGMENT" +printf "$LINE_BRK" +#----------------------------------------- +cd src/ cmake \ --preset conan-release \ @@ -16,7 +58,27 @@ cmake \ -G "Unix Makefiles" cmake --build --preset conan-release --parallel $(nproc) --target RagPUREAI -- +#----------------------------------------- +#================= ENDING ================ +#----------------------------------------- +printf "$SEGMENT" +printf " Finish [Build]\n" +printf "$SEGMENT$SEGMENT$SEGMENT\n" +#----------------------------------------- + +# ───────────────────────────────────────────────────────────────────────────── +# Sending to Sandbox +# ───────────────────────────────────────────────────────────────────────────── +printf "[Last Step] Sending to Sandbox \n" + +rm -f ../Sandbox/*.so -rm -f ./Sandbox/*.so -# cp ./src/build/Release/$MOD*.so ./Sandbox/ +# cp ./src/build/Release/.so ../Sandbox/ +#----------------------------------------- +#================= ENDING ================ +#----------------------------------------- +printf "$SEGMENT" +printf " Finish $TAG\n" +printf "$SEGMENT$SEGMENT$SEGMENT\n" +#----------------------------------------- diff --git a/scripts/env_config.sh b/scripts/env_config.sh index 76015c8..6e77037 100755 --- a/scripts/env_config.sh +++ b/scripts/env_config.sh @@ -34,5 +34,5 @@ 
pip install build conan cmake requests pybind11 #================= ENDING ================ #----------------------------------------- printf " END\n" -printf "$SEGMENT$SEGMENT$SEGMENT\n". +printf "$SEGMENT$SEGMENT$SEGMENT\n" #----------------------------------------- \ No newline at end of file From a44c26a5051850fcee4c3a7badc69ac62038b59b Mon Sep 17 00:00:00 2001 From: BRUNO B ZAFFARI Date: Sun, 14 Sep 2025 19:19:22 -0300 Subject: [PATCH 29/65] Update build script to improve error handling --- build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index a73c819..2063677 100755 --- a/build.sh +++ b/build.sh @@ -1,7 +1,7 @@ #!/bin/bash -set -e -set -x set -euo pipefail +set -x + #----------------------------------------- #================= LOGGING =============== From 90ef76a0930b19c4cbb6762db6fe317c2db95887 Mon Sep 17 00:00:00 2001 From: BRUNO B ZAFFARI Date: Sun, 14 Sep 2025 19:31:05 -0300 Subject: [PATCH 30/65] Update env_config.sh --- scripts/env_config.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/env_config.sh b/scripts/env_config.sh index 6e77037..4789352 100755 --- a/scripts/env_config.sh +++ b/scripts/env_config.sh @@ -1,12 +1,9 @@ #!/usr/bin/env bash -set -e -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +set -euo pipefail #----------------------------------------- #================= LOGGING =============== #----------------------------------------- -set -euo pipefail - TAG="[$(basename "${BASH_SOURCE[0]}")]" LINE_BRK="\n\n" SEGMENT="===========================================================\n" @@ -18,6 +15,9 @@ printf "$LINE_BRK" #----------------------------------------- +#----------------------------------------- +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + pip install build conan cmake requests pybind11 "$SCRIPT_DIR/setting_conan_profile.sh" @@ -27,7 +27,7 @@ pip install build conan cmake requests pybind11 
"$SCRIPT_DIR/faiss_installer.sh" - +#----------------------------------------- #----------------------------------------- @@ -35,4 +35,4 @@ pip install build conan cmake requests pybind11 #----------------------------------------- printf " END\n" printf "$SEGMENT$SEGMENT$SEGMENT\n" -#----------------------------------------- \ No newline at end of file +#----------------------------------------- From 8346d1375e1cec04296b26520ec50f369d765d76 Mon Sep 17 00:00:00 2001 From: BRUNO B ZAFFARI Date: Sun, 14 Sep 2025 19:31:25 -0300 Subject: [PATCH 31/65] Add set -euo pipefail to faiss_installer.sh Enable strict error handling in the FAISS installer script. --- scripts/faiss_installer.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/faiss_installer.sh b/scripts/faiss_installer.sh index ce35028..9d9e4da 100755 --- a/scripts/faiss_installer.sh +++ b/scripts/faiss_installer.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash +set -euo pipefail # ============================================================================= # FAISS CPU Installer Script (C++ only) @@ -15,7 +16,6 @@ #----------------------------------------- #================= LOGGING =============== #----------------------------------------- -set -euo pipefail TAG="[$(basename "${BASH_SOURCE[0]}")]" LINE_BRK="\n\n" From 8c2958794ce65d4d4208af96813c869dff6f8f2c Mon Sep 17 00:00:00 2001 From: BRUNO B ZAFFARI Date: Sun, 14 Sep 2025 19:31:46 -0300 Subject: [PATCH 32/65] Enhance script error handling with pipefail --- scripts/setting_conan_profile.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/setting_conan_profile.sh b/scripts/setting_conan_profile.sh index b18dfad..2242b4f 100755 --- a/scripts/setting_conan_profile.sh +++ b/scripts/setting_conan_profile.sh @@ -1,5 +1,6 @@ #!/bin/bash -set -e + +set -euo pipefail #----------------------------------------- #================= LOGGIN ================ @@ -64,4 +65,4 @@ printf "\nHard-check with: cat < 
$PROFILE_DIR/default$LINE_BRK" #----------------------------------------- printf "$SEGMENT$SEGMENT$SEGMENT" printf "\n\n\n\n\n". -#----------------------------------------- \ No newline at end of file +#----------------------------------------- From f0b60a866a7d958b5e57b912ed032774ba6190b2 Mon Sep 17 00:00:00 2001 From: BRUNO B ZAFFARI Date: Sun, 14 Sep 2025 19:32:06 -0300 Subject: [PATCH 33/65] Enable strict error handling in torch_installer.sh --- scripts/torch_installer.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/torch_installer.sh b/scripts/torch_installer.sh index 30e65ec..e0dc88b 100755 --- a/scripts/torch_installer.sh +++ b/scripts/torch_installer.sh @@ -1,10 +1,9 @@ #!/bin/bash -set -e +set -euo pipefail #----------------------------------------- #================= LOGGIN ================ #----------------------------------------- -set -euo pipefail TAG="[$(basename "${BASH_SOURCE[0]}")]" LINE_BRK="\n\n" @@ -33,4 +32,4 @@ rm -f *.zip #----------------------------------------- printf "$SEGMENT$SEGMENT$SEGMENT" printf "\n\n\n\n\n". 
-#----------------------------------------- \ No newline at end of file +#----------------------------------------- From 1e502879d554c69cbac4e2fd0f508cc43095b21d Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 23:20:53 -0300 Subject: [PATCH 34/65] Fix build.sh logging --- build.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/build.sh b/build.sh index 2063677..5cf78ab 100755 --- a/build.sh +++ b/build.sh @@ -1,14 +1,13 @@ -#!/bin/bash -set -euo pipefail -set -x +#!/usr/bin/env bash +set -euo pipefail #----------------------------------------- #================= LOGGING =============== #----------------------------------------- TAG="[$(basename "${BASH_SOURCE[0]}")]" -LINE_BRK="\n\n" -SEGMENT="===========================================================\n" +LINE_BRK=$'\n\n' +SEGMENT=$'===========================================================\n' #----------------------------------------- #----------------------------------------- From 9b4558a271c6399ee287da68adcbaa05b727bf8c Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 14 Sep 2025 23:57:17 -0300 Subject: [PATCH 35/65] Starting to fix the integration with cmakelists in the new environment --- build.sh | 26 +++- src/CMakeLists.txt | 375 +++++++++++++++++++++++++++++---------------- 2 files changed, 262 insertions(+), 139 deletions(-) diff --git a/build.sh b/build.sh index 5cf78ab..f2143cd 100755 --- a/build.sh +++ b/build.sh @@ -17,6 +17,8 @@ printf "$SEGMENT" printf "$LINE_BRK" #----------------------------------------- + + # ───────────────────────────────────────────────────────────────────────────── # Conan # ───────────────────────────────────────────────────────────────────────────── @@ -39,6 +41,8 @@ printf " Finish [CONAN]\n" printf "$SEGMENT$SEGMENT$SEGMENT\n" #----------------------------------------- + + # ───────────────────────────────────────────────────────────────────────────── # Build # 
───────────────────────────────────────────────────────────────────────────── @@ -49,14 +53,20 @@ printf "$LINE_BRK" #----------------------------------------- cd src/ -cmake \ - --preset conan-release \ - -DCMAKE_POLICY_DEFAULT_CMP0091=NEW \ - -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \ - -DSPM_USE_BUILTIN_PROTOBUF=OFF \ - -G "Unix Makefiles" +cmake -DCMAKE_POLICY_DEFAULT_CMP0091=NEW \ + -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \ + -DBUILD_SHARED_LIBS=OFF \ + -D_GLIBCXX_USE_CXX11_ABI=1 \ + -DSPM_USE_BUILTIN_PROTOBUF=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=generators/conan_toolchain.cmake \ + -S "$(pwd)" \ + -B "$(pwd)/build/Release" \ + -G "Unix Makefiles" + +cmake --build "$(pwd)/build/Release" --parallel $(nproc) +# cmake --build --preset conan-release --parallel $(nproc) --target RagPUREAI -cmake --build --preset conan-release --parallel $(nproc) --target RagPUREAI -- #----------------------------------------- #================= ENDING ================ #----------------------------------------- @@ -65,6 +75,8 @@ printf " Finish [Build]\n" printf "$SEGMENT$SEGMENT$SEGMENT\n" #----------------------------------------- + + # ───────────────────────────────────────────────────────────────────────────── # Sending to Sandbox # ───────────────────────────────────────────────────────────────────────────── diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 88bffb0..42c4dd6 100755 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,44 +1,143 @@ cmake_minimum_required(VERSION 3.22) project(RagPUREAI VERSION 1.0) -# General build settings -set(CMAKE_CXX_STANDARD 23) -set(CMAKE_CXX_STANDARD_REQUIRED True) +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CUDA_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) +# set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_BUILD_TYPE "Release") -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -# Toolchain -if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + +# 
───────────────────────────────────────────────────────────────────────────── +#------- Caminhos para FAISS +# ───────────────────────────────────────────────────────────────────────────── +set(FAISS_ROOT "${CMAKE_BINARY_DIR}/../../../libs/faiss") +set(FAISS_INCLUDE_DIR "${FAISS_ROOT}/faiss") +set(FAISS_LIB_DIR "${FAISS_ROOT}/build/faiss") # Deve conter libfaiss.a + +add_library(faiss STATIC IMPORTED) # Declara FAISS como biblioteca já compilada (IMPORTED) +set_target_properties(faiss PROPERTIES + IMPORTED_LOCATION "${FAISS_LIB_DIR}/libfaiss.a" + INTERFACE_INCLUDE_DIRECTORIES "${FAISS_INCLUDE_DIR}" +) + + +# ───────────────────────────────────────────────────────────────────────────── +#------- Compiler Specific flags +# ───────────────────────────────────────────────────────────────────────────── +if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")# Flags específicas do MSVC set(CMAKE_TOOLCHAIN_FILE ${CMAKE_BINARY_DIR}/generators/conan_toolchain.cmake) - set(CMAKE_CXX_FLAGS_RELEASE "/Od") -else() + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Od /EHsc /MP /openmp /MD") +else() # Flags para G++/Clang ou G++/GNU em Linux + # Mantém as flags de otimização e adiciona -std=c++23 set(CMAKE_TOOLCHAIN_FILE ${CMAKE_BINARY_DIR}/Release/generators/conan_toolchain.cmake) - set(CMAKE_CXX_FLAGS_RELEASE "-O0") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -fopenmp -pthread") + set(CMAKE_CXX_FLAGS_RELEASE "-O0 -std=c++23") endif() +# ───────────────────────────────────────────────────────────────────────────── +#------- Options +# ───────────────────────────────────────────────────────────────────────────── option(CURL_STATIC_LINKING "Set to ON to build libcurl with static linking." 
OFF) -option(BUILD_APPS "Build apps" OFF) -# Python & Pybind11 -find_package(Python3 REQUIRED COMPONENTS Interpreter Development) +if(CURL_STATIC_LINKING) + message("-DCURL_STATICLIB [added]") + add_definitions(-DCURL_STATICLIB) +endif() + + +# ───────────────────────────────────────────────────────────────────────────── +#------- Find Python +# ───────────────────────────────────────────────────────────────────────────── +set(Python3_FIND_SHARED OFF)#Precisa sera? +find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) + +if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + set(PYTHON_LIB_PATH "C:\\Program Files\\Python312\\libs") + link_directories(${Python3_LIBRARY_DIRS}) +endif() + include_directories(${Python3_INCLUDE_DIRS}) + +# ───────────────────────────────────────────────────────────────────────────── +#------- Find Pybind11 +# ───────────────────────────────────────────────────────────────────────────── +message(STATUS "---------------------------------------\n") +if(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 8) + message(STATUS "Python 3.8 detectado!") + set(pybind11_DIR "/opt/python/cp38-cp38/lib/python3.8/site-packages/pybind11/share/cmake/pybind11") + # add_definitions(-DPy_LIMITED_API=0x03080000) + +elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 9) + message(STATUS "Python 3.9 detectado!") + set(pybind11_DIR "/opt/python/cp39-cp39/lib/python3.9/site-packages/pybind11/share/cmake/pybind11") + # add_definitions(-DPy_LIMITED_API=0x03090000) + +elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 10) + message(STATUS "Python 3.10 detectado!") + set(pybind11_DIR "/opt/python/cp310-cp310/lib/python3.10/site-packages/pybind11/share/cmake/pybind11") + # add_definitions(-DPy_LIMITED_API=0x030A0000) + +elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 11) + message(STATUS "Python 3.11 detectado!") + set(pybind11_DIR 
"/opt/python/cp311-cp311/lib/python3.11/site-packages/pybind11/share/cmake/pybind11") + # add_definitions(-DPy_LIMITED_API=0x030B0000) + +elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 12) + message(STATUS "Python 3.12 detectado!") + set(pybind11_DIR "/opt/python/cp312-cp312/lib/python3.12/site-packages/pybind11/share/cmake/pybind11") + # add_definitions(-DPy_LIMITED_API=0x030C0000) + +elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 13) + message(STATUS "Python 3.13 detectado!") + set(pybind11_DIR "/opt/python/cp313-cp313/lib/python3.13/site-packages/pybind11/share/cmake/pybind11") + # add_definitions(-DPy_LIMITED_API=0x030D0000) + +else() + message(STATUS "Versão do Python não especificada nos if's anteriores!") + # Configuração padrão ou outra ação + +endif() + + +# ───────────────────────────────────────────────────────────────────────────── +#------- Find other dependencies +# ───────────────────────────────────────────────────────────────────────────── find_package(pybind11 REQUIRED) +find_package(pdfium REQUIRED) # Biblioteca para manipulação de PDFs +find_package(OpenMP REQUIRED) # OpenMP primeiro, pois pode ser usado por outros pacotes +find_package(ICU REQUIRED) # Biblioteca de internacionalização +find_package(miniz REQUIRED) # Biblioteca de compressão +find_package(rapidxml REQUIRED) # Parser XML +find_package(beauty REQUIRED) # HTTP Server (geralmente independente) +find_package(lexbor REQUIRED) # Biblioteca de parsing HTML +find_package(re2 REQUIRED) # Biblioteca de regex eficiente +find_package(nlohmann_json REQUIRED) # Biblioteca de JSON (não tem dependências) +#find_package(fmt REQUIRED) # Biblioteca de formatação de strings em casos aonde n tem format em compiladores mais antigos +find_package(CURL REQUIRED) # Biblioteca para requisições HTTP +find_package(Threads REQUIRED) + +# ───────────────────────────────────────────────────────────────────────────── +#------- Protobuf ------- +# 
───────────────────────────────────────────────────────────────────────────── +find_package(Protobuf REQUIRED) + +if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + link_directories(${Protobuf_LIBRARY_DIRS}) + include_directories(C:/vcpkg/packages/protobuf_x64-windows/include) + if (NOT DEFINED PROTOBUF_PROTOC_EXECUTABLE) + # include_directories(${Protobuf_INCLUDE_DIRS})# A principio o target Protobuf_INCLUDE_DIRS n existe, mas n sei se é necessario + set(PROTOBUF_PROTOC_EXECUTABLE "C:/vcpkg/packages/protobuf_x64-windows/tools/protobuf/protoc.exe") + endif() +endif() -# External Dependencies -find_package(pdfium REQUIRED) -find_package(ICU REQUIRED) -find_package(miniz REQUIRED) -find_package(rapidxml REQUIRED) -find_package(beauty REQUIRED) -find_package(lexbor REQUIRED) -find_package(OpenMP REQUIRED) -find_package(re2 REQUIRED) -find_package(nlohmann_json REQUIRED) -find_package(CURL REQUIRED) -find_package(onnxruntime REQUIRED) -find_package(redis++ REQUIRED) +find_package(onnxruntime REQUIRED) # Pode depender de protobuf (verifique se precisa!) +# ───────────────────────────────────────────────────────────────────────────── +#------- Descobrir o diretório site-packages do Python atual +# ───────────────────────────────────────────────────────────────────────────── +# Usaremos sysconfig para ser mais robusto em várias versões de Python. 
execute_process( COMMAND "${Python3_EXECUTABLE}" -c "import sysconfig; import pathlib; site_packages = sysconfig.get_paths()['purelib']; print(str(pathlib.Path(site_packages).resolve()))" @@ -46,24 +145,23 @@ execute_process( OUTPUT_STRIP_TRAILING_WHITESPACE ) -# Protobuf -find_package(Protobuf REQUIRED) -include_directories(${Protobuf_INCLUDE_DIRS}) +# ───────────────────────────────────────────────────────────────────────────── +#------- Torch ------- +# ───────────────────────────────────────────────────────────────────────────── +# set(_TORCH_REL_PATH "dependencias_libs/d_libs/libtorch/cpu") +set(Torch_DIR "${CMAKE_SOURCE_DIR}/../libs/libtorch/cpu/share/cmake/Torch") -# Torch -set(Torch_DIR "${CMAKE_SOURCE_DIR}/libs/libtorch/cpu/share/cmake/Torch") find_package(Torch REQUIRED) -include_directories("${CMAKE_SOURCE_DIR}/libs/libtorch/cpu/include") -link_directories("${CMAKE_SOURCE_DIR}/libs/libtorch/cpu/lib") + +# Ajustar includes e bibliotecas do Torch +include_directories("${CMAKE_SOURCE_DIR}/../libs/libtorch/cpu/include") +link_directories("${CMAKE_SOURCE_DIR}/../libs/libtorch/cpu/lib") -# Tokenizers -set(TOKENIZERS_PATH "${CMAKE_SOURCE_DIR}/libs/tokenizers-cpp") -add_subdirectory(${TOKENIZERS_PATH} tokenizers EXCLUDE_FROM_ALL) - -# OpenAI -set(OPEANAI_CPP_PATH "${CMAKE_SOURCE_DIR}/libs/openai-cpp") -# RPATH +# ───────────────────────────────────────────────────────────────────────────── +#------- RPATH +# ───────────────────────────────────────────────────────────────────────────── +#------- Configuração do RPATH/RUNPATH/LIBRARY_PATH -------> Ajustar rpath para encontrar as bibliotecas do Torch set(CMAKE_BUILD_RPATH "${PYTHON_SITE_PACKAGES}/*/d_libs/libtorch/cpu/lib" "\$ORIGIN/purecpp.libs" @@ -93,120 +191,133 @@ set(CMAKE_INSTALL_RPATH set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--enable-new-dtags") set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) -# Sources -file(GLOB_RECURSE VDB_SRCS ${CMAKE_SOURCE_DIR}/components/VectorDatabase/src/*.cpp) 
-set(RagPUREAI_BINDING_SRCS - ${CMAKE_SOURCE_DIR}/binding.cpp - ${CMAKE_SOURCE_DIR}/components/VectorDatabase/python/binding_vectordb.cpp -) -set(RagPUREAI_IMPL_SRCS - ${VDB_SRCS} - ${CMAKE_SOURCE_DIR}/libs/StringUtils/StringUtils.cpp - ${CMAKE_SOURCE_DIR}/libs/CommonStructs/CommonStructs.cpp - ${CMAKE_SOURCE_DIR}/components/DataLoader/BaseLoader.cpp - ${CMAKE_SOURCE_DIR}/components/DataLoader/PDFLoader/PDFLoader.cpp - ${CMAKE_SOURCE_DIR}/components/DataLoader/DOCXLoader/DOCXLoader.cpp - ${CMAKE_SOURCE_DIR}/components/DataLoader/WebLoader/WebLoader.cpp - ${CMAKE_SOURCE_DIR}/components/DataLoader/TXTLoader/TXTLoader.cpp - - ${CMAKE_SOURCE_DIR}/components/MetadataExtractor/MetadataExtractor.cpp - ${CMAKE_SOURCE_DIR}/components/MetadataExtractor/MetadataRegexExtractor/MetadataRegexExtractor.cpp - ${CMAKE_SOURCE_DIR}/components/MetadataExtractor/MetadataHFExtractor/MetadataHFExtractor.cpp - - ${CMAKE_SOURCE_DIR}/components/Embedding/EmbeddingOpenAI/EmbeddingOpenAI.cpp - ${CMAKE_SOURCE_DIR}/components/Embedding/EmbeddingModel/EmbeddingModel.cpp - - ${CMAKE_SOURCE_DIR}/components/Chunk/ChunkCommons/ChunkCommons.cpp - ${CMAKE_SOURCE_DIR}/components/Chunk/ChunkCount/ChunkCount.cpp - ${CMAKE_SOURCE_DIR}/components/Chunk/ChunkDefault/ChunkDefault.cpp - ${CMAKE_SOURCE_DIR}/components/Chunk/ChunkSimilarity/ChunkSimilarity.cpp - ${CMAKE_SOURCE_DIR}/components/Chunk/ChunkQuery/ChunkQuery.cpp - - ${CMAKE_SOURCE_DIR}/components/CleanData/ContentCleaner/ContentCleaner.cpp - - ${CMAKE_SOURCE_DIR}/components/Chat/Message/HumanMessage.cpp - ${CMAKE_SOURCE_DIR}/components/Chat/Message/AIMessage.cpp - ${CMAKE_SOURCE_DIR}/components/Chat/Message/SystemMessage.cpp - ${CMAKE_SOURCE_DIR}/components/Chat/ChatHistory/ChatHistory.cpp +# ───────────────────────────────────────────────────────────────────────────── +#------- Tokenizers ------- +# ───────────────────────────────────────────────────────────────────────────── +set(TOKENIZERS_PATH "${CMAKE_SOURCE_DIR}/../libs/tokenizers-cpp") 
+add_subdirectory(${TOKENIZERS_PATH} tokenizers EXCLUDE_FROM_ALL) + +# ───────────────────────────────────────────────────────────────────────────── +#------- OpenAI C++ bindings ------- +# ───────────────────────────────────────────────────────────────────────────── +set(OPENAI_CPP_PATH "${CMAKE_SOURCE_DIR}/../libs/openai-cpp") + +# ───────────────────────────────────────────────────────────────────────────── +# ----- Fontes do projeto ----- +# ───────────────────────────────────────────────────────────────────────────── +set(RagPUREAI_BINDING_SRCS "${CMAKE_SOURCE_DIR}/binding.cpp") + +set(RagPUREAI_IMPL_SRCS + ${CMAKE_SOURCE_DIR}/../libs/StringUtils/StringUtils.cpp + + ${CMAKE_SOURCE_DIR}/../components/Embedding/BaseEmbedding.cpp + ${CMAKE_SOURCE_DIR}/../components/Embedding/EmbeddingOpenAI/EmbeddingOpenAI.cpp + + ${CMAKE_SOURCE_DIR}/../components/Chunk/ChunkCommons/ChunkCommons.cpp + ${CMAKE_SOURCE_DIR}/../components/Chunk/ChunkCount/ChunkCount.cpp + ${CMAKE_SOURCE_DIR}/../components/Chunk/ChunkDefault/ChunkDefault.cpp + ${CMAKE_SOURCE_DIR}/../components/Chunk/ChunkSimilarity/ChunkSimilarity.cpp + ${CMAKE_SOURCE_DIR}/../components/Chunk/ChunkQuery/ChunkQuery.cpp + + ${CMAKE_SOURCE_DIR}/../components/CleanData/ContentCleaner/ContentCleaner.cpp + ${CMAKE_SOURCE_DIR}/../components/FAISSVectorSearch/FAISSVectorSearch.cpp ) -# RagPUREAILib + +# ───────────────────────────────────────────────────────────────────────────── +#------- RagPUREAI Library ------- +# ───────────────────────────────────────────────────────────────────────────── + add_library(RagPUREAILib STATIC ${RagPUREAI_IMPL_SRCS}) -target_include_directories(RagPUREAILib PUBLIC - ${CMAKE_SOURCE_DIR}/components - ${CMAKE_SOURCE_DIR}/components/DataLoader - ${CMAKE_SOURCE_DIR}/components/MetadataExtractor - ${CMAKE_SOURCE_DIR}/components/Chunk - ${CMAKE_SOURCE_DIR}/components/CleanData - ${CMAKE_SOURCE_DIR}/components/Embedding - ${CMAKE_SOURCE_DIR}/components/Embedding/EmbeddingOpenAI - 
${CMAKE_SOURCE_DIR}/components/Embedding/EmbeddingModel - - ${CMAKE_SOURCE_DIR}/components/Chat - ${CMAKE_SOURCE_DIR}/components/Chat/ChatHistory - ${CMAKE_SOURCE_DIR}/components/Chat/Message - - ${CMAKE_SOURCE_DIR}/components/VectorDatabase/include - - ${CMAKE_SOURCE_DIR}/libs/RagException - ${CMAKE_SOURCE_DIR}/libs/ThreadSafeQueue - ${CMAKE_SOURCE_DIR}/libs/CommonStructs - ${CMAKE_SOURCE_DIR}/libs/StringUtils - ${CMAKE_SOURCE_DIR}/libs/FileUtils - ${CMAKE_SOURCE_DIR}/libs/MemoryUtils - ${TOKENIZERS_PATH}/include - ${OPEANAI_CPP_PATH}/include - ${CMAKE_SOURCE_DIR}/libs/libtorch/cpu/include + +target_include_directories(RagPUREAILib PUBLIC #.h + ${CMAKE_SOURCE_DIR}/../libs/RagException + ${CMAKE_SOURCE_DIR}/../libs/ThreadSafeQueue + ${CMAKE_SOURCE_DIR}/../libs/CommonStructs + ${CMAKE_SOURCE_DIR}/../libs/StringUtils + ${CMAKE_SOURCE_DIR}/../libs/FileUtils + ${CMAKE_SOURCE_DIR}/../libs/MemoryUtils + #C:/vcpkg/packages/protobuf_x64-windows/include ${CURL_INCLUDE_DIRS} + ${OPENAI_CPP_PATH}/include ${TORCH_INCLUDE_DIRS} + ${TOKENIZERS_PATH}/include + ${FAISS_INCLUDE_DIR} + + ${CMAKE_SOURCE_DIR}/../components + ${CMAKE_SOURCE_DIR}/../components/DataLoader + ${CMAKE_SOURCE_DIR}/../components/MetadataExtractor + ${CMAKE_SOURCE_DIR}/../components/Chunk + ${CMAKE_SOURCE_DIR}/../components/Chunk/ChunkCommons + ${CMAKE_SOURCE_DIR}/../components/CleanData/ContentCleaner + ${CMAKE_SOURCE_DIR}/../components/Embedding/EmbeddingOpenAI + ${CMAKE_SOURCE_DIR}/../components/Embedding + ${CMAKE_SOURCE_DIR}/../components/FAISSVectorSearch + ${CMAKE_SOURCE_DIR}/../libs/faiss/ ) +link_directories(${FAISS_LIB_DIR}) + +# ───────────────────────────────────────────────────────────────────────────── +# Linkar bibliotecas com RagPUREAILib +# ───────────────────────────────────────────────────────────────────────────── target_link_libraries(RagPUREAILib PUBLIC pdfium::pdfium icu::icu miniz::miniz rapidxml::rapidxml beauty::beauty - lexbor::lexbor_static - OpenMP::OpenMP_CXX + #fmt::fmt + 
lexbor::lexbor_static re2::re2 - nlohmann_json::nlohmann_json - redis++::redis++_static - hiredis::hiredis - onnxruntime::onnxruntime - tokenizers_cpp - protobuf::libprotobuf + protobuf::libprotobuf + onnxruntime::onnxruntime + OpenMP::OpenMP_CXX + Threads::Threads CURL::libcurl + nlohmann_json::nlohmann_json ${Python3_LIBRARIES} - ${TORCH_LIBRARIES} + ${TORCH_LIBRARIES} + tokenizers_cpp + faiss ) +# ───────────────────────────────────────────────────────────────────────────── +#---------------- Pybind11 Module +# ───────────────────────────────────────────────────────────────────────────── -# Binding with Pybind11 pybind11_add_module(RagPUREAI ${RagPUREAI_BINDING_SRCS}) -target_link_libraries(RagPUREAI PRIVATE RagPUREAILib) - -# vectordb -pybind11_add_module(vectordb components/VectorDatabase/python/_vectordb.cpp) -target_link_libraries(vectordb PRIVATE - -Wl,--whole-archive - VectorDatabase - -Wl,--no-whole-archive -) -# Disables LTO/IPO in the module to avoid ODR/refcount problems. - -set_property(TARGET vectordb PROPERTY INTERPROCEDURAL_OPTIMIZATION FALSE) -target_compile_options(vectordb PRIVATE -fno-lto) -target_link_options(vectordb PRIVATE -fno-lto) +target_link_libraries(RagPUREAI PRIVATE RagPUREAILib) -# .so output -set_target_properties(vectordb PROPERTIES - OUTPUT_NAME "vectordb" - LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/python" - ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/python" - RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/python" -) +# ───────────────────────────────────────────────────────────────────────────── +#---------------- DEBUG +# ───────────────────────────────────────────────────────────────────────────── +# message(STATUS "---------------------------------------\n") +# message(STATUS "------------------------> Protobuf_FOUND: ${Protobuf_FOUND}") +# message(STATUS "------------------------> Protobuf_VERSION: ${Protobuf_VERSION}") +# message(STATUS "------------------------> Protobuf_INCLUDE_DIRS: 
${Protobuf_INCLUDE_DIRS}") +# message(STATUS "------------------------> Protobuf_LIBRARIES: ${Protobuf_LIBRARIES}") +# message(STATUS "------------------------> Protobuf_PROTOC_EXECUTABLE: ${Protobuf_PROTOC_EXECUTABLE}") +# message(STATUS "------------------------> Protobuf_LITE_LIBRARIES: ${Protobuf_LITE_LIBRARIES}") +# message(STATUS "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n--------------------------------------- ") +# message(STATUS "---------------------------------------\n") +# message(STATUS "------------------------> Protobuf_FOUND: ${Protobuf_FOUND}") +# message(STATUS "------------------------> Protobuf_VERSION: ${Protobuf_VERSION}") +# message(STATUS "------------------------> Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS}") +# message(STATUS "------------------------> Protobuf_LIBRARIES: ${Protobuf_LIBRARIES}") +# message(STATUS "------------------------> Protobuf_PROTOC_EXECUTABLE: ${Protobuf_PROTOC_EXECUTABLE}") +# message(STATUS "------------------------> Protobuf_LITE_LIBRARIES: ${Protobuf_LITE_LIBRARIES}") +# message(STATUS "\n --------------------------------------- ") +# message(STATUS "------------------------> Encontrado site-packages do Python em: ${PYTHON_SITE_PACKAGES}") +# message(STATUS "------------------------> Caminho previsto para Torch_DIR: ${Torch_DIR}") +message(STATUS "\n\n\n\n------------------------> Caminho previsto para TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}\n\n\n\n") +message(STATUS "---------------------------------------\n") +# message(STATUS "------------------------> CURL_INCLUDE_DIRS: ${CURL_INCLUDE_DIRS}") +# message(STATUS "------------------------> OPENAI_CPP_PATH: ${OPENAI_CPP_PATH}/include") +# message(STATUS "------------------------> OPENAI_CPP_PATH: ${OPENAI_CPP_PATH}") +# message(STATUS "------------------------> TORCH_INCLUDE_DIRS: ${TORCH_INCLUDE_DIRS}/include") +message(STATUS "---------------------------------------> FAISS_LIB_DIR: ${FAISS_LIB_DIR}") \ No newline at end of file From 
a0c122cdba7539700438290fee4649ef58d9a480 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Mon, 15 Sep 2025 00:02:06 -0300 Subject: [PATCH 36/65] Update .gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7bb6f8c..9607eab 100644 --- a/.gitignore +++ b/.gitignore @@ -53,7 +53,7 @@ models/* libtorch*.zip libs/libtorch -libs/faiss/ +libs/faiss/* conan.lock libtorch/ From 4ce20fd02fbbbe7438a9e4defcea526f4d68e023 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Mon, 15 Sep 2025 07:08:53 -0300 Subject: [PATCH 37/65] feat(build): add standalone FAISS CPU-only install script for CentOS --- scripts/2faiss_installer.sh | 106 ++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 scripts/2faiss_installer.sh diff --git a/scripts/2faiss_installer.sh b/scripts/2faiss_installer.sh new file mode 100644 index 0000000..e4920f6 --- /dev/null +++ b/scripts/2faiss_installer.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# Script to install and build FAISS (CPU-only) for C++ usage +# Compatible with CentOS/RHEL systems using yum +# It clones FAISS into libs/faiss/ and builds it with CMake + +set -e # Exit immediately if a command fails +# ───────────────────────────────────────────────────────────────────────────── +# SETUP: Define directories +# ───────────────────────────────────────────────────────────────────────────── + +# Assume the current directory is the project root +PROJ_DIR=$(pwd) + +# Destination directory for FAISS +FAISS_DIR="${PROJ_DIR}/libs/faiss" + +echo "Creating libs/faiss/ directory inside the project..." +mkdir -p "$FAISS_DIR" + + + +# ───────────────────────────────────────────────────────────────────────────── +# SYSTEM: Update packages and install dependencies +# ───────────────────────────────────────────────────────────────────────────── + +echo "Updating system packages..." +yum update -y + +echo "Checking if EPEL is installed..." +if ! 
rpm -q epel-release >/dev/null 2>&1; then + echo "Installing EPEL repository..." + yum install -y epel-release +fi + +echo "Installing required development packages..." +yum groupinstall -y "Development Tools" +yum install -y cmake3 gcc-c++ openblas-devel python3-devel git +yum install gflags-devel -y + +# Ensure `cmake` command exists, link it to `cmake3` if missing +if ! command -v cmake >/dev/null && command -v cmake3 >/dev/null; then + echo "Linking cmake3 to cmake..." + ln -s /usr/bin/cmake3 /usr/bin/cmake +fi + + + +# ───────────────────────────────────────────────────────────────────────────── +# CLONE: Download FAISS repository +# ───────────────────────────────────────────────────────────────────────────── + +echo "Removing all files in repository $FAISS_DIR..." +rm -fr "$FAISS_DIR" + +echo "Cloning FAISS repository into $FAISS_DIR..." +git clone https://github.com/facebookresearch/faiss.git "$FAISS_DIR" + +cd "$FAISS_DIR" + + + +# ───────────────────────────────────────────────────────────────────────────── +# BUILD: Configure and compile FAISS (CPU-only) +# ───────────────────────────────────────────────────────────────────────────── + +echo "Configuring CMake for CPU-only FAISS build..." +cmake -B build -DFAISS_ENABLE_GPU=OFF -DFAISS_ENABLE_PYTHON=OFF -DFAISS_ENABLE_TESTS=OFF -DCMAKE_BUILD_TYPE=Release + +echo "Building FAISS..." +cmake --build build --parallel 3 + +echo "FAISS has been successfully built." + + + +# ───────────────────────────────────────────────────────────────────────────── +# VERIFY: Locate compiled library and headers +# ───────────────────────────────────────────────────────────────────────────── + +# Find the libfaiss library (static or shared) +FOUND_LIB=$(find "$FAISS_DIR/build/faiss" -name "libfaiss.*" | head -n 1) + +if [ -f "$FOUND_LIB" ]; then + echo "Header files located at: $FAISS_DIR/faiss/" + echo "Library file found at: $FOUND_LIB" +else + echo "Warning: libfaiss was not found in the expected directory." 
+fi + + + +# ───────────────────────────────────────────────────────────────────────────── +# INFO: How to link FAISS in your C++ CMake project +# ───────────────────────────────────────────────────────────────────────────── + +echo "" +echo "You can now link FAISS in your C++ project using:" +echo "" +echo ' include_directories(${CMAKE_SOURCE_DIR}/libs/faiss/faiss)' +echo ' link_directories(${CMAKE_SOURCE_DIR}/libs/faiss/build/faiss)' +echo ' target_link_libraries(your_target PRIVATE faiss)' + +# cd libs/faiss +# cmake -B build -DFAISS_ENABLE_PYTHON=OFF -DFAISS_ENABLE_GPU=OFF +# cmake --build build -j$(nproc) \ No newline at end of file From db6a91e2fbe66de7d931a353125d8bccb3617663 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Mon, 15 Sep 2025 07:29:09 -0300 Subject: [PATCH 38/65] Add logging and comments to 2faiss* script. --- scripts/2faiss_installer.sh | 102 ++++++++++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 15 deletions(-) diff --git a/scripts/2faiss_installer.sh b/scripts/2faiss_installer.sh index e4920f6..50b49c4 100644 --- a/scripts/2faiss_installer.sh +++ b/scripts/2faiss_installer.sh @@ -1,10 +1,58 @@ -#!/bin/bash +#!/usr/bin/env bash # Script to install and build FAISS (CPU-only) for C++ usage # Compatible with CentOS/RHEL systems using yum # It clones FAISS into libs/faiss/ and builds it with CMake -set -e # Exit immediately if a command fails +set -euo pipefail + +#----------------------------------------- +#================= LOGGING =============== +#----------------------------------------- + +TAG="[$(basename "${BASH_SOURCE[0]}")]" +LINE_BRK="\n\n" +SEGMENT="===========================================================\n" + +printf "$SEGMENT$SEGMENT$SEGMENT" +printf " $TAG$LINE_BRK" +printf "$SEGMENT" +printf "$LINE_BRK" +#----------------------------------------- + +# ───────────────────────────────────────────────────────────────────────────── +# Elevation helper: use sudo only when needed and available +# 
───────────────────────────────────────────────────────────────────────────── +SUDO="" +if [[ "$(id -u)" -ne 0 ]]; then + if command -v sudo >/dev/null 2>&1; then + SUDO="sudo" + else + echo "[!] Not running as root and 'sudo' is not available. + Re-run as root or install sudo." >&2 + exit 1 + fi +fi + + + +# ───────────────────────────────────────────────────────────────────────────── +# Detect package manager +# ───────────────────────────────────────────────────────────────────────────── +PKG_MANAGER="" +if command -v apt-get >/dev/null 2>&1; then + PKG_MANAGER="apt" + echo "[pkg] Detected APT-based system (Ubuntu/Debian)" +elif command -v yum >/dev/null 2>&1; then + PKG_MANAGER="yum" + echo "[pkg] Detected YUM-based system (manylinux/CentOS-like)" +else + echo "[x] Unsupported system: neither apt-get nor yum found." >&2 + exit 1 +fi + + + # ───────────────────────────────────────────────────────────────────────────── # SETUP: Define directories # ───────────────────────────────────────────────────────────────────────────── @@ -25,18 +73,34 @@ mkdir -p "$FAISS_DIR" # ───────────────────────────────────────────────────────────────────────────── echo "Updating system packages..." -yum update -y -echo "Checking if EPEL is installed..." -if ! rpm -q epel-release >/dev/null 2>&1; then - echo "Installing EPEL repository..." - yum install -y epel-release + +echo "[pkg] Installing required development packages..." +if [[ "$PKG_MANAGER" == "apt" ]]; then + $SUDO apt-get update -y + $SUDO apt install libgflags-dev -y + $SUDO apt install -y cmake g++ libopenblas-dev python3-dev build-essential git +# $SUDO apt-get install -y \ +# cmake g++ libopenblas-dev libgflags-dev build-essential \ +# python3-dev git unzip wget pkg-config ninja-build binutils + +else + echo "Checking if EPEL is installed..." + if ! rpm -q epel-release >/dev/null 2>&1; then + echo "Installing EPEL repository..." 
+ yum install -y epel-release + fi + + $SUDO yum update -y + $SUDO yum groupinstall -y "Development Tools" + $SUDO yum install -y cmake3 gcc-c++ openblas-devel python3-devel git + $SUDO yum install gflags-devel -y +# $SUDO yum install -y \ +# gcc gcc-c++ cmake3 make cmake git curl wget ninja-build \ +# libffi-devel openssl-devel protobuf-devel gflags-devel \ +# zlib-devel unzip openblas-devel pkgconf-pkg-config binutils fi -echo "Installing required development packages..." -yum groupinstall -y "Development Tools" -yum install -y cmake3 gcc-c++ openblas-devel python3-devel git -yum install gflags-devel -y # Ensure `cmake` command exists, link it to `cmake3` if missing if ! command -v cmake >/dev/null && command -v cmake3 >/dev/null; then @@ -65,10 +129,16 @@ cd "$FAISS_DIR" # ───────────────────────────────────────────────────────────────────────────── echo "Configuring CMake for CPU-only FAISS build..." -cmake -B build -DFAISS_ENABLE_GPU=OFF -DFAISS_ENABLE_PYTHON=OFF -DFAISS_ENABLE_TESTS=OFF -DCMAKE_BUILD_TYPE=Release +cmake -B build \ + -DFAISS_ENABLE_GPU=OFF \ + -DFAISS_ENABLE_PYTHON=OFF \ + -DFAISS_ENABLE_TESTS=OFF \ + -DCMAKE_BUILD_TYPE=Release echo "Building FAISS..." + cmake --build build --parallel 3 +# cmake --build build -j$(nproc) echo "FAISS has been successfully built." 
@@ -101,6 +171,8 @@ echo ' include_directories(${CMAKE_SOURCE_DIR}/libs/faiss/faiss)' echo ' link_directories(${CMAKE_SOURCE_DIR}/libs/faiss/build/faiss)' echo ' target_link_libraries(your_target PRIVATE faiss)' -# cd libs/faiss -# cmake -B build -DFAISS_ENABLE_PYTHON=OFF -DFAISS_ENABLE_GPU=OFF -# cmake --build build -j$(nproc) \ No newline at end of file +#----------------------------------------- +#================= ENDING ================ +#----------------------------------------- +printf "$SEGMENT$SEGMENT$SEGMENT" +printf "\n" From a2e49f47fa6e2924e143cb80f3d2cb21f5f264c7 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Mon, 15 Sep 2025 07:33:24 -0300 Subject: [PATCH 39/65] new faiss_installer --- scripts/2faiss_installer.sh | 178 -------------------------------- scripts/faiss_installer.sh | 199 ++++++++++++++++++------------------ 2 files changed, 101 insertions(+), 276 deletions(-) delete mode 100644 scripts/2faiss_installer.sh mode change 100755 => 100644 scripts/faiss_installer.sh diff --git a/scripts/2faiss_installer.sh b/scripts/2faiss_installer.sh deleted file mode 100644 index 50b49c4..0000000 --- a/scripts/2faiss_installer.sh +++ /dev/null @@ -1,178 +0,0 @@ -#!/usr/bin/env bash - -# Script to install and build FAISS (CPU-only) for C++ usage -# Compatible with CentOS/RHEL systems using yum -# It clones FAISS into libs/faiss/ and builds it with CMake - -set -euo pipefail - -#----------------------------------------- -#================= LOGGING =============== -#----------------------------------------- - -TAG="[$(basename "${BASH_SOURCE[0]}")]" -LINE_BRK="\n\n" -SEGMENT="===========================================================\n" - -printf "$SEGMENT$SEGMENT$SEGMENT" -printf " $TAG$LINE_BRK" -printf "$SEGMENT" -printf "$LINE_BRK" -#----------------------------------------- - -# ───────────────────────────────────────────────────────────────────────────── -# Elevation helper: use sudo only when needed and available -# 
───────────────────────────────────────────────────────────────────────────── -SUDO="" -if [[ "$(id -u)" -ne 0 ]]; then - if command -v sudo >/dev/null 2>&1; then - SUDO="sudo" - else - echo "[!] Not running as root and 'sudo' is not available. - Re-run as root or install sudo." >&2 - exit 1 - fi -fi - - - -# ───────────────────────────────────────────────────────────────────────────── -# Detect package manager -# ───────────────────────────────────────────────────────────────────────────── -PKG_MANAGER="" -if command -v apt-get >/dev/null 2>&1; then - PKG_MANAGER="apt" - echo "[pkg] Detected APT-based system (Ubuntu/Debian)" -elif command -v yum >/dev/null 2>&1; then - PKG_MANAGER="yum" - echo "[pkg] Detected YUM-based system (manylinux/CentOS-like)" -else - echo "[x] Unsupported system: neither apt-get nor yum found." >&2 - exit 1 -fi - - - -# ───────────────────────────────────────────────────────────────────────────── -# SETUP: Define directories -# ───────────────────────────────────────────────────────────────────────────── - -# Assume the current directory is the project root -PROJ_DIR=$(pwd) - -# Destination directory for FAISS -FAISS_DIR="${PROJ_DIR}/libs/faiss" - -echo "Creating libs/faiss/ directory inside the project..." -mkdir -p "$FAISS_DIR" - - - -# ───────────────────────────────────────────────────────────────────────────── -# SYSTEM: Update packages and install dependencies -# ───────────────────────────────────────────────────────────────────────────── - -echo "Updating system packages..." - - -echo "[pkg] Installing required development packages..." -if [[ "$PKG_MANAGER" == "apt" ]]; then - $SUDO apt-get update -y - $SUDO apt install libgflags-dev -y - $SUDO apt install -y cmake g++ libopenblas-dev python3-dev build-essential git -# $SUDO apt-get install -y \ -# cmake g++ libopenblas-dev libgflags-dev build-essential \ -# python3-dev git unzip wget pkg-config ninja-build binutils - -else - echo "Checking if EPEL is installed..." - if ! 
rpm -q epel-release >/dev/null 2>&1; then - echo "Installing EPEL repository..." - yum install -y epel-release - fi - - $SUDO yum update -y - $SUDO yum groupinstall -y "Development Tools" - $SUDO yum install -y cmake3 gcc-c++ openblas-devel python3-devel git - $SUDO yum install gflags-devel -y -# $SUDO yum install -y \ -# gcc gcc-c++ cmake3 make cmake git curl wget ninja-build \ -# libffi-devel openssl-devel protobuf-devel gflags-devel \ -# zlib-devel unzip openblas-devel pkgconf-pkg-config binutils -fi - - -# Ensure `cmake` command exists, link it to `cmake3` if missing -if ! command -v cmake >/dev/null && command -v cmake3 >/dev/null; then - echo "Linking cmake3 to cmake..." - ln -s /usr/bin/cmake3 /usr/bin/cmake -fi - - - -# ───────────────────────────────────────────────────────────────────────────── -# CLONE: Download FAISS repository -# ───────────────────────────────────────────────────────────────────────────── - -echo "Removing all files in repository $FAISS_DIR..." -rm -fr "$FAISS_DIR" - -echo "Cloning FAISS repository into $FAISS_DIR..." -git clone https://github.com/facebookresearch/faiss.git "$FAISS_DIR" - -cd "$FAISS_DIR" - - - -# ───────────────────────────────────────────────────────────────────────────── -# BUILD: Configure and compile FAISS (CPU-only) -# ───────────────────────────────────────────────────────────────────────────── - -echo "Configuring CMake for CPU-only FAISS build..." -cmake -B build \ - -DFAISS_ENABLE_GPU=OFF \ - -DFAISS_ENABLE_PYTHON=OFF \ - -DFAISS_ENABLE_TESTS=OFF \ - -DCMAKE_BUILD_TYPE=Release - -echo "Building FAISS..." - -cmake --build build --parallel 3 -# cmake --build build -j$(nproc) - -echo "FAISS has been successfully built." 
- - - -# ───────────────────────────────────────────────────────────────────────────── -# VERIFY: Locate compiled library and headers -# ───────────────────────────────────────────────────────────────────────────── - -# Find the libfaiss library (static or shared) -FOUND_LIB=$(find "$FAISS_DIR/build/faiss" -name "libfaiss.*" | head -n 1) - -if [ -f "$FOUND_LIB" ]; then - echo "Header files located at: $FAISS_DIR/faiss/" - echo "Library file found at: $FOUND_LIB" -else - echo "Warning: libfaiss was not found in the expected directory." -fi - - - -# ───────────────────────────────────────────────────────────────────────────── -# INFO: How to link FAISS in your C++ CMake project -# ───────────────────────────────────────────────────────────────────────────── - -echo "" -echo "You can now link FAISS in your C++ project using:" -echo "" -echo ' include_directories(${CMAKE_SOURCE_DIR}/libs/faiss/faiss)' -echo ' link_directories(${CMAKE_SOURCE_DIR}/libs/faiss/build/faiss)' -echo ' target_link_libraries(your_target PRIVATE faiss)' - -#----------------------------------------- -#================= ENDING ================ -#----------------------------------------- -printf "$SEGMENT$SEGMENT$SEGMENT" -printf "\n" diff --git a/scripts/faiss_installer.sh b/scripts/faiss_installer.sh old mode 100755 new mode 100644 index 9d9e4da..50b49c4 --- a/scripts/faiss_installer.sh +++ b/scripts/faiss_installer.sh @@ -1,17 +1,10 @@ #!/usr/bin/env bash -set -euo pipefail +# Script to install and build FAISS (CPU-only) for C++ usage +# Compatible with CentOS/RHEL systems using yum +# It clones FAISS into libs/faiss/ and builds it with CMake -# ============================================================================= -# FAISS CPU Installer Script (C++ only) -# ----------------------------------------------------------------------------- -# Works on Ubuntu/Debian (APT) and manylinux/CentOS-like (YUM) by auto-detecting -# the package manager. 
It installs build deps and builds FAISS (CPU-only) into -# ../libs/faiss relative to the current working directory. -# ----------------------------------------------------------------------------- -# Usage (optional): -# FAISS_TAG=v1.8.0 ./install_faiss_cpu.sh # pin to a tag/branch (default v1.8.0) -# ============================================================================= +set -euo pipefail #----------------------------------------- #================= LOGGING =============== @@ -27,7 +20,6 @@ printf "$SEGMENT" printf "$LINE_BRK" #----------------------------------------- - # ───────────────────────────────────────────────────────────────────────────── # Elevation helper: use sudo only when needed and available # ───────────────────────────────────────────────────────────────────────────── @@ -42,6 +34,8 @@ if [[ "$(id -u)" -ne 0 ]]; then fi fi + + # ───────────────────────────────────────────────────────────────────────────── # Detect package manager # ───────────────────────────────────────────────────────────────────────────── @@ -57,116 +51,125 @@ else exit 1 fi + + # ───────────────────────────────────────────────────────────────────────────── -# Install dependencies +# SETUP: Define directories # ───────────────────────────────────────────────────────────────────────────── -echo "[pkg] Installing build dependencies..." + +# Assume the current directory is the project root +PROJ_DIR=$(pwd) + +# Destination directory for FAISS +FAISS_DIR="${PROJ_DIR}/libs/faiss" + +echo "Creating libs/faiss/ directory inside the project..." +mkdir -p "$FAISS_DIR" + + + +# ───────────────────────────────────────────────────────────────────────────── +# SYSTEM: Update packages and install dependencies +# ───────────────────────────────────────────────────────────────────────────── + +echo "Updating system packages..." + + +echo "[pkg] Installing required development packages..." 
if [[ "$PKG_MANAGER" == "apt" ]]; then $SUDO apt-get update -y - $SUDO apt-get install -y \ - cmake g++ libopenblas-dev libgflags-dev build-essential \ - python3-dev git unzip wget pkg-config ninja-build binutils + $SUDO apt install libgflags-dev -y + $SUDO apt install -y cmake g++ libopenblas-dev python3-dev build-essential git +# $SUDO apt-get install -y \ +# cmake g++ libopenblas-dev libgflags-dev build-essential \ +# python3-dev git unzip wget pkg-config ninja-build binutils + else - $SUDO yum install -y \ - gcc gcc-c++ make cmake git curl wget ninja-build \ - libffi-devel openssl-devel protobuf-devel gflags-devel \ - zlib-devel unzip openblas-devel pkgconf-pkg-config binutils + echo "Checking if EPEL is installed..." + if ! rpm -q epel-release >/dev/null 2>&1; then + echo "Installing EPEL repository..." + yum install -y epel-release + fi + + $SUDO yum update -y + $SUDO yum groupinstall -y "Development Tools" + $SUDO yum install -y cmake3 gcc-c++ openblas-devel python3-devel git + $SUDO yum install gflags-devel -y +# $SUDO yum install -y \ +# gcc gcc-c++ cmake3 make cmake git curl wget ninja-build \ +# libffi-devel openssl-devel protobuf-devel gflags-devel \ +# zlib-devel unzip openblas-devel pkgconf-pkg-config binutils fi + +# Ensure `cmake` command exists, link it to `cmake3` if missing +if ! command -v cmake >/dev/null && command -v cmake3 >/dev/null; then + echo "Linking cmake3 to cmake..." + ln -s /usr/bin/cmake3 /usr/bin/cmake +fi + + + # ───────────────────────────────────────────────────────────────────────────── -# Prepare destination +# CLONE: Download FAISS repository # ───────────────────────────────────────────────────────────────────────────── -PROJ_DIR="$(pwd)" -FAISS_DIR="${PROJ_DIR}/../libs/faiss" -FAISS_TAG="${FAISS_TAG:-v1.8.0}" -echo "[fs] Preparing ${FAISS_DIR} (fresh clone)" -rm -rf "$FAISS_DIR" -mkdir -p "$(dirname "$FAISS_DIR")" +echo "Removing all files in repository $FAISS_DIR..." 
+rm -fr "$FAISS_DIR" + +echo "Cloning FAISS repository into $FAISS_DIR..." +git clone https://github.com/facebookresearch/faiss.git "$FAISS_DIR" + +cd "$FAISS_DIR" + + # ───────────────────────────────────────────────────────────────────────────── -# Clone & build (CPU-only) +# BUILD: Configure and compile FAISS (CPU-only) # ───────────────────────────────────────────────────────────────────────────── -echo "[git] Cloning FAISS (${FAISS_TAG})..." -git clone --branch "$FAISS_TAG" --single-branch --depth 1 \ - https://github.com/facebookresearch/faiss.git "$FAISS_DIR" -cd "$FAISS_DIR" +echo "Configuring CMake for CPU-only FAISS build..." +cmake -B build \ + -DFAISS_ENABLE_GPU=OFF \ + -DFAISS_ENABLE_PYTHON=OFF \ + -DFAISS_ENABLE_TESTS=OFF \ + -DCMAKE_BUILD_TYPE=Release + +echo "Building FAISS..." + +cmake --build build --parallel 3 +# cmake --build build -j$(nproc) + +echo "FAISS has been successfully built." -# Prefer Ninja if available for faster builds -GEN_ARGS=() -if command -v ninja >/dev/null 2>&1; then - GEN_ARGS+=( -G Ninja ) -fi -# Build/Install Toggles -BUILD_SHARED="${BUILD_SHARED:-OFF}" # export BUILD_SHARED=ON para .so -DO_INSTALL="${DO_INSTALL:-ON}" # export DO_INSTALL=OFF para pular install -INSTALL_PREFIX="${INSTALL_PREFIX:-${FAISS_DIR}/_install}" - -# Parallelism with fallback -JOBS="$( (command -v nproc >/dev/null && nproc) || getconf _NPROCESSORS_ONLN || echo 2 )" - -echo "[cmake] Configuring (CPU-only, Release)..." -cmake -B build "${GEN_ARGS[@]}" \ - -DBUILD_SHARED_LIBS="${BUILD_SHARED}" \ - -DFAISS_ENABLE_GPU=OFF \ - -DFAISS_ENABLE_PYTHON=OFF \ - -DBUILD_TESTING=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_CXX_STANDARD=17 \ - -DCMAKE_POLICY_DEFAULT_CMP0135=NEW \ - -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" - -echo "[cmake] Building (target: faiss) with ${JOBS} jobs..." 
-cmake --build build --target faiss --config Release --parallel "${JOBS}" - -# Optional installation (generates include/ and lib/ and package config) -if [[ "${DO_INSTALL}" == "ON" ]]; then - echo "[cmake] Installing to ${INSTALL_PREFIX}..." - cmake --install build --component faiss 2>/dev/null || cmake --install build -fi # ───────────────────────────────────────────────────────────────────────────── -# Locate artifacts +# VERIFY: Locate compiled library and headers # ───────────────────────────────────────────────────────────────────────────── -if [[ "${BUILD_SHARED}" == "ON" ]]; then - PREFERRED="libfaiss.so" - FALLBACK="libfaiss.a" -else - PREFERRED="libfaiss.a" - FALLBACK="libfaiss.so" -fi -FOUND_LIB="$(find "$FAISS_DIR/build/faiss" -maxdepth 1 -name "${PREFERRED}" -print -quit 2>/dev/null || true)" -if [[ -z "${FOUND_LIB}" ]]; then - FOUND_LIB="$(find "$FAISS_DIR/build/faiss" -maxdepth 1 -name "${FALLBACK}" -print -quit 2>/dev/null || true)" -fi +# Find the libfaiss library (static or shared) +FOUND_LIB=$(find "$FAISS_DIR/build/faiss" -name "libfaiss.*" | head -n 1) -if [[ -n "${FOUND_LIB}" && -e "${FOUND_LIB}" ]]; then - echo "[ok] FAISS built successfully." - echo "[out] Headers : $FAISS_DIR/faiss/" - echo "[out] Library : $FOUND_LIB" - if [[ "${DO_INSTALL}" == "ON" ]]; then - echo "[out] Install : ${INSTALL_PREFIX}" - echo " include : ${INSTALL_PREFIX}/include" - echo " lib : ${INSTALL_PREFIX}/lib" - fi +if [ -f "$FOUND_LIB" ]; then + echo "Header files located at: $FAISS_DIR/faiss/" + echo "Library file found at: $FOUND_LIB" else - echo "[x] Build finished but libfaiss was not found under build/faiss/" >&2 - exit 2 + echo "Warning: libfaiss was not found in the expected directory." 
fi -# Useful post-build checks -if command -v nm >/dev/null 2>&1; then - echo "[check] nm symbols (grep faiss::IndexFlat...):" - nm -C "${FOUND_LIB}" | grep -E 'faiss::IndexFlat' | head || true -fi -if [[ "${FOUND_LIB##*.}" == "so" ]] && command -v ldd >/dev/null 2>&1; then - echo "[check] ldd on shared library:" - ldd "${FOUND_LIB}" || true -fi + + +# ───────────────────────────────────────────────────────────────────────────── +# INFO: How to link FAISS in your C++ CMake project +# ───────────────────────────────────────────────────────────────────────────── + +echo "" +echo "You can now link FAISS in your C++ project using:" +echo "" +echo ' include_directories(${CMAKE_SOURCE_DIR}/libs/faiss/faiss)' +echo ' link_directories(${CMAKE_SOURCE_DIR}/libs/faiss/build/faiss)' +echo ' target_link_libraries(your_target PRIVATE faiss)' #----------------------------------------- #================= ENDING ================ From 4a8d2ef5e0d6ab20a97eda0898a719bf3b48d50d Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Mon, 15 Sep 2025 07:40:45 -0300 Subject: [PATCH 40/65] translating messages from Portuguese to English in CMakeLists --- libs/faiss | 1 + src/CMakeLists.txt | 88 +++++++++++++++++----------------------------- 2 files changed, 33 insertions(+), 56 deletions(-) create mode 160000 libs/faiss diff --git a/libs/faiss b/libs/faiss new file mode 160000 index 0000000..6470b8d --- /dev/null +++ b/libs/faiss @@ -0,0 +1 @@ +Subproject commit 6470b8d9d0f9c0adc71df6d5a1ce64199be85305 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 42c4dd6..c7dbf4f 100755 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,13 +10,13 @@ set(CMAKE_BUILD_TYPE "Release") # ───────────────────────────────────────────────────────────────────────────── -#------- Caminhos para FAISS +#------- Caminhos para FAISS ------- # ───────────────────────────────────────────────────────────────────────────── set(FAISS_ROOT "${CMAKE_BINARY_DIR}/../../../libs/faiss") set(FAISS_INCLUDE_DIR 
"${FAISS_ROOT}/faiss") -set(FAISS_LIB_DIR "${FAISS_ROOT}/build/faiss") # Deve conter libfaiss.a +set(FAISS_LIB_DIR "${FAISS_ROOT}/build/faiss") # Must contain libfaiss.a -add_library(faiss STATIC IMPORTED) # Declara FAISS como biblioteca já compilada (IMPORTED) +add_library(faiss STATIC IMPORTED) # Declares FAISS as an already compiled library (IMPORTED) set_target_properties(faiss PROPERTIES IMPORTED_LOCATION "${FAISS_LIB_DIR}/libfaiss.a" INTERFACE_INCLUDE_DIRECTORIES "${FAISS_INCLUDE_DIR}" @@ -24,20 +24,20 @@ set_target_properties(faiss PROPERTIES # ───────────────────────────────────────────────────────────────────────────── -#------- Compiler Specific flags +#------- Compiler Specific flags ------- # ───────────────────────────────────────────────────────────────────────────── -if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")# Flags específicas do MSVC +if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")# MSVC-specific flags set(CMAKE_TOOLCHAIN_FILE ${CMAKE_BINARY_DIR}/generators/conan_toolchain.cmake) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Od /EHsc /MP /openmp /MD") -else() # Flags para G++/Clang ou G++/GNU em Linux - # Mantém as flags de otimização e adiciona -std=c++23 +else() # Flags for G++/Clang or G++/GNU on Linux + # Keeps the optimization flags and adds -std=c++23 set(CMAKE_TOOLCHAIN_FILE ${CMAKE_BINARY_DIR}/Release/generators/conan_toolchain.cmake) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -fopenmp -pthread") set(CMAKE_CXX_FLAGS_RELEASE "-O0 -std=c++23") endif() # ───────────────────────────────────────────────────────────────────────────── -#------- Options +#------- Options ------- # ───────────────────────────────────────────────────────────────────────────── option(CURL_STATIC_LINKING "Set to ON to build libcurl with static linking." 
OFF) @@ -48,9 +48,9 @@ endif() # ───────────────────────────────────────────────────────────────────────────── -#------- Find Python +#------- Find Python ------- # ───────────────────────────────────────────────────────────────────────────── -set(Python3_FIND_SHARED OFF)#Precisa sera? +set(Python3_FIND_SHARED OFF)# Need it? find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") @@ -61,48 +61,48 @@ endif() include_directories(${Python3_INCLUDE_DIRS}) # ───────────────────────────────────────────────────────────────────────────── -#------- Find Pybind11 +#------- Find Pybind11 ------- # ───────────────────────────────────────────────────────────────────────────── message(STATUS "---------------------------------------\n") if(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 8) - message(STATUS "Python 3.8 detectado!") + message(STATUS "Python 3.8 detected!!") set(pybind11_DIR "/opt/python/cp38-cp38/lib/python3.8/site-packages/pybind11/share/cmake/pybind11") # add_definitions(-DPy_LIMITED_API=0x03080000) elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 9) - message(STATUS "Python 3.9 detectado!") + message(STATUS "Python 3.9 detected!!") set(pybind11_DIR "/opt/python/cp39-cp39/lib/python3.9/site-packages/pybind11/share/cmake/pybind11") # add_definitions(-DPy_LIMITED_API=0x03090000) elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 10) - message(STATUS "Python 3.10 detectado!") + message(STATUS "Python 3.10 detected!!") set(pybind11_DIR "/opt/python/cp310-cp310/lib/python3.10/site-packages/pybind11/share/cmake/pybind11") # add_definitions(-DPy_LIMITED_API=0x030A0000) elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 11) - message(STATUS "Python 3.11 detectado!") + message(STATUS "Python 3.11 detected!!") set(pybind11_DIR "/opt/python/cp311-cp311/lib/python3.11/site-packages/pybind11/share/cmake/pybind11") # 
add_definitions(-DPy_LIMITED_API=0x030B0000) elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 12) - message(STATUS "Python 3.12 detectado!") + message(STATUS "Python 3.12 detected!!") set(pybind11_DIR "/opt/python/cp312-cp312/lib/python3.12/site-packages/pybind11/share/cmake/pybind11") # add_definitions(-DPy_LIMITED_API=0x030C0000) elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 13) - message(STATUS "Python 3.13 detectado!") + message(STATUS "Python 3.13 detected!") set(pybind11_DIR "/opt/python/cp313-cp313/lib/python3.13/site-packages/pybind11/share/cmake/pybind11") # add_definitions(-DPy_LIMITED_API=0x030D0000) else() - message(STATUS "Versão do Python não especificada nos if's anteriores!") - # Configuração padrão ou outra ação + message(STATUS "Python version not specified in previous if's!") + # Default setting or other action endif() # ───────────────────────────────────────────────────────────────────────────── -#------- Find other dependencies +#------- Find other dependencies ------- # ───────────────────────────────────────────────────────────────────────────── find_package(pybind11 REQUIRED) find_package(pdfium REQUIRED) # Biblioteca para manipulação de PDFs @@ -127,17 +127,17 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") link_directories(${Protobuf_LIBRARY_DIRS}) include_directories(C:/vcpkg/packages/protobuf_x64-windows/include) if (NOT DEFINED PROTOBUF_PROTOC_EXECUTABLE) - # include_directories(${Protobuf_INCLUDE_DIRS})# A principio o target Protobuf_INCLUDE_DIRS n existe, mas n sei se é necessario + # include_directories(${Protobuf_INCLUDE_DIRS})# At first the Protobuf_INCLUDE_DIR target does not exist, but I don't know if it is necessary set(PROTOBUF_PROTOC_EXECUTABLE "C:/vcpkg/packages/protobuf_x64-windows/tools/protobuf/protoc.exe") endif() endif() -find_package(onnxruntime REQUIRED) # Pode depender de protobuf (verifique se precisa!) 
+find_package(onnxruntime REQUIRED) # May depend on protobuf (check if you need it!) -# ───────────────────────────────────────────────────────────────────────────── -#------- Descobrir o diretório site-packages do Python atual -# ───────────────────────────────────────────────────────────────────────────── -# Usaremos sysconfig para ser mais robusto em várias versões de Python. +# ─────────────────────────────────────────────────────────────────────── +#------- Discover the directory Current Python site-packages ------- +# ────────────────────────────────────────────────────────────────────────────── +# We will use sysconfig to be more robust across multiple Python versions. execute_process( COMMAND "${Python3_EXECUTABLE}" -c "import sysconfig; import pathlib; site_packages = sysconfig.get_paths()['purelib']; print(str(pathlib.Path(site_packages).resolve()))" @@ -159,7 +159,7 @@ link_directories("${CMAKE_SOURCE_DIR}/../libs/libtorch/cpu/lib") # ───────────────────────────────────────────────────────────────────────────── -#------- RPATH +#------- RPATH ------- # ───────────────────────────────────────────────────────────────────────────── #------- Configuração do RPATH/RUNPATH/LIBRARY_PATH -------> Ajustar rpath para encontrar as bibliotecas do Torch set(CMAKE_BUILD_RPATH @@ -206,7 +206,7 @@ set(OPENAI_CPP_PATH "${CMAKE_SOURCE_DIR}/../libs/openai-cpp") # ───────────────────────────────────────────────────────────────────────────── -# ----- Fontes do projeto ----- +# ----- Project sources ----- # ───────────────────────────────────────────────────────────────────────────── set(RagPUREAI_BINDING_SRCS "${CMAKE_SOURCE_DIR}/binding.cpp") @@ -262,7 +262,7 @@ target_include_directories(RagPUREAILib PUBLIC #.h link_directories(${FAISS_LIB_DIR}) # ───────────────────────────────────────────────────────────────────────────── -# Linkar bibliotecas com RagPUREAILib +# Link libraries with RagPUREAILib # 
───────────────────────────────────────────────────────────────────────────── target_link_libraries(RagPUREAILib PUBLIC pdfium::pdfium @@ -294,30 +294,6 @@ target_link_libraries(RagPUREAI PRIVATE RagPUREAILib) # ───────────────────────────────────────────────────────────────────────────── -#---------------- DEBUG -# ───────────────────────────────────────────────────────────────────────────── -# message(STATUS "---------------------------------------\n") -# message(STATUS "------------------------> Protobuf_FOUND: ${Protobuf_FOUND}") -# message(STATUS "------------------------> Protobuf_VERSION: ${Protobuf_VERSION}") -# message(STATUS "------------------------> Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS}") -# message(STATUS "------------------------> Protobuf_LIBRARIES: ${Protobuf_LIBRARIES}") -# message(STATUS "------------------------> Protobuf_PROTOC_EXECUTABLE: ${Protobuf_PROTOC_EXECUTABLE}") -# message(STATUS "------------------------> Protobuf_LITE_LIBRARIES: ${Protobuf_LITE_LIBRARIES}") -# message(STATUS "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n--------------------------------------- ") -# message(STATUS "---------------------------------------\n") -# message(STATUS "------------------------> Protobuf_FOUND: ${Protobuf_FOUND}") -# message(STATUS "------------------------> Protobuf_VERSION: ${Protobuf_VERSION}") -# message(STATUS "------------------------> Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS}") -# message(STATUS "------------------------> Protobuf_LIBRARIES: ${Protobuf_LIBRARIES}") -# message(STATUS "------------------------> Protobuf_PROTOC_EXECUTABLE: ${Protobuf_PROTOC_EXECUTABLE}") -# message(STATUS "------------------------> Protobuf_LITE_LIBRARIES: ${Protobuf_LITE_LIBRARIES}") -# message(STATUS "\n --------------------------------------- ") -# message(STATUS "------------------------> Encontrado site-packages do Python em: ${PYTHON_SITE_PACKAGES}") -# message(STATUS "------------------------> Caminho previsto para Torch_DIR: ${Torch_DIR}") 
-message(STATUS "\n\n\n\n------------------------> Caminho previsto para TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}\n\n\n\n") -message(STATUS "---------------------------------------\n") -# message(STATUS "------------------------> CURL_INCLUDE_DIRS: ${CURL_INCLUDE_DIRS}") -# message(STATUS "------------------------> OPENAI_CPP_PATH: ${OPENAI_CPP_PATH}/include") -# message(STATUS "------------------------> OPENAI_CPP_PATH: ${OPENAI_CPP_PATH}") -# message(STATUS "------------------------> TORCH_INCLUDE_DIRS: ${TORCH_INCLUDE_DIRS}/include") -message(STATUS "---------------------------------------> FAISS_LIB_DIR: ${FAISS_LIB_DIR}") \ No newline at end of file +#---------------- DEBUG MESSAGES +# ───────────────────────────────────────────────────────────────────────────── +# ... \ No newline at end of file From ceb83fcf3726010be7610eb40fed6eaf3ffe9882 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Mon, 15 Sep 2025 07:57:03 -0300 Subject: [PATCH 41/65] =?UTF-8?q?Remove=20todos=20os=20subm=C3=B3dulos=20e?= =?UTF-8?q?=20limpa=20cache?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitmodules | 13 ------------- Dockerfile | 7 ++----- libs/openai-cpp | 1 - libs/tokenizers-cpp | 1 - 4 files changed, 2 insertions(+), 20 deletions(-) delete mode 100644 .gitmodules delete mode 160000 libs/openai-cpp delete mode 160000 libs/tokenizers-cpp diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 49add60..0000000 --- a/.gitmodules +++ /dev/null @@ -1,13 +0,0 @@ -[submodule "libs/tokenizers-cpp"] - path = libs/tokenizers-cpp - url = https://github.com/mlc-ai/tokenizers-cpp.git -[submodule "libs/openai-cpp"] - path = libs/openai-cpp - url = https://github.com/olrea/openai-cpp.git -[submodule "extern/pybind11"] - path = extern/pybind11 - url = ../../pybind/pybind11 - branch = stable -[submodule "tokenizers-cpp"] - path = tokenizers-cpp - url = https://github.com/mlc-ai/tokenizers-cpp.git diff --git a/Dockerfile b/Dockerfile 
index 11e1ca7..cd0b212 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,6 @@ # Use the official manylinux image (compatible with Python packaging standards) FROM quay.io/pypa/manylinux_2_28_x86_64 -# Add Python 3.12 binaries to PATH -ENV PATH="/opt/python/cp312-cp312/bin:${PATH}" - # Set working directory WORKDIR /home @@ -17,8 +14,8 @@ RUN yum install -y \ && yum clean all \ && rm -rf /var/cache/yum -# Add Rust to PATH -ENV PATH="/root/.cargo/bin:${PATH}" +# Add Rust to PATH and Python 3.12 binaries to PATH +ENV PATH="/root/.cargo/bin:/opt/python/cp312-cp312/bin:${PATH}" # Set default shell CMD ["/bin/bash"] diff --git a/libs/openai-cpp b/libs/openai-cpp deleted file mode 160000 index 9554a4d..0000000 --- a/libs/openai-cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9554a4d86d7650bb3b46db772dbacd5f3e054b8c diff --git a/libs/tokenizers-cpp b/libs/tokenizers-cpp deleted file mode 160000 index 4bb7533..0000000 --- a/libs/tokenizers-cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4bb753377680e249345b54c6b10e6d0674c8af03 From 1d402e99618583ca9c3779414400e6bd9077dbb5 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Mon, 15 Sep 2025 08:01:57 -0300 Subject: [PATCH 42/65] Re-add cleaned submodules: tokenizers-cpp and openai-cpp --- .gitmodules | 7 +++++++ libs/openai-cpp | 1 + libs/tokenizers-cpp | 1 + scripts/faiss_installer.sh | 0 4 files changed, 9 insertions(+) create mode 100644 .gitmodules create mode 160000 libs/openai-cpp create mode 160000 libs/tokenizers-cpp mode change 100644 => 100755 scripts/faiss_installer.sh diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..bd35155 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,7 @@ +[submodule "libs/tokenizers-cpp"] + path = libs/tokenizers-cpp + url = https://github.com/mlc-ai/tokenizers-cpp.git + +[submodule "libs/openai-cpp"] + path = libs/openai-cpp + url = https://github.com/olrea/openai-cpp.git diff --git a/libs/openai-cpp b/libs/openai-cpp new file mode 160000 index 0000000..9554a4d --- 
/dev/null +++ b/libs/openai-cpp @@ -0,0 +1 @@ +Subproject commit 9554a4d86d7650bb3b46db772dbacd5f3e054b8c diff --git a/libs/tokenizers-cpp b/libs/tokenizers-cpp new file mode 160000 index 0000000..55d53aa --- /dev/null +++ b/libs/tokenizers-cpp @@ -0,0 +1 @@ +Subproject commit 55d53aa38dc8df7d9c8bd9ed50907e82ae83ce66 diff --git a/scripts/faiss_installer.sh b/scripts/faiss_installer.sh old mode 100644 new mode 100755 From 1537de0665baed4020f53e0647d8d10a0528d4e6 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Mon, 15 Sep 2025 08:09:34 -0300 Subject: [PATCH 43/65] Adding FAISSVectorSearch into binding.cpp --- src/binding.cpp | 161 +++++++++++++++++++++++++++++++----------------- 1 file changed, 103 insertions(+), 58 deletions(-) diff --git a/src/binding.cpp b/src/binding.cpp index 33f0807..5e4d40c 100644 --- a/src/binding.cpp +++ b/src/binding.cpp @@ -39,6 +39,8 @@ #include "ChunkCommons/ChunkCommons.h" #include "ChunkQuery/ChunkQuery.h" +#include "FAISSVectorSearch/FAISSVectorSearch.h" + #include "../components/MetadataExtractor/Document.h" #include "IMetadataExtractor.h" #include "MetadataExtractor.h" @@ -53,11 +55,11 @@ #include "EmbeddingOpenAI/IEmbeddingOpenAI.h" #include "EmbeddingOpenAI/EmbeddingOpenAI.h" -#include "../components/Chat/Message/BaseMessage.h" -#include "../components/Chat/Message/HumanMessage.h" -#include "../components/Chat/Message/AIMessage.h" -#include "../components/Chat/Message/SystemMessage.h" -#include "../components/Chat/ChatHistory/ChatHistory.h" +// #include "../components/Chat/Message/BaseMessage.h" +// #include "../components/Chat/Message/HumanMessage.h" +// #include "../components/Chat/Message/AIMessage.h" +// #include "../components/Chat/Message/SystemMessage.h" +// #include "../components/Chat/ChatHistory/ChatHistory.h" namespace py = pybind11; using namespace RAGLibrary; @@ -1326,56 +1328,100 @@ void bind_EmbeddingOpenAI(py::module &m) )doc"); } -// VectorDabase -void bind_VectorDB(pybind11::module_ &); - -// Trampoline class for 
BaseMessage -class PyBaseMessage : public purecpp::chat::BaseMessage { -public: - using purecpp::chat::BaseMessage::BaseMessage; // Inherit constructors - - std::string get_type() const override { - PYBIND11_OVERRIDE_PURE( - std::string, /* Return type */ - purecpp::chat::BaseMessage, /* Parent class */ - get_type /* Name of function */ - /* Arguments */ - ); - } - - std::string get_content() const override { - PYBIND11_OVERRIDE_PURE( - std::string, - purecpp::chat::BaseMessage, - get_content - ); - } -}; - -void bind_ChatClasses(py::module &m) { - py::class_>(m, "BaseMessage") - .def(py::init<>()) - .def_property_readonly("type", &purecpp::chat::BaseMessage::get_type) - .def_property_readonly("content", &purecpp::chat::BaseMessage::get_content); - - py::class_, purecpp::chat::BaseMessage>(m, "HumanMessage") - .def(py::init(), py::arg("content")); - - py::class_, purecpp::chat::BaseMessage>(m, "AIMessage") - .def(py::init(), py::arg("content")); - - py::class_, purecpp::chat::BaseMessage>(m, "SystemMessage") - .def(py::init(), py::arg("content")); - - py::class_(m, "ChatHistory") - .def(py::init<>()) - .def("add_message", static_cast&)>(&purecpp::chat::ChatHistory::add_message), py::arg("message")) - .def("add_messages", static_cast>&)>(&purecpp::chat::ChatHistory::add_message), py::arg("messages")) - .def_property_readonly("messages", &purecpp::chat::ChatHistory::get_messages) - .def("clear", &purecpp::chat::ChatHistory::clear) - .def("size", &purecpp::chat::ChatHistory::size) - .def("add_benchmark_messages_omp", &purecpp::chat::ChatHistory::add_benchmark_messages_omp, py::arg("num_messages")); +void bind_FAISSVectorSearch(py::module& m) { + py::class_(m, "PureResult") + .def_readonly("index", &FAISSVectorSearch::PureResult::indices) // user-friendly alias + .def_readonly("distances", &FAISSVectorSearch::PureResult::distances) + .def("__repr__", [](const FAISSVectorSearch::PureResult& self) { + std::ostringstream oss; + oss << "PureResult(index="; + oss << 
py::repr(py::cast(self.indices)); + oss << ", distances="; + oss << py::repr(py::cast(self.distances)); + oss << ")"; + return oss.str(); + }); + + m.def("PureL2", &FAISSVectorSearch::PureL2, + py::arg("query"), + py::arg("chunks"), + py::arg("pos"), + py::arg("k") = 1, + R"pbdoc( + Performs an exact L2 (Euclidean) similarity search using FAISS. + Returns the top-k most similar vectors from the database. + )pbdoc"); + + m.def("PureIP", &FAISSVectorSearch::PureIP, + py::arg("query"), + py::arg("chunks"), + py::arg("pos"), + py::arg("k") = 1, + R"pbdoc( + Performs a dot product similarity search using FAISS. + Suitable when the magnitude of vectors is meaningful. + )pbdoc"); + + m.def("PureCosine", &FAISSVectorSearch::PureCosine, + py::arg("query"), + py::arg("chunks"), + py::arg("pos"), + py::arg("k") = 1, + R"pbdoc( + Performs a cosine similarity search using FAISS. + Internally normalizes all vectors and then uses inner product search. + )pbdoc"); } +// // VectorDabase +// void bind_VectorDB(pybind11::module_ &); + +// // Trampoline class for BaseMessage +// class PyBaseMessage : public purecpp::chat::BaseMessage { +// public: +// using purecpp::chat::BaseMessage::BaseMessage; // Inherit constructors + +// std::string get_type() const override { +// PYBIND11_OVERRIDE_PURE( +// std::string, /* Return type */ +// purecpp::chat::BaseMessage, /* Parent class */ +// get_type /* Name of function */ +// /* Arguments */ +// ); +// } + +// std::string get_content() const override { +// PYBIND11_OVERRIDE_PURE( +// std::string, +// purecpp::chat::BaseMessage, +// get_content +// ); +// } +// }; + +// void bind_ChatClasses(py::module &m) { +// py::class_>(m, "BaseMessage") +// .def(py::init<>()) +// .def_property_readonly("type", &purecpp::chat::BaseMessage::get_type) +// .def_property_readonly("content", &purecpp::chat::BaseMessage::get_content); + +// py::class_, purecpp::chat::BaseMessage>(m, "HumanMessage") +// .def(py::init(), py::arg("content")); + +// py::class_, 
purecpp::chat::BaseMessage>(m, "AIMessage") +// .def(py::init(), py::arg("content")); + +// py::class_, purecpp::chat::BaseMessage>(m, "SystemMessage") +// .def(py::init(), py::arg("content")); + +// py::class_(m, "ChatHistory") +// .def(py::init<>()) +// .def("add_message", static_cast&)>(&purecpp::chat::ChatHistory::add_message), py::arg("message")) +// .def("add_messages", static_cast>&)>(&purecpp::chat::ChatHistory::add_message), py::arg("messages")) +// .def_property_readonly("messages", &purecpp::chat::ChatHistory::get_messages) +// .def("clear", &purecpp::chat::ChatHistory::clear) +// .def("size", &purecpp::chat::ChatHistory::size) +// .def("add_benchmark_messages_omp", &purecpp::chat::ChatHistory::add_benchmark_messages_omp, py::arg("num_messages")); +// } //-------------------------------------------------------------------------- // Main module @@ -1415,8 +1461,7 @@ PYBIND11_MODULE(RagPUREAI, m) bind_IEmbeddingOpenAI(m); bind_EmbeddingOpenAI(m); - bind_ChatClasses(m); - - py::module_ vectorDB = m.def_submodule("vectorDB", "Bindings for vector database"); - bind_VectorDB(vectorDB); + // bind_ChatClasses(m); + // py::module_ vectorDB = m.def_submodule("vectorDB", "Bindings for vector database"); + // bind_VectorDB(vectorDB); } From 15eb9791d33d7567369e4dc7060fcf39d391fe70 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Mon, 15 Sep 2025 08:18:17 -0300 Subject: [PATCH 44/65] Adding faiss as a submodule --- .gitmodules | 3 +++ scripts/faiss_installer.sh | 14 -------------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/.gitmodules b/.gitmodules index bd35155..ce0490b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,3 +5,6 @@ [submodule "libs/openai-cpp"] path = libs/openai-cpp url = https://github.com/olrea/openai-cpp.git +[submodule "libs/faiss"] + path = libs/faiss + url = https://github.com/facebookresearch/faiss.git diff --git a/scripts/faiss_installer.sh b/scripts/faiss_installer.sh index 50b49c4..d5624f3 100755 --- 
a/scripts/faiss_installer.sh +++ b/scripts/faiss_installer.sh @@ -110,20 +110,6 @@ fi -# ───────────────────────────────────────────────────────────────────────────── -# CLONE: Download FAISS repository -# ───────────────────────────────────────────────────────────────────────────── - -echo "Removing all files in repository $FAISS_DIR..." -rm -fr "$FAISS_DIR" - -echo "Cloning FAISS repository into $FAISS_DIR..." -git clone https://github.com/facebookresearch/faiss.git "$FAISS_DIR" - -cd "$FAISS_DIR" - - - # ───────────────────────────────────────────────────────────────────────────── # BUILD: Configure and compile FAISS (CPU-only) # ───────────────────────────────────────────────────────────────────────────── From 7910b9acd4f87cfcf52371ee42769b3dcbc9d3b3 Mon Sep 17 00:00:00 2001 From: BRUNO B ZAFFARI Date: Mon, 15 Sep 2025 08:53:13 -0300 Subject: [PATCH 45/65] Update build.sh --- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index f2143cd..e2b9bb1 100755 --- a/build.sh +++ b/build.sh @@ -84,7 +84,7 @@ printf "[Last Step] Sending to Sandbox \n" rm -f ../Sandbox/*.so -# cp ./src/build/Release/.so ../Sandbox/ +cp ./src/build/Release/RagPUREAI.cpython*.so ../Sandbox/ #----------------------------------------- #================= ENDING ================ From 2e37fb8b8ad2c26bf2aad624d1a98dad8bc1176c Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Mon, 15 Sep 2025 09:24:02 -0300 Subject: [PATCH 46/65] =?UTF-8?q?fix(build):=20corrige=20caminho=20do=20FA?= =?UTF-8?q?ISS=20e=20inclus=C3=A3o=20do=20ContentCleaner.h?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/CMakeLists.txt | 22 ++++++++++++++++++++-- src/binding.cpp | 16 ++++++++-------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c7dbf4f..4263074 100755 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -12,7 +12,7 @@ set(CMAKE_BUILD_TYPE "Release") # 
───────────────────────────────────────────────────────────────────────────── #------- Caminhos para FAISS ------- # ───────────────────────────────────────────────────────────────────────────── -set(FAISS_ROOT "${CMAKE_BINARY_DIR}/../../../libs/faiss") +set(FAISS_ROOT "${CMAKE_BINARY_DIR}../libs/faiss") set(FAISS_INCLUDE_DIR "${FAISS_ROOT}/faiss") set(FAISS_LIB_DIR "${FAISS_ROOT}/build/faiss") # Must contain libfaiss.a @@ -23,6 +23,24 @@ set_target_properties(faiss PROPERTIES ) +# # ── FAISS (robusto a layout de build) +# set(FAISS_ROOT "${CMAKE_SOURCE_DIR}/../libs/faiss") +# set(FAISS_INCLUDE_DIR "${FAISS_ROOT}/faiss") +# set(FAISS_LIB_DIR "${FAISS_ROOT}/build/faiss") + +# # Aceita tanto .a quanto .so +# find_library(FAISS_LIB NAMES faiss PATHS "${FAISS_LIB_DIR}" NO_DEFAULT_PATH) + +# if(NOT FAISS_LIB) +# message(FATAL_ERROR "FAISS não encontrado em ${FAISS_LIB_DIR}. Construa o FAISS ou ajuste o caminho.") +# endif() + +# add_library(faiss UNKNOWN IMPORTED) +# set_target_properties(faiss PROPERTIES +# IMPORTED_LOCATION "${FAISS_LIB}" +# INTERFACE_INCLUDE_DIRECTORIES "${FAISS_INCLUDE_DIR}" +# ) + # ───────────────────────────────────────────────────────────────────────────── #------- Compiler Specific flags ------- # ───────────────────────────────────────────────────────────────────────────── @@ -296,4 +314,4 @@ target_link_libraries(RagPUREAI PRIVATE RagPUREAILib) # ───────────────────────────────────────────────────────────────────────────── #---------------- DEBUG MESSAGES # ───────────────────────────────────────────────────────────────────────────── -# ... 
\ No newline at end of file +#${CMAKE_SOURCE_DIR}) \ No newline at end of file diff --git a/src/binding.cpp b/src/binding.cpp index 5e4d40c..d8ee775 100644 --- a/src/binding.cpp +++ b/src/binding.cpp @@ -31,7 +31,7 @@ #include "TXTLoader/TXTLoader.h" #include "WebLoader/WebLoader.h" -#include "ContentCleaner/ContentCleaner.h" +#include "ContentCleaner.h" #include "ChunkDefault/ChunkDefault.h" #include "ChunkCount/ChunkCount.h" @@ -41,7 +41,7 @@ #include "FAISSVectorSearch/FAISSVectorSearch.h" -#include "../components/MetadataExtractor/Document.h" +#include "MetadataExtractor/Document.h" #include "IMetadataExtractor.h" #include "MetadataExtractor.h" #include "MetadataRegexExtractor/IMetadataRegexExtractor.h" @@ -49,17 +49,17 @@ #include "MetadataRegexExtractor/MetadataRegexExtractor.h" #include "MetadataHFExtractor/MetadataHFExtractor.h" -#include "../components/Embedding/Document.h" +#include "Embedding/Document.h" #include "IBaseEmbedding.h" #include "EmbeddingOpenAI/IEmbeddingOpenAI.h" #include "EmbeddingOpenAI/EmbeddingOpenAI.h" -// #include "../components/Chat/Message/BaseMessage.h" -// #include "../components/Chat/Message/HumanMessage.h" -// #include "../components/Chat/Message/AIMessage.h" -// #include "../components/Chat/Message/SystemMessage.h" -// #include "../components/Chat/ChatHistory/ChatHistory.h" +// #include "Chat/Message/BaseMessage.h" +// #include "Chat/Message/HumanMessage.h" +// #include "Chat/Message/AIMessage.h" +// #include "Chat/Message/SystemMessage.h" +// #include "Chat/ChatHistory/ChatHistory.h" namespace py = pybind11; using namespace RAGLibrary; From e90104a9621af768b291ece0127ee3bff2c9502a Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Mon, 15 Sep 2025 09:32:14 -0300 Subject: [PATCH 47/65] Fix faiss build --- scripts/faiss_installer.sh | 3 +++ src/CMakeLists.txt | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/faiss_installer.sh b/scripts/faiss_installer.sh index d5624f3..e99b496 100755 --- 
a/scripts/faiss_installer.sh +++ b/scripts/faiss_installer.sh @@ -113,6 +113,7 @@ fi # ───────────────────────────────────────────────────────────────────────────── # BUILD: Configure and compile FAISS (CPU-only) # ───────────────────────────────────────────────────────────────────────────── +cd "$FAISS_DIR" echo "Configuring CMake for CPU-only FAISS build..." cmake -B build \ @@ -126,6 +127,8 @@ echo "Building FAISS..." cmake --build build --parallel 3 # cmake --build build -j$(nproc) +cd "$PROJ_DIR" + echo "FAISS has been successfully built." diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4263074..bf05ccf 100755 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -12,7 +12,7 @@ set(CMAKE_BUILD_TYPE "Release") # ───────────────────────────────────────────────────────────────────────────── #------- Caminhos para FAISS ------- # ───────────────────────────────────────────────────────────────────────────── -set(FAISS_ROOT "${CMAKE_BINARY_DIR}../libs/faiss") +set(FAISS_ROOT "${CMAKE_BINARY_DIR}/../../../libs/faiss") set(FAISS_INCLUDE_DIR "${FAISS_ROOT}/faiss") set(FAISS_LIB_DIR "${FAISS_ROOT}/build/faiss") # Must contain libfaiss.a From 081382625865c3a9658f6c9eede98a3abc2c7552 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 21 Sep 2025 09:05:32 -0300 Subject: [PATCH 48/65] Update .gitignore --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore b/.gitignore index 9607eab..f377f27 100644 --- a/.gitignore +++ b/.gitignore @@ -53,9 +53,7 @@ models/* libtorch*.zip libs/libtorch -libs/faiss/* -conan.lock libtorch/ extern/* From c5e317eef912fdc0541dbb10126bfde0a62c314d Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 21 Sep 2025 09:06:03 -0300 Subject: [PATCH 49/65] Update README.md --- README.md | 273 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 205 insertions(+), 68 deletions(-) diff --git a/README.md b/README.md index ca01461..b0dcd1d 100644 --- a/README.md +++ b/README.md @@ -1,123 +1,271 @@ -# PureCPP 
+# PureCPP -**PureCPP** is the C++ backend powering the core logic of the RAG (Retrieval-Augmented Generation) system. It provides high-performance native modules that integrate seamlessly with Python via bindings. +[![Status](https://img.shields.io/badge/status-stable-brightgreen?style=flat-square)]() -## Contributing +**PureCPP is a powerful C++ backend architecture for RAG systems.**\ +Designed for maximum performance and scalability, it integrates vector search, ONNX models, and CPU/CUDA acceleration into a seamless, Python-integrated framework. -We welcome contributions to **PureCPP**! +*This repository provides detailed guidance on how to set up the environment, configure dependencies, and build the project.* -Before submitting a pull request or issue, please read our [Contribution Guide](/community/CONTRIBUTING.md). +## 📚 Table of Contents +- [1. Docker Environment Setup](#docker-environment-setup) +- [2. Local Environment Setup](#local-environment-setup) +- [3. Using Pre-trained Models](#use-pre-trained-models) + +--- ## Project Structure ``` -. ├── scripts/ # Shell utilities and setup scripts ├── package/ # Python package │ └── purecpp/ # Contains the compiled .so -├── build/ # Generated build files -├── libs/ # Third-party dependencies -├── CMakeLists.txt # Main build config +├── libs/ # Dependencies +├── src/ # source files and CMake entry +│ ├── build/ # Generated build files +│ └── CMakeLists.txt # Main build config +├── models/ +│ ├── hf_extract_model.py +│ ├── hf_model_to_onnx.py +│ └── .onnx ├── Dockerfile # Build environment └── README.md -``` +```` + +### Documentation -## Documentation +For a detailed explanation of features, please refer to our 🔗 [official documentation](https://docs.puredocs.org/setup). -For full installation and setup instructions, visit our official documentation: +### Contributing to PureCPP -🔗 [PureCPP Documentation](https://docs.puredocs.org/setup) +We welcome contributions to **PureCPP**!
+ +**If you would like to contribute, please read our 👉 [contribution guide](/community/CONTRIBUTING.md).** +### Requirements + +- ***GCC/G++** >= 13.1* +- ***CMake** >= 3.22* +- ***Python** >= 3.8* + ## Quick Start with PIP -To install the package via `pip` (for end-users): +To install the package via `pip` **(for end-users)**: ```bash pip install purecpp ``` -## Build Options +--- +--- +# Build Options +--- -You can either **build locally** or use our **Docker environment** to ensure consistency. +## Docker Environment Setup -### Building with Docker (Recommended) +* **1. Clone the repository along with all its submodules (recursively)** -To simplify setup and avoid installing system-wide dependencies, use the provided Dockerfile. +```bash +git clone --recursive https://github.com/pureai-ecosystem/purecpp +``` -#### Step 1: Build the Docker image +* **2. Navigate into the cloned repository folder** ```bash -docker build -t purecpp . +cd purecpp ``` -#### Step 2: Start a bash shell inside the container +* **3. Build a Docker image from the current directory and tag it as 'pure_faiss'** ```bash -docker run -it --rm purecpp bash +docker build -t pure_faiss . ``` -#### Step 3: Inside the container, build the project +* **4. Start a Docker container named 'env' from the 'pure_faiss' image, mounting current dir to /home** ```bash -./build +docker run -it --name env -v "$PWD":/home pure_faiss ``` +> ## Note +> Once you've created the container using `docker run`, ***you don't need to recreate it again.*** +> Instead, follow these two simple commands to ***reuse*** the container: -This will generate the shared object (`RagPUREAI.cpython-*.so`) in the `build/Release/` directory. 
+> ```bash +> docker start env +> ```` +> This command **starts an existing container** that was already created earlier using `docker run`. -#### Step 4: Copy `.so` to your test folder +> ```bash +> docker exec -it env bash +> ``` +> This command **attaches a terminal to the running container**, allowing you to interact with it just like you would with a regular Linux shell. -To test the Python bindings, copy the `.so` file to your test script directory: + +* **5. Execute `env_config.sh` (to install FAISS and torch, and to configure Conan)** ```bash -cp build/Release/RagPUREAI*.so /some-test-folder +chmod +x -R scripts/*.sh +./scripts/env_config.sh ``` -### Building Locally (Alternative) +* **6. Make the build.sh script executable and build it** -You may also build the project manually without Docker, if your environment satisfies the requirements. +```bash +chmod +x build.sh +./build.sh +``` -#### Minimum Requirements to Build Locally +--- +--- -* **Python** ≥ 3.8 -* **CMake** ≥ 3.22 -* **Conan** ≥ 2.0 -* **Rust** -* **GCC/G++** = 13 -* **Protobuf Compiler** +## Local Environment Setup -#### Build Steps +### 1. Clone the Repository ```bash -chmod +x scripts/install_python_dependencies.sh -chmod +x scripts/install_torch.sh -chmod +x scripts/install_libs.sh -chmod +x scripts/configure_conan_profile.sh -chmod +x build +git clone --recursive https://github.com/pureai-ecosystem/purecpp +cd purecpp +```` + +> **Note:** +> If you forgot to use `--recursive` when cloning the repository, +> make sure to run: +> +> ```bash +> git submodule update --init --recursive +> ``` +> +> This will initialize and update all required Git submodules. + +### 2.
Installing dependencies + +- **Ubuntu/Debian** +```bash +sudo apt update && \ +sudo apt upgrade -y && \ +sudo apt install -y \ + build-essential wget curl \ + ninja-build cmake libopenblas-dev \ + libgflags-dev python3-dev libprotobuf-dev \ + protobuf-compiler unzip libssl-dev zlib1g-dev +```` + +- **RedHat** +```bash +yum update && +yum install -y \ + gcc gcc-c++ make git curl wget \ + ninja-build libffi-devel openssl-devel \ + protobuf-devel gflags-devel zlib-devel \ + openblas-devel unzip +```` -# Install dependencies -./scripts/install_python_dependencies.sh -./scripts/install_torch.sh -./scripts/install_libs.sh -./scripts/configure_conan_profile.sh +### 3. Install essential Python packages + +*In case you do not have a Docker environment available*, we strongly recommend that you use a Python `venv` (virtual environment) to ensure proper isolation of dependencies and reproducibility of results. This practice minimizes conflicts between global packages and project-specific requirements, fostering a cleaner and more maintainable development setup. + +Follow the steps below to create and activate the virtual environment: + + - Create the virtual environment (replace 'venv' with your preferred name) + ```bash + python3 -m venv venv + ```` + - Activate the virtual environment on Linux or macOS + ```bash + source venv/bin/activate + ```` + +```bash +pip install build conan cmake requests pybind11 +```` + + +### 4. Install Rust via rustup + +*Run the rustup installer non-interactively (`-y`), which places `cargo` and `rustc` in `~/.cargo`, then activate the Rust environment:* + +```bash +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +source ~/.cargo/env +```` + +### 5. Execute `env_config.sh` **(to install FAISS and torch, and to configure Conan)** + +```bash +chmod +x -R ./installers/*.sh +./installers/env_config.sh +```` -# Build the project -./build -``` -The output `.so` file will be located in `build/Release/`. + +### 6.
Make the `build.sh` script executable and run it + +```bash +chmod +x build.sh +./build.sh +``` --- ## Testing Locally -To test the Python bindings: +> This is a development version with an automated build pipeline that streamlines the process, making it easy to compile and test all modules automatically. \ +> To compile and build, just use the provided scripts — no manual setup needed.\ +> The resulting libraries will be placed inside [`Sandbox/`](/Sandbox) -```python -from RagPUREAI import SomeExposedFunction +```SourceTree +Sandbox/ +├── RagPUREAI.cpython-312-x86_64-linux-gnu.so +└── ... ``` -Ensure `RagPUREAI*.so` is placed in the same folder as your Python project. +--- +--- + +# Use pre-trained models + +### 🛠️ Hugging Face to **ONNX** Converter + +These Python scripts convert Hugging Face models into the ONNX format for optimized inference. + +These scripts handle two main use cases: +1. **Feature extraction models** (e.g., `sentence-transformers`). +2. **Token classification models** (e.g., Named Entity Recognition - NER). + +They automatically create a `models` directory (in the parent folder of the script) to store the exported ONNX models and related assets.
+ +### Requirements + + *Before running the script, make sure you have the following Python packages installed:* + ```bash + pip install torch transformers onnx onnxruntime optimum + ``` + +### Examples + +```bash +python3 models/hf_model_to_onnx.py -m="dbmdz/bert-large-cased-finetuned-conll03-english" -o="bert-large-cased-finetuned-conll03-english" +```` +```bash +python3 models/hf_model_to_onnx.py -m="sentence-transformers/all-MiniLM-L6-v2" -o="sentence-transformers/all-MiniLM-L6-v2" +``` + +### Output + +``` +./models/ + ├── hf_extract_model.py + ├── hf_model_to_onnx.py + ├── sentence-transformers/all-MiniLM-L6-v2/ + │ ├── model.onnx (via optimum) + │ └── tokenizer/ + └── dslim/bert-base-NER/ + ├── model.onnx + ├── label_map.json + └── tokenizer/ +``` + +--- --- ## Publishing to PyPI @@ -136,19 +284,8 @@ This script will: --- -## Downloading Pre-trained Models - -You can convert HuggingFace models to ONNX using: - -```bash -python3 scripts/hf_model_to_onnx.py -m="dbmdz/bert-large-cased-finetuned-conll03-english" -o="bert-large-cased-finetuned-conll03-english" -python3 scripts/hf_model_to_onnx.py -m="sentence-transformers/all-MiniLM-L6-v2" -o="sentence-transformers/all-MiniLM-L6-v2" -``` - ---- - ## Next Steps ![Next Steps](community/release.jpg) -Stay tuned for updates and new model integrations! 🚀 \ No newline at end of file +Stay tuned for updates and new model integrations! 
🚀 From e1533b66f5685206af8a523063354ed6f0530c19 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 21 Sep 2025 09:06:31 -0300 Subject: [PATCH 50/65] Renamed to faiss_backend --- .../faiss_backend.cpp} | 22 +++++++++---------- .../faiss_backend.h} | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) rename components/{FAISSVectorSearch/FAISSVectorSearch.cpp => FAISSBackend/faiss_backend.cpp} (82%) rename components/{FAISSVectorSearch/FAISSVectorSearch.h => FAISSBackend/faiss_backend.h} (98%) diff --git a/components/FAISSVectorSearch/FAISSVectorSearch.cpp b/components/FAISSBackend/faiss_backend.cpp similarity index 82% rename from components/FAISSVectorSearch/FAISSVectorSearch.cpp rename to components/FAISSBackend/faiss_backend.cpp index 8664b6d..16892f6 100644 --- a/components/FAISSVectorSearch/FAISSVectorSearch.cpp +++ b/components/FAISSBackend/faiss_backend.cpp @@ -1,7 +1,7 @@ -#include +#include -std::optional -FAISSVectorSearch::PureL2(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k) { +std::optional +faiss_backend::PureL2(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k) { Chunk::ChunkQuery cq(query, {}, &chunks, pos); size_t nq, d, ndb; std::tie(nq, d, ndb) = cq.getPar(); @@ -36,13 +36,13 @@ FAISSVectorSearch::PureL2(std::string query, const Chunk::ChunkDefault& chunks, if (D.size() > 0) { std::cout << "Nearest index: " << I[0] << std::endl; std::cout << "Distance: " << D[0] << std::endl; - return FAISSVectorSearch::PureResult{I, D}; + return faiss_backend::PureResult{I, D}; } return {}; } -std::optional -FAISSVectorSearch::PureIP(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k) { +std::optional +faiss_backend::PureIP(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k) { Chunk::ChunkQuery cq(query, {}, &chunks, pos); size_t nq, d, ndb; std::tie(nq, d, ndb) = cq.getPar(); @@ -77,13 +77,13 @@ FAISSVectorSearch::PureIP(std::string query, const Chunk::ChunkDefault& 
chunks, if (D.size() > 0) { std::cout << "Most similar index: " << I[0] << std::endl; std::cout << "Similarity score: " << D[0] << std::endl; - return FAISSVectorSearch::PureResult{I, D}; + return faiss_backend::PureResult{I, D}; } return {}; } -std::optional -FAISSVectorSearch::PureCosine(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k) { +std::optional +faiss_backend::PureCosine(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k) { Chunk::ChunkQuery cq(query, {}, &chunks, pos); size_t nq, d, ndb; std::tie(nq, d, ndb) = cq.getPar(); @@ -118,12 +118,12 @@ FAISSVectorSearch::PureCosine(std::string query, const Chunk::ChunkDefault& chun index.search(nq, normalized_query.data(), k, D.data(), I.data()); if (D.size() > 0) { - return FAISSVectorSearch::PureResult{I, D}; + return faiss_backend::PureResult{I, D}; } return {}; } -void FAISSVectorSearch::normalize_vector(float* vec, size_t d) { +void faiss_backend::normalize_vector(float* vec, size_t d) { float norm = 0.0f; for (size_t i = 0; i < d; ++i) { norm += vec[i] * vec[i]; diff --git a/components/FAISSVectorSearch/FAISSVectorSearch.h b/components/FAISSBackend/faiss_backend.h similarity index 98% rename from components/FAISSVectorSearch/FAISSVectorSearch.h rename to components/FAISSBackend/faiss_backend.h index 9298c8e..a2d5203 100644 --- a/components/FAISSVectorSearch/FAISSVectorSearch.h +++ b/components/FAISSBackend/faiss_backend.h @@ -24,7 +24,7 @@ #include "Chunk/ChunkDefault/ChunkDefault.h" #include "Chunk/ChunkQuery/ChunkQuery.h" -namespace FAISSVectorSearch { +namespace faiss_backend { // struct vdb_data { // std::vector flatVD; From e1756ff94cc313839e55bd219128ce0d93efe185 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 21 Sep 2025 09:06:42 -0300 Subject: [PATCH 51/65] Update build.sh --- build.sh | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/build.sh b/build.sh index e2b9bb1..f5117b8 100755 --- a/build.sh 
+++ b/build.sh @@ -2,6 +2,8 @@ set -euo pipefail +cd src/ + #----------------------------------------- #================= LOGGING =============== #----------------------------------------- @@ -19,6 +21,24 @@ printf "$LINE_BRK" +# ───────────────────────────────────────────────────────────────────────────── +# Smart core splitter for parallel builds +# ───────────────────────────────────────────────────────────────────────────── + +cores=$(nproc) + +if [ "$cores" -gt 1 ]; then + half=$((cores / 2)) +else + half=1 +fi +printf "$LINE_BRK" +echo "[INFO] Detected $cores cores, using $half for parallel build." +printf "$LINE_BRK" +printf "$SEGMENT" +printf "$SEGMENT" +#----------------------------------------- + # ───────────────────────────────────────────────────────────────────────────── # Conan # ───────────────────────────────────────────────────────────────────────────── @@ -28,10 +48,11 @@ printf "$SEGMENT" printf "$LINE_BRK" #----------------------------------------- -rm -fr ./src/build ./src/conan.lock +rm -fr ./build +conan install . --build=missing -c tools.build:jobs=$half -conan lock create ./src --build=missing -conan install ./src --build=missing +# rm -fr ./conan.lock +# conan lock create . 
--build=missing -c tools.build:jobs=$half #----------------------------------------- #================= ENDING ================ @@ -51,7 +72,6 @@ printf " Begin [Build]$LINE_BRK" printf "$SEGMENT" printf "$LINE_BRK" #----------------------------------------- -cd src/ cmake -DCMAKE_POLICY_DEFAULT_CMP0091=NEW \ -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \ @@ -64,7 +84,7 @@ cmake -DCMAKE_POLICY_DEFAULT_CMP0091=NEW \ -B "$(pwd)/build/Release" \ -G "Unix Makefiles" -cmake --build "$(pwd)/build/Release" --parallel $(nproc) +cmake --build "$(pwd)/build/Release" --parallel $half # cmake --build --preset conan-release --parallel $(nproc) --target RagPUREAI #----------------------------------------- @@ -80,11 +100,12 @@ printf "$SEGMENT$SEGMENT$SEGMENT\n" # ───────────────────────────────────────────────────────────────────────────── # Sending to Sandbox # ───────────────────────────────────────────────────────────────────────────── + printf "[Last Step] Sending to Sandbox \n" rm -f ../Sandbox/*.so -cp ./src/build/Release/RagPUREAI.cpython*.so ../Sandbox/ +cp ./build/Release/RagPUREAI.cpython*.so ../Sandbox/ #----------------------------------------- #================= ENDING ================ From bf9eab923119af3c3b4fe152bb84362fce47fd9e Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 21 Sep 2025 09:07:00 -0300 Subject: [PATCH 52/65] Update CMAKELISTS --- src/CMakeLists.txt | 73 +++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bf05ccf..5770b9b 100755 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,7 +10,7 @@ set(CMAKE_BUILD_TYPE "Release") # ───────────────────────────────────────────────────────────────────────────── -#------- Caminhos para FAISS ------- +#------- Path to FAISS ------- # ───────────────────────────────────────────────────────────────────────────── set(FAISS_ROOT "${CMAKE_BINARY_DIR}/../../../libs/faiss") set(FAISS_INCLUDE_DIR 
"${FAISS_ROOT}/faiss") @@ -23,24 +23,6 @@ set_target_properties(faiss PROPERTIES ) -# # ── FAISS (robusto a layout de build) -# set(FAISS_ROOT "${CMAKE_SOURCE_DIR}/../libs/faiss") -# set(FAISS_INCLUDE_DIR "${FAISS_ROOT}/faiss") -# set(FAISS_LIB_DIR "${FAISS_ROOT}/build/faiss") - -# # Aceita tanto .a quanto .so -# find_library(FAISS_LIB NAMES faiss PATHS "${FAISS_LIB_DIR}" NO_DEFAULT_PATH) - -# if(NOT FAISS_LIB) -# message(FATAL_ERROR "FAISS não encontrado em ${FAISS_LIB_DIR}. Construa o FAISS ou ajuste o caminho.") -# endif() - -# add_library(faiss UNKNOWN IMPORTED) -# set_target_properties(faiss PROPERTIES -# IMPORTED_LOCATION "${FAISS_LIB}" -# INTERFACE_INCLUDE_DIRECTORIES "${FAISS_INCLUDE_DIR}" -# ) - # ───────────────────────────────────────────────────────────────────────────── #------- Compiler Specific flags ------- # ───────────────────────────────────────────────────────────────────────────── @@ -123,17 +105,17 @@ endif() #------- Find other dependencies ------- # ───────────────────────────────────────────────────────────────────────────── find_package(pybind11 REQUIRED) -find_package(pdfium REQUIRED) # Biblioteca para manipulação de PDFs -find_package(OpenMP REQUIRED) # OpenMP primeiro, pois pode ser usado por outros pacotes -find_package(ICU REQUIRED) # Biblioteca de internacionalização -find_package(miniz REQUIRED) # Biblioteca de compressão -find_package(rapidxml REQUIRED) # Parser XML -find_package(beauty REQUIRED) # HTTP Server (geralmente independente) -find_package(lexbor REQUIRED) # Biblioteca de parsing HTML -find_package(re2 REQUIRED) # Biblioteca de regex eficiente -find_package(nlohmann_json REQUIRED) # Biblioteca de JSON (não tem dependências) -#find_package(fmt REQUIRED) # Biblioteca de formatação de strings em casos aonde n tem format em compiladores mais antigos -find_package(CURL REQUIRED) # Biblioteca para requisições HTTP +find_package(pdfium REQUIRED) +find_package(OpenMP REQUIRED) +find_package(ICU REQUIRED) +find_package(miniz 
REQUIRED) +find_package(rapidxml REQUIRED) +find_package(beauty REQUIRED) +find_package(lexbor REQUIRED) +find_package(re2 REQUIRED) +find_package(nlohmann_json REQUIRED) +#find_package(fmt REQUIRED) +find_package(CURL REQUIRED) find_package(Threads REQUIRED) # ───────────────────────────────────────────────────────────────────────────── @@ -191,6 +173,9 @@ set(CMAKE_BUILD_RPATH "/usr/lib64" "\$ORIGIN/" "\$ORIGIN/purecpp/d_libs/libtorch/cpu/lib" + + "${CMAKE_SOURCE_DIR}/../libs/libtorch/cpu/lib" + "\$ORIGIN/../libs/libtorch/cpu/lib" ) set(CMAKE_INSTALL_RPATH @@ -204,13 +189,15 @@ set(CMAKE_INSTALL_RPATH "/usr/lib64" "\$ORIGIN/" "\$ORIGIN/purecpp/d_libs/libtorch/cpu/lib" + + "${CMAKE_SOURCE_DIR}/../libs/libtorch/cpu/lib" + "\$ORIGIN/../libs/libtorch/cpu/lib" ) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--enable-new-dtags") set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) - # ───────────────────────────────────────────────────────────────────────────── #------- Tokenizers ------- # ───────────────────────────────────────────────────────────────────────────── @@ -222,14 +209,19 @@ add_subdirectory(${TOKENIZERS_PATH} tokenizers EXCLUDE_FROM_ALL) # ───────────────────────────────────────────────────────────────────────────── set(OPENAI_CPP_PATH "${CMAKE_SOURCE_DIR}/../libs/openai-cpp") - # ───────────────────────────────────────────────────────────────────────────── # ----- Project sources ----- # ───────────────────────────────────────────────────────────────────────────── -set(RagPUREAI_BINDING_SRCS "${CMAKE_SOURCE_DIR}/binding.cpp") + +set(RagPUREAI_BINDING_SRCS binding.cpp) set(RagPUREAI_IMPL_SRCS ${CMAKE_SOURCE_DIR}/../libs/StringUtils/StringUtils.cpp + ${CMAKE_SOURCE_DIR}/../components/DataLoader/BaseLoader.cpp + ${CMAKE_SOURCE_DIR}/../components/DataLoader/PDFLoader/PDFLoader.cpp + ${CMAKE_SOURCE_DIR}/../components/DataLoader/DOCXLoader/DOCXLoader.cpp + ${CMAKE_SOURCE_DIR}/../components/DataLoader/WebLoader/WebLoader.cpp + 
${CMAKE_SOURCE_DIR}/../components/DataLoader/TXTLoader/TXTLoader.cpp ${CMAKE_SOURCE_DIR}/../components/Embedding/BaseEmbedding.cpp ${CMAKE_SOURCE_DIR}/../components/Embedding/EmbeddingOpenAI/EmbeddingOpenAI.cpp @@ -240,8 +232,13 @@ set(RagPUREAI_IMPL_SRCS ${CMAKE_SOURCE_DIR}/../components/Chunk/ChunkSimilarity/ChunkSimilarity.cpp ${CMAKE_SOURCE_DIR}/../components/Chunk/ChunkQuery/ChunkQuery.cpp + ${CMAKE_SOURCE_DIR}/../components/MetadataExtractor/MetadataExtractor.cpp + ${CMAKE_SOURCE_DIR}/../components/MetadataExtractor/MetadataRegexExtractor/MetadataRegexExtractor.cpp + ${CMAKE_SOURCE_DIR}/../components/MetadataExtractor/MetadataHFExtractor/MetadataHFExtractor.cpp + ${CMAKE_SOURCE_DIR}/../components/CleanData/ContentCleaner/ContentCleaner.cpp - ${CMAKE_SOURCE_DIR}/../components/FAISSVectorSearch/FAISSVectorSearch.cpp + + ${CMAKE_SOURCE_DIR}/../components/FAISSBackend/faiss_backend.cpp ) @@ -258,6 +255,7 @@ target_include_directories(RagPUREAILib PUBLIC #.h ${CMAKE_SOURCE_DIR}/../libs/StringUtils ${CMAKE_SOURCE_DIR}/../libs/FileUtils ${CMAKE_SOURCE_DIR}/../libs/MemoryUtils + #C:/vcpkg/packages/protobuf_x64-windows/include ${CURL_INCLUDE_DIRS} ${OPENAI_CPP_PATH}/include @@ -273,15 +271,16 @@ target_include_directories(RagPUREAILib PUBLIC #.h ${CMAKE_SOURCE_DIR}/../components/CleanData/ContentCleaner ${CMAKE_SOURCE_DIR}/../components/Embedding/EmbeddingOpenAI ${CMAKE_SOURCE_DIR}/../components/Embedding - ${CMAKE_SOURCE_DIR}/../components/FAISSVectorSearch + ${CMAKE_SOURCE_DIR}/../components/FAISSBackend ${CMAKE_SOURCE_DIR}/../libs/faiss/ ) -link_directories(${FAISS_LIB_DIR}) # ───────────────────────────────────────────────────────────────────────────── # Link libraries with RagPUREAILib # ───────────────────────────────────────────────────────────────────────────── +link_directories(${FAISS_LIB_DIR}) + target_link_libraries(RagPUREAILib PUBLIC pdfium::pdfium icu::icu @@ -301,7 +300,9 @@ target_link_libraries(RagPUREAILib PUBLIC ${TORCH_LIBRARIES} tokenizers_cpp 
faiss + # ${FAISS_LIB_DIR}/libfaiss.a ) + # ───────────────────────────────────────────────────────────────────────────── #---------------- Pybind11 Module # ───────────────────────────────────────────────────────────────────────────── From fb0788c487a487e1b207306cd8c96720b8bfd0f6 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 21 Sep 2025 09:08:08 -0300 Subject: [PATCH 53/65] Adding new features binding --- src/binding.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/binding.cpp b/src/binding.cpp index d8ee775..ee98299 100644 --- a/src/binding.cpp +++ b/src/binding.cpp @@ -39,7 +39,7 @@ #include "ChunkCommons/ChunkCommons.h" #include "ChunkQuery/ChunkQuery.h" -#include "FAISSVectorSearch/FAISSVectorSearch.h" +#include "FAISSBackend/faiss_backend.h" #include "MetadataExtractor/Document.h" #include "IMetadataExtractor.h" @@ -1328,11 +1328,11 @@ void bind_EmbeddingOpenAI(py::module &m) )doc"); } -void bind_FAISSVectorSearch(py::module& m) { - py::class_(m, "PureResult") - .def_readonly("index", &FAISSVectorSearch::PureResult::indices) // user-friendly alias - .def_readonly("distances", &FAISSVectorSearch::PureResult::distances) - .def("__repr__", [](const FAISSVectorSearch::PureResult& self) { +void bind_faiss_backend(py::module& m) { + py::class_(m, "PureResult") + .def_readonly("index", &faiss_backend::PureResult::indices) // user-friendly alias + .def_readonly("distances", &faiss_backend::PureResult::distances) + .def("__repr__", [](const faiss_backend::PureResult& self) { std::ostringstream oss; oss << "PureResult(index="; oss << py::repr(py::cast(self.indices)); @@ -1342,7 +1342,7 @@ void bind_FAISSVectorSearch(py::module& m) { return oss.str(); }); - m.def("PureL2", &FAISSVectorSearch::PureL2, + m.def("PureL2", &faiss_backend::PureL2, py::arg("query"), py::arg("chunks"), py::arg("pos"), @@ -1352,7 +1352,7 @@ void bind_FAISSVectorSearch(py::module& m) { Returns the top-k most similar vectors from the database. 
)pbdoc"); - m.def("PureIP", &FAISSVectorSearch::PureIP, + m.def("PureIP", &faiss_backend::PureIP, py::arg("query"), py::arg("chunks"), py::arg("pos"), @@ -1362,7 +1362,7 @@ void bind_FAISSVectorSearch(py::module& m) { Suitable when the magnitude of vectors is meaningful. )pbdoc"); - m.def("PureCosine", &FAISSVectorSearch::PureCosine, + m.def("PureCosine", &faiss_backend::PureCosine, py::arg("query"), py::arg("chunks"), py::arg("pos"), From 34a7a12ca21c0d5ff2772614efe66ef46a8be3b3 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sun, 21 Sep 2025 10:03:34 -0300 Subject: [PATCH 54/65] Final Version SUPPORT.md is identical to CONTRIBUTE --- SUPPORT.md | 147 ------------------- components/Chunk/ChunkCommons/ChunkCommons.h | 4 +- 2 files changed, 2 insertions(+), 149 deletions(-) delete mode 100644 SUPPORT.md diff --git a/SUPPORT.md b/SUPPORT.md deleted file mode 100644 index 70d88c9..0000000 --- a/SUPPORT.md +++ /dev/null @@ -1,147 +0,0 @@ -# 🧙🏼‍♂️Contributing to PureCPP - -Welcome to **PureCPP**, where efficiency and optimization are the foundation of everything we build. Here, we value clean, fast, and powerful code. - -Want to contribute? Whether you're creating new integrations, improving performance, or expanding features, every line of code matters. Together, we’ll take high-performance computing to the next level. - -💡 Ready to code without limits? Let’s get to work! 💡 - ---- - -## 💪🏽 Quick Start Guide - -Ready to jump in? Follow this quick setup guide to get started smoothly: - -1. **Fork** the repo and clone your fork. -2. Navigate to the project folder: - ```bash - cd purecpp - ``` - -3. Make sure you have the following packages installed: -- GCC/G++ 13.1 -- CMake 3.22+ -- Conan 2 -- Rust -- Python 3.8+ - -4. Install the required dependencies:`: -Depending on your system, you may need: - - ```bash - sudo apt update && sudo apt install -y gcc-13 g++-13 cmake conan rustc cargo - ``` -5. 
Install development conan dependencies:: - - ```bash - !pip install conan==2.* - ``` - -6. Run the tests to ensure everything is working:: - ```bash - ./tests/run_tests - ``` - - - All set! Now it's time to build something powerful. If you need more details, check out the [Development Guidelines](#-Development-Guidelines). - ---- -## Community Discord -Join our community [Discord](https://discord.gg/8eF9v78Ndv) to ask questions, get support, and collaborate with fellow contributors and users. - - -## ⚡ What Can You Contribute To? - -There are many ways to contribute to **PureCPP**—whether you're a **C++ expert** or just starting out with **high-performance computing**. Here, we focus on **performance, efficiency, and scalability**. Your contributions are always welcome! - -## 1. 🚀 Expand Core Modules - -Help us improve **PureCPP** by contributing to our core modules and making the framework even more optimized. - -- **New Integrations** (e.g., support for new compilers, optimized bindings, high-performance libraries) -- **Memory Management**, **Parallelism (Threads and CUDA)**, **Matrix and Tensor Operations** -- **Advanced Chunking Techniques** to optimize processing -- **Efficient Metadata Extraction and Management** - -## 2. ⚙️ Dataloaders and Smart Storage - -- **Optimized Dataloaders** for different file types and databases -- **Efficient indexing and retrieval** -- **Smart loading strategies to optimize search performance** - -## 3. 🏎️ Vector Database and LLMs - -- Implementation and optimization of **high-performance vector databases** -- **Integration of LLMs** and embedding models for semantic search -- Support for **quantization, fine-tuning, and CUDA optimizations** - -## 4. 🛠️ Bug Fixes and Code Improvements - -Found something that could be optimized? Code improvements are always welcome! Check out the [GitHub Labels](https://github.com/pureai-ecosystem/purecpp/labels) - -## 5. 
📚 Share Usage Examples - -If you’ve used **PureCPP** in an innovative way, share your examples and contribute to the community. - -## 6. 🔬 Experiments and New Approaches - -Got a different idea? We’re open to tests and new approaches—experiment and submit a PR! - - - ---- - -## 🚀 **Next Steps: What Are We Planning?** - -We are always evolving! Here are the next steps to make our pipelines even more efficient and powerful: - -### 🔹 **New Features** -✅ **Add local Vector Databases** to enhance semantic search performance -✅ **Integrate local LLMs** and create connectors for inference frameworks - -### 🔧 **Fixes & Improvements** -🛠️ **Optimize data extraction** for greater efficiency -📌 **Add Schema** to better structure data -📌 **Expand the variety of models** in our components -🔄 **Enhance chunking techniques** for smarter processing -📈 **Improve embeddings** for more precise vector representations -🗂️ **Refine metadata extraction** for better contextualization - -💡 **Got an idea?** Your contribution is more than welcome! Join us and help take this project even further. 🚀 - ---- - - -## ✨ Steps to Contribute - -1. **Fork** the repository on GitHub. -2. **Clone** your fork to your local machine. - ```bash - git clone https://github.com/pureai-ecosystem/purecpp.git - ``` -3. **Create a branch** for your work. - ```bash - git checkout -b your-feature-branch - ``` -4. **Set up your environment** -5. **Work on your feature or bugfix**, ensuring you have unit tests covering your code. -6. **Commit** your changes, then push them to your fork. - ```bash - git push origin your-feature-branch - ``` -7. **Open a pull request** on GitHub. - -Obrigado! - ---- - - -## ⚡ **Acknowledgements: Built with Pure Performance** - -Big thanks for being part of **PureCPP**—where every bit counts, and every byte makes a difference! 
🚀 - -Whether you're optimizing loops, fine-tuning embeddings, or pushing parallel processing to the limit, your contributions fuel the engine of **high-performance computing**. - -We’re not just writing code—we’re compiling the future. 🔥 - -Keep coding at full speed! 🏎️💻 diff --git a/components/Chunk/ChunkCommons/ChunkCommons.h b/components/Chunk/ChunkCommons/ChunkCommons.h index b9777a7..5a77f53 100644 --- a/components/Chunk/ChunkCommons/ChunkCommons.h +++ b/components/Chunk/ChunkCommons/ChunkCommons.h @@ -32,13 +32,13 @@ namespace Chunk }; }; - extern inline const std::unordered_map> EmbeddingModel = { + extern inline const std::unordered_map> EmbeddingModel = { {"openai", {"text-embedding-ada-002", "text-embedding-3-small", "..."}}, {"huggingface", {"bge-small", "bge-large"}}, {"cohere", {"embed-english-light-v3.0"}} }; - inline void PrintEmbeddingModels() { + inline void PrintEmbeddingModels() { std::cout << "╔══════════════════════════════════════════════════════════╗\n"; std::cout << "║ 📦 Available Embedding Models \n"; std::cout << "╠══════════════════════════════════════════════════════════╣\n"; From 0bfe06e691ac739fa1a5b997c673d3de6870d25b Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Mon, 22 Sep 2025 10:21:00 -0300 Subject: [PATCH 55/65] Update: readme sandbox --- Sandbox/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Sandbox/README.md b/Sandbox/README.md index 77b0cf0..cef8d16 100644 --- a/Sandbox/README.md +++ b/Sandbox/README.md @@ -1,11 +1,11 @@ # Sandbox Repository -This repository is a **sandbox environment** for testing and prototyping. 
+This folder is an **isolated testing and experimentation environment** +**Every time it is compiled (using the script `./build.sh`), the output is **redirected** to the `Sandbox/` directory.** * Here, the `.so` files (shared libraries) will be available for experimentation, prototyping, and testing purposes. -* The `Resources/` folder contains a collection of publicly accessible files in various formats (.doc, .pdf, .txt), used to test the functionalities of the different loaders. These materials are assumed to be free of patent restrictions or in the public domain. \ No newline at end of file +* The `Resources/` folder contains a collection of publicly accessible files in various formats (.doc, .pdf, .txt), used to test the functionalities of the different loaders. These materials are assumed to be free of patent restrictions or in the public domain. From d49ef765a892022854efd1ad25255bf091672c05 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Wed, 24 Sep 2025 07:06:25 -0300 Subject: [PATCH 56/65] chore: rename community to docs for better semantics. - Renamed `community/` folder to `docs/` for clearer semantics and aligning with common repository conventions. - Removed `.dockerignore` as it was no longer necessary for the current Docker setup.
--- .dockerignore | 28 ---------------------------- {community => docs}/CONTRIBUTING.md | 0 {community => docs}/release.jpg | Bin 3 files changed, 28 deletions(-) delete mode 100644 .dockerignore rename {community => docs}/CONTRIBUTING.md (100%) rename {community => docs}/release.jpg (100%) diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index c368843..0000000 --- a/.dockerignore +++ /dev/null @@ -1,28 +0,0 @@ -**/build -**/conan.lock -**/CMakeUserPresets.json -bkp/ - -**/*openai_api_key* - -**/venv - -**/*.egg-info -**/dist/* - -**/*.so -**/.whl - -**/Tests/* - -libs/libtorch -libtorch*.zip - -# Ignore everything in the models folder -models/* -# But DO NOT ignore these two files -!models/hf_extract_model.py -!models/hf_model_to_onnx.py - -**/*.pdf -**/*.docx \ No newline at end of file diff --git a/community/CONTRIBUTING.md b/docs/CONTRIBUTING.md similarity index 100% rename from community/CONTRIBUTING.md rename to docs/CONTRIBUTING.md diff --git a/community/release.jpg b/docs/release.jpg similarity index 100% rename from community/release.jpg rename to docs/release.jpg From e5982ccb15df4137b672df9202a1977eeb907d1a Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Wed, 24 Sep 2025 07:06:46 -0300 Subject: [PATCH 57/65] Update readme --- README.md | 250 +++++++++++++++++++++++++++--------------------------- 1 file changed, 126 insertions(+), 124 deletions(-) diff --git a/README.md b/README.md index b0dcd1d..3a5269f 100644 --- a/README.md +++ b/README.md @@ -1,52 +1,49 @@ -# PureCPP +# PureCPP [![Status](https://img.shields.io/badge/status-stable-brightgreen?style=flat-square)]() **PureCPP is a powerful C++ backend architecture for RAG systems.**\ -Designed for maximum performance and scalability, it integrates vector search, ONNX models, and CPU/CUDA acceleration into a seamless, python integrated framework. 
+Designed for maximum performance and scalability, it integrates vector search, ONNX models, and CPU/CUDA acceleration into a seamless, Python-integrated framework. -*This repository provides detailed guidance on how to set up the environment, configure dependencies and building the project.* +*This repository provides detailed guidance on how to set up the environment, configure dependencies, and build the project.* ## 📚 Table of Contents -- [1. Docker Environment Setup](#docker-environment-setup) -- [2. Local Environment Setup](#local-environment-setup) -- [3. Using Pre-trained Models](#use-pre-trained-models) +* **1.** [Environment Setup](#environment-setup) + - [Docker](#docker) + - [Local](#local) +* **2.** [Build](#build) +* **3.** [Using Pre-trained Models](#use-pre-trained-models) --- ## Project Structure -``` +```html ├── scripts/ # Shell utilities and setup scripts ├── package/ # Python package │ └── purecpp/ # Contains the compiled .so ├── libs/ # Dependencies ├── src/ # source files and CMake entry │ ├── build/ # Generated build files +│ ├── Conanfile.py # Package manager for C and C++ │ └── CMakeLists.txt # Main build config -├── models/ +├── models/ │ ├── hf_extract_model.py │ ├── hf_model_to_onnx.py -│ └── .onnx +│ └── / ├── Dockerfile # Build environment └── README.md ```` ### Documentation -For detailed explanation of features, please refer to our 🔗 [official documentation](https://docs.puredocs.org/setup). +For a detailed explanation of the features, please refer to our 🔗 [official documentation](https://docs.puredocs.org/setup). ### Contributing to PureCPP We welcome contributions to **PureCPP**! 
**If you would like to contribute, please read our 👉 [contribution guide](/community/CONTRIBUTING.md).** - -### Requirements - -- ***GCC/G++** >= 13.1* -- ***CMake** >= 3.22* -- ***Python** >= 3.8* ## Quick Start with PIP @@ -56,167 +53,172 @@ To install the package via `pip` **(for end-users)**: pip install purecpp ``` ---- ---- -# Build Options --- -## Docker Environment Setup +## Environment Setup -* **1. Clone the repository along with all its submodules (recursively)** +--- + +### First of all clone the repository ```bash git clone --recursive https://github.com/pureai-ecosystem/purecpp -``` +cd purecpp +```` -* **2. Navigate into the cloned repository folder** +> [!WARNING] +> +> If you forgot to use `--recursive` when cloning the repository, +> make sure to run: +> +> ```bash +> git submodule update --init --recursive +> ``` -```bash -cd purecpp -``` +### **Docker** -* **3. Build a Docker image from the current directory and tag it as 'pure_faiss'** +--- -```bash -docker build -t pure_faiss . -``` +* **1. Build a Docker image from the current directory and tag it as 'purecpp_env'** -* **4. Start a Docker container named 'env' from the 'pure_faiss' image, mounting current dir to /home** + ```bash + docker build -t purecpp_env . + ``` -```bash -docker run -it --name env -v "$PWD":/home pure_faiss -``` -> ## Note +* **2. Start a Docker container named 'env' from the 'purecpp_env' image, mounting current dir to /home** + + ```bash + docker run -it --name env -v "$PWD":/home purecpp_env + ``` + +* **3. 
Execute the `env_config.sh`** + + ```bash + chmod +x scripts/*.sh + ./scripts/env_config.sh + ``` + + *This installs the essential Python packages, libtorch, and FAISS, and configures Conan.* + +> [!CAUTION] +> > Once you've created the container using `docker run`, ***you don't need to recreate it again.*** > Instead, follow these two simple commands to ***reuse*** the container: - > ```bash > docker start env -> ```` -> **This command **starts an existing container** that has already been created earlier using `docker run`.** - +> ``` +> **This command *starts an existing container* that has already been created earlier using `docker run`.** > ```bash > docker exec -it env bash > ``` -> **This command **attaches a terminal to the running container**, allowing you to interact with it just like you would with a regular Linux shell.** - +> **This command *attaches a terminal to the running container*, allowing you to interact with it just like you would with a regular Linux shell.** -* **5. Execute the `env_config.sh`** **(in order to install FAISS, torch, configure conan)** - -```bash -chmod +x -R scripts/*.sh -./scripts/env_config.sh -``` +--- -* **6. Make the build.sh script executable and build it** +### **Local** -```bash -chmod +x build.sh -./build.sh -``` - ---- --- -## Local Environment Setup +> Requirements +> +> * **Python** ≥ 3.8 +> * **CMake** ≥ 3.22 +> * **Rust** +> * **GCC/G++** ≥ 13 -### 1. Clone the Repository +#### 1.
Installing dependencies -```bash -git clone --recursive https://github.com/pureai-ecosystem/purecpp -cd purecpp -```` +- **Ubuntu/Debian** + ```bash + sudo apt update && \ + sudo apt upgrade -y && \ + sudo apt install -y \ + build-essential wget curl \ + ninja-build cmake libopenblas-dev \ + libgflags-dev python3-dev libprotobuf-dev \ + protobuf-compiler unzip libssl-dev zlib1g-dev + ``` -> **Note:** -> If you forgot to use `--recursive` when cloning the repository, -> make sure to run: -> -> ```bash -> git submodule update --init --recursive -> ``` -> -> This will initialize and update all required Git submodules. +- **Red Hat** + ```bash + yum update && \ + yum install -y \ + gcc gcc-c++ make git curl wget \ + ninja-build libffi-devel openssl-devel \ + protobuf-devel gflags-devel zlib-devel \ + openblas-devel unzip + ``` -### 2. Installing dependencies +#### 2. Install Rust via rustup -- **Ubuntu/Debian** ```bash -sudo apt update && \ -sudo apt upgrade -y && \ -sudo apt install -y \ - build-essential wget curl \ - ninja-build cmake libopenblas-dev \ - libgflags-dev python3-dev libprotobuf-dev \ - protobuf-compiler unzip libssl-dev zlib1g-dev -```` +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +``` +*Run rustup installer non-interactively (-y).* -- **RedHat** ```bash -yum update && -yum install -y \ - gcc gcc-c++ make git curl wget \ - ninja-build libffi-devel openssl-devel \ - protobuf-devel gflags-devel zlib-devel \ - openblas-devel unzip \ -```` - -### 3. Install python essential packages +source ~/.cargo/env +``` +*This places cargo and rustc in `~/.cargo` and activates the Rust environment.* -*In case you do not have a Docker environment available*, we strongly recommend that you use a Python `venv` (virtual environment) to ensure proper isolation of dependencies and reproducibility of results.
This practice minimizes conflicts between global packages and project-specific requirements, fostering a cleaner and more maintainable development setup. +#### 3. Execute the `env_config.sh` -Steps below to create and activate the virtual environment: + ***In case you do not have a Docker environment available, we strongly recommend that you use a Python `venv` (virtual environment) to ensure proper isolation of dependencies and reproducibility of results.*** - Create the virtual environment (replace 'venv' with your preferred name) + ```bash python3 -m venv venv - ```` + ``` + - Activate the virtual environment on Linux or macOS + ```bash source venv/bin/activate - ```` + ``` -```bash -pip install build conan cmake requests pybind11 -```` - - -### 4. Install Rust via rustup - -*Run rustup installer non-interactively (-y). This places cargo and rustc in /root/.cargo & activate Rust Environment:* + This practice minimizes conflicts between global packages and project-specific requirements. Use the steps below to create and activate the virtual environment. ```bash -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -source ~/.cargo/env -```` +chmod +x scripts/*.sh +./scripts/env_config.sh +``` -### 5. Execute the `env_config.sh` **(in order to install FAISS, torch, configure conan)** +*This installs the essential Python packages, libtorch, and FAISS, and configures Conan.* -```bash -chmod +x -R ./installers/*.sh -./installers/env_config.sh -```` +--- +## **Build** -### 6. Make the `build.sh` script executable & Run it +***The `build.sh` script is a development pipeline that makes it easier to compile and test.*** ```bash chmod +x build.sh ./build.sh ``` +- Cleans the `build/` folder +- Installs Conan dependencies if missing +- Compiles the code +- Sends the `RagPUREAI.*.so` output to [`Sandbox/`](/Sandbox) + --- ## Testing Locally -> This is a development version with an automatic pipline build system.
Optimizing the process, making it easy to compile and test all modules automatically in this development version. \ -> To compile and build, just use the provided scripts — no manual setup needed.\ -> The resulting libraries will be placed inside [`Sandbox/`](/Sandbox) +The `build.sh` script will place the resulting libraries inside [`Sandbox/`](/Sandbox) -```SourceTree -Sandbox/ -├── RagPUREAI.cpython-312-x86_64-linux-gnu.so -└── ... -``` + ```html + Sandbox/ + ├── Resources/ + ├── RagPUREAI.cpython-312-x86_64-linux-gnu.so + └── YOUR-TEST.py + ``` + +To test the Python bindings: + ```python + from RagPUREAI import SomeExposedFunction + ``` --- --- @@ -225,9 +227,9 @@ Sandbox/ ### 🛠️ Hugging Face to **ONNX** Converter -This Python scripts convert Hugging Face models into the ONNX format for optimized inference. +These Python scripts convert Hugging Face models into the ONNX format for optimized inference. -This scripts handles two main use cases: +These scripts handle two primary use cases: 1. **Feature extraction models** (e.g., `sentence-transformers`). 2. **Token classification models** (e.g., Named Entity Recognition - NER). 
@@ -244,7 +246,7 @@ It automatically creates a `models` directory (in the parent folder of the scrip ```bash python3 models/hf_model_to_onnx.py -m="dbmdz/bert-large-cased-finetuned-conll03-english" -o="bert-large-cased-finetuned-conll03-english" -```` +``` ```bash python3 models/hf_model_to_onnx.py -m="sentence-transformers/all-MiniLM-L6-v2" -o="sentence-transformers/all-MiniLM-L6-v2" @@ -270,10 +272,10 @@ python3 models/hf_model_to_onnx.py -m="sentence-transformers/all-MiniLM-L6-v2" - ## Publishing to PyPI -To build and upload the Python package: +To build and upload the Python package to PyPI: ```bash -./scripts/create_pip_package +./scripts/create_pip_package PYPI-API-KEY ``` This script will: @@ -286,6 +288,6 @@ This script will: ## Next Steps -![Next Steps](community/release.jpg) +![Next Steps](docs/release.jpg) Stay tuned for updates and new model integrations! 🚀 From 4012b2052f82c1514a6fdad9e917cd06e31f31ad Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Sat, 27 Sep 2025 08:30:18 -0300 Subject: [PATCH 58/65] Move .md to models --- .gitignore | 3 +-- models/README.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) create mode 100644 models/README.md diff --git a/.gitignore b/.gitignore index f377f27..c681f2f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -### C++ ### # Prerequisites **/*.d @@ -47,9 +46,9 @@ bkp/ # Ignore everything in the models folder models/* -# But DO NOT ignore these two files !models/hf_extract_model.py !models/hf_model_to_onnx.py +!models/README.md libtorch*.zip libs/libtorch diff --git a/models/README.md b/models/README.md new file mode 100644 index 0000000..59e2da5 --- /dev/null +++ b/models/README.md @@ -0,0 +1,48 @@ +--- +--- + +# Download Pre-trained Models + +## 🛠️ Hugging Face to **ONNX** Converter: + +These Python scripts convert Hugging Face models into the ONNX format for optimized inference. + +These scripts handle two primary use cases: +1. 
**Feature extraction models** (e.g., `sentence-transformers`). +2. **Token classification models** (e.g., Named Entity Recognition - NER). + +It automatically downloads the model and organizes the exported files in a structured subdirectory. + +## Requirements + + *Before running the script, make sure you have the following Python packages installed:* + ```bash + pip install torch transformers onnx onnxruntime optimum + ``` + +## Examples + +```bash +python3 models/hf_model_to_onnx.py -m="dbmdz/bert-large-cased-finetuned-conll03-english" -o="bert-large-cased-finetuned-conll03-english" +``` + +```bash +python3 models/hf_model_to_onnx.py -m="sentence-transformers/all-MiniLM-L6-v2" -o="sentence-transformers/all-MiniLM-L6-v2" +``` + +## Output + +``` +./models/ + ├── hf_extract_model.py + ├── hf_model_to_onnx.py + ├── sentence-transformers/all-MiniLM-L6-v2/ + │ ├── model.onnx (via optimum) + │ └── tokenizer/ + └── dslim/bert-base-NER/ + ├── model.onnx + ├── label_map.json + └── tokenizer/ +``` + +--- From 02c65d7a80d5574297ee8a05ac174b553caf01fb Mon Sep 17 00:00:00 2001 From: BRUNO B ZAFFARI Date: Sat, 27 Sep 2025 08:46:53 -0300 Subject: [PATCH 59/65] Revise contributing steps and community information Updated the contributing guide to streamline the contribution process and improve clarity. --- docs/CONTRIBUTING.md | 99 +++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 62 deletions(-) diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 353a716..46b1e78 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -8,49 +8,39 @@ Want to contribute? Whether you're creating new integrations, improving performa --- -## 💪🏽 Quick Start Guide - -Ready to jump in? Follow this quick setup guide to get started smoothly: +## ✨ Steps to Contribute + +**Ready to jump in? Follow this quick setup guide to get started smoothly** 1. **Fork** the repo and clone your fork. -2. Navigate to the project folder: - ```bash - cd purecpp - ``` - -3. 
Make sure you have the following packages installed: -- GCC/G++ 13.1 -- CMake 3.22+ -- Conan 2 -- Rust -- Python 3.8+ -4. Install the required dependencies:`: -Depending on your system, you may need: +2. [Environment setup](/README.md#environment-setup-ubuntu--debian-for-c-and-python-development) - ```bash - sudo apt update && sudo apt install -y gcc-13 g++-13 cmake conan rustc cargo - ``` -5. Install development conan dependencies:: +3. [How to build](/README.md#how-to-build) - ```bash - !pip install conan==2.* - ``` +**All set! Now... it is time to build something powerful**. -6. Run the tests to ensure everything is working:: +1. **Work on your feature or bugfix**, ensuring you have unit tests covering your code. +2. **Commit** your changes, then push them to your fork. ```bash - ./tests/run_tests - ``` + git push origin your-feature-branch + ```` +3. **Open a pull request** on GitHub. - All set! Now it's time to build something powerful. If you need more details, check out the [Development Guidelines](#-Development-Guidelines). --- -## Community Discord -Join our community [Discord](https://discord.gg/8eF9v78Ndv) to ask questions, get support, and collaborate with fellow contributors and users. +--- +## Community Discord [![Join us on Discord ](https://img.shields.io/badge/Join_Us_On_Discord-5865F2?logo=discord&logoColor=white&style=for-the-badge)](https://discord.gg/8eF9v78Ndv) +- Ask questions and get support +- Share feedback and suggestions +- Connect with the team and other users + +--- +--- -## ⚡ What Can You Contribute To? +## ⚡ What to Contribute? There are many ways to contribute to **PureCPP**—whether you're a **C++ expert** or just starting out with **high-performance computing**. Here, we focus on **performance, efficiency, and scalability**. Your contributions are always welcome! 
@@ -59,19 +49,22 @@ There are many ways to contribute to **PureCPP**—whether you're a **C++ expert Help us improve **PureCPP** by contributing to our core modules and making the framework even more optimized. - **New Integrations** (e.g., support for new compilers, optimized bindings, high-performance libraries) -- **Memory Management**, **Parallelism (Threads and CUDA)**, **Matrix and Tensor Operations** +- **Memory Management** ✔️ +- **Parallelism (Threads/OpenMP)** ✔️ +- **Parallelism (CUDA)** +- **Matrix and Tensor Operations** - **Advanced Chunking Techniques** to optimize processing - **Efficient Metadata Extraction and Management** ## 2. ⚙️ Dataloaders and Smart Storage - **Optimized Dataloaders** for different file types and databases -- **Efficient indexing and retrieval** -- **Smart loading strategies to optimize search performance** +- **Efficient indexing and retrieval** ✔️ +- **Smart loading strategies to optimize search performance** ✔️+- ## 3. 🏎️ Vector Database and LLMs -- Implementation and optimization of **high-performance vector databases** +- Implementation and optimization of **high-performance vector databases** ✔️ - **Integration of LLMs** and embedding models for semantic search - Support for **quantization, fine-tuning, and CUDA optimizations** @@ -87,8 +80,6 @@ If you’ve used **PureCPP** in an innovative way, share your examples and contr Got a different idea? We’re open to tests and new approaches—experiment and submit a PR! - - --- ## 🚀 **Next Steps: What Are We Planning?** @@ -110,38 +101,22 @@ We are always evolving! Here are the next steps to make our pipelines even more 💡 **Got an idea?** Your contribution is more than welcome! Join us and help take this project even further. 🚀 --- - - -## ✨ Steps to Contribute - -1. **Fork** the repository on GitHub. -2. **Clone** your fork to your local machine. - ```bash - git clone https://github.com/pureai-ecosystem/purecpp.git - ``` -3. **Create a branch** for your work. 
- ```bash - git checkout -b your-feature-branch - ``` -4. **Set up your environment** -5. **Work on your feature or bugfix**, ensuring you have unit tests covering your code. -6. **Commit** your changes, then push them to your fork. - ```bash - git push origin your-feature-branch - ``` -7. **Open a pull request** on GitHub. - -Obrigado! - --- - ## ⚡ **Acknowledgements: Built with Pure Performance** -Big thanks for being part of **PureCPP**—where every bit counts, and every byte makes a difference! 🚀 +Big thanks for being part of **PureCPP**— where every bit counts, and every byte makes a difference! 🚀 Whether you're optimizing loops, fine-tuning embeddings, or pushing parallel processing to the limit, your contributions fuel the engine of **high-performance computing**. We’re not just writing code—we’re compiling the future. 🔥 -Keep coding at full speed! 🏎️💻 \ No newline at end of file +Keep coding at full speed! 🏎️💻 + +--- + +Thank You! Gracias! 謝謝! 감사해요! ありがとう! Спасибо! Obrigado! + + +--- +--- From c99f7e4fdac97d479b3574bbb50d7f4cff6cd5ef Mon Sep 17 00:00:00 2001 From: BRUNO B ZAFFARI Date: Sun, 5 Oct 2025 09:55:46 -0300 Subject: [PATCH 60/65] Revise README structure and content Updated sections in README.md for clarity and added new topics. 
--- README.md | 84 ++++++++----------------------------------------------- 1 file changed, 11 insertions(+), 73 deletions(-) diff --git a/README.md b/README.md index 3a5269f..46c91f6 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,8 @@ Designed for maximum performance and scalability, it integrates vector search, O * **1.** [Environment Setup](#environment-setup) - [Docker](#docker) - [Local](#local) -* **2.** [Build](#build) -* **3.** [Using Pre-trained Models](#use-pre-trained-models) +* **2.** [Build & Testing](#build) +* **3.** [Publishing to PyPI](#publishing-to-pypi) --- @@ -35,23 +35,9 @@ Designed for maximum performance and scalability, it integrates vector search, O └── README.md ```` -### Documentation - -For a detailed explanation of the features, please refer to our 🔗 [official documentation](https://docs.puredocs.org/setup). - -### Contributing to PureCPP - -We welcome contributions to **PureCPP**! - -**If you would like to contribute, please read our 👉 [contribution guide](/community/CONTRIBUTING.md).** - -## Quick Start with PIP - -To install the package via `pip` **(for end-users)**: - -```bash -pip install purecpp -``` +- **[Quick Start ↗](https://docs.puredocs.org/setup)** +- **[Contributing to PureCPP ↗](docs/CONTRIBUTING.md)** +- **[Download Pre-trained Models ↗](./models/README.md)** --- @@ -98,7 +84,7 @@ cd purecpp ./scripts/env_config.sh ``` - *This install python essential package, libtorch, FAISS, and configure Conan* + *This script automates the setup. Installing Python essentials, LibTorch, FAISS, and configuring Conan profile* > [!CAUTION] > @@ -123,7 +109,6 @@ cd purecpp > > * **Python** ≥ 3.8 > * **CMake** ≥ 3.22 -> * **Rust** > * **GCC/G++** ≥ 13 #### 1. Installing dependencies @@ -159,6 +144,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ```bash source ~/.cargo/env ``` + *This places cargo and rustc in /root/.cargo & activate Rust Environment* #### 3. 
Execute the `env_config.sh` @@ -176,15 +162,16 @@ source ~/.cargo/env ```bash source venv/bin/activate ``` + *This practice minimizes conflicts between global packages and project-specific requirements.* - This practice minimizes conflicts between global packages and project-specific requirements. Use the steps below to create and activate the virtual environment. +**Then run `env_config.sh` script** ```bash chmod +x scripts/*.sh ./scripts/env_config.sh ``` -*This install python essential package, libtorch, FAISS, and configure Conan* +*This script automates the setup. Installing Python essentials, LibTorch, FAISS, and configuring Conan profile* --- @@ -202,9 +189,7 @@ chmod +x build.sh - Compiles the code - Sends the `RagPUREAI.*.so` output to[`Sandbox/`](/Sandbox) ---- - -## Testing Locally +## **Testing** The `build.sh` script will place the resulting libraries inside [`Sandbox/`](/Sandbox) @@ -223,53 +208,6 @@ To test the Python bindings: --- --- -# Use pre-trained models - -### 🛠️ Hugging Face to **ONNX** Converter - -These Python scripts convert Hugging Face models into the ONNX format for optimized inference. - -These scripts handle two primary use cases: -1. **Feature extraction models** (e.g., `sentence-transformers`). -2. **Token classification models** (e.g., Named Entity Recognition - NER). - -It automatically creates a `models` directory (in the parent folder of the script) to store the exported ONNX models and related assets. 
- -### Requirements - - *Before running the script, make sure you have the following Python packages installed:* - ```bash - pip install torch transformers onnx onnxruntime optimum - ``` - -### Examples - -```bash -python3 models/hf_model_to_onnx.py -m="dbmdz/bert-large-cased-finetuned-conll03-english" -o="bert-large-cased-finetuned-conll03-english" -``` - -```bash -python3 models/hf_model_to_onnx.py -m="sentence-transformers/all-MiniLM-L6-v2" -o="sentence-transformers/all-MiniLM-L6-v2" -``` - -### Output - -``` -./models/ - ├── hf_extract_model.py - ├── hf_model_to_onnx.py - ├── sentence-transformers/all-MiniLM-L6-v2/ - │ ├── model.onnx (via optimum) - │ └── tokenizer/ - └── dslim/bert-base-NER/ - ├── model.onnx - ├── label_map.json - └── tokenizer/ -``` - ---- ---- - ## Publishing to PyPI To build and upload the Python package to PyPI: From 2a9e09a6d9d785d7154865a8336602ab6b1c0f8a Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Tue, 28 Oct 2025 10:51:31 -0300 Subject: [PATCH 61/65] Refactor env_config.sh for improved logging and colors --- scripts/env_config.sh | 52 ++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/scripts/env_config.sh b/scripts/env_config.sh index 4789352..f8f2091 100755 --- a/scripts/env_config.sh +++ b/scripts/env_config.sh @@ -1,38 +1,54 @@ #!/usr/bin/env bash set -euo pipefail -#----------------------------------------- -#================= LOGGING =============== -#----------------------------------------- +#================= COLORS ================= +GREEN='\033[0;32m' +CYAN='\033[0;36m' +YELLOW='\033[1;33m' +RESET='\033[0m' + +#================= FORMATTING ============= TAG="[$(basename "${BASH_SOURCE[0]}")]" -LINE_BRK="\n\n" +LINE_BRK=$'\n\n' SEGMENT="===========================================================\n" -printf "$SEGMENT$SEGMENT$SEGMENT" -printf " $TAG$LINE_BRK" -printf "$SEGMENT" -printf "$LINE_BRK" -#----------------------------------------- +#================= LOGGER 
FUNCS =========== +log_start() { + local section="$1" + printf "${CYAN}${SEGMENT}${SEGMENT}${SEGMENT}" + printf " Begin [$section] ${TAG}${LINE_BRK}" + printf "${SEGMENT}${RESET}" +} +log_end() { + local section="$1" + printf "${YELLOW}${SEGMENT}" + printf " Finish [$section]${LINE_BRK}" + printf "${SEGMENT}${SEGMENT}${SEGMENT}${RESET}" +} +#========================================== + + + +#────────────────────────────────────────── +log_start "ENV SETUP SCRIPT" -#----------------------------------------- SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +echo -e "$GREEN[INFO] Resolved SCRIPT_DIR: $SCRIPT_DIR$RESET" +log_start "PIP" pip install build conan cmake requests pybind11 -"$SCRIPT_DIR/setting_conan_profile.sh" +"$SCRIPT_DIR/setting_conan_profile.sh" "$SCRIPT_DIR/torch_installer.sh" - "$SCRIPT_DIR/faiss_installer.sh" #----------------------------------------- -#----------------------------------------- -#================= ENDING ================ -#----------------------------------------- -printf " END\n" -printf "$SEGMENT$SEGMENT$SEGMENT\n" -#----------------------------------------- +#────────────────────────────────────────── +log_end "ENV SETUP SCRIPT" +printf "\n$CYAN\n$SEGMENT$SEGMENT$SEGMENT\n $RESET" +#----------------------------------------- \ No newline at end of file From a58cce53fff63785aefbac79fa8231012d4208e3 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Tue, 28 Oct 2025 10:54:12 -0300 Subject: [PATCH 62/65] Enhance logging and package installation in installer script Updated logging functions and improved output messages for better clarity. Adjusted package installation commands for consistency across package managers. 
--- scripts/faiss_installer.sh | 195 ++++++++++++++++++++----------------- 1 file changed, 104 insertions(+), 91 deletions(-) diff --git a/scripts/faiss_installer.sh b/scripts/faiss_installer.sh index e99b496..3708d98 100755 --- a/scripts/faiss_installer.sh +++ b/scripts/faiss_installer.sh @@ -1,44 +1,80 @@ #!/usr/bin/env bash -# Script to install and build FAISS (CPU-only) for C++ usage -# Compatible with CentOS/RHEL systems using yum -# It clones FAISS into libs/faiss/ and builds it with CMake - set -euo pipefail -#----------------------------------------- -#================= LOGGING =============== -#----------------------------------------- +#============= COLORS ============= +GREEN='\033[0;32m' +CYAN='\033[0;36m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +RESET='\033[0m' +#============= FORMATTING ============= TAG="[$(basename "${BASH_SOURCE[0]}")]" -LINE_BRK="\n\n" +LINE_BRK=$'\n\n' SEGMENT="===========================================================\n" -printf "$SEGMENT$SEGMENT$SEGMENT" -printf " $TAG$LINE_BRK" -printf "$SEGMENT" -printf "$LINE_BRK" -#----------------------------------------- +#============= LOGGER FUNCS ============= +log_start() { + local section="$1" + printf "${CYAN}${SEGMENT}${SEGMENT}${SEGMENT}" + printf " Begin: [$section] ${TAG}${LINE_BRK}" + printf "${SEGMENT}${RESET}" +} + +log_end() { + local section="$1" + printf "${YELLOW}${SEGMENT}" + printf " Finish [$section]${LINE_BRK}" + printf "${SEGMENT}${SEGMENT}${SEGMENT}${RESET}" +} + +normal_log() { + local message="$1" + local color="$RESET" + + if [ $# -ge 2 ]; then + color="$2" + fi + + if [ $# -ge 3 ]; then + repeat="$3" + fi + + # Repeat SEGMENT lines to frame the message visually + + for ((i = 0; i < repeat; i++)); do + echo -e "${color}${SEGMENT}${RESET}" + done + + echo -e "${color} $message${LINE_BRK}${RESET}" + + for ((i = 0; i < repeat; i++)); do + echo -e "${color}${SEGMENT}${RESET}" + done +} +#========================================== + + 
+#────────────────────────────────────────── +log_start "DETECT ROOT PRIVILEGES" -# ───────────────────────────────────────────────────────────────────────────── -# Elevation helper: use sudo only when needed and available -# ───────────────────────────────────────────────────────────────────────────── SUDO="" if [[ "$(id -u)" -ne 0 ]]; then if command -v sudo >/dev/null 2>&1; then SUDO="sudo" else - echo "[!] Not running as root and 'sudo' is not available. - Re-run as root or install sudo." >&2 + echo -e "${RED}[!] Not running as root and 'sudo' is not available. Re-run as root or install sudo.${RESET}" exit 1 fi fi +log_end "DETECT ROOT PRIVILEGES" -# ───────────────────────────────────────────────────────────────────────────── -# Detect package manager -# ───────────────────────────────────────────────────────────────────────────── +#────────────────────────────────────────── +log_start "DETECT PACKAGE MANAGER" + PKG_MANAGER="" if command -v apt-get >/dev/null 2>&1; then PKG_MANAGER="apt" @@ -47,121 +83,98 @@ elif command -v yum >/dev/null 2>&1; then PKG_MANAGER="yum" echo "[pkg] Detected YUM-based system (manylinux/CentOS-like)" else - echo "[x] Unsupported system: neither apt-get nor yum found." >&2 + echo -e "${RED}[x] Unsupported system: neither apt-get nor yum found.${RESET}" >&2 exit 1 fi +log_end "DETECT PACKAGE MANAGER" -# ───────────────────────────────────────────────────────────────────────────── -# SETUP: Define directories -# ───────────────────────────────────────────────────────────────────────────── +#────────────────────────────────────────── +log_start "SETUP DIRECTORIES" -# Assume the current directory is the project root PROJ_DIR=$(pwd) - -# Destination directory for FAISS FAISS_DIR="${PROJ_DIR}/libs/faiss" - -echo "Creating libs/faiss/ directory inside the project..." 
mkdir -p "$FAISS_DIR" +echo "[INFO] Created: $FAISS_DIR" +log_end "SETUP DIRECTORIES" -# ───────────────────────────────────────────────────────────────────────────── -# SYSTEM: Update packages and install dependencies -# ───────────────────────────────────────────────────────────────────────────── +#────────────────────────────────────────── +log_start "INSTALL DEPENDENCIES" -echo "Updating system packages..." - - -echo "[pkg] Installing required development packages..." if [[ "$PKG_MANAGER" == "apt" ]]; then $SUDO apt-get update -y - $SUDO apt install libgflags-dev -y - $SUDO apt install -y cmake g++ libopenblas-dev python3-dev build-essential git -# $SUDO apt-get install -y \ -# cmake g++ libopenblas-dev libgflags-dev build-essential \ -# python3-dev git unzip wget pkg-config ninja-build binutils - + $SUDO apt-get install -y \ + cmake g++ libopenblas-dev libgflags-dev \ + python3-dev build-essential git + else - echo "Checking if EPEL is installed..." - if ! rpm -q epel-release >/dev/null 2>&1; then - echo "Installing EPEL repository..." - yum install -y epel-release - fi + echo "[INFO] Checking if EPEL is installed..." + if ! rpm -q epel-release >/dev/null 2>&1; then + echo "[INFO] Installing EPEL repository..." + $SUDO yum install -y epel-release + fi - $SUDO yum update -y - $SUDO yum groupinstall -y "Development Tools" - $SUDO yum install -y cmake3 gcc-c++ openblas-devel python3-devel git - $SUDO yum install gflags-devel -y -# $SUDO yum install -y \ -# gcc gcc-c++ cmake3 make cmake git curl wget ninja-build \ -# libffi-devel openssl-devel protobuf-devel gflags-devel \ -# zlib-devel unzip openblas-devel pkgconf-pkg-config binutils + $SUDO yum update -y + $SUDO yum groupinstall -y "Development Tools" + $SUDO yum install -y \ + cmake3 gcc-c++ openblas-devel python3-devel git gflags-devel fi - -# Ensure `cmake` command exists, link it to `cmake3` if missing if ! command -v cmake >/dev/null && command -v cmake3 >/dev/null; then - echo "Linking cmake3 to cmake..." 
- ln -s /usr/bin/cmake3 /usr/bin/cmake + echo "[INFO] Linking cmake3 to cmake..." + $SUDO ln -sf /usr/bin/cmake3 /usr/bin/cmake fi +log_end "INSTALL DEPENDENCIES" + +#────────────────────────────────────────── +log_start "BUILD FAISS (CPU ONLY)" -# ───────────────────────────────────────────────────────────────────────────── -# BUILD: Configure and compile FAISS (CPU-only) -# ───────────────────────────────────────────────────────────────────────────── cd "$FAISS_DIR" -echo "Configuring CMake for CPU-only FAISS build..." cmake -B build \ - -DFAISS_ENABLE_GPU=OFF \ - -DFAISS_ENABLE_PYTHON=OFF \ - -DFAISS_ENABLE_TESTS=OFF \ - -DCMAKE_BUILD_TYPE=Release - -echo "Building FAISS..." + -DFAISS_ENABLE_GPU=OFF \ + -DFAISS_ENABLE_PYTHON=OFF \ + -DFAISS_ENABLE_TESTS=OFF \ + -DCMAKE_BUILD_TYPE=Release cmake --build build --parallel 3 -# cmake --build build -j$(nproc) - cd "$PROJ_DIR" -echo "FAISS has been successfully built." - +log_end "BUILD FAISS (CPU ONLY)" -# ───────────────────────────────────────────────────────────────────────────── -# VERIFY: Locate compiled library and headers -# ───────────────────────────────────────────────────────────────────────────── +#────────────────────────────────────────── +log_start "VERIFY BUILD" -# Find the libfaiss library (static or shared) FOUND_LIB=$(find "$FAISS_DIR/build/faiss" -name "libfaiss.*" | head -n 1) if [ -f "$FOUND_LIB" ]; then - echo "Header files located at: $FAISS_DIR/faiss/" - echo "Library file found at: $FOUND_LIB" + echo "[OK] Header files at: ${FAISS_DIR}/faiss/" + echo "[OK] Library file at: ${FOUND_LIB}" else - echo "Warning: libfaiss was not found in the expected directory." 
+ echo -e "${RED}[WARN] libfaiss not found in expected build directory.${RESET}" fi +log_end "VERIFY BUILD" -# ───────────────────────────────────────────────────────────────────────────── -# INFO: How to link FAISS in your C++ CMake project -# ───────────────────────────────────────────────────────────────────────────── +#────────────────────────────────────────── +log_end "FAISS INSTALLATION" +#────────────────────────────────────────── + + +echo -e "$CYAN" +echo "LINKING INSTRUCTIONS" echo "" echo "You can now link FAISS in your C++ project using:" echo "" -echo ' include_directories(${CMAKE_SOURCE_DIR}/libs/faiss/faiss)' +echo ' include_directories(${CMAKE_SOURCE_DIR}/libs/faiss/faiss) ' echo ' link_directories(${CMAKE_SOURCE_DIR}/libs/faiss/build/faiss)' echo ' target_link_libraries(your_target PRIVATE faiss)' - -#----------------------------------------- -#================= ENDING ================ -#----------------------------------------- -printf "$SEGMENT$SEGMENT$SEGMENT" -printf "\n" +echo -e "$RESET" \ No newline at end of file From 1f13953f33a77204cadc385fcf333c65fc38bc87 Mon Sep 17 00:00:00 2001 From: bbzaffari Date: Tue, 28 Oct 2025 10:56:44 -0300 Subject: [PATCH 63/65] Improve logging and formatting in setting_conan_profile.sh Refactored the script to enhance readability and debugging. Logging output now uses consistent formatting with color codes and aligned indentation. Changes include: - Added default color fallback for log messages - Improved segment repetition logic for better visual structure - General cleanup and comment clarifications This improves maintainability and user feedback during script execution. 
--- scripts/setting_conan_profile.sh | 93 +++++++++++++++++++------------- 1 file changed, 55 insertions(+), 38 deletions(-) diff --git a/scripts/setting_conan_profile.sh b/scripts/setting_conan_profile.sh index 2242b4f..0f07364 100755 --- a/scripts/setting_conan_profile.sh +++ b/scripts/setting_conan_profile.sh @@ -1,42 +1,64 @@ -#!/bin/bash - +#!/usr/bin/env bash set -euo pipefail -#----------------------------------------- -#================= LOGGIN ================ -#----------------------------------------- +#================= COLORS ================= +GREEN='\033[0;32m' +CYAN='\033[0;36m' +YELLOW='\033[1;33m' +RESET='\033[0m' + +#================= FORMATTING ============= TAG="[$(basename "${BASH_SOURCE[0]}")]" -LINE_BRK="\n\n" +LINE_BRK=$'\n\n' SEGMENT="===========================================================\n" -printf "$SEGMENT$SEGMENT$SEGMENT" -printf " $TAG$LINE_BRK" -#----------------------------------------- +#================= LOGGER FUNCS =========== +log_start() { + local section="$1" + printf "${CYAN}${SEGMENT}${SEGMENT}${SEGMENT}" + printf " Begin: [$section] ${TAG}${LINE_BRK}" + printf "${SEGMENT}${RESET}" +} + +log_end() { + local section="$1" + printf "${YELLOW}${SEGMENT}" + printf " Finish [$section]${LINE_BRK}" + printf "${SEGMENT}${SEGMENT}${SEGMENT}${RESET}" +} +#========================================== + -#----------------------------------------- -printf "$SEGMENT" -printf "$TAG conan profile detect --force\n" +#────────────────────────────────────────── +log_start "CONAN DETECT" + +echo -e "$GREEN[INFO] Running: conan profile detect --force$RESET" conan profile detect --force -#----------------------------------------- -#----------------------------------------- -printf "$LINE_BRK$SEGMENT" -printf "$TAG Finding\n" +log_end "CONAN DETECT" + + +#────────────────────────────────────────── +log_start "LOCATE PROFILE DIR" PROFILE_DIR=$(find . 
-type d -wholename "*/.conan2/profiles" | head -n 1 || true) -[ -z "$PROFILE_DIR" ] && PROFILE_DIR="$HOME/.conan2/profiles" && mkdir -p "$PROFILE_DIR" +if [ -z "$PROFILE_DIR" ]; then + PROFILE_DIR="$HOME/.conan2/profiles" + echo -e "$GREEN[INFO] Defaulting to: $PROFILE_DIR$RESET" + mkdir -p "$PROFILE_DIR" +else + echo -e "$GREEN[INFO] Found profile dir at: $PROFILE_DIR$RESET" +fi -printf "$TAG Found at $PROFILE_DIR\n" +log_end "LOCATE PROFILE DIR" -#----------------------------------------- -#----------------------------------------- -printf "$LINE_BRK$SEGMENT" -printf "$TAG Writing default profile$LINE_BRK" +#────────────────────────────────────────── +log_start "WRITE PROFILE" +DEFAULT_PROFILE="$PROFILE_DIR/default" -# Old Setup (New was set to compiler.cppstd=20 and compiler.version=13. But was resulting in issues.) -cat << EOF > "$PROFILE_DIR/default" +cat << EOF > "$DEFAULT_PROFILE" [settings] arch=x86_64 build_type=Release @@ -47,22 +69,17 @@ compiler.version=11 os=Linux EOF -printf "$LINE_BRK$SEGMENT" -#----------------------------------------- +echo -e "$GREEN[INFO] Profile written to: $DEFAULT_PROFILE$RESET" -printf "$TAG Profile created in: $PROFILE_DIR/default\n" -printf "$TAG Checking: cat < $PROFILE_DIR/default $LINE_BRK" +log_end "WRITE PROFILE" -cat < $PROFILE_DIR/default -printf "$LINE_BRK" -printf "$SEGMENT\n" +#────────────────────────────────────────── +log_start "VERIFY PROFILE" -printf "\nHard-check with: cat < $PROFILE_DIR/default$LINE_BRK" +echo -e "$GREEN[INFO] Displaying contents of: $DEFAULT_PROFILE" +echo -e "$CYAN" +cat "$DEFAULT_PROFILE" +echo -e "$RESET" -#----------------------------------------- -#================= ENDING ================ -#----------------------------------------- -printf "$SEGMENT$SEGMENT$SEGMENT" -printf "\n\n\n\n\n". 
-#----------------------------------------- +log_end "VERIFY PROFILE" From f17e3923898725d14fb38fab67f555b4ec9ba15e Mon Sep 17 00:00:00 2001 From: BRUNO B ZAFFARI Date: Sat, 1 Nov 2025 12:33:04 -0300 Subject: [PATCH 64/65] Refactor build script for improved logging and formatting --- build.sh | 75 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/build.sh b/build.sh index f5117b8..002f495 100755 --- a/build.sh +++ b/build.sh @@ -4,6 +4,10 @@ set -euo pipefail cd src/ +#================= COLORS ================= +GREEN='\033[0;32m' +RESET='\033[0m' + #----------------------------------------- #================= LOGGING =============== #----------------------------------------- @@ -14,16 +18,15 @@ SEGMENT=$'===========================================================\n' #----------------------------------------- printf "$SEGMENT$SEGMENT$SEGMENT" -printf " Begin $TAG$LINE_BRK" -printf "$SEGMENT" -printf "$LINE_BRK" +printf " Begin: $TAG$LINE_BRK" +printf "$SEGMENT$LINE_BRK" #----------------------------------------- -# ───────────────────────────────────────────────────────────────────────────── +# ──────────────────────────────────────── # Smart core splitter for parallel builds -# ───────────────────────────────────────────────────────────────────────────── +# ──────────────────────────────────────── cores=$(nproc) @@ -33,17 +36,16 @@ else half=1 fi printf "$LINE_BRK" -echo "[INFO] Detected $cores cores, using $half for parallel build." -printf "$LINE_BRK" -printf "$SEGMENT" -printf "$SEGMENT" +echo -e "$GREEN[Core splitter] Detected $cores cores, using $half for parallel build. 
" +printf "$LINE_BRK$SEGMENT$SEGMENT" + #----------------------------------------- -# ───────────────────────────────────────────────────────────────────────────── +# ──────────────────────────────────────── # Conan -# ───────────────────────────────────────────────────────────────────────────── +# ──────────────────────────────────────── #----------------------------------------- -printf " Begin [CONAN]$LINE_BRK" +printf " Begin: [CONAN]$LINE_BRK" printf "$SEGMENT" printf "$LINE_BRK" #----------------------------------------- @@ -54,24 +56,20 @@ conan install . --build=missing -c tools.build:jobs=$half # rm -fr ./conan.lock # conan lock create . --build=missing -c tools.build:jobs=$half -#----------------------------------------- -#================= ENDING ================ #----------------------------------------- printf "$SEGMENT" -printf " Finish [CONAN]\n" +printf " [CONAN] Finished \n" printf "$SEGMENT$SEGMENT$SEGMENT\n" -#----------------------------------------- - - +#================= ENDING ================ -# ───────────────────────────────────────────────────────────────────────────── +# ──────────────────────────────────────── # Build -# ───────────────────────────────────────────────────────────────────────────── +# ──────────────────────────────────────── #----------------------------------------- -printf " Begin [Build]$LINE_BRK" -printf "$SEGMENT" -printf "$LINE_BRK" +printf " Begin: [Build]$LINE_BRK" +printf "$SEGMENT$LINE_BRK" #----------------------------------------- +START_TIME=$(date +%s) cmake -DCMAKE_POLICY_DEFAULT_CMP0091=NEW \ -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \ @@ -84,33 +82,36 @@ cmake -DCMAKE_POLICY_DEFAULT_CMP0091=NEW \ -B "$(pwd)/build/Release" \ -G "Unix Makefiles" -cmake --build "$(pwd)/build/Release" --parallel $half -# cmake --build --preset conan-release --parallel $(nproc) --target RagPUREAI +cmake --build "$(pwd)/build/Release" --parallel "$half" #--target RagPUREAI + +END_TIME=$(date +%s) +ELAPSED_TIME=$((END_TIME - 
START_TIME)) +#---------------- LOG -------------------- +echo -e "$GREEN" +echo "============================================================" +echo " Total build time: ${ELAPSED_TIME} s" +echo -e "============================================================$RESET" -#----------------------------------------- -#================= ENDING ================ #----------------------------------------- printf "$SEGMENT" -printf " Finish [Build]\n" +printf " [Build] Finished \n" printf "$SEGMENT$SEGMENT$SEGMENT\n" -#----------------------------------------- - +#================= ENDING ================ -# ───────────────────────────────────────────────────────────────────────────── +# ──────────────────────────────────────── # Sending to Sandbox -# ───────────────────────────────────────────────────────────────────────────── +# ──────────────────────────────────────── -printf "[Last Step] Sending to Sandbox \n" +printf "$GREEN[Last Step] Sending to Sandbox \n" +echo -e "$RESET" rm -f ../Sandbox/*.so cp ./build/Release/RagPUREAI.cpython*.so ../Sandbox/ -#----------------------------------------- -#================= ENDING ================ #----------------------------------------- printf "$SEGMENT" -printf " Finish $TAG\n" +printf " $TAG Finished \n" printf "$SEGMENT$SEGMENT$SEGMENT\n" -#----------------------------------------- +#================= ENDING ================ From 81f1ee398e81197e8ee9710608bf1f2b208ab645 Mon Sep 17 00:00:00 2001 From: BRUNO B ZAFFARI Date: Sat, 1 Nov 2025 12:38:13 -0300 Subject: [PATCH 65/65] Fix log formatting in build.sh --- build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index 002f495..7efc266 100755 --- a/build.sh +++ b/build.sh @@ -88,9 +88,9 @@ END_TIME=$(date +%s) ELAPSED_TIME=$((END_TIME - START_TIME)) #---------------- LOG -------------------- echo -e "$GREEN" -echo "============================================================" +echo 
"===========================================================" echo " Total build time: ${ELAPSED_TIME} s" -echo -e "============================================================$RESET" +echo -e "===========================================================$RESET" #----------------------------------------- printf "$SEGMENT"