diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index e89e21a..0000000 --- a/.dockerignore +++ /dev/null @@ -1,62 +0,0 @@ -build -CMakeUserPresets.json -bkp -CMakeLists.txt.user -conan.lock -libs*/libtorch -libs*/libtorch - Copy -libs*/libtorch new c11 -models/ -libtorch-cxx11-abi-shared* -openai_api_key* -libtorch-cxx11-abi-shared-with-deps-2.5.1+cpu.zip -libtorch-cxx11-abi-shared-with-deps-2.5.1+cu124.zip -*/libtorch*.zip -*/openai-cpp/* -*/tokenizers-cpp/* -*/faiss/* -venv - -*/purecpp_chuncks_clean/purecpp_chuncks_clean/*.so -*/purecpp_chuncks_clean2/purecpp_chuncks_clean/*.so -*/purecpp_chunks_clean/purecpp_chunks_clean/*.so -*/purecpp_chunks_clean2/purecpp_chunks_clean/*.so -*/purecpp_extract/purecpp_extract/*.so -*/purecpp_embed/purecpp_embed/*.so -*/purecpp_meta/purecpp_meta/*.so -*/purecpp_meta/purecpp_meta_t/*.so -*/purecpp_libs/purecpp_libs/*.so - -*/*/*.egg-info -*/*/build/* -*/*/dist/* - -"testes wsl"/*.so -"testes many linux"/*.so -testes/*.so -testes/"modulos old"/*.so -testes/"modulos pip"/*.so - -./*.so -./*/*.so -./*/*/*.so -./*/*/*/*.so -./*/*/*/*/*.so -*.so -*/*.so -*/*/*.so -*/*/*/*.so -*/*/*/*/*.so -*.whl -*/*.whl -*/*/*.whl -*/*/*/*.whl -*/*/*/*/*.whl -*/libtorch - -*/build/* -*/conan.lock -*/CMakeUserPresets.json -*/Tests/* - -./*.zip \ No newline at end of file diff --git a/.gitignore b/.gitignore index af30c75..c681f2f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,40 +1,38 @@ -### C++ ### # Prerequisites -*.d +**/*.d # Compiled Object files -*.slo -*.lo -*.o -*.obj +**/*.slo +**/*.lo +**/*.o +**/*.obj # Precompiled Headers -*.gch -*.pch +**/*.gch +**/*.pch # Compiled Dynamic libraries -*.so -*.dylib -*.dll +**/*.so +**/*.dylib +**/*.dll # Fortran module files -*.mod -*.smod +**/*.mod +**/*.smod # Compiled Static libraries -*.lai -*.la -*.a -*.lib +**/*.lai +**/*.la +**/*.a +**/*.lib # Executables -*.exe -*.out -*.app +**/*.exe +**/*.out +**/*.app ### Python ### -pycache/ -__pycache__/ +*pycache*/ .pytest_cache/ 
test_bindings.py .pyd @@ -44,37 +42,35 @@ CMakeUserPresets.json CMakeFiles/ bkp/ -build/ -models/ -libs/libtorch -!libs/tokenizers-cpp -!libs/openai-cpp +**/build/ + +# Ignore everything in the models folder +models/* +!models/hf_extract_model.py +!models/hf_model_to_onnx.py +!models/README.md libtorch*.zip -libtorch-cxx11-abi-shared* -libtorch-cxx11-abi-shared-with-deps-2.5.1+cpu.zip +libs/libtorch -models -conan.lock libtorch/ extern/* -openai_api_key* +**/*openai_api_key* -package/build/ -package/dist/ +**/build/* +**/dist/* packer -package/*.egg-info +**/*.egg-info .env -.venv/ -.venv*/ -.vscode/ -.vs/ +.venv/* +.venv*/* +.vscode/* +.vs/* +venv -tests/ compile_commands.json .cache/ -*.pdf -*.docx -test* \ No newline at end of file +**/*.pdf +**/*.docx \ No newline at end of file diff --git a/.gitmodules b/.gitmodules index 49add60..ce0490b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,13 +1,10 @@ [submodule "libs/tokenizers-cpp"] path = libs/tokenizers-cpp url = https://github.com/mlc-ai/tokenizers-cpp.git + [submodule "libs/openai-cpp"] path = libs/openai-cpp url = https://github.com/olrea/openai-cpp.git -[submodule "extern/pybind11"] - path = extern/pybind11 - url = ../../pybind/pybind11 - branch = stable -[submodule "tokenizers-cpp"] - path = tokenizers-cpp - url = https://github.com/mlc-ai/tokenizers-cpp.git +[submodule "libs/faiss"] + path = libs/faiss + url = https://github.com/facebookresearch/faiss.git diff --git a/CMakeLists.txt b/CMakeLists.txt deleted file mode 100755 index 7dcd48c..0000000 --- a/CMakeLists.txt +++ /dev/null @@ -1,210 +0,0 @@ -cmake_minimum_required(VERSION 3.22) -project(RagPUREAI VERSION 1.0) - -# General build settings -set(CMAKE_CXX_STANDARD 23) -set(CMAKE_CXX_STANDARD_REQUIRED True) -set(CMAKE_BUILD_TYPE "Release") -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -# Toolchain -if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") - set(CMAKE_TOOLCHAIN_FILE ${CMAKE_BINARY_DIR}/generators/conan_toolchain.cmake) - set(CMAKE_CXX_FLAGS_RELEASE "/Od") 
-else() - set(CMAKE_TOOLCHAIN_FILE ${CMAKE_BINARY_DIR}/Release/generators/conan_toolchain.cmake) - set(CMAKE_CXX_FLAGS_RELEASE "-O0") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") -endif() - -option(CURL_STATIC_LINKING "Set to ON to build libcurl with static linking." OFF) -option(BUILD_APPS "Build apps" OFF) - -# Python & Pybind11 -find_package(Python3 REQUIRED COMPONENTS Interpreter Development) -include_directories(${Python3_INCLUDE_DIRS}) -find_package(pybind11 REQUIRED) - -# External Dependencies -find_package(pdfium REQUIRED) -find_package(ICU REQUIRED) -find_package(miniz REQUIRED) -find_package(rapidxml REQUIRED) -find_package(beauty REQUIRED) -find_package(lexbor REQUIRED) -find_package(OpenMP REQUIRED) -find_package(re2 REQUIRED) -find_package(nlohmann_json REQUIRED) -find_package(CURL REQUIRED) -find_package(onnxruntime REQUIRED) -find_package(redis++ REQUIRED) - -execute_process( - COMMAND "${Python3_EXECUTABLE}" -c - "import sysconfig; import pathlib; site_packages = sysconfig.get_paths()['purelib']; print(str(pathlib.Path(site_packages).resolve()))" - OUTPUT_VARIABLE PYTHON_SITE_PACKAGES - OUTPUT_STRIP_TRAILING_WHITESPACE -) - -# Protobuf -find_package(Protobuf REQUIRED) -include_directories(${Protobuf_INCLUDE_DIRS}) - -# Torch -set(Torch_DIR "${CMAKE_SOURCE_DIR}/libs/libtorch/cpu/share/cmake/Torch") -find_package(Torch REQUIRED) -include_directories("${CMAKE_SOURCE_DIR}/libs/libtorch/cpu/include") -link_directories("${CMAKE_SOURCE_DIR}/libs/libtorch/cpu/lib") - -# Tokenizers -set(TOKENIZERS_PATH "${CMAKE_SOURCE_DIR}/libs/tokenizers-cpp") -add_subdirectory(${TOKENIZERS_PATH} tokenizers EXCLUDE_FROM_ALL) - -# OpenAI -set(OPEANAI_CPP_PATH "${CMAKE_SOURCE_DIR}/libs/openai-cpp") - -# RPATH -set(CMAKE_BUILD_RPATH - "${PYTHON_SITE_PACKAGES}/*/d_libs/libtorch/cpu/lib" - "\$ORIGIN/purecpp.libs" - "\$ORIGIN/d_libs/libtorch/cpu/lib" - "${CMAKE_SOURCE_DIR}/libs/libtorch/cpu/lib" - "\$ORIGIN/libs/libtorch/cpu/lib" - "\$ORIGIN:/usr/bin/protoc" - 
"/usr/lib/x86_64-linux-gnu" - "/usr/lib64" - "\$ORIGIN/" - "\$ORIGIN/purecpp/d_libs/libtorch/cpu/lib" -) -set(CMAKE_INSTALL_RPATH - "${PYTHON_SITE_PACKAGES}/*/d_libs/libtorch/cpu/lib" - "\$ORIGIN/purecpp.libs" - "\$ORIGIN/d_libs/libtorch/cpu/lib" - "${CMAKE_SOURCE_DIR}/libs/libtorch/cpu/lib" - "\$ORIGIN/libs/libtorch/cpu/lib" - "\$ORIGIN:/usr/bin/protoc" - "/usr/lib/x86_64-linux-gnu" - "/usr/lib64" - "\$ORIGIN/" - "\$ORIGIN/purecpp/d_libs/libtorch/cpu/lib" -) -set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--enable-new-dtags") -set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) - -# Sources -file(GLOB_RECURSE VDB_SRCS ${CMAKE_SOURCE_DIR}/components/VectorDatabase/src/*.cpp) - -set(RagPUREAI_BINDING_SRCS - ${CMAKE_SOURCE_DIR}/src/binding.cpp - ${CMAKE_SOURCE_DIR}/components/VectorDatabase/python/binding_vectordb.cpp -) - -set(RagPUREAI_IMPL_SRCS - ${VDB_SRCS} - ${CMAKE_SOURCE_DIR}/libs/StringUtils/StringUtils.cpp - ${CMAKE_SOURCE_DIR}/libs/CommonStructs/CommonStructs.cpp - ${CMAKE_SOURCE_DIR}/components/DataLoader/BaseLoader.cpp - ${CMAKE_SOURCE_DIR}/components/DataLoader/PDFLoader/PDFLoader.cpp - ${CMAKE_SOURCE_DIR}/components/DataLoader/DOCXLoader/DOCXLoader.cpp - ${CMAKE_SOURCE_DIR}/components/DataLoader/WebLoader/WebLoader.cpp - ${CMAKE_SOURCE_DIR}/components/DataLoader/TXTLoader/TXTLoader.cpp - - ${CMAKE_SOURCE_DIR}/components/MetadataExtractor/MetadataExtractor.cpp - ${CMAKE_SOURCE_DIR}/components/MetadataExtractor/MetadataRegexExtractor/MetadataRegexExtractor.cpp - ${CMAKE_SOURCE_DIR}/components/MetadataExtractor/MetadataHFExtractor/MetadataHFExtractor.cpp - - ${CMAKE_SOURCE_DIR}/components/Embedding/EmbeddingOpenAI/EmbeddingOpenAI.cpp - ${CMAKE_SOURCE_DIR}/components/Embedding/EmbeddingModel/EmbeddingModel.cpp - - ${CMAKE_SOURCE_DIR}/components/Chunk/ChunkCommons/ChunkCommons.cpp - ${CMAKE_SOURCE_DIR}/components/Chunk/ChunkCount/ChunkCount.cpp - ${CMAKE_SOURCE_DIR}/components/Chunk/ChunkDefault/ChunkDefault.cpp - 
${CMAKE_SOURCE_DIR}/components/Chunk/ChunkSimilarity/ChunkSimilarity.cpp - ${CMAKE_SOURCE_DIR}/components/Chunk/ChunkQuery/ChunkQuery.cpp - - ${CMAKE_SOURCE_DIR}/components/CleanData/ContentCleaner/ContentCleaner.cpp - - ${CMAKE_SOURCE_DIR}/components/Chat/Message/HumanMessage.cpp - ${CMAKE_SOURCE_DIR}/components/Chat/Message/AIMessage.cpp - ${CMAKE_SOURCE_DIR}/components/Chat/Message/SystemMessage.cpp - ${CMAKE_SOURCE_DIR}/components/Chat/ChatHistory/ChatHistory.cpp - -) - -# RagPUREAILib -add_library(RagPUREAILib STATIC ${RagPUREAI_IMPL_SRCS}) -target_include_directories(RagPUREAILib PUBLIC - ${CMAKE_SOURCE_DIR}/components - ${CMAKE_SOURCE_DIR}/components/DataLoader - ${CMAKE_SOURCE_DIR}/components/MetadataExtractor - ${CMAKE_SOURCE_DIR}/components/Chunk - ${CMAKE_SOURCE_DIR}/components/CleanData - ${CMAKE_SOURCE_DIR}/components/Embedding - ${CMAKE_SOURCE_DIR}/components/Embedding/EmbeddingOpenAI - ${CMAKE_SOURCE_DIR}/components/Embedding/EmbeddingModel - - ${CMAKE_SOURCE_DIR}/components/Chat - ${CMAKE_SOURCE_DIR}/components/Chat/ChatHistory - ${CMAKE_SOURCE_DIR}/components/Chat/Message - - ${CMAKE_SOURCE_DIR}/components/VectorDatabase/include - - ${CMAKE_SOURCE_DIR}/libs/RagException - ${CMAKE_SOURCE_DIR}/libs/ThreadSafeQueue - ${CMAKE_SOURCE_DIR}/libs/CommonStructs - ${CMAKE_SOURCE_DIR}/libs/StringUtils - ${CMAKE_SOURCE_DIR}/libs/FileUtils - ${CMAKE_SOURCE_DIR}/libs/MemoryUtils - ${TOKENIZERS_PATH}/include - ${OPEANAI_CPP_PATH}/include - ${CMAKE_SOURCE_DIR}/libs/libtorch/cpu/include - ${CURL_INCLUDE_DIRS} - ${TORCH_INCLUDE_DIRS} -) - -target_link_libraries(RagPUREAILib PUBLIC - pdfium::pdfium - icu::icu - miniz::miniz - rapidxml::rapidxml - beauty::beauty - lexbor::lexbor_static - OpenMP::OpenMP_CXX - re2::re2 - nlohmann_json::nlohmann_json - redis++::redis++_static - hiredis::hiredis - onnxruntime::onnxruntime - tokenizers_cpp - protobuf::libprotobuf - CURL::libcurl - ${Python3_LIBRARIES} - ${TORCH_LIBRARIES} -) - -# Binding with Pybind11 
-pybind11_add_module(RagPUREAI ${RagPUREAI_BINDING_SRCS}) -target_link_libraries(RagPUREAI PRIVATE RagPUREAILib) - -# vectordb -pybind11_add_module(vectordb components/VectorDatabase/python/_vectordb.cpp) -target_link_libraries(vectordb PRIVATE - -Wl,--whole-archive - VectorDatabase - -Wl,--no-whole-archive -) - -# Disables LTO/IPO in the module to avoid ODR/refcount problems. - -set_property(TARGET vectordb PROPERTY INTERPROCEDURAL_OPTIMIZATION FALSE) -target_compile_options(vectordb PRIVATE -fno-lto) -target_link_options(vectordb PRIVATE -fno-lto) - -# .so output -set_target_properties(vectordb PROPERTIES - OUTPUT_NAME "vectordb" - LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/python" - ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/python" - RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/python" -) - diff --git a/Dockerfile b/Dockerfile index 12abacb..cd0b212 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,48 +1,22 @@ +# Use the official manylinux image (compatible with Python packaging standards) +FROM quay.io/pypa/manylinux_2_28_x86_64 -FROM python:3.12-slim AS builder -WORKDIR /app +# Set working directory +WORKDIR /home -# Install GCC 13 and other dependencies -RUN apt-get update && \ - apt-get install -y \ - gcc-13 \ - g++-13 \ - libstdc++-13-dev \ - git \ - curl \ - wget \ - cmake \ - nano \ - unzip \ - ninja-build \ - pkg-config \ - libffi-dev \ - libprotobuf-dev \ - protobuf-compiler \ - libgflags-dev \ - libssl-dev \ - sudo \ - build-essential \ - gnupg \ - && apt-get clean && rm -rf /var/lib/apt/lists/* && \ - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 100 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-13 100 +# Install development tools, Python deps, Rust, and cleanup to save space +RUN yum install -y \ + gcc gcc-c++ make git curl wget \ + ninja-build libffi-devel openssl-devel \ + protobuf-devel gflags-devel zlib-devel \ + openblas-devel unzip\ + && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | 
sh -s -- -y \ + && yum clean all \ + && rm -rf /var/cache/yum -COPY .git .git -COPY .gitmodules .gitmodules -COPY scripts/ ./scripts/ -RUN chmod +x -R /app/scripts -RUN mkdir -p /app/libs/openai-cpp /app/libs/tokenizers-cpp +# Add Rust to PATH and Python 3.12 binaries to PATH +ENV PATH="/root/.cargo/bin:/opt/python/cp312-cp312/bin:${PATH}" -# Install Rust -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -ENV PATH="/root/.cargo/bin:${PATH}" - -# Run your scripts -RUN /app/scripts/install_python_dependencies.sh -RUN /app/scripts/install_torch.sh -RUN /app/scripts/install_libs.sh -RUN /app/scripts/configure_conan_profile.sh - -# COPY . . +# Set default shell CMD ["/bin/bash"] + diff --git a/README.md b/README.md index ca01461..46c91f6 100644 --- a/README.md +++ b/README.md @@ -1,131 +1,219 @@ # PureCPP -**PureCPP** is the C++ backend powering the core logic of the RAG (Retrieval-Augmented Generation) system. It provides high-performance native modules that integrate seamlessly with Python via bindings. +[![Status](https://img.shields.io/badge/status-stable-brightgreen?style=flat-square)]() -## Contributing +**PureCPP is a powerful C++ backend architecture for RAG systems.**\ +Designed for maximum performance and scalability, it integrates vector search, ONNX models, and CPU/CUDA acceleration into a seamless, Python-integrated framework. -We welcome contributions to **PureCPP**! +*This repository provides detailed guidance on how to set up the environment, configure dependencies, and build the project.* -Before submitting a pull request or issue, please read our [Contribution Guide](/community/CONTRIBUTING.md). +## πŸ“š Table of Contents +* **1.** [Environment Setup](#environment-setup) + - [Docker](#docker) + - [Local](#local) +* **2.** [Build & Testing](#build) +* **3.** [Publishing to PyPI](#publishing-to-pypi) + +--- ## Project Structure -``` -. 
+```html β”œβ”€β”€ scripts/ # Shell utilities and setup scripts β”œβ”€β”€ package/ # Python package β”‚ └── purecpp/ # Contains the compiled .so -β”œβ”€β”€ build/ # Generated build files -β”œβ”€β”€ libs/ # Third-party dependencies -β”œβ”€β”€ CMakeLists.txt # Main build config +β”œβ”€β”€ libs/ # Dependencies +β”œβ”€β”€ src/ # source files and CMake entry +β”‚ β”œβ”€β”€ build/ # Generated build files +β”‚ β”œβ”€β”€ Conanfile.py # Package manager for C and C++ +β”‚ └── CMakeLists.txt # Main build config +β”œβ”€β”€ models/ +β”‚ β”œβ”€β”€ hf_extract_model.py +β”‚ β”œβ”€β”€ hf_model_to_onnx.py +β”‚ └── / β”œβ”€β”€ Dockerfile # Build environment └── README.md -``` +```` -## Documentation +- **[Quick Start β†—](https://docs.puredocs.org/setup)** +- **[Contributing to PureCPP β†—](docs/CONTRIBUTING.md)** +- **[Download Pre-trained Models β†—](./models/README.md)** -For full installation and setup instructions, visit our official documentation: +--- -πŸ”— [PureCPP Documentation](https://docs.puredocs.org/setup) +## Environment Setup -## Quick Start with PIP +--- -To install the package via `pip` (for end-users): +### First of all clone the repository ```bash -pip install purecpp -``` +git clone --recursive https://github.com/pureai-ecosystem/purecpp +cd purecpp +```` -## Build Options +> [!WARNING] +> +> If you forgot to use `--recursive` when cloning the repository, +> make sure to run: +> +> ```bash +> git submodule update --init --recursive +> ``` -You can either **build locally** or use our **Docker environment** to ensure consistency. +### **Docker** -### Building with Docker (Recommended) +--- -To simplify setup and avoid installing system-wide dependencies, use the provided Dockerfile. +* **1. Build a Docker image from the current directory and tag it as 'purecpp_env'** -#### Step 1: Build the Docker image + ```bash + docker build -t purecpp_env . + ``` -```bash -docker build -t purecpp . -``` +* **2. 
Start a Docker container named 'env' from the 'purecpp_env' image, mounting current dir to /home** -#### Step 2: Start a bash shell inside the container + ```bash + docker run -it --name env -v "$PWD":/home purecpp_env + ``` -```bash -docker run -it --rm purecpp bash -``` +* **3. Execute the `env_config.sh`** -#### Step 3: Inside the container, build the project + ```bash + chmod +x scripts/*.sh + ./scripts/env_config.sh + ``` -```bash -./build -``` + *This script automates the setup. Installing Python essentials, LibTorch, FAISS, and configuring Conan profile* -This will generate the shared object (`RagPUREAI.cpython-*.so`) in the `build/Release/` directory. +> [!CAUTION] +> +> Once you've created the container using `docker run`, ***you don't need to recreate it again.*** +> Instead, follow these two simple commands to ***reuse*** the container: +> ```bash +> docker start env +> ``` +> **This command *starts an existing container* that has already been created earlier using `docker run`.** +> ```bash +> docker exec -it env bash +> ``` +> **This command *attaches a terminal to the running container*, allowing you to interact with it just like you would with a regular Linux shell.** -#### Step 4: Copy `.so` to your test folder +--- -To test the Python bindings, copy the `.so` file to your test script directory: +### **Local** + +--- + +> Requirements +> +> * **Python** β‰₯ 3.8 +> * **CMake** β‰₯ 3.22 +> * **GCC/G++** β‰₯ 13 + +#### 1. Installing dependencies + +- **Ubuntu/Debian** + ```bash + sudo apt update && \ + sudo apt upgrade -y && \ + sudo apt install -y \ + build-essential wget curl \ + ninja-build cmake libopenblas-dev \ + libgflags-dev python3-dev libprotobuf-dev \ + protobuf-compiler unzip libssl-dev zlib1g-dev + ``` + +- **Red Hat** + ```bash + yum update && + yum install -y \ + gcc gcc-c++ make git curl wget \ + ninja-build libffi-devel openssl-devel \ + protobuf-devel gflags-devel zlib-devel \ + openblas-devel unzip \ + ``` + +#### 2. 
Install Rust via rustup ```bash -cp build/Release/RagPUREAI*.so /some-test-folder +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ``` +*Run rustup installer non-interactively (-y).* -### Building Locally (Alternative) +```bash +source ~/.cargo/env +``` -You may also build the project manually without Docker, if your environment satisfies the requirements. +*This places cargo and rustc in /root/.cargo & activate Rust Environment* -#### Minimum Requirements to Build Locally +#### 3. Execute the `env_config.sh` -* **Python** β‰₯ 3.8 -* **CMake** β‰₯ 3.22 -* **Conan** β‰₯ 2.0 -* **Rust** -* **GCC/G++** = 13 -* **Protobuf Compiler** + ***In case you do not have a Docker environment available, we strongly recommend that you use a Python `venv` (virtual environment) to ensure proper isolation of dependencies and reproducibility of results.*** -#### Build Steps + - Create the virtual environment (replace 'venv' with your preferred name) + + ```bash + python3 -m venv venv + ``` + + - Activate the virtual environment on Linux or macOS + + ```bash + source venv/bin/activate + ``` + *This practice minimizes conflicts between global packages and project-specific requirements.* + +**Then run `env_config.sh` script** ```bash -chmod +x scripts/install_python_dependencies.sh -chmod +x scripts/install_torch.sh -chmod +x scripts/install_libs.sh -chmod +x scripts/configure_conan_profile.sh -chmod +x build - -# Install dependencies -./scripts/install_python_dependencies.sh -./scripts/install_torch.sh -./scripts/install_libs.sh -./scripts/configure_conan_profile.sh - -# Build the project -./build +chmod +x scripts/*.sh +./scripts/env_config.sh ``` -The output `.so` file will be located in `build/Release/`. +*This script automates the setup. 
Installing Python essentials, LibTorch, FAISS, and configuring Conan profile* --- -## Testing Locally +## **Build** -To test the Python bindings: +***The `build.sh` is a development pipeline, that makes easier to compile and test*** -```python -from RagPUREAI import SomeExposedFunction +```bash +chmod +x build.sh +./build.sh ``` -Ensure `RagPUREAI*.so` is placed in the same folder as your Python project. +- Cleans the `build/` folder +- Installs Conan dependencies if missing +- Compiles the code +- Sends the `RagPUREAI.*.so` output to[`Sandbox/`](/Sandbox) + +## **Testing** + +The `build.sh` script will place the resulting libraries inside [`Sandbox/`](/Sandbox) + + ```html + Sandbox/ + β”œβ”€β”€ Resources/ + β”œβ”€β”€ RagPUREAI.cpython-312-x86_64-linux-gnu.so + └── YOUR-TEST.py + ``` + +To test the Python bindings: + ```python + from RagPUREAI import SomeExposedFunction + ``` --- +--- ## Publishing to PyPI -To build and upload the Python package: +To build and upload the Python package to PyPI: ```bash -./scripts/create_pip_package +./scripts/create_pip_package PYPI-API-KEY ``` This script will: @@ -136,19 +224,8 @@ This script will: --- -## Downloading Pre-trained Models - -You can convert HuggingFace models to ONNX using: - -```bash -python3 scripts/hf_model_to_onnx.py -m="dbmdz/bert-large-cased-finetuned-conll03-english" -o="bert-large-cased-finetuned-conll03-english" -python3 scripts/hf_model_to_onnx.py -m="sentence-transformers/all-MiniLM-L6-v2" -o="sentence-transformers/all-MiniLM-L6-v2" -``` - ---- - ## Next Steps -![Next Steps](community/release.jpg) +![Next Steps](docs/release.jpg) -Stay tuned for updates and new model integrations! πŸš€ \ No newline at end of file +Stay tuned for updates and new model integrations! 
πŸš€ diff --git a/Sandbox/README.md b/Sandbox/README.md new file mode 100644 index 0000000..cef8d16 --- /dev/null +++ b/Sandbox/README.md @@ -0,0 +1,11 @@ + +# Sandbox Repository + +This folder is an **isolated testing and experimentation environment** + +**Every time it is compiled (using the script `./build.sh`), the output is *redirected* to the `Sandbox/` directory.** + +* Here, the `.so` files (shared libraries) will be available + for experimentation, prototyping, and testing purposes. + +* The `Resources/` folder contains a collection of publicly accessible files in various formats (.doc, .pdf, .txt), used to test the functionalities of the different loaders. These materials are assumed to be free of patent restrictions or in the public domain. diff --git a/build.sh b/build.sh index d69686f..7efc266 100755 --- a/build.sh +++ b/build.sh @@ -1,15 +1,117 @@ -#!/bin/bash -set -e -set -x +#!/usr/bin/env bash -sed -i s/compiler.version=.*/compiler.version=11/g ~/.conan2/profiles/default -conan install . 
--build=missing +set -euo pipefail -cmake \ - --preset conan-release \ - -DCMAKE_POLICY_DEFAULT_CMP0091=NEW \ - -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \ - -DSPM_USE_BUILTIN_PROTOBUF=OFF \ - -G "Unix Makefiles" -cmake --build --preset conan-release --parallel $(nproc) --target RagPUREAI -- +cd src/ +#================= COLORS ================= +GREEN='\033[0;32m' +RESET='\033[0m' + +#----------------------------------------- +#================= LOGGING =============== +#----------------------------------------- +TAG="[$(basename "${BASH_SOURCE[0]}")]" +LINE_BRK=$'\n\n' +SEGMENT=$'===========================================================\n' +#----------------------------------------- + +#----------------------------------------- +printf "$SEGMENT$SEGMENT$SEGMENT" +printf " Begin: $TAG$LINE_BRK" +printf "$SEGMENT$LINE_BRK" +#----------------------------------------- + + + +# ──────────────────────────────────────── +# Smart core splitter for parallel builds +# ──────────────────────────────────────── + +cores=$(nproc) + +if [ "$cores" -gt 1 ]; then + half=$((cores / 2)) +else + half=1 +fi +printf "$LINE_BRK" +echo -e "$GREEN[Core splitter] Detected $cores cores, using $half for parallel build. " +printf "$LINE_BRK$SEGMENT$SEGMENT" + +#----------------------------------------- + +# ──────────────────────────────────────── +# Conan +# ──────────────────────────────────────── +#----------------------------------------- +printf " Begin: [CONAN]$LINE_BRK" +printf "$SEGMENT" +printf "$LINE_BRK" +#----------------------------------------- + +rm -fr ./build +conan install . --build=missing -c tools.build:jobs=$half + +# rm -fr ./conan.lock +# conan lock create . 
--build=missing -c tools.build:jobs=$half + +#----------------------------------------- +printf "$SEGMENT" +printf " [CONAN] Finished \n" +printf "$SEGMENT$SEGMENT$SEGMENT\n" +#================= ENDING ================ + +# ──────────────────────────────────────── +# Build +# ──────────────────────────────────────── +#----------------------------------------- +printf " Begin: [Build]$LINE_BRK" +printf "$SEGMENT$LINE_BRK" +#----------------------------------------- +START_TIME=$(date +%s) + +cmake -DCMAKE_POLICY_DEFAULT_CMP0091=NEW \ + -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \ + -DBUILD_SHARED_LIBS=OFF \ + -D_GLIBCXX_USE_CXX11_ABI=1 \ + -DSPM_USE_BUILTIN_PROTOBUF=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=generators/conan_toolchain.cmake \ + -S "$(pwd)" \ + -B "$(pwd)/build/Release" \ + -G "Unix Makefiles" + +cmake --build "$(pwd)/build/Release" --parallel "$half" #--target RagPUREAI + +END_TIME=$(date +%s) +ELAPSED_TIME=$((END_TIME - START_TIME)) +#---------------- LOG -------------------- +echo -e "$GREEN" +echo "===========================================================" +echo " Total build time: ${ELAPSED_TIME} s" +echo -e "===========================================================$RESET" + +#----------------------------------------- +printf "$SEGMENT" +printf " [Build] Finished \n" +printf "$SEGMENT$SEGMENT$SEGMENT\n" +#================= ENDING ================ + + +# ──────────────────────────────────────── +# Sending to Sandbox +# ──────────────────────────────────────── + +printf "$GREEN[Last Step] Sending to Sandbox \n" +echo -e "$RESET" + +rm -f ../Sandbox/*.so + +cp ./build/Release/RagPUREAI.cpython*.so ../Sandbox/ + +#----------------------------------------- +printf "$SEGMENT" +printf " $TAG Finished \n" +printf "$SEGMENT$SEGMENT$SEGMENT\n" +#================= ENDING ================ diff --git a/community/CONTRIBUTING.md b/community/CONTRIBUTING.md deleted file mode 100644 index 353a716..0000000 --- a/community/CONTRIBUTING.md +++ 
/dev/null @@ -1,147 +0,0 @@ -# πŸ§™πŸΌβ€β™‚οΈContributing to PureCPP - -Welcome to **PureCPP**, where efficiency and optimization are the foundation of everything we build. Here, we value clean, fast, and powerful code. - -Want to contribute? Whether you're creating new integrations, improving performance, or expanding features, every line of code matters. Together, we’ll take high-performance computing to the next level. - -πŸ’‘ Ready to code without limits? Let’s get to work! πŸ’‘ - ---- - -## πŸ’ͺ🏽 Quick Start Guide - -Ready to jump in? Follow this quick setup guide to get started smoothly: - -1. **Fork** the repo and clone your fork. -2. Navigate to the project folder: - ```bash - cd purecpp - ``` - -3. Make sure you have the following packages installed: -- GCC/G++ 13.1 -- CMake 3.22+ -- Conan 2 -- Rust -- Python 3.8+ - -4. Install the required dependencies:`: -Depending on your system, you may need: - - ```bash - sudo apt update && sudo apt install -y gcc-13 g++-13 cmake conan rustc cargo - ``` -5. Install development conan dependencies:: - - ```bash - !pip install conan==2.* - ``` - -6. Run the tests to ensure everything is working:: - ```bash - ./tests/run_tests - ``` - - - All set! Now it's time to build something powerful. If you need more details, check out the [Development Guidelines](#-Development-Guidelines). - ---- -## Community Discord -Join our community [Discord](https://discord.gg/8eF9v78Ndv) to ask questions, get support, and collaborate with fellow contributors and users. - - -## ⚑ What Can You Contribute To? - -There are many ways to contribute to **PureCPP**β€”whether you're a **C++ expert** or just starting out with **high-performance computing**. Here, we focus on **performance, efficiency, and scalability**. Your contributions are always welcome! - -## 1. πŸš€ Expand Core Modules - -Help us improve **PureCPP** by contributing to our core modules and making the framework even more optimized. 
- -- **New Integrations** (e.g., support for new compilers, optimized bindings, high-performance libraries) -- **Memory Management**, **Parallelism (Threads and CUDA)**, **Matrix and Tensor Operations** -- **Advanced Chunking Techniques** to optimize processing -- **Efficient Metadata Extraction and Management** - -## 2. βš™οΈ Dataloaders and Smart Storage - -- **Optimized Dataloaders** for different file types and databases -- **Efficient indexing and retrieval** -- **Smart loading strategies to optimize search performance** - -## 3. 🏎️ Vector Database and LLMs - -- Implementation and optimization of **high-performance vector databases** -- **Integration of LLMs** and embedding models for semantic search -- Support for **quantization, fine-tuning, and CUDA optimizations** - -## 4. πŸ› οΈ Bug Fixes and Code Improvements - -Found something that could be optimized? Code improvements are always welcome! Check out the [GitHub Labels](https://github.com/pureai-ecosystem/purecpp/labels) - -## 5. πŸ“š Share Usage Examples - -If you’ve used **PureCPP** in an innovative way, share your examples and contribute to the community. - -## 6. πŸ”¬ Experiments and New Approaches - -Got a different idea? We’re open to tests and new approachesβ€”experiment and submit a PR! - - - ---- - -## πŸš€ **Next Steps: What Are We Planning?** - -We are always evolving! 
Here are the next steps to make our pipelines even more efficient and powerful: - -### πŸ”Ή **New Features** -βœ… **Add local Vector Databases** to enhance semantic search performance -βœ… **Integrate local LLMs** and create connectors for inference frameworks - -### πŸ”§ **Fixes & Improvements** -πŸ› οΈ **Optimize data extraction** for greater efficiency -πŸ“Œ **Add Schema** to better structure data -πŸ“Œ **Expand the variety of models** in our components -πŸ”„ **Enhance chunking techniques** for smarter processing -πŸ“ˆ **Improve embeddings** for more precise vector representations -πŸ—‚οΈ **Refine metadata extraction** for better contextualization - -πŸ’‘ **Got an idea?** Your contribution is more than welcome! Join us and help take this project even further. πŸš€ - ---- - - -## ✨ Steps to Contribute - -1. **Fork** the repository on GitHub. -2. **Clone** your fork to your local machine. - ```bash - git clone https://github.com/pureai-ecosystem/purecpp.git - ``` -3. **Create a branch** for your work. - ```bash - git checkout -b your-feature-branch - ``` -4. **Set up your environment** -5. **Work on your feature or bugfix**, ensuring you have unit tests covering your code. -6. **Commit** your changes, then push them to your fork. - ```bash - git push origin your-feature-branch - ``` -7. **Open a pull request** on GitHub. - -Obrigado! - ---- - - -## ⚑ **Acknowledgements: Built with Pure Performance** - -Big thanks for being part of **PureCPP**β€”where every bit counts, and every byte makes a difference! πŸš€ - -Whether you're optimizing loops, fine-tuning embeddings, or pushing parallel processing to the limit, your contributions fuel the engine of **high-performance computing**. - -We’re not just writing codeβ€”we’re compiling the future. πŸ”₯ - -Keep coding at full speed! 
πŸŽοΈπŸ’» \ No newline at end of file diff --git a/components/Chunk/ChunkCommons/ChunkCommons.h b/components/Chunk/ChunkCommons/ChunkCommons.h index b9777a7..5a77f53 100644 --- a/components/Chunk/ChunkCommons/ChunkCommons.h +++ b/components/Chunk/ChunkCommons/ChunkCommons.h @@ -32,13 +32,13 @@ namespace Chunk }; }; - extern inline const std::unordered_map> EmbeddingModel = { + extern inline const std::unordered_map> EmbeddingModel = { {"openai", {"text-embedding-ada-002", "text-embedding-3-small", "..."}}, {"huggingface", {"bge-small", "bge-large"}}, {"cohere", {"embed-english-light-v3.0"}} }; - inline void PrintEmbeddingModels() { + inline void PrintEmbeddingModels() { std::cout << "╔══════════════════════════════════════════════════════════╗\n"; std::cout << "β•‘ πŸ“¦ Available Embedding Models \n"; std::cout << "╠══════════════════════════════════════════════════════════╣\n"; diff --git a/components/FAISSBackend/faiss_backend.cpp b/components/FAISSBackend/faiss_backend.cpp new file mode 100644 index 0000000..16892f6 --- /dev/null +++ b/components/FAISSBackend/faiss_backend.cpp @@ -0,0 +1,137 @@ +#include + +std::optional +faiss_backend::PureL2(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k) { + Chunk::ChunkQuery cq(query, {}, &chunks, pos); + size_t nq, d, ndb; + std::tie(nq, d, ndb) = cq.getPar(); + + if (k > ndb) { + throw std::invalid_argument("k > base vector"); + } + + faiss::IndexFlatL2 index(d); + + const Chunk::vdb_data* vdb = cq.getVDB(); + if (!vdb) { + throw std::runtime_error("vdb_data is null. Cannot proceed."); + } + + const float* xb = vdb->getVDpointer(); + if (!xb) { + throw std::runtime_error("Empty vector database. 
Cannot proceed."); + } + + index.add(ndb, xb); + + auto emb_query = cq.getEmbedQuery(); + if (emb_query.size() != d) { + throw std::runtime_error("Embedding dimension mismatch."); + } + + std::vector I(k); + std::vector D(k); + index.search(nq, emb_query.data(), k, D.data(), I.data()); + + if (D.size() > 0) { + std::cout << "Nearest index: " << I[0] << std::endl; + std::cout << "Distance: " << D[0] << std::endl; + return faiss_backend::PureResult{I, D}; + } + return {}; +} + +std::optional +faiss_backend::PureIP(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k) { + Chunk::ChunkQuery cq(query, {}, &chunks, pos); + size_t nq, d, ndb; + std::tie(nq, d, ndb) = cq.getPar(); + + if (k > ndb) { + throw std::invalid_argument("k > base vector"); + } + + faiss::IndexFlatIP index(d); + + const Chunk::vdb_data* vdb = cq.getVDB(); + if (!vdb) { + throw std::runtime_error("vdb_data is null. Cannot proceed."); + } + + const float* xb = vdb->getVDpointer(); + if (!xb) { + throw std::runtime_error("Empty vector database. Cannot proceed."); + } + + index.add(ndb, xb); + + auto emb_query = cq.getEmbedQuery(); + if (emb_query.size() != d) { + throw std::runtime_error("Embedding dimension mismatch."); + } + + std::vector I(k); + std::vector D(k); + index.search(nq, emb_query.data(), k, D.data(), I.data()); + + if (D.size() > 0) { + std::cout << "Most similar index: " << I[0] << std::endl; + std::cout << "Similarity score: " << D[0] << std::endl; + return faiss_backend::PureResult{I, D}; + } + return {}; +} + +std::optional +faiss_backend::PureCosine(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k) { + Chunk::ChunkQuery cq(query, {}, &chunks, pos); + size_t nq, d, ndb; + std::tie(nq, d, ndb) = cq.getPar(); + + if (k > ndb) { + throw std::invalid_argument("k > base vector"); + } + + const Chunk::vdb_data* vdb = cq.getVDB(); + if (!vdb) { + throw std::runtime_error("vdb_data is null. 
Cannot proceed."); + } + + std::vector base = vdb->flatVD; + for (size_t i = 0; i < ndb; ++i) { + normalize_vector(&base[i * d], d); + } + + faiss::IndexFlatIP index(d); + index.add(ndb, base.data()); + + auto emb_query = cq.getEmbedQuery(); + if (emb_query.size() != d) { + throw std::runtime_error("Embedding dimension mismatch."); + } + + std::vector normalized_query = emb_query; + normalize_vector(normalized_query.data(), d); + + std::vector I(k); + std::vector D(k); + index.search(nq, normalized_query.data(), k, D.data(), I.data()); + + if (D.size() > 0) { + return faiss_backend::PureResult{I, D}; + } + return {}; +} + +void faiss_backend::normalize_vector(float* vec, size_t d) { + float norm = 0.0f; + for (size_t i = 0; i < d; ++i) { + norm += vec[i] * vec[i]; + } + norm = std::sqrt(norm); + if (norm > 0.0f) { + for (size_t i = 0; i < d; ++i) { + vec[i] /= norm; + } + } +} \ No newline at end of file diff --git a/components/FAISSBackend/faiss_backend.h b/components/FAISSBackend/faiss_backend.h new file mode 100644 index 0000000..a2d5203 --- /dev/null +++ b/components/FAISSBackend/faiss_backend.h @@ -0,0 +1,73 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "RagException.h" +#include "StringUtils.h" +#include "CommonStructs.h" +#include "EmbeddingOpenAI.h" +#include "Chunk/ChunkCommons/ChunkCommons.h" +#include "Chunk/ChunkDefault/ChunkDefault.h" +#include "Chunk/ChunkQuery/ChunkQuery.h" + +namespace faiss_backend { + + // struct vdb_data { + // std::vector flatVD; + // std::string vendor; + // std::string model; + // size_t dim = 0; + // size_t n = 0; + + // inline const std::tuple getPar(void) const { + // return {n, dim}; + // } + + // inline std::pair getEmbPar(void) const { + // return {vendor, model}; + // } + + // inline const float* getVDpointer(void) const { + // if (flatVD.empty()) { + // std::cout << "[Info] Empty Vector 
Data Base\n"; + // return {}; + // } + // return flatVD.data(); + // } + // }; + + struct PureResult { + std::vector indices; + std::vector distances; + }; + + // L2 distance (Euclidean) + std::optional + PureL2(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k); + + // Inner Product (dot product) + std::optional + PureIP(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k); + + // Cosine similarity (requires normalization) + std::optional + PureCosine(std::string query, const Chunk::ChunkDefault& chunks, size_t pos, int k); + + // Utility: in-place L2 normalization + void normalize_vector(float* vec, size_t d); + +} \ No newline at end of file diff --git a/SUPPORT.md b/docs/CONTRIBUTING.md similarity index 64% rename from SUPPORT.md rename to docs/CONTRIBUTING.md index 70d88c9..46b1e78 100644 --- a/SUPPORT.md +++ b/docs/CONTRIBUTING.md @@ -8,49 +8,39 @@ Want to contribute? Whether you're creating new integrations, improving performa --- -## πŸ’ͺ🏽 Quick Start Guide - -Ready to jump in? Follow this quick setup guide to get started smoothly: +## ✨ Steps to Contribute + +**Ready to jump in? Follow this quick setup guide to get started smoothly** 1. **Fork** the repo and clone your fork. -2. Navigate to the project folder: - ```bash - cd purecpp - ``` - -3. Make sure you have the following packages installed: -- GCC/G++ 13.1 -- CMake 3.22+ -- Conan 2 -- Rust -- Python 3.8+ -4. Install the required dependencies:`: -Depending on your system, you may need: +2. [Environment setup](/README.md#environment-setup-ubuntu--debian-for-c-and-python-development) - ```bash - sudo apt update && sudo apt install -y gcc-13 g++-13 cmake conan rustc cargo - ``` -5. Install development conan dependencies:: +3. [How to build](/README.md#how-to-build) - ```bash - !pip install conan==2.* - ``` +**All set! Now... it is time to build something powerful**. -6. Run the tests to ensure everything is working:: +1. 
**Work on your feature or bugfix**, ensuring you have unit tests covering your code. +2. **Commit** your changes, then push them to your fork. ```bash - ./tests/run_tests - ``` + git push origin your-feature-branch + ```` +3. **Open a pull request** on GitHub. - All set! Now it's time to build something powerful. If you need more details, check out the [Development Guidelines](#-Development-Guidelines). --- -## Community Discord -Join our community [Discord](https://discord.gg/8eF9v78Ndv) to ask questions, get support, and collaborate with fellow contributors and users. +--- +## Community Discord [![Join us on Discord ](https://img.shields.io/badge/Join_Us_On_Discord-5865F2?logo=discord&logoColor=white&style=for-the-badge)](https://discord.gg/8eF9v78Ndv) +- Ask questions and get support +- Share feedback and suggestions +- Connect with the team and other users + +--- +--- -## ⚑ What Can You Contribute To? +## ⚑ What to Contribute? There are many ways to contribute to **PureCPP**β€”whether you're a **C++ expert** or just starting out with **high-performance computing**. Here, we focus on **performance, efficiency, and scalability**. Your contributions are always welcome! @@ -59,19 +49,22 @@ There are many ways to contribute to **PureCPP**β€”whether you're a **C++ expert Help us improve **PureCPP** by contributing to our core modules and making the framework even more optimized. - **New Integrations** (e.g., support for new compilers, optimized bindings, high-performance libraries) -- **Memory Management**, **Parallelism (Threads and CUDA)**, **Matrix and Tensor Operations** +- **Memory Management** βœ”οΈ +- **Parallelism (Threads/OpenMP)** βœ”οΈ +- **Parallelism (CUDA)** +- **Matrix and Tensor Operations** - **Advanced Chunking Techniques** to optimize processing - **Efficient Metadata Extraction and Management** ## 2. 
βš™οΈ Dataloaders and Smart Storage - **Optimized Dataloaders** for different file types and databases -- **Efficient indexing and retrieval** -- **Smart loading strategies to optimize search performance** +- **Efficient indexing and retrieval** βœ”οΈ +- **Smart loading strategies to optimize search performance** βœ”οΈ+- ## 3. 🏎️ Vector Database and LLMs -- Implementation and optimization of **high-performance vector databases** +- Implementation and optimization of **high-performance vector databases** βœ”οΈ - **Integration of LLMs** and embedding models for semantic search - Support for **quantization, fine-tuning, and CUDA optimizations** @@ -87,8 +80,6 @@ If you’ve used **PureCPP** in an innovative way, share your examples and contr Got a different idea? We’re open to tests and new approachesβ€”experiment and submit a PR! - - --- ## πŸš€ **Next Steps: What Are We Planning?** @@ -110,38 +101,22 @@ We are always evolving! Here are the next steps to make our pipelines even more πŸ’‘ **Got an idea?** Your contribution is more than welcome! Join us and help take this project even further. πŸš€ --- - - -## ✨ Steps to Contribute - -1. **Fork** the repository on GitHub. -2. **Clone** your fork to your local machine. - ```bash - git clone https://github.com/pureai-ecosystem/purecpp.git - ``` -3. **Create a branch** for your work. - ```bash - git checkout -b your-feature-branch - ``` -4. **Set up your environment** -5. **Work on your feature or bugfix**, ensuring you have unit tests covering your code. -6. **Commit** your changes, then push them to your fork. - ```bash - git push origin your-feature-branch - ``` -7. **Open a pull request** on GitHub. - -Obrigado! - --- - ## ⚑ **Acknowledgements: Built with Pure Performance** -Big thanks for being part of **PureCPP**β€”where every bit counts, and every byte makes a difference! πŸš€ +Big thanks for being part of **PureCPP**β€” where every bit counts, and every byte makes a difference! 
πŸš€ Whether you're optimizing loops, fine-tuning embeddings, or pushing parallel processing to the limit, your contributions fuel the engine of **high-performance computing**. We’re not just writing codeβ€”we’re compiling the future. πŸ”₯ Keep coding at full speed! πŸŽοΈπŸ’» + +--- + +Thank You! Gracias! 謝謝! κ°μ‚¬ν•΄μš”! γ‚γ‚ŠγŒγ¨γ†! Бпасибо! Obrigado! + + +--- +--- diff --git a/community/release.jpg b/docs/release.jpg similarity index 100% rename from community/release.jpg rename to docs/release.jpg diff --git a/libs/faiss b/libs/faiss new file mode 160000 index 0000000..6470b8d --- /dev/null +++ b/libs/faiss @@ -0,0 +1 @@ +Subproject commit 6470b8d9d0f9c0adc71df6d5a1ce64199be85305 diff --git a/libs/tokenizers-cpp b/libs/tokenizers-cpp index 4bb7533..55d53aa 160000 --- a/libs/tokenizers-cpp +++ b/libs/tokenizers-cpp @@ -1 +1 @@ -Subproject commit 4bb753377680e249345b54c6b10e6d0674c8af03 +Subproject commit 55d53aa38dc8df7d9c8bd9ed50907e82ae83ce66 diff --git a/models/README.md b/models/README.md new file mode 100644 index 0000000..59e2da5 --- /dev/null +++ b/models/README.md @@ -0,0 +1,48 @@ +--- +--- + +# Download Pre-trained Models + +## πŸ› οΈ Hugging Face to **ONNX** Converter: + +These Python scripts convert Hugging Face models into the ONNX format for optimized inference. + +These scripts handle two primary use cases: +1. **Feature extraction models** (e.g., `sentence-transformers`). +2. **Token classification models** (e.g., Named Entity Recognition - NER). + +It automatically downloads the model and organizes the exported files in a structured subdirectory. 
+ +## Requirements + + *Before running the script, make sure you have the following Python packages installed:* + ```bash + pip install torch transformers onnx onnxruntime optimum + ``` + +## Examples + +```bash +python3 models/hf_model_to_onnx.py -m="dbmdz/bert-large-cased-finetuned-conll03-english" -o="bert-large-cased-finetuned-conll03-english" +``` + +```bash +python3 models/hf_model_to_onnx.py -m="sentence-transformers/all-MiniLM-L6-v2" -o="sentence-transformers/all-MiniLM-L6-v2" +``` + +## Output + +``` +./models/ + β”œβ”€β”€ hf_extract_model.py + β”œβ”€β”€ hf_model_to_onnx.py + β”œβ”€β”€ sentence-transformers/all-MiniLM-L6-v2/ + β”‚ β”œβ”€β”€ model.onnx (via optimum) + β”‚ └── tokenizer/ + └── dslim/bert-base-NER/ + β”œβ”€β”€ model.onnx + β”œβ”€β”€ label_map.json + └── tokenizer/ +``` + +--- diff --git a/scripts/hf_extract_model.py b/models/hf_extract_model.py similarity index 90% rename from scripts/hf_extract_model.py rename to models/hf_extract_model.py index 6af6afa..20178d3 100644 --- a/scripts/hf_extract_model.py +++ b/models/hf_extract_model.py @@ -13,7 +13,7 @@ model_name = args.model_name - dir_path= os.path.join(os.path.dirname(__file__), "..", "models", model_name) + dir_path= os.path.join(os.path.dirname(__file__), ".", "models", model_name) if not os.path.exists(dir_path): os.makedirs(dir_path) diff --git a/scripts/hf_model_to_onnx.py b/models/hf_model_to_onnx.py similarity index 94% rename from scripts/hf_model_to_onnx.py rename to models/hf_model_to_onnx.py index afacb4a..98bd98c 100644 --- a/scripts/hf_model_to_onnx.py +++ b/models/hf_model_to_onnx.py @@ -16,7 +16,7 @@ model_name = args.model_name config = AutoConfig.from_pretrained(model_name) label_map = config.id2label -dir_path= os.path.join(os.path.dirname(__file__), "..", "models", model_name) +dir_path= os.path.join(os.path.dirname(__file__), ".", "models", model_name) if not os.path.exists(dir_path): os.makedirs(dir_path) diff --git a/scripts/configure_conan_profile.sh 
b/scripts/configure_conan_profile.sh deleted file mode 100755 index d2ef139..0000000 --- a/scripts/configure_conan_profile.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -set -e - -mkdir -p /root/.conan2/profiles - -cat > /root/.conan2/profiles/default </dev/null 2>&1; then + SUDO="sudo" + else + echo -e "${RED}[!] Not running as root and 'sudo' is not available. Re-run as root or install sudo.${RESET}" + exit 1 + fi +fi + +log_end "DETECT ROOT PRIVILEGES" + + +#────────────────────────────────────────── +log_start "DETECT PACKAGE MANAGER" + +PKG_MANAGER="" +if command -v apt-get >/dev/null 2>&1; then + PKG_MANAGER="apt" + echo "[pkg] Detected APT-based system (Ubuntu/Debian)" +elif command -v yum >/dev/null 2>&1; then + PKG_MANAGER="yum" + echo "[pkg] Detected YUM-based system (manylinux/CentOS-like)" +else + echo -e "${RED}[x] Unsupported system: neither apt-get nor yum found.${RESET}" >&2 + exit 1 +fi + +log_end "DETECT PACKAGE MANAGER" + + +#────────────────────────────────────────── +log_start "SETUP DIRECTORIES" + +PROJ_DIR=$(pwd) +FAISS_DIR="${PROJ_DIR}/libs/faiss" +mkdir -p "$FAISS_DIR" +echo "[INFO] Created: $FAISS_DIR" + +log_end "SETUP DIRECTORIES" + + +#────────────────────────────────────────── +log_start "INSTALL DEPENDENCIES" + +if [[ "$PKG_MANAGER" == "apt" ]]; then + $SUDO apt-get update -y + $SUDO apt-get install -y \ + cmake g++ libopenblas-dev libgflags-dev \ + python3-dev build-essential git + +else + echo "[INFO] Checking if EPEL is installed..." + if ! rpm -q epel-release >/dev/null 2>&1; then + echo "[INFO] Installing EPEL repository..." + $SUDO yum install -y epel-release + fi + + $SUDO yum update -y + $SUDO yum groupinstall -y "Development Tools" + $SUDO yum install -y \ + cmake3 gcc-c++ openblas-devel python3-devel git gflags-devel +fi + +if ! command -v cmake >/dev/null && command -v cmake3 >/dev/null; then + echo "[INFO] Linking cmake3 to cmake..." 
+ $SUDO ln -sf /usr/bin/cmake3 /usr/bin/cmake +fi + +log_end "INSTALL DEPENDENCIES" + + +#────────────────────────────────────────── +log_start "BUILD FAISS (CPU ONLY)" + +cd "$FAISS_DIR" + +cmake -B build \ + -DFAISS_ENABLE_GPU=OFF \ + -DFAISS_ENABLE_PYTHON=OFF \ + -DFAISS_ENABLE_TESTS=OFF \ + -DCMAKE_BUILD_TYPE=Release + +cmake --build build --parallel 3 +cd "$PROJ_DIR" + +log_end "BUILD FAISS (CPU ONLY)" + + +#────────────────────────────────────────── +log_start "VERIFY BUILD" + +FOUND_LIB=$(find "$FAISS_DIR/build/faiss" -name "libfaiss.*" | head -n 1) + +if [ -f "$FOUND_LIB" ]; then + echo "[OK] Header files at: ${FAISS_DIR}/faiss/" + echo "[OK] Library file at: ${FOUND_LIB}" +else + echo -e "${RED}[WARN] libfaiss not found in expected build directory.${RESET}" +fi + +log_end "VERIFY BUILD" + + +#────────────────────────────────────────── + +log_end "FAISS INSTALLATION" +#────────────────────────────────────────── + + +echo -e "$CYAN" +echo "LINKING INSTRUCTIONS" +echo "" +echo "You can now link FAISS in your C++ project using:" +echo "" +echo ' include_directories(${CMAKE_SOURCE_DIR}/libs/faiss/faiss) ' +echo ' link_directories(${CMAKE_SOURCE_DIR}/libs/faiss/build/faiss)' +echo ' target_link_libraries(your_target PRIVATE faiss)' +echo -e "$RESET" \ No newline at end of file diff --git a/scripts/install_libs.sh b/scripts/install_libs.sh deleted file mode 100755 index d9caedc..0000000 --- a/scripts/install_libs.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -git submodule update --init --recursive --remote - -pushd libs/tokenizers-cpp - git checkout 4bb7533 - git submodule update --init --recursive --remote - pushd msgpack - git checkout 8c602e8 - popd - pushd sentencepiece - git checkout f2219b5 - popd -popd \ No newline at end of file diff --git a/scripts/install_python_dependencies.sh b/scripts/install_python_dependencies.sh deleted file mode 100755 index 8da6c8b..0000000 --- a/scripts/install_python_dependencies.sh +++ /dev/null @@ -1,4 +0,0 @@ 
#!/usr/bin/env bash
set -euo pipefail

#================= COLORS =================
GREEN='\033[0;32m'
CYAN='\033[0;36m'
YELLOW='\033[1;33m'
RESET='\033[0m'

#================= FORMATTING =============
TAG="[$(basename "${BASH_SOURCE[0]}")]"
LINE_BRK=$'\n\n'
SEGMENT="===========================================================\n"

#================= LOGGER FUNCS ===========
# log_start SECTION -- prints the opening banner for a step.
# '%b' expands the escapes stored in the color variables while keeping the
# section text out of the printf format string (a literal '%' in a section
# name would otherwise break the output).
log_start() {
    local section="$1"
    printf '%b' "${CYAN}${SEGMENT}${SEGMENT}${SEGMENT}"
    printf '%b' " Begin: [${section}] ${TAG}${LINE_BRK}"
    printf '%b' "${SEGMENT}${RESET}"
}

# log_end SECTION -- prints the closing banner for a step.
log_end() {
    local section="$1"
    printf '%b' "${YELLOW}${SEGMENT}"
    printf '%b' " Finish [${section}]${LINE_BRK}"
    printf '%b' "${SEGMENT}${SEGMENT}${SEGMENT}${RESET}"
}
#==========================================


#──────────────────────────────────────────
log_start "CONAN DETECT"

echo -e "$GREEN[INFO] Running: conan profile detect --force$RESET"
conan profile detect --force

log_end "CONAN DETECT"


#──────────────────────────────────────────
log_start "LOCATE PROFILE DIR"

PROFILE_DIR=$(find . -type d -wholename "*/.conan2/profiles" | head -n 1 || true)
if [ -z "$PROFILE_DIR" ]; then
    PROFILE_DIR="$HOME/.conan2/profiles"
    echo -e "$GREEN[INFO] Defaulting to: $PROFILE_DIR$RESET"
    mkdir -p "$PROFILE_DIR"
else
    echo -e "$GREEN[INFO] Found profile dir at: $PROFILE_DIR$RESET"
fi

log_end "LOCATE PROFILE DIR"


#──────────────────────────────────────────
log_start "WRITE PROFILE"

DEFAULT_PROFILE="$PROFILE_DIR/default"

# Detect the installed GCC major version so the written profile matches the
# real toolchain. The previous script hardcoded compiler.version=11, which
# makes Conan build against the wrong ABI on systems shipping another GCC.
# Falls back to 11 (the old behavior) when gcc is not on PATH.
GCC_MAJOR=11
if command -v gcc >/dev/null 2>&1; then
    GCC_MAJOR=$(gcc -dumpversion | cut -d. -f1)
fi
echo -e "$GREEN[INFO] Using compiler.version=$GCC_MAJOR$RESET"

cat << EOF > "$DEFAULT_PROFILE"
[settings]
arch=x86_64
build_type=Release
compiler=gcc
compiler.cppstd=17
compiler.libcxx=libstdc++11
compiler.version=$GCC_MAJOR
os=Linux
EOF

echo -e "$GREEN[INFO] Profile written to: $DEFAULT_PROFILE$RESET"

log_end "WRITE PROFILE"


#──────────────────────────────────────────
log_start "VERIFY PROFILE"

echo -e "$GREEN[INFO] Displaying contents of: $DEFAULT_PROFILE"
echo -e "$CYAN"
cat "$DEFAULT_PROFILE"
echo -e "$RESET"

log_end "VERIFY PROFILE"
+#----------------------------------------- +#================= ENDING ================ +#----------------------------------------- +printf "$SEGMENT$SEGMENT$SEGMENT" +printf "\n\n\n\n\n". +#----------------------------------------- diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100755 index 0000000..5770b9b --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,318 @@ +cmake_minimum_required(VERSION 3.22) +project(RagPUREAI VERSION 1.0) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CUDA_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) +# set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_BUILD_TYPE "Release") + + +# ───────────────────────────────────────────────────────────────────────────── +#------- Path to FAISS ------- +# ───────────────────────────────────────────────────────────────────────────── +set(FAISS_ROOT "${CMAKE_BINARY_DIR}/../../../libs/faiss") +set(FAISS_INCLUDE_DIR "${FAISS_ROOT}/faiss") +set(FAISS_LIB_DIR "${FAISS_ROOT}/build/faiss") # Must contain libfaiss.a + +add_library(faiss STATIC IMPORTED) # Declares FAISS as an already compiled library (IMPORTED) +set_target_properties(faiss PROPERTIES + IMPORTED_LOCATION "${FAISS_LIB_DIR}/libfaiss.a" + INTERFACE_INCLUDE_DIRECTORIES "${FAISS_INCLUDE_DIR}" +) + + +# ───────────────────────────────────────────────────────────────────────────── +#------- Compiler Specific flags ------- +# ───────────────────────────────────────────────────────────────────────────── +if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")# MSVC-specific flags + set(CMAKE_TOOLCHAIN_FILE ${CMAKE_BINARY_DIR}/generators/conan_toolchain.cmake) + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Od /EHsc /MP /openmp /MD") +else() # Flags for G++/Clang or G++/GNU on Linux + # Keeps the optimization flags and adds -std=c++23 + set(CMAKE_TOOLCHAIN_FILE ${CMAKE_BINARY_DIR}/Release/generators/conan_toolchain.cmake) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -fopenmp -pthread") + set(CMAKE_CXX_FLAGS_RELEASE 
"-O0 -std=c++23") +endif() + +# ───────────────────────────────────────────────────────────────────────────── +#------- Options ------- +# ───────────────────────────────────────────────────────────────────────────── +option(CURL_STATIC_LINKING "Set to ON to build libcurl with static linking." OFF) + +if(CURL_STATIC_LINKING) + message("-DCURL_STATICLIB [added]") + add_definitions(-DCURL_STATICLIB) +endif() + + +# ───────────────────────────────────────────────────────────────────────────── +#------- Find Python ------- +# ───────────────────────────────────────────────────────────────────────────── +set(Python3_FIND_SHARED OFF)# Need it? +find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) + +if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + set(PYTHON_LIB_PATH "C:\\Program Files\\Python312\\libs") + link_directories(${Python3_LIBRARY_DIRS}) +endif() + +include_directories(${Python3_INCLUDE_DIRS}) + +# ───────────────────────────────────────────────────────────────────────────── +#------- Find Pybind11 ------- +# ───────────────────────────────────────────────────────────────────────────── +message(STATUS "---------------------------------------\n") +if(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 8) + message(STATUS "Python 3.8 detected!!") + set(pybind11_DIR "/opt/python/cp38-cp38/lib/python3.8/site-packages/pybind11/share/cmake/pybind11") + # add_definitions(-DPy_LIMITED_API=0x03080000) + +elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 9) + message(STATUS "Python 3.9 detected!!") + set(pybind11_DIR "/opt/python/cp39-cp39/lib/python3.9/site-packages/pybind11/share/cmake/pybind11") + # add_definitions(-DPy_LIMITED_API=0x03090000) + +elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 10) + message(STATUS "Python 3.10 detected!!") + set(pybind11_DIR "/opt/python/cp310-cp310/lib/python3.10/site-packages/pybind11/share/cmake/pybind11") + # add_definitions(-DPy_LIMITED_API=0x030A0000) + 
+elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 11) + message(STATUS "Python 3.11 detected!!") + set(pybind11_DIR "/opt/python/cp311-cp311/lib/python3.11/site-packages/pybind11/share/cmake/pybind11") + # add_definitions(-DPy_LIMITED_API=0x030B0000) + +elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 12) + message(STATUS "Python 3.12 detected!!") + set(pybind11_DIR "/opt/python/cp312-cp312/lib/python3.12/site-packages/pybind11/share/cmake/pybind11") + # add_definitions(-DPy_LIMITED_API=0x030C0000) + +elseif(Python3_VERSION_MAJOR EQUAL 3 AND Python3_VERSION_MINOR EQUAL 13) + message(STATUS "Python 3.13 detected!") + set(pybind11_DIR "/opt/python/cp313-cp313/lib/python3.13/site-packages/pybind11/share/cmake/pybind11") + # add_definitions(-DPy_LIMITED_API=0x030D0000) + +else() + message(STATUS "Python version not specified in previous if's!") + # Default setting or other action + +endif() + + +# ───────────────────────────────────────────────────────────────────────────── +#------- Find other dependencies ------- +# ───────────────────────────────────────────────────────────────────────────── +find_package(pybind11 REQUIRED) +find_package(pdfium REQUIRED) +find_package(OpenMP REQUIRED) +find_package(ICU REQUIRED) +find_package(miniz REQUIRED) +find_package(rapidxml REQUIRED) +find_package(beauty REQUIRED) +find_package(lexbor REQUIRED) +find_package(re2 REQUIRED) +find_package(nlohmann_json REQUIRED) +#find_package(fmt REQUIRED) +find_package(CURL REQUIRED) +find_package(Threads REQUIRED) + +# ───────────────────────────────────────────────────────────────────────────── +#------- Protobuf ------- +# ───────────────────────────────────────────────────────────────────────────── +find_package(Protobuf REQUIRED) + +if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + link_directories(${Protobuf_LIBRARY_DIRS}) + include_directories(C:/vcpkg/packages/protobuf_x64-windows/include) + if (NOT DEFINED PROTOBUF_PROTOC_EXECUTABLE) + # 
include_directories(${Protobuf_INCLUDE_DIRS})# At first the Protobuf_INCLUDE_DIR target does not exist, but I don't know if it is necessary + set(PROTOBUF_PROTOC_EXECUTABLE "C:/vcpkg/packages/protobuf_x64-windows/tools/protobuf/protoc.exe") + endif() +endif() + +find_package(onnxruntime REQUIRED) # May depend on protobuf (check if you need it!) + +# ─────────────────────────────────────────────────────────────────────── +#------- Discover the directory Current Python site-packages ------- +# ────────────────────────────────────────────────────────────────────────────── +# We will use sysconfig to be more robust across multiple Python versions. +execute_process( + COMMAND "${Python3_EXECUTABLE}" -c + "import sysconfig; import pathlib; site_packages = sysconfig.get_paths()['purelib']; print(str(pathlib.Path(site_packages).resolve()))" + OUTPUT_VARIABLE PYTHON_SITE_PACKAGES + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# ───────────────────────────────────────────────────────────────────────────── +#------- Torch ------- +# ───────────────────────────────────────────────────────────────────────────── +# set(_TORCH_REL_PATH "dependencias_libs/d_libs/libtorch/cpu") +set(Torch_DIR "${CMAKE_SOURCE_DIR}/../libs/libtorch/cpu/share/cmake/Torch") + +find_package(Torch REQUIRED) + +# Ajustar includes e bibliotecas do Torch +include_directories("${CMAKE_SOURCE_DIR}/../libs/libtorch/cpu/include") +link_directories("${CMAKE_SOURCE_DIR}/../libs/libtorch/cpu/lib") + + +# ───────────────────────────────────────────────────────────────────────────── +#------- RPATH ------- +# ───────────────────────────────────────────────────────────────────────────── +#------- ConfiguraΓ§Γ£o do RPATH/RUNPATH/LIBRARY_PATH -------> Ajustar rpath para encontrar as bibliotecas do Torch +set(CMAKE_BUILD_RPATH + "${PYTHON_SITE_PACKAGES}/*/d_libs/libtorch/cpu/lib" + "\$ORIGIN/purecpp.libs" + "\$ORIGIN/d_libs/libtorch/cpu/lib" + "${CMAKE_SOURCE_DIR}/libs/libtorch/cpu/lib" + "\$ORIGIN/libs/libtorch/cpu/lib" + 
"\$ORIGIN:/usr/bin/protoc" + "/usr/lib/x86_64-linux-gnu" + "/usr/lib64" + "\$ORIGIN/" + "\$ORIGIN/purecpp/d_libs/libtorch/cpu/lib" + + "${CMAKE_SOURCE_DIR}/../libs/libtorch/cpu/lib" + "\$ORIGIN/../libs/libtorch/cpu/lib" +) + +set(CMAKE_INSTALL_RPATH + "${PYTHON_SITE_PACKAGES}/*/d_libs/libtorch/cpu/lib" + "\$ORIGIN/purecpp.libs" + "\$ORIGIN/d_libs/libtorch/cpu/lib" + "${CMAKE_SOURCE_DIR}/libs/libtorch/cpu/lib" + "\$ORIGIN/libs/libtorch/cpu/lib" + "\$ORIGIN:/usr/bin/protoc" + "/usr/lib/x86_64-linux-gnu" + "/usr/lib64" + "\$ORIGIN/" + "\$ORIGIN/purecpp/d_libs/libtorch/cpu/lib" + + "${CMAKE_SOURCE_DIR}/../libs/libtorch/cpu/lib" + "\$ORIGIN/../libs/libtorch/cpu/lib" +) + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--enable-new-dtags") +set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + + +# ───────────────────────────────────────────────────────────────────────────── +#------- Tokenizers ------- +# ───────────────────────────────────────────────────────────────────────────── +set(TOKENIZERS_PATH "${CMAKE_SOURCE_DIR}/../libs/tokenizers-cpp") +add_subdirectory(${TOKENIZERS_PATH} tokenizers EXCLUDE_FROM_ALL) + +# ───────────────────────────────────────────────────────────────────────────── +#------- OpenAI C++ bindings ------- +# ───────────────────────────────────────────────────────────────────────────── +set(OPENAI_CPP_PATH "${CMAKE_SOURCE_DIR}/../libs/openai-cpp") + +# ───────────────────────────────────────────────────────────────────────────── +# ----- Project sources ----- +# ───────────────────────────────────────────────────────────────────────────── + +set(RagPUREAI_BINDING_SRCS binding.cpp) + +set(RagPUREAI_IMPL_SRCS + ${CMAKE_SOURCE_DIR}/../libs/StringUtils/StringUtils.cpp + ${CMAKE_SOURCE_DIR}/../components/DataLoader/BaseLoader.cpp + ${CMAKE_SOURCE_DIR}/../components/DataLoader/PDFLoader/PDFLoader.cpp + ${CMAKE_SOURCE_DIR}/../components/DataLoader/DOCXLoader/DOCXLoader.cpp + ${CMAKE_SOURCE_DIR}/../components/DataLoader/WebLoader/WebLoader.cpp + 
${CMAKE_SOURCE_DIR}/../components/DataLoader/TXTLoader/TXTLoader.cpp + + ${CMAKE_SOURCE_DIR}/../components/Embedding/BaseEmbedding.cpp + ${CMAKE_SOURCE_DIR}/../components/Embedding/EmbeddingOpenAI/EmbeddingOpenAI.cpp + + ${CMAKE_SOURCE_DIR}/../components/Chunk/ChunkCommons/ChunkCommons.cpp + ${CMAKE_SOURCE_DIR}/../components/Chunk/ChunkCount/ChunkCount.cpp + ${CMAKE_SOURCE_DIR}/../components/Chunk/ChunkDefault/ChunkDefault.cpp + ${CMAKE_SOURCE_DIR}/../components/Chunk/ChunkSimilarity/ChunkSimilarity.cpp + ${CMAKE_SOURCE_DIR}/../components/Chunk/ChunkQuery/ChunkQuery.cpp + + ${CMAKE_SOURCE_DIR}/../components/MetadataExtractor/MetadataExtractor.cpp + ${CMAKE_SOURCE_DIR}/../components/MetadataExtractor/MetadataRegexExtractor/MetadataRegexExtractor.cpp + ${CMAKE_SOURCE_DIR}/../components/MetadataExtractor/MetadataHFExtractor/MetadataHFExtractor.cpp + + ${CMAKE_SOURCE_DIR}/../components/CleanData/ContentCleaner/ContentCleaner.cpp + + ${CMAKE_SOURCE_DIR}/../components/FAISSBackend/faiss_backend.cpp +) + + +# ───────────────────────────────────────────────────────────────────────────── +#------- RagPUREAI Library ------- +# ───────────────────────────────────────────────────────────────────────────── + +add_library(RagPUREAILib STATIC ${RagPUREAI_IMPL_SRCS}) + +target_include_directories(RagPUREAILib PUBLIC #.h + ${CMAKE_SOURCE_DIR}/../libs/RagException + ${CMAKE_SOURCE_DIR}/../libs/ThreadSafeQueue + ${CMAKE_SOURCE_DIR}/../libs/CommonStructs + ${CMAKE_SOURCE_DIR}/../libs/StringUtils + ${CMAKE_SOURCE_DIR}/../libs/FileUtils + ${CMAKE_SOURCE_DIR}/../libs/MemoryUtils + + #C:/vcpkg/packages/protobuf_x64-windows/include + ${CURL_INCLUDE_DIRS} + ${OPENAI_CPP_PATH}/include + ${TORCH_INCLUDE_DIRS} + ${TOKENIZERS_PATH}/include + ${FAISS_INCLUDE_DIR} + + ${CMAKE_SOURCE_DIR}/../components + ${CMAKE_SOURCE_DIR}/../components/DataLoader + ${CMAKE_SOURCE_DIR}/../components/MetadataExtractor + ${CMAKE_SOURCE_DIR}/../components/Chunk + ${CMAKE_SOURCE_DIR}/../components/Chunk/ChunkCommons 
+ ${CMAKE_SOURCE_DIR}/../components/CleanData/ContentCleaner + ${CMAKE_SOURCE_DIR}/../components/Embedding/EmbeddingOpenAI + ${CMAKE_SOURCE_DIR}/../components/Embedding + ${CMAKE_SOURCE_DIR}/../components/FAISSBackend + ${CMAKE_SOURCE_DIR}/../libs/faiss/ +) + + +# ───────────────────────────────────────────────────────────────────────────── +# Link libraries with RagPUREAILib +# ───────────────────────────────────────────────────────────────────────────── +link_directories(${FAISS_LIB_DIR}) + +target_link_libraries(RagPUREAILib PUBLIC + pdfium::pdfium + icu::icu + miniz::miniz + rapidxml::rapidxml + beauty::beauty + #fmt::fmt + lexbor::lexbor_static + re2::re2 + protobuf::libprotobuf + onnxruntime::onnxruntime + OpenMP::OpenMP_CXX + Threads::Threads + CURL::libcurl + nlohmann_json::nlohmann_json + ${Python3_LIBRARIES} + ${TORCH_LIBRARIES} + tokenizers_cpp + faiss + # ${FAISS_LIB_DIR}/libfaiss.a +) + +# ───────────────────────────────────────────────────────────────────────────── +#---------------- Pybind11 Module +# ───────────────────────────────────────────────────────────────────────────── + +pybind11_add_module(RagPUREAI ${RagPUREAI_BINDING_SRCS}) + +target_link_libraries(RagPUREAI PRIVATE RagPUREAILib) + + +# ───────────────────────────────────────────────────────────────────────────── +#---------------- DEBUG MESSAGES +# ───────────────────────────────────────────────────────────────────────────── +#${CMAKE_SOURCE_DIR}) \ No newline at end of file diff --git a/src/binding.cpp b/src/binding.cpp index 33f0807..ee98299 100644 --- a/src/binding.cpp +++ b/src/binding.cpp @@ -31,7 +31,7 @@ #include "TXTLoader/TXTLoader.h" #include "WebLoader/WebLoader.h" -#include "ContentCleaner/ContentCleaner.h" +#include "ContentCleaner.h" #include "ChunkDefault/ChunkDefault.h" #include "ChunkCount/ChunkCount.h" @@ -39,7 +39,9 @@ #include "ChunkCommons/ChunkCommons.h" #include "ChunkQuery/ChunkQuery.h" -#include "../components/MetadataExtractor/Document.h" +#include 
"FAISSBackend/faiss_backend.h" + +#include "MetadataExtractor/Document.h" #include "IMetadataExtractor.h" #include "MetadataExtractor.h" #include "MetadataRegexExtractor/IMetadataRegexExtractor.h" @@ -47,17 +49,17 @@ #include "MetadataRegexExtractor/MetadataRegexExtractor.h" #include "MetadataHFExtractor/MetadataHFExtractor.h" -#include "../components/Embedding/Document.h" +#include "Embedding/Document.h" #include "IBaseEmbedding.h" #include "EmbeddingOpenAI/IEmbeddingOpenAI.h" #include "EmbeddingOpenAI/EmbeddingOpenAI.h" -#include "../components/Chat/Message/BaseMessage.h" -#include "../components/Chat/Message/HumanMessage.h" -#include "../components/Chat/Message/AIMessage.h" -#include "../components/Chat/Message/SystemMessage.h" -#include "../components/Chat/ChatHistory/ChatHistory.h" +// #include "Chat/Message/BaseMessage.h" +// #include "Chat/Message/HumanMessage.h" +// #include "Chat/Message/AIMessage.h" +// #include "Chat/Message/SystemMessage.h" +// #include "Chat/ChatHistory/ChatHistory.h" namespace py = pybind11; using namespace RAGLibrary; @@ -1326,56 +1328,100 @@ void bind_EmbeddingOpenAI(py::module &m) )doc"); } -// VectorDabase -void bind_VectorDB(pybind11::module_ &); - -// Trampoline class for BaseMessage -class PyBaseMessage : public purecpp::chat::BaseMessage { -public: - using purecpp::chat::BaseMessage::BaseMessage; // Inherit constructors - - std::string get_type() const override { - PYBIND11_OVERRIDE_PURE( - std::string, /* Return type */ - purecpp::chat::BaseMessage, /* Parent class */ - get_type /* Name of function */ - /* Arguments */ - ); - } - - std::string get_content() const override { - PYBIND11_OVERRIDE_PURE( - std::string, - purecpp::chat::BaseMessage, - get_content - ); - } -}; - -void bind_ChatClasses(py::module &m) { - py::class_>(m, "BaseMessage") - .def(py::init<>()) - .def_property_readonly("type", &purecpp::chat::BaseMessage::get_type) - .def_property_readonly("content", &purecpp::chat::BaseMessage::get_content); - - 
py::class_, purecpp::chat::BaseMessage>(m, "HumanMessage") - .def(py::init(), py::arg("content")); - - py::class_, purecpp::chat::BaseMessage>(m, "AIMessage") - .def(py::init(), py::arg("content")); - - py::class_, purecpp::chat::BaseMessage>(m, "SystemMessage") - .def(py::init(), py::arg("content")); - - py::class_(m, "ChatHistory") - .def(py::init<>()) - .def("add_message", static_cast&)>(&purecpp::chat::ChatHistory::add_message), py::arg("message")) - .def("add_messages", static_cast>&)>(&purecpp::chat::ChatHistory::add_message), py::arg("messages")) - .def_property_readonly("messages", &purecpp::chat::ChatHistory::get_messages) - .def("clear", &purecpp::chat::ChatHistory::clear) - .def("size", &purecpp::chat::ChatHistory::size) - .def("add_benchmark_messages_omp", &purecpp::chat::ChatHistory::add_benchmark_messages_omp, py::arg("num_messages")); +void bind_faiss_backend(py::module& m) { + py::class_(m, "PureResult") + .def_readonly("index", &faiss_backend::PureResult::indices) // user-friendly alias + .def_readonly("distances", &faiss_backend::PureResult::distances) + .def("__repr__", [](const faiss_backend::PureResult& self) { + std::ostringstream oss; + oss << "PureResult(index="; + oss << py::repr(py::cast(self.indices)); + oss << ", distances="; + oss << py::repr(py::cast(self.distances)); + oss << ")"; + return oss.str(); + }); + + m.def("PureL2", &faiss_backend::PureL2, + py::arg("query"), + py::arg("chunks"), + py::arg("pos"), + py::arg("k") = 1, + R"pbdoc( + Performs an exact L2 (Euclidean) similarity search using FAISS. + Returns the top-k most similar vectors from the database. + )pbdoc"); + + m.def("PureIP", &faiss_backend::PureIP, + py::arg("query"), + py::arg("chunks"), + py::arg("pos"), + py::arg("k") = 1, + R"pbdoc( + Performs a dot product similarity search using FAISS. + Suitable when the magnitude of vectors is meaningful. 
+ )pbdoc"); + + m.def("PureCosine", &faiss_backend::PureCosine, + py::arg("query"), + py::arg("chunks"), + py::arg("pos"), + py::arg("k") = 1, + R"pbdoc( + Performs a cosine similarity search using FAISS. + Internally normalizes all vectors and then uses inner product search. + )pbdoc"); } +// // VectorDabase +// void bind_VectorDB(pybind11::module_ &); + +// // Trampoline class for BaseMessage +// class PyBaseMessage : public purecpp::chat::BaseMessage { +// public: +// using purecpp::chat::BaseMessage::BaseMessage; // Inherit constructors + +// std::string get_type() const override { +// PYBIND11_OVERRIDE_PURE( +// std::string, /* Return type */ +// purecpp::chat::BaseMessage, /* Parent class */ +// get_type /* Name of function */ +// /* Arguments */ +// ); +// } + +// std::string get_content() const override { +// PYBIND11_OVERRIDE_PURE( +// std::string, +// purecpp::chat::BaseMessage, +// get_content +// ); +// } +// }; + +// void bind_ChatClasses(py::module &m) { +// py::class_>(m, "BaseMessage") +// .def(py::init<>()) +// .def_property_readonly("type", &purecpp::chat::BaseMessage::get_type) +// .def_property_readonly("content", &purecpp::chat::BaseMessage::get_content); + +// py::class_, purecpp::chat::BaseMessage>(m, "HumanMessage") +// .def(py::init(), py::arg("content")); + +// py::class_, purecpp::chat::BaseMessage>(m, "AIMessage") +// .def(py::init(), py::arg("content")); + +// py::class_, purecpp::chat::BaseMessage>(m, "SystemMessage") +// .def(py::init(), py::arg("content")); + +// py::class_(m, "ChatHistory") +// .def(py::init<>()) +// .def("add_message", static_cast&)>(&purecpp::chat::ChatHistory::add_message), py::arg("message")) +// .def("add_messages", static_cast>&)>(&purecpp::chat::ChatHistory::add_message), py::arg("messages")) +// .def_property_readonly("messages", &purecpp::chat::ChatHistory::get_messages) +// .def("clear", &purecpp::chat::ChatHistory::clear) +// .def("size", &purecpp::chat::ChatHistory::size) +// 
.def("add_benchmark_messages_omp", &purecpp::chat::ChatHistory::add_benchmark_messages_omp, py::arg("num_messages")); +// } //-------------------------------------------------------------------------- // Main module @@ -1415,8 +1461,7 @@ PYBIND11_MODULE(RagPUREAI, m) bind_IEmbeddingOpenAI(m); bind_EmbeddingOpenAI(m); - bind_ChatClasses(m); - - py::module_ vectorDB = m.def_submodule("vectorDB", "Bindings for vector database"); - bind_VectorDB(vectorDB); + // bind_ChatClasses(m); + // py::module_ vectorDB = m.def_submodule("vectorDB", "Bindings for vector database"); + // bind_VectorDB(vectorDB); } diff --git a/conanfile.py b/src/conanfile.py similarity index 100% rename from conanfile.py rename to src/conanfile.py