diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml new file mode 100644 index 00000000..028b2934 --- /dev/null +++ b/.github/workflows/build_wheels.yml @@ -0,0 +1,193 @@ +name: Build Wheels + +on: + push: + tags: ['v*'] # Build wheels on version tags + branches: [main, master] # Also build on direct pushes to main/master + paths: + - '**.py' + - '**.pyx' + - 'pyproject.toml' + - 'meson.build' + - '.github/workflows/build_wheels.yml' + pull_request: + branches: [main, master] + paths: + - '**.py' + - '**.pyx' + - 'pyproject.toml' + - 'meson.build' + - '.github/workflows/build_wheels.yml' + workflow_dispatch: + +jobs: + build_wheels: + name: Build wheels on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, ubuntu-24.04-arm, macos-15-intel, macos-15] + # ubuntu-latest: Linux x86_64 (native) + # ubuntu-24.04-arm: Linux aarch64 (native, no QEMU!) + # macos-15-intel: macOS Intel x86_64 (native) + # macos-15: macOS Apple Silicon arm64 (native) + + steps: + - uses: actions/checkout@v4 + + # Set up ccache for faster C++ compilation + - name: Set up ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ matrix.os }}-ccache + + - name: Build wheels + uses: pypa/cibuildwheel@v2.22 + env: + CIBW_BUILD_VERBOSITY: 1 + # Build only native architecture (no cross-compilation or emulation) + CIBW_ARCHS: native + + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-${{ matrix.os }}-${{ strategy.job-index }} + path: ./wheelhouse/*.whl + retention-days: 7 + + build_sdist: + name: Build source distribution + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Build sdist + run: pipx run build --sdist + + - name: Upload sdist + uses: actions/upload-artifact@v4 + with: + name: sdist + path: dist/*.tar.gz + retention-days: 7 + + test_wheels: + name: Test wheels + needs: [build_wheels] + # Run on: version tags, manual triggers, and re-runs 
(but NOT on regular branch pushes/PRs) + if: github.event_name == 'workflow_dispatch' || github.run_attempt > 1 || (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, ubuntu-24.04-arm, macos-15-intel, macos-15] + python: ['3.9', '3.12'] # Test oldest and newest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + + - name: Download wheels + uses: actions/download-artifact@v4 + with: + pattern: wheels-* + merge-multiple: true + path: wheelhouse + + - name: Install wheel and dependencies + shell: bash + run: | + pip install --find-links wheelhouse libact + pip install numpy scipy scikit-learn matplotlib + + - name: Test import and functionality + shell: bash + run: | + python -c "import libact; print(f'libact version: {libact.__version__}')" + python -c " + import libact.query_strategies as qs + + # Test basic imports + assert hasattr(qs, 'UncertaintySampling'), 'UncertaintySampling not found' + assert hasattr(qs, 'RandomSampling'), 'RandomSampling not found' + print('✓ Basic query strategies imported successfully') + + # Test optional C-extension modules (may not be available if BLAS/LAPACK not found during build) + has_hintsvm = hasattr(qs, 'HintSVM') + has_variance_reduction = hasattr(qs, 'VarianceReduction') + + if has_hintsvm and has_variance_reduction: + # Try to actually import them to verify they work + from libact.query_strategies import HintSVM, VarianceReduction + print('✓ All C-extension modules (HintSVM, VarianceReduction) available and working') + else: + missing = [] + if not has_hintsvm: + missing.append('HintSVM') + if not has_variance_reduction: + missing.append('VarianceReduction') + print(f'⚠ Warning: Optional C-extensions not available: {missing}') + print(' (Wheels were built without BLAS/LAPACK support)') + " + + upload_testpypi: + name: Upload to TestPyPI + needs: 
[build_wheels, build_sdist, test_wheels] + runs-on: ubuntu-latest + if: github.event_name == 'workflow_dispatch' || github.event_name == 'push' + environment: + name: testpypi + url: https://test.pypi.org/p/libact + permissions: + id-token: write + + steps: + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + pattern: wheels-* + merge-multiple: true + path: dist + + - name: Download sdist + uses: actions/download-artifact@v4 + with: + name: sdist + path: dist + + - name: Publish to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ + + # upload_pypi: + # name: Upload to PyPI + # needs: [build_wheels, build_sdist, test_wheels] + # runs-on: ubuntu-latest + # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + # environment: + # name: pypi + # url: https://pypi.org/p/libact + # permissions: + # id-token: write + + # steps: + # - name: Download all artifacts + # uses: actions/download-artifact@v4 + # with: + # pattern: wheels-* + # merge-multiple: true + # path: dist + + # - name: Download sdist + # uses: actions/download-artifact@v4 + # with: + # name: sdist + # path: dist + + # - name: Publish to PyPI + # uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 5fa477d1..002eb4d9 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -3,24 +3,26 @@ name: Libact linting on: [push, pull_request] jobs: - build: - + lint: runs-on: ${{ matrix.os }} - continue-on-error: True strategy: fail-fast: false matrix: os: [ubuntu-latest] - python-version: [3.9] + python-version: ['3.9'] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - name: Install dependencies run: | python -m pip install 
--upgrade pip pip install pylint - - run: pylint libact + + - name: Run pylint (errors only) + run: pylint --errors-only libact diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d66ed96e..42319cca 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -3,24 +3,22 @@ name: Libact tests on: [push, pull_request] jobs: - build: - + test-with-blas: + name: Test with BLAS/LAPACK (full features) runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - # os: [ubuntu-latest, macos-latest] - # python-version: [2.7, 3.6, 3.7, 3.8, 3.9] os: [ubuntu-22.04] - python-version: [3.9, 3.10, 3.11] + python-version: ['3.9', '3.10', '3.11', '3.12'] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Install system dependencies run: | if [ "$RUNNER_OS" = "macOS" ]; then brew update @@ -29,18 +27,117 @@ jobs: echo "backend: TkAgg" >> ~/.matplotlib/matplotlibrc else sudo apt-get update -qq - sudo apt-get install -y build-essential gfortran libatlas-base-dev liblapacke-dev + sudo apt-get install -y build-essential gfortran libopenblas-dev liblapacke-dev pkg-config sudo apt-get install -y python3-dev fi + - name: Install Python dependencies + run: | python -m pip install --upgrade pip pip install pylint coverage codecov - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Install Libact + - name: Install build tools + run: | + pip install meson-python meson ninja cython numpy + - name: Install libact in editable mode run: | - ./setup.py build_ext --inplace - - name: Unittests + pip install --no-build-isolation -e . 
2>&1 | tee build.log + - name: Verify optional features were built + run: | + # Check build log for feature messages + if grep -q "Building VarianceReduction" build.log; then + echo "✓ VarianceReduction feature was built" + else + echo "✗ ERROR: VarianceReduction feature was NOT built (expected with BLAS/LAPACK)" + exit 1 + fi + if grep -q "Building HintSVM" build.log; then + echo "✓ HintSVM feature was built" + else + echo "✗ ERROR: HintSVM feature was NOT built (expected with BLAS/LAPACK)" + exit 1 + fi + # Verify the compiled modules are importable + python -c "from libact.query_strategies._variance_reduction import *; print('✓ _variance_reduction module imports successfully')" + python -c "from libact.query_strategies._hintsvm import *; print('✓ _hintsvm module imports successfully')" + - name: Run unittests run: | python -m unittest -v - - run: coverage run --source libact --omit */tests/* setup.py test - - run: coverage report - - run: codecov + - name: Run coverage + run: | + coverage run --source libact --omit */tests/* -m unittest + - name: Report coverage + run: | + coverage report + - name: Upload coverage to Codecov + run: | + codecov + + test-without-blas: + name: Test without BLAS/LAPACK (minimal install) + runs-on: ubuntu-22.04 + strategy: + fail-fast: false + matrix: + python-version: ['3.11'] # Test with one Python version to verify minimal install + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install minimal system dependencies + run: | + sudo apt-get update -qq + sudo apt-get install -y build-essential python3-dev + # Intentionally NOT installing BLAS/LAPACK to test fallback behavior + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + - name: Install build tools + run: | + pip install meson-python meson ninja cython numpy + - name: Install libact (should skip 
variance_reduction and hintsvm) + run: | + pip install --no-build-isolation -e . 2>&1 | tee install.log + - name: Verify optional features were skipped + run: | + # Verify warning message appears + if grep -q "BLAS/LAPACK libraries not found" install.log; then + echo "✓ BLAS/LAPACK warning message found (as expected)" + else + echo "✗ WARNING: Expected BLAS/LAPACK warning message not found" + fi + # Verify features were skipped + if grep -q "Skipping VarianceReduction" install.log; then + echo "✓ VarianceReduction was correctly skipped" + else + echo "✗ ERROR: VarianceReduction skip message not found" + exit 1 + fi + if grep -q "Skipping HintSVM" install.log; then + echo "✓ HintSVM was correctly skipped" + else + echo "✗ ERROR: HintSVM skip message not found" + exit 1 + fi + # Verify the modules don't exist (shouldn't be importable) + python -c "try: + from libact.query_strategies._variance_reduction import * + print('✗ ERROR: _variance_reduction should not be importable without BLAS') + exit(1) +except ImportError: + print('✓ _variance_reduction correctly not available (as expected)')" || exit 1 + python -c "try: + from libact.query_strategies._hintsvm import * + print('✗ ERROR: _hintsvm should not be importable without BLAS') + exit(1) +except ImportError: + print('✓ _hintsvm correctly not available (as expected)')" || exit 1 + - name: Test basic import + run: | + python -c "from libact.base.dataset import Dataset; print('SUCCESS: Basic import works without BLAS/LAPACK')" + - name: Run basic unittests + run: | + # Run tests but don't fail on tests that require variance_reduction or hintsvm + python -m unittest discover -s libact/base/tests -v || true + python -m unittest discover -s libact/labelers/tests -v || true diff --git a/.gitignore b/.gitignore index 3c8ea412..11cecead 100644 --- a/.gitignore +++ b/.gitignore @@ -62,4 +62,15 @@ examples/australian.txt examples/diabetes.txt examples/heart.txt +# this file is automatically generated during build 
libact/query_strategies/_hintsvm.c + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml +.pdm-python +.pdm-build/ diff --git a/.pylintrc b/.pylintrc index cb53b843..f79d9190 100644 --- a/.pylintrc +++ b/.pylintrc @@ -28,9 +28,6 @@ callbacks=cb_,_cb [BASIC] -# List of builtins function names that should not be used, separated by a comma -bad-functions=map,filter - # Good variable names which should always be accepted, separated by a comma # x, y, X, Y - vector maxtrix of features and labels. # P, p - probability distribution @@ -43,16 +40,10 @@ good-names=i,j,k,_,X,Y,x,y,P,p,qs,w,W,N,T,K # Regular expression matching correct attribute names attr-rgx=[a-z_][a-z0-9_]{2,30}$ -# Naming hint for attribute names -attr-name-hint=[a-z_][a-z0-9_]{2,30}$ - # Regular expression matching correct variable names # start with X imply its a matrix for features variable-rgx=[a-z_X][a-z0-9_]{2,30}$ -# Naming hint for variable names -variable-name-hint=[a-z_X][a-z0-9_]{2,30}$ - # Regular expression which should only match function or class names that do # not require a docstring. no-docstring-rgx=^_ @@ -86,12 +77,6 @@ generated-members= # Maximum number of characters on a single line. max-line-length=120 -# List of optional constructs for which whitespace checking is disabled. `dict- -# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. -# `trailing-comma` allows a space between comma and closing bracket: (a, ). -# `empty-line` allows space-only lines. 
-no-space-check=trailing-comma,dict-separator - # Maximum number of lines in a module max-module-lines=1000 diff --git a/README.md b/README.md index 379ddb98..4d45839e 100644 --- a/README.md +++ b/README.md @@ -1,93 +1,189 @@ # libact: Pool-based Active Learning in Python -authors: [Yao-Yuan Yang](http://yyyang.me), Shao-Chuan Lee, Yu-An Chung, Tung-En Wu, Si-An Chen, [Hsuan-Tien Lin](http://www.csie.ntu.edu.tw/~htlin) +Authors: [Yao-Yuan Yang](http://yyyang.me), Shao-Chuan Lee, Yu-An Chung, Tung-En Wu, Si-An Chen, [Hsuan-Tien Lin](http://www.csie.ntu.edu.tw/~htlin) -[![Build Status](https://travis-ci.org/ntucllab/libact.svg)](https://travis-ci.org/ntucllab/libact) +Contributors: Zheng-Yu (Josh) Huang, Po-Yi Lu + +[![Build Status](https://github.com/ntucllab/libact/actions/workflows/tests.yml/badge.svg)](https://github.com/ntucllab/libact/actions/workflows/tests.yml) [![Documentation Status](https://readthedocs.org/projects/libact/badge/?version=latest)](http://libact.readthedocs.org/en/latest/?badge=latest) [![PyPI version](https://badge.fury.io/py/libact.svg)](https://badge.fury.io/py/libact) [![codecov.io](https://codecov.io/github/ntucllab/libact/coverage.svg?branch=master)](https://codecov.io/github/ntucllab/libact?branch=master) -# Introduction +## Introduction `libact` is a Python package designed to make active learning easier for real-world users. The package not only implements several popular active learning strategies, but also features the [active-learning-by-learning](http://www.csie.ntu.edu.tw/~htlin/paper/doc/aaai15albl.pdf) meta-algorithm that assists the users to automatically select the best strategy on the fly. Furthermore, the package provides a unified interface for implementing more strategies, models and application-specific labelers. The package is open-source along with issue trackers on github, and can be easily installed from Python Package Index repository. 
-# Documentation +## Documentation The technical report associated with the package is on [arXiv](https://arxiv.org/abs/1710.00379), and the documentation for the latest release is available on [readthedocs](http://libact.readthedocs.org/en/latest/). Comments and questions on the package is welcomed at `libact-users@googlegroups.com`. All contributions to the documentation are greatly appreciated! -# Basic Dependencies +## Basic Dependencies + +- Python 3.9, 3.10, 3.11, 3.12 + +- Python dependencies (automatically installed with pip): + - numpy >= 2 + - scipy >= 1.13 + - scikit-learn >= 1.6 + - matplotlib >= 3.8 + - joblib -* Python 3.9, 3.10, 3.11 - * _Note._ We will soon release Python 2.7, 3.3, 3.4, 3.5, and 3.6 installations in the new branch. +### BLAS/LAPACKE Dependencies + +- Debian (>= 7) / Ubuntu (>= 14.04) -* Debian (>= 7) / Ubuntu (>= 14.04) ``` sudo apt-get install build-essential gfortran libatlas-base-dev liblapacke-dev python3-dev ``` -* Python dependencies -``` -pip install -r requirements.txt -``` +- Arch Linux -* Arch ``` sudo pacman -S lapacke ``` -* macOS +- macOS ``` brew install openblas ``` -# Installation +- Others: refer to the BLAS/LAPACKE installation guides. -After resolving the dependencies, you may install the package via pip (for all users): -``` -sudo pip install libact -``` +## Installation -or pip install in home directory: -``` -pip install --user libact -``` +- Install the official release (from PyPI): -or pip install from github repository for latest source: +```shell +pip install libact ``` + +> **Note:** For Windows users, it is recommended to use **Windows Subsystem for Linux (WSL)** as the primary environment for installing and running `libact`. 
+ +- Install the latest development version + +```shell pip install git+https://github.com/ntucllab/libact.git ``` -To build and install from souce in your home directory: -``` -python setup.py install --user +## Build Options + +This package supports the following build options: + +- `blas`: BLAS library to use (default='auto'). Options: `auto`, `openblas`, `Accelerate`, `mkl`, `lapack`, `blis`. +- `lapack`: LAPACK library to use (default=`auto`). Options: `auto`, `openblas`, `Accelerate`, `mkl`, `lapack`, `blis`. +- `variance_reduction`: Build variance reduction module (default: true) +- `hintsvm`: Build hintsvm module (default: true) + +### Examples + +To install `libact` with the default configuration, run: + +```shell +pip install libact ``` -To build and install from souce for all users on Unix/Linux: +Install without optional modules: -**(This is the recommended method for Python 3.10 users)** +```shell +pip install libact --config-settings=setup-args="-Dvariance_reduction=false" \ + --config-settings=setup-args="-Dhintsvm=false" ``` -pip install -e . + +## Build from Source + +### Overview + +This project utilizes `meson` and `meson-python` as the build backend. To build from source, ensure you have the aforementioned dependencies installed on your system. The building procedure additionally requires the following dependencies: + +- `meson-python` +- `ninja` +- `cython` +- `numpy` + +### The Recommended Approach (Using Bootstrapped Environment Config) + +To simplify the environment setup, we provide a pre-configured `environment.yml` located at the root directory of the project. Install with `conda/mamba` to get a head start. + +```shell +# Clone the repository +git clone https://github.com/ntucllab/libact.git +cd libact + +# Create and activate conda environment +conda env create -f environment.yml +conda activate libact + +# Install in development mode +pip install --no-build-isolation -e . + +# Or build and install +pip install --no-build-isolation . 
``` -## Installation Options +### Regular Install (Recommended for Users) + +For regular usage (not development), simply install from PyPI or from a local clone: -- `LIBACT_BUILD_HINTSVM`: set this variable to 1 if you would like to build - hintsvm c-extension. If set to 0, you will not be able to use the HintSVM - query strategy. Default=1. -- `LIBACT_BUILD_VARIANCE_REDUCTION`: set this variable to 1 if you would like to - build variance reduction c-extension. If set to 0, you will not be able to use - the VarianceReduction query strategy. Default=1. +```shell +# From PyPI +pip install libact -Example: +# From local clone +pip install . ``` -LIBACT_BUILD_HINTSVM=1 pip install git+https://github.com/ntucllab/libact.git + +Regular installs do **not** require build tools at runtime and will work without any additional dependencies. + +### Editable/Development Install (Recommended Method) + +Editable installs with meson-python automatically rebuild compiled components when you import the package. To ensure build tools are available, use `--no-build-isolation`: + +```shell +# First install build dependencies in your environment +pip install meson-python meson ninja cython numpy + +# Then install in editable mode without build isolation +pip install --no-build-isolation -e . ``` -# Usage +This ensures that `ninja`, `meson`, and other build tools remain available in your environment for rebuilds. 
+ +**Troubleshooting:** If you get errors about missing `ninja` or build tools when importing libact: +- You may have installed in editable mode with build isolation (which is not recommended) +- Solution: Reinstall using the method above, OR use a regular install: `pip install .` + +## Available Query Strategies + +| Strategy | Type | Description | +|----------|------|-------------| +| `UncertaintySampling` | Exploitation | Selects samples where the model is least confident | +| `EpsilonUncertaintySampling` | Exploration + Exploitation | ε-greedy: random with prob ε, uncertainty sampling otherwise | +| `CoreSet` | Diversity | k-Center Greedy, selects the point farthest from labeled set | +| `BALD` | Epistemic Uncertainty | Bayesian Active Learning by Disagreement via ensemble (mutual information) | +| `InformationDensity` | Representativeness | Density-weighted uncertainty — avoids querying outliers | +| `QueryByCommittee` | Disagreement | Committee of models votes on most informative samples | +| `QUIRE` | Informativeness + Representativeness | Combines uncertainty and density | +| `RandomSampling` | Baseline | Uniform random selection | +| `ActiveLearningByLearning` | Meta-algorithm | Multi-armed bandit that selects the best strategy on the fly | +| `VarianceReduction` | Variance | Minimizes output variance (requires C extension) | +| `HintSVM` | SVM-based | SVM-guided active learning (requires C extension) | +| `DensityWeightedMeta` | Density | Weights informativeness by density | +| `DWUS` | Density + Uncertainty | Density-weighted uncertainty sampling | + +## Available Models + +| Model | Description | +|-------|-------------| +| `LogisticRegression` | Logistic regression with probability output | +| `SVM` | Support Vector Machine classifier | +| `Perceptron` | Perceptron classifier | +| `SklearnAdapter` / `SklearnProbaAdapter` | Wraps any scikit-learn estimator for use with libact | + +## Usage The main usage of `libact` is as follows: @@ -100,6 +196,50 @@ 
lb = lbr.label(X[ask_id]) # query the label of unlabeled data from labeler insta trn_ds.update(ask_id, lb) # update the dataset with newly queried data ``` +### Using CoreSet, BALD, and InformationDensity Strategies + +```python +from libact.query_strategies import ( + CoreSet, + BALD, + InformationDensity, + ActiveLearningByLearning, +) +from libact.models import LogisticRegression + +# Core-Set (diversity-based, farthest-from-labeled) +qs = CoreSet(dataset) + +# BALD (epistemic uncertainty via ensemble disagreement) +qs = BALD(dataset, models=[ + LogisticRegression(C=0.1), + LogisticRegression(C=1.0), + LogisticRegression(C=10.0), +]) + +# Information Density (uncertainty weighted by representativeness) +qs = InformationDensity(dataset, model=LogisticRegression(), method='entropy') + +# With stronger density preference (beta=2) and cosine similarity +qs = InformationDensity(dataset, model=LogisticRegression(), + method='entropy', metric='cosine', beta=2.0) + +# ALBL with all three strategies combined +qs = ActiveLearningByLearning( + dataset, + query_strategies=[ + CoreSet(dataset), + BALD(dataset, models=[...]), + InformationDensity(dataset, model=LogisticRegression()), + ], + T=quota, + uniform_sampler=True, + model=model +) +``` + +## Examples + Some examples are available under the `examples` directory. Before running, use `examples/get_dataset.py` to retrieve the dataset used by the examples. @@ -112,12 +252,14 @@ Available examples: that you want a human to label the selected sample for your algorithm. - [albl_plot](examples/albl_plot.py): This example compares the performance of ALBL with other active learning algorithms. + - [albl_new_strategies_benchmark](examples/albl_new_strategies_benchmark.py): Benchmarks + CoreSet, BALD, and InformationDensity query strategies individually and combined via ALBL. - [multilabel_plot](examples/multilabel_plot.py): This example compares the performance of algorithms under multilabel setting. 
- [alce_plot](examples/alce_plot.py): This example compares the performance of algorithms under cost-sensitive multi-class setting. -# Running tests +## Running tests To run the test suite: @@ -138,7 +280,8 @@ python -m coverage run --source libact --omit */tests/* -m unittest python -m coverage report ``` -# Citing +## Citing + If you find this package useful, please cite the original works (see Reference of each strategy) as well as the following ``` @@ -153,7 +296,8 @@ If you find this package useful, please cite the original works (see Reference o } ``` - -# Acknowledgments +## Acknowledgments The authors thank Chih-Wei Chang and other members of the [Computational Learning Lab](https://learner.csie.ntu.edu.tw/) at National Taiwan University for valuable discussions and various contributions to making this package better. + + diff --git a/docs/active_learning_by_learning.rst b/docs/active_learning_by_learning.rst index bc5187b4..a5fdaf84 100644 --- a/docs/active_learning_by_learning.rst +++ b/docs/active_learning_by_learning.rst @@ -48,3 +48,49 @@ ALBL combines the result of these query strategies and generate its own suggestion of which sample to query. ALBL will adaptively *learn* from each of the decision it made, using the given supervised learning model in :code:`model` parameter. + +Using Diverse Strategy Signals +------------------------------ + +ALBL works best when its component strategies provide orthogonal signals. +The following example combines exploitation (uncertainty), exploration +(diversity), and representativeness into a single ALBL instance: + +.. 
code-block:: python + :linenos: + + from libact.query_strategies import ActiveLearningByLearning + from libact.query_strategies import UncertaintySampling + from libact.query_strategies import CoreSet + from libact.query_strategies import BALD + from libact.query_strategies import InformationDensity + from libact.models import LogisticRegression + + model = LogisticRegression() + + qs = ActiveLearningByLearning( + dataset, + T=quota, + query_strategies=[ + UncertaintySampling(dataset, model=LogisticRegression(C=1.)), + CoreSet(dataset), + BALD(dataset, models=[ + LogisticRegression(C=0.1), + LogisticRegression(C=1.0), + LogisticRegression(C=10.0), + ]), + InformationDensity(dataset, model=LogisticRegression()), + ], + model=model, + uniform_sampler=True + ) + +The strategies provide complementary signals: + +- **UncertaintySampling**: exploits model confidence boundaries +- **CoreSet**: explores by maximizing geometric coverage (farthest-from-labeled) +- **BALD**: measures epistemic uncertainty via ensemble disagreement +- **InformationDensity**: balances uncertainty with representativeness to avoid outliers + +ALBL's multi-armed bandit mechanism will learn which signals are most useful for +the given problem and adapt its selection accordingly. diff --git a/docs/conf.py b/docs/conf.py index 6029b7bc..e02ecc60 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -86,16 +86,16 @@ # built documents. # # The short X.Y version. -version = '0.1.4' +version = '0.2.0' # The full version, including alpha/beta/rc tags. -release = '0.1.4' +release = '0.2.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. 
-language = None +language = 'en' # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: @@ -140,9 +140,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. #html_theme = 'alabaster' -import sphinx_rtd_theme html_theme = "sphinx_rtd_theme" -html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -171,7 +169,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = [] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied @@ -319,7 +317,7 @@ # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'https://docs.python.org/': None} +intersphinx_mapping = {'python': ('https://docs.python.org/3', None)} # Skip private members in this project, which start with an underscore (_). diff --git a/docs/libact.models.rst b/docs/libact.models.rst index e69a088a..f0cb1095 100644 --- a/docs/libact.models.rst +++ b/docs/libact.models.rst @@ -40,7 +40,6 @@ libact.models.svm module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/libact.query_strategies.rst b/docs/libact.query_strategies.rst index 3d8621f2..7d01f0e6 100644 --- a/docs/libact.query_strategies.rst +++ b/docs/libact.query_strategies.rst @@ -67,6 +67,38 @@ libact.query_strategies.uncertainty_sampling module :undoc-members: :show-inheritance: +libact.query_strategies.bald module +------------------------------------ + +.. 
automodule:: libact.query_strategies.bald + :members: + :undoc-members: + :show-inheritance: + +libact.query_strategies.coreset module +-------------------------------------- + +.. automodule:: libact.query_strategies.coreset + :members: + :undoc-members: + :show-inheritance: + +libact.query_strategies.epsilon_uncertainty_sampling module +----------------------------------------------------------- + +.. automodule:: libact.query_strategies.epsilon_uncertainty_sampling + :members: + :undoc-members: + :show-inheritance: + +libact.query_strategies.information_density module +-------------------------------------------------- + +.. automodule:: libact.query_strategies.information_density + :members: + :undoc-members: + :show-inheritance: + libact.query_strategies.variance_reduction module ------------------------------------------------- diff --git a/docs/overview.rst b/docs/overview.rst index 34fd9c0f..6575d019 100644 --- a/docs/overview.rst +++ b/docs/overview.rst @@ -38,8 +38,11 @@ Currently, the following active learning algorithms are supported: - Binary Classification + - BALD - Bayesian Active Learning by Disagreement (bald.py) + - Core-Set - k-Center Greedy (coreset.py) - Density Weighted Uncertainty Sampling (density_weighted_uncertainty_sampling.py) - Hinted Sampling with SVM (hintsvm.py) + - Information Density (information_density.py) - Query By Committee (query_by_committee.py) - Querying Informative and Representative Examples (quire.py) - Random Sampling (random_sampling.py) diff --git a/environment.yml b/environment.yml new file mode 100644 index 00000000..489e057c --- /dev/null +++ b/environment.yml @@ -0,0 +1,15 @@ +name: libact +channels: + - conda-forge +dependencies: + - python=3.12 + - compilers + - openblas + - meson-python + - ninja + - pkg-config + - pip + - ca-certificates + - openssl + - cython + - numpy diff --git a/examples/albl_new_strategies_benchmark.py b/examples/albl_new_strategies_benchmark.py new file mode 100644 index 
00000000..ed12748e --- /dev/null +++ b/examples/albl_new_strategies_benchmark.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python3 +""" +Benchmark script demonstrating ALBL with CoreSet, BALD, and InformationDensity. + +This script compares the performance of ALBL using the query strategies: +- CoreSet: Diversity via k-Center Greedy (farthest from labeled set) +- BALD: Epistemic uncertainty via ensemble disagreement +- InformationDensity: Density-weighted uncertainty (avoids querying outliers) + +Usage: + python albl_new_strategies_benchmark.py + +Requirements: + - scikit-learn + - numpy + - matplotlib (for plotting) +""" + +import os + +import numpy as np +import matplotlib.pyplot as plt +from sklearn.datasets import make_classification +from sklearn.model_selection import train_test_split + +# libact imports +from libact.base.dataset import Dataset +from libact.models import LogisticRegression +from libact.query_strategies import ( + ActiveLearningByLearning, + UncertaintySampling, + RandomSampling, + CoreSet, + BALD, + InformationDensity, +) +from libact.labelers import IdealLabeler + + +def create_synthetic_dataset(n_samples=500, n_features=20, n_informative=10, + n_redundant=5, n_classes=2, random_state=42): + """Create a synthetic classification dataset.""" + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_informative=n_informative, + n_redundant=n_redundant, + n_classes=n_classes, + random_state=random_state, + flip_y=0.1 # Add some label noise + ) + return X, y + + +def run_experiment(trn_ds, tst_ds, labeler, model, qs, quota): + """Run active learning experiment and return error curves.""" + E_in, E_out = [], [] + + for _ in range(quota): + try: + ask_id = qs.make_query() + except ValueError: + # No more unlabeled samples or budget exhausted + break + + label = labeler.label(trn_ds.data[ask_id][0]) + trn_ds.update(ask_id, label) + + model.train(trn_ds) + E_in.append(1 - model.score(trn_ds)) + E_out.append(1 - model.score(tst_ds)) + 
+    return np.array(E_in), np.array(E_out)
+
+
+def main():
+    print("=" * 60)
+    print("ALBL Benchmark with CoreSet, BALD & InformationDensity")
+    print("=" * 60)
+
+    # Parameters
+    n_labeled = 10  # Initial labeled samples
+    quota = 100  # Number of queries to make
+    n_repeats = 5  # Number of experiment repetitions
+    random_state = 42
+
+    # Create dataset
+    print("\nCreating synthetic dataset...")
+    X, y = create_synthetic_dataset(n_samples=500, random_state=random_state)
+
+    results = {
+        'Random': [],
+        'UncertaintySampling': [],
+        'CoreSet': [],
+        'BALD': [],
+        'InformationDensity': [],
+        'ALBL (All)': [],
+    }
+
+    for rep in range(n_repeats):
+        print(f"\nRepetition {rep + 1}/{n_repeats}")
+
+        # Split data
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.3, random_state=random_state + rep
+        )
+
+        # Ensure initial labels have both classes. Bump the seed on every
+        # attempt: re-splitting with a fixed seed would produce the exact
+        # same split each iteration and loop forever if that split also
+        # lacks a class among the first n_labeled samples.
+        reseed = random_state + rep + 100
+        while len(np.unique(y_train[:n_labeled])) < 2:
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y, test_size=0.3, random_state=reseed
+            )
+            reseed += 1
+
+        # Create datasets
+        def make_trn_ds():
+            labels = list(y_train[:n_labeled]) + [None] * (len(y_train) - n_labeled)
+            return Dataset(X_train, labels)
+
+        tst_ds = Dataset(X_test, y_test)
+        fully_labeled = Dataset(X_train, y_train)
+        labeler = IdealLabeler(fully_labeled)
+
+        # Model factory
+        def make_model():
+            return LogisticRegression(solver='liblinear')
+
+        # 1. Random Sampling
+        print("  Running Random Sampling...")
+        trn_ds = make_trn_ds()
+        qs = RandomSampling(trn_ds, random_state=random_state)
+        _, E_out = run_experiment(trn_ds, tst_ds, labeler, make_model(), qs, quota)
+        results['Random'].append(E_out)
+
+        # 2. Uncertainty Sampling
+        print("  Running Uncertainty Sampling...")
+        trn_ds = make_trn_ds()
+        qs = UncertaintySampling(trn_ds, model=make_model(), method='lc')
+        _, E_out = run_experiment(trn_ds, tst_ds, labeler, make_model(), qs, quota)
+        results['UncertaintySampling'].append(E_out)
+
+        # 3.
CoreSet + print(" Running CoreSet...") + trn_ds = make_trn_ds() + qs = CoreSet(trn_ds, random_state=random_state) + _, E_out = run_experiment(trn_ds, tst_ds, labeler, make_model(), qs, quota) + results['CoreSet'].append(E_out) + + # 4. BALD + print(" Running BALD...") + trn_ds = make_trn_ds() + qs = BALD( + trn_ds, + models=[ + LogisticRegression(solver='liblinear', C=0.1), + LogisticRegression(solver='liblinear', C=1.0), + LogisticRegression(solver='liblinear', C=10.0), + ], + random_state=random_state + ) + _, E_out = run_experiment(trn_ds, tst_ds, labeler, make_model(), qs, quota) + results['BALD'].append(E_out) + + # 5. InformationDensity + print(" Running InformationDensity...") + trn_ds = make_trn_ds() + qs = InformationDensity( + trn_ds, + model=make_model(), + method='entropy', + random_state=random_state + ) + _, E_out = run_experiment(trn_ds, tst_ds, labeler, make_model(), qs, quota) + results['InformationDensity'].append(E_out) + + # 6. ALBL with all strategies combined + print(" Running ALBL with all strategies...") + trn_ds = make_trn_ds() + qs = ActiveLearningByLearning( + trn_ds, + query_strategies=[ + CoreSet(trn_ds, random_state=random_state), + BALD( + trn_ds, + models=[ + LogisticRegression(solver='liblinear', C=c) + for c in [0.1, 1.0, 10.0] + ], + random_state=random_state + ), + InformationDensity( + trn_ds, + model=make_model(), + method='entropy', + random_state=random_state + ), + ], + T=quota, + uniform_sampler=True, + model=make_model(), + random_state=random_state + ) + _, E_out = run_experiment(trn_ds, tst_ds, labeler, make_model(), qs, quota) + results['ALBL (All)'].append(E_out) + + # Compute mean results + print("\n" + "=" * 60) + print("Computing mean results...") + + mean_results = {} + for name, runs in results.items(): + # Pad shorter runs to same length + max_len = max(len(r) for r in runs) + padded = [np.pad(r, (0, max_len - len(r)), mode='edge') for r in runs] + mean_results[name] = np.mean(padded, axis=0) + + # Print final 
errors + print("\nFinal Test Error (mean over {} runs):".format(n_repeats)) + for name, errors in mean_results.items(): + print(f" {name}: {errors[-1]:.4f}") + + # Plot results + print("\nGenerating plot...") + plt.figure(figsize=(10, 6)) + + colors = { + 'Random': 'gray', + 'UncertaintySampling': 'blue', + 'CoreSet': 'orange', + 'BALD': 'red', + 'InformationDensity': 'green', + 'ALBL (All)': 'purple', + } + + for name, errors in mean_results.items(): + plt.plot(range(1, len(errors) + 1), errors, + label=name, color=colors[name], linewidth=2) + + plt.xlabel('Number of Queries', fontsize=12) + plt.ylabel('Test Error', fontsize=12) + plt.title('Active Learning Benchmark: CoreSet, BALD & InformationDensity', + fontsize=14) + plt.legend(loc='upper right') + plt.grid(True, alpha=0.3) + plt.tight_layout() + + output_path = os.path.join(os.path.dirname(__file__), 'albl_new_strategies_results.png') + plt.savefig(output_path, dpi=150) + print(f"Plot saved to: {output_path}") + plt.show() + + +if __name__ == '__main__': + main() diff --git a/libact/__init__.py b/libact/__init__.py index f359abf5..39c4d90c 100644 --- a/libact/__init__.py +++ b/libact/__init__.py @@ -15,4 +15,5 @@ """ +__version__ = "0.2.0" __all__ = ["base", "labelers", "models", "query_strategies"] diff --git a/libact/base/meson.build b/libact/base/meson.build new file mode 100644 index 00000000..0e7ed20b --- /dev/null +++ b/libact/base/meson.build @@ -0,0 +1,12 @@ +py_src = [ + '__init__.py', + 'dataset.py', + 'interfaces.py', +] + +py.install_sources( + py_src, + subdir: 'libact/base', +) + +subdir('tests') diff --git a/libact/base/tests/meson.build b/libact/base/tests/meson.build new file mode 100644 index 00000000..8777764a --- /dev/null +++ b/libact/base/tests/meson.build @@ -0,0 +1,9 @@ +py_src = [ + '__init__.py', + 'test_dataset.py', +] + +py.install_sources( + py_src, + subdir: 'libact/base/tests', +) diff --git a/libact/labelers/meson.build b/libact/labelers/meson.build new file mode 100644 index 
00000000..83b2f965 --- /dev/null +++ b/libact/labelers/meson.build @@ -0,0 +1,12 @@ +py_src = [ + '__init__.py', + 'ideal_labeler.py', + 'interactive_labeler.py', +] + +py.install_sources( + py_src, + subdir: 'libact/labelers/', +) + +subdir('tests') diff --git a/libact/labelers/tests/__init__.py b/libact/labelers/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/libact/labelers/tests/meson.build b/libact/labelers/tests/meson.build new file mode 100644 index 00000000..2442d74f --- /dev/null +++ b/libact/labelers/tests/meson.build @@ -0,0 +1,9 @@ +py_src = [ + '__init__.py', + 'test_labelers.py', +] + +py.install_sources( + py_src, + subdir: 'libact/labelers/tests', +) diff --git a/libact/meson.build b/libact/meson.build new file mode 100644 index 00000000..3ef63a94 --- /dev/null +++ b/libact/meson.build @@ -0,0 +1,8 @@ +py_src = ['__init__.py'] +py.install_sources(py_src, subdir: 'libact') + +subdir('base') +subdir('labelers') +subdir('models') +subdir('utils') +subdir('query_strategies') diff --git a/libact/models/meson.build b/libact/models/meson.build new file mode 100644 index 00000000..79b683aa --- /dev/null +++ b/libact/models/meson.build @@ -0,0 +1,15 @@ +py_src = [ + '__init__.py', + 'logistic_regression.py', + 'perceptron.py', + 'sklearn_adapter.py', + 'svm.py', +] + +py.install_sources( + py_src, + subdir: 'libact/models', +) + +subdir('multilabel') +subdir('tests') diff --git a/libact/models/multilabel/meson.build b/libact/models/multilabel/meson.build new file mode 100644 index 00000000..bd9842d4 --- /dev/null +++ b/libact/models/multilabel/meson.build @@ -0,0 +1,12 @@ +py_src = [ + '__init__.py', + 'binary_relevance.py', + 'dummy_clf.py', +] + +py.install_sources( + py_src, + subdir: 'libact/models/multilabel', +) + +subdir('tests') diff --git a/libact/models/multilabel/tests/meson.build b/libact/models/multilabel/tests/meson.build new file mode 100644 index 00000000..c0f4843d --- /dev/null +++ 
b/libact/models/multilabel/tests/meson.build @@ -0,0 +1,9 @@ +py_src = [ + '__init__.py', + 'test_binary_relevance.py', +] + +py.install_sources( + py_src, + subdir: 'libact/models/multilabel/tests', +) diff --git a/libact/models/tests/meson.build b/libact/models/tests/meson.build new file mode 100644 index 00000000..ee5dc427 --- /dev/null +++ b/libact/models/tests/meson.build @@ -0,0 +1,12 @@ +py_src = [ + '__init__.py', + 'test_logistic_regression.py', + 'test_perceptron.py', + 'test_sklearn_adapter.py', + 'test_svm.py', +] + +py.install_sources( + py_src, + subdir: 'libact/models/tests', +) diff --git a/libact/query_strategies/__init__.py b/libact/query_strategies/__init__.py index 04ec42de..7004cf77 100644 --- a/libact/query_strategies/__init__.py +++ b/libact/query_strategies/__init__.py @@ -14,6 +14,10 @@ from .quire import QUIRE from .random_sampling import RandomSampling from .density_weighted_uncertainty_sampling import DWUS +from .bald import BALD +from .coreset import CoreSet +from .epsilon_uncertainty_sampling import EpsilonUncertaintySampling +from .information_density import InformationDensity # don't import c extentions when on readthedocs server from .density_weighted_meta import DensityWeightedMeta if not ON_RTD: @@ -26,7 +30,7 @@ "LIBACT_BUILD_VARIANCE_REDUCTION=1 if intend to run " "VarianceReduction") try: - from libact.query_strategies._hintsvm import hintsvm_query + from ._hintsvm import hintsvm_query from .hintsvm import HintSVM except ModuleNotFoundError: LOGGER.warning("HintSVM C-extension not compiled. 
" @@ -38,8 +42,12 @@ __all__ = [ 'ActiveLearningByLearning', + 'BALD', + 'CoreSet', 'DWUS', + 'EpsilonUncertaintySampling', 'HintSVM', + 'InformationDensity', 'QUIRE', 'QueryByCommittee', 'RandomSampling', diff --git a/libact/query_strategies/bald.py b/libact/query_strategies/bald.py new file mode 100644 index 00000000..61702043 --- /dev/null +++ b/libact/query_strategies/bald.py @@ -0,0 +1,248 @@ +"""BALD (Bayesian Active Learning by Disagreement) + +This module implements BALD using an ensemble of models to approximate +Bayesian uncertainty estimation via mutual information. +""" +import logging + +import numpy as np + +from libact.base.dataset import Dataset +from libact.base.interfaces import QueryStrategy, ProbabilisticModel +from libact.utils import inherit_docstring_from, seed_random_state + +LOGGER = logging.getLogger(__name__) + + +class BALD(QueryStrategy): + """BALD (Bayesian Active Learning by Disagreement) Query Strategy + + This strategy implements Bayesian Active Learning by Disagreement (BALD) + using an ensemble of models. BALD selects samples that maximize mutual + information between predictions and model parameters, approximated here + using ensemble disagreement. + + BALD score: I[y; w | x, D] = H[y | x, D] - E_w[H[y | x, w]] + = H[mean(proba)] - mean(H[proba]) + + Where H is entropy, computed as -sum(p * log(p)). + + Parameters + ---------- + dataset : Dataset object + The dataset to query from. + + models : list of ProbabilisticModel instances, optional + Pre-initialized models to use as the ensemble. Each model must + implement predict_proba(). If provided, these models are used directly + (e.g., models with different hyperparameters). + + base_model : ProbabilisticModel instance, optional + A base model to clone for creating the ensemble via bootstrap bagging. + Required if `models` is not provided. Must have a `clone()` method. + + n_models : int, optional (default=10) + Number of models to create when using `base_model` with bagging. 
+ Ignored if `models` is provided. + + random_state : {int, np.random.RandomState instance, None}, optional (default=None) + Random state for reproducibility. + + Attributes + ---------- + models : list of ProbabilisticModel + The ensemble of models. + + random_state_ : np.random.RandomState instance + The random number generator. + + Examples + -------- + Using pre-initialized models with different hyperparameters: + + .. code-block:: python + + from libact.query_strategies import BALD + from libact.models import LogisticRegression + + qs = BALD( + dataset, + models=[ + LogisticRegression(C=0.1), + LogisticRegression(C=1.0), + LogisticRegression(C=10.0), + ] + ) + + Using bootstrap bagging with a base model: + + .. code-block:: python + + from libact.query_strategies import BALD + from libact.models import SklearnProbaAdapter + from sklearn.ensemble import RandomForestClassifier + + base = SklearnProbaAdapter(RandomForestClassifier(n_estimators=10)) + qs = BALD(dataset, base_model=base, n_models=10) + + References + ---------- + .. [1] Houlsby, Neil, et al. "Bayesian active learning for classification + and preference learning." arXiv preprint arXiv:1112.5745 (2011). + + .. [2] Gal, Yarin, Riashat Islam, and Zoubin Ghahramani. "Deep bayesian + active learning with image data." ICML 2017. 
+ """ + + def __init__(self, dataset, **kwargs): + super(BALD, self).__init__(dataset, **kwargs) + + models = kwargs.pop('models', None) + base_model = kwargs.pop('base_model', None) + self.n_models = kwargs.pop('n_models', 10) + + random_state = kwargs.pop('random_state', None) + self.random_state_ = seed_random_state(random_state) + + # Initialize ensemble + if models is not None: + # Use provided models directly + if not models: + raise ValueError("models list is empty") + for model in models: + if not isinstance(model, ProbabilisticModel): + raise TypeError( + "All models must be ProbabilisticModel instances" + ) + self.models = list(models) + self._base_model = None + elif base_model is not None: + # Create ensemble via cloning + if not isinstance(base_model, ProbabilisticModel): + raise TypeError("base_model must be a ProbabilisticModel") + if not hasattr(base_model, 'clone'): + raise TypeError("base_model must have a 'clone()' method") + self._base_model = base_model + self.models = [base_model.clone() for _ in range(self.n_models)] + else: + raise TypeError( + "__init__() requires either 'models' or 'base_model' argument" + ) + + # Train the ensemble + self._train_ensemble() + + def _entropy(self, proba): + """Calculate entropy of probability distributions. + + Parameters + ---------- + proba : array-like, shape (n_samples, n_classes) + Probability distributions. + + Returns + ------- + entropy : ndarray, shape (n_samples,) + Entropy for each sample. 
+ """ + # Clip to avoid log(0) + proba = np.clip(proba, 1e-10, 1.0) + return -np.sum(proba * np.log(proba), axis=1) + + def _labeled_uniform_sample(self, sample_size): + """Sample labeled entries uniformly for bootstrap bagging.""" + X, y = self.dataset.get_labeled_entries() + indices = self.random_state_.randint(0, len(X), size=sample_size) + return Dataset(X[indices], np.array(y)[indices]) + + def _train_ensemble(self): + """Train the ensemble using bootstrap bagging.""" + dataset = self.dataset + n_labeled = dataset.len_labeled() + + if n_labeled == 0: + LOGGER.warning("No labeled samples available for training") + return + + for model in self.models: + # Create bootstrap sample + bag = self._labeled_uniform_sample(int(n_labeled)) + # Ensure all classes are represented + max_attempts = 10 + attempts = 0 + while bag.get_num_of_labels() != dataset.get_num_of_labels(): + bag = self._labeled_uniform_sample(int(n_labeled)) + attempts += 1 + if attempts >= max_attempts: + LOGGER.warning( + "Could not create balanced bootstrap sample after " + f"{max_attempts} attempts, using current bag" + ) + break + model.train(bag) + + @inherit_docstring_from(QueryStrategy) + def update(self, entry_id, label): + # Retrain ensemble with the new labeled data + self._train_ensemble() + + @inherit_docstring_from(QueryStrategy) + def make_query(self): + dataset = self.dataset + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() + X_pool = np.asarray(X_pool) + + if len(unlabeled_entry_ids) == 0: + raise ValueError("No unlabeled samples available") + + # Get predictions from all models + all_proba = [] + for model in self.models: + proba = model.predict_proba(X_pool) + all_proba.append(np.asarray(proba)) + + all_proba = np.array(all_proba) # shape: (n_models, n_samples, n_classes) + + # Calculate BALD score: H[mean(P)] - mean(H[P]) + # Mean probability across ensemble + mean_proba = np.mean(all_proba, axis=0) # shape: (n_samples, n_classes) + + # Entropy of mean predictions 
(total uncertainty) + entropy_mean = self._entropy(mean_proba) # shape: (n_samples,) + + # Mean entropy across models (expected data uncertainty) + entropies = np.array([self._entropy(p) for p in all_proba]) # shape: (n_models, n_samples) + mean_entropy = np.mean(entropies, axis=0) # shape: (n_samples,) + + # BALD score = mutual information + bald_scores = entropy_mean - mean_entropy # shape: (n_samples,) + + # Select sample with highest BALD score (break ties randomly) + max_score = np.max(bald_scores) + candidates = np.where(np.isclose(bald_scores, max_score))[0] + selected_idx = self.random_state_.choice(candidates) + + return unlabeled_entry_ids[selected_idx] + + def _get_scores(self): + """Return BALD scores for all unlabeled samples.""" + dataset = self.dataset + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() + X_pool = np.asarray(X_pool) + + if len(unlabeled_entry_ids) == 0: + return [] + + # Get predictions from all models + all_proba = np.array([ + np.asarray(model.predict_proba(X_pool)) + for model in self.models + ]) + + mean_proba = np.mean(all_proba, axis=0) + entropy_mean = self._entropy(mean_proba) + entropies = np.array([self._entropy(p) for p in all_proba]) + mean_entropy = np.mean(entropies, axis=0) + bald_scores = entropy_mean - mean_entropy + + return list(zip(unlabeled_entry_ids, bald_scores)) diff --git a/libact/query_strategies/coreset.py b/libact/query_strategies/coreset.py new file mode 100644 index 00000000..eca967b4 --- /dev/null +++ b/libact/query_strategies/coreset.py @@ -0,0 +1,154 @@ +"""Core-Set (k-Center Greedy) Query Strategy + +This module implements the Core-Set approach for active learning, which selects +the unlabeled point farthest from all labeled points (greedy k-Center). 
+""" +import numpy as np +from scipy.spatial.distance import cdist + +from libact.base.interfaces import QueryStrategy +from libact.utils import inherit_docstring_from, seed_random_state + + +class CoreSet(QueryStrategy): + """Core-Set (k-Center Greedy) Query Strategy + + This strategy selects samples that maximize the minimum distance to any + already-labeled point. It greedily builds a coreset by always picking the + unlabeled point farthest from the current labeled set, ensuring geometric + coverage of the feature space. + + Parameters + ---------- + dataset : Dataset object + The dataset to query from. + + metric : str, optional (default='euclidean') + Distance metric passed to ``scipy.spatial.distance.cdist``. + Common options: 'euclidean', 'cosine', 'cityblock', 'minkowski'. + + transformer : object with transform method, optional (default=None) + Optional feature transformer (e.g., encoder, embedding model). + If provided, distances are computed in the transformed space. + Must have a ``transform(X)`` method. + + random_state : {int, np.random.RandomState instance, None}, optional (default=None) + Random state for tie-breaking reproducibility. + + Attributes + ---------- + metric : str + The distance metric used. + + transformer : object or None + The feature transformer if provided. + + random_state_ : np.random.RandomState instance + The random number generator. + + Examples + -------- + .. code-block:: python + + from libact.query_strategies import CoreSet + + # Basic usage with Euclidean distance + qs = CoreSet(dataset) + + # With cosine distance + qs = CoreSet(dataset, metric='cosine') + + # With a feature transformer + qs = CoreSet(dataset, transformer=my_encoder) + + References + ---------- + .. [1] Sener, Ozan, and Silvio Savarese. "Active learning for convolutional + neural networks: A core-set approach." ICLR 2018. 
+ """ + + def __init__(self, dataset, **kwargs): + super(CoreSet, self).__init__(dataset, **kwargs) + + self.metric = kwargs.pop('metric', 'euclidean') + + self.transformer = kwargs.pop('transformer', None) + if self.transformer is not None and not hasattr(self.transformer, 'transform'): + raise TypeError("transformer must have a 'transform' method") + + random_state = kwargs.pop('random_state', None) + self.random_state_ = seed_random_state(random_state) + + @inherit_docstring_from(QueryStrategy) + def make_query(self): + dataset = self.dataset + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() + X_pool = np.asarray(X_pool) + + if len(unlabeled_entry_ids) == 0: + raise ValueError("No unlabeled samples available") + + # Get labeled data + labeled_entries = dataset.get_labeled_entries() + X_labeled = np.asarray(labeled_entries[0]) + + # Fallback to random if no labeled data + if len(X_labeled) == 0: + idx = self.random_state_.randint(0, len(unlabeled_entry_ids)) + return unlabeled_entry_ids[idx] + + # Transform features if transformer is provided + if self.transformer is not None: + X_pool_t = np.asarray(self.transformer.transform(X_pool)) + X_labeled_t = np.asarray(self.transformer.transform(X_labeled)) + else: + X_pool_t = X_pool + X_labeled_t = X_labeled + + # Compute pairwise distances: (n_unlabeled, n_labeled) + dist_matrix = cdist(X_pool_t, X_labeled_t, metric=self.metric) + + # For each unlabeled point, find minimum distance to any labeled point + min_distances = np.min(dist_matrix, axis=1) + + # Select the unlabeled point with maximum min-distance (farthest) + max_dist = np.max(min_distances) + candidates = np.where(np.isclose(min_distances, max_dist))[0] + selected_idx = self.random_state_.choice(candidates) + + return unlabeled_entry_ids[selected_idx] + + def _get_scores(self): + """Return min-distances to labeled set for all unlabeled samples. 
+ + Returns + ------- + scores : list of (entry_id, score) tuples + Each score is the minimum distance from that unlabeled point + to any labeled point. Higher score means more informative. + """ + dataset = self.dataset + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() + X_pool = np.asarray(X_pool) + + if len(unlabeled_entry_ids) == 0: + return [] + + labeled_entries = dataset.get_labeled_entries() + X_labeled = np.asarray(labeled_entries[0]) + + if len(X_labeled) == 0: + return list(zip(unlabeled_entry_ids, + [float('inf')] * len(unlabeled_entry_ids))) + + if self.transformer is not None: + X_pool_t = np.asarray(self.transformer.transform(X_pool)) + X_labeled_t = np.asarray(self.transformer.transform(X_labeled)) + else: + X_pool_t = X_pool + X_labeled_t = X_labeled + + dist_matrix = cdist(X_pool_t, X_labeled_t, metric=self.metric) + min_distances = np.min(dist_matrix, axis=1) + + return list(zip(unlabeled_entry_ids, min_distances)) diff --git a/libact/query_strategies/epsilon_uncertainty_sampling.py b/libact/query_strategies/epsilon_uncertainty_sampling.py new file mode 100644 index 00000000..5527bcb2 --- /dev/null +++ b/libact/query_strategies/epsilon_uncertainty_sampling.py @@ -0,0 +1,216 @@ +"""Epsilon Uncertainty Sampling (ε-US) + +This module implements epsilon-greedy uncertainty sampling, which balances +exploration (random sampling) with exploitation (uncertainty sampling). 
+""" +import numpy as np + +from libact.base.interfaces import QueryStrategy, ContinuousModel, \ + ProbabilisticModel +from libact.utils import inherit_docstring_from, seed_random_state, zip + + +class EpsilonUncertaintySampling(QueryStrategy): + r"""Epsilon Uncertainty Sampling (ε-US) + + This strategy implements epsilon-greedy active learning, balancing + exploration and exploitation: + + - With probability ε: select a random unlabeled sample (exploration) + - With probability 1-ε: select by uncertainty sampling (exploitation) + + This simple approach provides a tunable exploration rate, which can be + useful when the model's uncertainty estimates are unreliable early in + training. + + Parameters + ---------- + dataset : Dataset object + The dataset to query from. + + model : :py:class:`libact.base.interfaces.ProbabilisticModel` or \ + :py:class:`libact.base.interfaces.ContinuousModel` + The base model for uncertainty estimation. + + epsilon : float, optional (default=0.1) + Probability of random exploration. Must be in [0, 1]. + - ``epsilon=0``: pure uncertainty sampling (no exploration) + - ``epsilon=1``: pure random sampling (no exploitation) + + method : str, optional (default='lc') + Uncertainty measure to use when exploiting: + + - ``'lc'``: least confident (1 - max probability) + - ``'sm'``: smallest margin (difference between top two probabilities) + - ``'entropy'``: predictive entropy + + ``'entropy'`` requires a ``ProbabilisticModel``. + + random_state : {int, np.random.RandomState instance, None}, optional + Random state for reproducibility. + + Attributes + ---------- + model : ProbabilisticModel or ContinuousModel + The uncertainty model. + + epsilon : float + The exploration probability. + + method : str + The uncertainty method. + + Examples + -------- + .. 
code-block:: python + + from libact.query_strategies import EpsilonUncertaintySampling + from libact.models import LogisticRegression + + # 10% exploration, 90% uncertainty sampling + qs = EpsilonUncertaintySampling( + dataset, + model=LogisticRegression(), + epsilon=0.1 + ) + + # Higher exploration early in training + qs = EpsilonUncertaintySampling( + dataset, + model=LogisticRegression(), + epsilon=0.3, + method='entropy' + ) + + Notes + ----- + When using with ALBL (Active Learning by Learning), note that ALBL's + ``uniform_sampler=True`` parameter already adds random sampling as one + of the bandit arms. In that context, ε-US may be redundant since ALBL + learns the optimal exploration/exploitation balance adaptively. + + ε-US is most useful as: + - A standalone strategy outside ALBL + - A simple baseline for comparison + - When you want a fixed (non-adaptive) exploration rate + + See Also + -------- + UncertaintySampling : Pure uncertainty sampling without exploration. + RandomSampling : Pure random sampling. 
+ """ + + def __init__(self, *args, **kwargs): + super(EpsilonUncertaintySampling, self).__init__(*args, **kwargs) + + self.model = kwargs.pop('model', None) + if self.model is None: + raise TypeError( + "__init__() missing required keyword-only argument: 'model'" + ) + if not isinstance(self.model, ContinuousModel) and \ + not isinstance(self.model, ProbabilisticModel): + raise TypeError( + "model has to be a ContinuousModel or ProbabilisticModel" + ) + + self.epsilon = kwargs.pop('epsilon', 0.1) + if not 0 <= self.epsilon <= 1: + raise ValueError("epsilon must be in [0, 1]") + + self.method = kwargs.pop('method', 'lc') + if self.method not in ['lc', 'sm', 'entropy']: + raise TypeError( + "supported methods are ['lc', 'sm', 'entropy'], " + "the given one is: " + self.method + ) + if self.method == 'entropy' and \ + not isinstance(self.model, ProbabilisticModel): + raise TypeError( + "method 'entropy' requires model to be a ProbabilisticModel" + ) + + random_state = kwargs.pop('random_state', None) + self.random_state_ = seed_random_state(random_state) + + self.model.train(self.dataset) + + def _get_uncertainty_scores(self, X_pool): + """Compute uncertainty scores for unlabeled samples. + + Parameters + ---------- + X_pool : array-like, shape (n_samples, n_features) + + Returns + ------- + scores : ndarray, shape (n_samples,) + Uncertainty scores (higher = more uncertain). 
+ """ + if isinstance(self.model, ProbabilisticModel): + dvalue = np.asarray(self.model.predict_proba(X_pool)) + else: + # ContinuousModel + dvalue = np.asarray(self.model.predict_real(X_pool)) + + if self.method == 'lc': + return 1.0 - np.max(dvalue, axis=1) + elif self.method == 'sm': + if dvalue.shape[1] == 2: + top2 = np.sort(dvalue, axis=1)[:, ::-1][:, :2] + else: + top2 = -(np.partition(-dvalue, 1, axis=1)[:, :2]) + return 1.0 - np.abs(top2[:, 0] - top2[:, 1]) + else: # entropy + dvalue = np.clip(dvalue, 1e-10, 1.0) + return np.sum(-dvalue * np.log(dvalue), axis=1) + + def _get_scores(self): + """Return uncertainty scores for all unlabeled samples. + + Returns + ------- + scores : list of (entry_id, score) tuples + """ + dataset = self.dataset + self.model.train(dataset) + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() + X_pool = np.asarray(X_pool) + + if len(unlabeled_entry_ids) == 0: + return [] + + scores = self._get_uncertainty_scores(X_pool) + return list(zip(unlabeled_entry_ids, scores)) + + @inherit_docstring_from(QueryStrategy) + def make_query(self, return_score=False): + dataset = self.dataset + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() + + if len(unlabeled_entry_ids) == 0: + raise ValueError("No unlabeled samples available") + + # Epsilon-greedy: explore with probability epsilon + if self.random_state_.random() < self.epsilon: + # Exploration: random selection + ask_id = self.random_state_.choice(unlabeled_entry_ids) + else: + # Exploitation: uncertainty sampling + self.model.train(dataset) + X_pool = np.asarray(X_pool) + scores = self._get_uncertainty_scores(X_pool) + + max_score = np.max(scores) + candidates = np.where(np.isclose(scores, max_score))[0] + selected_idx = self.random_state_.choice(candidates) + ask_id = unlabeled_entry_ids[selected_idx] + + if return_score: + return ask_id, self._get_scores() + else: + return ask_id + + @inherit_docstring_from(QueryStrategy) + def update(self, entry_id, label): + 
self.model.train(self.dataset) diff --git a/libact/query_strategies/information_density.py b/libact/query_strategies/information_density.py new file mode 100644 index 00000000..ba8c8e02 --- /dev/null +++ b/libact/query_strategies/information_density.py @@ -0,0 +1,237 @@ +"""Information Density Query Strategy + +This module implements the Information Density approach for active learning, +which weights uncertainty scores by the average similarity to other unlabeled +instances, preferring samples that are both uncertain and representative. +""" +import numpy as np +from scipy.spatial.distance import cdist + +from libact.base.interfaces import QueryStrategy, ProbabilisticModel, \ + ContinuousModel +from libact.utils import inherit_docstring_from, seed_random_state, zip + + +class InformationDensity(QueryStrategy): + r"""Information Density Query Strategy + + This strategy combines model uncertainty with instance density to avoid + querying outliers. Each unlabeled sample is scored by its uncertainty + weighted by its average similarity to all other unlabeled samples: + + .. math:: + + \text{ID}(x) = \text{uncertainty}(x) \times + \left( \frac{1}{|U|} \sum_{u \in U} \text{sim}(x, u) \right)^\beta + + Parameters + ---------- + dataset : Dataset object + The dataset to query from. + + model : :py:class:`libact.base.interfaces.ProbabilisticModel` or \ + :py:class:`libact.base.interfaces.ContinuousModel` + The base model for uncertainty estimation. + + method : str, optional (default='entropy') + Uncertainty measure to use: + + - ``'lc'``: least confident (1 - max probability) + - ``'sm'``: smallest margin (difference between top two probabilities) + - ``'entropy'``: predictive entropy + + ``'entropy'`` requires a ``ProbabilisticModel``. + + metric : str, optional (default='euclidean') + Distance metric for density calculation, passed to + ``scipy.spatial.distance.cdist``. + + beta : float, optional (default=1.0) + Exponent controlling density influence. 
+ ``beta=0`` gives pure uncertainty; larger values increase + the density weight. + + random_state : {int, np.random.RandomState instance, None}, optional + Random state for tie-breaking reproducibility. + + Attributes + ---------- + model : ProbabilisticModel or ContinuousModel + The uncertainty model. + + method : str + The uncertainty method. + + metric : str + The distance metric for density. + + beta : float + The density exponent. + + Examples + -------- + .. code-block:: python + + from libact.query_strategies import InformationDensity + + # Basic usage with entropy uncertainty + qs = InformationDensity(dataset, model=my_model) + + # With least-confident uncertainty and cosine similarity + qs = InformationDensity(dataset, model=my_model, method='lc', + metric='cosine') + + # Strong density preference + qs = InformationDensity(dataset, model=my_model, beta=2.0) + + References + ---------- + .. [1] Settles, Burr, and Mark Craven. "An analysis of active learning + strategies for sequence labeling tasks." EMNLP 2008. 
+ https://aclanthology.org/D08-1112.pdf + """ + + def __init__(self, *args, **kwargs): + super(InformationDensity, self).__init__(*args, **kwargs) + + self.model = kwargs.pop('model', None) + if self.model is None: + raise TypeError( + "__init__() missing required keyword-only argument: 'model'" + ) + if not isinstance(self.model, ContinuousModel) and \ + not isinstance(self.model, ProbabilisticModel): + raise TypeError( + "model has to be a ContinuousModel or ProbabilisticModel" + ) + + self.method = kwargs.pop('method', 'entropy') + if self.method not in ['lc', 'sm', 'entropy']: + raise TypeError( + "supported methods are ['lc', 'sm', 'entropy'], " + "the given one is: " + self.method + ) + if self.method == 'entropy' and \ + not isinstance(self.model, ProbabilisticModel): + raise TypeError( + "method 'entropy' requires model to be a ProbabilisticModel" + ) + + self.metric = kwargs.pop('metric', 'euclidean') + self.beta = kwargs.pop('beta', 1.0) + + random_state = kwargs.pop('random_state', None) + self.random_state_ = seed_random_state(random_state) + + self.model.train(self.dataset) + + def _uncertainty_scores(self, X_pool): + """Compute uncertainty scores for unlabeled samples. + + Parameters + ---------- + X_pool : array-like, shape (n_samples, n_features) + + Returns + ------- + scores : ndarray, shape (n_samples,) + Uncertainty scores (higher = more uncertain). 
+ """ + if isinstance(self.model, ProbabilisticModel): + dvalue = np.asarray(self.model.predict_proba(X_pool)) + elif isinstance(self.model, ContinuousModel): + dvalue = np.asarray(self.model.predict_real(X_pool)) + + if self.method == 'lc': + return 1.0 - np.max(dvalue, axis=1) + elif self.method == 'sm': + # Get top 2 values via partition + if dvalue.shape[1] == 2: + top2 = np.sort(dvalue, axis=1)[:, ::-1][:, :2] + else: + top2 = -(np.partition(-dvalue, 1, axis=1)[:, :2]) + return 1.0 - np.abs(top2[:, 0] - top2[:, 1]) + elif self.method == 'entropy': + dvalue = np.clip(dvalue, 1e-10, 1.0) + return np.sum(-dvalue * np.log(dvalue), axis=1) + + def _density_scores(self, X_pool): + """Compute density scores for unlabeled samples. + + Density of each sample is the average similarity to all other + unlabeled samples, where similarity = 1 / (1 + distance). + + Parameters + ---------- + X_pool : array-like, shape (n_samples, n_features) + + Returns + ------- + scores : ndarray, shape (n_samples,) + Density scores (higher = more representative). + """ + n = len(X_pool) + if n <= 1: + return np.ones(n) + + dist_matrix = cdist(X_pool, X_pool, metric=self.metric) + sim_matrix = 1.0 / (1.0 + dist_matrix) + + # Exclude self-similarity (diagonal) from average + np.fill_diagonal(sim_matrix, 0.0) + density = np.sum(sim_matrix, axis=1) / (n - 1) + + return density + + def _get_scores(self): + """Return information density scores for all unlabeled samples. + + Returns + ------- + scores : list of (entry_id, score) tuples + Each score is uncertainty × density^beta. Higher = more informative. + """ + dataset = self.dataset + self.model.train(dataset) + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() + X_pool = np.asarray(X_pool) + + if len(unlabeled_entry_ids) == 0: + return [] + + uncertainty = self._uncertainty_scores(X_pool) + # Ensure non-negative uncertainty (ContinuousModel predict_real can + # produce values > 1.0, causing 1-max or 1-margin to go negative). 
+ # The Settles formulation requires non-negative uncertainty for the + # multiplicative combination with density to work correctly. + uncertainty = np.maximum(uncertainty, 0.0) + density = self._density_scores(X_pool) + + scores = uncertainty * (density ** self.beta) + + return list(zip(unlabeled_entry_ids, scores)) + + @inherit_docstring_from(QueryStrategy) + def make_query(self, return_score=False): + dataset = self.dataset + unlabeled_entry_ids, _ = dataset.get_unlabeled_entries() + + if len(unlabeled_entry_ids) == 0: + raise ValueError("No unlabeled samples available") + + scores = self._get_scores() + entry_ids, score_values = zip(*scores) + score_values = np.asarray(list(score_values)) + + max_score = np.max(score_values) + candidates = np.where(np.isclose(score_values, max_score))[0] + selected_idx = self.random_state_.choice(candidates) + + if return_score: + return entry_ids[selected_idx], scores + else: + return entry_ids[selected_idx] + + @inherit_docstring_from(QueryStrategy) + def update(self, entry_id, label): + self.model.train(self.dataset) diff --git a/libact/query_strategies/meson.build b/libact/query_strategies/meson.build new file mode 100644 index 00000000..7d72fe33 --- /dev/null +++ b/libact/query_strategies/meson.build @@ -0,0 +1,130 @@ +py_src = [ + '__init__.py', + 'active_learning_by_learning.py', + 'bald.py', + 'coreset.py', + 'density_weighted_meta.py', + 'density_weighted_uncertainty_sampling.py', + 'epsilon_uncertainty_sampling.py', + 'hintsvm.py', + 'information_density.py', + 'query_by_committee.py', + 'quire.py', + 'random_sampling.py', + 'uncertainty_sampling.py', + 'variance_reduction.py', +] + +py.install_sources( + py_src, + subdir: 'libact/query_strategies', +) + +incdir_numpy = run_command( + py, + [ + '-c', 'import os; os.chdir(".."); import numpy; print(numpy.get_include())', + ], + check: true, +).stdout().strip() + +# BLAS/LAPACK are only needed if variance_reduction or hintsvm features are enabled +# By default, both 
# Both optional C-extension features are enabled by default (see meson.options).
want_variance_reduction = get_option('variance_reduction')
want_hintsvm = get_option('hintsvm')
need_blas = want_variance_reduction or want_hintsvm

# Show what we're attempting to build
message('Optional features requested:')
message(' - variance_reduction: ' + want_variance_reduction.to_string())
message(' - hintsvm: ' + want_hintsvm.to_string())

# Start with not-found placeholders; filled in below only when needed.
blas_dep = dependency('', required: false)
lapack_dep = dependency('', required: false)

if need_blas
  blas_candidates = ['mkl', 'Accelerate', 'openblas', 'blis', 'lapack']
  blas_opt = get_option('blas')
  lapack_opt = get_option('lapack')

  # Honor an explicitly requested implementation first.
  blas_dep = (blas_opt != 'auto') ? dependency(blas_opt, required: false) : dependency('', required: false)
  lapack_dep = (lapack_opt != 'auto') ? dependency(lapack_opt, required: false) : dependency('', required: false)

  # fall-back loop: probe common implementations until one is found
  foreach cand : blas_candidates
    if not blas_dep.found()
      blas_dep = dependency(cand, required: false)
    endif
    if not lapack_dep.found()
      lapack_dep = dependency(cand, required: false)
    endif
  endforeach

  if not blas_dep.found() or not lapack_dep.found()
    # If BLAS/LAPACK not found, automatically disable optional features and continue
    warning(
      'BLAS/LAPACK libraries not found. Disabling variance_reduction and hintsvm features. ' +
      'To enable these features, install BLAS/LAPACK libraries (e.g., libopenblas-dev and liblapacke-dev on Debian/Ubuntu, openblas on macOS via Homebrew).'
    )
    want_variance_reduction = false
    want_hintsvm = false
  endif
endif

if want_variance_reduction
  message('Building VarianceReduction ...')
  py.extension_module(
    '_variance_reduction',
    ['src/variance_reduction/variance_reduction.c'],
    c_args: ['-std=c11'],
    dependencies: [blas_dep, lapack_dep],
    include_directories: [
      include_directories(incdir_numpy),
    ],
    install: true,
    subdir: 'libact/query_strategies',
  )
else
  message('Skipping VarianceReduction (BLAS/LAPACK not available or feature disabled)')
endif

if want_hintsvm
  message('Building HintSVM ...')
  srcs = [
    '_hintsvm.pyx',
    'src/hintsvm/libsvm_helper.c',
    'src/hintsvm/svm.cpp',
  ]
  py.extension_module(
    '_hintsvm',
    srcs,
    dependencies: [blas_dep, lapack_dep],
    include_directories: [
      include_directories('src/hintsvm'),
      include_directories(incdir_numpy),
    ],
    install: true,
    subdir: 'libact/query_strategies',
  )
else
  message('Skipping HintSVM (BLAS/LAPACK not available or feature disabled)')
endif

subdir('multilabel')
subdir('multiclass')
subdir('tests')

# Summary of what was built
message('=== Build Summary ===')
if want_variance_reduction
  message(' ✓ variance_reduction: BUILT')
else
  message(' ✗ variance_reduction: SKIPPED')
endif
if want_hintsvm
  message(' ✓ hintsvm: BUILT')
else
  message(' ✗ hintsvm: SKIPPED')
endif
message('====================')
diff --git a/libact/query_strategies/multiclass/meson.build b/libact/query_strategies/multiclass/meson.build
new file mode 100644
index 00000000..447db98c
--- /dev/null
+++ b/libact/query_strategies/multiclass/meson.build
@@ -0,0 +1,14 @@
py_src = [
  '__init__.py',
  'active_learning_with_cost_embedding.py',
  'expected_error_reduction.py',
  'hierarchical_sampling.py',
  'mdsp.py',
]

py.install_sources(
  py_src,
  subdir: 'libact/query_strategies/multiclass',
)

subdir('tests')
diff --git a/libact/query_strategies/multiclass/tests/meson.build
b/libact/query_strategies/multiclass/tests/meson.build new file mode 100644 index 00000000..b08f8615 --- /dev/null +++ b/libact/query_strategies/multiclass/tests/meson.build @@ -0,0 +1,10 @@ +py_src = [ + '__init__.py', + 'test_hierarchical_sampling.py', + 'test_iris.py', +] + +py.install_sources( + py_src, + subdir: 'libact/query_strategies/multiclass/tests', +) diff --git a/libact/query_strategies/multilabel/meson.build b/libact/query_strategies/multilabel/meson.build new file mode 100644 index 00000000..a9ac26f0 --- /dev/null +++ b/libact/query_strategies/multilabel/meson.build @@ -0,0 +1,15 @@ +py_src = [ + '__init__.py', + 'adaptive_active_learning.py', + 'binary_minimization.py', + 'cost_sensitive_reference_pair_encoding.py', + 'maximum_margin_reduction.py', + 'multilabel_with_auxiliary_learner.py', +] + +py.install_sources( + py_src, + subdir: 'libact/query_strategies/multilabel', +) + +subdir('tests') diff --git a/libact/query_strategies/multilabel/tests/meson.build b/libact/query_strategies/multilabel/tests/meson.build new file mode 100644 index 00000000..cb25f846 --- /dev/null +++ b/libact/query_strategies/multilabel/tests/meson.build @@ -0,0 +1,9 @@ +py_src = [ + '__init__.py', + 'test_multilabel_realdata.py', +] + +py.install_sources( + py_src, + subdir: 'libact/query_strategies/multilabel/tests', +) diff --git a/libact/query_strategies/query_by_committee.py b/libact/query_strategies/query_by_committee.py index 73cc6cdf..ea8dadab 100644 --- a/libact/query_strategies/query_by_committee.py +++ b/libact/query_strategies/query_by_committee.py @@ -205,5 +205,7 @@ def make_query(self): avg_kl = self._kl_divergence_disagreement(proba) ask_idx = self.random_state_.choice( np.where(np.isclose(avg_kl, np.max(avg_kl)))[0]) + else: + raise ValueError("disagreement must be 'vote' or 'kl_divergence'") return unlabeled_entry_ids[ask_idx] diff --git a/libact/query_strategies/tests/meson.build b/libact/query_strategies/tests/meson.build new file mode 100644 index 
00000000..5c4378bd --- /dev/null +++ b/libact/query_strategies/tests/meson.build @@ -0,0 +1,19 @@ +py_src = [ + '__init__.py', + 'test_bald.py', + 'test_coreset.py', + 'test_density_weighted_meta.py', + 'test_epsilon_uncertainty_sampling.py', + 'test_hintsvm.py', + 'test_information_density.py', + 'test_quire.py', + 'test_realdata.py', + 'test_uncertainty_sampling.py', + 'test_variance_reduction.py', + 'utils.py', +] + +py.install_sources( + py_src, + subdir: 'libact/query_strategies/tests', +) diff --git a/libact/query_strategies/tests/test_bald.py b/libact/query_strategies/tests/test_bald.py new file mode 100644 index 00000000..28b539c5 --- /dev/null +++ b/libact/query_strategies/tests/test_bald.py @@ -0,0 +1,243 @@ +"""Test BALD Query Strategy""" +import unittest +from unittest.mock import Mock, MagicMock + +import numpy as np + +from libact.base.dataset import Dataset +from libact.models import LogisticRegression, SklearnProbaAdapter +from libact.query_strategies import BALD +from libact.labelers import IdealLabeler + + +def init_dataset(X, y, n_labeled=6): + """Initialize dataset with some labeled and some unlabeled samples.""" + labels = list(y[:n_labeled]) + [None] * (len(y) - n_labeled) + return Dataset(X, labels) + + +class BALDTestCase(unittest.TestCase): + + def setUp(self): + np.random.seed(1126) + self.X = np.array([ + [-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], + [0, 1], [0, -2], [1.5, 1.5], [-2, -2] + ]) + self.y = np.array([-1, -1, -1, 1, 1, 1, -1, -1, 1, 1]) + self.fully_labeled_ds = Dataset(self.X, self.y) + self.labeler = IdealLabeler(self.fully_labeled_ds) + + def test_with_provided_models(self): + """Should work with pre-provided list of models.""" + trn_ds = init_dataset(self.X, self.y) + + models = [ + LogisticRegression(solver='liblinear', C=0.1), + LogisticRegression(solver='liblinear', C=1.0), + LogisticRegression(solver='liblinear', C=10.0), + ] + + qs = BALD(trn_ds, models=models, random_state=42) + + unlabeled_ids = 
set(trn_ds.get_unlabeled_entries()[0]) + ask_id = qs.make_query() + + self.assertIn(ask_id, unlabeled_ids) + + def test_with_base_model_bagging(self): + """Should create ensemble via bagging from base model.""" + from sklearn.linear_model import LogisticRegression as SklearnLR + + trn_ds = init_dataset(self.X, self.y) + + base_model = SklearnProbaAdapter( + SklearnLR(solver='liblinear', random_state=42) + ) + + qs = BALD( + trn_ds, + base_model=base_model, + n_models=5, + random_state=42 + ) + + self.assertEqual(len(qs.models), 5) + + unlabeled_ids = set(trn_ds.get_unlabeled_entries()[0]) + ask_id = qs.make_query() + + self.assertIn(ask_id, unlabeled_ids) + + def test_bald_score_computation(self): + """Verify BALD score is computed correctly: H[mean(P)] - mean(H[P]).""" + trn_ds = init_dataset(self.X, self.y) + + models = [ + LogisticRegression(solver='liblinear', C=1.0), + LogisticRegression(solver='liblinear', C=1.0), + ] + + qs = BALD(trn_ds, models=models, random_state=42) + + # Get scores + scores = qs._get_scores() + self.assertGreater(len(scores), 0) + + # All BALD scores should be non-negative (MI is non-negative) + for entry_id, score in scores: + self.assertGreaterEqual(score, -1e-10) # Allow small numerical errors + + def test_update_retrains_ensemble(self): + """Update should retrain the ensemble with new data.""" + from sklearn.linear_model import LogisticRegression as SklearnLR + + trn_ds = init_dataset(self.X, self.y) + + base_model = SklearnProbaAdapter( + SklearnLR(solver='liblinear', random_state=42) + ) + + qs = BALD( + trn_ds, + base_model=base_model, + n_models=3, + random_state=42 + ) + + # Make a query and update + ask_id = qs.make_query() + lbl = self.labeler.label(self.X[ask_id]) + trn_ds.update(ask_id, lbl) + + # After update, models should have been retrained + # Make another query - should still work + ask_id2 = qs.make_query() + remaining_unlabeled = set(trn_ds.get_unlabeled_entries()[0]) + self.assertIn(ask_id2, remaining_unlabeled) + 
+ def test_empty_models_list(self): + """Should raise ValueError for empty models list.""" + trn_ds = init_dataset(self.X, self.y) + + with self.assertRaises(ValueError): + BALD(trn_ds, models=[]) + + def test_non_probabilistic_model(self): + """Should raise TypeError for non-ProbabilisticModel.""" + from libact.models import SVM + + trn_ds = init_dataset(self.X, self.y) + + with self.assertRaises(TypeError): + BALD(trn_ds, models=[SVM()]) + + def test_missing_required_args(self): + """Should raise TypeError if neither models nor base_model provided.""" + trn_ds = init_dataset(self.X, self.y) + + with self.assertRaises(TypeError): + BALD(trn_ds) + + def test_base_model_without_clone(self): + """Should raise TypeError if base_model lacks clone method.""" + trn_ds = init_dataset(self.X, self.y) + + mock_model = Mock() + mock_model.predict_proba = Mock(return_value=np.random.rand(4, 2)) + # Remove clone method + del mock_model.clone + + with self.assertRaises(TypeError): + BALD(trn_ds, base_model=mock_model) + + def test_reproducibility(self): + """Same random_state should produce same queries.""" + from sklearn.linear_model import LogisticRegression as SklearnLR + + base_model = SklearnProbaAdapter( + SklearnLR(solver='liblinear', random_state=42) + ) + + trn_ds1 = init_dataset(self.X, self.y) + qs1 = BALD( + trn_ds1, + base_model=base_model, + n_models=3, + random_state=42 + ) + + trn_ds2 = init_dataset(self.X, self.y) + qs2 = BALD( + trn_ds2, + base_model=base_model, + n_models=3, + random_state=42 + ) + + # Run several queries and compare + queries1, queries2 = [], [] + for _ in range(3): + ask_id1 = qs1.make_query() + ask_id2 = qs2.make_query() + queries1.append(ask_id1) + queries2.append(ask_id2) + + lbl1 = self.labeler.label(self.X[ask_id1]) + lbl2 = self.labeler.label(self.X[ask_id2]) + trn_ds1.update(ask_id1, lbl1) + trn_ds2.update(ask_id2, lbl2) + + self.assertEqual(queries1, queries2) + + def test_multiple_queries(self): + """Should handle multiple queries 
correctly.""" + trn_ds = init_dataset(self.X, self.y) + + models = [ + LogisticRegression(solver='liblinear', C=c) + for c in [0.1, 1.0, 10.0] + ] + + qs = BALD(trn_ds, models=models, random_state=42) + + # Make multiple queries + queries = [] + for _ in range(4): + ask_id = qs.make_query() + queries.append(ask_id) + lbl = self.labeler.label(self.X[ask_id]) + trn_ds.update(ask_id, lbl) + + # All queries should be unique (no repetition) + self.assertEqual(len(queries), len(set(queries))) + + def test_entropy_calculation(self): + """Test the entropy helper function.""" + trn_ds = init_dataset(self.X, self.y) + models = [LogisticRegression(solver='liblinear')] + qs = BALD(trn_ds, models=models, random_state=42) + + # Uniform distribution should have maximum entropy + uniform = np.array([[0.5, 0.5]]) + entropy_uniform = qs._entropy(uniform) + + # Certain distribution should have zero entropy + certain = np.array([[1.0, 0.0]]) + entropy_certain = qs._entropy(certain) + + self.assertGreater(entropy_uniform[0], entropy_certain[0]) + self.assertAlmostEqual(entropy_certain[0], 0.0, places=5) + + def test_empty_unlabeled_pool(self): + """Should raise error when no unlabeled samples available.""" + trn_ds = Dataset(self.X, self.y) # All labeled + models = [LogisticRegression(solver='liblinear')] + qs = BALD(trn_ds, models=models, random_state=42) + + with self.assertRaises(ValueError): + qs.make_query() + + +if __name__ == '__main__': + unittest.main() diff --git a/libact/query_strategies/tests/test_coreset.py b/libact/query_strategies/tests/test_coreset.py new file mode 100644 index 00000000..6dc1ad22 --- /dev/null +++ b/libact/query_strategies/tests/test_coreset.py @@ -0,0 +1,174 @@ +"""Test Core-Set (k-Center Greedy) Query Strategy""" +import unittest +from unittest.mock import Mock + +import numpy as np + +from libact.base.dataset import Dataset +from libact.query_strategies import CoreSet +from libact.labelers import IdealLabeler + + +def init_dataset(X, y, n_labeled=4): 
+ """Initialize dataset with some labeled and some unlabeled samples.""" + labels = list(y[:n_labeled]) + [None] * (len(y) - n_labeled) + return Dataset(X, labels) + + +class CoreSetTestCase(unittest.TestCase): + + def setUp(self): + np.random.seed(1126) + self.X = np.array([ + # Labeled points (cluster near origin) + [0.0, 0.0], [0.1, 0.1], [0.2, 0.0], [0.0, 0.2], + # Unlabeled points at varying distances + [1.0, 0.0], # distance ~1.0 from labeled + [0.5, 0.5], # distance ~0.5 from labeled + [3.0, 3.0], # distance ~4.24 from labeled (farthest) + [2.0, 0.0], # distance ~1.8 from labeled + [0.3, 0.1], # distance ~0.1 from labeled (closest) + [5.0, 5.0], # distance ~7.07 from labeled (very far) + ]) + self.y = np.array([0, 0, 1, 1, 0, 1, 0, 1, 0, 1]) + self.fully_labeled_ds = Dataset(self.X, self.y) + self.labeler = IdealLabeler(self.fully_labeled_ds) + + def test_returns_valid_entry_id(self): + """Query should return a valid unlabeled entry ID.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + qs = CoreSet(trn_ds, random_state=42) + + unlabeled_ids = set(trn_ds.get_unlabeled_entries()[0]) + ask_id = qs.make_query() + + self.assertIn(ask_id, unlabeled_ids) + + def test_selects_farthest_point(self): + """Should select the unlabeled point farthest from all labeled points.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + qs = CoreSet(trn_ds, random_state=42) + + ask_id = qs.make_query() + + # Point at [5.0, 5.0] (index 9) is farthest from labeled cluster + self.assertEqual(ask_id, 9) + + def test_with_transformer(self): + """Should use transformer for distance computation if provided.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + + # Create mock transformer that doubles features + mock_transformer = Mock() + mock_transformer.transform.side_effect = lambda X: np.asarray(X) * 2 + + qs = CoreSet( + trn_ds, + transformer=mock_transformer, + random_state=42 + ) + ask_id = qs.make_query() + + # Verify transformer was called (twice: once for pool, 
once for labeled) + self.assertEqual(mock_transformer.transform.call_count, 2) + # Should still select the farthest point + unlabeled_ids = set(trn_ds.get_unlabeled_entries()[0]) + self.assertIn(ask_id, unlabeled_ids) + + def test_invalid_transformer(self): + """Should raise TypeError if transformer lacks transform method.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + + invalid_transformer = object() # No transform method + + with self.assertRaises(TypeError): + CoreSet(trn_ds, transformer=invalid_transformer) + + def test_cosine_metric(self): + """Should work with cosine distance metric.""" + # Use data with no zero vectors (cosine is undefined for zero vectors) + X_cos = np.array([ + [1.0, 0.1], [0.9, 0.2], [1.0, 0.3], [0.8, 0.1], # labeled + [0.1, 1.0], [0.5, 0.5], [0.2, 0.9], [1.0, 1.0], # unlabeled + [0.3, 0.1], [0.1, 0.5], + ]) + y_cos = np.array([0, 0, 1, 1, 0, 1, 0, 1, 0, 1]) + trn_ds = init_dataset(X_cos, y_cos, n_labeled=4) + qs = CoreSet(trn_ds, metric='cosine', random_state=42) + + unlabeled_ids = set(trn_ds.get_unlabeled_entries()[0]) + ask_id = qs.make_query() + + self.assertIn(ask_id, unlabeled_ids) + + def test_reproducibility(self): + """Same random_state should produce same queries.""" + trn_ds1 = init_dataset(self.X, self.y, n_labeled=4) + trn_ds2 = init_dataset(self.X, self.y, n_labeled=4) + + qs1 = CoreSet(trn_ds1, random_state=42) + qs2 = CoreSet(trn_ds2, random_state=42) + + ask_id1 = qs1.make_query() + ask_id2 = qs2.make_query() + + self.assertEqual(ask_id1, ask_id2) + + def test_multiple_queries(self): + """Should handle multiple queries correctly, covering the space.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + qs = CoreSet(trn_ds, random_state=42) + + queries = [] + for _ in range(6): + ask_id = qs.make_query() + queries.append(ask_id) + lbl = self.labeler.label(self.X[ask_id]) + trn_ds.update(ask_id, lbl) + + # All queries should be unique + self.assertEqual(len(queries), len(set(queries))) + + def 
test_empty_pool_error(self): + """Should raise error when no unlabeled samples available.""" + trn_ds = Dataset(self.X, self.y) # All labeled + qs = CoreSet(trn_ds, random_state=42) + + with self.assertRaises(ValueError): + qs.make_query() + + def test_no_labeled_data_fallback(self): + """Should fall back to random selection when no labeled data exists.""" + # All unlabeled + labels = [None] * len(self.y) + trn_ds = Dataset(self.X, labels) + qs = CoreSet(trn_ds, random_state=42) + + unlabeled_ids = set(trn_ds.get_unlabeled_entries()[0]) + ask_id = qs.make_query() + + self.assertIn(ask_id, unlabeled_ids) + + def test_get_scores(self): + """_get_scores should return min-distances for all unlabeled points.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + qs = CoreSet(trn_ds, random_state=42) + + scores = qs._get_scores() + + # Should have one score per unlabeled point + unlabeled_ids = trn_ds.get_unlabeled_entries()[0] + self.assertEqual(len(scores), len(unlabeled_ids)) + + # Scores should be non-negative + for entry_id, score in scores: + self.assertGreaterEqual(score, 0.0) + + # The farthest point should have the highest score + scores_dict = dict(scores) + max_id = max(scores_dict, key=scores_dict.get) + self.assertEqual(max_id, 9) # [5.0, 5.0] is farthest + + +if __name__ == '__main__': + unittest.main() diff --git a/libact/query_strategies/tests/test_epsilon_uncertainty_sampling.py b/libact/query_strategies/tests/test_epsilon_uncertainty_sampling.py new file mode 100644 index 00000000..5ba6302c --- /dev/null +++ b/libact/query_strategies/tests/test_epsilon_uncertainty_sampling.py @@ -0,0 +1,306 @@ +"""Test Epsilon Uncertainty Sampling (ε-US) Query Strategy""" +import unittest +from unittest.mock import Mock + +import numpy as np + +from libact.base.dataset import Dataset +from libact.base.interfaces import ProbabilisticModel +from libact.query_strategies import EpsilonUncertaintySampling +from libact.labelers import IdealLabeler + + +def 
make_mock_model(proba_values): + """Create a mock ProbabilisticModel returning fixed probabilities.""" + model = Mock(spec=ProbabilisticModel) + model.predict_proba = Mock(return_value=np.array(proba_values)) + model.train = Mock() + return model + + +def init_dataset(X, y, n_labeled=4): + """Initialize dataset with some labeled and some unlabeled samples.""" + labels = list(y[:n_labeled]) + [None] * (len(y) - n_labeled) + return Dataset(X, labels) + + +class EpsilonUncertaintySamplingTestCase(unittest.TestCase): + + def setUp(self): + np.random.seed(1126) + self.X = np.array([ + [0.0, 0.0], [0.1, 0.1], [0.2, 0.0], [0.0, 0.2], # labeled + [1.0, 0.0], [0.5, 0.5], [3.0, 3.0], [2.0, 0.0], # unlabeled + [0.3, 0.1], [5.0, 5.0], + ]) + self.y = np.array([0, 0, 1, 1, 0, 1, 0, 1, 0, 1]) + self.fully_labeled_ds = Dataset(self.X, self.y) + self.labeler = IdealLabeler(self.fully_labeled_ds) + + def test_returns_valid_entry_id(self): + """Query should return a valid unlabeled entry ID.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + # Mock model with uniform probabilities + mock_model = make_mock_model([[0.5, 0.5]] * 6) + + qs = EpsilonUncertaintySampling( + trn_ds, model=mock_model, epsilon=0.5, random_state=42 + ) + + unlabeled_ids = set(trn_ds.get_unlabeled_entries()[0]) + ask_id = qs.make_query() + + self.assertIn(ask_id, unlabeled_ids) + + def test_epsilon_zero_is_pure_uncertainty(self): + """With epsilon=0, should always use uncertainty sampling.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + # Model predicts one point as most uncertain (0.5, 0.5) + mock_model = make_mock_model([ + [0.9, 0.1], # idx 4 - confident + [0.5, 0.5], # idx 5 - most uncertain + [0.8, 0.2], # idx 6 + [0.7, 0.3], # idx 7 + [0.6, 0.4], # idx 8 + [0.95, 0.05], # idx 9 - very confident + ]) + + qs = EpsilonUncertaintySampling( + trn_ds, model=mock_model, epsilon=0.0, random_state=42 + ) + + # With epsilon=0, should always pick the most uncertain (idx 5) + ask_id = qs.make_query() 
+ self.assertEqual(ask_id, 5) + + def test_epsilon_one_is_pure_random(self): + """With epsilon=1, should always use random sampling.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + mock_model = make_mock_model([[0.5, 0.5]] * 6) + + # Run many queries and check distribution is roughly uniform + selections = [] + for seed in range(100): + trn_ds_copy = init_dataset(self.X, self.y, n_labeled=4) + qs = EpsilonUncertaintySampling( + trn_ds_copy, model=mock_model, epsilon=1.0, random_state=seed + ) + selections.append(qs.make_query()) + + # All unlabeled IDs should appear (with high probability) + unlabeled_ids = set(trn_ds.get_unlabeled_entries()[0]) + selected_ids = set(selections) + # At least 4 of 6 unlabeled IDs should be selected in 100 trials + self.assertGreaterEqual(len(selected_ids & unlabeled_ids), 4) + + def test_epsilon_exploration_rate(self): + """Exploration should happen approximately epsilon fraction of time.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + # Model has one clearly most uncertain point + mock_model = make_mock_model([ + [0.99, 0.01], # idx 4 - very confident + [0.5, 0.5], # idx 5 - most uncertain + [0.99, 0.01], # idx 6 + [0.99, 0.01], # idx 7 + [0.99, 0.01], # idx 8 + [0.99, 0.01], # idx 9 + ]) + + epsilon = 0.3 + n_trials = 200 + uncertain_selections = 0 + + for seed in range(n_trials): + trn_ds_copy = init_dataset(self.X, self.y, n_labeled=4) + qs = EpsilonUncertaintySampling( + trn_ds_copy, model=mock_model, epsilon=epsilon, random_state=seed + ) + ask_id = qs.make_query() + if ask_id == 5: # Most uncertain point + uncertain_selections += 1 + + # With epsilon=0.3, exploitation happens 70% of time + # Expected uncertainty selections ≈ 70% + (30% * 1/6) ≈ 75% + # Allow reasonable variance + exploitation_rate = uncertain_selections / n_trials + self.assertGreater(exploitation_rate, 0.5) # Should be mostly exploitation + self.assertLess(exploitation_rate, 0.95) # But some exploration + + def test_method_lc(self): + 
"""Least confident method should work.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + mock_model = make_mock_model([ + [0.9, 0.1], [0.5, 0.5], [0.8, 0.2], + [0.7, 0.3], [0.6, 0.4], [0.95, 0.05], + ]) + + qs = EpsilonUncertaintySampling( + trn_ds, model=mock_model, epsilon=0.0, method='lc', random_state=42 + ) + ask_id = qs.make_query() + self.assertEqual(ask_id, 5) # 0.5 is least confident + + def test_method_sm(self): + """Smallest margin method should work.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + mock_model = make_mock_model([ + [0.9, 0.1], [0.51, 0.49], [0.8, 0.2], # idx 5 has smallest margin + [0.7, 0.3], [0.6, 0.4], [0.95, 0.05], + ]) + + qs = EpsilonUncertaintySampling( + trn_ds, model=mock_model, epsilon=0.0, method='sm', random_state=42 + ) + ask_id = qs.make_query() + self.assertEqual(ask_id, 5) # 0.51-0.49=0.02 smallest margin + + def test_method_entropy(self): + """Entropy method should work with ProbabilisticModel.""" + from libact.base.interfaces import ProbabilisticModel + + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + mock_model = Mock(spec=ProbabilisticModel) + mock_model.predict_proba = Mock(return_value=np.array([ + [0.9, 0.1], [0.5, 0.5], [0.8, 0.2], # idx 5 has max entropy + [0.7, 0.3], [0.6, 0.4], [0.95, 0.05], + ])) + mock_model.train = Mock() + + qs = EpsilonUncertaintySampling( + trn_ds, model=mock_model, epsilon=0.0, method='entropy', random_state=42 + ) + ask_id = qs.make_query() + self.assertEqual(ask_id, 5) # 0.5, 0.5 has max entropy + + def test_invalid_epsilon(self): + """Should raise error for epsilon outside [0, 1].""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + mock_model = make_mock_model([[0.5, 0.5]] * 6) + + with self.assertRaises(ValueError): + EpsilonUncertaintySampling( + trn_ds, model=mock_model, epsilon=-0.1, random_state=42 + ) + + with self.assertRaises(ValueError): + EpsilonUncertaintySampling( + trn_ds, model=mock_model, epsilon=1.5, random_state=42 + ) + + def 
test_missing_model(self): + """Should raise error when model is not provided.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + + with self.assertRaises(TypeError): + EpsilonUncertaintySampling(trn_ds, epsilon=0.1, random_state=42) + + def test_invalid_model_type(self): + """Should raise error for invalid model type.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + invalid_model = object() + + with self.assertRaises(TypeError): + EpsilonUncertaintySampling( + trn_ds, model=invalid_model, epsilon=0.1, random_state=42 + ) + + def test_invalid_method(self): + """Should raise error for invalid method.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + mock_model = make_mock_model([[0.5, 0.5]] * 6) + + with self.assertRaises(TypeError): + EpsilonUncertaintySampling( + trn_ds, model=mock_model, epsilon=0.1, method='invalid', + random_state=42 + ) + + def test_entropy_requires_probabilistic_model(self): + """Entropy method should require ProbabilisticModel.""" + from libact.base.interfaces import ContinuousModel + + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + mock_model = Mock(spec=ContinuousModel) + mock_model.predict_real = Mock(return_value=np.array([[0.5, 0.5]] * 6)) + mock_model.train = Mock() + + with self.assertRaises(TypeError): + EpsilonUncertaintySampling( + trn_ds, model=mock_model, epsilon=0.1, method='entropy', + random_state=42 + ) + + def test_reproducibility(self): + """Same random_state should produce same queries.""" + trn_ds1 = init_dataset(self.X, self.y, n_labeled=4) + trn_ds2 = init_dataset(self.X, self.y, n_labeled=4) + mock_model = make_mock_model([[0.5, 0.5]] * 6) + + qs1 = EpsilonUncertaintySampling( + trn_ds1, model=mock_model, epsilon=0.5, random_state=42 + ) + qs2 = EpsilonUncertaintySampling( + trn_ds2, model=mock_model, epsilon=0.5, random_state=42 + ) + + ask_id1 = qs1.make_query() + ask_id2 = qs2.make_query() + + self.assertEqual(ask_id1, ask_id2) + + def test_empty_pool_error(self): + """Should raise error 
when no unlabeled samples available.""" + trn_ds = Dataset(self.X, self.y) # All labeled + mock_model = make_mock_model([[0.5, 0.5]] * 10) + + qs = EpsilonUncertaintySampling( + trn_ds, model=mock_model, epsilon=0.5, random_state=42 + ) + + with self.assertRaises(ValueError): + qs.make_query() + + def test_return_score(self): + """make_query with return_score=True should return scores.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + mock_model = make_mock_model([ + [0.9, 0.1], [0.5, 0.5], [0.8, 0.2], + [0.7, 0.3], [0.6, 0.4], [0.95, 0.05], + ]) + + qs = EpsilonUncertaintySampling( + trn_ds, model=mock_model, epsilon=0.0, random_state=42 + ) + ask_id, scores = qs.make_query(return_score=True) + + self.assertEqual(ask_id, 5) + self.assertEqual(len(scores), 6) + # Scores should be (entry_id, score) tuples + for entry_id, score in scores: + self.assertIsInstance(entry_id, (int, np.integer)) + self.assertIsInstance(score, (float, np.floating)) + + def test_multiple_queries(self): + """Should handle multiple queries correctly.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + mock_model = make_mock_model([[0.5, 0.5]] * 6) + + qs = EpsilonUncertaintySampling( + trn_ds, model=mock_model, epsilon=0.5, random_state=42 + ) + + queries = [] + for _ in range(4): + ask_id = qs.make_query() + queries.append(ask_id) + lbl = self.labeler.label(self.X[ask_id]) + trn_ds.update(ask_id, lbl) + + # All queries should be from originally unlabeled set + original_unlabeled = {4, 5, 6, 7, 8, 9} + for q in queries: + self.assertIn(q, original_unlabeled) + + +if __name__ == '__main__': + unittest.main() diff --git a/libact/query_strategies/tests/test_information_density.py b/libact/query_strategies/tests/test_information_density.py new file mode 100644 index 00000000..958590c7 --- /dev/null +++ b/libact/query_strategies/tests/test_information_density.py @@ -0,0 +1,390 @@ +"""Test Information Density Query Strategy""" +import unittest +from unittest.mock import Mock, patch + 
+import numpy as np + +from libact.base.dataset import Dataset +from libact.base.interfaces import ProbabilisticModel, ContinuousModel +from libact.query_strategies import InformationDensity + + +def init_dataset(X, y, n_labeled=4): + """Initialize dataset with some labeled and some unlabeled samples.""" + labels = list(y[:n_labeled]) + [None] * (len(y) - n_labeled) + return Dataset(X, labels) + + +class MockProbModel(ProbabilisticModel): + """Mock probabilistic model for testing.""" + + def __init__(self, n_classes=2): + self.n_classes = n_classes + self._trained = False + + def train(self, dataset): + self._trained = True + + def predict(self, feature): + return np.zeros(len(feature), dtype=int) + + def score(self, testing_dataset): + return 0.5 + + def predict_proba(self, feature): + n = len(feature) + # Return probabilities that vary by sample index + proba = np.zeros((n, self.n_classes)) + for i in range(n): + # First sample very uncertain, last very confident + p = 0.5 + 0.4 * (i / max(n - 1, 1)) + proba[i, 0] = p + proba[i, 1] = 1.0 - p + return proba + + +class MockContinuousModel(ContinuousModel): + """Mock continuous model for testing.""" + + def __init__(self, n_classes=2): + self.n_classes = n_classes + + def train(self, dataset): + pass + + def predict(self, feature): + return np.zeros(len(feature), dtype=int) + + def score(self, testing_dataset): + return 0.5 + + def predict_real(self, feature): + n = len(feature) + dvalue = np.zeros((n, self.n_classes)) + for i in range(n): + dvalue[i, 0] = 0.5 + 0.4 * (i / max(n - 1, 1)) + dvalue[i, 1] = -(0.5 + 0.4 * (i / max(n - 1, 1))) + return dvalue + + +class InformationDensityTestCase(unittest.TestCase): + + def setUp(self): + np.random.seed(1126) + self.X = np.array([ + # Labeled (cluster near origin) + [0.0, 0.0], [0.1, 0.1], [0.2, 0.0], [0.0, 0.2], + # Unlabeled - dense cluster + [1.0, 1.0], [1.1, 1.0], [1.0, 1.1], [1.1, 1.1], + # Unlabeled - outliers + [5.0, 5.0], [8.0, 8.0], + ]) + self.y = np.array([0, 
0, 1, 1, 0, 1, 0, 1, 0, 1]) + + def test_returns_valid_entry_id(self): + """Query should return a valid unlabeled entry ID.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + model = MockProbModel() + qs = InformationDensity(trn_ds, model=model, random_state=42) + + unlabeled_ids = set(trn_ds.get_unlabeled_entries()[0]) + ask_id = qs.make_query() + + self.assertIn(ask_id, unlabeled_ids) + + def test_prefers_dense_regions(self): + """Should prefer uncertain points in dense regions over outliers.""" + # Create a dataset where outlier is most uncertain but isolated + X = np.array([ + [0.0, 0.0], [0.1, 0.1], # labeled + [1.0, 1.0], [1.1, 1.0], [1.0, 1.1], [1.1, 1.1], # dense cluster + [10.0, 10.0], # isolated outlier + ]) + y = np.array([0, 1, 0, 1, 0, 1, 0]) + trn_ds = init_dataset(X, y, n_labeled=2) + + # Model that makes the outlier (index 6) most uncertain + model = Mock(spec=ProbabilisticModel) + model.train = Mock() + proba = np.array([ + [0.9, 0.1], # idx 2: confident + [0.85, 0.15], # idx 3: confident + [0.8, 0.2], # idx 4: somewhat confident + [0.75, 0.25], # idx 5: somewhat uncertain + [0.5, 0.5], # idx 6: maximally uncertain (outlier) + ]) + model.predict_proba = Mock(return_value=proba) + + qs = InformationDensity(trn_ds, model=model, beta=2.0, random_state=42) + ask_id = qs.make_query() + + # Should NOT pick the outlier (index 6) despite highest uncertainty + self.assertNotEqual(ask_id, 6) + # Should pick from the dense cluster (indices 2-5) + self.assertIn(ask_id, [2, 3, 4, 5]) + + def test_beta_zero_equals_uncertainty(self): + """With beta=0, density has no effect (pure uncertainty).""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + model = MockProbModel() + + qs = InformationDensity(trn_ds, model=model, beta=0.0, random_state=42) + scores = qs._get_scores() + + # With beta=0, density^0 = 1 for all, so scores = uncertainty only + # The first unlabeled point (most uncertain in MockProbModel) should score highest + entry_ids, score_values = 
zip(*scores) + score_values = list(score_values) + max_idx = np.argmax(score_values) + # First unlabeled has p=0.5 (max entropy) + self.assertEqual(entry_ids[max_idx], 4) + + def test_method_lc(self): + """Should work with least-confident method.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + model = MockProbModel() + qs = InformationDensity(trn_ds, model=model, method='lc', + random_state=42) + + unlabeled_ids = set(trn_ds.get_unlabeled_entries()[0]) + ask_id = qs.make_query() + self.assertIn(ask_id, unlabeled_ids) + + def test_method_sm(self): + """Should work with smallest-margin method.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + model = MockProbModel() + qs = InformationDensity(trn_ds, model=model, method='sm', + random_state=42) + + unlabeled_ids = set(trn_ds.get_unlabeled_entries()[0]) + ask_id = qs.make_query() + self.assertIn(ask_id, unlabeled_ids) + + def test_method_entropy(self): + """Should work with entropy method.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + model = MockProbModel() + qs = InformationDensity(trn_ds, model=model, method='entropy', + random_state=42) + + unlabeled_ids = set(trn_ds.get_unlabeled_entries()[0]) + ask_id = qs.make_query() + self.assertIn(ask_id, unlabeled_ids) + + def test_continuous_model(self): + """Should work with ContinuousModel using lc or sm.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + model = MockContinuousModel() + qs = InformationDensity(trn_ds, model=model, method='lc', + random_state=42) + + unlabeled_ids = set(trn_ds.get_unlabeled_entries()[0]) + ask_id = qs.make_query() + self.assertIn(ask_id, unlabeled_ids) + + def test_entropy_requires_probabilistic_model(self): + """Should raise TypeError if entropy method used with ContinuousModel.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + model = MockContinuousModel() + + with self.assertRaises(TypeError): + InformationDensity(trn_ds, model=model, method='entropy') + + def test_missing_model(self): + 
"""Should raise TypeError if model not provided.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + + with self.assertRaises(TypeError): + InformationDensity(trn_ds) + + def test_invalid_model_type(self): + """Should raise TypeError for non-compatible model.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + + with self.assertRaises(TypeError): + InformationDensity(trn_ds, model=object()) + + def test_invalid_method(self): + """Should raise TypeError for unsupported method.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + model = MockProbModel() + + with self.assertRaises(TypeError): + InformationDensity(trn_ds, model=model, method='invalid') + + def test_cosine_metric(self): + """Should work with cosine distance metric.""" + X_cos = np.array([ + [1.0, 0.1], [0.9, 0.2], [1.0, 0.3], [0.8, 0.1], + [0.1, 1.0], [0.5, 0.5], [0.2, 0.9], [1.0, 1.0], + [0.3, 0.1], [0.1, 0.5], + ]) + y_cos = np.array([0, 0, 1, 1, 0, 1, 0, 1, 0, 1]) + trn_ds = init_dataset(X_cos, y_cos, n_labeled=4) + model = MockProbModel() + qs = InformationDensity(trn_ds, model=model, metric='cosine', + random_state=42) + + unlabeled_ids = set(trn_ds.get_unlabeled_entries()[0]) + ask_id = qs.make_query() + self.assertIn(ask_id, unlabeled_ids) + + def test_reproducibility(self): + """Same random_state should produce same queries.""" + trn_ds1 = init_dataset(self.X, self.y, n_labeled=4) + trn_ds2 = init_dataset(self.X, self.y, n_labeled=4) + model1 = MockProbModel() + model2 = MockProbModel() + + qs1 = InformationDensity(trn_ds1, model=model1, random_state=42) + qs2 = InformationDensity(trn_ds2, model=model2, random_state=42) + + self.assertEqual(qs1.make_query(), qs2.make_query()) + + def test_return_score(self): + """make_query(return_score=True) should return scores.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + model = MockProbModel() + qs = InformationDensity(trn_ds, model=model, random_state=42) + + result = qs.make_query(return_score=True) + 
self.assertEqual(len(result), 2) + ask_id, scores = result + self.assertIsInstance(ask_id, (int, np.integer)) + self.assertEqual(len(scores), 6) # 6 unlabeled + + def test_get_scores(self): + """_get_scores should return density-weighted uncertainty.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + model = MockProbModel() + qs = InformationDensity(trn_ds, model=model, random_state=42) + + scores = qs._get_scores() + unlabeled_ids = trn_ds.get_unlabeled_entries()[0] + self.assertEqual(len(scores), len(unlabeled_ids)) + + # All scores should be non-negative + for entry_id, score in scores: + self.assertGreaterEqual(score, 0.0) + + def test_empty_pool_error(self): + """Should raise error when no unlabeled samples available.""" + trn_ds = Dataset(self.X, self.y) # All labeled + model = MockProbModel() + qs = InformationDensity(trn_ds, model=model, random_state=42) + + with self.assertRaises(ValueError): + qs.make_query() + + def test_multiple_queries(self): + """Should handle multiple queries with dataset updates.""" + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + model = MockProbModel() + qs = InformationDensity(trn_ds, model=model, random_state=42) + + queries = [] + for _ in range(3): + ask_id = qs.make_query() + queries.append(ask_id) + trn_ds.update(ask_id, self.y[ask_id]) + + # All queries should be unique + self.assertEqual(len(queries), len(set(queries))) + + def test_density_scores_single_point(self): + """Density with single unlabeled point should return 1.0.""" + X = np.array([[0.0, 0.0], [0.1, 0.1], [1.0, 1.0]]) + y = np.array([0, 1, 0]) + trn_ds = init_dataset(X, y, n_labeled=2) + model = MockProbModel() + qs = InformationDensity(trn_ds, model=model, random_state=42) + + density = qs._density_scores(np.array([[1.0, 1.0]])) + self.assertEqual(density[0], 1.0) + + def test_continuous_model_large_decision_values(self): + """Scores should be non-negative even when predict_real returns large values. 
+ + ContinuousModel.predict_real can return unbounded decision values + (e.g., SVM decision function), causing 1-max(dvalue) to go negative. + The implementation must clamp uncertainty to >=0 for the multiplicative + density combination to work correctly. + """ + # Model that returns large decision values (like SVM far from boundary) + class LargeDvalueModel(ContinuousModel): + def train(self, dataset): + pass + + def predict(self, feature): + return np.zeros(len(feature), dtype=int) + + def score(self, testing_dataset): + return 0.5 + + def predict_real(self, feature): + n = len(feature) + dvalue = np.zeros((n, 2)) + for i in range(n): + # Decision values >> 1.0 (confident predictions) + d = 3.0 + i * 0.5 + dvalue[i] = [-d, d] + return dvalue + + trn_ds = init_dataset(self.X, self.y, n_labeled=4) + model = LargeDvalueModel() + qs = InformationDensity(trn_ds, model=model, method='lc', + random_state=42) + + scores = qs._get_scores() + # All scores should be non-negative (uncertainty clamped to 0) + for entry_id, score in scores: + self.assertGreaterEqual(score, 0.0) + + def test_density_favors_dense_with_continuous_model(self): + """With ContinuousModel, density should still favor dense regions. + + Even when uncertainty from predict_real is near zero for all points + (all far from boundary), the algorithm should not invert density + preference. 
+ """ + # Points: dense cluster + outlier + X = np.array([ + [0.0, 0.0], [0.1, 0.1], # labeled + [1.0, 1.0], [1.1, 1.0], [1.0, 1.1], # dense unlabeled cluster + [10.0, 10.0], # outlier + ]) + y = np.array([0, 1, 0, 1, 0, 1]) + + # Model where one dense point is slightly uncertain + class MixedModel(ContinuousModel): + def train(self, dataset): + pass + + def predict(self, feature): + return np.zeros(len(feature), dtype=int) + + def score(self, testing_dataset): + return 0.5 + + def predict_real(self, feature): + n = len(feature) + dvalue = np.zeros((n, 2)) + for i in range(n): + # First point near boundary, others far + d = 0.1 if i == 0 else 5.0 + dvalue[i] = [-d, d] + return dvalue + + trn_ds = init_dataset(X, y, n_labeled=2) + model = MixedModel() + qs = InformationDensity(trn_ds, model=model, method='lc', + beta=1.0, random_state=42) + + ask_id = qs.make_query() + # Should pick from dense cluster (index 2 — the uncertain one) + self.assertEqual(ask_id, 2) + + +if __name__ == '__main__': + unittest.main() diff --git a/libact/query_strategies/uncertainty_sampling.py b/libact/query_strategies/uncertainty_sampling.py index e04b393a..096fe934 100644 --- a/libact/query_strategies/uncertainty_sampling.py +++ b/libact/query_strategies/uncertainty_sampling.py @@ -104,6 +104,8 @@ def _get_scores(self): dvalue = self.model.predict_proba(X_pool) elif isinstance(self.model, ContinuousModel): dvalue = self.model.predict_real(X_pool) + else: + raise TypeError("model must be ContinuousModel or ProbabilisticModel") if self.method == 'lc': # least confident score = -np.max(dvalue, axis=1) @@ -116,6 +118,8 @@ def _get_scores(self): elif self.method == 'entropy': score = np.sum(-dvalue * np.log(dvalue), axis=1) + else: + raise ValueError("method must be 'lc', 'sm', or 'entropy'") return zip(unlabeled_entry_ids, score) diff --git a/libact/utils/meson.build b/libact/utils/meson.build new file mode 100644 index 00000000..db3b27e7 --- /dev/null +++ b/libact/utils/meson.build @@ -0,0 
+1,11 @@ +py_src = [ + '__init__.py', +] + +py.install_sources( + py_src, + subdir: 'libact/utils', +) + +subdir('multilabel') +subdir('tests') diff --git a/libact/utils/multilabel/meson.build b/libact/utils/multilabel/meson.build new file mode 100644 index 00000000..4b239756 --- /dev/null +++ b/libact/utils/multilabel/meson.build @@ -0,0 +1,8 @@ +py_src = [ + '__init__.py', +] + +py.install_sources( + py_src, + subdir: 'libact/utils/multilabel', +) diff --git a/libact/utils/tests/meson.build b/libact/utils/tests/meson.build new file mode 100644 index 00000000..a36b247e --- /dev/null +++ b/libact/utils/tests/meson.build @@ -0,0 +1,9 @@ +py_src = [ + '__init__.py', + 'test_criteria.py', +] + +py.install_sources( + py_src, + subdir: 'libact/utils/tests', +) diff --git a/meson.build b/meson.build new file mode 100644 index 00000000..3f5861db --- /dev/null +++ b/meson.build @@ -0,0 +1,20 @@ +project( + 'libact', + 'c', + 'cython', + 'cpp', + meson_version: '>=1.5.0', + default_options: ['buildtype=release', 'c_std=c11', 'cpp_std=c++17'], +) +py = import('python').find_installation(pure: false) + +result = run_command( + py, + '-c', 'import os; print("True" if os.environ.get("READTHEDOCS") else "False")', + check: true, +) +is_readthedocs = result.stdout().strip() == 'True' + +if not is_readthedocs + subdir('libact') +endif \ No newline at end of file diff --git a/meson.options b/meson.options new file mode 100644 index 00000000..2a4bf827 --- /dev/null +++ b/meson.options @@ -0,0 +1,28 @@ +option( + 'blas', + type: 'string', + value: 'auto', + description: 'option for BLAS library switching', +) +option( + 'lapack', + type: 'string', + value: 'auto', + description: 'option for LAPACK library switching', +) + +# Optional features (enabled by default) +# These features require BLAS/LAPACK libraries +# If BLAS/LAPACK are not found, they will be automatically disabled with a warning +option( + 'variance_reduction', + type: 'boolean', + value: true, + description: 'Build 
libact.query_strategies._variance_reduction (default: enabled, requires BLAS/LAPACK)', +) +option( + 'hintsvm', + type: 'boolean', + value: true, + description: 'Build libact.query_strategies._hintsvm (default: enabled, requires BLAS/LAPACK)', +) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..e4e3dbaf --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,124 @@ +[build-system] +build-backend = "mesonpy" +requires = [ + "meson-python>=0.16", + "ninja", + "cython", + "numpy", + "scipy-openblas32>=0.3.27", +] + +[project] +name = "libact" +version = "0.2.0" +description = "Pool-based active learning in Python" +readme = { file = "README.md", content-type = "text/markdown" } +requires-python = ">=3.9" +license = { file = "LICENSE" } +authors = [ + { name = "Y.-Y. Yang", email = "b01902066@csie.ntu.edu.tw" }, + { name = "S.-C. Lee", email = "b01902010@csie.ntu.edu.tw" }, + { name = "Y.-A. Chung", email = "b01902040@csie.ntu.edu.tw" }, + { name = "T.-E. Wu", email = "r00942129@ntu.edu.tw" }, + { name = "H.-T. 
Lin", email = "htlin@csie.ntu.edu.tw" }, +] +classifiers = [ + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "numpy>=2", + "scipy>=1.13", + "scikit-learn>=1.6", + "matplotlib>=3.8", + "joblib==1.5.1", +] + +[project.urls] +Homepage = "https://github.com/ntucllab/libact" +Repository = "https://github.com/ntucllab/libact" + +[project.optional-dependencies] +dev = ["build>=1.2.2.post1", "meson>=1.5.0", "ninja", "cython", "numpy"] + +# Build wheels + +[tool.cibuildwheel] +# Build for Python 3.9-3.12 on Linux and macOS only +build = "cp39-* cp310-* cp311-* cp312-*" +# Skip musllinux (Alpine), PyPy, and Windows +skip = "*-musllinux_* pp* *-win*" +# Test the wheel after building +test-command = """python -c ' +import libact +import libact.query_strategies as qs +# Verify basic modules work +assert hasattr(qs, \"UncertaintySampling\") +assert hasattr(qs, \"RandomSampling\") +print(\"✓ Basic modules imported successfully\") +# Check optional C-extensions (may not be available without BLAS/LAPACK) +has_ext = hasattr(qs, \"HintSVM\") and hasattr(qs, \"VarianceReduction\") +if has_ext: + from libact.query_strategies import HintSVM, VarianceReduction + print(\"✓ C-extensions (HintSVM, VarianceReduction) available\") +else: + print(\"⚠ C-extensions not available (built without BLAS/LAPACK)\") +'""" +test-requires = ["numpy>=2", "scipy>=1.13", "scikit-learn>=1.6", "matplotlib>=3.8"] + +[tool.cibuildwheel.linux] +# Architecture is set to "native" in workflow (each runner builds its own arch) +# Use manylinux_2_28 (modern standard, matches NumPy 2.x) +manylinux-x86_64-image = "manylinux_2_28" +manylinux-aarch64-image = "manylinux_2_28" +# Install 
OpenBLAS and create pkg-config files (manylinux_2_28 may not include them) +before-all = [ + "dnf install -y openblas-devel lapack-devel", + """cat > /tmp/openblas.pc << 'EOF' +prefix=/usr +libdir=/usr/lib64 +includedir=/usr/include/openblas + +Name: OpenBLAS +Description: OpenBLAS library +Version: 0.3.0 +Libs: -L${libdir} -lopenblas +Cflags: -I${includedir} +EOF""", + """cat > /tmp/lapack.pc << 'EOF' +prefix=/usr +libdir=/usr/lib64 +includedir=/usr/include + +Name: LAPACK +Description: LAPACK library +Version: 3.0.0 +Libs: -L${libdir} -llapack -lblas +Cflags: -I${includedir} +EOF""", +] +# Set PKG_CONFIG_PATH to include /tmp where we created .pc files +environment = { PKG_CONFIG_PATH="/tmp:/usr/lib64/pkgconfig:/usr/lib/pkgconfig" } +# Bundle OpenBLAS libraries into the wheel +repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}" + +[tool.cibuildwheel.macos] +# Architecture is set to "native" in workflow (each runner builds its own arch) +# Install lapack (provides lapacke.h headers) - Accelerate framework provides the library +before-all = [ + "brew install lapack pkg-config", +] +# Bundle libraries into wheel +repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}" + +[tool.cibuildwheel.macos.environment] +# Point to lapack headers (lapacke.h) - Accelerate framework provides BLAS/LAPACK libraries +# Paths work for both ARM64 (/opt/homebrew) and Intel (/usr/local) +CPPFLAGS = "-I/opt/homebrew/opt/lapack/include -I/usr/local/opt/lapack/include" +LDFLAGS = "-L/opt/homebrew/opt/lapack/lib -L/usr/local/opt/lapack/lib" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 7172cd47..00000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -setuptools<60.0.0 -numpy<=2.2.2 -scipy<=1.15.1 -scikit-learn<=1.6.1 -matplotlib<=3.10.0 -Cython<=3.0.11 -joblib<=1.4.2 diff --git a/setup.py b/setup.py deleted file mode 100755 index 706db008..00000000 --- a/setup.py +++ /dev/null @@ -1,128 +0,0 @@ 
-#!/usr/bin/env python - -from io import open # python 2 compatibility -import os -from setuptools import setup, Extension -import sys - -BUILD_HINTSVM = int(os.environ.get("LIBACT_BUILD_HINTSVM", 1)) -BUILD_VARIANCE_REDUCTION = int(os.environ.get("LIBACT_BUILD_VARIANCE_REDUCTION", 1)) - - -on_rtd = os.environ.get('READTHEDOCS', None) == 'True' -# read the docs could not compile numpy and c extensions -if on_rtd: - extensions = [] - cmdclasses = {} - setup_requires = [] - install_requires = [] - tests_require = [] -else: - from Cython.Build import cythonize - from Cython.Distutils import build_ext - import numpy - import numpy.distutils - if sys.platform == 'darwin': - print("Platform Detection: Mac OS X. Link to openblas...") - extra_link_args = [] - libraries = ['openblas'] - library_dirs = [ - '/opt/local/lib', - '/usr/local/opt/openblas/lib', # for brew installs - ] - include_dirs = (numpy.distutils.misc_util.get_numpy_include_dirs() + - ['/opt/local/include', - '/usr/local/opt/openblas/include']) # for brew installs - else: - # assume linux otherwise, unless we support Windows in the future... - print("Platform Detection: Linux. 
Link to liblapacke...") - extra_link_args = [] - include_dirs = (numpy.distutils.misc_util.get_numpy_include_dirs() + - ['/usr/include/']) - libraries = ['lapacke', 'lapack', 'blas'] - library_dirs = ['/usr/lib'] - - extensions = [] - if BUILD_VARIANCE_REDUCTION: - print("Build VarianceReduction...") - extensions.append( - Extension( - "libact.query_strategies._variance_reduction", - ["libact/query_strategies/src/variance_reduction/variance_reduction.c"], - extra_link_args=extra_link_args, - extra_compile_args=['-std=c11'], - include_dirs=include_dirs, - libraries=libraries, - library_dirs=library_dirs, - ) - ) - if BUILD_HINTSVM: - print("Build HintSVM...") - extensions.append( - Extension( - "libact.query_strategies._hintsvm", - sources=["libact/query_strategies/_hintsvm.pyx", - "libact/query_strategies/src/hintsvm/libsvm_helper.c", - "libact/query_strategies/src/hintsvm/svm.cpp"], - include_dirs=[numpy.get_include(), - "libact/query_strategies/src/hintsvm/"], - extra_compile_args=['-lstdc++'], - ) - ) - - extensions = cythonize(extensions) - cmdclasses = {'build_ext': build_ext} - setup_requires = [] - with open('./requirements.txt') as f: - requirements = f.read().splitlines() - install_requires = requirements - tests_require = [ - 'coverage', - ] - - -setup( - name='libact', - version='0.1.6', - description='Pool-based active learning in Python', - long_description=open('README.md', 'r', newline='', encoding='utf-8').read(), - long_description_content_type="text/markdown", - author='Y.-Y. Yang, S.-C. Lee, Y.-A. Chung, T.-E. Wu, H.-T. 
Lin', - author_email='b01902066@csie.ntu.edu.tw, b01902010@csie.ntu.edu.tw, ' - 'b01902040@csie.ntu.edu.tw, r00942129@ntu.edu.tw, htlin@csie.ntu.edu.tw', - url='https://github.com/ntucllab/libact', - cmdclass=cmdclasses, - setup_requires=setup_requires, - install_requires=install_requires, - tests_require=tests_require, - classifiers=[ - "Topic :: Scientific/Engineering", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", - ], - test_suite='libact', - packages=[ - 'libact', - 'libact.base', - 'libact.models', - 'libact.models.multilabel', - 'libact.labelers', - 'libact.query_strategies', - 'libact.query_strategies.multilabel', - 'libact.query_strategies.multiclass', - 'libact.utils', - ], - package_dir={ - 'libact': 'libact', - 'libact.base': 'libact/base', - 'libact.models': 'libact/models', - 'libact.labelers': 'libact/labelers', - 'libact.query_strategies': 'libact/query_strategies', - 'libact.query_strategies.multiclass': 'libact/query_strategies/multiclass', - 'libact.utils': 'libact/utils', - }, - ext_modules=extensions, -)