diff --git a/docs/data_structures/data_structures.rst b/docs/data_structures/data_structures.rst index 1d88d993dd..35a5ba0158 100644 --- a/docs/data_structures/data_structures.rst +++ b/docs/data_structures/data_structures.rst @@ -8,9 +8,9 @@ See :ref:`here` for instruction on constructing a complete workflow :maxdepth: 2 :caption: libEnsemble Specifications: - sim_specs - gen_specs libE_specs + gen_specs + sim_specs alloc_specs platform_specs persis_info diff --git a/docs/data_structures/libE_specs.rst b/docs/data_structures/libE_specs.rst index 2a9195cea5..c0ca141403 100644 --- a/docs/data_structures/libE_specs.rst +++ b/docs/data_structures/libE_specs.rst @@ -1,7 +1,7 @@ .. _datastruct-libe-specs: -General Specs -============= +LibE Specs +========== libEnsemble is primarily customized by setting options within a ``LibeSpecs`` class or dictionary. @@ -10,11 +10,10 @@ libEnsemble is primarily customized by setting options within a ``LibeSpecs`` cl from libensemble.specs import LibeSpecs specs = LibeSpecs( - comm=MPI.COMM_WORLD, - comms="mpi", - save_every_k_gens=1000, + gen_on_manager=True, + save_every_k_gens=100, sim_dirs_make=True, - ensemble_dir_path="/scratch/ensemble", + nworkers=4 ) .. dropdown:: Settings by Category @@ -272,8 +271,8 @@ libEnsemble is primarily customized by setting options within a ``LibeSpecs`` cl ``disable_resource_manager`` is set. **dedicated_mode** [bool] = ``False``: - Disallow any resources running libEnsemble processes (manager and workers) - from being valid targets for app submissions. + Instructs libEnsemble’s MPI executor not to run applications on nodes where + libEnsemble processes (manager and workers) are running. **zero_resource_workers** [list of ints]: List of workers (by IDs) that require no resources. For when a fixed mapping of workers diff --git a/docs/images/centralized_new_detailed.png b/docs/images/centralized_dedicated.png similarity index 100% rename from docs/images/centralized_new_detailed.png rename to docs/images/centralized_dedicated.png diff --git a/docs/images/centralized_gen_on_manager.png b/docs/images/centralized_gen_on_manager.png new file mode 100644 index 0000000000..949f8e3968 Binary files /dev/null and b/docs/images/centralized_gen_on_manager.png differ diff --git a/docs/platforms/example_scripts.rst b/docs/platforms/example_scripts.rst index 3f63524520..072cc65f22 100644 --- a/docs/platforms/example_scripts.rst +++ b/docs/platforms/example_scripts.rst @@ -2,86 +2,105 @@ Example Scheduler Submission Scripts ==================================== Below are example submission scripts used to configure and launch libEnsemble -on a variety of high-powered systems. See :ref:`here` for more -information about the respective systems and configuration. +on a variety of high-powered systems. See :ref:`Running on HPC Systems` +for more information about the respective systems and configuration. + +.. note:: + It is **highly recommended** that the directive lines (e.g., #SBATCH) in batch + submission scripts do **NOT** specify processor, task, or GPU configuration info + --- these lines should only specify the number of nodes required. + + For example, do not specify ``#SBATCH --gpus-per-node=4`` in order to use four + GPUs on the node, when each worker may use less than this, as this may assign + all of the GPUs to a single MPI invocation. 
Instead, the configuration should + be supplied either + :doc:`in the simulation function<../examples/sim_funcs/forces_simf_gpu>` + or, if using dynamic resources, + :doc:`in the generator<../examples/sim_funcs/forces_simf_gpu_vary_resources>`. + General examples ---------------- -.. dropdown:: Slurm - Basic +Slurm - Basic +^^^^^^^^^^^^^ - .. literalinclude:: ../../examples/libE_submission_scripts/submit_slurm_simple.sh - :caption: /examples/libE_submission_scripts/submit_slurm_simple.sh - :language: bash +.. literalinclude:: ../../examples/libE_submission_scripts/submit_slurm_simple.sh + :caption: /examples/libE_submission_scripts/submit_slurm_simple.sh + :language: bash -.. dropdown:: PBS - Basic +PBS - Basic +^^^^^^^^^^^ - .. literalinclude:: ../../examples/libE_submission_scripts/submit_pbs_simple.sh - :caption: /examples/libE_submission_scripts/submit_pbs_simple.sh - :language: bash +.. literalinclude:: ../../examples/libE_submission_scripts/submit_pbs_simple.sh + :caption: /examples/libE_submission_scripts/submit_pbs_simple.sh + :language: bash -.. dropdown:: LSF - Basic +LSF - Basic +^^^^^^^^^^^ - .. literalinclude:: ../../examples/libE_submission_scripts/submit_lsf_simple.sh - :caption: /examples/libE_submission_scripts/submit_lsf_simple.sh - :language: bash +.. literalinclude:: ../../examples/libE_submission_scripts/submit_lsf_simple.sh + :caption: /examples/libE_submission_scripts/submit_lsf_simple.sh + :language: bash System Examples --------------- -.. dropdown:: Aurora - - .. literalinclude:: ../../examples/libE_submission_scripts/submit_pbs_aurora.sh - :caption: /examples/libE_submission_scripts/submit_pbs_aurora.sh - :language: bash - -.. dropdown:: Frontier (Large WarpX Ensemble) - - .. literalinclude:: ../../examples/libE_submission_scripts/submit_frontier_large.sh - :caption: /examples/libE_submission_scripts/submit_frontier_large.sh - :language: bash +Aurora +^^^^^^ +.. literalinclude:: ../../examples/libE_submission_scripts/submit_pbs_aurora.sh + :caption: /examples/libE_submission_scripts/submit_pbs_aurora.sh + :language: bash -.. dropdown:: Perlmutter +Frontier (Large WarpX Ensemble) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - .. literalinclude:: ../../examples/libE_submission_scripts/submit_perlmutter.sh - :caption: /examples/libE_submission_scripts/submit_perlmutter.sh - :language: bash +.. literalinclude:: ../../examples/libE_submission_scripts/submit_frontier_large.sh + :caption: /examples/libE_submission_scripts/submit_frontier_large.sh + :language: bash -.. dropdown:: Polaris +Perlmutter +^^^^^^^^^^ - .. literalinclude:: ../../examples/libE_submission_scripts/submit_pbs_polaris.sh - :caption: /examples/libE_submission_scripts/submit_pbs_polaris.sh - :language: bash +.. literalinclude:: ../../examples/libE_submission_scripts/submit_perlmutter.sh + :caption: /examples/libE_submission_scripts/submit_perlmutter.sh + :language: bash -.. dropdown:: Bridges - Central Mode +Polaris +^^^^^^^ - .. literalinclude:: ../../examples/libE_submission_scripts/bridges_submit_slurm_central.sh - :caption: /examples/libE_submission_scripts/bridges_submit_slurm_central.sh - :language: bash +.. literalinclude:: ../../examples/libE_submission_scripts/submit_pbs_polaris.sh + :caption: /examples/libE_submission_scripts/submit_pbs_polaris.sh + :language: bash -.. dropdown:: Bebop - Central Mode +Bebop +^^^^^ - .. literalinclude:: ../../examples/libE_submission_scripts/bebop_submit_slurm_central.sh - :caption: /examples/libE_submission_scripts/bebop_submit_slurm_central.sh - :language: bash +.. 
literalinclude:: ../../examples/libE_submission_scripts/bebop_submit_pbs_central.sh + :caption: /examples/libE_submission_scripts/bebop_submit_pbs_central.sh + :language: bash -.. dropdown:: Bebop - Distributed Mode +Bridges - MPI / Central Mode +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - .. literalinclude:: ../../examples/libE_submission_scripts/bebop_submit_pbs_distrib.sh - :caption: /examples/libE_submission_scripts/bebop_submit_pbs_distrib.sh - :language: bash +.. literalinclude:: ../../examples/libE_submission_scripts/bridges_submit_slurm_central.sh + :caption: /examples/libE_submission_scripts/bridges_submit_slurm_central.sh + :language: bash -.. dropdown:: Summit (Decommissioned) - On Launch Nodes with Multiprocessing +.. _slurm_mpi_distributed: - .. literalinclude:: ../../examples/libE_submission_scripts/summit_submit_mproc.sh - :caption: /examples/libE_submission_scripts/summit_submit_mproc.sh - :language: bash +SLURM - MPI / Distributed Mode (co-locate workers & MPI applications) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. dropdown:: Cobalt - Intermediate node with Multiprocessing +.. literalinclude:: ../../examples/libE_submission_scripts/submit_distrib_mpi4py.sh + :caption: /examples/libE_submission_scripts/submit_distrib_mpi4py.sh + :language: bash - .. literalinclude:: ../../examples/libE_submission_scripts/cobalt_submit_mproc.sh - :caption: /examples/libE_submission_scripts/cobalt_submit_mproc.sh - :language: bash +Summit (Decommissioned) - On Launch Nodes with Multiprocessing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. literalinclude:: ../../examples/libE_submission_scripts/summit_submit_mproc.sh + :caption: /examples/libE_submission_scripts/summit_submit_mproc.sh + :language: bash diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index 02e4ace459..c56ab66aa9 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -3,37 +3,119 @@ Running on HPC Systems ====================== -Central vs. Distributed ------------------------ - -libEnsemble has been developed, supported, and tested on systems of highly varying -scales, from laptops to thousands of compute nodes. On multi-node systems, there are -two basic modes of configuring libEnsemble to run and launch tasks (user applications) +libEnsemble has been tested on systems of highly varying scales, from laptops to +thousands of compute nodes. On multi-node systems, there are a few alternative +ways of configuring libEnsemble to run and launch tasks (i.e., user applications) on the available nodes. -The first mode we refer to as **central** mode, where the libEnsemble manager and worker processes -are grouped onto one or more dedicated nodes. Workers launch applications onto -the remaining allocated nodes: +The :doc:`Forces tutorial <../../tutorials/executor_forces_tutorial>` gives an +example with a simple MPI application. - .. image:: ../images/centralized_new_detailed.png - :alt: centralized - :scale: 30 - :align: center +Note that while the diagrams below show one application being run per node, +configurations with **multiple nodes per worker** or **multiple workers per node** +are both common use cases. + +Centralized Running +------------------- + +The default communications scheme places the manager and workers on the first node. +The :doc:`MPI Executor<../executor/mpi_executor>` can then be invoked by each +simulation worker, and libEnsemble will distribute user applications across the +node allocation. 
This is the **most common approach** where each simulation +runs an MPI application. + +The generator will run on a worker by default, but if running a single generator, +the :ref:`libE_specs` option **gen_on_manager** is recommended, +which runs the generator on the manager (using a thread) as below. + +.. list-table:: + :widths: 60 40 + + * - .. image:: ../images/centralized_gen_on_manager.png + :alt: centralized + :scale: 55 + + - In calling script: + + .. code-block:: python + :linenos: + + ensemble.libE_specs = LibeSpecs( + gen_on_manager=True, + ) + + A SLURM batch script may include: + + + .. code-block:: bash + + #SBATCH --nodes 3 + + python run_libe_forces.py --nworkers 3 + + +When using **gen_on_manager**, set ``nworkers`` to the number of workers desired for running simulations. + +Dedicated Mode +^^^^^^^^^^^^^^ + +If the :ref:`libE_specs` option **dedicated_mode** is set to +True, the MPI executor will not launch applications on nodes where libEnsemble Python +processes (manager and workers) are running. Workers launch applications onto the +remaining nodes in the allocation. -Alternatively, in **distributed** mode, the libEnsemble (manager/worker) processes -will share nodes with submitted tasks. This enables libEnsemble, using the *mpi4py* -communicator, to be run with the workers spread across nodes so as to be co-located -with their tasks. + +.. list-table:: + :widths: 60 40 + + * - .. image:: ../images/centralized_dedicated.png + :alt: centralized dedicated mode + :scale: 30 + + - In calling script: + + .. code-block:: python + :linenos: + + ensemble.libE_specs = LibeSpecs( + num_resource_sets=2, + dedicated_mode=True, + ) + + A SLURM batch script may include: + + + .. code-block:: bash + + #SBATCH --nodes 3 + + python run_libe_forces.py --nworkers 3 + + +Note that **gen_on_manager** is not set in the above example. + +Distributed Running +-------------------- + +In the **distributed** approach, libEnsemble can be run using the **mpi4py** +communicator, with workers distributed across nodes. This is most often used +when workers run simulation code directly, via a Python interface. The user +script is invoked with an MPI runner, for example (using an `mpich`-based MPI):: + + mpirun -np 4 -ppn 1 python myscript.py + +The distributed approach can also be used with the executor to co-locate workers +with the applications they submit. Ensuring that workers are placed as required in this +case requires :ref:`a careful MPI rank placement `. .. image:: ../images/distributed_new_detailed.png :alt: distributed :scale: 30 :align: center -Configurations with multiple nodes per worker or multiple workers per node are both -common use cases. The distributed approach allows the libEnsemble worker to read files -produced by the application on local node storage. HPC systems that allow only one -application to be launched to a node at any one time prevent distributed configuration. +This allows the libEnsemble worker to read files produced by the application on +local node storage. + Configuring the Run ------------------- @@ -44,41 +126,53 @@ the nodes within that allocation. *How does libEnsemble know where to run tasks (user applications)?* -The libEnsemble :doc:`Executor<../executor/ex_index>` can be initialized from the user calling +The libEnsemble :doc:`MPI Executor<../executor/mpi_executor>` can be initialized from the user calling script, and then used by workers to run tasks. The Executor will automatically detect the nodes available on most systems.
Alternatively, the user can provide a file called **node_list** in the run directory. By default, the Executor will divide up the nodes evenly to each worker. -If the argument ``libE_specs["dedicated_mode"]=True`` is used when initializing libEnsemble, then any node -that is running a libEnsemble manager or worker will be removed from the node-list available -to the workers, ensuring libEnsemble has dedicated nodes. -To run in central mode using a 5-node allocation with 4 workers: From the head node -of the allocation:: +Mapping Tasks to Resources +-------------------------- - mpirun -np 5 python myscript.py +The :ref:`resource manager` detects node lists from +:ref:`common batch schedulers`, +and partitions these to workers. The :doc:`MPI Executor<../executor/mpi_executor>` +accesses the resources available to the current worker when launching tasks. + +Zero-resource workers +~~~~~~~~~~~~~~~~~~~~~ + +Users with persistent ``gen_f`` functions may notice that the persistent workers +are still automatically assigned system resources. This can be resolved by using +the ``gen_on_manager`` option or by +:ref:`fixing the number of resource sets`. + +Assigning GPUs +-------------- + +libEnsemble automatically detects and assigns Nvidia, AMD, and Intel GPUs without modifying the user scripts. This works automatically on many systems, but if the assignment is incorrect or needs to be modified, the user can specify :ref:`platform information`. +The :doc:`forces_gpu tutorial<../tutorials/forces_gpu_tutorial>` shows an example of this. + +Varying resources +----------------- + +libEnsemble also features :ref:`dynamic resource assignment`, whereby the +number of processes and/or the number of GPUs can be set for each simulation by the generator. -or:: - python myscript.py --nworkers 4 +Overriding Auto-Detection +------------------------- -Either of these will run libEnsemble (inc. manager and 4 workers) on the first node. The remaining -4 nodes will be divided among the workers for submitted applications. If the same run was -performed without ``libE_specs["dedicated_mode"]=True``, runs could be submitted to all 5 nodes. The number of workers -can be modified to allow either multiple workers to map to each node or multiple nodes per worker. +libEnsemble can automatically detect system information. This includes resource information, such as +available nodes and the number of cores on the node, and information about available MPI runners. -To launch libEnsemble distributed requires a less trivial libEnsemble run script. -For example:: +System detection for resources can be overridden using the :ref:`resource_info` +libE_specs option. - mpirun -np 5 -ppn 1 python myscript.py +When using the MPI Executor, it is possible to override the detected information using the +`custom_info` argument. See the :doc:`MPI Executor<../executor/mpi_executor>` for more. -would launch libEnsemble with 5 processes across 5 nodes. However, the manager would have its -own node, which is likely wasteful. More often, a ``machinefile`` is used to add the manager to -the first node. In the :doc:`examples` directory, you can find an example submission -script, configured to run libensemble distributed, with multiple workers per node or multiple nodes -per worker, and adding the manager onto the first node. -HPC systems that only allow one application to be launched to a node at any one time, -will not allow a distributed configuration.
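+As a rough sketch only (the ``resource_info`` keys and values below are illustrative;
+see the :ref:`resource_info` documentation for the supported fields), such an override
+in the calling script may look like:
+
+.. code-block:: python
+
+    from libensemble.specs import LibeSpecs
+
+    specs = LibeSpecs(
+        nworkers=4,
+        # Hypothetical values: 16 physical / 64 logical cores per node,
+        # with node names read from a custom file instead of being auto-detected.
+        resource_info={"cores_on_node": (16, 64), "node_file": "node_list"},
+    )
+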
Systems with Launch/MOM Nodes ----------------------------- @@ -115,34 +209,9 @@ or *to entirely different systems*. (New) Multi-System: libEnsemble + BalsamExecutor -Submission scripts for running on launch/MOM nodes and for using Balsam, can be found in +Submission scripts for running on launch/MOM nodes and for using Balsam can be found in the :doc:`examples`. -Mapping Tasks to Resources --------------------------- - -The :ref:`resource manager` can :ref:`detect system resources`, -and partition these to workers. The :doc:`MPI Executor<../executor/mpi_executor>` -accesses the resources available to the current worker when launching tasks. - -Zero-resource workers -~~~~~~~~~~~~~~~~~~~~~ - -Users with persistent ``gen_f`` functions may notice that the persistent workers -are still automatically assigned system resources. This can be resolved by -:ref:`fixing the number of resource sets`. - -Overriding Auto-Detection -------------------------- - -libEnsemble can automatically detect system information. This includes resource information, such as -available nodes and the number of cores on the node, and information about available MPI runners. - -System detection for resources can be overridden using the :ref:`resource_info` -libE_specs option. - -When using the MPI Executor, it is possible to override the detected information using the -`custom_info` argument. See the :doc:`MPI Executor<../executor/mpi_executor>` for more. .. _globus_compute_ref: diff --git a/docs/tutorials/forces_gpu_tutorial.rst b/docs/tutorials/forces_gpu_tutorial.rst index e53687af4e..ab1ee121fc 100644 --- a/docs/tutorials/forces_gpu_tutorial.rst +++ b/docs/tutorials/forces_gpu_tutorial.rst @@ -225,6 +225,8 @@ If you want to have two GPUs per worker on the same system (with four GPUs per n you could assign only four workers. You will see that two GPUs are used for each forces run. +.. _var-resources-gpu: + Varying resources ----------------- diff --git a/examples/libE_submission_scripts/submit_distrib_mpi4py.sh b/examples/libE_submission_scripts/submit_distrib_mpi4py.sh index d696c1dfa6..2d15c07abb 100644 --- a/examples/libE_submission_scripts/submit_distrib_mpi4py.sh +++ b/examples/libE_submission_scripts/submit_distrib_mpi4py.sh @@ -20,10 +20,11 @@ export NUM_WORKERS=4 export MANAGER_NODE=false # true = Manager has a dedicated node (assign one extra) export USE_NODE_LIST=true # If false, allow libE to determine node_list from environment. +# Sometimes may be necessary # As libE shares nodes with user applications allow fallback if contexts overrun. 
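+# These I_MPI_* fabric settings are typically only needed with older Intel MPI versions;
+# re-enable the lines below only if co-located runs fail with fabric/context errors.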
-unset I_MPI_FABRICS -export I_MPI_FABRICS_LIST=tmi,tcp -export I_MPI_FALLBACK=1 +# unset I_MPI_FABRICS +# export I_MPI_FABRICS_LIST=tmi,tcp +# export I_MPI_FALLBACK=1 # If using in calling script (After N mins manager kills workers and exits cleanly) export LIBE_WALLCLOCK=55 diff --git a/examples/libE_submission_scripts/submit_perlmutter.sh b/examples/libE_submission_scripts/submit_perlmutter.sh index ae34d39e56..8a029761ab 100644 --- a/examples/libE_submission_scripts/submit_perlmutter.sh +++ b/examples/libE_submission_scripts/submit_perlmutter.sh @@ -7,10 +7,9 @@ # This script is using GPU partition export MPICH_GPU_SUPPORT_ENABLED=1 -export SLURM_EXACT=1 # One worker for generator and 4 for sims (one GPU each) python libe_calling_script.py -n 5 # Or if libE_specs option gen_on_manager=True -python libe_calling_script.py -n 4 +# python libe_calling_script.py -n 4 diff --git a/libensemble/specs.py b/libensemble/specs.py index 3b72482fc2..aa70018362 100644 --- a/libensemble/specs.py +++ b/libensemble/specs.py @@ -469,8 +469,8 @@ class LibeSpecs(BaseModel): dedicated_mode: Optional[bool] = False """ - Instructs libEnsemble to not run applications on resources where libEnsemble - processes (manager and workers) are running. + Instructs libEnsemble’s MPI executor not to run applications on nodes where + libEnsemble processes (manager and workers) are running. """ zero_resource_workers: Optional[List[int]] = []
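For illustration only (the worker count and option values are placeholders; ``dedicated_mode`` and ``nworkers`` are the ``LibeSpecs`` options documented above), a calling script that keeps libEnsemble's own nodes free of application runs might contain:

.. code-block:: python

    from libensemble.specs import LibeSpecs

    # Keep manager/worker nodes free of application runs; the MPI executor
    # then targets only the remaining nodes in the allocation.
    specs = LibeSpecs(
        nworkers=4,
        dedicated_mode=True,
    )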