diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index 8302b192..00000000 --- a/.coveragerc +++ /dev/null @@ -1,17 +0,0 @@ -[run] -branch = False -omit = - # Mujoco requires a licence - stable_baselines/*/run_mujoco.py - stable_baselines/ppo1/run_humanoid.py - stable_baselines/ppo1/run_robotics.py - # HER requires mpi and Mujoco - stable_baselines/her/experiment/* - tests/* - setup.py - -[report] -exclude_lines = - pragma: no cover - raise NotImplementedError() - if KFAC_DEBUG: diff --git a/.dockerignore b/.dockerignore deleted file mode 120000 index 3e4e48b0..00000000 --- a/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -.gitignore \ No newline at end of file diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 53acce05..00000000 --- a/.gitignore +++ /dev/null @@ -1,42 +0,0 @@ -*.swp -*.pyc -*.pkl -*.py~ -*.bak -.pytest_cache -.pytype -.DS_Store -.idea -.coverage -.coverage.* -__pycache__/ -_build/ -*.npz -*.zip - -# Setuptools distribution and build folders. -/dist/ -/build -keys/ - -# Virtualenv -/env -/venv - -*.sublime-project -*.sublime-workspace - -logs/ - -.ipynb_checkpoints -ghostdriver.log - -htmlcov - -junk -src - -*.egg-info -.cache - -MUJOCO_LOG.TXT diff --git a/.readthedocs.yml b/.readthedocs.yml deleted file mode 100644 index 6b05eedd..00000000 --- a/.readthedocs.yml +++ /dev/null @@ -1,18 +0,0 @@ -# Read the Docs configuration file -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -# Required -version: 2 - -# Build documentation in the docs/ directory with Sphinx -sphinx: - configuration: docs/conf.py - -# Optionally build your docs in additional formats such as PDF and ePub -formats: all - -# Optionally set the version of Python and requirements required to build your docs -python: - version: 3.7 - install: - - requirements: docs/requirements.txt diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 049cc0ff..00000000 --- a/.travis.yml +++ /dev/null @@ -1,52 +0,0 @@ -language: 
python -python: - - "3.5" - -env: - global: - - DOCKER_IMAGE=stablebaselines/stable-baselines-cpu:v2.10.0 - -notifications: - email: false - -services: - - docker - -install: - - docker pull ${DOCKER_IMAGE} - -script: - - ./scripts/run_tests_travis.sh "${TEST_GLOB}" - -jobs: - include: - # Big test suite. Run in parallel to decrease wall-clock time, and to avoid OOM error from leaks - - stage: Test - name: "Unit Tests a-h" - env: TEST_GLOB="[a-h]*" - - - name: "Unit Tests i-l" - env: TEST_GLOB="[i-l]*" - - - name: "Unit Tests m-sa" - env: TEST_GLOB="{[m-r]*,sa*}" - - - name: "Unit Tests sb-z" - env: TEST_GLOB="{s[b-z]*,[t-z]*}" - - - name: "Unit Tests determinism" - env: TEST_GLOB="0deterministic.py" - - - name: "Sphinx Documentation" - script: - - 'docker run -it --rm --mount src=$(pwd),target=/root/code/stable-baselines,type=bind ${DOCKER_IMAGE} bash -c "cd /root/code/stable-baselines/ && pushd docs/ && make clean && make html"' - - - name: "Type Checking" - script: - - 'docker run --rm --mount src=$(pwd),target=/root/code/stable-baselines,type=bind ${DOCKER_IMAGE} bash -c "cd /root/code/stable-baselines/ && pytype --version && pytype"' - - - stage: Codacy Trigger - if: type != pull_request - script: - # When all test coverage reports have been uploaded, instruct Codacy to start analysis. - - 'docker run -it --rm --network host --ipc=host --mount src=$(pwd),target=/root/code/stable-baselines,type=bind --env CODACY_PROJECT_TOKEN=${CODACY_PROJECT_TOKEN} ${DOCKER_IMAGE} bash -c "cd /root/code/stable-baselines/ && java -jar /root/code/codacy-coverage-reporter.jar final"' diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index d11f2780..00000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,122 +0,0 @@ -## Contributing to Stable-Baselines - -If you are interested in contributing to Stable-Baselines, your contributions will fall -into two categories: -1. 
You want to propose a new Feature and implement it - - Create an issue about your intended feature, and we shall discuss the design and - implementation. Once we agree that the plan looks good, go ahead and implement it. -2. You want to implement a feature or bug-fix for an outstanding issue - - Look at the outstanding issues here: https://github.com/hill-a/stable-baselines/issues - - Look at the roadmap here: https://github.com/hill-a/stable-baselines/projects/1 - - Pick an issue or feature and comment on the task that you want to work on this feature. - - If you need more context on a particular issue, please ask and we shall provide. - -Once you finish implementing a feature or bug-fix, please send a Pull Request to -https://github.com/hill-a/stable-baselines/ - - -If you are not familiar with creating a Pull Request, here are some guides: -- http://stackoverflow.com/questions/14680711/how-to-do-a-github-pull-request -- https://help.github.com/articles/creating-a-pull-request/ - - -## Developing Stable-Baselines - -To develop Stable-Baselines on your machine, here are some tips: - -1. Clone a copy of Stable-Baselines from source: - -```bash -git clone https://github.com/hill-a/stable-baselines/ -cd stable-baselines -``` - -2. Install Stable-Baselines in develop mode, with support for building the docs and running tests: - -```bash -pip install -e .[docs,tests] -``` - -## Codestyle - -We follow the [PEP8 codestyle](https://www.python.org/dev/peps/pep-0008/). Please order the imports as follows: - -1. built-in -2. packages -3. current module - -with one space between each, that gives for instance: -```python -import os -import warnings - -import numpy as np - -from stable_baselines import PPO2 -``` - -In general, we recommend using pycharm to format everything in an efficient way. 
- -Please document each function/method and [type](https://google.github.io/pytype/user_guide.html) them using the following template: - -```python - -def my_function(arg1: type1, arg2: type2) -> returntype: - """ - Short description of the function. - - :param arg1: (type1) describe what is arg1 - :param arg2: (type2) describe what is arg2 - :return: (returntype) describe what is returned - """ - ... - return my_variable -``` - -## Pull Request (PR) - -Before proposing a PR, please open an issue, where the feature will be discussed. This prevent from duplicated PR to be proposed and also ease the code review process. - -Each PR need to be reviewed and accepted by at least one of the maintainers (@hill-a, @araffin, @erniejunior, @AdamGleave or @Miffyli). -A PR must pass the Continuous Integration tests (travis + codacy) to be merged with the master branch. - -Note: in rare cases, we can create exception for codacy failure. - -## Test - -All new features must add tests in the `tests/` folder ensuring that everything works fine. -We use [pytest](https://pytest.org/). -Also, when a bug fix is proposed, tests should be added to avoid regression. - -To run tests with `pytest`: - -``` -make pytest -``` - -Type checking with `pytype`: - -``` -make type -``` - -Build the documentation: - -``` -make doc -``` - -Check documentation spelling (you need to install `sphinxcontrib.spelling` package for that): - -``` -make spelling -``` - - -## Changelog and Documentation - -Please do not forget to update the changelog (`docs/misc/changelog.rst`) and add documentation if needed. -A README is present in the `docs/` folder for instructions on how to build the documentation. - - -Credits: this contributing guide is based on the [PyTorch](https://github.com/pytorch/pytorch/) one. 
diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index db9d0a09..00000000 --- a/Dockerfile +++ /dev/null @@ -1,47 +0,0 @@ -ARG PARENT_IMAGE -FROM $PARENT_IMAGE -ARG USE_GPU - -RUN apt-get -y update \ - && apt-get -y install \ - curl \ - cmake \ - default-jre \ - git \ - jq \ - python-dev \ - python-pip \ - python3-dev \ - libfontconfig1 \ - libglib2.0-0 \ - libsm6 \ - libxext6 \ - libxrender1 \ - libopenmpi-dev \ - zlib1g-dev \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -ENV CODE_DIR /root/code -ENV VENV /root/venv - -COPY ./stable_baselines/version.txt ${CODE_DIR}/stable-baselines/stable_baselines/version.txt -COPY ./setup.py ${CODE_DIR}/stable-baselines/setup.py - -RUN \ - pip install pip --upgrade && \ - pip install virtualenv && \ - virtualenv $VENV --python=python3 && \ - . $VENV/bin/activate && \ - pip install --upgrade pip && \ - cd ${CODE_DIR}/stable-baselines && \ - pip install -e .[mpi,tests,docs] && \ - rm -rf $HOME/.cache/pip - -ENV PATH=$VENV/bin:$PATH - -# Codacy code coverage report: used for partial code coverage reporting -RUN cd $CODE_DIR && \ - curl -Ls -o codacy-coverage-reporter.jar "$(curl -Ls https://api.github.com/repos/codacy/codacy-coverage-reporter/releases/latest | jq -r '.assets | map({name, browser_download_url} | select(.name | (startswith("codacy-coverage-reporter") and contains("assembly") and endswith(".jar")))) | .[0].browser_download_url')" - -CMD /bin/bash diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 209b99f0..00000000 --- a/LICENSE +++ /dev/null @@ -1,22 +0,0 @@ -The MIT License - -Copyright (c) 2017 OpenAI (http://openai.com) -Copyright (c) 2018-2019 Stable-Baselines Team - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
-copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. diff --git a/Makefile b/Makefile deleted file mode 100644 index 2c7a69df..00000000 --- a/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -# Run pytest and coverage report -pytest: - ./scripts/run_tests.sh - -# Type check -type: - pytype -j auto - -# Build the doc -doc: - cd docs && make html - -# Check the spelling in the doc -spelling: - cd docs && make spelling - -# Clean the doc build folder -clean: - cd docs && make clean - -# Build docker images -# If you do export RELEASE=True, it will also push them -docker: docker-cpu docker-gpu - -docker-cpu: - ./scripts/build_docker.sh - -docker-gpu: - USE_GPU=True ./scripts/build_docker.sh - -# PyPi package release -release: - python setup.py sdist - python setup.py bdist_wheel - twine upload dist/* - -# Test PyPi package release -test-release: - python setup.py sdist - python setup.py bdist_wheel - twine upload --repository-url https://test.pypi.org/legacy/ dist/* diff --git a/conftest.py b/conftest.py deleted file mode 100644 index f0da3aac..00000000 --- a/conftest.py +++ /dev/null @@ -1,22 +0,0 @@ -"""Configures pytest to ignore certain unit tests unless the appropriate flag is passed. - ---rungpu: tests that require GPU. ---expensive: tests that take a long time to run (e.g. 
training an RL algorithm for many timestesps).""" - -import pytest - - -def pytest_addoption(parser): - parser.addoption("--rungpu", action="store_true", default=False, help="run gpu tests") - parser.addoption("--expensive", action="store_true", - help="run expensive tests (which are otherwise skipped).") - - -def pytest_collection_modifyitems(config, items): - flags = {'gpu': '--rungpu', 'expensive': '--expensive'} - skips = {keyword: pytest.mark.skip(reason="need {} option to run".format(flag)) - for keyword, flag in flags.items() if not config.getoption(flag)} - for item in items: - for keyword, skip in skips.items(): - if keyword in item.keywords: - item.add_marker(skip) diff --git a/data/logo.jpg b/data/logo.jpg deleted file mode 100644 index a6fa8019..00000000 Binary files a/data/logo.jpg and /dev/null differ diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index 47f98cd7..00000000 --- a/docs/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -W # make warnings fatal -SPHINXBUILD = sphinx-build -SPHINXPROJ = StableBaselines -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index ffb37538..00000000 --- a/docs/README.md +++ /dev/null @@ -1,25 +0,0 @@ -## Stable Baselines Documentation - -This folder contains documentation for the RL baselines. 
- - -### Build the Documentation - -#### Install Sphinx and Theme - -``` -pip install sphinx sphinx-autobuild sphinx-rtd-theme -``` - -#### Building the Docs - -In the `docs/` folder: -``` -make html -``` - -if you want to building each time a file is changed: - -``` -sphinx-autobuild . _build/html -``` diff --git a/docs/_static/css/baselines_theme.css b/docs/_static/css/baselines_theme.css deleted file mode 100644 index 5701d00c..00000000 --- a/docs/_static/css/baselines_theme.css +++ /dev/null @@ -1,52 +0,0 @@ -/* Main colors from https://color.adobe.com/fr/Copy-of-NOUEBO-Original-color-theme-11116609 */ -:root{ - --main-bg-color: #324D5C; - --link-color: #14B278; -} - -/* Header fonts y */ -h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption { - font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif; -} - - -/* Docs background */ -.wy-side-nav-search{ - background-color: var(--main-bg-color); -} - -/* Mobile version */ -.wy-nav-top{ - background-color: var(--main-bg-color); -} - -/* Change link colors (except for the menu) */ -a { - color: var(--link-color); -} - -a:hover { - color: #4F778F; -} - -.wy-menu a { - color: #b3b3b3; -} - -.wy-menu a:hover { - color: #b3b3b3; -} - -a.icon.icon-home { - color: #b3b3b3; -} - -.version{ - color: var(--link-color) !important; -} - - -/* Make code blocks have a background */ -.codeblock,pre.literal-block,.rst-content .literal-block,.rst-content pre.literal-block,div[class^='highlight'] { - background: #f8f8f8;; -} diff --git a/docs/_static/img/Tensorboard_example_1.png b/docs/_static/img/Tensorboard_example_1.png deleted file mode 100644 index 0738bebc..00000000 Binary files a/docs/_static/img/Tensorboard_example_1.png and /dev/null differ diff --git a/docs/_static/img/Tensorboard_example_2.png b/docs/_static/img/Tensorboard_example_2.png deleted file mode 100644 index 5b1ba579..00000000 Binary files a/docs/_static/img/Tensorboard_example_2.png and /dev/null differ diff --git 
a/docs/_static/img/Tensorboard_example_3.png b/docs/_static/img/Tensorboard_example_3.png deleted file mode 100644 index b488af15..00000000 Binary files a/docs/_static/img/Tensorboard_example_3.png and /dev/null differ diff --git a/docs/_static/img/breakout.gif b/docs/_static/img/breakout.gif deleted file mode 100644 index da5beb4f..00000000 Binary files a/docs/_static/img/breakout.gif and /dev/null differ diff --git a/docs/_static/img/colab.svg b/docs/_static/img/colab.svg deleted file mode 100644 index c2d30e97..00000000 --- a/docs/_static/img/colab.svg +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - \ No newline at end of file diff --git a/docs/_static/img/learning_curve.png b/docs/_static/img/learning_curve.png deleted file mode 100644 index 5dd8edf5..00000000 Binary files a/docs/_static/img/learning_curve.png and /dev/null differ diff --git a/docs/_static/img/logo.png b/docs/_static/img/logo.png deleted file mode 100644 index 33bdea60..00000000 Binary files a/docs/_static/img/logo.png and /dev/null differ diff --git a/docs/_static/img/mistake.png b/docs/_static/img/mistake.png deleted file mode 100644 index 8fae18b5..00000000 Binary files a/docs/_static/img/mistake.png and /dev/null differ diff --git a/docs/_static/img/try_it.png b/docs/_static/img/try_it.png deleted file mode 100644 index 961ca222..00000000 Binary files a/docs/_static/img/try_it.png and /dev/null differ diff --git a/docs/common/cmd_utils.rst b/docs/common/cmd_utils.rst deleted file mode 100644 index 96779cdc..00000000 --- a/docs/common/cmd_utils.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. _cmd_utils: - -Command Utils -========================= - -.. automodule:: stable_baselines.common.cmd_util - :members: diff --git a/docs/common/distributions.rst b/docs/common/distributions.rst deleted file mode 100644 index e4f3de75..00000000 --- a/docs/common/distributions.rst +++ /dev/null @@ -1,24 +0,0 @@ -.. 
_distributions: - -Probability Distributions -========================= - -Probability distributions used for the different action spaces: - -- ``CategoricalProbabilityDistribution`` -> Discrete -- ``DiagGaussianProbabilityDistribution`` -> Box (continuous actions) -- ``MultiCategoricalProbabilityDistribution`` -> MultiDiscrete -- ``BernoulliProbabilityDistribution`` -> MultiBinary - -The policy networks output parameters for the distributions (named ``flat`` in the methods). -Actions are then sampled from those distributions. - -For instance, in the case of discrete actions. The policy network outputs probability -of taking each action. The ``CategoricalProbabilityDistribution`` allows to sample from it, -computes the entropy, the negative log probability (``neglogp``) and backpropagate the gradient. - -In the case of continuous actions, a Gaussian distribution is used. The policy network outputs -mean and (log) std of the distribution (assumed to be a ``DiagGaussianProbabilityDistribution``). - -.. automodule:: stable_baselines.common.distributions - :members: diff --git a/docs/common/env_checker.rst b/docs/common/env_checker.rst deleted file mode 100644 index 404f6d6a..00000000 --- a/docs/common/env_checker.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. _env_checker: - -Gym Environment Checker -======================== - -.. automodule:: stable_baselines.common.env_checker - :members: diff --git a/docs/common/evaluation.rst b/docs/common/evaluation.rst deleted file mode 100644 index ded89665..00000000 --- a/docs/common/evaluation.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. _eval: - -Evaluation Helper -================= - -.. automodule:: stable_baselines.common.evaluation - :members: diff --git a/docs/common/monitor.rst b/docs/common/monitor.rst deleted file mode 100644 index cb696fa7..00000000 --- a/docs/common/monitor.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. _monitor: - -Monitor Wrapper -=============== - -.. 
automodule:: stable_baselines.bench.monitor - :members: diff --git a/docs/common/schedules.rst b/docs/common/schedules.rst deleted file mode 100644 index 968a0676..00000000 --- a/docs/common/schedules.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. _schedules: - -Schedules -========= - -Schedules are used as hyperparameter for most of the algorithms, -in order to change value of a parameter over time (usually the learning rate). - - -.. automodule:: stable_baselines.common.schedules - :members: diff --git a/docs/common/tf_utils.rst b/docs/common/tf_utils.rst deleted file mode 100644 index 2f682676..00000000 --- a/docs/common/tf_utils.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. _tf_utils: - -Tensorflow Utils -========================= - -.. automodule:: stable_baselines.common.tf_util - :members: diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index efe73f8c..00000000 --- a/docs/conf.py +++ /dev/null @@ -1,223 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Configuration file for the Sphinx documentation builder. -# -# This file does only contain a selection of the most common options. For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import os -import sys -from unittest.mock import MagicMock - -# We CANNOT enable 'sphinxcontrib.spelling' because ReadTheDocs.org does not support -# PyEnchant. 
-try: - import sphinxcontrib.spelling - enable_spell_check = True -except ImportError: - enable_spell_check = False - -# source code directory, relative to this file, for sphinx-autobuild -sys.path.insert(0, os.path.abspath('..')) - - -class Mock(MagicMock): - @classmethod - def __getattr__(cls, name): - return MagicMock() - -# Mock modules that requires C modules -# Note: because of that we cannot test examples using CI -MOCK_MODULES = ['joblib', 'scipy', 'scipy.signal', - 'mpi4py', 'mujoco-py', 'cv2', 'tensorflow', - 'tensorflow.contrib', 'tensorflow.contrib.layers', - 'tensorflow.python', 'tensorflow.python.client', 'tensorflow.python.ops', - 'tqdm', 'matplotlib', 'matplotlib.pyplot', - 'seaborn', 'tensorflow.core', 'tensorflow.core.util', 'tensorflow.python.util', - 'zmq'] -sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) - -# Read version from file -version_file = os.path.join(os.path.dirname(__file__), '../stable_baselines', 'version.txt') -with open(version_file, 'r') as file_handler: - __version__ = file_handler.read().strip() - -# -- Project information ----------------------------------------------------- - -project = 'Stable Baselines' -copyright = '2018-2021, Stable Baselines' -author = 'Stable Baselines Contributors' - -# The short X.Y version -version = 'master (' + __version__ + ' )' -# The full version, including alpha/beta/rc tags -release = __version__ - - -# -- General configuration --------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. 
-extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.mathjax', - 'sphinx.ext.ifconfig', - 'sphinx.ext.viewcode', -] - -if enable_spell_check: - extensions.append('sphinxcontrib.spelling') - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# Add a string of reStructuredText that will be included at the beginning -# of every source file that is read. This is a possible place to add -# substitutions that should be available in every file. -rst_prolog = """ -.. warning:: - - This package is in maintenance mode, please use `Stable-Baselines3 - (SB3)`_ for an up-to-date version. You can find a `migration guide`_ in - SB3 documentation. - -.. _Stable-Baselines3 (SB3): https://github.com/DLR-RM/stable-baselines3 -.. _migration guide: https://stable-baselines3.readthedocs.io/en/master/guide/migration.html -""" - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' - -# The master toctree document. -master_doc = 'index' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = "en" - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path . -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. 
- -# Fix for read the docs -on_rtd = os.environ.get('READTHEDOCS') == 'True' -if on_rtd: - html_theme = 'default' -else: - html_theme = 'sphinx_rtd_theme' - -html_logo = '_static/img/logo.png' - - -def setup(app): - app.add_css_file("css/baselines_theme.css") - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -# html_sidebars = {} - - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. -htmlhelp_basename = 'StableBaselinesdoc' - - -# -- Options for LaTeX output ------------------------------------------------ - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). 
-latex_documents = [ - (master_doc, 'StableBaselines.tex', 'Stable Baselines Documentation', - 'Stable Baselines Contributors', 'manual'), -] - - -# -- Options for manual page output ------------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'stablebaselines', 'Stable Baselines Documentation', - [author], 1) -] - - -# -- Options for Texinfo output ---------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'StableBaselines', 'Stable Baselines Documentation', - author, 'StableBaselines', 'One line description of project.', - 'Miscellaneous'), -] - - -# -- Extension configuration ------------------------------------------------- diff --git a/docs/guide/algos.rst b/docs/guide/algos.rst deleted file mode 100644 index 50803c0e..00000000 --- a/docs/guide/algos.rst +++ /dev/null @@ -1,78 +0,0 @@ -RL Algorithms -============= - -This table displays the rl algorithms that are implemented in the stable baselines project, -along with some useful characteristics: support for recurrent policies, discrete/continuous actions, multiprocessing. - -.. Table too large -.. ===== ======================== ========= ======= ============ ================= =============== ================ -.. Name Refactored \ :sup:`(1)`\ Recurrent ``Box`` ``Discrete`` ``MultiDiscrete`` ``MultiBinary`` Multi Processing -.. ===== ======================== ========= ======= ============ ================= =============== ================ -.. A2C ✔️ -.. 
===== ======================== ========= ======= ============ ================= =============== ================ - - -============ ======================== ========= =========== ============ ================ -Name Refactored [#f1]_ Recurrent ``Box`` ``Discrete`` Multi Processing -============ ======================== ========= =========== ============ ================ -A2C ✔️ ✔️ ✔️ ✔️ ✔️ -ACER ✔️ ✔️ ❌ [#f4]_ ✔️ ✔️ -ACKTR ✔️ ✔️ ✔️ ✔️ ✔️ -DDPG ✔️ ❌ ✔️ ❌ ✔️ [#f3]_ -DQN ✔️ ❌ ❌ ✔️ ❌ -HER ✔️ ❌ ✔️ ✔️ ❌ -GAIL [#f2]_ ✔️ ✔️ ✔️ ✔️ ✔️ [#f3]_ -PPO1 ✔️ ❌ ✔️ ✔️ ✔️ [#f3]_ -PPO2 ✔️ ✔️ ✔️ ✔️ ✔️ -SAC ✔️ ❌ ✔️ ❌ ❌ -TD3 ✔️ ❌ ✔️ ❌ ❌ -TRPO ✔️ ❌ ✔️ ✔ ✔️ [#f3]_ -============ ======================== ========= =========== ============ ================ - -.. [#f1] Whether or not the algorithm has be refactored to fit the ``BaseRLModel`` class. -.. [#f2] Only implemented for TRPO. -.. [#f3] Multi Processing with `MPI`_. -.. [#f4] TODO, in project scope. - -.. note:: - Non-array spaces such as ``Dict`` or ``Tuple`` are not currently supported by any algorithm, - except HER for dict when working with ``gym.GoalEnv`` - -Actions ``gym.spaces``: - -- ``Box``: A N-dimensional box that contains every point in the action - space. -- ``Discrete``: A list of possible actions, where each timestep only - one of the actions can be used. -- ``MultiDiscrete``: A list of possible actions, where each timestep only one action of each discrete set can be used. -- ``MultiBinary``: A list of possible actions, where each timestep any of the actions can be used in any combination. - -.. _MPI: https://mpi4py.readthedocs.io/en/stable/ - -.. note:: - - Some logging values (like ``ep_rewmean``, ``eplenmean``) are only available when using a Monitor wrapper - See `Issue #339 `_ for more info. - - -Reproducibility ---------------- - -Completely reproducible results are not guaranteed across Tensorflow releases or different platforms. 
-Furthermore, results need not be reproducible between CPU and GPU executions, even when using identical seeds. - -In order to make computations deterministic on CPU, on your specific problem on one specific platform, -you need to pass a ``seed`` argument at the creation of a model and set `n_cpu_tf_sess=1` (number of cpu for Tensorflow session). -If you pass an environment to the model using `set_env()`, then you also need to seed the environment first. - -.. note:: - - Because of the current limits of Tensorflow 1.x, we cannot ensure reproducible results on the GPU yet. This issue is solved in `Stable-Baselines3 "PyTorch edition" `_ - - -.. note:: - - TD3 sometimes fail to have reproducible results for obscure reasons, even when following the previous steps (cf `PR #492 `_). If you find the reason then please open an issue ;) - - -Credit: part of the *Reproducibility* section comes from `PyTorch Documentation `_ diff --git a/docs/guide/callbacks.rst b/docs/guide/callbacks.rst deleted file mode 100644 index 62cf7425..00000000 --- a/docs/guide/callbacks.rst +++ /dev/null @@ -1,340 +0,0 @@ -.. _callbacks: - -Callbacks -========= - -A callback is a set of functions that will be called at given stages of the training procedure. -You can use callbacks to access internal state of the RL model during training. -It allows one to do monitoring, auto saving, model manipulation, progress bars, ... - - -Custom Callback ---------------- - -To build a custom callback, you need to create a class that derives from ``BaseCallback``. -This will give you access to events (``_on_training_start``, ``_on_step``) and useful variables (like `self.model` for the RL model). - - -You can find two examples of custom callbacks in the documentation: one for saving the best model according to the training reward (see :ref:`Examples `), and one for logging additional values with Tensorboard (see :ref:`Tensorboard section `). - - -.. 
code-block:: python - - from stable_baselines.common.callbacks import BaseCallback - - - class CustomCallback(BaseCallback): - """ - A custom callback that derives from ``BaseCallback``. - - :param verbose: (int) Verbosity level 0: not output 1: info 2: debug - """ - def __init__(self, verbose=0): - super(CustomCallback, self).__init__(verbose) - # Those variables will be accessible in the callback - # (they are defined in the base class) - # The RL model - # self.model = None # type: BaseRLModel - # An alias for self.model.get_env(), the environment used for training - # self.training_env = None # type: Union[gym.Env, VecEnv, None] - # Number of time the callback was called - # self.n_calls = 0 # type: int - # self.num_timesteps = 0 # type: int - # local and global variables - # self.locals = None # type: Dict[str, Any] - # self.globals = None # type: Dict[str, Any] - # The logger object, used to report things in the terminal - # self.logger = None # type: logger.Logger - # # Sometimes, for event callback, it is useful - # # to have access to the parent object - # self.parent = None # type: Optional[BaseCallback] - - def _on_training_start(self) -> None: - """ - This method is called before the first rollout starts. - """ - pass - - def _on_rollout_start(self) -> None: - """ - A rollout is the collection of environment interaction - using the current policy. - This event is triggered before collecting new samples. - """ - pass - - def _on_step(self) -> bool: - """ - This method will be called by the model after each call to `env.step()`. - - For child callback (of an `EventCallback`), this will be called - when the event is triggered. - - :return: (bool) If the callback returns False, training is aborted early. - """ - return True - - def _on_rollout_end(self) -> None: - """ - This event is triggered before updating the policy. - """ - pass - - def _on_training_end(self) -> None: - """ - This event is triggered before exiting the `learn()` method. 
- """ - pass - - -.. note:: - `self.num_timesteps` corresponds to the total number of steps taken in the environment, i.e., it is the number of environments multiplied by the number of time `env.step()` was called - - You should know that ``PPO1`` and ``TRPO`` update `self.num_timesteps` after each rollout (and not each step) because they rely on MPI. - - For the other algorithms, `self.num_timesteps` is incremented by ``n_envs`` (number of environments) after each call to `env.step()` - - -.. note:: - - For off-policy algorithms like SAC, DDPG, TD3 or DQN, the notion of ``rollout`` corresponds to the steps taken in the environment between two updates. - - -.. _EventCallback: - -Event Callback --------------- - -Compared to Keras, Stable Baselines provides a second type of ``BaseCallback``, named ``EventCallback`` that is meant to trigger events. When an event is triggered, then a child callback is called. - -As an example, :ref:`EvalCallback` is an ``EventCallback`` that will trigger its child callback when there is a new best model. -A child callback is for instance :ref:`StopTrainingOnRewardThreshold ` that stops the training if the mean reward achieved by the RL model is above a threshold. - -.. note:: - - We recommend to take a look at the source code of :ref:`EvalCallback` and :ref:`StopTrainingOnRewardThreshold ` to have a better overview of what can be achieved with this kind of callbacks. - - -.. code-block:: python - - class EventCallback(BaseCallback): - """ - Base class for triggering callback on event. - - :param callback: (Optional[BaseCallback]) Callback that will be called - when an event is triggered. - :param verbose: (int) - """ - def __init__(self, callback: Optional[BaseCallback] = None, verbose: int = 0): - super(EventCallback, self).__init__(verbose=verbose) - self.callback = callback - # Give access to the parent - if callback is not None: - self.callback.parent = self - ... 
- - def _on_event(self) -> bool: - if self.callback is not None: - return self.callback() - return True - - - -Callback Collection -------------------- - -Stable Baselines provides you with a set of common callbacks for: - -- saving the model periodically (:ref:`CheckpointCallback`) -- evaluating the model periodically and saving the best one (:ref:`EvalCallback`) -- chaining callbacks (:ref:`CallbackList`) -- triggering callback on events (:ref:`EventCallback`, :ref:`EveryNTimesteps`) -- stopping the training early based on a reward threshold (:ref:`StopTrainingOnRewardThreshold `) - - -.. _CheckpointCallback: - -CheckpointCallback -^^^^^^^^^^^^^^^^^^ - -Callback for saving a model every ``save_freq`` steps, you must specify a log folder (``save_path``) -and optionally a prefix for the checkpoints (``rl_model`` by default). - - -.. code-block:: python - - from stable_baselines import SAC - from stable_baselines.common.callbacks import CheckpointCallback - # Save a checkpoint every 1000 steps - checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/', - name_prefix='rl_model') - - model = SAC('MlpPolicy', 'Pendulum-v0') - model.learn(2000, callback=checkpoint_callback) - - -.. _EvalCallback: - -EvalCallback -^^^^^^^^^^^^ - -Evaluate periodically the performance of an agent, using a separate test environment. -It will save the best model if ``best_model_save_path`` folder is specified and save the evaluations results in a numpy archive (`evaluations.npz`) if ``log_path`` folder is specified. - - -.. note:: - - You can pass a child callback via the ``callback_on_new_best`` argument. It will be triggered each time there is a new best model. - - - -.. 
code-block:: python - - import gym - - from stable_baselines import SAC - from stable_baselines.common.callbacks import EvalCallback - - # Separate evaluation env - eval_env = gym.make('Pendulum-v0') - # Use deterministic actions for evaluation - eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/', - log_path='./logs/', eval_freq=500, - deterministic=True, render=False) - - model = SAC('MlpPolicy', 'Pendulum-v0') - model.learn(5000, callback=eval_callback) - - -.. _Callbacklist: - -CallbackList -^^^^^^^^^^^^ - -Class for chaining callbacks, they will be called sequentially. -Alternatively, you can pass directly a list of callbacks to the `learn()` method, it will be converted automatically to a ``CallbackList``. - - -.. code-block:: python - - import gym - - from stable_baselines import SAC - from stable_baselines.common.callbacks import CallbackList, CheckpointCallback, EvalCallback - - checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/') - # Separate evaluation env - eval_env = gym.make('Pendulum-v0') - eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/best_model', - log_path='./logs/results', eval_freq=500) - # Create the callback list - callback = CallbackList([checkpoint_callback, eval_callback]) - - model = SAC('MlpPolicy', 'Pendulum-v0') - # Equivalent to: - # model.learn(5000, callback=[checkpoint_callback, eval_callback]) - model.learn(5000, callback=callback) - - -.. _StopTrainingCallback: - -StopTrainingOnRewardThreshold -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Stop the training once a threshold in episodic reward (mean episode reward over the evaluations) has been reached (i.e., when the model is good enough). -It must be used with the :ref:`EvalCallback` and use the event triggered by a new best model. - - -.. 
code-block:: python - - import gym - - from stable_baselines import SAC - from stable_baselines.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold - - # Separate evaluation env - eval_env = gym.make('Pendulum-v0') - # Stop training when the model reaches the reward threshold - callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=1) - eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, verbose=1) - - model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1) - # Almost infinite number of timesteps, but the training will stop - # early as soon as the reward threshold is reached - model.learn(int(1e10), callback=eval_callback) - - -.. _EveryNTimesteps: - -EveryNTimesteps -^^^^^^^^^^^^^^^ - -An :ref:`EventCallback` that will trigger its child callback every ``n_steps`` timesteps. - - -.. note:: - - Because of the way ``PPO1`` and ``TRPO`` work (they rely on MPI), ``n_steps`` is a lower bound between two events. - - -.. code-block:: python - - import gym - - from stable_baselines import PPO2 - from stable_baselines.common.callbacks import CheckpointCallback, EveryNTimesteps - - # this is equivalent to defining CheckpointCallback(save_freq=500) - # checkpoint_callback will be triggered every 500 steps - checkpoint_on_event = CheckpointCallback(save_freq=1, save_path='./logs/') - event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event) - - model = PPO2('MlpPolicy', 'Pendulum-v0', verbose=1) - - model.learn(int(2e4), callback=event_callback) - - -.. automodule:: stable_baselines.common.callbacks - :members: - - - Legacy: A functional approach - ----------------------------- - - .. warning:: - - This way of doing callbacks is deprecated in favor of the object oriented approach. - - - - A callback function takes the ``locals()`` variables and the ``globals()`` variables from the model, then returns a boolean value for whether or not the training should continue. 
- - Thanks to the access to the models variables, in particular ``_locals["self"]``, we are able to even change the parameters of the model without halting the training, or changing the model's code. - - - .. code-block:: python - - from typing import Dict, Any - - from stable_baselines import PPO2 - - - def simple_callback(_locals: Dict[str, Any], _globals: Dict[str, Any]) -> bool: - """ - Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2). - This callback will save the model and stop the training after the first call. - - :param _locals: (Dict[str, Any]) - :param _globals: (Dict[str, Any]) - :return: (bool) If your callback returns False, training is aborted early. - """ - print("callback called") - # Save the model - _locals["self"].save("saved_model") - # If you want to continue training, the callback must return True. - # return True # returns True, training continues. - print("stop training") - return False # returns False, training stops. - - model = PPO2('MlpPolicy', 'CartPole-v1') - model.learn(2000, callback=simple_callback) diff --git a/docs/guide/checking_nan.rst b/docs/guide/checking_nan.rst deleted file mode 100644 index c51bdc0a..00000000 --- a/docs/guide/checking_nan.rst +++ /dev/null @@ -1,253 +0,0 @@ -Dealing with NaNs and infs -========================== - -During the training of a model on a given environment, it is possible that the RL model becomes completely -corrupted when a NaN or an inf is given or returned from the RL model. - -How and why? ------------- - -The issue arises then NaNs or infs do not crash, but simply get propagated through the training, -until all the floating point number converge to NaN or inf. This is in line with the -`IEEE Standard for Floating-Point Arithmetic (IEEE 754) `_ standard, as it says: - -.. note:: - Five possible exceptions can occur: - - Invalid operation (:math:`\sqrt{-1}`, :math:`\inf \times 1`, :math:`\text{NaN}\ \mathrm{mod}\ 1`, ...) 
return NaN - - Division by zero: - - if the operand is not zero (:math:`1/0`, :math:`-2/0`, ...) returns :math:`\pm\inf` - - if the operand is zero (:math:`0/0`) returns signaling NaN - - Overflow (exponent too high to represent) returns :math:`\pm\inf` - - Underflow (exponent too low to represent) returns :math:`0` - - Inexact (not representable exactly in base 2, eg: :math:`1/5`) returns the rounded value (ex: :code:`assert (1/5) * 3 == 0.6000000000000001`) - -And of these, only ``Division by zero`` will signal an exception, the rest will propagate invalid values quietly. - -In python, dividing by zero will indeed raise the exception: ``ZeroDivisionError: float division by zero``, -but ignores the rest. - -The default in numpy, will warn: ``RuntimeWarning: invalid value encountered`` -but will not halt the code. - -And the worst of all, Tensorflow will not signal anything - -.. code-block:: python - - import tensorflow as tf - import numpy as np - - print("tensorflow test:") - - a = tf.constant(1.0) - b = tf.constant(0.0) - c = a / b - - sess = tf.Session() - val = sess.run(c) # this will be quiet - print(val) - sess.close() - - print("\r\nnumpy test:") - - a = np.float64(1.0) - b = np.float64(0.0) - val = a / b # this will warn - print(val) - - print("\r\npure python test:") - - a = 1.0 - b = 0.0 - val = a / b # this will raise an exception and halt. - print(val) - -Unfortunately, most of the floating point operations are handled by Tensorflow and numpy, -meaning you might get little to no warning when a invalid value occurs. - -Numpy parameters ----------------- - -Numpy has a convenient way of dealing with invalid value: `numpy.seterr `_, -which defines for the python process, how it should handle floating point error. - -.. code-block:: python - - import numpy as np - - np.seterr(all='raise') # define before your code. - - print("numpy test:") - - a = np.float64(1.0) - b = np.float64(0.0) - val = a / b # this will now raise an exception instead of a warning. 
- print(val) - -but this will also avoid overflow issues on floating point numbers: - -.. code-block:: python - - import numpy as np - - np.seterr(all='raise') # define before your code. - - print("numpy overflow test:") - - a = np.float64(10) - b = np.float64(1000) - val = a ** b # this will now raise an exception - print(val) - -but will not avoid the propagation issues: - -.. code-block:: python - - import numpy as np - - np.seterr(all='raise') # define before your code. - - print("numpy propagation test:") - - a = np.float64('NaN') - b = np.float64(1.0) - val = a + b # this will neither warn nor raise anything - print(val) - -Tensorflow parameters ---------------------- - -Tensorflow can add checks for detecting and dealing with invalid value: `tf.add_check_numerics_ops `_ and `tf.check_numerics `_, -however they will add operations to the Tensorflow graph and raise the computation time. - -.. code-block:: python - - import tensorflow as tf - - print("tensorflow test:") - - a = tf.constant(1.0) - b = tf.constant(0.0) - c = a / b - - check_nan = tf.add_check_numerics_ops() # add after your graph definition. - - sess = tf.Session() - val, _ = sess.run([c, check_nan]) # this will now raise an exception - print(val) - sess.close() - -but this will also avoid overflow issues on floating point numbers: - -.. code-block:: python - - import tensorflow as tf - - print("tensorflow overflow test:") - - check_nan = [] # the list of check_numerics operations - - a = tf.constant(10) - b = tf.constant(1000) - c = a ** b - - check_nan.append(tf.check_numerics(c, "")) # check the 'c' operations - - sess = tf.Session() - val, _ = sess.run([c] + check_nan) # this will now raise an exception - print(val) - sess.close() - -and catch propagation issues: - -.. 
code-block:: python - - import tensorflow as tf - - print("tensorflow propagation test:") - - check_nan = [] # the list of check_numerics operations - - a = tf.constant('NaN') - b = tf.constant(1.0) - c = a + b - - check_nan.append(tf.check_numerics(c, "")) # check the 'c' operations - - sess = tf.Session() - val, _ = sess.run([c] + check_nan) # this will now raise an exception - print(val) - sess.close() - - -VecCheckNan Wrapper -------------------- - -In order to find when and from where the invalid value originated from, stable-baselines comes with a ``VecCheckNan`` wrapper. - -It will monitor the actions, observations, and rewards, indicating what action or observation caused it and from what. - -.. code-block:: python - - import gym - from gym import spaces - import numpy as np - - from stable_baselines import PPO2 - from stable_baselines.common.vec_env import DummyVecEnv, VecCheckNan - - class NanAndInfEnv(gym.Env): - """Custom Environment that raised NaNs and Infs""" - metadata = {'render.modes': ['human']} - - def __init__(self): - super(NanAndInfEnv, self).__init__() - self.action_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64) - self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64) - - def step(self, _action): - randf = np.random.rand() - if randf > 0.99: - obs = float('NaN') - elif randf > 0.98: - obs = float('inf') - else: - obs = randf - return [obs], 0.0, False, {} - - def reset(self): - return [0.0] - - def render(self, mode='human', close=False): - pass - - # Create environment - env = DummyVecEnv([lambda: NanAndInfEnv()]) - env = VecCheckNan(env, raise_exception=True) - - # Instantiate the agent - model = PPO2('MlpPolicy', env) - - # Train the agent - model.learn(total_timesteps=int(2e5)) # this will crash explaining that the invalid value originated from the environment. 
- -RL Model hyperparameters ------------------------- - -Depending on your hyperparameters, NaN can occurs much more often. -A great example of this: https://github.com/hill-a/stable-baselines/issues/340 - -Be aware, the hyperparameters given by default seem to work in most cases, -however your environment might not play nice with them. -If this is the case, try to read up on the effect each hyperparameters has on the model, -so that you can try and tune them to get a stable model. Alternatively, you can try automatic hyperparameter tuning (included in the rl zoo). - -Missing values from datasets ----------------------------- - -If your environment is generated from an external dataset, do not forget to make sure your dataset does not contain NaNs. -As some datasets will sometimes fill missing values with NaNs as a surrogate value. - -Here is some reading material about finding NaNs: https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html - -And filling the missing values with something else (imputation): https://towardsdatascience.com/how-to-handle-missing-data-8646b18db0d4 - diff --git a/docs/guide/custom_env.rst b/docs/guide/custom_env.rst deleted file mode 100644 index e3b91ab5..00000000 --- a/docs/guide/custom_env.rst +++ /dev/null @@ -1,82 +0,0 @@ -.. _custom_env: - -Using Custom Environments -========================== - -To use the rl baselines with custom environments, they just need to follow the *gym* interface. -That is to say, your environment must implement the following methods (and inherits from OpenAI Gym Class): - - -.. note:: - If you are using images as input, the input values must be in [0, 255] as the observation - is normalized (dividing by 255 to have values in [0, 1]) when using CNN policies. - - - -.. 
code-block:: python - - import gym - from gym import spaces - - class CustomEnv(gym.Env): - """Custom Environment that follows gym interface""" - metadata = {'render.modes': ['human']} - - def __init__(self, arg1, arg2, ...): - super(CustomEnv, self).__init__() - # Define action and observation space - # They must be gym.spaces objects - # Example when using discrete actions: - self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS) - # Example for using image as input: - self.observation_space = spaces.Box(low=0, high=255, - shape=(HEIGHT, WIDTH, N_CHANNELS), dtype=np.uint8) - - def step(self, action): - ... - return observation, reward, done, info - def reset(self): - ... - return observation # reward, done, info can't be included - def render(self, mode='human'): - ... - def close (self): - ... - - -Then you can define and train a RL agent with: - -.. code-block:: python - - # Instantiate the env - env = CustomEnv(arg1, ...) - # Define and Train the agent - model = A2C('CnnPolicy', env).learn(total_timesteps=1000) - - -To check that your environment follows the gym interface, please use: - -.. code-block:: python - - from stable_baselines.common.env_checker import check_env - - env = CustomEnv(arg1, ...) - # It will check your custom environment and output additional warnings if needed - check_env(env) - - - -We have created a `colab notebook `_ for -a concrete example of creating a custom environment. - -You can also find a `complete guide online `_ -on creating a custom Gym environment. - - -Optionally, you can also register the environment with gym, -that will allow you to create the RL agent in one line (and use ``gym.make()`` to instantiate the env). - - -In the project, for testing purposes, we use a custom environment named ``IdentityEnv`` -defined `in this file `_. -An example of how to use it can be found `here `_. 
diff --git a/docs/guide/custom_policy.rst b/docs/guide/custom_policy.rst deleted file mode 100644 index 45b2a431..00000000 --- a/docs/guide/custom_policy.rst +++ /dev/null @@ -1,246 +0,0 @@ -.. _custom_policy: - -Custom Policy Network ---------------------- - -Stable baselines provides default policy networks (see :ref:`Policies ` ) for images (CNNPolicies) -and other type of input features (MlpPolicies). - -One way of customising the policy network architecture is to pass arguments when creating the model, -using ``policy_kwargs`` parameter: - -.. code-block:: python - - import gym - import tensorflow as tf - - from stable_baselines import PPO2 - - # Custom MLP policy of two layers of size 32 each with tanh activation function - policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[32, 32]) - # Create the agent - model = PPO2("MlpPolicy", "CartPole-v1", policy_kwargs=policy_kwargs, verbose=1) - # Retrieve the environment - env = model.get_env() - # Train the agent - model.learn(total_timesteps=100000) - # Save the agent - model.save("ppo2-cartpole") - - del model - # the policy_kwargs are automatically loaded - model = PPO2.load("ppo2-cartpole") - - -You can also easily define a custom architecture for the policy (or value) network: - -.. note:: - - Defining a custom policy class is equivalent to passing ``policy_kwargs``. - However, it lets you name the policy and so makes usually the code clearer. - ``policy_kwargs`` should be rather used when doing hyperparameter search. - - - -.. 
code-block:: python - - import gym - - from stable_baselines.common.policies import FeedForwardPolicy, register_policy - from stable_baselines.common.vec_env import DummyVecEnv - from stable_baselines import A2C - - # Custom MLP policy of three layers of size 128 each - class CustomPolicy(FeedForwardPolicy): - def __init__(self, *args, **kwargs): - super(CustomPolicy, self).__init__(*args, **kwargs, - net_arch=[dict(pi=[128, 128, 128], - vf=[128, 128, 128])], - feature_extraction="mlp") - - # Create and wrap the environment - env = gym.make('LunarLander-v2') - env = DummyVecEnv([lambda: env]) - - model = A2C(CustomPolicy, env, verbose=1) - # Train the agent - model.learn(total_timesteps=100000) - # Save the agent - model.save("a2c-lunar") - - del model - # When loading a model with a custom policy - # you MUST pass explicitly the policy when loading the saved model - model = A2C.load("a2c-lunar", policy=CustomPolicy) - -.. warning:: - - When loading a model with a custom policy, you must pass the custom policy explicitly when loading the model. - (cf previous example) - - -You can also register your policy, to help with code simplicity: you can refer to your custom policy using a string. - -.. 
code-block:: python - - import gym - - from stable_baselines.common.policies import FeedForwardPolicy, register_policy - from stable_baselines.common.vec_env import DummyVecEnv - from stable_baselines import A2C - - # Custom MLP policy of three layers of size 128 each - class CustomPolicy(FeedForwardPolicy): - def __init__(self, *args, **kwargs): - super(CustomPolicy, self).__init__(*args, **kwargs, - net_arch=[dict(pi=[128, 128, 128], - vf=[128, 128, 128])], - feature_extraction="mlp") - - # Register the policy, it will check that the name is not already taken - register_policy('CustomPolicy', CustomPolicy) - - # Because the policy is now registered, you can pass - # a string to the agent constructor instead of passing a class - model = A2C(policy='CustomPolicy', env='LunarLander-v2', verbose=1).learn(total_timesteps=100000) - - -.. deprecated:: 2.3.0 - - Use ``net_arch`` instead of ``layers`` parameter to define the network architecture. It allows to have a greater control. - - -The ``net_arch`` parameter of ``FeedForwardPolicy`` allows to specify the amount and size of the hidden layers and how many -of them are shared between the policy network and the value network. It is assumed to be a list with the following -structure: - -1. An arbitrary length (zero allowed) number of integers each specifying the number of units in a shared layer. - If the number of ints is zero, there will be no shared layers. -2. An optional dict, to specify the following non-shared layers for the value network and the policy network. - It is formatted like ``dict(vf=[], pi=[])``. - If it is missing any of the keys (pi or vf), no non-shared layers (empty list) is assumed. - -In short: ``[, dict(vf=[], pi=[])]``. - -Examples -~~~~~~~~ - -Two shared layers of size 128: ``net_arch=[128, 128]`` - - -.. code-block:: none - - obs - | - <128> - | - <128> - / \ - action value - - -Value network deeper than policy network, first layer shared: ``net_arch=[128, dict(vf=[256, 256])]`` - -.. 
code-block:: none - - obs - | - <128> - / \ - action <256> - | - <256> - | - value - - -Initially shared then diverging: ``[128, dict(vf=[256], pi=[16])]`` - -.. code-block:: none - - obs - | - <128> - / \ - <16> <256> - | | - action value - -The ``LstmPolicy`` can be used to construct recurrent policies in a similar way: - -.. code-block:: python - - class CustomLSTMPolicy(LstmPolicy): - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=64, reuse=False, **_kwargs): - super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, - net_arch=[8, 'lstm', dict(vf=[5, 10], pi=[10])], - layer_norm=True, feature_extraction="mlp", **_kwargs) - -Here the ``net_arch`` parameter takes an additional (mandatory) 'lstm' entry within the shared network section. -The LSTM is shared between value network and policy network. - - - - -If your task requires even more granular control over the policy architecture, you can redefine the policy directly: - -.. code-block:: python - - import gym - import tensorflow as tf - - from stable_baselines.common.policies import ActorCriticPolicy, register_policy, nature_cnn - from stable_baselines.common.vec_env import DummyVecEnv - from stable_baselines import A2C - - # Custom MLP policy of three layers of size 128 each for the actor and 2 layers of 32 for the critic, - # with a nature_cnn feature extractor - class CustomPolicy(ActorCriticPolicy): - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **kwargs): - super(CustomPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=True) - - with tf.variable_scope("model", reuse=reuse): - activ = tf.nn.relu - - extracted_features = nature_cnn(self.processed_obs, **kwargs) - extracted_features = tf.layers.flatten(extracted_features) - - pi_h = extracted_features - for i, layer_size in enumerate([128, 128, 128]): - pi_h = activ(tf.layers.dense(pi_h, layer_size, name='pi_fc' + str(i))) - 
pi_latent = pi_h - - vf_h = extracted_features - for i, layer_size in enumerate([32, 32]): - vf_h = activ(tf.layers.dense(vf_h, layer_size, name='vf_fc' + str(i))) - value_fn = tf.layers.dense(vf_h, 1, name='vf') - vf_latent = vf_h - - self._proba_distribution, self._policy, self.q_value = \ - self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01) - - self._value_fn = value_fn - self._setup_init() - - def step(self, obs, state=None, mask=None, deterministic=False): - if deterministic: - action, value, neglogp = self.sess.run([self.deterministic_action, self.value_flat, self.neglogp], - {self.obs_ph: obs}) - else: - action, value, neglogp = self.sess.run([self.action, self.value_flat, self.neglogp], - {self.obs_ph: obs}) - return action, value, self.initial_state, neglogp - - def proba_step(self, obs, state=None, mask=None): - return self.sess.run(self.policy_proba, {self.obs_ph: obs}) - - def value(self, obs, state=None, mask=None): - return self.sess.run(self.value_flat, {self.obs_ph: obs}) - - - # Create and wrap the environment - env = DummyVecEnv([lambda: gym.make('Breakout-v0')]) - - model = A2C(CustomPolicy, env, verbose=1) - # Train the agent - model.learn(total_timesteps=100000) diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst deleted file mode 100644 index 36672b22..00000000 --- a/docs/guide/examples.rst +++ /dev/null @@ -1,680 +0,0 @@ -.. _examples: - -Examples -======== - -Try it online with Colab Notebooks! ------------------------------------ - -All the following examples can be executed online using Google colab |colab| -notebooks: - -- `Full Tutorial `_ -- `All Notebooks `_ -- `Getting Started`_ -- `Training, Saving, Loading`_ -- `Multiprocessing`_ -- `Monitor Training and Plotting`_ -- `Atari Games`_ -- `Breakout`_ (trained agent included) -- `Hindsight Experience Replay`_ -- `RL Baselines zoo`_ - -.. 
_Getting Started: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/stable_baselines_getting_started.ipynb -.. _Training, Saving, Loading: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/saving_loading_dqn.ipynb -.. _Multiprocessing: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/multiprocessing_rl.ipynb -.. _Monitor Training and Plotting: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/monitor_training.ipynb -.. _Atari Games: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/atari_games.ipynb -.. _Breakout: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/breakout.ipynb -.. _Hindsight Experience Replay: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/stable_baselines_her.ipynb -.. _RL Baselines zoo: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/rl-baselines-zoo.ipynb - -.. |colab| image:: ../_static/img/colab.svg - -Basic Usage: Training, Saving, Loading --------------------------------------- - -In the following example, we will train, save and load a DQN model on the Lunar Lander environment. - -.. image:: ../_static/img/try_it.png - :scale: 30 % - :target: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/saving_loading_dqn.ipynb - - -.. figure:: https://cdn-images-1.medium.com/max/960/1*f4VZPKOI0PYNWiwt0la0Rg.gif - - Lunar Lander Environment - - -.. note:: - LunarLander requires the python package ``box2d``. - You can install it using ``apt install swig`` and then ``pip install box2d box2d-kengz`` - -.. note:: - ``load`` function re-creates model from scratch on each call, which can be slow. - If you need to e.g. 
evaluate the same model with multiple different sets of parameters, consider
- - :param env_id: (str) the environment ID - :param num_env: (int) the number of environments you wish to have in subprocesses - :param seed: (int) the inital seed for RNG - :param rank: (int) index of the subprocess - """ - def _init(): - env = gym.make(env_id) - env.seed(seed + rank) - return env - set_global_seeds(seed) - return _init - - if __name__ == '__main__': - env_id = "CartPole-v1" - num_cpu = 4 # Number of processes to use - # Create the vectorized environment - env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)]) - - # Stable Baselines provides you with make_vec_env() helper - # which does exactly the previous steps for you: - # env = make_vec_env(env_id, n_envs=num_cpu, seed=0) - - model = ACKTR(MlpPolicy, env, verbose=1) - model.learn(total_timesteps=25000) - - obs = env.reset() - for _ in range(1000): - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - - - -Using Callback: Monitoring Training ------------------------------------ - -.. note:: - - We recommend reading the `Callback section `_ - -You can define a custom callback function that will be called inside the agent. -This could be useful when you want to monitor training, for instance display live -learning curves in Tensorboard (or in Visdom) or save the best agent. -If your callback returns False, training is aborted early. - -.. image:: ../_static/img/try_it.png - :scale: 30 % - :target: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/monitor_training.ipynb - -.. figure:: ../_static/img/learning_curve.png - - Learning curve of DDPG on LunarLanderContinuous environment - -.. 
code-block:: python - - import os - - import gym - import numpy as np - import matplotlib.pyplot as plt - - from stable_baselines import DDPG - from stable_baselines.ddpg.policies import LnMlpPolicy - from stable_baselines import results_plotter - from stable_baselines.bench import Monitor - from stable_baselines.results_plotter import load_results, ts2xy - from stable_baselines.common.noise import AdaptiveParamNoiseSpec - from stable_baselines.common.callbacks import BaseCallback - - - class SaveOnBestTrainingRewardCallback(BaseCallback): - """ - Callback for saving a model (the check is done every ``check_freq`` steps) - based on the training reward (in practice, we recommend using ``EvalCallback``). - - :param check_freq: (int) - :param log_dir: (str) Path to the folder where the model will be saved. - It must contain the file created by the ``Monitor`` wrapper. - :param verbose: (int) - """ - def __init__(self, check_freq: int, log_dir: str, verbose=1): - super(SaveOnBestTrainingRewardCallback, self).__init__(verbose) - self.check_freq = check_freq - self.log_dir = log_dir - self.save_path = os.path.join(log_dir, 'best_model') - self.best_mean_reward = -np.inf - - def _init_callback(self) -> None: - # Create folder if needed - if self.save_path is not None: - os.makedirs(self.save_path, exist_ok=True) - - def _on_step(self) -> bool: - if self.n_calls % self.check_freq == 0: - - # Retrieve training reward - x, y = ts2xy(load_results(self.log_dir), 'timesteps') - if len(x) > 0: - # Mean training reward over the last 100 episodes - mean_reward = np.mean(y[-100:]) - if self.verbose > 0: - print("Num timesteps: {}".format(self.num_timesteps)) - print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, mean_reward)) - - # New best model, you could save the agent here - if mean_reward > self.best_mean_reward: - self.best_mean_reward = mean_reward - # Example for saving best model - if self.verbose > 0: - print("Saving new 
best model to {}".format(self.save_path)) - self.model.save(self.save_path) - - return True - - # Create log dir - log_dir = "tmp/" - os.makedirs(log_dir, exist_ok=True) - - # Create and wrap the environment - env = gym.make('LunarLanderContinuous-v2') - env = Monitor(env, log_dir) - - # Add some param noise for exploration - param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1) - # Because we use parameter noise, we should use a MlpPolicy with layer normalization - model = DDPG(LnMlpPolicy, env, param_noise=param_noise, verbose=0) - # Create the callback: check every 1000 steps - callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir) - # Train the agent - time_steps = 1e5 - model.learn(total_timesteps=int(time_steps), callback=callback) - - results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "DDPG LunarLander") - plt.show() - - -Atari Games ------------ - -.. figure:: ../_static/img/breakout.gif - - Trained A2C agent on Breakout - -.. figure:: https://cdn-images-1.medium.com/max/960/1*UHYJE7lF8IDZS_U5SsAFUQ.gif - - Pong Environment - - -Training a RL agent on Atari games is straightforward thanks to ``make_atari_env`` helper function. -It will do `all the preprocessing `_ -and multiprocessing for you. - -.. image:: ../_static/img/try_it.png - :scale: 30 % - :target: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/atari_games.ipynb - - -.. code-block:: python - - from stable_baselines.common.cmd_util import make_atari_env - from stable_baselines.common.vec_env import VecFrameStack - from stable_baselines import ACER - - # There already exists an environment generator - # that will make and wrap atari environments correctly. 
- # Here we are also multiprocessing training (num_env=4 => 4 processes) - env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0) - # Frame-stacking with 4 frames - env = VecFrameStack(env, n_stack=4) - - model = ACER('CnnPolicy', env, verbose=1) - model.learn(total_timesteps=25000) - - obs = env.reset() - while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - - -PyBullet: Normalizing input features ------------------------------------- - -Normalizing input features may be essential to successful training of an RL agent -(by default, images are scaled but not other types of input), -for instance when training on `PyBullet `_ environments. For that, a wrapper exists and -will compute a running average and standard deviation of input features (it can do the same for rewards). - - -.. note:: - - you need to install pybullet with ``pip install pybullet`` - - -.. code-block:: python - - import os - - import gym - import pybullet_envs - - from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize - from stable_baselines import PPO2 - - env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")]) - # Automatically normalize the input features and reward - env = VecNormalize(env, norm_obs=True, norm_reward=True, - clip_obs=10.) 
- - model = PPO2('MlpPolicy', env) - model.learn(total_timesteps=2000) - - # Don't forget to save the VecNormalize statistics when saving the agent - log_dir = "/tmp/" - model.save(log_dir + "ppo_halfcheetah") - stats_path = os.path.join(log_dir, "vec_normalize.pkl") - env.save(stats_path) - - # To demonstrate loading - del model, env - - # Load the agent - model = PPO2.load(log_dir + "ppo_halfcheetah") - - # Load the saved statistics - env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")]) - env = VecNormalize.load(stats_path, env) - # do not update them at test time - env.training = False - # reward normalization is not needed at test time - env.norm_reward = False - - - -Custom Policy Network ---------------------- - -Stable baselines provides default policy networks for images (CNNPolicies) -and other type of inputs (MlpPolicies). -However, you can also easily define a custom architecture for the policy network `(see custom policy section) `_: - -.. code-block:: python - - import gym - - from stable_baselines.common.policies import FeedForwardPolicy - from stable_baselines.common.vec_env import DummyVecEnv - from stable_baselines import A2C - - # Custom MLP policy of three layers of size 128 each - class CustomPolicy(FeedForwardPolicy): - def __init__(self, *args, **kwargs): - super(CustomPolicy, self).__init__(*args, **kwargs, - net_arch=[dict(pi=[128, 128, 128], vf=[128, 128, 128])], - feature_extraction="mlp") - - model = A2C(CustomPolicy, 'LunarLander-v2', verbose=1) - # Train the agent - model.learn(total_timesteps=100000) - - -Accessing and modifying model parameters ----------------------------------------- - -You can access model's parameters via ``load_parameters`` and ``get_parameters`` functions, which -use dictionaries that map variable names to NumPy arrays. - -These functions are useful when you need to e.g. evaluate large set of models with same network structure, -visualize different layers of the network or modify parameters manually. 
- -You can access original Tensorflow Variables with function ``get_parameter_list``. - -Following example demonstrates reading parameters, modifying some of them and loading them to model -by implementing `evolution strategy `_ -for solving ``CartPole-v1`` environment. The initial guess for parameters is obtained by running -A2C policy gradient updates on the model. - -.. code-block:: python - - import gym - import numpy as np - - from stable_baselines import A2C - - def mutate(params): - """Mutate parameters by adding normal noise to them""" - return dict((name, param + np.random.normal(size=param.shape)) - for name, param in params.items()) - - def evaluate(env, model): - """Return mean fitness (sum of episodic rewards) for given model""" - episode_rewards = [] - for _ in range(10): - reward_sum = 0 - done = False - obs = env.reset() - while not done: - action, _states = model.predict(obs) - obs, reward, done, info = env.step(action) - reward_sum += reward - episode_rewards.append(reward_sum) - return np.mean(episode_rewards) - - # Create env - env = gym.make('CartPole-v1') - # Create policy with a small network - model = A2C('MlpPolicy', env, ent_coef=0.0, learning_rate=0.1, - policy_kwargs={'net_arch': [8, ]}) - - # Use traditional actor-critic policy gradient updates to - # find good initial parameters - model.learn(total_timesteps=5000) - - # Get the parameters as the starting point for ES - mean_params = model.get_parameters() - - # Include only variables with "/pi/" (policy) or "/shared" (shared layers) - # in their name: Only these ones affect the action. - mean_params = dict((key, value) for key, value in mean_params.items() - if ("/pi/" in key or "/shared" in key)) - - for iteration in range(10): - # Create population of candidates and evaluate them - population = [] - for population_i in range(100): - candidate = mutate(mean_params) - # Load new policy parameters to agent. 
- # Tell function that it should only update parameters - # we give it (policy parameters) - model.load_parameters(candidate, exact_match=False) - fitness = evaluate(env, model) - population.append((candidate, fitness)) - # Take top 10% and use average over their parameters as next mean parameter - top_candidates = sorted(population, key=lambda x: x[1], reverse=True)[:10] - mean_params = dict( - (name, np.stack([top_candidate[0][name] for top_candidate in top_candidates]).mean(0)) - for name in mean_params.keys() - ) - mean_fitness = sum(top_candidate[1] for top_candidate in top_candidates) / 10.0 - print("Iteration {:<3} Mean top fitness: {:.2f}".format(iteration, mean_fitness)) - - -Recurrent Policies ------------------- - -This example demonstrates how to train a recurrent policy and how to test it properly. - -.. warning:: - - One current limitation of recurrent policies is that you must test them - with the same number of environments they have been trained on. - - -.. code-block:: python - - from stable_baselines import PPO2 - - # For recurrent policies, with PPO2, the number of environments run in parallel - # should be a multiple of nminibatches. 
- model = PPO2('MlpLstmPolicy', 'CartPole-v1', nminibatches=1, verbose=1) - model.learn(50000) - - # Retrieve the env - env = model.get_env() - - obs = env.reset() - # Passing state=None to the predict function means - # it is the initial state - state = None - # When using VecEnv, done is a vector - done = [False for _ in range(env.num_envs)] - for _ in range(1000): - # We need to pass the previous state and a mask for recurrent policies - # to reset lstm state when a new episode begins - action, state = model.predict(obs, state=state, mask=done) - obs, reward , done, _ = env.step(action) - # Note: with VecEnv, env.reset() is automatically called - - # Show the env - env.render() - - -Hindsight Experience Replay (HER) ---------------------------------- - -For this example, we are using `Highway-Env `_ by `@eleurent `_. - - -.. image:: ../_static/img/try_it.png - :scale: 30 % - :target: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/stable_baselines_her.ipynb - - -.. figure:: https://raw.githubusercontent.com/eleurent/highway-env/gh-media/docs/media/parking-env.gif - - The highway-parking-v0 environment. - -The parking env is a goal-conditioned continuous control task, in which the vehicle must park in a given space with the appropriate heading. - -.. note:: - - the hyperparameters in the following example were optimized for that environment. - - -.. 
code-block:: python - - import gym - import highway_env - import numpy as np - - from stable_baselines import HER, SAC, DDPG, TD3 - from stable_baselines.ddpg import NormalActionNoise - - env = gym.make("parking-v0") - - # Create 4 artificial transitions per real transition - n_sampled_goal = 4 - - # SAC hyperparams: - model = HER('MlpPolicy', env, SAC, n_sampled_goal=n_sampled_goal, - goal_selection_strategy='future', - verbose=1, buffer_size=int(1e6), - learning_rate=1e-3, - gamma=0.95, batch_size=256, - policy_kwargs=dict(layers=[256, 256, 256])) - - # DDPG Hyperparams: - # NOTE: it works even without action noise - # n_actions = env.action_space.shape[0] - # noise_std = 0.2 - # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)) - # model = HER('MlpPolicy', env, DDPG, n_sampled_goal=n_sampled_goal, - # goal_selection_strategy='future', - # verbose=1, buffer_size=int(1e6), - # actor_lr=1e-3, critic_lr=1e-3, action_noise=action_noise, - # gamma=0.95, batch_size=256, - # policy_kwargs=dict(layers=[256, 256, 256])) - - - model.learn(int(2e5)) - model.save('her_sac_highway') - - # Load saved model - model = HER.load('her_sac_highway', env=env) - - obs = env.reset() - - # Evaluate the agent - episode_reward = 0 - for _ in range(100): - action, _ = model.predict(obs) - obs, reward, done, info = env.step(action) - env.render() - episode_reward += reward - if done or info.get('is_success', False): - print("Reward:", episode_reward, "Success?", info.get('is_success', False)) - episode_reward = 0.0 - obs = env.reset() - - - -Continual Learning ------------------- - -You can also move from learning on one environment to another for `continual learning `_ -(PPO2 on ``DemonAttack-v0``, then transferred on ``SpaceInvaders-v0``): - -.. 
code-block:: python - - from stable_baselines.common.cmd_util import make_atari_env - from stable_baselines import PPO2 - - # There already exists an environment generator - # that will make and wrap atari environments correctly - env = make_atari_env('DemonAttackNoFrameskip-v4', num_env=8, seed=0) - - model = PPO2('CnnPolicy', env, verbose=1) - model.learn(total_timesteps=10000) - - obs = env.reset() - for i in range(1000): - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - - # Close the processes - env.close() - - # The number of environments must be identical when changing environments - env = make_atari_env('SpaceInvadersNoFrameskip-v4', num_env=8, seed=0) - - # change env - model.set_env(env) - model.learn(total_timesteps=10000) - - obs = env.reset() - while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - env.close() - - -Record a Video --------------- - -Record a mp4 video (here using a random agent). - -.. note:: - - It requires ffmpeg or avconv to be installed on the machine. - -.. code-block:: python - - import gym - from stable_baselines.common.vec_env import VecVideoRecorder, DummyVecEnv - - env_id = 'CartPole-v1' - video_folder = 'logs/videos/' - video_length = 100 - - env = DummyVecEnv([lambda: gym.make(env_id)]) - - obs = env.reset() - - # Record the video starting at the first step - env = VecVideoRecorder(env, video_folder, - record_video_trigger=lambda x: x == 0, video_length=video_length, - name_prefix="random-agent-{}".format(env_id)) - - env.reset() - for _ in range(video_length + 1): - action = [env.action_space.sample()] - obs, _, _, _ = env.step(action) - # Save the video - env.close() - - -Bonus: Make a GIF of a Trained Agent ------------------------------------- - -.. note:: - For Atari games, you need to use a screen recorder such as `Kazam `_. - And then convert the video using `ffmpeg `_ - -.. 
code-block:: python - - import imageio - import numpy as np - - from stable_baselines import A2C - - model = A2C("MlpPolicy", "LunarLander-v2").learn(100000) - - images = [] - obs = model.env.reset() - img = model.env.render(mode='rgb_array') - for i in range(350): - images.append(img) - action, _ = model.predict(obs) - obs, _, _ ,_ = model.env.step(action) - img = model.env.render(mode='rgb_array') - - imageio.mimsave('lander_a2c.gif', [np.array(img) for i, img in enumerate(images) if i%2 == 0], fps=29) diff --git a/docs/guide/export.rst b/docs/guide/export.rst deleted file mode 100644 index a4180153..00000000 --- a/docs/guide/export.rst +++ /dev/null @@ -1,103 +0,0 @@ -.. _export: - - -Exporting models -================ - -After training an agent, you may want to deploy/use it in an other language -or framework, like PyTorch or `tensorflowjs `_. -Stable Baselines does not include tools to export models to other frameworks, but -this document aims to cover parts that are required for exporting along with -more detailed stories from users of Stable Baselines. - - -Background ----------- - -In Stable Baselines, the controller is stored inside :ref:`policies ` which convert -observations into actions. Each learning algorithm (e.g. DQN, A2C, SAC) contains -one or more policies, some of which are only used for training. An easy way to find -the policy is to check the code for the ``predict`` function of the agent: -This function should only call one policy with simple arguments. - -Policies hold the necessary Tensorflow placeholders and tensors to do the -inference (i.e. predict actions), so it is enough to export these policies -to do inference in an another framework. - -.. note:: - Learning algorithms also may contain other Tensorflow placeholders, that are used for training only and are - not required for inference. - - -.. 
warning:: - When using CNN policies, the observation is normalized internally (dividing by 255 to have values in [0, 1]) - - -Export to PyTorch ------------------ - -A known working solution is to use :func:`get_parameters ` -function to obtain model parameters, construct the network manually in PyTorch and assign parameters correctly. - -.. warning:: - PyTorch and Tensorflow have internal differences with e.g. 2D convolutions (see discussion linked below). - - -See `discussion #372 `_ for details. - - -Export to C++ ------------------ - -Tensorflow, which is the backbone of Stable Baselines, is fundamentally a C/C++ library despite being most commonly accessed -through the Python frontend layer. This design choice means that the models created at Python level should generally be -fully compliant with the respective C++ version of Tensorflow. - -.. warning:: - It is advisable not to mix-and-match different versions of Tensorflow libraries, particularly in terms of the state. - Moving computational graphs is generally more forgiving. As a matter of fact, mentioned below `PPO_CPP `_ project uses - graphs generated with Python Tensorflow 1.x in C++ Tensorflow 2 version. - -Stable Baselines comes very handily when hoping to migrate a computational graph and/or a state (weights) as -the existing algorithms define most of the necessary computations for you so you don't need to recreate the core of the algorithms again. -This is exactly the idea that has been used in the `PPO_CPP `_ project, which executes the training at the C++ level for the sake of -computational efficiency. The graphs are exported from Stable Baselines' PPO2 implementation through ``tf.train.export_meta_graph`` -function. Alternatively, and perhaps more commonly, you could use the C++ layer only for inference. That could be useful -as a deployment step of server backends or optimization for more limited devices. - -.. 
warning:: - As a word of caution, C++-level APIs are more imperative than their Python counterparts or more plainly speaking: cruder. - This is particularly apparent in Tensorflow 2.0 where the declarativeness of Autograph exists only at Python level. The - C++ counterpart still operates on Session objects' use, which are known from earlier versions of Tensorflow. In our use case, - availability of graphs utilized by Session depends on the use of ``tf.function`` decorators. However, as of November 2019, Stable Baselines still - uses Tensorflow 1.x in the main version which is slightly easier to use in the context of the C++ portability. - - -Export to tensorflowjs / tfjs ------------------------------ - -Can be done via Tensorflow's `simple_save `_ function -and `tensorflowjs_converter `_. - -See `discussion #474 `_ for details. - - -Export to Java ---------------- - -Can be done via Tensorflow's `simple_save `_ function. - -See `this discussion `_ for details. - - -Manual export -------------- - -You can also manually export required parameters (weights) and construct the -network in your desired framework, as done with the PyTorch example above. - -You can access parameters of the model via agents' -:func:`get_parameters ` -function. If you use default policies, you can find the architecture of the networks in -source for :ref:`policies `. Otherwise, for DQN/SAC/DDPG or TD3 you need to check the `policies.py` file located -in their respective folders. diff --git a/docs/guide/install.rst b/docs/guide/install.rst deleted file mode 100644 index 07b11d60..00000000 --- a/docs/guide/install.rst +++ /dev/null @@ -1,196 +0,0 @@ -.. _install: - -Installation -============ - -Prerequisites -------------- - -Baselines requires python3 (>=3.5) with the development headers. You'll -also need system packages CMake, OpenMPI and zlib. Those can be -installed as follows - -.. 
note:: - - Stable-Baselines supports Tensorflow versions from 1.8.0 to 1.15.0, and does not work on - Tensorflow versions 2.0.0 and above. PyTorch support is done in `Stable-Baselines3 `_ - - -Ubuntu -~~~~~~ - -.. code-block:: bash - - sudo apt-get update && sudo apt-get install cmake libopenmpi-dev python3-dev zlib1g-dev - -Mac OS X -~~~~~~~~ - -Installation of system packages on Mac requires `Homebrew`_. With -Homebrew installed, run the following: - -.. code-block:: bash - - brew install cmake openmpi - -.. _Homebrew: https://brew.sh - - -Windows 10 -~~~~~~~~~~ - -We recommend using `Anaconda `_ for Windows users for easier installation of Python packages and required libraries. You need an environment with Python version 3.5 or above. - -For a quick start you can move straight to installing Stable-Baselines in the next step (without MPI). This supports most but not all algorithms. - -To support all algorithms, Install `MPI for Windows `_ (you need to download and install ``msmpisetup.exe``) and follow the instructions on how to install Stable-Baselines with MPI support in following section. - -.. note:: - - Trying to create Atari environments may result to vague errors related to missing DLL files and modules. This is an - issue with atari-py package. `See this discussion for more information `_. - - -.. _openmpi: - -Stable Release -~~~~~~~~~~~~~~ -To install with support for all algorithms, including those depending on OpenMPI, execute: - -.. code-block:: bash - - pip install stable-baselines[mpi] - -GAIL, DDPG, TRPO, and PPO1 parallelize training using OpenMPI. OpenMPI has had weird -interactions with Tensorflow in the past (see -`Issue #430 `_) and so if you do not -intend to use these algorithms we recommend installing without OpenMPI. To do this, execute: - -.. code-block:: bash - - pip install stable-baselines - -If you have already installed with MPI support, you can disable MPI by uninstalling ``mpi4py`` -with ``pip uninstall mpi4py``. - - -.. 
note:: - - Unless you are using the bleeding-edge version, you need to install the correct Tensorflow version manually. See `Issue #849 `_ - - -Bleeding-edge version ---------------------- - -To install the latest master version: - -.. code-block:: bash - - pip install git+https://github.com/hill-a/stable-baselines - - -Development version -------------------- - -To contribute to Stable-Baselines, with support for running tests and building the documentation. - -.. code-block:: bash - - git clone https://github.com/hill-a/stable-baselines && cd stable-baselines - pip install -e .[docs,tests,mpi] - - -Using Docker Images -------------------- - -If you are looking for docker images with stable-baselines already installed in it, -we recommend using images from `RL Baselines Zoo `_. - -Otherwise, the following images contained all the dependencies for stable-baselines but not the stable-baselines package itself. -They are made for development. - -Use Built Images -~~~~~~~~~~~~~~~~ - -GPU image (requires `nvidia-docker`_): - -.. code-block:: bash - - docker pull stablebaselines/stable-baselines - -CPU only: - -.. code-block:: bash - - docker pull stablebaselines/stable-baselines-cpu - -Build the Docker Images -~~~~~~~~~~~~~~~~~~~~~~~~ - -Build GPU image (with nvidia-docker): - -.. code-block:: bash - - make docker-gpu - -Build CPU image: - -.. code-block:: bash - - make docker-cpu - -Note: if you are using a proxy, you need to pass extra params during -build and do some `tweaks`_: - -.. code-block:: bash - - --network=host --build-arg HTTP_PROXY=http://your.proxy.fr:8080/ --build-arg http_proxy=http://your.proxy.fr:8080/ --build-arg HTTPS_PROXY=https://your.proxy.fr:8080/ --build-arg https_proxy=https://your.proxy.fr:8080/ - -Run the images (CPU/GPU) -~~~~~~~~~~~~~~~~~~~~~~~~ - -Run the nvidia-docker GPU image - -.. 
code-block:: bash - - docker run -it --runtime=nvidia --rm --network host --ipc=host --name test --mount src="$(pwd)",target=/root/code/stable-baselines,type=bind stablebaselines/stable-baselines bash -c 'cd /root/code/stable-baselines/ && pytest tests/' - -Or, with the shell file: - -.. code-block:: bash - - ./scripts/run_docker_gpu.sh pytest tests/ - -Run the docker CPU image - -.. code-block:: bash - - docker run -it --rm --network host --ipc=host --name test --mount src="$(pwd)",target=/root/code/stable-baselines,type=bind stablebaselines/stable-baselines-cpu bash -c 'cd /root/code/stable-baselines/ && pytest tests/' - -Or, with the shell file: - -.. code-block:: bash - - ./scripts/run_docker_cpu.sh pytest tests/ - -Explanation of the docker command: - -- ``docker run -it`` create an instance of an image (=container), and - run it interactively (so ctrl+c will work) -- ``--rm`` option means to remove the container once it exits/stops - (otherwise, you will have to use ``docker rm``) -- ``--network host`` don't use network isolation, this allow to use - tensorboard/visdom on host machine -- ``--ipc=host`` Use the host system’s IPC namespace. IPC (POSIX/SysV IPC) namespace provides - separation of named shared memory segments, semaphores and message - queues. -- ``--name test`` give explicitly the name ``test`` to the container, - otherwise it will be assigned a random name -- ``--mount src=...`` give access of the local directory (``pwd`` - command) to the container (it will be map to ``/root/code/stable-baselines``), so - all the logs created in the container in this folder will be kept -- ``bash -c '...'`` Run command inside the docker image, here run the tests - (``pytest tests/``) - -.. _nvidia-docker: https://github.com/NVIDIA/nvidia-docker -.. 
_tweaks: https://stackoverflow.com/questions/23111631/cannot-download-docker-images-behind-a-proxy diff --git a/docs/guide/pretrain.rst b/docs/guide/pretrain.rst deleted file mode 100644 index af03c37a..00000000 --- a/docs/guide/pretrain.rst +++ /dev/null @@ -1,152 +0,0 @@ -.. _pretrain: - -.. automodule:: stable_baselines.gail - :noindex: - - -Pre-Training (Behavior Cloning) -=============================== - -With the ``.pretrain()`` method, you can pre-train RL policies using trajectories from an expert, and therefore accelerate training. - -Behavior Cloning (BC) treats the problem of imitation learning, i.e., using expert demonstrations, as a supervised learning problem. -That is to say, given expert trajectories (observations-actions pairs), the policy network is trained to reproduce the expert behavior: -for a given observation, the action taken by the policy must be the one taken by the expert. - -Expert trajectories can be human demonstrations, trajectories from another controller (e.g. a PID controller) -or trajectories from a trained RL agent. - - -.. note:: - - Only ``Box`` and ``Discrete`` spaces are supported for now for pre-training a model. - - -.. note:: - - Images datasets are treated a bit differently as other datasets to avoid memory issues. - The images from the expert demonstrations must be located in a folder, not in the expert numpy archive. - - - -Generate Expert Trajectories ----------------------------- - -Here, we are going to train a RL model and then generate expert trajectories -using this agent. - -Note that in practice, generating expert trajectories usually does not require training an RL agent. - -The following example is only meant to demonstrate the ``pretrain()`` feature. - -However, we recommend users to take a look at the code of the ``generate_expert_traj()`` function (located in ``gail/dataset/`` folder) -to learn about the data structure of the expert dataset (see below for an overview) and how to record trajectories. 
- - -.. code-block:: python - - from stable_baselines import DQN - from stable_baselines.gail import generate_expert_traj - - model = DQN('MlpPolicy', 'CartPole-v1', verbose=1) - # Train a DQN agent for 1e5 timesteps and generate 10 trajectories - # data will be saved in a numpy archive named `expert_cartpole.npz` - generate_expert_traj(model, 'expert_cartpole', n_timesteps=int(1e5), n_episodes=10) - - - -Here is an additional example when the expert controller is a callable, -that is passed to the function instead of a RL model. -The idea is that this callable can be a PID controller, asking a human player, ... - - -.. code-block:: python - - import gym - - from stable_baselines.gail import generate_expert_traj - - env = gym.make("CartPole-v1") - # Here the expert is a random agent - # but it can be any python function, e.g. a PID controller - def dummy_expert(_obs): - """ - Random agent. It samples actions randomly - from the action space of the environment. - - :param _obs: (np.ndarray) Current observation - :return: (np.ndarray) action taken by the expert - """ - return env.action_space.sample() - # Data will be saved in a numpy archive named `expert_cartpole.npz` - # when using something different than an RL expert, - # you must pass the environment object explicitly - generate_expert_traj(dummy_expert, 'dummy_expert_cartpole', env, n_episodes=10) - - - -Pre-Train a Model using Behavior Cloning ----------------------------------------- - -Using the ``expert_cartpole.npz`` dataset generated with the previous script. - -.. 
code-block:: python - - from stable_baselines import PPO2 - from stable_baselines.gail import ExpertDataset - # Using only one expert trajectory - # you can specify `traj_limitation=-1` for using the whole dataset - dataset = ExpertDataset(expert_path='expert_cartpole.npz', - traj_limitation=1, batch_size=128) - - model = PPO2('MlpPolicy', 'CartPole-v1', verbose=1) - # Pretrain the PPO2 model - model.pretrain(dataset, n_epochs=1000) - - # As an option, you can train the RL agent - # model.learn(int(1e5)) - - # Test the pre-trained model - env = model.get_env() - obs = env.reset() - - reward_sum = 0.0 - for _ in range(1000): - action, _ = model.predict(obs) - obs, reward, done, _ = env.step(action) - reward_sum += reward - env.render() - if done: - print(reward_sum) - reward_sum = 0.0 - obs = env.reset() - - env.close() - - -Data Structure of the Expert Dataset ------------------------------------- - -The expert dataset is a ``.npz`` archive. The data is saved in python dictionary format with keys: ``actions``, ``episode_returns``, ``rewards``, ``obs``, ``episode_starts``. - -In case of images, ``obs`` contains the relative path to the images. - -obs, actions: shape (N * L, ) + S - -where N = # episodes, L = episode length -and S is the environment observation/action space. - -S = (1, ) for discrete space - - -.. autoclass:: ExpertDataset - :members: - :inherited-members: - - -.. autoclass:: DataLoader - :members: - :inherited-members: - - -.. autofunction:: generate_expert_traj diff --git a/docs/guide/quickstart.rst b/docs/guide/quickstart.rst deleted file mode 100644 index bcd74749..00000000 --- a/docs/guide/quickstart.rst +++ /dev/null @@ -1,47 +0,0 @@ -.. _quickstart: - -=============== -Getting Started -=============== - -Most of the library tries to follow a sklearn-like syntax for the Reinforcement Learning algorithms. - -Here is a quick example of how to train and run PPO2 on a cartpole environment: - -.. 
code-block:: python - - import gym - - from stable_baselines.common.policies import MlpPolicy - from stable_baselines.common.vec_env import DummyVecEnv - from stable_baselines import PPO2 - - env = gym.make('CartPole-v1') - # Optional: PPO2 requires a vectorized environment to run - # the env is now wrapped automatically when passing it to the constructor - # env = DummyVecEnv([lambda: env]) - - model = PPO2(MlpPolicy, env, verbose=1) - model.learn(total_timesteps=10000) - - obs = env.reset() - for i in range(1000): - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - - -Or just train a model with a one liner if -`the environment is registered in Gym `_ and if -`the policy is registered `_: - -.. code-block:: python - - from stable_baselines import PPO2 - - model = PPO2('MlpPolicy', 'CartPole-v1').learn(10000) - - -.. figure:: https://cdn-images-1.medium.com/max/960/1*R_VMmdgKAY0EDhEjHVelzw.gif - - Define and train a RL agent in one line of code! diff --git a/docs/guide/rl.rst b/docs/guide/rl.rst deleted file mode 100644 index ca9aeb1d..00000000 --- a/docs/guide/rl.rst +++ /dev/null @@ -1,17 +0,0 @@ -.. _rl: - -================================ -Reinforcement Learning Resources -================================ - - -Stable-Baselines assumes that you already understand the basic concepts of Reinforcement Learning (RL). - -However, if you want to learn about RL, there are several good resources to get started: - -- `OpenAI Spinning Up `_ -- `David Silver's course `_ -- `Lilian Weng's blog `_ -- `Berkeley's Deep RL Bootcamp `_ -- `Berkeley's Deep Reinforcement Learning course `_ -- `More resources `_ diff --git a/docs/guide/rl_tips.rst b/docs/guide/rl_tips.rst deleted file mode 100644 index 7d00af27..00000000 --- a/docs/guide/rl_tips.rst +++ /dev/null @@ -1,252 +0,0 @@ -.. 
_rl_tips: - -====================================== -Reinforcement Learning Tips and Tricks -====================================== - -The aim of this section is to help you do reinforcement learning experiments. -It covers general advice about RL (where to start, which algorithm to choose, how to evaluate an algorithm, ...), -as well as tips and tricks when using a custom environment or implementing an RL algorithm. - - -General advice when using Reinforcement Learning -================================================ - -TL;DR ------ - -1. Read about RL and Stable Baselines -2. Do quantitative experiments and hyperparameter tuning if needed -3. Evaluate the performance using a separate test environment -4. For better performance, increase the training budget - - -Like any other subject, if you want to work with RL, you should first read about it (we have a dedicated `resource page `_ to get you started) -to understand what you are using. We also recommend you read Stable Baselines (SB) documentation and do the `tutorial `_. -It covers basic usage and guides you towards more advanced concepts of the library (e.g. callbacks and wrappers). - -Reinforcement Learning differs from other machine learning methods in several ways. The data used to train the agent is collected -through interactions with the environment by the agent itself (compared to supervised learning where you have a fixed dataset for instance). -This dependence can lead to a vicious circle: if the agent collects poor quality data (e.g., trajectories with no rewards), then it will not improve and continue to amass -bad trajectories. - -This factor, among others, explains that results in RL may vary from one run to another (i.e., when only the seed of the pseudo-random generator changes). -For this reason, you should always do several runs to have quantitative results. - -Good results in RL are generally dependent on finding appropriate hyperparameters. 
Recent algorithms (PPO, SAC, TD3) normally require little hyperparameter tuning, -however, *don't expect the default ones to work* on any environment. - -Therefore, we *highly recommend you* to take a look at the `RL zoo `_ (or the original papers) for tuned hyperparameters. -A best practice when you apply RL to a new problem is to do automatic hyperparameter optimization. Again, this is included in the `RL zoo `_. - -When applying RL to a custom problem, you should always normalize the input to the agent (e.g. using VecNormalize for PPO2/A2C) -and look at common preprocessing done on other environments (e.g. for `Atari `_, frame-stack, ...). -Please refer to *Tips and Tricks when creating a custom environment* paragraph below for more advice related to custom environments. - - -Current Limitations of RL -------------------------- - -You have to be aware of the current `limitations `_ of reinforcement learning. - - -Model-free RL algorithms (i.e. all the algorithms implemented in SB) are usually *sample inefficient*. They require a lot of samples (sometimes millions of interactions) to learn something useful. -That's why most of the successes in RL were achieved on games or in simulation only. For instance, in this `work `_ by ETH Zurich, the ANYmal robot was trained in simulation only, and then tested in the real world. - -As a general advice, to obtain better performances, you should augment the budget of the agent (number of training timesteps). - - -In order to achieve the desired behavior, expert knowledge is often required to design an adequate reward function. -This *reward engineering* (or *RewArt* as coined by `Freek Stulp `_), necessitates several iterations. As a good example of reward shaping, -you can take a look at `Deep Mimic paper `_ which combines imitation learning and reinforcement learning to do acrobatic moves. - -One last limitation of RL is the instability of training. 
That is to say, you can observe during training a huge drop in performance. -This behavior is particularly present in ``DDPG``, that's why its extension ``TD3`` tries to tackle that issue. -Other methods, like ``TRPO`` or ``PPO``, make use of a *trust region* to minimize that problem by avoiding too large an update. - - -How to evaluate an RL algorithm? --------------------------------- - -Because most algorithms use exploration noise during training, you need a separate test environment to evaluate the performance -of your agent at a given time. It is recommended to periodically evaluate your agent for ``n`` test episodes (``n`` is usually between 5 and 20) -and average the reward per episode to have a good estimate. - -As some policies are stochastic by default (e.g. A2C or PPO), you should also try to set `deterministic=True` when calling the `.predict()` method, -this frequently leads to better performance. -Looking at the training curve (episode reward function of the timesteps) is a good proxy but underestimates the agent's true performance. - - -.. note:: - - We provide an ``EvalCallback`` for doing such evaluation. You can read more about it in the :ref:`Callbacks ` section. - - - -We suggest you read `Deep Reinforcement Learning that Matters `_ for a good discussion about RL evaluation. - -You can also take a look at this `blog post `_ -and this `issue `_ by Cédric Colas. - - -Which algorithm should I use? -============================= - -There is no silver bullet in RL, depending on your needs and problem, you may choose one or the other. -The first distinction comes from your action space, i.e., do you have discrete (e.g. LEFT, RIGHT, ...) -or continuous actions (ex: go to a certain speed)? - -Some algorithms are only tailored for one or the other domain: ``DQN`` only supports discrete actions, whereas ``SAC`` is restricted to continuous actions. 
- -The second difference that will help you choose is whether you can parallelize your training or not, and how you can do it (with or without MPI?). -If what matters is the wall clock training time, then you should lean towards ``A2C`` and its derivatives (PPO, ACER, ACKTR, ...). -Take a look at the `Vectorized Environments `_ to learn more about training with multiple workers. - -To sum it up: - -Discrete Actions ----------------- - -.. note:: - - This covers ``Discrete``, ``MultiDiscrete``, ``Binary`` and ``MultiBinary`` spaces - - -Discrete Actions - Single Process -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -DQN with extensions (double DQN, prioritized replay, ...) and ACER are the recommended algorithms. -DQN is usually slower to train (regarding wall clock time) but is the most sample efficient (because of its replay buffer). - -Discrete Actions - Multiprocessed -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -You should give a try to PPO2, A2C and its successors (ACKTR, ACER). - -If you can multiprocess the training using MPI, then you should checkout PPO1 and TRPO. - - -Continuous Actions ------------------- - -Continuous Actions - Single Process -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Current State Of The Art (SOTA) algorithms are ``SAC`` and ``TD3``. -Please use the hyperparameters in the `RL zoo `_ for best results. - - -Continuous Actions - Multiprocessed -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Take a look at PPO2, TRPO or A2C. Again, don't forget to take the hyperparameters from the `RL zoo `_ -for continuous actions problems (cf *Bullet* envs). - -.. note:: - - Normalization is critical for those algorithms - -If you can use MPI, then you can choose between PPO1, TRPO and DDPG. - - -Goal Environment ------------------ - -If your environment follows the ``GoalEnv`` interface (cf `HER <../modules/her.html>`_), then you should use -HER + (SAC/TD3/DDPG/DQN) depending on the action space. - - -.. 
note:: - - The number of workers is an important hyperparameter for experiments with HER. Currently, only HER+DDPG supports multiprocessing using MPI. - - - -Tips and Tricks when creating a custom environment -================================================== - -If you want to learn about how to create a custom environment, we recommend you read this `page `_. -We also provide a `colab notebook `_ for -a concrete example of creating a custom gym environment. - -Some basic advice: - -- always normalize your observation space when you can, i.e., when you know the boundaries -- normalize your action space and make it symmetric when continuous (cf potential issue below). A good practice is to rescale your actions to lie in [-1, 1]. This does not limit you as you can easily rescale the action inside the environment -- start with shaped reward (i.e. informative reward) and simplified version of your problem -- debug with random actions to check that your environment works and follows the gym interface: - - -We provide a helper to check that your environment runs without error: - -.. code-block:: python - - from stable_baselines.common.env_checker import check_env - - env = CustomEnv(arg1, ...) - # It will check your custom environment and output additional warnings if needed - check_env(env) - - -If you want to quickly try a random agent on your environment, you can also do: - -.. code-block:: python - - env = YourEnv() - obs = env.reset() - n_steps = 10 - for _ in range(n_steps): - # Random action - action = env.action_space.sample() - obs, reward, done, info = env.step(action) - - -**Why should I normalize the action space?** - - -Most reinforcement learning algorithms rely on a Gaussian distribution (initially centered at 0 with std 1) for continuous actions. -So, if you forget to normalize the action space when using a custom environment, -this can harm learning and be difficult to debug (cf attached image and `issue #473 `_). - -.. 
figure:: ../_static/img/mistake.png - - -Another consequence of using a Gaussian is that the action range is not bounded. -That's why clipping is usually used as a bandage to stay in a valid interval. -A better solution would be to use a squashing function (cf ``SAC``) or a Beta distribution (cf `issue #112 `_). - -.. note:: - - This statement is not true for ``DDPG`` or ``TD3`` because they don't rely on any probability distribution. - - - -Tips and Tricks when implementing an RL algorithm -================================================= - -When you try to reproduce a RL paper by implementing the algorithm, the `nuts and bolts of RL research `_ -by John Schulman are quite useful (`video `_). - -We *recommend following those steps to have a working RL algorithm*: - -1. Read the original paper several times -2. Read existing implementations (if available) -3. Try to have some "sign of life" on toy problems -4. Validate the implementation by making it run on harder and harder envs (you can compare results against the RL zoo) - You usually need to run hyperparameter optimization for that step. - -You need to be particularly careful on the shape of the different objects you are manipulating (a broadcast mistake will fail silently cf `issue #75 `_) -and when to stop the gradient propagation. - -A personal pick (by @araffin) for environments with gradual difficulty in RL with continuous actions: - -1. Pendulum (easy to solve) -2. HalfCheetahBullet (medium difficulty with local minima and shaped reward) -3. BipedalWalkerHardcore (if it works on that one, then you can have a cookie) - -in RL with discrete actions: - -1. CartPole-v1 (easy to be better than random agent, harder to achieve maximal performance) -2. LunarLander -3. Pong (one of the easiest Atari game) -4. other Atari games (e.g. Breakout) diff --git a/docs/guide/rl_zoo.rst b/docs/guide/rl_zoo.rst deleted file mode 100644 index 61c4d15b..00000000 --- a/docs/guide/rl_zoo.rst +++ /dev/null @@ -1,107 +0,0 @@ -.. 
_rl_zoo: - -================= -RL Baselines Zoo -================= - -`RL Baselines Zoo `_. is a collection of pre-trained Reinforcement Learning agents using -Stable-Baselines. -It also provides basic scripts for training, evaluating agents, tuning hyperparameters and recording videos. - -Goals of this repository: - -1. Provide a simple interface to train and enjoy RL agents -2. Benchmark the different Reinforcement Learning algorithms -3. Provide tuned hyperparameters for each environment and RL algorithm -4. Have fun with the trained agents! - -Installation ------------- - -1. Install dependencies -:: - - apt-get install swig cmake libopenmpi-dev zlib1g-dev ffmpeg - pip install stable-baselines box2d box2d-kengz pyyaml pybullet optuna pytablewriter - -2. Clone the repository: - -:: - - git clone https://github.com/araffin/rl-baselines-zoo - - -Train an Agent --------------- - -The hyperparameters for each environment are defined in -``hyperparameters/algo_name.yml``. - -If the environment exists in this file, then you can train an agent -using: - -:: - - python train.py --algo algo_name --env env_id - -For example (with tensorboard support): - -:: - - python train.py --algo ppo2 --env CartPole-v1 --tensorboard-log /tmp/stable-baselines/ - -Train for multiple environments (with one call) and with tensorboard -logging: - -:: - - python train.py --algo a2c --env MountainCar-v0 CartPole-v1 --tensorboard-log /tmp/stable-baselines/ - -Continue training (here, load pretrained agent for Breakout and continue -training for 5000 steps): - -:: - - python train.py --algo a2c --env BreakoutNoFrameskip-v4 -i trained_agents/a2c/BreakoutNoFrameskip-v4.pkl -n 5000 - - -Enjoy a Trained Agent ---------------------- - -If the trained agent exists, then you can see it in action using: - -:: - - python enjoy.py --algo algo_name --env env_id - -For example, enjoy A2C on Breakout during 5000 timesteps: - -:: - - python enjoy.py --algo a2c --env BreakoutNoFrameskip-v4 --folder 
trained_agents/ -n 5000 - - -Hyperparameter Optimization ---------------------------- - -We use `Optuna `_ for optimizing the hyperparameters. - - -Tune the hyperparameters for PPO2, using a random sampler and median pruner, 2 parallels jobs, -with a budget of 1000 trials and a maximum of 50000 steps: - -:: - - python train.py --algo ppo2 --env MountainCar-v0 -n 50000 -optimize --n-trials 1000 --n-jobs 2 \ - --sampler random --pruner median - - -Colab Notebook: Try it Online! ------------------------------- - -You can train agents online using Google `colab notebook `_. - - -.. note:: - - You can find more information about the rl baselines zoo in the repo `README `_. For instance, how to record a video of a trained agent. diff --git a/docs/guide/save_format.rst b/docs/guide/save_format.rst deleted file mode 100644 index 38561a3f..00000000 --- a/docs/guide/save_format.rst +++ /dev/null @@ -1,86 +0,0 @@ -.. _save_format: - - -On saving and loading -===================== - -Stable baselines stores both neural network parameters and algorithm-related parameters such as -exploration schedule, number of environments and observation/action space. This allows continual learning and easy -use of trained agents without training, but it is not without its issues. Following describes two formats -used to save agents in stable baselines, their pros and shortcomings. - -Terminology used in this page: - -- *parameters* refer to neural network parameters (also called "weights"). This is a dictionary - mapping Tensorflow variable name to a NumPy array. -- *data* refers to RL algorithm parameters, e.g. learning rate, exploration schedule, action/observation space. - These depend on the algorithm used. This is a dictionary mapping classes variable names their values. - - -Cloudpickle (stable-baselines<=2.7.0) -------------------------------------- - -Original stable baselines save format. 
Data and parameters are bundled up into a tuple ``(data, parameters)`` -and then serialized with ``cloudpickle`` library (essentially the same as ``pickle``). - -This save format is still available via an argument in model save function in stable-baselines versions above -v2.7.0 for backwards compatibility reasons, but its usage is discouraged. - -Pros: - -- Easy to implement and use. -- Works with almost any type of Python object, including functions. - - -Cons: - -- Pickle/Cloudpickle is not designed for long-term storage or sharing between Python version. -- If one object in file is not readable (e.g. wrong library version), then reading the rest of the - file is difficult. -- Python-specific format, hard to read stored files from other languages. - - -If part of a saved model becomes unreadable for any reason (e.g. different Tensorflow versions), then -it may be tricky to restore any of the model. For this reason another save format was designed. - - -Zip-archive (stable-baselines>2.7.0) -------------------------------------- - -A zip-archived JSON dump and NumPy zip archive of the arrays. The data dictionary (class parameters) -is stored as a JSON file, model parameters are serialized with ``numpy.savez`` function and these two files -are stored under a single .zip archive. - -Any objects that are not JSON serializable are serialized with cloudpickle and stored as base64-encoded -string in the JSON file, along with some information that was stored in the serialization. This allows -inspecting stored objects without deserializing the object itself. - -This format allows skipping elements in the file, i.e. we can skip deserializing objects that are -broken/non-serializable. This can be done via ``custom_objects`` argument to load functions. - -This is the default save format in stable baselines versions after v2.7.0. 
- -File structure: - -:: - - saved_model.zip/ - ├── data JSON file of class-parameters (dictionary) - ├── parameter_list JSON file of model parameters and their ordering (list) - ├── parameters Bytes from numpy.savez (a zip file of the numpy arrays). ... - ├── ... Being a zip-archive itself, this object can also be opened ... - ├── ... as a zip-archive and browsed. - - -Pros: - - -- More robust to unserializable objects (one bad object does not break everything). -- Saved file can be inspected/extracted with zip-archive explorers and by other - languages. - - -Cons: - -- More complex implementation. -- Still relies partly on cloudpickle for complex objects (e.g. custom functions). diff --git a/docs/guide/tensorboard.rst b/docs/guide/tensorboard.rst deleted file mode 100644 index b10c1b66..00000000 --- a/docs/guide/tensorboard.rst +++ /dev/null @@ -1,146 +0,0 @@ -.. _tensorboard: - -Tensorboard Integration -========================== - -Basic Usage ------------- - -To use Tensorboard with the rl baselines, you simply need to define a log location for the RL agent: - -.. code-block:: python - - import gym - - from stable_baselines import A2C - - model = A2C('MlpPolicy', 'CartPole-v1', verbose=1, tensorboard_log="./a2c_cartpole_tensorboard/") - model.learn(total_timesteps=10000) - - -Or after loading an existing model (by default the log path is not saved): - -.. code-block:: python - - import gym - - from stable_baselines.common.vec_env import DummyVecEnv - from stable_baselines import A2C - - env = gym.make('CartPole-v1') - env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized environment to run - - model = A2C.load("./a2c_cartpole.pkl", env=env, tensorboard_log="./a2c_cartpole_tensorboard/") - model.learn(total_timesteps=10000) - - -You can also define custom logging name when training (by default it is the algorithm name) - -.. 
code-block:: python - - import gym - - from stable_baselines import A2C - - model = A2C('MlpPolicy', 'CartPole-v1', verbose=1, tensorboard_log="./a2c_cartpole_tensorboard/") - model.learn(total_timesteps=10000, tb_log_name="first_run") - # Pass reset_num_timesteps=False to continue the training curve in tensorboard - # By default, it will create a new curve - model.learn(total_timesteps=10000, tb_log_name="second_run", reset_num_timesteps=False) - model.learn(total_timesteps=10000, tb_log_name="thrid_run", reset_num_timesteps=False) - - -Once the learn function is called, you can monitor the RL agent during or after the training, with the following bash command: - -.. code-block:: bash - - tensorboard --logdir ./a2c_cartpole_tensorboard/ - -you can also add past logging folders: - -.. code-block:: bash - - tensorboard --logdir ./a2c_cartpole_tensorboard/;./ppo2_cartpole_tensorboard/ - -It will display information such as the model graph, the episode reward, the model losses, the observation and other parameter unique to some models. - -.. image:: ../_static/img/Tensorboard_example_1.png - :width: 400 - :alt: plotting - -.. image:: ../_static/img/Tensorboard_example_2.png - :width: 400 - :alt: histogram - -.. image:: ../_static/img/Tensorboard_example_3.png - :width: 400 - :alt: graph - - -Logging More Values -------------------- - -Using a callback, you can easily log more values with TensorBoard. -Here is a simple example on how to log both additional tensor or arbitrary scalar value: - -.. code-block:: python - - import tensorflow as tf - import numpy as np - - from stable_baselines import SAC - from stable_baselines.common.callbacks import BaseCallback - - model = SAC("MlpPolicy", "Pendulum-v0", tensorboard_log="/tmp/sac/", verbose=1) - - class TensorboardCallback(BaseCallback): - """ - Custom callback for plotting additional values in tensorboard. 
- """ - def __init__(self, verbose=0): - self.is_tb_set = False - super(TensorboardCallback, self).__init__(verbose) - - def _on_step(self) -> bool: - # Log additional tensor - if not self.is_tb_set: - with self.model.graph.as_default(): - tf.summary.scalar('value_target', tf.reduce_mean(self.model.value_target)) - self.model.summary = tf.summary.merge_all() - self.is_tb_set = True - # Log scalar value (here a random variable) - value = np.random.random() - summary = tf.Summary(value=[tf.Summary.Value(tag='random_value', simple_value=value)]) - self.locals['writer'].add_summary(summary, self.num_timesteps) - return True - - - model.learn(50000, callback=TensorboardCallback()) - -Legacy Integration -------------------- - -All the information displayed in the terminal (default logging) can be also logged in tensorboard. -For that, you need to define several environment variables: - -.. code-block:: bash - - # formats are comma-separated, but for tensorboard you only need the last one - # stdout -> terminal - export OPENAI_LOG_FORMAT='stdout,log,csv,tensorboard' - export OPENAI_LOGDIR=path/to/tensorboard/data - -and to configure the logger using: - -.. code-block:: python - - from stable_baselines.logger import configure - - configure() - - -Then start tensorboard with: - -.. code-block:: bash - - tensorboard --logdir=$OPENAI_LOGDIR diff --git a/docs/guide/vec_envs.rst b/docs/guide/vec_envs.rst deleted file mode 100644 index a5530cbb..00000000 --- a/docs/guide/vec_envs.rst +++ /dev/null @@ -1,85 +0,0 @@ -.. _vec_env: - -.. automodule:: stable_baselines.common.vec_env - -Vectorized Environments -======================= - -Vectorized Environments are a method for stacking multiple independent environments into a single environment. -Instead of training an RL agent on 1 environment per step, it allows us to train it on ``n`` environments per step. -Because of this, ``actions`` passed to the environment are now a vector (of dimension ``n``). 
-It is the same for ``observations``, ``rewards`` and end of episode signals (``dones``). -In the case of non-array observation spaces such as ``Dict`` or ``Tuple``, where different sub-spaces -may have different shapes, the sub-observations are vectors (of dimension ``n``). - -============= ======= ============ ======== ========= ================ -Name ``Box`` ``Discrete`` ``Dict`` ``Tuple`` Multi Processing -============= ======= ============ ======== ========= ================ -DummyVecEnv ✔️ ✔️ ✔️ ✔️ ❌️ -SubprocVecEnv ✔️ ✔️ ✔️ ✔️ ✔️ -============= ======= ============ ======== ========= ================ - -.. note:: - - Vectorized environments are required when using wrappers for frame-stacking or normalization. - -.. note:: - - When using vectorized environments, the environments are automatically reset at the end of each episode. - Thus, the observation returned for the i-th environment when ``done[i]`` is true will in fact be the first observation of the next episode, not the last observation of the episode that has just terminated. - You can access the "real" final observation of the terminated episode—that is, the one that accompanied the ``done`` event provided by the underlying environment—using the ``terminal_observation`` keys in the info dicts returned by the vecenv. - -.. warning:: - - When using ``SubprocVecEnv``, users must wrap the code in an ``if __name__ == "__main__":`` if using the ``forkserver`` or ``spawn`` start method (default on Windows). - On Linux, the default start method is ``fork`` which is not thread safe and can create deadlocks. - - For more information, see Python's `multiprocessing guidelines `_. - -VecEnv ------- - -.. autoclass:: VecEnv - :members: - -DummyVecEnv ------------ - -.. autoclass:: DummyVecEnv - :members: - -SubprocVecEnv -------------- - -.. autoclass:: SubprocVecEnv - :members: - -Wrappers --------- - -VecFrameStack -~~~~~~~~~~~~~ - -.. autoclass:: VecFrameStack - :members: - - -VecNormalize -~~~~~~~~~~~~ - -.. 
autoclass:: VecNormalize - :members: - - -VecVideoRecorder -~~~~~~~~~~~~~~~~ - -.. autoclass:: VecVideoRecorder - :members: - - -VecCheckNan -~~~~~~~~~~~~~~~~ - -.. autoclass:: VecCheckNan - :members: diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index a1a4cdc9..00000000 --- a/docs/index.rst +++ /dev/null @@ -1,126 +0,0 @@ -.. Stable Baselines documentation master file, created by - sphinx-quickstart on Sat Aug 25 10:33:54 2018. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to Stable Baselines docs! - RL Baselines Made Easy -=========================================================== - -`Stable Baselines `_ is a set of improved implementations -of Reinforcement Learning (RL) algorithms based on OpenAI `Baselines `_. - - -Github repository: https://github.com/hill-a/stable-baselines - -RL Baselines Zoo (collection of pre-trained agents): https://github.com/araffin/rl-baselines-zoo - -RL Baselines zoo also offers a simple interface to train, evaluate agents and do hyperparameter tuning. - -You can read a detailed presentation of Stable Baselines in the -Medium article: `link `_ - - -Main differences with OpenAI Baselines --------------------------------------- - -This toolset is a fork of OpenAI Baselines, with a major structural refactoring, and code cleanups: - -- Unified structure for all algorithms -- PEP8 compliant (unified code style) -- Documented functions and classes -- More tests & more code coverage -- Additional algorithms: SAC and TD3 (+ HER support for DQN, DDPG, SAC and TD3) - - -.. toctree:: - :maxdepth: 2 - :caption: User Guide - - guide/install - guide/quickstart - guide/rl_tips - guide/rl - guide/algos - guide/examples - guide/vec_envs - guide/custom_env - guide/custom_policy - guide/callbacks - guide/tensorboard - guide/rl_zoo - guide/pretrain - guide/checking_nan - guide/save_format - guide/export - - -.. 
toctree:: - :maxdepth: 1 - :caption: RL Algorithms - - modules/base - modules/policies - modules/a2c - modules/acer - modules/acktr - modules/ddpg - modules/dqn - modules/gail - modules/her - modules/ppo1 - modules/ppo2 - modules/sac - modules/td3 - modules/trpo - -.. toctree:: - :maxdepth: 1 - :caption: Common - - common/distributions - common/tf_utils - common/cmd_utils - common/schedules - common/evaluation - common/env_checker - common/monitor - -.. toctree:: - :maxdepth: 1 - :caption: Misc - - misc/changelog - misc/projects - misc/results_plotter - - -Citing Stable Baselines ------------------------ -To cite this project in publications: - -.. code-block:: bibtex - - @misc{stable-baselines, - author = {Hill, Ashley and Raffin, Antonin and Ernestus, Maximilian and Gleave, Adam and Kanervisto, Anssi and Traore, Rene and Dhariwal, Prafulla and Hesse, Christopher and Klimov, Oleg and Nichol, Alex and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai}, - title = {Stable Baselines}, - year = {2018}, - publisher = {GitHub}, - journal = {GitHub repository}, - howpublished = {\url{https://github.com/hill-a/stable-baselines}}, - } - -Contributing ------------- - -To any interested in making the rl baselines better, there are still some improvements -that need to be done. -A full TODO list is available in the `roadmap `_. - -If you want to contribute, please read `CONTRIBUTING.md `_ first. - -Indices and tables -------------------- - -* :ref:`genindex` -* :ref:`search` -* :ref:`modindex` diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 22b5fff4..00000000 --- a/docs/make.bat +++ /dev/null @@ -1,36 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=. -set BUILDDIR=_build -set SPHINXPROJ=StableBaselines - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. 
- echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% - -:end -popd diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst deleted file mode 100644 index ce47e4e9..00000000 --- a/docs/misc/changelog.rst +++ /dev/null @@ -1,789 +0,0 @@ -.. _changelog: - -Changelog -========== - -For download links, please look at `Github release page `_. - -Pre-Release 2.10.3a0 (WIP) ---------------------------- - - -Breaking Changes: -^^^^^^^^^^^^^^^^^ - -New Features: -^^^^^^^^^^^^^ - -Bug Fixes: -^^^^^^^^^^ -- Fixed bug in pretraining method that prevented from calling it twice. -- Fixed a bug where a crash would occur if a PPO2 model was trained in a vectorized environment, saved and subsequently loaded, then trained in a vectorized environment with a different length - -Deprecations: -^^^^^^^^^^^^^ - -Others: -^^^^^^^ - -Documentation: -^^^^^^^^^^^^^^ -- Adding the version warning banner on every documentation page (@qgallouedec) - -Release 2.10.2 (2021-04-05) ---------------------------- - -.. warning:: - - This package is in maintenance mode, please use `Stable-Baselines3 - (SB3)`_ for an up-to-date version. You can find a `migration guide`_ in - SB3 documentation. - - -.. _Stable-Baselines3 (SB3): https://github.com/DLR-RM/stable-baselines3 -.. 
_migration guide: https://stable-baselines3.readthedocs.io/en/master/guide/migration.html - - -Breaking Changes: -^^^^^^^^^^^^^^^^^ - -New Features: -^^^^^^^^^^^^^ -- EvalCallback now works also for recurrent policies (@mily20001) - -Bug Fixes: -^^^^^^^^^^ -- Fixed calculation of the log probability of Diagonal Gaussian distribution - when using ``action_probability()`` method (@SVJayanthi, @sunshineclt) -- Fixed docker image build (@anj1) - -Deprecations: -^^^^^^^^^^^^^ - -Others: -^^^^^^^ -- Faster tests, switched to GitHub CI - -Documentation: -^^^^^^^^^^^^^^ -- Added stable-baselines-tf2 link on Projects page. (@sophiagu) -- Fixed a typo in ``stable_baselines.common.env_checker.check_env`` (@OGordon100) - -Release 2.10.1 (2020-08-05) ---------------------------- - -**Bug fixes release** - -Breaking Changes: -^^^^^^^^^^^^^^^^^ -- ``render()`` method of ``VecEnvs`` now only accept one argument: ``mode`` - -New Features: -^^^^^^^^^^^^^ -- Added momentum parameter to A2C for the embedded RMSPropOptimizer (@kantneel) -- ActionNoise is now an abstract base class and implements ``__call__``, ``NormalActionNoise`` and ``OrnsteinUhlenbeckActionNoise`` have return types (@PartiallyTyped) -- HER now passes info dictionary to compute_reward, allowing for the computation of rewards that are independent of the goal (@tirafesi) - -Bug Fixes: -^^^^^^^^^^ -- Fixed DDPG sampling empty replay buffer when combined with HER (@tirafesi) -- Fixed a bug in ``HindsightExperienceReplayWrapper``, where the openai-gym signature for ``compute_reward`` was not matched correctly (@johannes-dornheim) -- Fixed SAC/TD3 checking time to update on learn steps instead of total steps (@PartiallyTyped) -- Added ``**kwarg`` pass through for ``reset`` method in ``atari_wrappers.FrameStack`` (@PartiallyTyped) -- Fix consistency in ``setup_model()`` for SAC, ``target_entropy`` now uses ``self.action_space`` instead of ``self.env.action_space`` (@PartiallyTyped) -- Fix reward threshold in 
``test_identity.py`` -- Partially fix tensorboard indexing for PPO2 (@enderdead) -- Fixed potential bug in ``DummyVecEnv`` where ``copy()`` was used instead of ``deepcopy()`` -- Fixed a bug in ``GAIL`` where the dataloader was not available after saving, causing an error when using ``CheckpointCallback`` -- Fixed a bug in ``SAC`` where any convolutional layers were not included in the target network parameters. -- Fixed ``render()`` method for ``VecEnvs`` -- Fixed ``seed()``` method for ``SubprocVecEnv`` -- Fixed a bug ``callback.locals`` did not have the correct values (@PartiallyTyped) -- Fixed a bug in the ``close()`` method of ``SubprocVecEnv``, causing wrappers further down in the wrapper stack to not be closed. (@NeoExtended) -- Fixed a bug in the ``generate_expert_traj()`` method in ``record_expert.py`` when using a non-image vectorized environment (@jbarsce) -- Fixed a bug in CloudPickleWrapper's (used by VecEnvs) ``__setstate___`` where loading was incorrectly using ``pickle.loads`` (@shwang). 
-- Fixed a bug in ``SAC`` and ``TD3`` where the log timesteps was not correct(@YangRui2015) -- Fixed a bug where the environment was reset twice when using ``evaluate_policy`` - - -Deprecations: -^^^^^^^^^^^^^ - -Others: -^^^^^^^ -- Added ``version.txt`` to manage version number in an easier way -- Added ``.readthedocs.yml`` to install requirements with read the docs -- Added a test for seeding ``SubprocVecEnv``` and rendering - -Documentation: -^^^^^^^^^^^^^^ -- Fix typos (@caburu) -- Fix typos in PPO2 (@kvenkman) -- Removed ``stable_baselines\deepq\experiments\custom_cartpole.py`` (@aakash94) -- Added Google's motion imitation project -- Added documentation page for monitor -- Fixed typos and update ``VecNormalize`` example to show normalization at test-time -- Fixed ``train_mountaincar`` description -- Added imitation baselines project -- Updated install instructions -- Added Slime Volleyball project (@hardmaru) -- Added a table of the variables accessible from the ``on_step`` function of the callbacks for each algorithm (@PartiallyTyped) -- Fix typo in README.md (@ColinLeongUDRI) -- Fix typo in gail.rst (@roccivic) - -Release 2.10.0 (2020-03-11) ---------------------------- - -**Callback collection, cleanup and bug fixes** - -Breaking Changes: -^^^^^^^^^^^^^^^^^ -- ``evaluate_policy`` now returns the standard deviation of the reward per episode - as second return value (instead of ``n_steps``) -- ``evaluate_policy`` now returns as second return value a list of the episode lengths - when ``return_episode_rewards`` is set to ``True`` (instead of ``n_steps``) -- Callback are now called after each ``env.step()`` for consistency (it was called every ``n_steps`` before - in algorithm like ``A2C`` or ``PPO2``) -- Removed unused code in ``common/a2c/utils.py`` (``calc_entropy_softmax``, ``make_path``) -- **Refactoring, including removed files and moving functions.** - - - Algorithms no longer import from each other, and ``common`` does not import from algorithms. 
- - ``a2c/utils.py`` removed and split into other files: - - - common/tf_util.py: ``sample``, ``calc_entropy``, ``mse``, ``avg_norm``, ``total_episode_reward_logger``, - ``q_explained_variance``, ``gradient_add``, ``avg_norm``, ``check_shape``, - ``seq_to_batch``, ``batch_to_seq``. - - common/tf_layers.py: ``conv``, ``linear``, ``lstm``, ``_ln``, ``lnlstm``, ``conv_to_fc``, ``ortho_init``. - - a2c/a2c.py: ``discount_with_dones``. - - acer/acer_simple.py: ``get_by_index``, ``EpisodeStats``. - - common/schedules.py: ``constant``, ``linear_schedule``, ``middle_drop``, ``double_linear_con``, ``double_middle_drop``, - ``SCHEDULES``, ``Scheduler``. - - - ``trpo_mpi/utils.py`` functions moved (``traj_segment_generator`` moved to ``common/runners.py``, ``flatten_lists`` to ``common/misc_util.py``). - - ``ppo2/ppo2.py`` functions moved (``safe_mean`` to ``common/math_util.py``, ``constfn`` and ``get_schedule_fn`` to ``common/schedules.py``). - - ``sac/policies.py`` function ``mlp`` moved to ``common/tf_layers.py``. - - ``sac/sac.py`` function ``get_vars`` removed (replaced with ``tf.util.get_trainable_vars``). - - ``deepq/replay_buffer.py`` renamed to ``common/buffers.py``. - - -New Features: -^^^^^^^^^^^^^ -- Parallelized updating and sampling from the replay buffer in DQN. (@flodorner) -- Docker build script, `scripts/build_docker.sh`, can push images automatically. -- Added callback collection -- Added ``unwrap_vec_normalize`` and ``sync_envs_normalization`` in the ``vec_env`` module - to synchronize two VecNormalize environment -- Added a seeding method for vectorized environments. (@NeoExtended) -- Added extend method to store batches of experience in ReplayBuffer. (@PartiallyTyped) - - -Bug Fixes: -^^^^^^^^^^ - -- Fixed Docker images via ``scripts/build_docker.sh`` and ``Dockerfile``: GPU image now contains ``tensorflow-gpu``, - and both images have ``stable_baselines`` installed in developer mode at correct directory for mounting. 
-- Fixed Docker GPU run script, ``scripts/run_docker_gpu.sh``, to work with new NVidia Container Toolkit. -- Repeated calls to ``RLModel.learn()`` now preserve internal counters for some episode - logging statistics that used to be zeroed at the start of every call. -- Fix `DummyVecEnv.render` for ``num_envs > 1``. This used to print a warning and then not render at all. (@shwang) -- Fixed a bug in PPO2, ACER, A2C, and ACKTR where repeated calls to ``learn(total_timesteps)`` reset - the environment on every call, potentially biasing samples toward early episode timesteps. - (@shwang) -- Fixed by adding lazy property ``ActorCriticRLModel.runner``. Subclasses now use lazily-generated - ``self.runner`` instead of reinitializing a new Runner every time ``learn()`` is called. -- Fixed a bug in ``check_env`` where it would fail on high dimensional action spaces -- Fixed ``Monitor.close()`` that was not calling the parent method -- Fixed a bug in ``BaseRLModel`` when seeding vectorized environments. (@NeoExtended) -- Fixed ``num_timesteps`` computation to be consistent between algorithms (updated after ``env.step()``) - Only ``TRPO`` and ``PPO1`` update it differently (after synchronization) because they rely on MPI -- Fixed bug in ``TRPO`` with NaN standardized advantages (@richardwu) -- Fixed partial minibatch computation in ExpertDataset (@richardwu) -- Fixed normalization (with ``VecNormalize``) for off-policy algorithms -- Fixed ``sync_envs_normalization`` to sync the reward normalization too -- Bump minimum Gym version (>=0.11) - -Deprecations: -^^^^^^^^^^^^^ - -Others: -^^^^^^^ -- Removed redundant return value from ``a2c.utils::total_episode_reward_logger``. 
(@shwang) -- Cleanup and refactoring in ``common/identity_env.py`` (@shwang) -- Added a Makefile to simplify common development tasks (build the doc, type check, run the tests) - - -Documentation: -^^^^^^^^^^^^^^ -- Add dedicated page for callbacks -- Fixed example for creating a GIF (@KuKuXia) -- Change Colab links in the README to point to the notebooks repo -- Fix typo in Reinforcement Learning Tips and Tricks page. (@mmcenta) - - -Release 2.9.0 (2019-12-20) --------------------------- - -*Reproducible results, automatic ``VecEnv`` wrapping, env checker and more usability improvements* - -Breaking Changes: -^^^^^^^^^^^^^^^^^ -- The ``seed`` argument has been moved from `learn()` method to model constructor - in order to have reproducible results -- ``allow_early_resets`` of the ``Monitor`` wrapper now default to ``True`` -- ``make_atari_env`` now returns a ``DummyVecEnv`` by default (instead of a ``SubprocVecEnv``) - this usually improves performance. -- Fix inconsistency of sample type, so that mode/sample function returns tensor of tf.int64 in CategoricalProbabilityDistribution/MultiCategoricalProbabilityDistribution (@seheevic) - -New Features: -^^^^^^^^^^^^^ -- Add ``n_cpu_tf_sess`` to model constructor to choose the number of threads used by Tensorflow -- Environments are automatically wrapped in a ``DummyVecEnv`` if needed when passing them to the model constructor -- Added ``stable_baselines.common.make_vec_env`` helper to simplify VecEnv creation -- Added ``stable_baselines.common.evaluation.evaluate_policy`` helper to simplify model evaluation -- ``VecNormalize`` changes: - - - Now supports being pickled and unpickled (@AdamGleave). 
- - New methods ``.normalize_obs(obs)`` and `normalize_reward(rews)` apply normalization - to arbitrary observation or rewards without updating statistics (@shwang) - - ``.get_original_reward()`` returns the unnormalized rewards from the most recent timestep - - ``.reset()`` now collects observation statistics (used to only apply normalization) - -- Add parameter ``exploration_initial_eps`` to DQN. (@jdossgollin) -- Add type checking and PEP 561 compliance. - Note: most functions are still not annotated, this will be a gradual process. -- DDPG, TD3 and SAC accept non-symmetric action spaces. (@Antymon) -- Add ``check_env`` util to check if a custom environment follows the gym interface (@araffin and @justinkterry) - -Bug Fixes: -^^^^^^^^^^ -- Fix seeding, so it is now possible to have deterministic results on cpu -- Fix a bug in DDPG where ``predict`` method with `deterministic=False` would fail -- Fix a bug in TRPO: mean_losses was not initialized causing the logger to crash when there was no gradients (@MarvineGothic) -- Fix a bug in ``cmd_util`` from API change in recent Gym versions -- Fix a bug in DDPG, TD3 and SAC where warmup and random exploration actions would end up scaled in the replay buffer (@Antymon) - -Deprecations: -^^^^^^^^^^^^^ -- ``nprocs`` (ACKTR) and ``num_procs`` (ACER) are deprecated in favor of ``n_cpu_tf_sess`` which is now common - to all algorithms -- ``VecNormalize``: ``load_running_average`` and ``save_running_average`` are deprecated in favour of using pickle. - -Others: -^^^^^^^ -- Add upper bound for Tensorflow version (<2.0.0). 
-- Refactored test to remove duplicated code -- Add pull request template -- Replaced redundant code in load_results (@jbulow) -- Minor PEP8 fixes in dqn.py (@justinkterry) -- Add a message to the assert in ``PPO2`` -- Update replay buffer doctring -- Fix ``VecEnv`` docstrings - -Documentation: -^^^^^^^^^^^^^^ -- Add plotting to the Monitor example (@rusu24edward) -- Add Snake Game AI project (@pedrohbtp) -- Add note on the support Tensorflow versions. -- Remove unnecessary steps required for Windows installation. -- Remove ``DummyVecEnv`` creation when not needed -- Added ``make_vec_env`` to the examples to simplify VecEnv creation -- Add QuaRL project (@srivatsankrishnan) -- Add Pwnagotchi project (@evilsocket) -- Fix multiprocessing example (@rusu24edward) -- Fix ``result_plotter`` example -- Add JNRR19 tutorial (by @edbeeching, @hill-a and @araffin) -- Updated notebooks link -- Fix typo in algos.rst, "containes" to "contains" (@SyllogismRXS) -- Fix outdated source documentation for load_results -- Add PPO_CPP project (@Antymon) -- Add section on C++ portability of Tensorflow models (@Antymon) -- Update custom env documentation to reflect new gym API for the ``close()`` method (@justinkterry) -- Update custom env documentation to clarify what step and reset return (@justinkterry) -- Add RL tips and tricks for doing RL experiments -- Corrected lots of typos -- Add spell check to documentation if available - - -Release 2.8.0 (2019-09-29) --------------------------- - -**MPI dependency optional, new save format, ACKTR with continuous actions** - -Breaking Changes: -^^^^^^^^^^^^^^^^^ -- OpenMPI-dependent algorithms (PPO1, TRPO, GAIL, DDPG) are disabled in the - default installation of stable_baselines. ``mpi4py`` is now installed as an - extra. When ``mpi4py`` is not available, stable-baselines skips imports of - OpenMPI-dependent algorithms. - See :ref:`installation notes ` and - `Issue #430 `_. 
-- SubprocVecEnv now defaults to a thread-safe start method, ``forkserver`` when - available and otherwise ``spawn``. This may require application code be - wrapped in ``if __name__ == '__main__'``. You can restore previous behavior - by explicitly setting ``start_method = 'fork'``. See - `PR #428 `_. -- Updated dependencies: tensorflow v1.8.0 is now required -- Removed ``checkpoint_path`` and ``checkpoint_freq`` argument from ``DQN`` that were not used -- Removed ``bench/benchmark.py`` that was not used -- Removed several functions from ``common/tf_util.py`` that were not used -- Removed ``ppo1/run_humanoid.py`` - -New Features: -^^^^^^^^^^^^^ -- **important change** Switch to using zip-archived JSON and Numpy ``savez`` for - storing models for better support across library/Python versions. (@Miffyli) -- ACKTR now supports continuous actions -- Add ``double_q`` argument to ``DQN`` constructor - -Bug Fixes: -^^^^^^^^^^ -- Skip automatic imports of OpenMPI-dependent algorithms to avoid an issue - where OpenMPI would cause stable-baselines to hang on Ubuntu installs. - See :ref:`installation notes ` and - `Issue #430 `_. -- Fix a bug when calling ``logger.configure()`` with MPI enabled (@keshaviyengar) -- set ``allow_pickle=True`` for numpy>=1.17.0 when loading expert dataset -- Fix a bug when using VecCheckNan with numpy ndarray as state. `Issue #489 `_. (@ruifeng96150) - -Deprecations: -^^^^^^^^^^^^^ -- Models saved with cloudpickle format (stable-baselines<=2.7.0) are now - deprecated in favor of zip-archive format for better support across - Python/Tensorflow versions. (@Miffyli) - -Others: -^^^^^^^ -- Implementations of noise classes (``AdaptiveParamNoiseSpec``, ``NormalActionNoise``, - ``OrnsteinUhlenbeckActionNoise``) were moved from `stable_baselines.ddpg.noise` - to ``stable_baselines.common.noise``. The API remains backward-compatible; - for example ``from stable_baselines.ddpg.noise import NormalActionNoise`` is still - okay. 
(@shwang) -- Docker images were updated -- Cleaned up files in ``common/`` folder and in `acktr/` folder that were only used by old ACKTR version - (e.g. `filter.py`) -- Renamed `acktr_disc.py` to `acktr.py` - -Documentation: -^^^^^^^^^^^^^^ -- Add WaveRL project (@jaberkow) -- Add Fenics-DRL project (@DonsetPG) -- Fix and rename custom policy names (@eavelardev) -- Add documentation on exporting models. -- Update maintainers list (Welcome to @Miffyli) - - -Release 2.7.0 (2019-07-31) --------------------------- - -**Twin Delayed DDPG (TD3) and GAE bug fix (TRPO, PPO1, GAIL)** - -Breaking Changes: -^^^^^^^^^^^^^^^^^ - -New Features: -^^^^^^^^^^^^^ -- added Twin Delayed DDPG (TD3) algorithm, with HER support -- added support for continuous action spaces to ``action_probability``, computing the PDF of a Gaussian - policy in addition to the existing support for categorical stochastic policies. -- added flag to ``action_probability`` to return log-probabilities. -- added support for python lists and numpy arrays in ``logger.writekvs``. (@dwiel) -- the info dict returned by VecEnvs now include a ``terminal_observation`` key providing access to the last observation in a trajectory. (@qxcv) - -Bug Fixes: -^^^^^^^^^^ -- fixed a bug in ``traj_segment_generator`` where the ``episode_starts`` was wrongly recorded, - resulting in wrong calculation of Generalized Advantage Estimation (GAE), this affects TRPO, PPO1 and GAIL (thanks to @miguelrass for spotting the bug) -- added missing property ``n_batch`` in ``BasePolicy``. 
- -Deprecations: -^^^^^^^^^^^^^ - -Others: -^^^^^^^ -- renamed some keys in ``traj_segment_generator`` to be more meaningful -- retrieve unnormalized reward when using Monitor wrapper with TRPO, PPO1 and GAIL - to display them in the logs (mean episode reward) -- clean up DDPG code (renamed variables) - -Documentation: -^^^^^^^^^^^^^^ - -- doc fix for the hyperparameter tuning command in the rl zoo -- added an example on how to log additional variable with tensorboard and a callback - - - -Release 2.6.0 (2019-06-12) --------------------------- - -**Hindsight Experience Replay (HER) - Reloaded | get/load parameters** - -Breaking Changes: -^^^^^^^^^^^^^^^^^ - -- **breaking change** removed ``stable_baselines.ddpg.memory`` in favor of ``stable_baselines.deepq.replay_buffer`` (see fix below) - -**Breaking Change:** DDPG replay buffer was unified with DQN/SAC replay buffer. As a result, -when loading a DDPG model trained with stable_baselines<2.6.0, it throws an import error. -You can fix that using: - -.. code-block:: python - - import sys - import pkg_resources - - import stable_baselines - - # Fix for breaking change for DDPG buffer in v2.6.0 - if pkg_resources.get_distribution("stable_baselines").version >= "2.6.0": - sys.modules['stable_baselines.ddpg.memory'] = stable_baselines.deepq.replay_buffer - stable_baselines.deepq.replay_buffer.Memory = stable_baselines.deepq.replay_buffer.ReplayBuffer - - -We recommend you to save again the model afterward, so the fix won't be needed the next time the trained agent is loaded. - - -New Features: -^^^^^^^^^^^^^ - -- **revamped HER implementation**: clean re-implementation from scratch, now supports DQN, SAC and DDPG -- add ``action_noise`` param for SAC, it helps exploration for problem with deceptive reward -- The parameter ``filter_size`` of the function ``conv`` in A2C utils now supports passing a list/tuple of two integers (height and width), in order to have non-squared kernel matrix. 
(@yutingsz) -- add ``random_exploration`` parameter for DDPG and SAC, it may be useful when using HER + DDPG/SAC. This hack was present in the original OpenAI Baselines DDPG + HER implementation. -- added ``load_parameters`` and ``get_parameters`` to base RL class. With these methods, users are able to load and get parameters to/from existing model, without touching tensorflow. (@Miffyli) -- added specific hyperparameter for PPO2 to clip the value function (``cliprange_vf``) -- added ``VecCheckNan`` wrapper - -Bug Fixes: -^^^^^^^^^^ - -- bugfix for ``VecEnvWrapper.__getattr__`` which enables access to class attributes inherited from parent classes. -- fixed path splitting in ``TensorboardWriter._get_latest_run_id()`` on Windows machines (@PatrickWalter214) -- fixed a bug where initial learning rate is logged instead of its placeholder in ``A2C.setup_model`` (@sc420) -- fixed a bug where number of timesteps is incorrectly updated and logged in ``A2C.learn`` and ``A2C._train_step`` (@sc420) -- fixed ``num_timesteps`` (total_timesteps) variable in PPO2 that was wrongly computed. -- fixed a bug in DDPG/DQN/SAC, when there were the number of samples in the replay buffer was lesser than the batch size - (thanks to @dwiel for spotting the bug) -- **removed** ``a2c.utils.find_trainable_params`` please use ``common.tf_util.get_trainable_vars`` instead. - ``find_trainable_params`` was returning all trainable variables, discarding the scope argument. - This bug was causing the model to save duplicated parameters (for DDPG and SAC) - but did not affect the performance. - -Deprecations: -^^^^^^^^^^^^^ - -- **deprecated** ``memory_limit`` and ``memory_policy`` in DDPG, please use ``buffer_size`` instead. (will be removed in v3.x.x) - -Others: -^^^^^^^ - -- **important change** switched to using dictionaries rather than lists when storing parameters, with tensorflow Variable names being the keys. 
(@Miffyli) -- removed unused dependencies (tdqm, dill, progressbar2, seaborn, glob2, click) -- removed ``get_available_gpus`` function which hadn't been used anywhere (@Pastafarianist) - -Documentation: -^^^^^^^^^^^^^^ - -- added guide for managing ``NaN`` and ``inf`` -- updated ven_env doc -- misc doc updates - -Release 2.5.1 (2019-05-04) --------------------------- - -**Bug fixes + improvements in the VecEnv** - -**Warning: breaking changes when using custom policies** - -- doc update (fix example of result plotter + improve doc) -- fixed logger issues when stdout lacks ``read`` function -- fixed a bug in ``common.dataset.Dataset`` where shuffling was not disabled properly (it affects only PPO1 with recurrent policies) -- fixed output layer name for DDPG q function, used in pop-art normalization and l2 regularization of the critic -- added support for multi env recording to ``generate_expert_traj`` (@XMaster96) -- added support for LSTM model recording to ``generate_expert_traj`` (@XMaster96) -- ``GAIL``: remove mandatory matplotlib dependency and refactor as subclass of ``TRPO`` (@kantneel and @AdamGleave) -- added ``get_attr()``, ``env_method()`` and ``set_attr()`` methods for all VecEnv. - Those methods now all accept ``indices`` keyword to select a subset of envs. - ``set_attr`` now returns ``None`` rather than a list of ``None``. (@kantneel) -- ``GAIL``: ``gail.dataset.ExpertDataset`` supports loading from memory rather than file, and - ``gail.dataset.record_expert`` supports returning in-memory rather than saving to file. -- added support in ``VecEnvWrapper`` for accessing attributes of arbitrarily deeply nested - instances of ``VecEnvWrapper`` and ``VecEnv``. This is allowed as long as the attribute belongs - to exactly one of the nested instances i.e. it must be unambiguous. 
(@kantneel) -- fixed bug where result plotter would crash on very short runs (@Pastafarianist) -- added option to not trim output of result plotter by number of timesteps (@Pastafarianist) -- clarified the public interface of ``BasePolicy`` and ``ActorCriticPolicy``. **Breaking change** when using custom policies: ``masks_ph`` is now called ``dones_ph``, - and most placeholders were made private: e.g. ``self.value_fn`` is now ``self._value_fn`` -- support for custom stateful policies. -- fixed episode length recording in ``trpo_mpi.utils.traj_segment_generator`` (@GerardMaggiolino) - - -Release 2.5.0 (2019-03-28) --------------------------- - -**Working GAIL, pretrain RL models and hotfix for A2C with continuous actions** - -- fixed various bugs in GAIL -- added scripts to generate dataset for gail -- added tests for GAIL + data for Pendulum-v0 -- removed unused ``utils`` file in DQN folder -- fixed a bug in A2C where actions were cast to ``int32`` even in the continuous case -- added addional logging to A2C when Monitor wrapper is used -- changed logging for PPO2: do not display NaN when reward info is not present -- change default value of A2C lr schedule -- removed behavior cloning script -- added ``pretrain`` method to base class, in order to use behavior cloning on all models -- fixed ``close()`` method for DummyVecEnv. -- added support for Dict spaces in DummyVecEnv and SubprocVecEnv. (@AdamGleave) -- added support for arbitrary multiprocessing start methods and added a warning about SubprocVecEnv that are not thread-safe by default. (@AdamGleave) -- added support for Discrete actions for GAIL -- fixed deprecation warning for tf: replaces ``tf.to_float()`` by ``tf.cast()`` -- fixed bug in saving and loading ddpg model when using normalization of obs or returns (@tperol) -- changed DDPG default buffer size from 100 to 50000. -- fixed a bug in ``ddpg.py`` in ``combined_stats`` for eval. 
Computed mean on ``eval_episode_rewards`` and ``eval_qs`` (@keshaviyengar) -- fixed a bug in ``setup.py`` that would error on non-GPU systems without TensorFlow installed - - -Release 2.4.1 (2019-02-11) --------------------------- - -**Bug fixes and improvements** - -- fixed computation of training metrics in TRPO and PPO1 -- added ``reset_num_timesteps`` keyword when calling train() to continue tensorboard learning curves -- reduced the size taken by tensorboard logs (added a ``full_tensorboard_log`` to enable full logging, which was the previous behavior) -- fixed image detection for tensorboard logging -- fixed ACKTR for recurrent policies -- fixed gym breaking changes -- fixed custom policy examples in the doc for DQN and DDPG -- remove gym spaces patch for equality functions -- fixed tensorflow dependency: cpu version was installed overwritting tensorflow-gpu when present. -- fixed a bug in ``traj_segment_generator`` (used in ppo1 and trpo) where ``new`` was not updated. (spotted by @junhyeokahn) - - -Release 2.4.0 (2019-01-17) --------------------------- - -**Soft Actor-Critic (SAC) and policy kwargs** - -- added Soft Actor-Critic (SAC) model -- fixed a bug in DQN where prioritized_replay_beta_iters param was not used -- fixed DDPG that did not save target network parameters -- fixed bug related to shape of true_reward (@abhiskk) -- fixed example code in documentation of tf_util:Function (@JohannesAck) -- added learning rate schedule for SAC -- fixed action probability for continuous actions with actor-critic models -- added optional parameter to action_probability for likelihood calculation of given action being taken. 
-- added more flexible custom LSTM policies -- added auto entropy coefficient optimization for SAC -- clip continuous actions at test time too for all algorithms (except SAC/DDPG where it is not needed) -- added a mean to pass kwargs to policy when creating a model (+ save those kwargs) -- fixed DQN examples in DQN folder -- added possibility to pass activation function for DDPG, DQN and SAC - - -Release 2.3.0 (2018-12-05) --------------------------- - -- added support for storing model in file like object. (thanks to @ernestum) -- fixed wrong image detection when using tensorboard logging with DQN -- fixed bug in ppo2 when passing non callable lr after loading -- fixed tensorboard logging in ppo2 when nminibatches=1 -- added early stoppping via callback return value (@ernestum) -- added more flexible custom mlp policies (@ernestum) - - -Release 2.2.1 (2018-11-18) --------------------------- - -- added VecVideoRecorder to record mp4 videos from environment. - - -Release 2.2.0 (2018-11-07) --------------------------- - -- Hotfix for ppo2, the wrong placeholder was used for the value function - - -Release 2.1.2 (2018-11-06) --------------------------- - -- added ``async_eigen_decomp`` parameter for ACKTR and set it to ``False`` by default (remove deprecation warnings) -- added methods for calling env methods/setting attributes inside a VecEnv (thanks to @bjmuld) -- updated gym minimum version - - -Release 2.1.1 (2018-10-20) --------------------------- - -- fixed MpiAdam synchronization issue in PPO1 (thanks to @brendenpetersen) issue #50 -- fixed dependency issues (new mujoco-py requires a mujoco license + gym broke MultiDiscrete space shape) - - -Release 2.1.0 (2018-10-2) -------------------------- - -.. 
warning:: - - This version contains breaking changes for DQN policies, please read the full details - -**Bug fixes + doc update** - - -- added patch fix for equal function using `gym.spaces.MultiDiscrete` and `gym.spaces.MultiBinary` -- fixes for DQN action_probability -- re-added double DQN + refactored DQN policies **breaking changes** -- replaced ``async`` with ``async_eigen_decomp`` in ACKTR/KFAC for python 3.7 compatibility -- removed action clipping for prediction of continuous actions (see issue #36) -- fixed NaN issue due to clipping the continuous action in the wrong place (issue #36) -- documentation was updated (policy + DDPG example hyperparameters) - -Release 2.0.0 (2018-09-18) --------------------------- - -.. warning:: - - This version contains breaking changes, please read the full details - -**Tensorboard, refactoring and bug fixes** - - -- Renamed DeepQ to DQN **breaking changes** -- Renamed DeepQPolicy to DQNPolicy **breaking changes** -- fixed DDPG behavior **breaking changes** -- changed default policies for DDPG, so that DDPG now works correctly **breaking changes** -- added more documentation (some modules from common). 
-- added doc about using custom env -- added Tensorboard support for A2C, ACER, ACKTR, DDPG, DeepQ, PPO1, PPO2 and TRPO -- added episode reward to Tensorboard -- added documentation for Tensorboard usage -- added Identity for Box action space -- fixed render function ignoring parameters when using wrapped environments -- fixed PPO1 and TRPO done values for recurrent policies -- fixed image normalization not occurring when using images -- updated VecEnv objects for the new Gym version -- added test for DDPG -- refactored DQN policies -- added registry for policies, can be passed as string to the agent -- added documentation for custom policies + policy registration -- fixed numpy warning when using DDPG Memory -- fixed DummyVecEnv not copying the observation array when stepping and resetting -- added pre-built docker images + installation instructions -- added ``deterministic`` argument in the predict function -- added assert in PPO2 for recurrent policies -- fixed predict function to handle both vectorized and unwrapped environment -- added input check to the predict function -- refactored ActorCritic models to reduce code duplication -- refactored Off Policy models (to begin HER and replay_buffer refactoring) -- added tests for auto vectorization detection -- fixed render function, to handle positional arguments - - -Release 1.0.7 (2018-08-29) --------------------------- - -**Bug fixes and documentation** - -- added html documentation using sphinx + integration with read the docs -- cleaned up README + typos -- fixed normalization for DQN with images -- fixed DQN identity test - - -Release 1.0.1 (2018-08-20) --------------------------- - -**Refactored Stable Baselines** - -- refactored A2C, ACER, ACTKR, DDPG, DeepQ, GAIL, TRPO, PPO1 and PPO2 under a single constant class -- added callback to refactored algorithm training -- added saving and loading to refactored algorithms -- refactored ACER, DDPG, GAIL, PPO1 and TRPO to fit with A2C, PPO2 and ACKTR policies -- 
added new policies for most algorithms (Mlp, MlpLstm, MlpLnLstm, Cnn, CnnLstm and CnnLnLstm) -- added dynamic environment switching (so continual RL learning is now feasible) -- added prediction from observation and action probability from observation for all the algorithms -- fixed graphs issues, so models wont collide in names -- fixed behavior_clone weight loading for GAIL -- fixed Tensorflow using all the GPU VRAM -- fixed models so that they are all compatible with vectorized environments -- fixed ``set_global_seed`` to update ``gym.spaces``'s random seed -- fixed PPO1 and TRPO performance issues when learning identity function -- added new tests for loading, saving, continuous actions and learning the identity function -- fixed DQN wrapping for atari -- added saving and loading for Vecnormalize wrapper -- added automatic detection of action space (for the policy network) -- fixed ACER buffer with constant values assuming n_stack=4 -- fixed some RL algorithms not clipping the action to be in the action_space, when using ``gym.spaces.Box`` -- refactored algorithms can take either a ``gym.Environment`` or a ``str`` ([if the environment name is registered](https://github.com/openai/gym/wiki/Environments)) -- Hoftix in ACER (compared to v1.0.0) - -Future Work : - -- Finish refactoring HER -- Refactor ACKTR and ACER for continuous implementation - - - -Release 0.1.6 (2018-07-27) --------------------------- - -**Deobfuscation of the code base + pep8 and fixes** - -- Fixed ``tf.session().__enter__()`` being used, rather than - ``sess = tf.session()`` and passing the session to the objects -- Fixed uneven scoping of TensorFlow Sessions throughout the code -- Fixed rolling vecwrapper to handle observations that are not only - grayscale images -- Fixed deepq saving the environment when trying to save itself -- Fixed - ``ValueError: Cannot take the length of Shape with unknown rank.`` in - ``acktr``, when running ``run_atari.py`` script. 
-- Fixed calling baselines sequentially no longer creates graph - conflicts -- Fixed mean on empty array warning with deepq -- Fixed kfac eigen decomposition not cast to float64, when the - parameter use_float64 is set to True -- Fixed Dataset data loader, not correctly resetting id position if - shuffling is disabled -- Fixed ``EOFError`` when reading from connection in the ``worker`` in - ``subproc_vec_env.py`` -- Fixed ``behavior_clone`` weight loading and saving for GAIL -- Avoid taking square root of negative number in ``trpo_mpi.py`` -- Removed some duplicated code (a2cpolicy, trpo_mpi) -- Removed unused, undocumented and crashing function ``reset_task`` in - ``subproc_vec_env.py`` -- Reformatted code to PEP8 style -- Documented all the codebase -- Added atari tests -- Added logger tests - -Missing: tests for acktr continuous (+ HER, rely on mujoco...) - -Maintainers ------------ - -Stable-Baselines is currently maintained by `Ashley Hill`_ (aka @hill-a), `Antonin Raffin`_ (aka `@araffin`_), -`Maximilian Ernestus`_ (aka @ernestum), `Adam Gleave`_ (`@AdamGleave`_) and `Anssi Kanervisto`_ (aka `@Miffyli`_). - -.. _Ashley Hill: https://github.com/hill-a -.. _Antonin Raffin: https://araffin.github.io/ -.. _Maximilian Ernestus: https://github.com/ernestum -.. _Adam Gleave: https://gleave.me/ -.. _@araffin: https://github.com/araffin -.. _@AdamGleave: https://github.com/adamgleave -.. _Anssi Kanervisto: https://github.com/Miffyli -.. _@Miffyli: https://github.com/Miffyli - - -Contributors (since v2.0.0): ----------------------------- -In random order...
- -Thanks to @bjmuld @iambenzo @iandanforth @r7vme @brendenpetersen @huvar @abhiskk @JohannesAck @mily20001 -@EliasHasle @mrakgr @Bleyddyn @antoine-galataud @junhyeokahn @AdamGleave @keshaviyengar @tperol -@XMaster96 @kantneel @Pastafarianist @GerardMaggiolino @PatrickWalter214 @yutingsz @sc420 @Aaahh @billtubbs -@Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150 @pedrohbtp @srivatsankrishnan @evilsocket -@MarvineGothic @jdossgollin @SyllogismRXS @rusu24edward @jbulow @Antymon @seheevic @justinkterry @edbeeching -@flodorner @KuKuXia @NeoExtended @PartiallyTyped @mmcenta @richardwu @tirafesi @caburu @johannes-dornheim @kvenkman @aakash94 -@enderdead @hardmaru @jbarsce @ColinLeongUDRI @shwang @YangRui2015 @sophiagu @OGordon100 @SVJayanthi @sunshineclt -@roccivic @anj1 \ No newline at end of file diff --git a/docs/misc/projects.rst b/docs/misc/projects.rst deleted file mode 100644 index 1798a537..00000000 --- a/docs/misc/projects.rst +++ /dev/null @@ -1,216 +0,0 @@ -.. _projects: - -Projects -========= - -This is a list of projects using stable-baselines. -Please tell us, if you want your project to appear on this page ;) - - -Stable Baselines for TensorFlow 2 ---------------------------------- -A fork of the original stable-baselines repo that works with TF2.x. - -| Author: Sophia Gu (@sophiagu) -| Github repo: https://github.com/sophiagu/stable-baselines-tf2 - - -Slime Volleyball Gym Environment --------------------------------- -A simple environment for benchmarking single and multi-agent reinforcement learning algorithms on a clone of the Slime Volleyball game. Only dependencies are gym and numpy. Both state and pixel observation environments are available. The motivation of this environment is to easily enable trained agents to play against each other, and also facilitate the training of agents directly in a multi-agent setting, thus adding an extra dimension for evaluating an agent's performance. 
- -Uses stable-baselines to train RL agents for both state and pixel observation versions of the task. A tutorial is also provided on modifying stable-baselines for self-play using PPO. - -| Author: David Ha (@hardmaru) -| Github repo: https://github.com/hardmaru/slimevolleygym - - -Learning to drive in a day --------------------------- -Implementation of reinforcement learning approach to make a donkey car learn to drive. -Uses DDPG on VAE features (reproducing paper from wayve.ai) - -| Author: Roma Sokolkov (@r7vme) -| Github repo: https://github.com/r7vme/learning-to-drive-in-a-day - - -Donkey Gym ----------- -OpenAI gym environment for donkeycar simulator. - -| Author: Tawn Kramer (@tawnkramer) -| Github repo: https://github.com/tawnkramer/donkey_gym - - -Self-driving FZERO Artificial Intelligence ------------------------------------------- -Series of videos on how to make a self-driving FZERO artificial intelligence using reinforcement learning algorithms PPO2 and A2C. - -| Author: Lucas Thompson -| `Video Link `_ - - -S-RL Toolbox -------------- -S-RL Toolbox: Reinforcement Learning (RL) and State Representation Learning (SRL) for Robotics. -Stable-Baselines was originally developed for this project.
- -| Authors: Antonin Raffin, Ashley Hill, René Traoré, Timothée Lesort, Natalia Díaz-Rodríguez, David Filliat -| Github repo: https://github.com/araffin/robotics-rl-srl - - -Roboschool simulations training on Amazon SageMaker ---------------------------------------------------- -"In this notebook example, we will make HalfCheetah learn to walk using the stable-baselines [...]" - -| Author: Amazon AWS -| `Repo Link `_ - - -MarathonEnvs + OpenAi.Baselines -------------------------------- -Experimental - using OpenAI baselines with MarathonEnvs (ML-Agents) - -| Author: Joe Booth (@Sohojoe) -| Github repo: https://github.com/Sohojoe/MarathonEnvsBaselines - - -Learning to drive smoothly in minutes -------------------------------------- -Implementation of reinforcement learning approach to make a car learn to drive smoothly in minutes. -Uses SAC on VAE features. - -| Author: Antonin Raffin (@araffin) -| Blog post: https://towardsdatascience.com/learning-to-drive-smoothly-in-minutes-450a7cdb35f4 -| Github repo: https://github.com/araffin/learning-to-drive-in-5-minutes - - -Making Roboy move with elegance -------------------------------- -Project around Roboy, a tendon-driven robot, that enabled it to move its shoulder in simulation to reach a pre-defined point in 3D space. The agent used Proximal Policy Optimization (PPO) or Soft Actor-Critic (SAC) and was tested on the real hardware. 
- -| Authors: Alexander Pakakis, Baris Yazici, Tomas Ruiz -| Email: FirstName.LastName@tum.de -| GitHub repo: https://github.com/Roboy/DeepAndReinforced -| DockerHub image: deepandreinforced/rl:latest -| Presentation: https://tinyurl.com/DeepRoboyControl -| Video: https://tinyurl.com/DeepRoboyControlVideo -| Blog post: https://tinyurl.com/mediumDRC -| Website: https://roboy.org/ - - -Train a ROS-integrated mobile robot (differential drive) to avoid dynamic objects ---------------------------------------------------------------------------------- -The RL-agent serves as local planner and is trained in a simulator, fusion of the Flatland Simulator and the crowd simulator Pedsim. This was tested on a real mobile robot. -The Proximal Policy Optimization (PPO) algorithm is applied. - -| Author: Ronja Güldenring -| Email: 6guelden@informatik.uni-hamburg.de -| Video: https://www.youtube.com/watch?v=laGrLaMaeT4 -| GitHub: https://github.com/RGring/drl_local_planner_ros_stable_baselines - - -Adversarial Policies: Attacking Deep Reinforcement Learning ------------------------------------------------------------ -Uses Stable Baselines to train *adversarial policies* that attack pre-trained victim policies in a zero-sum multi-agent environments. -May be useful as an example of how to integrate Stable Baselines with `Ray `_ to perform distributed experiments and `Sacred `_ for experiment configuration and monitoring. - -| Authors: Adam Gleave, Michael Dennis, Neel Kant, Cody Wild -| Email: adam@gleave.me -| GitHub: https://github.com/HumanCompatibleAI/adversarial-policies -| Paper: https://arxiv.org/abs/1905.10615 -| Website: https://adversarialpolicies.github.io - - -WaveRL: Training RL agents to perform active damping ----------------------------------------------------- -Reinforcement learning is used to train agents to control pistons attached to a bridge to cancel out vibrations. 
The bridge is modeled as a one dimensional oscillating system and dynamics are simulated using a finite difference solver. Agents were trained using Proximal Policy Optimization. See presentation for environment details. - -| Author: Jack Berkowitz -| Email: jackberkowitz88@gmail.com -| GitHub: https://github.com/jaberkow/WaveRL -| Presentation: http://bit.ly/WaveRLslides - - -Fenics-DRL: Fluid mechanics and Deep Reinforcement Learning ------------------------------------------------------------ -Deep Reinforcement Learning is used to control the position or the shape of obstacles in different fluids in order to optimize drag or lift. `Fenics `_ is used for the Fluid Mechanics part, and Stable Baselines is used for the DRL. - -| Authors: Paul Garnier, Jonathan Viquerat, Aurélien Larcher, Elie Hachem -| Email: paul.garnier@mines-paristech.fr -| GitHub: https://github.com/DonsetPG/openFluid -| Paper: https://arxiv.org/abs/1908.04127 -| Website: https://donsetpg.github.io/blog/2019/08/06/DRL-FM-review/ - - -Air Learning: An AI Research Platform Algorithm Hardware Benchmarking of Autonomous Aerial Robots -------------------------------------------------------------------------------------------------- -Aerial robotics is a cross-layer, interdisciplinary field. Air Learning is an effort to bridge seemingly disparate fields. - -Designing an autonomous robot to perform a task involves interactions between various boundaries spanning from modeling the environment down to the choice of onboard computer platform available in the robot. Our goal through building Air Learning is to provide researchers with a cross-domain infrastructure that allows them to holistically study and evaluate reinforcement learning algorithms for autonomous aerial machines. We use stable-baselines to train UAV agent with Deep Q-Networks and Proximal Policy Optimization algorithms.
- -| Authors: Srivatsan Krishnan, Behzad Boroujerdian, William Fu, Aleksandra Faust, Vijay Janapa Reddi -| Email: srivatsan@seas.harvard.edu -| Github: https://github.com/harvard-edge/airlearning -| Paper: https://arxiv.org/pdf/1906.00421.pdf -| Video: https://www.youtube.com/watch?v=oakzGnh7Llw (Simulation), https://www.youtube.com/watch?v=cvO5YOzI0mg (on a CrazyFlie Nano-Drone) - - -Snake Game AI --------------------------- -AI to play the classic snake game. -The game was trained using PPO2 available from stable-baselines and -then exported to tensorflowjs to run directly on the browser - -| Author: Pedro Torres (@pedrohbtp) -| Repository: https://github.com/pedrohbtp/snake-rl -| Website: https://www.pedro-torres.com/snake-rl/ - - -Pwnagotchi --------------------------- -Pwnagotchi is an A2C-based “AI” powered by bettercap and running on a Raspberry Pi Zero W that learns from its surrounding WiFi environment in order to maximize the crackable WPA key material it captures (either through passive sniffing or by performing deauthentication and association attacks). This material is collected on disk as PCAP files containing any form of handshake supported by hashcat, including full and half WPA handshakes as well as PMKIDs. - -| Author: Simone Margaritelli (@evilsocket) -| Repository: https://github.com/evilsocket/pwnagotchi -| Website: https://pwnagotchi.ai/ - - -Quantized Reinforcement Learning (QuaRL) ----------------------------------------- -QuaRL is an open-source framework to study the effects of quantization on a broad spectrum of reinforcement learning algorithms. The RL algorithms we used in -this study are from stable-baselines.
- -| Authors: Srivatsan Krishnan, Sharad Chitlangia, Maximilian Lam, Zishen Wan, Aleksandra Faust, Vijay Janapa Reddi -| Email: srivatsan@seas.harvard.edu -| Github: https://github.com/harvard-edge/quarl -| Paper: https://arxiv.org/pdf/1910.01055.pdf - - -PPO_CPP: C++ version of a Deep Reinforcement Learning algorithm PPO -------------------------------------------------------------------- -Executes PPO at C++ level yielding notable execution performance speedups. -Uses Stable Baselines to create a computational graph which is then used for training with custom environments by machine-code-compiled binary. - -| Author: Szymon Brych -| Email: szymon.brych@gmail.com -| GitHub: https://github.com/Antymon/ppo_cpp - - -Learning Agile Robotic Locomotion Skills by Imitating Animals -------------------------------------------------------------- -Learning locomotion gaits by imitating animals. It uses PPO1 and AWR. - -| Authors: Xue Bin Peng, Erwin Coumans, Tingnan Zhang, Tsang-Wei Lee, Jie Tan, Sergey Levine -| Website: https://xbpeng.github.io/projects/Robotic_Imitation/index.html -| Github: https://github.com/google-research/motion_imitation -| Paper: https://arxiv.org/abs/2004.00784 - - -Imitation Learning Baseline Implementations -------------------------------------------- -This project aims to provide clean implementations of imitation learning algorithms. -Currently we have implementations of AIRL and GAIL, and intend to add more in the future. - -| Authors: Adam Gleave, Steven Wang, Nevan Wichers, Sam Toyer -| Github: https://github.com/HumanCompatibleAI/imitation diff --git a/docs/misc/results_plotter.rst b/docs/misc/results_plotter.rst deleted file mode 100644 index 283958f2..00000000 --- a/docs/misc/results_plotter.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. _results_plotter: - - -Plotting Results -================ - -.. 
automodule:: stable_baselines.results_plotter - :members: diff --git a/docs/modules/a2c.rst b/docs/modules/a2c.rst deleted file mode 100644 index 45f34555..00000000 --- a/docs/modules/a2c.rst +++ /dev/null @@ -1,127 +0,0 @@ -.. _a2c: - -.. automodule:: stable_baselines.a2c - - -A2C -==== - -A synchronous, deterministic variant of `Asynchronous Advantage Actor Critic (A3C) `_. -It uses multiple workers to avoid the use of a replay buffer. - - -Notes ------ - -- Original paper: https://arxiv.org/abs/1602.01783 -- OpenAI blog post: https://openai.com/blog/baselines-acktr-a2c/ -- ``python -m stable_baselines.a2c.run_atari`` runs the algorithm for 40M - frames = 10M timesteps on an Atari game. See help (``-h``) for more - options. -- ``python -m stable_baselines.a2c.run_mujoco`` runs the algorithm for 1M - frames on a Mujoco environment. - -Can I use? ----------- - -- Recurrent policies: ✔️ -- Multi processing: ✔️ -- Gym spaces: - - -============= ====== =========== -Space Action Observation -============= ====== =========== -Discrete ✔️ ✔️ -Box ✔️ ✔️ -MultiDiscrete ✔️ ✔️ -MultiBinary ✔️ ✔️ -============= ====== =========== - - -Example -------- - -Train a A2C agent on `CartPole-v1` using 4 processes. - -.. code-block:: python - - import gym - - from stable_baselines.common.policies import MlpPolicy - from stable_baselines.common import make_vec_env - from stable_baselines import A2C - - # Parallel environments - env = make_vec_env('CartPole-v1', n_envs=4) - - model = A2C(MlpPolicy, env, verbose=1) - model.learn(total_timesteps=25000) - model.save("a2c_cartpole") - - del model # remove to demonstrate saving and loading - - model = A2C.load("a2c_cartpole") - - obs = env.reset() - while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - -Parameters ----------- - -.. 
autoclass:: A2C - :members: - :inherited-members: - - -Callbacks - Accessible Variables --------------------------------- - -Depending on initialization parameters and timestep, different variables are accessible. -Variables accessible "From timestep X" are variables that can be accessed when -``self.timestep==X`` in the ``on_step`` function. - - +--------------------------------+-----------------------------------------------------+ - |Variable | Availability| - +================================+=====================================================+ - |- self |From timestep 1 | - |- total_timesteps | | - |- callback | | - |- log_interval | | - |- tb_log_name | | - |- reset_num_timesteps | | - |- new_tb_log | | - |- writer | | - |- t_start | | - |- mb_obs | | - |- mb_rewards | | - |- mb_actions | | - |- mb_values | | - |- mb_dones | | - |- mb_states | | - |- ep_infos | | - |- actions | | - |- values | | - |- states | | - |- clipped_actions | | - |- obs | | - |- rewards | | - |- dones | | - |- infos | | - +--------------------------------+-----------------------------------------------------+ - |- info |From timestep 2 | - |- maybe_ep_info | | - +--------------------------------+-----------------------------------------------------+ - |- update |From timestep ``n_step+1`` | - |- rollout | | - |- masks | | - |- true_reward | | - +--------------------------------+-----------------------------------------------------+ - |- value_loss |From timestep ``2 * n_step+1`` | - |- policy_entropy | | - |- n_seconds | | - |- fps | | - +--------------------------------+-----------------------------------------------------+ diff --git a/docs/modules/acer.rst b/docs/modules/acer.rst deleted file mode 100644 index ea9dd1b4..00000000 --- a/docs/modules/acer.rst +++ /dev/null @@ -1,118 +0,0 @@ -.. _acer: - -.. 
automodule:: stable_baselines.acer - - -ACER -==== - - `Sample Efficient Actor-Critic with Experience Replay (ACER) `_ combines - several ideas of previous algorithms: it uses multiple workers (as A2C), implements a replay buffer (as in DQN), - uses Retrace for Q-value estimation, importance sampling and a trust region. - - -Notes ------ - -- Original paper: https://arxiv.org/abs/1611.01224 -- ``python -m stable_baselines.acer.run_atari`` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (``-h``) for more options. - -Can I use? ----------- - -- Recurrent policies: ✔️ -- Multi processing: ✔️ -- Gym spaces: - - -============= ====== =========== -Space Action Observation -============= ====== =========== -Discrete ✔️ ✔️ -Box ❌ ✔️ -MultiDiscrete ❌ ✔️ -MultiBinary ❌ ✔️ -============= ====== =========== - - -Example -------- - -.. code-block:: python - - import gym - - from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy - from stable_baselines.common import make_vec_env - from stable_baselines import ACER - - # multiprocess environment - env = make_vec_env('CartPole-v1', n_envs=4) - - model = ACER(MlpPolicy, env, verbose=1) - model.learn(total_timesteps=25000) - model.save("acer_cartpole") - - del model # remove to demonstrate saving and loading - - model = ACER.load("acer_cartpole") - - obs = env.reset() - while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - - -Parameters ----------- - -.. autoclass:: ACER - :members: - :inherited-members: - - -Callbacks - Accessible Variables --------------------------------- - -Depending on initialization parameters and timestep, different variables are accessible. -Variables accessible from "timestep X" are variables that can be accessed when -``self.timestep==X`` from the ``on_step`` function. 
- - +--------------------------------+-----------------------------------------------------+ - |Variable | Availability| - +================================+=====================================================+ - |- self | From timestep 1 | - |- total_timesteps | | - |- callback | | - |- log_interval | | - |- tb_log_name | | - |- reset_num_timesteps | | - |- new_tb_log | | - |- writer | | - |- episode_stats | | - |- buffer | | - |- t_start | | - |- enc_obs | | - |- mb_obs | | - |- mb_actions | | - |- mb_mus | | - |- mb_dones | | - |- mb_rewards | | - |- actions | | - |- states | | - |- mus | | - |- clipped_actions | | - |- obs | | - |- rewards | | - |- dones | | - +--------------------------------+-----------------------------------------------------+ - |- steps | From timestep ``n_step+1`` | - |- masks | | - +--------------------------------+-----------------------------------------------------+ - |- names_ops | From timestep ``2 * n_step+1`` | - |- values_ops | | - +--------------------------------+-----------------------------------------------------+ - |- samples_number | After replay_start steps, when replay_ratio > 0 and| - | | buffer is not None | - +--------------------------------+-----------------------------------------------------+ diff --git a/docs/modules/acktr.rst b/docs/modules/acktr.rst deleted file mode 100644 index 7337dc48..00000000 --- a/docs/modules/acktr.rst +++ /dev/null @@ -1,134 +0,0 @@ -.. _acktr: - -.. automodule:: stable_baselines.acktr - - -ACKTR -===== - -`Actor Critic using Kronecker-Factored Trust Region (ACKTR) `_ uses -Kronecker-factored approximate curvature (K-FAC) for trust region optimization. - - -Notes ------ - -- Original paper: https://arxiv.org/abs/1708.05144 -- Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ -- ``python -m stable_baselines.acktr.run_atari`` runs the algorithm for 40M frames = 10M timesteps on an Atari game. - See help (``-h``) for more options. - -Can I use? 
----------- - -- Recurrent policies: ✔️ -- Multi processing: ✔️ -- Gym spaces: - - -============= ====== =========== -Space Action Observation -============= ====== =========== -Discrete ✔️ ✔️ -Box ✔️ ✔️ -MultiDiscrete ❌ ✔️ -MultiBinary ❌ ✔️ -============= ====== =========== - - -Example -------- - -.. code-block:: python - - import gym - - from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy - from stable_baselines.common import make_vec_env - from stable_baselines import ACKTR - - # multiprocess environment - env = make_vec_env('CartPole-v1', n_envs=4) - - model = ACKTR(MlpPolicy, env, verbose=1) - model.learn(total_timesteps=25000) - model.save("acktr_cartpole") - - del model # remove to demonstrate saving and loading - - model = ACKTR.load("acktr_cartpole") - - obs = env.reset() - while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - - -Parameters ----------- - -.. autoclass:: ACKTR - :members: - :inherited-members: - - - - -Callbacks - Accessible Variables --------------------------------- - -Depending on initialization parameters and timestep, different variables are accessible. -Variables accessible from "timestep X" are variables that can be accessed when -``self.timestep==X`` from the ``on_step`` function. 
- - +--------------------------------+-----------------------------------------------------+ - |Variable | Availability| - +================================+=====================================================+ - |- self |From timestep 1 | - |- total_timesteps | | - |- callback | | - |- log_interval | | - |- tb_log_name | | - |- reset_num_timesteps | | - |- new_tb_log | | - |- writer | | - |- tf_vars | | - |- is_uninitialized | | - |- new_uninitialized_vars | | - |- t_start | | - |- coord | | - |- enqueue_threads | | - |- old_uninitialized_vars | | - |- mb_obs | | - |- mb_rewards | | - |- mb_actions | | - |- mb_values | | - |- mb_dones | | - |- mb_states | | - |- ep_infos | | - |- _ | | - |- actions | | - |- values | | - |- states | | - |- clipped_actions | | - |- obs | | - |- rewards | | - |- dones | | - |- infos | | - +--------------------------------+-----------------------------------------------------+ - |- info |From timestep 2 | - |- maybe_ep_info | | - +--------------------------------+-----------------------------------------------------+ - |- update |From timestep ``n_steps+1`` | - |- rollout | | - |- returns | | - |- masks | | - |- true_reward | | - +--------------------------------+-----------------------------------------------------+ - |- policy_loss |From timestep ``2*n_steps+1`` | - |- value_loss | | - |- policy_entropy | | - |- n_seconds | | - |- fps | | - +--------------------------------+-----------------------------------------------------+ diff --git a/docs/modules/base.rst b/docs/modules/base.rst deleted file mode 100644 index 84dfa1b2..00000000 --- a/docs/modules/base.rst +++ /dev/null @@ -1,12 +0,0 @@ -.. _base_algo: - -.. automodule:: stable_baselines.common.base_class - - -Base RL Class -============= - -Common interface for all the RL algorithms - -.. 
autoclass:: BaseRLModel - :members: diff --git a/docs/modules/ddpg.rst b/docs/modules/ddpg.rst deleted file mode 100644 index f6883a20..00000000 --- a/docs/modules/ddpg.rst +++ /dev/null @@ -1,227 +0,0 @@ -.. _ddpg: - -.. automodule:: stable_baselines.ddpg - - -DDPG -==== -`Deep Deterministic Policy Gradient (DDPG) `_ - -.. note:: - - DDPG requires :ref:`OpenMPI `. If OpenMPI isn't enabled, then DDPG isn't - imported into the ``stable_baselines`` module. - -.. warning:: - - The DDPG model does not support ``stable_baselines.common.policies`` because it uses q-value instead - of value estimation, as a result it must use its own policy models (see :ref:`ddpg_policies`). - - -.. rubric:: Available Policies - -.. autosummary:: - :nosignatures: - - MlpPolicy - LnMlpPolicy - CnnPolicy - LnCnnPolicy - -Notes ------ - -- Original paper: https://arxiv.org/abs/1509.02971 -- Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/ -- ``python -m stable_baselines.ddpg.main`` runs the algorithm for 1M frames = 10M timesteps - on a Mujoco environment. See help (``-h``) for more options. - -Can I use? ----------- - -- Recurrent policies: ❌ -- Multi processing: ✔️ (using MPI) -- Gym spaces: - - -============= ====== =========== -Space Action Observation -============= ====== =========== -Discrete ❌ ✔️ -Box ✔️ ✔️ -MultiDiscrete ❌ ✔️ -MultiBinary ❌ ✔️ -============= ====== =========== - - -Example -------- - -.. 
code-block:: python - - import gym - import numpy as np - - from stable_baselines.ddpg.policies import MlpPolicy - from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec - from stable_baselines import DDPG - - env = gym.make('MountainCarContinuous-v0') - - # the noise objects for DDPG - n_actions = env.action_space.shape[-1] - param_noise = None - action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions)) - - model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise) - model.learn(total_timesteps=400000) - model.save("ddpg_mountain") - - del model # remove to demonstrate saving and loading - - model = DDPG.load("ddpg_mountain") - - obs = env.reset() - while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - -Parameters ----------- - -.. autoclass:: DDPG - :members: - :inherited-members: - -.. _ddpg_policies: - -DDPG Policies -------------- - -.. autoclass:: MlpPolicy - :members: - :inherited-members: - - -.. autoclass:: LnMlpPolicy - :members: - :inherited-members: - - -.. autoclass:: CnnPolicy - :members: - :inherited-members: - - -.. autoclass:: LnCnnPolicy - :members: - :inherited-members: - - -Action and Parameters Noise ---------------------------- - -.. autoclass:: AdaptiveParamNoiseSpec - :members: - :inherited-members: - -.. autoclass:: NormalActionNoise - :members: - :inherited-members: - -.. autoclass:: OrnsteinUhlenbeckActionNoise - :members: - :inherited-members: - - -Custom Policy Network ---------------------- - -Similarly to the example given in the `examples <../guide/custom_policy.html>`_ page. -You can easily define a custom architecture for the policy network: - -.. 
code-block:: python - - import gym - - from stable_baselines.ddpg.policies import FeedForwardPolicy - from stable_baselines import DDPG - - # Custom MLP policy of two layers of size 16 each - class CustomDDPGPolicy(FeedForwardPolicy): - def __init__(self, *args, **kwargs): - super(CustomDDPGPolicy, self).__init__(*args, **kwargs, - layers=[16, 16], - layer_norm=False, - feature_extraction="mlp") - - - model = DDPG(CustomDDPGPolicy, 'Pendulum-v0', verbose=1) - # Train the agent - model.learn(total_timesteps=100000) - - -Callbacks - Accessible Variables --------------------------------- - - -Depending on initialization parameters and timestep, different variables are accessible. -Variables accessible from "timestep X" are variables that can be accessed when ``self.timestep==X`` from the ``on_step`` function. - - +--------------------------------+-----------------------------------------------------+ - |Variable | Availability| - +================================+=====================================================+ - |- self |From timestep 1 | - |- total_timesteps | | - |- callback | | - |- log_interval | | - |- tb_log_name | | - |- reset_num_timesteps | | - |- replay_wrapper | | - |- new_tb_log | | - |- writer | | - |- rank | | - |- eval_episode_rewards_history | | - |- episode_rewards_history | | - |- episode_successes | | - |- obs | | - |- eval_obs | | - |- episode_reward | | - |- episode_step | | - |- episodes | | - |- step | | - |- total_steps | | - |- start_time | | - |- epoch_episode_rewards | | - |- epoch_episode_steps | | - |- epoch_actor_losses | | - |- epoch_critic_losses | | - |- epoch_adaptive_distances | | - |- eval_episode_rewards | | - |- eval_qs | | - |- epoch_actions | | - |- epoch_qs | | - |- epoch_episodes | | - |- epoch | | - |- action | | - |- q_value | | - |- unscaled_action | | - |- new_obs | | - |- reward | | - |- done | | - |- info | | - +--------------------------------+-----------------------------------------------------+ - |- obs\_ 
|From timestep 2 | - |- new_obs\_ | | - |- reward\_ | | - +--------------------------------+-----------------------------------------------------+ - |- t_train |After nb_rollout_steps+1 | - +--------------------------------+-----------------------------------------------------+ - |- distance |After | - | |nb_rollout_steps*ceil(nb_rollout_steps/batch_size)```| - |- critic_loss | | - |- actor_loss | | - +--------------------------------+-----------------------------------------------------+ - |- maybe_is_success |After episode termination | - +--------------------------------+-----------------------------------------------------+ diff --git a/docs/modules/dqn.rst b/docs/modules/dqn.rst deleted file mode 100644 index 0eea78d5..00000000 --- a/docs/modules/dqn.rst +++ /dev/null @@ -1,226 +0,0 @@ -.. _dqn: - -.. automodule:: stable_baselines.deepq - - -DQN -=== - -`Deep Q Network (DQN) `_ -and its extensions (Double-DQN, Dueling-DQN, Prioritized Experience Replay). - -.. warning:: - - The DQN model does not support ``stable_baselines.common.policies``, - as a result it must use its own policy models (see :ref:`deepq_policies`). - -.. rubric:: Available Policies - -.. autosummary:: - :nosignatures: - - MlpPolicy - LnMlpPolicy - CnnPolicy - LnCnnPolicy - -Notes ------ - -- DQN paper: https://arxiv.org/abs/1312.5602 -- Dueling DQN: https://arxiv.org/abs/1511.06581 -- Double-Q Learning: https://arxiv.org/abs/1509.06461 -- Prioritized Experience Replay: https://arxiv.org/abs/1511.05952 - -.. note:: - - By default, the DQN class has double q learning and dueling extensions enabled. - See `Issue #406 `_ for disabling dueling. - To disable double-q learning, you can change the default value in the constructor. - - -Can I use? 
----------- - -- Recurrent policies: ❌ -- Multi processing: ❌ -- Gym spaces: - - -============= ====== =========== -Space Action Observation -============= ====== =========== -Discrete ✔️ ✔️ -Box ❌ ✔️ -MultiDiscrete ❌ ✔️ -MultiBinary ❌ ✔️ -============= ====== =========== - - -Example -------- - -.. code-block:: python - - import gym - - from stable_baselines.common.vec_env import DummyVecEnv - from stable_baselines.deepq.policies import MlpPolicy - from stable_baselines import DQN - - env = gym.make('CartPole-v1') - - model = DQN(MlpPolicy, env, verbose=1) - model.learn(total_timesteps=25000) - model.save("deepq_cartpole") - - del model # remove to demonstrate saving and loading - - model = DQN.load("deepq_cartpole") - - obs = env.reset() - while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - - -With Atari: - -.. code-block:: python - - from stable_baselines.common.atari_wrappers import make_atari - from stable_baselines.deepq.policies import MlpPolicy, CnnPolicy - from stable_baselines import DQN - - env = make_atari('BreakoutNoFrameskip-v4') - - model = DQN(CnnPolicy, env, verbose=1) - model.learn(total_timesteps=25000) - model.save("deepq_breakout") - - del model # remove to demonstrate saving and loading - - model = DQN.load("deepq_breakout") - - obs = env.reset() - while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - -Parameters ----------- - -.. autoclass:: DQN - :members: - :inherited-members: - -.. _deepq_policies: - -DQN Policies ------------- - -.. autoclass:: MlpPolicy - :members: - :inherited-members: - - -.. autoclass:: LnMlpPolicy - :members: - :inherited-members: - - -.. autoclass:: CnnPolicy - :members: - :inherited-members: - - -.. autoclass:: LnCnnPolicy - :members: - :inherited-members: - - -Custom Policy Network ---------------------- - -Similarly to the example given in the `examples <../guide/custom_policy.html>`_ page. 
-You can easily define a custom architecture for the policy network: - -.. code-block:: python - - import gym - - from stable_baselines.deepq.policies import FeedForwardPolicy - from stable_baselines.common.vec_env import DummyVecEnv - from stable_baselines import DQN - - # Custom MLP policy of two layers of size 32 each - class CustomDQNPolicy(FeedForwardPolicy): - def __init__(self, *args, **kwargs): - super(CustomDQNPolicy, self).__init__(*args, **kwargs, - layers=[32, 32], - layer_norm=False, - feature_extraction="mlp") - - # Create and wrap the environment - env = gym.make('LunarLander-v2') - env = DummyVecEnv([lambda: env]) - - model = DQN(CustomDQNPolicy, env, verbose=1) - # Train the agent - model.learn(total_timesteps=100000) - - -Callbacks - Accessible Variables --------------------------------- - -Depending on initialization parameters and timestep, different variables are accessible. -Variables accessible from "timestep X" are variables that can be accessed when -``self.timestep==X`` from the ``on_step`` function. 
- - +--------------------------------+-----------------------------------------------------+ - |Variable | Availability| - +================================+=====================================================+ - |- self |From timestep 1 | - |- total_timesteps | | - |- callback | | - |- log_interval | | - |- tb_log_name | | - |- reset_num_timesteps | | - |- replay_wrapper | | - |- new_tb_log | | - |- writer | | - |- episode_rewards | | - |- episode_successes | | - |- reset | | - |- obs | | - |- _ | | - |- kwargs | | - |- update_eps | | - |- update_param_noise_threshold | | - |- action | | - |- env_action | | - |- new_obs | | - |- rew | | - |- done | | - |- info | | - +--------------------------------+-----------------------------------------------------+ - |- obs\_ |From timestep 2 | - |- new_obs\_ | | - |- reward\_ | | - |- can_sample | | - |- mean_100ep_reward | | - |- num_episodes | | - +--------------------------------+-----------------------------------------------------+ - |- maybe_is_success |After the first episode | - +--------------------------------+-----------------------------------------------------+ - |- obses_t |After at least ``max(batch_size, learning_starts)`` | - |- actions |and every `train_freq` steps | - |- rewards | | - |- obses_tp1 | | - |- dones | | - |- weights | | - |- batch_idxes | | - |- td_errors | | - +--------------------------------+-----------------------------------------------------+ \ No newline at end of file diff --git a/docs/modules/gail.rst b/docs/modules/gail.rst deleted file mode 100644 index f3c3d31f..00000000 --- a/docs/modules/gail.rst +++ /dev/null @@ -1,136 +0,0 @@ -.. _gail: - -.. automodule:: stable_baselines.gail - - -GAIL -==== - -The `Generative Adversarial Imitation Learning (GAIL) `_ uses expert trajectories -to recover a cost function and then learn a policy. - -Learning a cost function from expert demonstrations is called Inverse Reinforcement Learning (IRL). 
-The connection between GAIL and Generative Adversarial Networks (GANs) is that it uses a discriminator that tries -to separate expert trajectory from trajectories of the learned policy, which has the role of the generator here. - -.. note:: - - GAIL requires :ref:`OpenMPI `. If OpenMPI isn't enabled, then GAIL isn't - imported into the ``stable_baselines`` module. - - -Notes ------ - -- Original paper: https://arxiv.org/abs/1606.03476 - -.. warning:: - - Images are not yet handled properly by the current implementation - - - -If you want to train an imitation learning agent ------------------------------------------------- - - -Step 1: Generate expert data -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You can either train a RL algorithm in a classic setting, use another controller (e.g. a PID controller) -or human demonstrations. - -We recommend you to take a look at :ref:`pre-training ` section -or directly look at ``stable_baselines/gail/dataset/`` folder to learn more about the expected format for the dataset. - -Here is an example of training a Soft Actor-Critic model to generate expert trajectories for GAIL: - - -.. code-block:: python - - from stable_baselines import SAC - from stable_baselines.gail import generate_expert_traj - - # Generate expert trajectories (train expert) - model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1) - # Train for 60000 timesteps and record 10 trajectories - # all the data will be saved in 'expert_pendulum.npz' file - generate_expert_traj(model, 'expert_pendulum', n_timesteps=60000, n_episodes=10) - - - -Step 2: Run GAIL -~~~~~~~~~~~~~~~~ - - -**In case you want to run Behavior Cloning (BC)** - -Use the ``.pretrain()`` method (cf guide). - - -**Others** - -Thanks to the open source: - -- @openai/imitation -- @carpedm20/deep-rl-tensorflow - - -Can I use? 
----------- - -- Recurrent policies: ❌ -- Multi processing: ✔️ (using MPI) -- Gym spaces: - - -============= ====== =========== -Space Action Observation -============= ====== =========== -Discrete ✔️ ✔️ -Box ✔️ ✔️ -MultiDiscrete ❌ ✔️ -MultiBinary ❌ ✔️ -============= ====== =========== - - -Example -------- - -.. code-block:: python - - import gym - - from stable_baselines import GAIL, SAC - from stable_baselines.gail import ExpertDataset, generate_expert_traj - - # Generate expert trajectories (train expert) - model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1) - generate_expert_traj(model, 'expert_pendulum', n_timesteps=100, n_episodes=10) - - # Load the expert dataset - dataset = ExpertDataset(expert_path='expert_pendulum.npz', traj_limitation=10, verbose=1) - - model = GAIL('MlpPolicy', 'Pendulum-v0', dataset, verbose=1) - # Note: in practice, you need to train for 1M steps to have a working policy - model.learn(total_timesteps=1000) - model.save("gail_pendulum") - - del model # remove to demonstrate saving and loading - - model = GAIL.load("gail_pendulum") - - env = gym.make('Pendulum-v0') - obs = env.reset() - while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - - -Parameters ----------- - -.. autoclass:: GAIL - :members: - :inherited-members: diff --git a/docs/modules/her.rst b/docs/modules/her.rst deleted file mode 100644 index e64cd7ed..00000000 --- a/docs/modules/her.rst +++ /dev/null @@ -1,110 +0,0 @@ -.. _her: - -.. automodule:: stable_baselines.her - - -HER -==== - -`Hindsight Experience Replay (HER) `_ - -HER is a method wrapper that works with Off policy methods (DQN, SAC, TD3 and DDPG for example). - -.. note:: - - HER was re-implemented from scratch in Stable-Baselines compared to the original OpenAI baselines. - If you want to reproduce results from the paper, please use the rl baselines zoo - in order to have the correct hyperparameters and at least 8 MPI workers with DDPG. - -.. 
warning:: - - HER requires the environment to inherits from `gym.GoalEnv `_ - - -.. warning:: - - you must pass an environment or wrap it with ``HERGoalEnvWrapper`` in order to use the predict method - - -Notes ------ - -- Original paper: https://arxiv.org/abs/1707.01495 -- OpenAI paper: `Plappert et al. (2018)`_ -- OpenAI blog post: https://openai.com/blog/ingredients-for-robotics-research/ - - -.. _Plappert et al. (2018): https://arxiv.org/abs/1802.09464 - -Can I use? ----------- - -Please refer to the wrapped model (DQN, SAC, TD3 or DDPG) for that section. - -Example -------- - -.. code-block:: python - - from stable_baselines import HER, DQN, SAC, DDPG, TD3 - from stable_baselines.her import GoalSelectionStrategy, HERGoalEnvWrapper - from stable_baselines.common.bit_flipping_env import BitFlippingEnv - - model_class = DQN # works also with SAC, DDPG and TD3 - - env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS) - - # Available strategies (cf paper): future, final, episode, random - goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE - - # Wrap the model - model = HER('MlpPolicy', env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy, - verbose=1) - # Train the model - model.learn(1000) - - model.save("./her_bit_env") - - # WARNING: you must pass an env - # or wrap your environment with HERGoalEnvWrapper to use the predict method - model = HER.load('./her_bit_env', env=env) - - obs = env.reset() - for _ in range(100): - action, _ = model.predict(obs) - obs, reward, done, _ = env.step(action) - - if done: - obs = env.reset() - - -Parameters ----------- - -.. autoclass:: HER - :members: - -Goal Selection Strategies -------------------------- - -.. autoclass:: GoalSelectionStrategy - :members: - :inherited-members: - :undoc-members: - - -Goal Env Wrapper ----------------- - -.. 
autoclass:: HERGoalEnvWrapper - :members: - :inherited-members: - :undoc-members: - - -Replay Wrapper --------------- - -.. autoclass:: HindsightExperienceReplayWrapper - :members: - :inherited-members: diff --git a/docs/modules/policies.rst b/docs/modules/policies.rst deleted file mode 100644 index b630f573..00000000 --- a/docs/modules/policies.rst +++ /dev/null @@ -1,73 +0,0 @@ -.. _policies: - -.. automodule:: stable_baselines.common.policies - -Policy Networks -=============== - -Stable-baselines provides a set of default policies, that can be used with most action spaces. -To customize the default policies, you can specify the ``policy_kwargs`` parameter to the model class you use. -Those kwargs are then passed to the policy on instantiation (see :ref:`custom_policy` for an example). -If you need more control on the policy architecture, you can also create a custom policy (see :ref:`custom_policy`). - -.. note:: - - CnnPolicies are for images only. MlpPolicies are made for other type of features (e.g. robot joints) - -.. warning:: - For all algorithms (except DDPG, TD3 and SAC), continuous actions are clipped during training and testing - (to avoid out of bound error). - - -.. rubric:: Available Policies - -.. autosummary:: - :nosignatures: - - MlpPolicy - MlpLstmPolicy - MlpLnLstmPolicy - CnnPolicy - CnnLstmPolicy - CnnLnLstmPolicy - - -Base Classes ------------- - -.. autoclass:: BasePolicy - :members: - -.. autoclass:: ActorCriticPolicy - :members: - -.. autoclass:: FeedForwardPolicy - :members: - -.. autoclass:: LstmPolicy - :members: - -MLP Policies ------------- - -.. autoclass:: MlpPolicy - :members: - -.. autoclass:: MlpLstmPolicy - :members: - -.. autoclass:: MlpLnLstmPolicy - :members: - - -CNN Policies ------------- - -.. autoclass:: CnnPolicy - :members: - -.. autoclass:: CnnLstmPolicy - :members: - -.. 
autoclass:: CnnLnLstmPolicy - :members: diff --git a/docs/modules/ppo1.rst b/docs/modules/ppo1.rst deleted file mode 100644 index 98caf867..00000000 --- a/docs/modules/ppo1.rst +++ /dev/null @@ -1,141 +0,0 @@ -.. _ppo1: - -.. automodule:: stable_baselines.ppo1 - - -PPO1 -==== - -The `Proximal Policy Optimization `_ algorithm combines ideas from A2C (having multiple workers) -and TRPO (it uses a trust region to improve the actor). - -The main idea is that after an update, the new policy should be not too far from the ``old`` policy. -For that, ppo uses clipping to avoid too large update. - -.. note:: - - PPO1 requires :ref:`OpenMPI `. If OpenMPI isn't enabled, then PPO1 isn't - imported into the ``stable_baselines`` module. - -.. note:: - - PPO1 uses MPI for multiprocessing unlike PPO2, which uses vectorized environments. - PPO2 is the implementation OpenAI made for GPU. - -Notes ------ - -- Original paper: https://arxiv.org/abs/1707.06347 -- Clear explanation of PPO on Arxiv Insights channel: https://www.youtube.com/watch?v=5P7I-xPq8u8 -- OpenAI blog post: https://blog.openai.com/openai-baselines-ppo/ -- ``mpirun -np 8 python -m stable_baselines.ppo1.run_atari`` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (``-h``) for more options. -- ``python -m stable_baselines.ppo1.run_mujoco`` runs the algorithm for 1M frames on a Mujoco environment. -- Train mujoco 3d humanoid (with optimal-ish hyperparameters): ``mpirun -np 16 python -m stable_baselines.ppo1.run_humanoid --model-path=/path/to/model`` -- Render the 3d humanoid: ``python -m stable_baselines.ppo1.run_humanoid --play --model-path=/path/to/model`` - -Can I use? ----------- - -- Recurrent policies: ❌ -- Multi processing: ✔️ (using MPI) -- Gym spaces: - - -============= ====== =========== -Space Action Observation -============= ====== =========== -Discrete ✔️ ✔️ -Box ✔️ ✔️ -MultiDiscrete ✔️ ✔️ -MultiBinary ✔️ ✔️ -============= ====== =========== - - -Example -------- - -.. 
code-block:: python - - import gym - - from stable_baselines.common.policies import MlpPolicy - from stable_baselines import PPO1 - - env = gym.make('CartPole-v1') - - model = PPO1(MlpPolicy, env, verbose=1) - model.learn(total_timesteps=25000) - model.save("ppo1_cartpole") - - del model # remove to demonstrate saving and loading - - model = PPO1.load("ppo1_cartpole") - - obs = env.reset() - while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - - -Parameters ----------- - -.. autoclass:: PPO1 - :members: - :inherited-members: - - -Callbacks - Accessible Variables --------------------------------- - -Depending on initialization parameters and timestep, different variables are accessible. -Variables accessible "From timestep X" are variables that can be accessed when -``self.timestep==X`` in the ``on_step`` function. - - +--------------------------------+-----------------------------------------------------+ - |Variable | Availability| - +================================+=====================================================+ - |- self |From timestep 0 | - |- total_timesteps | | - |- callback | | - |- log_interval | | - |- tb_log_name | | - |- reset_num_timesteps | | - |- new_tb_log | | - |- writer | | - |- policy | | - |- env | | - |- horizon | | - |- reward_giver | | - |- gail | | - |- step | | - |- cur_ep_ret | | - |- current_it_len | | - |- current_ep_len | | - |- cur_ep_true_ret | | - |- ep_true_rets | | - |- ep_rets | | - |- ep_lens | | - |- observations | | - |- true_rewards | | - |- rewards | | - |- vpreds | | - |- episode_starts | | - |- dones | | - |- actions | | - |- states | | - |- episode_start | | - |- done | | - |- vpred | | - |- _ | | - |- i | | - |- clipped_action | | - |- reward | | - |- true_reward | | - |- info | | - |- action | | - |- observation | | - +--------------------------------+-----------------------------------------------------+ - |- maybe_ep_info |After the first episode 
termination | - +--------------------------------+-----------------------------------------------------+ diff --git a/docs/modules/ppo2.rst b/docs/modules/ppo2.rst deleted file mode 100644 index 05c6f454..00000000 --- a/docs/modules/ppo2.rst +++ /dev/null @@ -1,131 +0,0 @@ -.. _ppo2: - -.. automodule:: stable_baselines.ppo2 - -PPO2 -==== - -The `Proximal Policy Optimization `_ algorithm combines ideas from A2C (having multiple workers) -and TRPO (it uses a trust region to improve the actor). - -The main idea is that after an update, the new policy should be not too far from the old policy. -For that, PPO uses clipping to avoid too large update. - -.. note:: - - PPO2 is the implementation of OpenAI made for GPU. For multiprocessing, it uses vectorized environments - compared to PPO1 which uses MPI. - -.. note:: - - PPO2 contains several modifications from the original algorithm not documented - by OpenAI: value function is also clipped and advantages are normalized. - - -Notes ------ - -- Original paper: https://arxiv.org/abs/1707.06347 -- Clear explanation of PPO on Arxiv Insights channel: https://www.youtube.com/watch?v=5P7I-xPq8u8 -- OpenAI blog post: https://blog.openai.com/openai-baselines-ppo/ -- ``python -m stable_baselines.ppo2.run_atari`` runs the algorithm for 40M - frames = 10M timesteps on an Atari game. See help (``-h``) for more - options. -- ``python -m stable_baselines.ppo2.run_mujoco`` runs the algorithm for 1M - frames on a Mujoco environment. - -Can I use? ----------- - -- Recurrent policies: ✔️ -- Multi processing: ✔️ -- Gym spaces: - - -============= ====== =========== -Space Action Observation -============= ====== =========== -Discrete ✔️ ✔️ -Box ✔️ ✔️ -MultiDiscrete ✔️ ✔️ -MultiBinary ✔️ ✔️ -============= ====== =========== - -Example -------- - -Train a PPO agent on `CartPole-v1` using 4 processes. - -.. 
code-block:: python - - import gym - - from stable_baselines.common.policies import MlpPolicy - from stable_baselines.common import make_vec_env - from stable_baselines import PPO2 - - # multiprocess environment - env = make_vec_env('CartPole-v1', n_envs=4) - - model = PPO2(MlpPolicy, env, verbose=1) - model.learn(total_timesteps=25000) - model.save("ppo2_cartpole") - - del model # remove to demonstrate saving and loading - - model = PPO2.load("ppo2_cartpole") - - # Enjoy trained agent - obs = env.reset() - while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - -Parameters ----------- - -.. autoclass:: PPO2 - :members: - :inherited-members: - -Callbacks - Accessible Variables --------------------------------- - -Depending on initialization parameters and timestep, different variables are accessible. -Variables accessible "From timestep X" are variables that can be accessed when -``self.timestep==X`` in the ``on_step`` function. 
- - +--------------------------------+-----------------------------------------------------+ - |Variable | Availability| - +================================+=====================================================+ - |- self |From timestep 1 | - |- total_timesteps | | - |- callback | | - |- log_interval | | - |- tb_log_name | | - |- reset_num_timesteps | | - |- cliprange_vf | | - |- new_tb_log | | - |- writer | | - |- t_first_start | | - |- n_updates | | - |- mb_obs | | - |- mb_rewards | | - |- mb_actions | | - |- mb_values | | - |- mb_dones | | - |- mb_neglogpacs | | - |- mb_states | | - |- ep_infos | | - |- actions | | - |- values | | - |- neglogpacs | | - |- clipped_actions | | - |- rewards | | - |- infos | | - +--------------------------------+-----------------------------------------------------+ - |- info |From timestep 1 | - |- maybe_ep_info | | - +--------------------------------+-----------------------------------------------------+ - diff --git a/docs/modules/sac.rst b/docs/modules/sac.rst deleted file mode 100644 index 47c8a2c2..00000000 --- a/docs/modules/sac.rst +++ /dev/null @@ -1,209 +0,0 @@ -.. _sac: - -.. automodule:: stable_baselines.sac - - -SAC -=== - -`Soft Actor Critic (SAC) `_ Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor. - -SAC is the successor of `Soft Q-Learning SQL `_ and incorporates the double Q-learning trick from TD3. -A key feature of SAC, and a major difference with common RL algorithms, is that it is trained to maximize a trade-off between expected return and entropy, a measure of randomness in the policy. - - -.. warning:: - - The SAC model does not support ``stable_baselines.common.policies`` because it uses double q-values - and value estimation, as a result it must use its own policy models (see :ref:`sac_policies`). - - -.. rubric:: Available Policies - -.. 
autosummary:: - :nosignatures: - - MlpPolicy - LnMlpPolicy - CnnPolicy - LnCnnPolicy - -Notes ------ - -- Original paper: https://arxiv.org/abs/1801.01290 -- OpenAI Spinning Guide for SAC: https://spinningup.openai.com/en/latest/algorithms/sac.html -- Original Implementation: https://github.com/haarnoja/sac -- Blog post on using SAC with real robots: https://bair.berkeley.edu/blog/2018/12/14/sac/ - -.. note:: - In our implementation, we use an entropy coefficient (as in OpenAI Spinning or Facebook Horizon), - which is the equivalent to the inverse of reward scale in the original SAC paper. - The main reason is that it avoids having too high errors when updating the Q functions. - - -.. note:: - - The default policies for SAC differ a bit from others MlpPolicy: it uses ReLU instead of tanh activation, - to match the original paper - - -Can I use? ----------- - -- Recurrent policies: ❌ -- Multi processing: ❌ -- Gym spaces: - - -============= ====== =========== -Space Action Observation -============= ====== =========== -Discrete ❌ ✔️ -Box ✔️ ✔️ -MultiDiscrete ❌ ✔️ -MultiBinary ❌ ✔️ -============= ====== =========== - - -Example -------- - -.. code-block:: python - - import gym - import numpy as np - - from stable_baselines.sac.policies import MlpPolicy - from stable_baselines import SAC - - env = gym.make('Pendulum-v0') - - model = SAC(MlpPolicy, env, verbose=1) - model.learn(total_timesteps=50000, log_interval=10) - model.save("sac_pendulum") - - del model # remove to demonstrate saving and loading - - model = SAC.load("sac_pendulum") - - obs = env.reset() - while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - -Parameters ----------- - -.. autoclass:: SAC - :members: - :inherited-members: - -.. _sac_policies: - -SAC Policies -------------- - -.. autoclass:: MlpPolicy - :members: - :inherited-members: - - -.. autoclass:: LnMlpPolicy - :members: - :inherited-members: - - -.. 
autoclass:: CnnPolicy - :members: - :inherited-members: - - -.. autoclass:: LnCnnPolicy - :members: - :inherited-members: - - -Custom Policy Network ---------------------- - -Similarly to the example given in the `examples <../guide/custom_policy.html>`_ page. -You can easily define a custom architecture for the policy network: - -.. code-block:: python - - import gym - - from stable_baselines.sac.policies import FeedForwardPolicy - from stable_baselines.common.vec_env import DummyVecEnv - from stable_baselines import SAC - - # Custom MLP policy of three layers of size 128 each - class CustomSACPolicy(FeedForwardPolicy): - def __init__(self, *args, **kwargs): - super(CustomSACPolicy, self).__init__(*args, **kwargs, - layers=[128, 128, 128], - layer_norm=False, - feature_extraction="mlp") - - # Create and wrap the environment - env = gym.make('Pendulum-v0') - env = DummyVecEnv([lambda: env]) - - model = SAC(CustomSACPolicy, env, verbose=1) - # Train the agent - model.learn(total_timesteps=100000) - - - -Callbacks - Accessible Variables --------------------------------- - -Depending on initialization parameters and timestep, different variables are accessible. -Variables accessible "From timestep X" are variables that can be accessed when -``self.timestep==X`` in the ``on_step`` function. 
- - +--------------------------------+-----------------------------------------------------+ - |Variable | Availability| - +================================+=====================================================+ - |- self |From timestep 1 | - |- total_timesteps | | - |- callback | | - |- log_interval | | - |- tb_log_name | | - |- reset_num_timesteps | | - |- replay_wrapper | | - |- new_tb_log | | - |- writer | | - |- current_lr | | - |- start_time | | - |- episode_rewards | | - |- episode_successes | | - |- obs | | - |- n_updates | | - |- infos_values | | - |- step | | - |- unscaled_action | | - |- action | | - |- new_obs | | - |- reward | | - |- done | | - |- info | | - +--------------------------------+-----------------------------------------------------+ - |- obs\_ |From timestep 2 | - |- new_obs\_ | | - |- reward\_ | | - |- maybe_ep_info | | - |- mean_reward | | - |- num_episodes | | - +--------------------------------+-----------------------------------------------------+ - |- mb_infos_vals |After timestep train_freq steps | - |- grad_step | | - +--------------------------------+-----------------------------------------------------+ - |- frac |After timestep train_freq steps | - | |After at least batch_size and learning_starts steps | - +--------------------------------+-----------------------------------------------------+ - |- maybe_is_success |After the first episode | - +--------------------------------+-----------------------------------------------------+ - diff --git a/docs/modules/td3.rst b/docs/modules/td3.rst deleted file mode 100644 index 0b9d4703..00000000 --- a/docs/modules/td3.rst +++ /dev/null @@ -1,214 +0,0 @@ -.. _td3: - -.. automodule:: stable_baselines.td3 - - -TD3 -=== - -`Twin Delayed DDPG (TD3) `_ Addressing Function Approximation Error in Actor-Critic Methods. - -TD3 is a direct successor of DDPG and improves it using three major tricks: clipped double Q-Learning, delayed policy update and target policy smoothing. 
-We recommend reading `OpenAI Spinning guide on TD3 `_ to learn more about those. - - -.. warning:: - - The TD3 model does not support ``stable_baselines.common.policies`` because it uses double q-values - estimation, as a result it must use its own policy models (see :ref:`td3_policies`). - - -.. rubric:: Available Policies - -.. autosummary:: - :nosignatures: - - MlpPolicy - LnMlpPolicy - CnnPolicy - LnCnnPolicy - -Notes ------ - -- Original paper: https://arxiv.org/pdf/1802.09477.pdf -- OpenAI Spinning Guide for TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html -- Original Implementation: https://github.com/sfujim/TD3 - -.. note:: - - The default policies for TD3 differ a bit from others MlpPolicy: it uses ReLU instead of tanh activation, - to match the original paper - - -Can I use? ----------- - -- Recurrent policies: ❌ -- Multi processing: ❌ -- Gym spaces: - - -============= ====== =========== -Space Action Observation -============= ====== =========== -Discrete ❌ ✔️ -Box ✔️ ✔️ -MultiDiscrete ❌ ✔️ -MultiBinary ❌ ✔️ -============= ====== =========== - - -Example -------- - -.. 
code-block:: python - - import gym - import numpy as np - - from stable_baselines import TD3 - from stable_baselines.td3.policies import MlpPolicy - from stable_baselines.common.vec_env import DummyVecEnv - from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise - - env = gym.make('Pendulum-v0') - - # The noise objects for TD3 - n_actions = env.action_space.shape[-1] - action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)) - - model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1) - model.learn(total_timesteps=50000, log_interval=10) - model.save("td3_pendulum") - - del model # remove to demonstrate saving and loading - - model = TD3.load("td3_pendulum") - - obs = env.reset() - while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - -Parameters ----------- - -.. autoclass:: TD3 - :members: - :inherited-members: - -.. _td3_policies: - -TD3 Policies -------------- - -.. autoclass:: MlpPolicy - :members: - :inherited-members: - - -.. autoclass:: LnMlpPolicy - :members: - :inherited-members: - - -.. autoclass:: CnnPolicy - :members: - :inherited-members: - - -.. autoclass:: LnCnnPolicy - :members: - :inherited-members: - - -Custom Policy Network ---------------------- - -Similarly to the example given in the `examples <../guide/custom_policy.html>`_ page. -You can easily define a custom architecture for the policy network: - -.. 
code-block:: python - - import gym - import numpy as np - - from stable_baselines import TD3 - from stable_baselines.td3.policies import FeedForwardPolicy - from stable_baselines.common.vec_env import DummyVecEnv - from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise - - # Custom MLP policy with two layers - class CustomTD3Policy(FeedForwardPolicy): - def __init__(self, *args, **kwargs): - super(CustomTD3Policy, self).__init__(*args, **kwargs, - layers=[400, 300], - layer_norm=False, - feature_extraction="mlp") - - # Create and wrap the environment - env = gym.make('Pendulum-v0') - env = DummyVecEnv([lambda: env]) - - # The noise objects for TD3 - n_actions = env.action_space.shape[-1] - action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)) - - - model = TD3(CustomTD3Policy, env, action_noise=action_noise, verbose=1) - # Train the agent - model.learn(total_timesteps=80000) - - -Callbacks - Accessible Variables --------------------------------- - -Depending on initialization parameters and timestep, different variables are accessible. -Variables accessible "From timestep X" are variables that can be accessed when -``self.timestep==X`` in the ``on_step`` function. 
- -+--------------------------------+-----------------------------------------------------+ -|Variable | Availability| -+================================+=====================================================+ -|- self |From timestep 1 | -|- total_timesteps | | -|- callback | | -|- log_interval | | -|- tb_log_name | | -|- reset_num_timesteps | | -|- replay_wrapper | | -|- new_tb_log | | -|- writer | | -|- current_lr | | -|- start_time | | -|- episode_rewards | | -|- episode_successes | | -|- obs | | -|- n_updates | | -|- infos_values | | -|- step | | -|- unscaled_action | | -|- action | | -|- new_obs | | -|- reward | | -|- done | | -|- info | | -+--------------------------------+-----------------------------------------------------+ -|- obs\_ |From timestep 2 | -|- new_obs\_ | | -|- reward\_ | | -|- maybe_ep_info | | -|- mean_reward | | -|- num_episodes | | -+--------------------------------+-----------------------------------------------------+ -|- mb_infos_vals |After timestep train_freq steps | -|- grad_step | | -+--------------------------------+-----------------------------------------------------+ -|- frac |After timestep train_freq steps | -| |After at least batch_size and learning_starts steps | -+--------------------------------+-----------------------------------------------------+ -|- maybe_is_success |After the first episode | -+--------------------------------+-----------------------------------------------------+ - diff --git a/docs/modules/trpo.rst b/docs/modules/trpo.rst deleted file mode 100644 index 9a101b15..00000000 --- a/docs/modules/trpo.rst +++ /dev/null @@ -1,127 +0,0 @@ -.. _trpo: - -.. automodule:: stable_baselines.trpo_mpi - - -TRPO -==== - -`Trust Region Policy Optimization (TRPO) `_ -is an iterative approach for optimizing policies with guaranteed monotonic improvement. - -.. note:: - - TRPO requires :ref:`OpenMPI `. If OpenMPI isn't enabled, then TRPO isn't - imported into the ``stable_baselines`` module. 
- -Notes ------ - -- Original paper: https://arxiv.org/abs/1502.05477 -- OpenAI blog post: https://blog.openai.com/openai-baselines-ppo/ -- ``mpirun -np 16 python -m stable_baselines.trpo_mpi.run_atari`` runs the algorithm - for 40M frames = 10M timesteps on an Atari game. See help (``-h``) for more options. -- ``python -m stable_baselines.trpo_mpi.run_mujoco`` runs the algorithm for 1M timesteps on a Mujoco environment. - -Can I use? ----------- - -- Recurrent policies: ❌ -- Multi processing: ✔️ (using MPI) -- Gym spaces: - - -============= ====== =========== -Space Action Observation -============= ====== =========== -Discrete ✔️ ✔️ -Box ✔️ ✔️ -MultiDiscrete ✔️ ✔️ -MultiBinary ✔️ ✔️ -============= ====== =========== - - -Example -------- - -.. code-block:: python - - import gym - - from stable_baselines.common.policies import MlpPolicy - from stable_baselines import TRPO - - env = gym.make('CartPole-v1') - - model = TRPO(MlpPolicy, env, verbose=1) - model.learn(total_timesteps=25000) - model.save("trpo_cartpole") - - del model # remove to demonstrate saving and loading - - model = TRPO.load("trpo_cartpole") - - obs = env.reset() - while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() - - -Parameters ----------- - -.. autoclass:: TRPO - :members: - :inherited-members: - -Callbacks - Accessible Variables --------------------------------- - -Depending on initialization parameters and timestep, different variables are accessible. -Variables accessible "From timestep X" are variables that can be accessed when -``self.timestep==X`` in the ``on_step`` function. 
- -+--------------------------------+-----------------------------------------------------+ -|Variable | Availability| -+================================+=====================================================+ -|- total_timesteps |From timestep 0 | -|- callback | | -|- log_interval | | -|- tb_log_name | | -|- reset_num_timesteps | | -|- new_tb_log | | -|- writer | | -|- self | | -|- policy | | -|- env | | -|- horizon | | -|- reward_giver | | -|- gail | | -|- step | | -|- cur_ep_ret | | -|- current_it_len | | -|- current_ep_len | | -|- cur_ep_true_ret | | -|- ep_true_rets | | -|- ep_rets | | -|- ep_lens | | -|- observations | | -|- true_rewards | | -|- rewards | | -|- vpreds | | -|- episode_starts | | -|- dones | | -|- actions | | -|- states | | -|- episode_start | | -|- done | | -|- vpred | | -|- clipped_action | | -|- reward | | -|- true_reward | | -|- info | | -|- action | | -|- observation | | -|- maybe_ep_info | | -+--------------------------------+-----------------------------------------------------+ diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 99a688c6..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -gym -pandas diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt deleted file mode 100644 index b5a3c560..00000000 --- a/docs/spelling_wordlist.txt +++ /dev/null @@ -1,110 +0,0 @@ -py -env -atari -argparse -Argparse -TensorFlow -feedforward -envs -VecEnv -pretrain -petrained -tf -np -mujoco -cpu -ndarray -ndarrays -timestep -timesteps -stepsize -dataset -adam -fn -normalisation -Kullback -Leibler -boolean -deserialized -pretrained -minibatch -subprocesses -ArgumentParser -Tensorflow -Gaussian -approximator -minibatches -hyperparameters -hyperparameter -vectorized -rl -colab -dataloader -npz -datasets -vf -logits -num -Utils -backpropagate -prepend -NaN -preprocessing -Cloudpickle -async -multiprocess -tensorflow -mlp -cnn -neglogp -tanh -coef -repo -Huber -params -ppo -arxiv -Arxiv 
-func -DQN -Uhlenbeck -Ornstein -multithread -cancelled -Tensorboard -parallelize -customising -serializable -Multiprocessed -cartpole -toolset -lstm -rescale -ffmpeg -avconv -unnormalized -Github -pre -preprocess -backend -attr -preprocess -Antonin -Raffin -araffin -Homebrew -Numpy -Theano -rollout -kfac -Piecewise -csv -nvidia -visdom -tensorboard -preprocessed -namespace -sklearn -GoalEnv -BaseCallback -Keras diff --git a/scripts/build_docker.sh b/scripts/build_docker.sh deleted file mode 100755 index e70df658..00000000 --- a/scripts/build_docker.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -CPU_PARENT=ubuntu:16.04 -GPU_PARENT=nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04 - -TAG=stablebaselines/stable-baselines -VERSION=$(cat ./stable_baselines/version.txt) - -if [[ ${USE_GPU} == "True" ]]; then - PARENT=${GPU_PARENT} -else - PARENT=${CPU_PARENT} - TAG="${TAG}-cpu" -fi - -docker build --build-arg PARENT_IMAGE=${PARENT} --build-arg USE_GPU=${USE_GPU} -t ${TAG}:${VERSION} . -docker tag ${TAG}:${VERSION} ${TAG}:latest - -if [[ ${RELEASE} == "True" ]]; then - docker push ${TAG}:${VERSION} - docker push ${TAG}:latest -fi diff --git a/scripts/run_docker_cpu.sh b/scripts/run_docker_cpu.sh deleted file mode 100755 index 355490e9..00000000 --- a/scripts/run_docker_cpu.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -# Launch an experiment using the docker cpu image - -cmd_line="$@" - -echo "Executing in the docker (cpu image):" -echo $cmd_line - -docker run -it --rm --network host --ipc=host \ - --mount src=$(pwd),target=/root/code/stable-baselines,type=bind stablebaselines/stable-baselines-cpu:v2.10.0 \ - bash -c "cd /root/code/stable-baselines/ && $cmd_line" diff --git a/scripts/run_docker_gpu.sh b/scripts/run_docker_gpu.sh deleted file mode 100755 index a5783470..00000000 --- a/scripts/run_docker_gpu.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -# Launch an experiment using the docker gpu image - -cmd_line="$@" - -echo "Executing in the docker (gpu image):" -echo 
$cmd_line - -# TODO: always use new-style once sufficiently widely used (probably 2021 onwards) -if [ -x "$(which nvidia-docker)" ]; then - # old-style nvidia-docker2 - NVIDIA_ARG="--runtime=nvidia" -else - NVIDIA_ARG="--gpus all" -fi - -docker run -it ${NVIDIA_ARG} --rm --network host --ipc=host \ - --mount src=$(pwd),target=/root/code/stable-baselines,type=bind stablebaselines/stable-baselines:v2.10.0 \ - bash -c "cd /root/code/stable-baselines/ && $cmd_line" diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh deleted file mode 100755 index d765f3ad..00000000 --- a/scripts/run_tests.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -python -m pytest --cov-config .coveragerc --cov-report html --cov-report term --cov=. -v diff --git a/scripts/run_tests_travis.sh b/scripts/run_tests_travis.sh deleted file mode 100755 index 76383de0..00000000 --- a/scripts/run_tests_travis.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -DOCKER_CMD="docker run -it --rm --network host --ipc=host --mount src=$(pwd),target=/root/code/stable-baselines,type=bind" -BASH_CMD="cd /root/code/stable-baselines/" - -if [[ $# -ne 1 ]]; then - echo "usage: $0 " - exit 1 -fi - -if [[ ${DOCKER_IMAGE} = "" ]]; then - echo "Need DOCKER_IMAGE environment variable to be set." - exit 1 -fi - -TEST_GLOB=$1 - -set -e # exit immediately on any error - -# For pull requests from fork, Codacy token is not available, leading to build failure -if [[ ${CODACY_PROJECT_TOKEN} = "" ]]; then - echo "WARNING: CODACY_PROJECT_TOKEN not set. Skipping Codacy upload." - echo "(This is normal when building in a fork and can be ignored.)" - ${DOCKER_CMD} ${DOCKER_IMAGE} \ - bash -c "${BASH_CMD} && \ - pytest --cov-config .coveragerc --cov-report term --cov=. -v tests/test_${TEST_GLOB}" -else - ${DOCKER_CMD} --env CODACY_PROJECT_TOKEN=${CODACY_PROJECT_TOKEN} ${DOCKER_IMAGE} \ - bash -c "${BASH_CMD} && \ - pytest --cov-config .coveragerc --cov-report term --cov-report xml --cov=. 
-v tests/test_${TEST_GLOB} && \ - java -jar /root/code/codacy-coverage-reporter.jar report -l python -r coverage.xml --partial" -fi diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 39378e81..00000000 --- a/setup.cfg +++ /dev/null @@ -1,22 +0,0 @@ -[metadata] -# This includes the license file in the wheel. -license_file = LICENSE - -[tool:pytest] -# Deterministic ordering for tests; useful for pytest-xdist. -env = - PYTHONHASHSEED=0 -filterwarnings = - ignore:inspect.getargspec:DeprecationWarning:tensorflow - ignore::pytest.PytestUnknownMarkWarning - # Tensorflow internal warnings - ignore:builtin type EagerTensor has no __module__ attribute:DeprecationWarning - ignore:The binary mode of fromstring is deprecated:DeprecationWarning - ignore::FutureWarning:tensorflow - # Gym warnings - ignore:Parameters to load are deprecated.:DeprecationWarning - ignore:the imp module is deprecated in favour of importlib:PendingDeprecationWarning - -[pytype] -inputs = stable_baselines -; python_version = 3.5 diff --git a/setup.py b/setup.py deleted file mode 100644 index acfbb295..00000000 --- a/setup.py +++ /dev/null @@ -1,168 +0,0 @@ -import os -import sys -import subprocess -from setuptools import setup, find_packages -from distutils.version import LooseVersion - -if sys.version_info.major != 3: - print('This Python is only compatible with Python 3, but you are running ' - 'Python {}. 
The installation will likely fail.'.format(sys.version_info.major)) - -# Read version from file -with open(os.path.join('stable_baselines', 'version.txt'), 'r') as file_handler: - __version__ = file_handler.read().strip() - - -# Check tensorflow installation to avoid -# breaking pre-installed tf gpu -def find_tf_dependency(): - install_tf, tf_gpu = False, False - try: - import tensorflow as tf - if tf.__version__ < LooseVersion('1.8.0'): - install_tf = True - # check if a gpu version is needed - tf_gpu = tf.test.is_gpu_available() - except ImportError: - install_tf = True - # Check if a nvidia gpu is present - for command in ['nvidia-smi', '/usr/bin/nvidia-smi', 'nvidia-smi.exe']: - try: - if subprocess.call([command]) == 0: - tf_gpu = True - break - except IOError: # command does not exist / is not executable - pass - if os.environ.get('USE_GPU') == 'True': # force GPU even if not auto-detected - tf_gpu = True - - tf_dependency = [] - if install_tf: - tf_dependency = ['tensorflow-gpu>=1.8.0,<2.0.0'] if tf_gpu else ['tensorflow>=1.8.0,<2.0.0'] - if tf_gpu: - print("A GPU was detected, tensorflow-gpu will be installed") - - return tf_dependency - - -long_description = """ -**WARNING: This package is in maintenance mode, please use [Stable-Baselines3 (SB3)](https://github.com/DLR-RM/stable-baselines3) for an up-to-date version. 
You can find a [migration guide](https://stable-baselines3.readthedocs.io/en/master/guide/migration.html) in SB3 documentation.** - -[![Build Status](https://travis-ci.com/hill-a/stable-baselines.svg?branch=master)](https://travis-ci.com/hill-a/stable-baselines) [![Documentation Status](https://readthedocs.org/projects/stable-baselines/badge/?version=master)](https://stable-baselines.readthedocs.io/en/master/?badge=master) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/3bcb4cd6d76a4270acb16b5fe6dd9efa)](https://www.codacy.com/app/baselines_janitors/stable-baselines?utm_source=github.com&utm_medium=referral&utm_content=hill-a/stable-baselines&utm_campaign=Badge_Grade) [![Codacy Badge](https://api.codacy.com/project/badge/Coverage/3bcb4cd6d76a4270acb16b5fe6dd9efa)](https://www.codacy.com/app/baselines_janitors/stable-baselines?utm_source=github.com&utm_medium=referral&utm_content=hill-a/stable-baselines&utm_campaign=Badge_Coverage) - -# Stable Baselines - -Stable Baselines is a set of improved implementations of reinforcement learning algorithms based on OpenAI [Baselines](https://github.com/openai/baselines/). - -These algorithms will make it easier for the research community and industry to replicate, refine, and identify new ideas, and will create good baselines to build projects on top of. We expect these tools will be used as a base around which new ideas can be added, and as a tool for comparing a new approach against existing ones. We also hope that the simplicity of these tools will allow beginners to experiment with a more advanced toolset, without being buried in implementation details. 
- -## Main differences with OpenAI Baselines -This toolset is a fork of OpenAI Baselines, with a major structural refactoring, and code cleanups: - -- Unified structure for all algorithms -- PEP8 compliant (unified code style) -- Documented functions and classes -- More tests & more code coverage -- Additional algorithms: SAC and TD3 (+ HER support for DQN, DDPG, SAC and TD3) - -## Links - -Repository: -https://github.com/hill-a/stable-baselines - -Medium article: -https://medium.com/@araffin/df87c4b2fc82 - -Documentation: -https://stable-baselines.readthedocs.io/en/master/ - -RL Baselines Zoo: -https://github.com/araffin/rl-baselines-zoo - -## Quick example - -Most of the library tries to follow a sklearn-like syntax for the Reinforcement Learning algorithms using Gym. - -Here is a quick example of how to train and run PPO2 on a cartpole environment: - -```python -import gym - -from stable_baselines.common.policies import MlpPolicy -from stable_baselines.common.vec_env import DummyVecEnv -from stable_baselines import PPO2 - -env = gym.make('CartPole-v1') -# Optional: PPO2 requires a vectorized environment to run -# the env is now wrapped automatically when passing it to the constructor -# env = DummyVecEnv([lambda: env]) - -model = PPO2(MlpPolicy, env, verbose=1) -model.learn(total_timesteps=10000) - -obs = env.reset() -for i in range(1000): - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() -``` - -Or just train a model with a one liner if [the environment is registered in Gym](https://github.com/openai/gym/wiki/Environments) and if [the policy is registered](https://stable-baselines.readthedocs.io/en/master/guide/custom_policy.html): - -```python -from stable_baselines import PPO2 - -model = PPO2('MlpPolicy', 'CartPole-v1').learn(10000) -``` - -""" - -setup(name='stable_baselines', - packages=[package for package in find_packages() - if package.startswith('stable_baselines')], - package_data={ - 
'stable_baselines': ['py.typed', 'version.txt'], - }, - install_requires=[ - 'gym[atari,classic_control]==0.19.0', - 'scipy', - 'joblib', - 'cloudpickle>=0.5.5', - 'opencv-python', - 'numpy', - 'pandas', - 'matplotlib' - ] + find_tf_dependency(), - extras_require={ - 'mpi': [ - 'mpi4py', - ], - 'tests': [ - 'pytest', - 'pytest-cov', - 'pytest-env', - 'pytest-xdist', - 'pytype', - ], - 'docs': [ - 'sphinx', - 'sphinx-autobuild', - 'sphinx-rtd-theme' - ] - }, - description='A fork of OpenAI Baselines, implementations of reinforcement learning algorithms.', - author='Ashley Hill', - url='https://github.com/hill-a/stable-baselines', - author_email='ashley.hill@u-psud.fr', - keywords="reinforcement-learning-algorithms reinforcement-learning machine-learning " - "gym openai baselines toolbox python data-science", - license="MIT", - long_description=long_description, - long_description_content_type='text/markdown', - version=__version__, - ) - -# python setup.py sdist -# python setup.py bdist_wheel -# twine upload --repository-url https://test.pypi.org/legacy/ dist/* -# twine upload dist/* diff --git a/stable_baselines/__init__.py b/stable_baselines/__init__.py deleted file mode 100644 index df22523a..00000000 --- a/stable_baselines/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -import warnings - -from stable_baselines.a2c import A2C -from stable_baselines.acer import ACER -from stable_baselines.acktr import ACKTR -from stable_baselines.deepq import DQN -from stable_baselines.her import HER -from stable_baselines.ppo2 import PPO2 -from stable_baselines.td3 import TD3 -from stable_baselines.sac import SAC - -# Load mpi4py-dependent algorithms only if mpi is installed. 
-try: - import mpi4py -except ImportError: - mpi4py = None - -if mpi4py is not None: - from stable_baselines.ddpg import DDPG - from stable_baselines.gail import GAIL - from stable_baselines.ppo1 import PPO1 - from stable_baselines.trpo_mpi import TRPO -del mpi4py - -# Read version from file -version_file = os.path.join(os.path.dirname(__file__), "version.txt") -with open(version_file, "r") as file_handler: - __version__ = file_handler.read().strip() - - -warnings.warn( - "stable-baselines is in maintenance mode, please use [Stable-Baselines3 (SB3)](https://github.com/DLR-RM/stable-baselines3) for an up-to-date version. You can find a [migration guide](https://stable-baselines3.readthedocs.io/en/master/guide/migration.html) in SB3 documentation." -) diff --git a/stable_baselines/a2c/__init__.py b/stable_baselines/a2c/__init__.py deleted file mode 100644 index d68abe7e..00000000 --- a/stable_baselines/a2c/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from stable_baselines.a2c.a2c import A2C diff --git a/stable_baselines/a2c/a2c.py b/stable_baselines/a2c/a2c.py deleted file mode 100644 index 45a96fa4..00000000 --- a/stable_baselines/a2c/a2c.py +++ /dev/null @@ -1,413 +0,0 @@ -import time - -import gym -import numpy as np -import tensorflow as tf - -from stable_baselines import logger -from stable_baselines.common import explained_variance, tf_util, ActorCriticRLModel, SetVerbosity, TensorboardWriter -from stable_baselines.common.policies import ActorCriticPolicy, RecurrentActorCriticPolicy -from stable_baselines.common.runners import AbstractEnvRunner -from stable_baselines.common.schedules import Scheduler -from stable_baselines.common.tf_util import mse, total_episode_reward_logger -from stable_baselines.common.math_util import safe_mean - - -def discount_with_dones(rewards, dones, gamma): - """ - Apply the discount value to the reward, where the environment is not done - - :param rewards: ([float]) The rewards - :param dones: ([bool]) Whether an environment is done or 
not - :param gamma: (float) The discount value - :return: ([float]) The discounted rewards - """ - discounted = [] - ret = 0 # Return: discounted reward - for reward, done in zip(rewards[::-1], dones[::-1]): - ret = reward + gamma * ret * (1. - done) # fixed off by one bug - discounted.append(ret) - return discounted[::-1] - - -class A2C(ActorCriticRLModel): - """ - The A2C (Advantage Actor Critic) model class, https://arxiv.org/abs/1602.01783 - - :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...) - :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) - :param gamma: (float) Discount factor - :param n_steps: (int) The number of steps to run for each environment per update - (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel) - :param vf_coef: (float) Value function coefficient for the loss calculation - :param ent_coef: (float) Entropy coefficient for the loss calculation - :param max_grad_norm: (float) The maximum value for the gradient clipping - :param learning_rate: (float) The learning rate - :param alpha: (float) RMSProp decay parameter (default: 0.99) - :param momentum: (float) RMSProp momentum parameter (default: 0.0) - :param epsilon: (float) RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) - (default: 1e-5) - :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', - 'double_linear_con', 'middle_drop' or 'double_middle_drop') - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) - :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance - (used only for loading) - :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - 
:param full_tensorboard_log: (bool) enable additional logging when using tensorboard - WARNING: this logging can take a lot of space quickly - :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). - If None (default), use random seed. Note that if you want completely deterministic - results, you must set `n_cpu_tf_sess` to 1. - :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations - If None, the number of cpu of the current machine will be used. - """ - - def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.01, max_grad_norm=0.5, - learning_rate=7e-4, alpha=0.99, momentum=0.0, epsilon=1e-5, lr_schedule='constant', - verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, - full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None): - - self.n_steps = n_steps - self.gamma = gamma - self.vf_coef = vf_coef - self.ent_coef = ent_coef - self.max_grad_norm = max_grad_norm - self.alpha = alpha - self.momentum = momentum - self.epsilon = epsilon - self.lr_schedule = lr_schedule - self.learning_rate = learning_rate - self.tensorboard_log = tensorboard_log - self.full_tensorboard_log = full_tensorboard_log - - self.learning_rate_ph = None - self.n_batch = None - self.actions_ph = None - self.advs_ph = None - self.rewards_ph = None - self.pg_loss = None - self.vf_loss = None - self.entropy = None - self.apply_backprop = None - self.train_model = None - self.step_model = None - self.proba_step = None - self.value = None - self.initial_state = None - self.learning_rate_schedule = None - self.summary = None - - super(A2C, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True, - _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs, - seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) - - # if we are loading, it is possible the environment is not known, however the obs and action space are known - if _init_setup_model: - self.setup_model() - - def 
_make_runner(self) -> AbstractEnvRunner: - return A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma) - - def _get_pretrain_placeholders(self): - policy = self.train_model - if isinstance(self.action_space, gym.spaces.Discrete): - return policy.obs_ph, self.actions_ph, policy.policy - return policy.obs_ph, self.actions_ph, policy.deterministic_action - - def setup_model(self): - with SetVerbosity(self.verbose): - - assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \ - "instance of common.policies.ActorCriticPolicy." - - self.graph = tf.Graph() - with self.graph.as_default(): - self.set_random_seed(self.seed) - self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) - - self.n_batch = self.n_envs * self.n_steps - - n_batch_step = None - n_batch_train = None - if issubclass(self.policy, RecurrentActorCriticPolicy): - n_batch_step = self.n_envs - n_batch_train = self.n_envs * self.n_steps - - step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, - n_batch_step, reuse=False, **self.policy_kwargs) - - with tf.variable_scope("train_model", reuse=True, - custom_getter=tf_util.outer_scope_getter("train_model")): - train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, - self.n_steps, n_batch_train, reuse=True, **self.policy_kwargs) - - with tf.variable_scope("loss", reuse=False): - self.actions_ph = train_model.pdtype.sample_placeholder([None], name="action_ph") - self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph") - self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph") - self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") - - neglogpac = train_model.proba_distribution.neglogp(self.actions_ph) - self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy()) - self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac) - self.vf_loss = 
mse(tf.squeeze(train_model.value_flat), self.rewards_ph) - # https://arxiv.org/pdf/1708.04782.pdf#page=9, https://arxiv.org/pdf/1602.01783.pdf#page=4 - # and https://github.com/dennybritz/reinforcement-learning/issues/34 - # suggest to add an entropy component in order to improve exploration. - loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef - - tf.summary.scalar('entropy_loss', self.entropy) - tf.summary.scalar('policy_gradient_loss', self.pg_loss) - tf.summary.scalar('value_function_loss', self.vf_loss) - tf.summary.scalar('loss', loss) - - self.params = tf_util.get_trainable_vars("model") - grads = tf.gradients(loss, self.params) - if self.max_grad_norm is not None: - grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm) - grads = list(zip(grads, self.params)) - - with tf.variable_scope("input_info", reuse=False): - tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) - tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) - tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) - if self.full_tensorboard_log: - tf.summary.histogram('discounted_rewards', self.rewards_ph) - tf.summary.histogram('learning_rate', self.learning_rate_ph) - tf.summary.histogram('advantage', self.advs_ph) - if tf_util.is_image(self.observation_space): - tf.summary.image('observation', train_model.obs_ph) - else: - tf.summary.histogram('observation', train_model.obs_ph) - - trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha, - epsilon=self.epsilon, momentum=self.momentum) - self.apply_backprop = trainer.apply_gradients(grads) - - self.train_model = train_model - self.step_model = step_model - self.step = step_model.step - self.proba_step = step_model.proba_step - self.value = step_model.value - self.initial_state = step_model.initial_state - tf.global_variables_initializer().run(session=self.sess) - - self.summary = tf.summary.merge_all() - - def _train_step(self, 
obs, states, rewards, masks, actions, values, update, writer=None): - """ - applies a training step to the model - - :param obs: ([float]) The input observations - :param states: ([float]) The states (used for recurrent policies) - :param rewards: ([float]) The rewards from the environment - :param masks: ([bool]) Whether or not the episode is over (used for recurrent policies) - :param actions: ([float]) The actions taken - :param values: ([float]) The logits values - :param update: (int) the current step iteration - :param writer: (TensorFlow Summary.writer) the writer for tensorboard - :return: (float, float, float) policy loss, value loss, policy entropy - """ - advs = rewards - values - cur_lr = None - for _ in range(len(obs)): - cur_lr = self.learning_rate_schedule.value() - assert cur_lr is not None, "Error: the observation input array cannon be empty" - - td_map = {self.train_model.obs_ph: obs, self.actions_ph: actions, self.advs_ph: advs, - self.rewards_ph: rewards, self.learning_rate_ph: cur_lr} - if states is not None: - td_map[self.train_model.states_ph] = states - td_map[self.train_model.dones_ph] = masks - - if writer is not None: - # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...) 
- if self.full_tensorboard_log and (1 + update) % 10 == 0: - run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - run_metadata = tf.RunMetadata() - summary, policy_loss, value_loss, policy_entropy, _ = self.sess.run( - [self.summary, self.pg_loss, self.vf_loss, self.entropy, self.apply_backprop], - td_map, options=run_options, run_metadata=run_metadata) - writer.add_run_metadata(run_metadata, 'step%d' % (update * self.n_batch)) - else: - summary, policy_loss, value_loss, policy_entropy, _ = self.sess.run( - [self.summary, self.pg_loss, self.vf_loss, self.entropy, self.apply_backprop], td_map) - writer.add_summary(summary, update * self.n_batch) - - else: - policy_loss, value_loss, policy_entropy, _ = self.sess.run( - [self.pg_loss, self.vf_loss, self.entropy, self.apply_backprop], td_map) - - return policy_loss, value_loss, policy_entropy - - def set_env(self,env): - super().set_env(env) - self.n_batch = self.n_envs * self.n_steps - - def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A2C", - reset_num_timesteps=True): - - new_tb_log = self._init_num_timesteps(reset_num_timesteps) - callback = self._init_callback(callback) - - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ - as writer: - self._setup_learn() - self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, - schedule=self.lr_schedule) - - t_start = time.time() - callback.on_training_start(locals(), globals()) - - for update in range(1, total_timesteps // self.n_batch + 1): - - callback.on_rollout_start() - # true_reward is the reward without discount - rollout = self.runner.run(callback) - # unpack - obs, states, rewards, masks, actions, values, ep_infos, true_reward = rollout - callback.update_locals(locals()) - callback.on_rollout_end() - - # Early stopping due to the callback - if not self.runner.continue_training: - break - - 
self.ep_info_buf.extend(ep_infos) - _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values, - self.num_timesteps // self.n_batch, writer) - n_seconds = time.time() - t_start - fps = int((update * self.n_batch) / n_seconds) - - if writer is not None: - total_episode_reward_logger(self.episode_reward, - true_reward.reshape((self.n_envs, self.n_steps)), - masks.reshape((self.n_envs, self.n_steps)), - writer, self.num_timesteps) - - if self.verbose >= 1 and (update % log_interval == 0 or update == 1): - explained_var = explained_variance(values, rewards) - logger.record_tabular("nupdates", update) - logger.record_tabular("total_timesteps", self.num_timesteps) - logger.record_tabular("fps", fps) - logger.record_tabular("policy_entropy", float(policy_entropy)) - logger.record_tabular("value_loss", float(value_loss)) - logger.record_tabular("explained_variance", float(explained_var)) - if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: - logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) - logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) - logger.dump_tabular() - - callback.on_training_end() - return self - - def save(self, save_path, cloudpickle=False): - data = { - "gamma": self.gamma, - "n_steps": self.n_steps, - "vf_coef": self.vf_coef, - "ent_coef": self.ent_coef, - "max_grad_norm": self.max_grad_norm, - "learning_rate": self.learning_rate, - "alpha": self.alpha, - "epsilon": self.epsilon, - "lr_schedule": self.lr_schedule, - "verbose": self.verbose, - "policy": self.policy, - "observation_space": self.observation_space, - "action_space": self.action_space, - "n_envs": self.n_envs, - "n_cpu_tf_sess": self.n_cpu_tf_sess, - "seed": self.seed, - "_vectorize_action": self._vectorize_action, - "policy_kwargs": self.policy_kwargs - } - - params_to_save = self.get_parameters() - - self._save_to_file(save_path, data=data, params=params_to_save, 
cloudpickle=cloudpickle) - - -class A2CRunner(AbstractEnvRunner): - def __init__(self, env, model, n_steps=5, gamma=0.99): - """ - A runner to learn the policy of an environment for an a2c model - - :param env: (Gym environment) The environment to learn from - :param model: (Model) The model to learn - :param n_steps: (int) The number of steps to run for each environment - :param gamma: (float) Discount factor - """ - super(A2CRunner, self).__init__(env=env, model=model, n_steps=n_steps) - self.gamma = gamma - - def _run(self): - """ - Run a learning step of the model - - :return: ([float], [float], [float], [bool], [float], [float]) - observations, states, rewards, masks, actions, values - """ - mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], [] - mb_states = self.states - ep_infos = [] - for _ in range(self.n_steps): - actions, values, states, _ = self.model.step(self.obs, self.states, self.dones) # pytype: disable=attribute-error - mb_obs.append(np.copy(self.obs)) - mb_actions.append(actions) - mb_values.append(values) - mb_dones.append(self.dones) - clipped_actions = actions - # Clip the actions to avoid out of bound error - if isinstance(self.env.action_space, gym.spaces.Box): - clipped_actions = np.clip(actions, self.env.action_space.low, self.env.action_space.high) - obs, rewards, dones, infos = self.env.step(clipped_actions) - - self.model.num_timesteps += self.n_envs - - if self.callback is not None: - # Abort training early - self.callback.update_locals(locals()) - if self.callback.on_step() is False: - self.continue_training = False - # Return dummy values - return [None] * 8 - - for info in infos: - maybe_ep_info = info.get('episode') - if maybe_ep_info is not None: - ep_infos.append(maybe_ep_info) - - self.states = states - self.dones = dones - self.obs = obs - mb_rewards.append(rewards) - mb_dones.append(self.dones) - # batch of steps to batch of rollouts - mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype).swapaxes(1, 
0).reshape(self.batch_ob_shape) - mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(0, 1) - mb_actions = np.asarray(mb_actions, dtype=self.env.action_space.dtype).swapaxes(0, 1) - mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(0, 1) - mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(0, 1) - mb_masks = mb_dones[:, :-1] - mb_dones = mb_dones[:, 1:] - true_rewards = np.copy(mb_rewards) - last_values = self.model.value(self.obs, self.states, self.dones).tolist() # pytype: disable=attribute-error - # discount/bootstrap off value fn - for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): - rewards = rewards.tolist() - dones = dones.tolist() - if dones[-1] == 0: - rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] - else: - rewards = discount_with_dones(rewards, dones, self.gamma) - mb_rewards[n] = rewards - - # convert from [n_env, n_steps, ...] to [n_steps * n_env, ...] - mb_rewards = mb_rewards.reshape(-1, *mb_rewards.shape[2:]) - mb_actions = mb_actions.reshape(-1, *mb_actions.shape[2:]) - mb_values = mb_values.reshape(-1, *mb_values.shape[2:]) - mb_masks = mb_masks.reshape(-1, *mb_masks.shape[2:]) - true_rewards = true_rewards.reshape(-1, *true_rewards.shape[2:]) - return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, ep_infos, true_rewards diff --git a/stable_baselines/a2c/run_atari.py b/stable_baselines/a2c/run_atari.py deleted file mode 100644 index f8f7817e..00000000 --- a/stable_baselines/a2c/run_atari.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 - -from stable_baselines import logger, A2C -from stable_baselines.common.cmd_util import make_atari_env, atari_arg_parser -from stable_baselines.common.vec_env import VecFrameStack -from stable_baselines.common.policies import CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy - - -def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env): - """ - Train A2C model for atari environment, for 
testing purposes - - :param env_id: (str) Environment ID - :param num_timesteps: (int) The total number of samples - :param seed: (int) The initial seed for training - :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) - :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', - 'double_linear_con', 'middle_drop' or 'double_middle_drop') - :param num_env: (int) The number of environments - """ - policy_fn = None - if policy == 'cnn': - policy_fn = CnnPolicy - elif policy == 'lstm': - policy_fn = CnnLstmPolicy - elif policy == 'lnlstm': - policy_fn = CnnLnLstmPolicy - if policy_fn is None: - raise ValueError("Error: policy {} not implemented".format(policy)) - - env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4) - - model = A2C(policy_fn, env, lr_schedule=lr_schedule, seed=seed) - model.learn(total_timesteps=int(num_timesteps * 1.1)) - env.close() - - -def main(): - """ - Runs the test - """ - parser = atari_arg_parser() - parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm'], default='cnn', help='Policy architecture') - parser.add_argument('--lr_schedule', choices=['constant', 'linear'], default='constant', - help='Learning rate schedule') - args = parser.parse_args() - logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, policy=args.policy, lr_schedule=args.lr_schedule, - num_env=16) - - -if __name__ == '__main__': - main() diff --git a/stable_baselines/acer/__init__.py b/stable_baselines/acer/__init__.py deleted file mode 100644 index a81d161a..00000000 --- a/stable_baselines/acer/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from stable_baselines.acer.acer_simple import ACER diff --git a/stable_baselines/acer/acer_simple.py b/stable_baselines/acer/acer_simple.py deleted file mode 100644 index 160f48a3..00000000 --- a/stable_baselines/acer/acer_simple.py +++ /dev/null @@ -1,769 +0,0 @@ -import time -import warnings - -import numpy as np -import 
# For ACER
def get_by_index(input_tensor, idx):
    """
    Return the input tensor, offset by a certain value

    :param input_tensor: (TensorFlow Tensor) The input tensor
    :param idx: (int) The index offset
    :return: (TensorFlow Tensor) the offset tensor
    """
    assert len(input_tensor.get_shape()) == 2
    assert len(idx.get_shape()) == 1
    # Translate 2-D (row, idx) coordinates into indices over the flattened tensor,
    # then gather the selected entries in one pass.
    flat_indices = tf.range(0, input_tensor.shape[0], dtype=tf.int64) * input_tensor.shape[1] + idx
    return tf.gather(tf.reshape(input_tensor, [-1]), flat_indices)


def strip(var, n_envs, n_steps, flat=False):
    """
    Removes the last step in the batch

    :param var: (TensorFlow Tensor) The input Tensor
    :param n_envs: (int) The number of environments
    :param n_steps: (int) The number of steps to run for each environment
    :param flat: (bool) If the input Tensor is flat
    :return: (TensorFlow Tensor) the input tensor, without the last step in the batch
    """
    # Split into n_steps + 1 per-step tensors, drop the final one, and re-batch.
    per_step = batch_to_seq(var, n_envs, n_steps + 1, flat)
    return seq_to_batch(per_step[:-1], flat)
def q_retrace(rewards, dones, q_i, values, rho_i, n_envs, n_steps, gamma):
    """
    Calculates the target Q-retrace

    :param rewards: ([TensorFlow Tensor]) The rewards
    :param dones: ([TensorFlow Tensor])
    :param q_i: ([TensorFlow Tensor]) The Q values for actions taken
    :param values: ([TensorFlow Tensor]) The output of the value functions
    :param rho_i: ([TensorFlow Tensor]) The importance weight for each action
    :param n_envs: (int) The number of environments
    :param n_steps: (int) The number of steps to run for each environment
    :param gamma: (float) The discount value
    :return: ([TensorFlow Tensor]) the target Q-retrace
    """
    # Truncated importance weights and per-step sequences, each of shape [n_envs].
    rho_bar = batch_to_seq(tf.minimum(1.0, rho_i), n_envs, n_steps, True)
    reward_seq = batch_to_seq(rewards, n_envs, n_steps, True)
    done_seq = batch_to_seq(dones, n_envs, n_steps, True)
    q_is = batch_to_seq(q_i, n_envs, n_steps, True)
    value_sequence = batch_to_seq(values, n_envs, n_steps + 1, True)

    # Walk backwards through time, bootstrapping from the value of the final state.
    retrace = value_sequence[-1]
    per_step_targets = []
    for step in reversed(range(n_steps)):
        check_shape([retrace, done_seq[step], reward_seq[step], rho_bar[step], q_is[step], value_sequence[step]],
                    [[n_envs]] * 6)
        retrace = reward_seq[step] + gamma * retrace * (1.0 - done_seq[step])
        # Prepend so the list ends up in chronological order.
        per_step_targets.insert(0, retrace)
        # Retrace correction for the next (earlier) step.
        retrace = (rho_bar[step] * (retrace - q_is[step])) + value_sequence[step]
    return seq_to_batch(per_step_targets, flat=True)
class EpisodeStats:
    def __init__(self, n_steps, n_envs):
        """
        Calculates the episode statistics

        :param n_steps: (int) The number of steps to run for each environment
        :param n_envs: (int) The number of environments
        """
        # One running reward list per environment, reset whenever its episode ends.
        self.episode_rewards = [[] for _ in range(n_envs)]
        self.len_buffer = deque(maxlen=40)  # rolling buffer for episode lengths
        self.rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
        self.n_steps = n_steps
        self.n_envs = n_envs

    def feed(self, rewards, masks):
        """
        Update the latest reward and mask

        :param rewards: ([float]) The new rewards for the new step
        :param masks: ([float]) The new masks for the new step
        """
        rewards = np.reshape(rewards, [self.n_envs, self.n_steps])
        masks = np.reshape(masks, [self.n_envs, self.n_steps])
        for env_idx in range(self.n_envs):
            for step_idx in range(self.n_steps):
                current = self.episode_rewards[env_idx]
                current.append(rewards[env_idx][step_idx])
                if masks[env_idx][step_idx]:
                    # Episode finished: archive its length and total reward, start fresh.
                    self.len_buffer.append(len(current))
                    self.rewbuffer.append(sum(current))
                    self.episode_rewards[env_idx] = []

    def mean_length(self):
        """
        Returns the average length of each episode

        :return: (float)
        """
        # on the first params dump, no episodes are finished
        return np.mean(self.len_buffer) if self.len_buffer else 0

    def mean_reward(self):
        """
        Returns the average reward of each episode

        :return: (float)
        """
        return np.mean(self.rewbuffer) if self.rewbuffer else 0
class ACER(ActorCriticRLModel):
    """
    The ACER (Actor-Critic with Experience Replay) model class, https://arxiv.org/abs/1611.01224

    :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) The discount value
    :param n_steps: (int) The number of steps to run for each environment per update
        (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
    :param num_procs: (int) The number of threads for TensorFlow operations

        .. deprecated:: 2.9.0
            Use `n_cpu_tf_sess` instead.

    :param q_coef: (float) The weight for the loss on the Q value
    :param ent_coef: (float) The weight for the entropy loss
    :param max_grad_norm: (float) The clipping value for the maximum gradient
    :param learning_rate: (float) The initial learning rate for the RMS prop optimizer
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
        'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param rprop_epsilon: (float) RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update)
        (default: 1e-5)
    :param rprop_alpha: (float) RMSProp decay parameter (default: 0.99)
    :param buffer_size: (int) The buffer size in number of steps
    :param replay_ratio: (float) The number of replay learning per on policy learning on average,
        using a poisson distribution
    :param replay_start: (int) The minimum number of steps in the buffer, before learning replay
    :param correction_term: (float) Importance weight clipping factor (default: 10)
    :param trust_region: (bool) Whether or not algorithms estimates the gradient KL divergence
        between the old and updated policy and uses it to determine step size (default: True)
    :param alpha: (float) The decay rate for the Exponential moving average of the parameters
    :param delta: (float) max KL divergence between the old policy and updated policy (default: 1)
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        WARNING: this logging can take a lot of space quickly
    :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
        If None (default), use random seed. Note that if you want completely deterministic
        results, you must set `n_cpu_tf_sess` to 1.
    :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations
        If None, the number of cpu of the current machine will be used.
    """

    def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=None, q_coef=0.5, ent_coef=0.01, max_grad_norm=10,
                 learning_rate=7e-4, lr_schedule='linear', rprop_alpha=0.99, rprop_epsilon=1e-5, buffer_size=5000,
                 replay_ratio=4, replay_start=1000, correction_term=10.0, trust_region=True,
                 alpha=0.99, delta=1, verbose=0, tensorboard_log=None,
                 _init_setup_model=True, policy_kwargs=None,
                 full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1):

        if num_procs is not None:
            warnings.warn("num_procs will be removed in a future version (v3.x.x) "
                          "use n_cpu_tf_sess instead", DeprecationWarning)
            n_cpu_tf_sess = num_procs

        self.n_steps = n_steps
        self.replay_ratio = replay_ratio
        self.buffer_size = buffer_size
        self.replay_start = replay_start
        self.gamma = gamma
        self.alpha = alpha
        self.correction_term = correction_term
        self.q_coef = q_coef
        self.ent_coef = ent_coef
        self.trust_region = trust_region
        self.delta = delta
        self.max_grad_norm = max_grad_norm
        self.rprop_alpha = rprop_alpha
        self.rprop_epsilon = rprop_epsilon
        self.learning_rate = learning_rate
        self.lr_schedule = lr_schedule
        self.tensorboard_log = tensorboard_log
        self.full_tensorboard_log = full_tensorboard_log

        # Placeholders / graph handles populated by setup_model()
        self.action_ph = None
        self.done_ph = None
        self.reward_ph = None
        self.mu_ph = None
        self.learning_rate_ph = None
        self.polyak_model = None
        self.learning_rate_schedule = None
        self.run_ops = None
        self.names_ops = None
        self.train_model = None
        self.step_model = None
        self.proba_step = None
        self.n_act = None
        self.n_batch = None
        self.summary = None

        super(ACER, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
                                   _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
                                   seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

        if _init_setup_model:
            self.setup_model()

    def _make_runner(self) -> AbstractEnvRunner:
        """Build the rollout runner used by learn()."""
        return _Runner(env=self.env, model=self, n_steps=self.n_steps)

    def _get_pretrain_placeholders(self):
        """Return (obs_ph, action_ph, policy_output) used for behavior cloning pretraining."""
        policy = self.step_model
        action_ph = policy.pdtype.sample_placeholder([None])
        if isinstance(self.action_space, Discrete):
            return policy.obs_ph, action_ph, policy.policy
        raise NotImplementedError('Only discrete actions are supported for ACER for now')

    def set_env(self, env):
        """
        Replace the current environment.

        ACER builds its graph for a fixed number of parallel environments,
        so the replacement must match the training env count.
        """
        if env is not None:
            assert self.n_envs == env.num_envs, \
                "Error: the environment passed must have the same number of environments as the model was trained on." \
                "This is due to ACER not being capable of changing the number of environments."

        super().set_env(env)

    def setup_model(self):
        """Build the TensorFlow graph: step/train/polyak models, losses and train op."""
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACER model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            if isinstance(self.action_space, Discrete):
                self.n_act = self.action_space.n
                continuous = False
            elif isinstance(self.action_space, Box):
                # self.n_act = self.action_space.shape[-1]
                # continuous = True
                raise NotImplementedError("WIP: Acer does not support Continuous actions yet.")
            else:
                raise ValueError("Error: ACER does not work with {} actions space.".format(self.action_space))

            self.n_batch = self.n_envs * self.n_steps

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)
                self.set_random_seed(self.seed)
                n_batch_step = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    n_batch_step = self.n_envs
                n_batch_train = self.n_envs * (self.n_steps + 1)

                step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                         n_batch_step, reuse=False, **self.policy_kwargs)

                self.params = tf_util.get_trainable_vars("model")

                with tf.variable_scope("train_model", reuse=True,
                                       custom_getter=tf_util.outer_scope_getter("train_model")):
                    train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs,
                                              self.n_steps + 1, n_batch_train, reuse=True, **self.policy_kwargs)

                with tf.variable_scope("moving_average"):
                    # create averaged model
                    ema = tf.train.ExponentialMovingAverage(self.alpha)
                    ema_apply_op = ema.apply(self.params)

                    def custom_getter(getter, name, *args, **kwargs):
                        # Resolve polyak_model/* variable names to their EMA shadow copies.
                        name = name.replace("polyak_model/", "")
                        val = ema.average(getter(name, *args, **kwargs))
                        return val

                with tf.variable_scope("polyak_model", reuse=True, custom_getter=custom_getter):
                    self.polyak_model = polyak_model = self.policy(self.sess, self.observation_space,
                                                                   self.action_space, self.n_envs,
                                                                   self.n_steps + 1,
                                                                   self.n_envs * (self.n_steps + 1), reuse=True,
                                                                   **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.done_ph = tf.placeholder(tf.float32, [self.n_batch])  # dones
                    self.reward_ph = tf.placeholder(tf.float32, [self.n_batch])  # rewards, not returns
                    self.mu_ph = tf.placeholder(tf.float32, [self.n_batch, self.n_act])  # mu's
                    self.action_ph = train_model.pdtype.sample_placeholder([self.n_batch])
                    self.learning_rate_ph = tf.placeholder(tf.float32, [])
                    eps = 1e-6

                    # Notation: (var) = batch variable, (var)s = sequence variable,
                    # (var)_i = variable index by action at step i
                    # shape is [n_envs * (n_steps + 1)]
                    if continuous:
                        value = train_model.value_flat
                    else:
                        value = tf.reduce_sum(train_model.policy_proba * train_model.q_value, axis=-1)

                    rho, rho_i_ = None, None
                    if continuous:
                        action_ = strip(train_model.proba_distribution.sample(), self.n_envs, self.n_steps)
                        distribution_f = tf.contrib.distributions.MultivariateNormalDiag(
                            loc=strip(train_model.proba_distribution.mean, self.n_envs, self.n_steps),
                            scale_diag=strip(train_model.proba_distribution.logstd, self.n_envs, self.n_steps))
                        f_polyak = tf.contrib.distributions.MultivariateNormalDiag(
                            loc=strip(polyak_model.proba_distribution.mean, self.n_envs, self.n_steps),
                            scale_diag=strip(polyak_model.proba_distribution.logstd, self.n_envs, self.n_steps))

                        f_i = distribution_f.prob(self.action_ph)
                        f_i_ = distribution_f.prob(action_)
                        f_polyak_i = f_polyak.prob(self.action_ph)
                        phi_i = strip(train_model.proba_distribution.mean, self.n_envs, self.n_steps)

                        q_value = strip(train_model.value_fn, self.n_envs, self.n_steps)
                        q_i = q_value[:, 0]

                        rho_i = tf.reshape(f_i, [-1, 1]) / (self.mu_ph + eps)
                        rho_i_ = tf.reshape(f_i_, [-1, 1]) / (self.mu_ph + eps)

                        qret = q_retrace(self.reward_ph, self.done_ph, q_i, value, tf.pow(rho_i, 1 / self.n_act),
                                         self.n_envs, self.n_steps, self.gamma)
                    else:
                        # strip off last step
                        # f is a distribution, chosen to be Gaussian distributions
                        # with fixed diagonal covariance and mean \phi(x)
                        # in the paper
                        distribution_f, f_polyak, q_value = \
                            map(lambda variables: strip(variables, self.n_envs, self.n_steps),
                                [train_model.policy_proba, polyak_model.policy_proba, train_model.q_value])

                        # Get pi and q values for actions taken
                        f_i = get_by_index(distribution_f, self.action_ph)
                        f_i_ = distribution_f
                        phi_i = distribution_f
                        f_polyak_i = f_polyak

                        q_i = get_by_index(q_value, self.action_ph)

                        # Compute ratios for importance truncation
                        rho = distribution_f / (self.mu_ph + eps)
                        rho_i = get_by_index(rho, self.action_ph)

                        # Calculate Q_retrace targets
                        qret = q_retrace(self.reward_ph, self.done_ph, q_i, value, rho_i, self.n_envs, self.n_steps,
                                         self.gamma)

                    # Calculate losses
                    # Entropy
                    entropy = tf.reduce_sum(train_model.proba_distribution.entropy())

                    # Policy Gradient loss, with truncated importance sampling & bias correction
                    value = strip(value, self.n_envs, self.n_steps, True)
                    # check_shape([qret, value, rho_i, f_i], [[self.n_envs * self.n_steps]] * 4)
                    # check_shape([rho, distribution_f, q_value], [[self.n_envs * self.n_steps, self.n_act]] * 2)

                    # Truncated importance sampling
                    adv = qret - value
                    log_f = tf.log(f_i + eps)
                    # [n_envs * n_steps]
                    gain_f = log_f * tf.stop_gradient(adv * tf.minimum(self.correction_term, rho_i))
                    loss_f = -tf.reduce_mean(gain_f)

                    # Bias correction for the truncation
                    adv_bc = (q_value - tf.reshape(value, [self.n_envs * self.n_steps, 1]))  # [n_envs * n_steps, n_act]

                    # check_shape([adv_bc, log_f_bc], [[self.n_envs * self.n_steps, self.n_act]] * 2)
                    if continuous:
                        gain_bc = tf.stop_gradient(adv_bc *
                                                   tf.nn.relu(1.0 - (self.correction_term / (rho_i_ + eps))) *
                                                   f_i_)
                    else:
                        log_f_bc = tf.log(f_i_ + eps)  # / (f_old + eps)
                        gain_bc = tf.reduce_sum(log_f_bc *
                                                tf.stop_gradient(
                                                    adv_bc *
                                                    tf.nn.relu(1.0 - (self.correction_term / (rho + eps))) *
                                                    f_i_),
                                                axis=1)
                    # IMP: This is sum, as expectation wrt f
                    loss_bc = -tf.reduce_mean(gain_bc)

                    loss_policy = loss_f + loss_bc

                    # Value/Q function loss, and explained variance
                    check_shape([qret, q_i], [[self.n_envs * self.n_steps]] * 2)
                    explained_variance = q_explained_variance(tf.reshape(q_i, [self.n_envs, self.n_steps]),
                                                              tf.reshape(qret, [self.n_envs, self.n_steps]))
                    loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

                    # Net loss
                    check_shape([loss_policy, loss_q, entropy], [[]] * 3)
                    loss = loss_policy + self.q_coef * loss_q - self.ent_coef * entropy

                    tf.summary.scalar('entropy_loss', entropy)
                    tf.summary.scalar('policy_gradient_loss', loss_policy)
                    tf.summary.scalar('value_function_loss', loss_q)
                    tf.summary.scalar('loss', loss)

                    norm_grads_q, norm_grads_policy, avg_norm_grads_f = None, None, None
                    avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj = None, None, None, None
                    if self.trust_region:
                        # [n_envs * n_steps, n_act]
                        grad = tf.gradients(- (loss_policy - self.ent_coef * entropy) * self.n_steps * self.n_envs,
                                            phi_i)
                        # [n_envs * n_steps, n_act] # Directly computed gradient of KL divergence wrt f
                        kl_grad = - f_polyak_i / (f_i_ + eps)
                        k_dot_g = tf.reduce_sum(kl_grad * grad, axis=-1)
                        adj = tf.maximum(0.0, (tf.reduce_sum(kl_grad * grad, axis=-1) - self.delta) / (
                                tf.reduce_sum(tf.square(kl_grad), axis=-1) + eps))  # [n_envs * n_steps]

                        # Calculate stats (before doing adjustment) for logging.
                        avg_norm_k = avg_norm(kl_grad)
                        avg_norm_g = avg_norm(grad)
                        avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
                        avg_norm_adj = tf.reduce_mean(tf.abs(adj))

                        grad = grad - tf.reshape(adj, [self.n_envs * self.n_steps, 1]) * kl_grad
                        # These are trust region adjusted gradients wrt f ie statistics of policy pi
                        grads_f = -grad / (self.n_envs * self.n_steps)
                        grads_policy = tf.gradients(f_i_, self.params, grads_f)
                        grads_q = tf.gradients(loss_q * self.q_coef, self.params)
                        grads = [gradient_add(g1, g2, param, verbose=self.verbose)
                                 for (g1, g2, param) in zip(grads_policy, grads_q, self.params)]

                        avg_norm_grads_f = avg_norm(grads_f) * (self.n_steps * self.n_envs)
                        norm_grads_q = tf.global_norm(grads_q)
                        norm_grads_policy = tf.global_norm(grads_policy)
                    else:
                        grads = tf.gradients(loss, self.params)

                    norm_grads = None
                    if self.max_grad_norm is not None:
                        grads, norm_grads = tf.clip_by_global_norm(grads, self.max_grad_norm)
                    grads = list(zip(grads, self.params))

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('rewards', tf.reduce_mean(self.reward_ph))
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate))
                    tf.summary.scalar('advantage', tf.reduce_mean(adv))
                    tf.summary.scalar('action_probability', tf.reduce_mean(self.mu_ph))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('rewards', self.reward_ph)
                        tf.summary.histogram('learning_rate', self.learning_rate)
                        tf.summary.histogram('advantage', adv)
                        tf.summary.histogram('action_probability', self.mu_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation', train_model.obs_ph)

                trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.rprop_alpha,
                                                    epsilon=self.rprop_epsilon)
                _opt_op = trainer.apply_gradients(grads)

                # so when you call _train, you first do the gradient step, then you apply ema
                with tf.control_dependencies([_opt_op]):
                    _train = tf.group(ema_apply_op)

                # Ops/Summaries to run, and their names for logging
                assert norm_grads is not None
                run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, explained_variance, norm_grads]
                names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance',
                             'norm_grads']
                # BUGFIX: previously these were only set inside the trust-region branch,
                # leaving self.run_ops/self.names_ops as None (crash in _train_step)
                # when trust_region=False.
                self.run_ops = run_ops
                self.names_ops = names_ops
                if self.trust_region:
                    self.run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g,
                                              avg_norm_k_dot_g, avg_norm_adj]
                    self.names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k',
                                                  'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj']

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.initial_state = step_model.initial_state

                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()

    def _train_step(self, obs, actions, rewards, dones, mus, states, masks, steps, writer=None):
        """
        applies a training step to the model

        :param obs: ([float]) The input observations
        :param actions: ([float]) The actions taken
        :param rewards: ([float]) The rewards from the environment
        :param dones: ([bool]) Whether or not the episode is over (aligned with reward, used for reward calculation)
        :param mus: ([float]) The logits values
        :param states: ([float]) The states (used for recurrent policies)
        :param masks: ([bool]) Whether or not the episode is over (used for recurrent policies)
        :param steps: (int) the number of steps done so far (can be None)
        :param writer: (TensorFlow Summary.writer) the writer for tensorboard
        :return: ([str], [float]) the list of update operation name, and the list of the results of the operations
        """
        cur_lr = self.learning_rate_schedule.value_steps(steps)
        td_map = {self.train_model.obs_ph: obs, self.polyak_model.obs_ph: obs, self.action_ph: actions,
                  self.reward_ph: rewards, self.done_ph: dones, self.mu_ph: mus, self.learning_rate_ph: cur_lr}

        if states is not None:
            td_map[self.train_model.states_ph] = states
            td_map[self.train_model.dones_ph] = masks
            td_map[self.polyak_model.states_ph] = states
            td_map[self.polyak_model.dones_ph] = masks

        if writer is not None:
            # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...)
            if self.full_tensorboard_log and (1 + (steps / self.n_batch)) % 10 == 0:
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                step_return = self.sess.run([self.summary] + self.run_ops, td_map, options=run_options,
                                            run_metadata=run_metadata)
                writer.add_run_metadata(run_metadata, 'step%d' % steps)
            else:
                step_return = self.sess.run([self.summary] + self.run_ops, td_map)
            writer.add_summary(step_return[0], steps)
            step_return = step_return[1:]
        else:
            step_return = self.sess.run(self.run_ops, td_map)

        return self.names_ops, step_return[1:]  # strip off _train

    def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACER",
              reset_num_timesteps=True):
        """Run the ACER training loop: on-policy rollouts plus Poisson-sampled replay updates."""
        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
                                                    schedule=self.lr_schedule)

            episode_stats = EpisodeStats(self.n_steps, self.n_envs)

            if self.replay_ratio > 0:
                buffer = Buffer(env=self.env, n_steps=self.n_steps, size=self.buffer_size)
            else:
                buffer = None

            t_start = time.time()
            callback.on_training_start(locals(), globals())

            # n_batch samples, 1 on_policy call and multiple off-policy calls
            for steps in range(0, total_timesteps, self.n_batch):

                callback.on_rollout_start()

                enc_obs, obs, actions, rewards, mus, dones, masks = self.runner.run(callback)
                callback.update_locals(locals())
                callback.on_rollout_end()

                # Early stopping due to the callback
                if not self.runner.continue_training:
                    break

                episode_stats.feed(rewards, dones)

                if buffer is not None:
                    buffer.put(enc_obs, actions, rewards, mus, dones, masks)

                if writer is not None:
                    total_episode_reward_logger(self.episode_reward,
                                                rewards.reshape((self.n_envs, self.n_steps)),
                                                dones.reshape((self.n_envs, self.n_steps)),
                                                writer, self.num_timesteps)

                # reshape stuff correctly
                obs = obs.reshape(self.runner.batch_ob_shape)
                actions = actions.reshape([self.n_batch])
                rewards = rewards.reshape([self.n_batch])
                mus = mus.reshape([self.n_batch, self.n_act])
                dones = dones.reshape([self.n_batch])
                masks = masks.reshape([self.runner.batch_ob_shape[0]])

                names_ops, values_ops = self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks,
                                                         self.num_timesteps, writer)

                if self.verbose >= 1 and (int(steps / self.n_batch) % log_interval == 0):
                    logger.record_tabular("total_timesteps", self.num_timesteps)
                    logger.record_tabular("fps", int(steps / (time.time() - t_start)))
                    # IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
                    # not just at the terminal state. Thus, this is mean until end of life, not end of episode.
                    # For true episode rewards, see the monitor files in the log folder.
                    logger.record_tabular("mean_episode_length", episode_stats.mean_length())
                    logger.record_tabular("mean_episode_reward", episode_stats.mean_reward())
                    for name, val in zip(names_ops, values_ops):
                        logger.record_tabular(name, float(val))
                    logger.dump_tabular()

                if (self.replay_ratio > 0 and
                        buffer is not None and
                        buffer.has_atleast(self.replay_start)):
                    samples_number = np.random.poisson(self.replay_ratio)
                    for _ in range(samples_number):
                        # get obs, actions, rewards, mus, dones from buffer.
                        obs, actions, rewards, mus, dones, masks = buffer.get()

                        # reshape stuff correctly
                        obs = obs.reshape(self.runner.batch_ob_shape)
                        actions = actions.reshape([self.n_batch])
                        rewards = rewards.reshape([self.n_batch])
                        mus = mus.reshape([self.n_batch, self.n_act])
                        dones = dones.reshape([self.n_batch])
                        masks = masks.reshape([self.runner.batch_ob_shape[0]])

                        self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks,
                                         self.num_timesteps)

            callback.on_training_end()

        return self

    def save(self, save_path, cloudpickle=False):
        """Serialize hyperparameters and trainable parameters to save_path."""
        data = {
            "gamma": self.gamma,
            "n_steps": self.n_steps,
            "q_coef": self.q_coef,
            "ent_coef": self.ent_coef,
            "max_grad_norm": self.max_grad_norm,
            "learning_rate": self.learning_rate,
            "lr_schedule": self.lr_schedule,
            "rprop_alpha": self.rprop_alpha,
            "rprop_epsilon": self.rprop_epsilon,
            "replay_ratio": self.replay_ratio,
            "replay_start": self.replay_start,
            "verbose": self.verbose,
            "policy": self.policy,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "n_envs": self.n_envs,
            'n_cpu_tf_sess': self.n_cpu_tf_sess,
            'seed': self.seed,
            "_vectorize_action": self._vectorize_action,
            "policy_kwargs": self.policy_kwargs
        }

        params_to_save = self.get_parameters()

        self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
class _Runner(AbstractEnvRunner):
    def __init__(self, env, model, n_steps):
        """
        A runner to learn the policy of an environment for a model

        :param env: (Gym environment) The environment to learn from
        :param model: (Model) The model to learn
        :param n_steps: (int) The number of steps to run for each environment
        """

        super(_Runner, self).__init__(env=env, model=model, n_steps=n_steps)
        self.env = env
        self.model = model
        self.n_env = n_env = env.num_envs
        if isinstance(env.action_space, Discrete):
            self.n_act = env.action_space.n
        else:
            self.n_act = env.action_space.shape[-1]
        self.n_batch = n_env * n_steps

        if len(env.observation_space.shape) > 1:
            # Image observations: keep raw uint8 pixels.
            self.raw_pixels = True
            obs_height, obs_width, obs_num_channels = env.observation_space.shape
            self.batch_ob_shape = (n_env * (n_steps + 1), obs_height, obs_width, obs_num_channels)
            self.obs_dtype = np.uint8
            self.obs = np.zeros((n_env, obs_height, obs_width, obs_num_channels), dtype=self.obs_dtype)
            self.num_channels = obs_num_channels
        else:
            # Vector (or scalar) observations stored as float32.
            if len(env.observation_space.shape) == 1:
                self.obs_dim = env.observation_space.shape[0]
            else:
                self.obs_dim = 1
            self.raw_pixels = False
            if isinstance(self.env.observation_space, Discrete):
                self.batch_ob_shape = (n_env * (n_steps + 1),)
            else:
                self.batch_ob_shape = (n_env * (n_steps + 1), self.obs_dim)
            self.obs_dtype = np.float32

        self.n_steps = n_steps
        self.states = model.initial_state
        self.dones = [False for _ in range(n_env)]

    def _run(self):
        """
        Run a step leaning of the model

        :return: ([float], [float], [int64], [float], [float], [bool], [float])
                 encoded observation, observations, actions, rewards, mus, dones, masks
        """
        enc_obs = [self.obs]
        mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], []
        for _ in range(self.n_steps):
            actions, _, states, _ = self.model.step(self.obs, self.states, self.dones)
            mus = self.model.proba_step(self.obs, self.states, self.dones)
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_mus.append(mus)
            mb_dones.append(self.dones)
            clipped_actions = actions
            # Clip the actions to avoid out of bound error
            if isinstance(self.env.action_space, Box):
                clipped_actions = np.clip(actions, self.env.action_space.low, self.env.action_space.high)
            obs, rewards, dones, _ = self.env.step(clipped_actions)

            self.model.num_timesteps += self.n_envs

            if self.callback is not None:
                # Abort training early
                self.callback.update_locals(locals())
                if self.callback.on_step() is False:
                    self.continue_training = False
                    # Return dummy values
                    return [None] * 7

            # states information for statefull models like LSTM
            self.states = states
            self.dones = dones
            self.obs = obs
            mb_rewards.append(rewards)
            enc_obs.append(obs)
        mb_obs.append(np.copy(self.obs))
        mb_dones.append(self.dones)

        enc_obs = np.asarray(enc_obs, dtype=self.obs_dtype).swapaxes(1, 0)
        mb_obs = np.asarray(mb_obs, dtype=self.obs_dtype).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int64).swapaxes(1, 0)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
        # BUGFIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin `bool` is the documented, backward-compatible replacement.
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)

        mb_masks = mb_dones  # Used for statefull models like LSTM's to mask state when done
        mb_dones = mb_dones[:, 1:]  # Used for calculating returns. The dones array is now aligned with rewards

        # shapes are now [nenv, nsteps, []]
        # When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy.

        return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks
class Buffer(object):
    def __init__(self, env, n_steps, size=50000):
        """
        A buffer for observations, actions, rewards, mu's, states, masks and dones values

        :param env: (Gym environment) The environment to learn from
        :param n_steps: (int) The number of steps to run for each environment
        :param size: (int) The buffer size in number of steps
        """
        self.n_env = env.num_envs
        self.n_steps = n_steps
        self.n_batch = self.n_env * self.n_steps
        # Each loc contains n_env * n_steps frames, thus total buffer is n_env * size frames
        self.size = size // self.n_steps

        if len(env.observation_space.shape) > 1:
            # Image observations are stored as raw uint8 pixels.
            self.raw_pixels = True
            self.height, self.width, self.n_channels = env.observation_space.shape
            self.obs_dtype = np.uint8
        else:
            # Vector (or scalar) observations are stored as float32.
            self.raw_pixels = False
            if len(env.observation_space.shape) == 1:
                self.obs_dim = env.observation_space.shape[-1]
            else:
                self.obs_dim = 1
            self.obs_dtype = np.float32

        # Memory — arrays are lazily allocated on the first put() so their
        # trailing shapes match the incoming rollout batches.
        self.enc_obs = None
        self.actions = None
        self.rewards = None
        self.mus = None
        self.dones = None
        self.masks = None

        # Size indexes: next write slot, and how many slots currently hold data.
        self.next_idx = 0
        self.num_in_buffer = 0

    def has_atleast(self, frames):
        """
        Check to see if the buffer has at least the asked number of frames

        :param frames: (int) The number of frames checked
        :return: (bool) number of frames in buffer >= number asked
        """
        # Frames per env, so total (n_env * frames) Frames needed
        # Each buffer loc has n_env * n_steps frames
        return self.num_in_buffer >= (frames // self.n_steps)

    def can_sample(self):
        """
        Check if the buffer has at least one frame

        :return: (bool) if the buffer has at least one frame
        """
        return self.num_in_buffer > 0

    def decode(self, enc_obs):
        """
        Get the stacked frames of an observation

        :param enc_obs: ([float]) the encoded observation
        :return: ([float]) the decoded observation
        """
        # enc_obs has shape [n_envs, n_steps + 1, nh, nw, nc]
        # dones has shape [n_envs, n_steps, nh, nw, nc]
        # returns stacked obs of shape [n_env, (n_steps + 1), nh, nw, nc]
        n_env, n_steps = self.n_env, self.n_steps
        if self.raw_pixels:
            obs_dim = [self.height, self.width, self.n_channels]
        else:
            obs_dim = [self.obs_dim]

        obs = np.zeros([1, n_steps + 1, n_env] + obs_dim, dtype=self.obs_dtype)
        # [n_steps + nstack, n_env, nh, nw, nc]
        x_var = np.reshape(enc_obs, [n_env, n_steps + 1] + obs_dim).swapaxes(1, 0)
        obs[-1, :] = x_var

        # Move the env axis first and the (single-entry) stack axis just before channels.
        if self.raw_pixels:
            obs = obs.transpose((2, 1, 3, 4, 0, 5))
        else:
            obs = obs.transpose((2, 1, 3, 0))
        return np.reshape(obs, [n_env, (n_steps + 1)] + obs_dim[:-1] + [obs_dim[-1]])

    def put(self, enc_obs, actions, rewards, mus, dones, masks):
        """
        Adds a frame to the buffer

        :param enc_obs: ([float]) the encoded observation
        :param actions: ([float]) the actions
        :param rewards: ([float]) the rewards
        :param mus: ([float]) the policy probability for the actions
        :param dones: ([bool])
        :param masks: ([bool])
        """
        # enc_obs [n_env, (n_steps + n_stack), nh, nw, nc]
        # actions, rewards, dones [n_env, n_steps]
        # mus [n_env, n_steps, n_act]

        if self.enc_obs is None:
            # First insertion: size the storage from the incoming batch shapes.
            # NOTE(review): np.bool was removed in NumPy >= 1.24; this code requires
            # an older NumPy (or changing np.bool to the builtin bool) — confirm.
            self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=self.obs_dtype)
            self.actions = np.empty([self.size] + list(actions.shape), dtype=np.int32)
            self.rewards = np.empty([self.size] + list(rewards.shape), dtype=np.float32)
            self.mus = np.empty([self.size] + list(mus.shape), dtype=np.float32)
            self.dones = np.empty([self.size] + list(dones.shape), dtype=np.bool)
            self.masks = np.empty([self.size] + list(masks.shape), dtype=np.bool)

        self.enc_obs[self.next_idx] = enc_obs
        self.actions[self.next_idx] = actions
        self.rewards[self.next_idx] = rewards
        self.mus[self.next_idx] = mus
        self.dones[self.next_idx] = dones
        self.masks[self.next_idx] = masks

        # Circular buffer: advance write head, clamp the fill counter at capacity.
        self.next_idx = (self.next_idx + 1) % self.size
        self.num_in_buffer = min(self.size, self.num_in_buffer + 1)

    def take(self, arr, idx, envx):
        """
        Reads a frame from a list and index for the asked environment ids

        :param arr: (np.ndarray) the array that is read
        :param idx: ([int]) the idx that are read
        :param envx: ([int]) the idx for the environments
        :return: ([float]) the askes frames from the list
        """
        n_env = self.n_env
        out = np.empty([n_env] + list(arr.shape[2:]), dtype=arr.dtype)
        # One (buffer slot, env) pair per environment.
        for i in range(n_env):
            out[i] = arr[idx[i], envx[i]]
        return out
- idx = np.random.randint(0, self.num_in_buffer, n_env) - envx = np.arange(n_env) - - dones = self.take(self.dones, idx, envx) - enc_obs = self.take(self.enc_obs, idx, envx) - obs = self.decode(enc_obs) - actions = self.take(self.actions, idx, envx) - rewards = self.take(self.rewards, idx, envx) - mus = self.take(self.mus, idx, envx) - masks = self.take(self.masks, idx, envx) - return obs, actions, rewards, mus, dones, masks diff --git a/stable_baselines/acer/run_atari.py b/stable_baselines/acer/run_atari.py deleted file mode 100644 index c09788fe..00000000 --- a/stable_baselines/acer/run_atari.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -import warnings - -from stable_baselines import logger, ACER -from stable_baselines.common.policies import CnnPolicy, CnnLstmPolicy -from stable_baselines.common.cmd_util import make_atari_env, atari_arg_parser -from stable_baselines.common.vec_env import VecFrameStack - - -def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu): - """ - train an ACER model on atari - - :param env_id: (str) Environment ID - :param num_timesteps: (int) The total number of samples - :param seed: (int) The initial seed for training - :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) 
- :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', - 'double_linear_con', 'middle_drop' or 'double_middle_drop') - :param num_cpu: (int) The number of cpu to train on - """ - env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4) - if policy == 'cnn': - policy_fn = CnnPolicy - elif policy == 'lstm': - policy_fn = CnnLstmPolicy - else: - warnings.warn("Policy {} not implemented".format(policy)) - return - - model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000, seed=seed) - model.learn(total_timesteps=int(num_timesteps * 1.1)) - env.close() - # Free memory - del model - - -def main(): - """ - Runs the test - """ - parser = atari_arg_parser() - parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm'], default='cnn', help='Policy architecture') - parser.add_argument('--lr_schedule', choices=['constant', 'linear'], default='constant', - help='Learning rate schedule') - parser.add_argument('--logdir', help='Directory for logging') - args = parser.parse_args() - logger.configure(args.logdir) - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy, lr_schedule=args.lr_schedule, num_cpu=16) - - -if __name__ == '__main__': - main() diff --git a/stable_baselines/acktr/__init__.py b/stable_baselines/acktr/__init__.py deleted file mode 100644 index fcae30ed..00000000 --- a/stable_baselines/acktr/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from stable_baselines.acktr.acktr import ACKTR diff --git a/stable_baselines/acktr/acktr.py b/stable_baselines/acktr/acktr.py deleted file mode 100644 index e8509bd5..00000000 --- a/stable_baselines/acktr/acktr.py +++ /dev/null @@ -1,412 +0,0 @@ -import time -import warnings - -import tensorflow as tf -from gym.spaces import Box, Discrete - -from stable_baselines import logger -from stable_baselines.a2c.a2c import A2CRunner -from stable_baselines.ppo2.ppo2 import Runner as PPO2Runner -from stable_baselines.common.tf_util import 
mse, total_episode_reward_logger -from stable_baselines.acktr import kfac -from stable_baselines.common.schedules import Scheduler -from stable_baselines.common import explained_variance, ActorCriticRLModel, tf_util, SetVerbosity, TensorboardWriter -from stable_baselines.common.policies import ActorCriticPolicy, RecurrentActorCriticPolicy -from stable_baselines.common.math_util import safe_mean - - -class ACKTR(ActorCriticRLModel): - """ - The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class, https://arxiv.org/abs/1708.05144 - - :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...) - :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) - :param gamma: (float) Discount factor - :param nprocs: (int) The number of threads for TensorFlow operations - - .. deprecated:: 2.9.0 - Use `n_cpu_tf_sess` instead. - - :param n_steps: (int) The number of steps to run for each environment - :param ent_coef: (float) The weight for the entropy loss - :param vf_coef: (float) The weight for the loss on the value function - :param vf_fisher_coef: (float) The weight for the fisher loss on the value function - :param learning_rate: (float) The initial learning rate for the RMS prop optimizer - :param max_grad_norm: (float) The clipping value for the maximum gradient - :param kfac_clip: (float) gradient clipping for Kullback-Leibler - :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', - 'double_linear_con', 'middle_drop' or 'double_middle_drop') - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) - :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance - :param async_eigen_decomp: (bool) Use async eigen decomposition - :param kfac_update: 
(int) update kfac after kfac_update steps - :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param gae_lambda: (float) Factor for trade-off of bias vs variance for Generalized Advantage Estimator - If None (default), then the classic advantage will be used instead of GAE - :param full_tensorboard_log: (bool) enable additional logging when using tensorboard - WARNING: this logging can take a lot of space quickly - :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). - If None (default), use random seed. Note that if you want completely deterministic - results, you must set `n_cpu_tf_sess` to 1. - :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations - If None, the number of cpu of the current machine will be used. - """ - - def __init__(self, policy, env, gamma=0.99, nprocs=None, n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, - learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0, - tensorboard_log=None, _init_setup_model=True, async_eigen_decomp=False, kfac_update=1, - gae_lambda=None, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1): - - if nprocs is not None: - warnings.warn("nprocs will be removed in a future version (v3.x.x) " - "use n_cpu_tf_sess instead", DeprecationWarning) - n_cpu_tf_sess = nprocs - - self.n_steps = n_steps - self.gamma = gamma - self.ent_coef = ent_coef - self.vf_coef = vf_coef - self.vf_fisher_coef = vf_fisher_coef - self.kfac_clip = kfac_clip - self.max_grad_norm = max_grad_norm - self.learning_rate = learning_rate - self.lr_schedule = lr_schedule - - self.tensorboard_log = tensorboard_log - self.async_eigen_decomp = async_eigen_decomp - self.full_tensorboard_log = full_tensorboard_log - self.kfac_update = kfac_update - self.gae_lambda = gae_lambda - - self.actions_ph = None - self.advs_ph = None - self.rewards_ph = None - self.learning_rate_ph = None - 
self.step_model = None - self.train_model = None - self.entropy = None - self.pg_loss = None - self.vf_loss = None - self.pg_fisher = None - self.vf_fisher = None - self.joint_fisher = None - self.grads_check = None - self.optim = None - self.train_op = None - self.q_runner = None - self.learning_rate_schedule = None - self.proba_step = None - self.value = None - self.initial_state = None - self.n_batch = None - self.summary = None - self.trained = False - self.continuous_actions = False - - super(ACKTR, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True, - _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs, - seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) - - if _init_setup_model: - self.setup_model() - - def _make_runner(self): - if self.gae_lambda is not None: - return PPO2Runner( - env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.gae_lambda) - else: - return A2CRunner( - self.env, self, n_steps=self.n_steps, gamma=self.gamma) - - def _get_pretrain_placeholders(self): - policy = self.train_model - if isinstance(self.action_space, Discrete): - return policy.obs_ph, self.actions_ph, policy.policy - return policy.obs_ph, self.actions_ph, policy.deterministic_action - - def setup_model(self): - with SetVerbosity(self.verbose): - - assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACKTR model must be " \ - "an instance of common.policies.ActorCriticPolicy." 
- - # Enable continuous actions tricks (normalized advantage) - self.continuous_actions = isinstance(self.action_space, Box) - - self.graph = tf.Graph() - with self.graph.as_default(): - self.set_random_seed(self.seed) - self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) - - n_batch_step = None - n_batch_train = None - if issubclass(self.policy, RecurrentActorCriticPolicy): - n_batch_step = self.n_envs - n_batch_train = self.n_envs * self.n_steps - - step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, - 1, n_batch_step, reuse=False, **self.policy_kwargs) - - self.params = params = tf_util.get_trainable_vars("model") - - with tf.variable_scope("train_model", reuse=True, - custom_getter=tf_util.outer_scope_getter("train_model")): - train_model = self.policy(self.sess, self.observation_space, self.action_space, - self.n_envs, self.n_steps, n_batch_train, - reuse=True, **self.policy_kwargs) - - with tf.variable_scope("loss", reuse=False, custom_getter=tf_util.outer_scope_getter("loss")): - self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None]) - self.rewards_ph = rewards_ph = tf.placeholder(tf.float32, [None]) - self.learning_rate_ph = learning_rate_ph = tf.placeholder(tf.float32, []) - self.actions_ph = train_model.pdtype.sample_placeholder([None]) - - neg_log_prob = train_model.proba_distribution.neglogp(self.actions_ph) - - # training loss - pg_loss = tf.reduce_mean(advs_ph * neg_log_prob) - self.entropy = entropy = tf.reduce_mean(train_model.proba_distribution.entropy()) - self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy - self.vf_loss = vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph) - train_loss = pg_loss + self.vf_coef * vf_loss - - # Fisher loss construction - self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neg_log_prob) - sample_net = train_model.value_fn + tf.random_normal(tf.shape(train_model.value_fn)) - self.vf_fisher = vf_fisher_loss = - self.vf_fisher_coef * 
tf.reduce_mean( - tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2)) - self.joint_fisher = pg_fisher_loss + vf_fisher_loss - - tf.summary.scalar('entropy_loss', self.entropy) - tf.summary.scalar('policy_gradient_loss', pg_loss) - tf.summary.scalar('policy_gradient_fisher_loss', pg_fisher_loss) - tf.summary.scalar('value_function_loss', self.vf_loss) - tf.summary.scalar('value_function_fisher_loss', vf_fisher_loss) - tf.summary.scalar('loss', train_loss) - - self.grads_check = tf.gradients(train_loss, params) - - with tf.variable_scope("input_info", reuse=False): - tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) - tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) - tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) - - if self.full_tensorboard_log: - tf.summary.histogram('discounted_rewards', self.rewards_ph) - tf.summary.histogram('learning_rate', self.learning_rate_ph) - tf.summary.histogram('advantage', self.advs_ph) - if tf_util.is_image(self.observation_space): - tf.summary.image('observation', train_model.obs_ph) - else: - tf.summary.histogram('observation', train_model.obs_ph) - - with tf.variable_scope("kfac", reuse=False, custom_getter=tf_util.outer_scope_getter("kfac")): - with tf.device('/gpu:0'): - self.optim = optim = kfac.KfacOptimizer(learning_rate=learning_rate_ph, clip_kl=self.kfac_clip, - momentum=0.9, kfac_update=self.kfac_update, - epsilon=0.01, stats_decay=0.99, - async_eigen_decomp=self.async_eigen_decomp, - cold_iter=10, - max_grad_norm=self.max_grad_norm, verbose=self.verbose) - - optim.compute_and_apply_stats(self.joint_fisher, var_list=params) - - self.train_model = train_model - self.step_model = step_model - self.step = step_model.step - self.proba_step = step_model.proba_step - self.value = step_model.value - self.initial_state = step_model.initial_state - tf.global_variables_initializer().run(session=self.sess) - - self.summary = tf.summary.merge_all() - - def 
_train_step(self, obs, states, rewards, masks, actions, values, update, writer): - """ - applies a training step to the model - - :param obs: ([float]) The input observations - :param states: ([float]) The states (used for recurrent policies) - :param rewards: ([float]) The rewards from the environment - :param masks: ([bool]) Whether or not the episode is over (used for recurrent policies) - :param actions: ([float]) The actions taken - :param values: ([float]) The logits values - :param update: (int) the current step iteration - :param writer: (TensorFlow Summary.writer) the writer for tensorboard - :return: (float, float, float) policy loss, value loss, policy entropy - """ - advs = rewards - values - # Normalize advantage (used in the original continuous version) - if self.continuous_actions: - advs = (advs - advs.mean()) / (advs.std() + 1e-8) - - current_lr = None - - assert len(obs) > 0, "Error: the observation input array cannot be empty" - - # Note: in the original continuous version, - # the stepsize was automatically tuned computing the kl div - # and comparing it to the desired one - for _ in range(len(obs)): - current_lr = self.learning_rate_schedule.value() - - td_map = { - self.train_model.obs_ph: obs, - self.actions_ph: actions, - self.advs_ph: advs, - self.rewards_ph: rewards, - self.learning_rate_ph: current_lr - } - - if states is not None: - td_map[self.train_model.states_ph] = states - td_map[self.train_model.dones_ph] = masks - - if writer is not None: - # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...) 
- if self.full_tensorboard_log and (1 + update) % 10 == 0: - run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - run_metadata = tf.RunMetadata() - summary, policy_loss, value_loss, policy_entropy, _ = self.sess.run( - [self.summary, self.pg_loss, self.vf_loss, self.entropy, self.train_op], - td_map, options=run_options, run_metadata=run_metadata) - writer.add_run_metadata(run_metadata, 'step%d' % (update * (self.n_batch + 1))) - else: - summary, policy_loss, value_loss, policy_entropy, _ = self.sess.run( - [self.summary, self.pg_loss, self.vf_loss, self.entropy, self.train_op], td_map) - writer.add_summary(summary, update * (self.n_batch + 1)) - else: - policy_loss, value_loss, policy_entropy, _ = self.sess.run( - [self.pg_loss, self.vf_loss, self.entropy, self.train_op], td_map) - - return policy_loss, value_loss, policy_entropy - - def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACKTR", - reset_num_timesteps=True): - - new_tb_log = self._init_num_timesteps(reset_num_timesteps) - callback = self._init_callback(callback) - - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ - as writer: - self._setup_learn() - self.n_batch = self.n_envs * self.n_steps - - self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, - schedule=self.lr_schedule) - - # FIFO queue of the q_runner thread is closed at the end of the learn function. 
- # As a result, it needs to be redefinied at every call - with self.graph.as_default(): - with tf.variable_scope("kfac_apply", reuse=self.trained, - custom_getter=tf_util.outer_scope_getter("kfac_apply")): - # Some of the variables are not in a scope when they are create - # so we make a note of any previously uninitialized variables - tf_vars = tf.global_variables() - is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars]) - old_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f] - - self.train_op, self.q_runner = self.optim.apply_gradients(list(zip(self.grads_check, self.params))) - - # then we check for new uninitialized variables and initialize them - tf_vars = tf.global_variables() - is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars]) - new_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) - if not f and v not in old_uninitialized_vars] - - if len(new_uninitialized_vars) != 0: - self.sess.run(tf.variables_initializer(new_uninitialized_vars)) - - self.trained = True - - t_start = time.time() - coord = tf.train.Coordinator() - if self.q_runner is not None: - enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True) - else: - enqueue_threads = [] - - callback.on_training_start(locals(), globals()) - - for update in range(1, total_timesteps // self.n_batch + 1): - - callback.on_rollout_start() - - # pytype:disable=bad-unpacking - # true_reward is the reward without discount - if isinstance(self.runner, PPO2Runner): - # We are using GAE - rollout = self.runner.run(callback) - obs, returns, masks, actions, values, _, states, ep_infos, true_reward = rollout - else: - rollout = self.runner.run(callback) - obs, states, returns, masks, actions, values, ep_infos, true_reward = rollout - # pytype:enable=bad-unpacking - callback.update_locals(locals()) - callback.on_rollout_end() - - # Early stopping due to the callback - if not 
self.runner.continue_training: - break - - self.ep_info_buf.extend(ep_infos) - policy_loss, value_loss, policy_entropy = self._train_step(obs, states, returns, masks, actions, values, - self.num_timesteps // (self.n_batch + 1), - writer) - n_seconds = time.time() - t_start - fps = int((update * self.n_batch) / n_seconds) - - if writer is not None: - total_episode_reward_logger(self.episode_reward, - true_reward.reshape((self.n_envs, self.n_steps)), - masks.reshape((self.n_envs, self.n_steps)), - writer, self.num_timesteps) - - if self.verbose >= 1 and (update % log_interval == 0 or update == 1): - explained_var = explained_variance(values, returns) - logger.record_tabular("nupdates", update) - logger.record_tabular("total_timesteps", self.num_timesteps) - logger.record_tabular("fps", fps) - logger.record_tabular("policy_entropy", float(policy_entropy)) - logger.record_tabular("policy_loss", float(policy_loss)) - logger.record_tabular("value_loss", float(value_loss)) - logger.record_tabular("explained_variance", float(explained_var)) - if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: - logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) - logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) - logger.dump_tabular() - - coord.request_stop() - coord.join(enqueue_threads) - - callback.on_training_end() - return self - - def save(self, save_path, cloudpickle=False): - data = { - "gamma": self.gamma, - "gae_lambda": self.gae_lambda, - "n_steps": self.n_steps, - "vf_coef": self.vf_coef, - "ent_coef": self.ent_coef, - "vf_fisher_coef": self.vf_fisher_coef, - "max_grad_norm": self.max_grad_norm, - "learning_rate": self.learning_rate, - "kfac_clip": self.kfac_clip, - "lr_schedule": self.lr_schedule, - "verbose": self.verbose, - "policy": self.policy, - "observation_space": self.observation_space, - "action_space": self.action_space, - "n_envs": self.n_envs, - "n_cpu_tf_sess": 
self.n_cpu_tf_sess, - "seed": self.seed, - "kfac_update": self.kfac_update, - "_vectorize_action": self._vectorize_action, - "policy_kwargs": self.policy_kwargs - } - - params_to_save = self.get_parameters() - - self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle) diff --git a/stable_baselines/acktr/kfac.py b/stable_baselines/acktr/kfac.py deleted file mode 100644 index 4ab20805..00000000 --- a/stable_baselines/acktr/kfac.py +++ /dev/null @@ -1,1012 +0,0 @@ -import re -from functools import reduce - -import tensorflow as tf -import numpy as np - -from stable_baselines.acktr.kfac_utils import detect_min_val, factor_reshape, gmatmul - -KFAC_OPS = ['MatMul', 'Conv2D', 'BiasAdd'] -KFAC_DEBUG = False - - -class KfacOptimizer: - def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, - full_stats_init=False, cold_iter=100, cold_lr=None, async_eigen_decomp=False, - async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, - channel_fac=False, factored_damping=False, approx_t2=False, - use_float64=False, weight_decay_dict=None, max_grad_norm=0.5, verbose=1): - """ - Kfac Optimizer for ACKTR models - link: https://arxiv.org/pdf/1708.05144.pdf - - :param learning_rate: (float) The learning rate - :param momentum: (float) The momentum value for the TensorFlow momentum optimizer - :param clip_kl: (float) gradient clipping for Kullback-Leibler - :param kfac_update: (int) update kfac after kfac_update steps - :param stats_accum_iter: (int) how may steps to accumulate stats - :param full_stats_init: (bool) whether or not to fully initialize stats - :param cold_iter: (int) Cold start learning rate for how many steps - :param cold_lr: (float) Cold start learning rate - :param async_eigen_decomp: (bool) Use async eigen decomposition - :param async_stats: (bool) Asynchronous stats update - :param epsilon: (float) epsilon value for small numbers - :param stats_decay: (float) the stats 
decay rate - :param blockdiag_bias: (bool) - :param channel_fac: (bool) factorization along the channels - :param factored_damping: (bool) use factored damping - :param approx_t2: (bool) approximate T2 act and grad fisher - :param use_float64: (bool) use 64-bit float - :param weight_decay_dict: (dict) custom weight decay coeff for a given gradient - :param max_grad_norm: (float) The maximum value for the gradient clipping - :param verbose: (int) verbosity level - """ - self.max_grad_norm = max_grad_norm - self._lr = learning_rate - self._momentum = momentum - self._clip_kl = clip_kl - self._channel_fac = channel_fac - self._kfac_update = kfac_update - self._async_eigen_decomp = async_eigen_decomp - self._async_stats = async_stats - self._epsilon = epsilon - self._stats_decay = stats_decay - self._blockdiag_bias = blockdiag_bias - self._approx_t2 = approx_t2 - self._use_float64 = use_float64 - self._factored_damping = factored_damping - self._cold_iter = cold_iter - self.verbose = verbose - if cold_lr is None: - # good heuristics - self._cold_lr = self._lr # * 3. - else: - self._cold_lr = cold_lr - self._stats_accum_iter = stats_accum_iter - if weight_decay_dict is None: - weight_decay_dict = {} - self._weight_decay_dict = weight_decay_dict - self._diag_init_coeff = 0. 
- self._full_stats_init = full_stats_init - if not self._full_stats_init: - self._stats_accum_iter = self._cold_iter - - self.sgd_step = tf.Variable(0, name='KFAC/sgd_step', trainable=False) - self.global_step = tf.Variable( - 0, name='KFAC/global_step', trainable=False) - self.cold_step = tf.Variable(0, name='KFAC/cold_step', trainable=False) - self.factor_step = tf.Variable( - 0, name='KFAC/factor_step', trainable=False) - self.stats_step = tf.Variable( - 0, name='KFAC/stats_step', trainable=False) - self.v_f_v = tf.Variable(0., name='KFAC/vFv', trainable=False) - - self.factors = {} - self.param_vars = [] - self.stats = {} - self.stats_eigen = {} - - self._update_stats_op = None - - def get_factors(self, gradients, varlist): - """ - get factors to update - - :param gradients: ([TensorFlow Tensor]) The gradients - :param varlist: ([TensorFlow Tensor]) The parameters - :return: ([TensorFlow Tensor]) The factors to update - """ - default_graph = tf.get_default_graph() - factor_tensors = {} - fprop_tensors = [] - bprop_tensors = [] - op_types = [] - - def _search_factors(gradient, graph): - # hard coded search stratergy - bprop_op = gradient.op - bprop_op_name = bprop_op.name - - b_tensors = [] - f_tensors = [] - - # combining additive gradient, assume they are the same op type and - # indepedent - if 'AddN' in bprop_op_name: - factors = [] - for grad in gradient.op.inputs: - factors.append(_search_factors(grad, graph)) - op_names = [_item['opName'] for _item in factors] - if self.verbose > 1: - # TODO: need to check all the attribute of the ops as well - print(gradient.name) - print(op_names) - print(len(np.unique(op_names))) - assert len(np.unique(op_names)) == 1, \ - 'Error: {} is shared among different computation OPs'.format(gradient.name) - - b_tensors = reduce(lambda x, y: x + y, - [_item['bpropFactors'] for _item in factors]) - if len(factors[0]['fpropFactors']) > 0: - f_tensors = reduce( - lambda x, y: x + y, [_item['fpropFactors'] for _item in factors]) - 
fprop_op_name = op_names[0] - fprop_op = factors[0]['op'] - else: - fprop_op_match = re.search('gradientsSampled(_[0-9]+|)/(.+?)_grad', bprop_op_name) - assert fprop_op_match is not None - fprop_op_name = fprop_op_match.group(2) - fprop_op = graph.get_operation_by_name(fprop_op_name) - if fprop_op.op_def.name in KFAC_OPS: - # Known OPs - b_tensor = [_i for _i in bprop_op.inputs if 'gradientsSampled' in _i.name][-1] - b_tensor_shape = fprop_op.outputs[0].get_shape() - if b_tensor.get_shape()[0].value is None: - b_tensor.set_shape(b_tensor_shape) - b_tensors.append(b_tensor) - - if fprop_op.op_def.name == 'BiasAdd': - f_tensors = [] - else: - f_tensors.append([_i for _i in fprop_op.inputs if param.op.name not in _i.name][0]) - fprop_op_name = fprop_op.op_def.name - else: - # unknown OPs, block approximation used - b_inputs_list = [_i for _i in bprop_op.inputs[0].op.inputs - if 'gradientsSampled' in _i.name if 'Shape' not in _i.name] - if len(b_inputs_list) > 0: - b_tensor = b_inputs_list[0] - # only if tensor shape is defined, usually this will prevent tensor like Sum:0 to be used. - if b_tensor.get_shape(): - b_tensor_shape = fprop_op.outputs[0].get_shape() - if len(b_tensor.get_shape()) > 0 and b_tensor.get_shape()[0].value is None: - b_tensor.set_shape(b_tensor_shape) - b_tensors.append(b_tensor) - fprop_op_name = 'UNK-' + fprop_op.op_def.name - op_types.append(fprop_op_name) - - return {'opName': fprop_op_name, 'op': fprop_op, 'fpropFactors': f_tensors, 'bpropFactors': b_tensors} - - for _grad, param in zip(gradients, varlist): - if KFAC_DEBUG: - print(('get factor for ' + param.name)) - found_factors = _search_factors(_grad, default_graph) - factor_tensors[param] = found_factors - - # check associated weights and bias for homogeneous coordinate representation - # and check redundent factors - # TODO: there may be a bug to detect associate bias and weights for forking layer, e.g. in inception models. 
- for param in varlist: - factor_tensors[param]['assnWeights'] = None - factor_tensors[param]['assnBias'] = None - for param in varlist: - if factor_tensors[param]['opName'] == 'BiasAdd': - factor_tensors[param]['assnWeights'] = None - for item in varlist: - if len(factor_tensors[item]['bpropFactors']) > 0: - if (set(factor_tensors[item]['bpropFactors']) == set(factor_tensors[param]['bpropFactors'])) \ - and (len(factor_tensors[item]['fpropFactors']) > 0): - factor_tensors[param]['assnWeights'] = item - factor_tensors[item]['assnBias'] = param - factor_tensors[param]['bpropFactors'] = factor_tensors[ - item]['bpropFactors'] - - # concatenate the additive gradients along the batch dimension, i.e. assuming independence structure - for key in ['fpropFactors', 'bpropFactors']: - for i, param in enumerate(varlist): - if len(factor_tensors[param][key]) > 0: - if (key + '_concat') not in factor_tensors[param]: - tensor = factor_tensors[param][key][0] # type: tf.Tensor - name_scope = tensor.name.split(':')[ - 0] - with tf.name_scope(name_scope): - factor_tensors[param][ - key + '_concat'] = tf.concat(factor_tensors[param][key], 0) - else: - factor_tensors[param][key + '_concat'] = None - for _, param2 in enumerate(varlist[(i + 1):]): - if (len(factor_tensors[param][key]) > 0) and ( - set(factor_tensors[param2][key]) == set(factor_tensors[param][key])): - factor_tensors[param2][key] = factor_tensors[param][key] - factor_tensors[param2][ - key + '_concat'] = factor_tensors[param][key + '_concat'] - - if KFAC_DEBUG: - for items in zip(varlist, fprop_tensors, bprop_tensors, op_types): - print((items[0].name, factor_tensors[item])) - self.factors = factor_tensors - return factor_tensors - - def get_stats(self, factors, varlist): - """ - return the stats values from the factors to update and the parameters - - :param factors: ([TensorFlow Tensor]) The factors to update - :param varlist: ([TensorFlow Tensor]) The parameters - :return: ([TensorFlow Tensor]) The stats values - """ 
- if len(self.stats) == 0: - # initialize stats variables on CPU because eigen decomp is - # computed on CPU - with tf.device('/cpu'): - tmp_stats_cache = {} - - # search for tensor factors and - # use block diag approx for the bias units - for var in varlist: - bprop_factor = factors[var]['bpropFactors_concat'] - op_type = factors[var]['opName'] - if op_type == 'Conv2D': - operator_height = bprop_factor.get_shape()[1] - operator_width = bprop_factor.get_shape()[2] - if operator_height == 1 and operator_width == 1 and self._channel_fac: - # factorization along the channels do not support - # homogeneous coordinate - var_assn_bias = factors[var]['assnBias'] - if var_assn_bias: - factors[var]['assnBias'] = None - factors[var_assn_bias]['assnWeights'] = None - - for var in varlist: - fprop_factor = factors[var]['fpropFactors_concat'] - bprop_factor = factors[var]['bpropFactors_concat'] - op_type = factors[var]['opName'] - self.stats[var] = {'opName': op_type, - 'fprop_concat_stats': [], - 'bprop_concat_stats': [], - 'assnWeights': factors[var]['assnWeights'], - 'assnBias': factors[var]['assnBias'], - } - if fprop_factor is not None: - if fprop_factor not in tmp_stats_cache: - if op_type == 'Conv2D': - kernel_height = var.get_shape()[0] - kernel_width = var.get_shape()[1] - n_channels = fprop_factor.get_shape()[-1] - - operator_height = bprop_factor.get_shape()[1] - operator_width = bprop_factor.get_shape()[2] - if operator_height == 1 and operator_width == 1 and self._channel_fac: - # factorization along the channels - # assume independence between input channels and spatial - # 2K-1 x 2K-1 covariance matrix and C x C covariance matrix - # factorization along the channels do not - # support homogeneous coordinate, assnBias - # is always None - fprop_factor2_size = kernel_height * kernel_width - slot_fprop_factor_stats2 = tf.Variable(tf.diag(tf.ones( - [fprop_factor2_size])) * self._diag_init_coeff, - name='KFAC_STATS/' + fprop_factor.op.name, - trainable=False) - 
self.stats[var]['fprop_concat_stats'].append( - slot_fprop_factor_stats2) - - fprop_factor_size = n_channels - else: - # 2K-1 x 2K-1 x C x C covariance matrix - # assume BHWC - fprop_factor_size = kernel_height * kernel_width * n_channels - else: - # D x D covariance matrix - fprop_factor_size = fprop_factor.get_shape()[-1] - - # use homogeneous coordinate - if not self._blockdiag_bias and self.stats[var]['assnBias']: - fprop_factor_size += 1 - - slot_fprop_factor_stats = tf.Variable( - tf.diag(tf.ones([fprop_factor_size])) * self._diag_init_coeff, - name='KFAC_STATS/' + fprop_factor.op.name, trainable=False) - self.stats[var]['fprop_concat_stats'].append( - slot_fprop_factor_stats) - if op_type != 'Conv2D': - tmp_stats_cache[fprop_factor] = self.stats[ - var]['fprop_concat_stats'] - else: - self.stats[var][ - 'fprop_concat_stats'] = tmp_stats_cache[fprop_factor] - - if bprop_factor is not None: - # no need to collect backward stats for bias vectors if - # using homogeneous coordinates - if not ((not self._blockdiag_bias) and self.stats[var]['assnWeights']): - if bprop_factor not in tmp_stats_cache: - slot_bprop_factor_stats = tf.Variable(tf.diag(tf.ones([bprop_factor.get_shape( - )[-1]])) * self._diag_init_coeff, name='KFAC_STATS/' + bprop_factor.op.name, - trainable=False) - self.stats[var]['bprop_concat_stats'].append( - slot_bprop_factor_stats) - tmp_stats_cache[bprop_factor] = self.stats[ - var]['bprop_concat_stats'] - else: - self.stats[var][ - 'bprop_concat_stats'] = tmp_stats_cache[bprop_factor] - - return self.stats - - def compute_and_apply_stats(self, loss_sampled, var_list=None): - """ - compute and apply stats - - :param loss_sampled: ([TensorFlow Tensor]) the loss function output - :param var_list: ([TensorFlow Tensor]) The parameters - :return: (function) apply stats - """ - varlist = var_list - if varlist is None: - varlist = tf.trainable_variables() - - stats = self.compute_stats(loss_sampled, var_list=varlist) - return self.apply_stats(stats) - - 
def compute_stats(self, loss_sampled, var_list=None): - """ - compute the stats values - - :param loss_sampled: ([TensorFlow Tensor]) the loss function output - :param var_list: ([TensorFlow Tensor]) The parameters - :return: ([TensorFlow Tensor]) stats updates - """ - varlist = var_list - if varlist is None: - varlist = tf.trainable_variables() - - gradient_sampled = tf.gradients(loss_sampled, varlist, name='gradientsSampled') - self.gradient_sampled = gradient_sampled - - # remove unused variables - gradient_sampled, varlist = zip(*[(grad, var) for (grad, var) in zip(gradient_sampled, varlist) - if grad is not None]) - - factors = self.get_factors(gradient_sampled, varlist) - stats = self.get_stats(factors, varlist) - - update_ops = [] - stats_updates = {} - stats_updates_cache = {} - for var in varlist: - op_type = factors[var]['opName'] - fops = factors[var]['op'] - fprop_factor = factors[var]['fpropFactors_concat'] - fprop_stats_vars = stats[var]['fprop_concat_stats'] - bprop_factor = factors[var]['bpropFactors_concat'] - bprop_stats_vars = stats[var]['bprop_concat_stats'] - svd_factors = {} - for stats_var in fprop_stats_vars: - stats_var_dim = int(stats_var.get_shape()[0]) - if stats_var not in stats_updates_cache: - batch_size = (tf.shape(fprop_factor)[0]) # batch size - if op_type == 'Conv2D': - strides = fops.get_attr("strides") - padding = fops.get_attr("padding") - convkernel_size = var.get_shape()[0:3] - - kernel_height = int(convkernel_size[0]) - kernel_width = int(convkernel_size[1]) - chan = int(convkernel_size[2]) - flatten_size = int(kernel_height * kernel_width * chan) - - operator_height = int(bprop_factor.get_shape()[1]) - operator_width = int(bprop_factor.get_shape()[2]) - - if operator_height == 1 and operator_width == 1 and self._channel_fac: - # factorization along the channels - # assume independence among input channels - # factor = B x 1 x 1 x (KH xKW x C) - # patches = B x Oh x Ow x (KH xKW x C) - if len(svd_factors) == 0: - if 
KFAC_DEBUG: - print(('approx %s act factor with rank-1 SVD factors' % var.name)) - # find closest rank-1 approx to the feature map - S, U, V = tf.batch_svd(tf.reshape( - fprop_factor, [-1, kernel_height * kernel_width, chan])) - # get rank-1 approx slides - sqrt_s1 = tf.expand_dims(tf.sqrt(S[:, 0, 0]), 1) - patches_k = U[:, :, 0] * sqrt_s1 # B x KH*KW - full_factor_shape = fprop_factor.get_shape() - patches_k.set_shape( - [full_factor_shape[0], kernel_height * kernel_width]) - patches_c = V[:, :, 0] * sqrt_s1 # B x C - patches_c.set_shape([full_factor_shape[0], chan]) - svd_factors[chan] = patches_c - svd_factors[kernel_height * kernel_width] = patches_k - fprop_factor = svd_factors[stats_var_dim] - - else: - # poor mem usage implementation - patches = tf.extract_image_patches(fprop_factor, ksizes=[1, convkernel_size[ - 0], convkernel_size[1], 1], strides=strides, rates=[1, 1, 1, 1], padding=padding) - - if self._approx_t2: - if KFAC_DEBUG: - print(('approxT2 act fisher for %s' % var.name)) - # T^2 terms * 1/T^2, size: B x C - fprop_factor = tf.reduce_mean(patches, [1, 2]) - else: - # size: (B x Oh x Ow) x C - fprop_factor = tf.reshape( - patches, [-1, flatten_size]) / operator_height / operator_width - fprop_factor_size = int(fprop_factor.get_shape()[-1]) - if stats_var_dim == (fprop_factor_size + 1) and not self._blockdiag_bias: - if op_type == 'Conv2D' and not self._approx_t2: - # correct padding for numerical stability (we - # divided out OhxOw from activations for T1 approx) - fprop_factor = tf.concat([fprop_factor, tf.ones( - [tf.shape(fprop_factor)[0], 1]) / operator_height / operator_width], 1) - else: - # use homogeneous coordinates - fprop_factor = tf.concat( - [fprop_factor, tf.ones([tf.shape(fprop_factor)[0], 1])], 1) - - # average over the number of data points in a batch - # divided by B - cov = tf.matmul(fprop_factor, fprop_factor, - transpose_a=True) / tf.cast(batch_size, tf.float32) - update_ops.append(cov) - stats_updates[stats_var] = cov - if 
op_type != 'Conv2D': - # HACK: for convolution we recompute fprop stats for - # every layer including forking layers - stats_updates_cache[stats_var] = cov - - for stats_var in bprop_stats_vars: - if stats_var not in stats_updates_cache: - bprop_factor_shape = bprop_factor.get_shape() - batch_size = tf.shape(bprop_factor)[0] # batch size - chan = int(bprop_factor_shape[-1]) # num channels - if op_type == 'Conv2D' or len(bprop_factor_shape) == 4: - if fprop_factor is not None: - if self._approx_t2: - if KFAC_DEBUG: - print(('approxT2 grad fisher for %s' % var.name)) - bprop_factor = tf.reduce_sum( - bprop_factor, [1, 2]) # T^2 terms * 1/T^2 - else: - bprop_factor = tf.reshape( - bprop_factor, [-1, chan]) * operator_height * operator_width # T * 1/T terms - else: - # just doing block diag approx. spatial independent - # structure does not apply here. summing over - # spatial locations - if KFAC_DEBUG: - print(('block diag approx fisher for %s' % var.name)) - bprop_factor = tf.reduce_sum(bprop_factor, [1, 2]) - - # assume sampled loss is averaged. 
TODO:figure out better - # way to handle this - bprop_factor *= tf.cast(batch_size, tf.float32) - ## - - cov_b = tf.matmul(bprop_factor, bprop_factor, - transpose_a=True) / tf.cast(tf.shape(bprop_factor)[0], tf.float32) - - update_ops.append(cov_b) - stats_updates[stats_var] = cov_b - stats_updates_cache[stats_var] = cov_b - - if KFAC_DEBUG: - a_key = list(stats_updates.keys())[0] - stats_updates[a_key] = tf.Print(stats_updates[a_key], [tf.convert_to_tensor('step:'), self.global_step, - tf.convert_to_tensor('computing stats')]) - self.stats_updates = stats_updates - return stats_updates - - def apply_stats(self, stats_updates): - """ - compute stats and update/apply the new stats to the running average - - :param stats_updates: ([TensorFlow Tensor]) The stats updates - :return: (function) update stats operation - """ - - def _update_accum_stats(): - if self._full_stats_init: - return tf.cond(tf.greater(self.sgd_step, self._cold_iter), lambda: tf.group( - *self._apply_stats(stats_updates, accumulate=True, accumulate_coeff=1. / self._stats_accum_iter)), - tf.no_op) - else: - return tf.group( - *self._apply_stats(stats_updates, accumulate=True, accumulate_coeff=1. 
/ self._stats_accum_iter)) - - def _update_running_avg_stats(stats_updates): - return tf.group(*self._apply_stats(stats_updates)) - - if self._async_stats: - # asynchronous stats update - update_stats = self._apply_stats(stats_updates) - - queue = tf.FIFOQueue(1, [item.dtype for item in update_stats], shapes=[ - item.get_shape() for item in update_stats]) - enqueue_op = queue.enqueue(update_stats) - - def dequeue_stats_op(): - return queue.dequeue() - - self.qr_stats = tf.train.QueueRunner(queue, [enqueue_op]) - update_stats_op = tf.cond(tf.equal(queue.size(), tf.convert_to_tensor( - 0)), tf.no_op, lambda: tf.group(*[dequeue_stats_op(), ])) - else: - # synchronous stats update - update_stats_op = tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), - lambda: _update_running_avg_stats(stats_updates), _update_accum_stats) - self._update_stats_op = update_stats_op - return update_stats_op - - def _apply_stats(self, stats_updates, accumulate=False, accumulate_coeff=0.): - update_ops = [] - # obtain the stats var list - for stats_var in stats_updates: - stats_new = stats_updates[stats_var] - if accumulate: - # simple superbatch averaging - update_op = tf.assign_add( - stats_var, accumulate_coeff * stats_new, use_locking=True) - else: - # exponential running averaging - update_op = tf.assign( - stats_var, stats_var * self._stats_decay, use_locking=True) - update_op = tf.assign_add( - update_op, (1. 
- self._stats_decay) * stats_new, use_locking=True) - update_ops.append(update_op) - - with tf.control_dependencies(update_ops): - stats_step_op = tf.assign_add(self.stats_step, 1) - - if KFAC_DEBUG: - stats_step_op = (tf.Print(stats_step_op, - [tf.convert_to_tensor('step:'), - self.global_step, - tf.convert_to_tensor('fac step:'), - self.factor_step, - tf.convert_to_tensor('sgd step:'), - self.sgd_step, - tf.convert_to_tensor('Accum:'), - tf.convert_to_tensor(accumulate), - tf.convert_to_tensor('Accum coeff:'), - tf.convert_to_tensor(accumulate_coeff), - tf.convert_to_tensor('stat step:'), - self.stats_step, update_ops[0], update_ops[1]])) - return [stats_step_op, ] - - def get_stats_eigen(self, stats=None): - """ - Return the eigen values from the stats - - :param stats: ([TensorFlow Tensor]) The stats - :return: ([TensorFlow Tensor]) The stats eigen values - """ - if len(self.stats_eigen) == 0: - stats_eigen = {} - if stats is None: - stats = self.stats - - tmp_eigen_cache = {} - with tf.device('/cpu:0'): - for var in stats: - for key in ['fprop_concat_stats', 'bprop_concat_stats']: - for stats_var in stats[var][key]: - if stats_var not in tmp_eigen_cache: - stats_dim = stats_var.get_shape()[1].value - eigen_values = tf.Variable(tf.ones( - [stats_dim]), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/e', - trainable=False) - eigen_vectors = tf.Variable(tf.diag(tf.ones( - [stats_dim])), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/Q', - trainable=False) - stats_eigen[stats_var] = {'e': eigen_values, 'Q': eigen_vectors} - tmp_eigen_cache[ - stats_var] = stats_eigen[stats_var] - else: - stats_eigen[stats_var] = tmp_eigen_cache[ - stats_var] - self.stats_eigen = stats_eigen - return self.stats_eigen - - def compute_stats_eigen(self): - """ - compute the eigen decomp using copied var stats to avoid concurrent read/write from other queue - - :return: ([TensorFlow Tensor]) update operations - """ - # TODO: figure out why this op has delays (possibly moving 
eigenvectors around?) - with tf.device('/cpu:0'): - stats_eigen = self.stats_eigen - computed_eigen = {} - eigen_reverse_lookup = {} - update_ops = [] - # sync copied stats - with tf.control_dependencies([]): - for stats_var in stats_eigen: - if stats_var not in computed_eigen: - eigen_decomposition = tf.self_adjoint_eig(stats_var) - eigen_values = eigen_decomposition[0] - eigen_vectors = eigen_decomposition[1] - if self._use_float64: - eigen_values = tf.cast(eigen_values, tf.float64) - eigen_vectors = tf.cast(eigen_vectors, tf.float64) - update_ops.append(eigen_values) - update_ops.append(eigen_vectors) - computed_eigen[stats_var] = {'e': eigen_values, 'Q': eigen_vectors} - eigen_reverse_lookup[eigen_values] = stats_eigen[stats_var]['e'] - eigen_reverse_lookup[eigen_vectors] = stats_eigen[stats_var]['Q'] - - self.eigen_reverse_lookup = eigen_reverse_lookup - self.eigen_update_list = update_ops - - if KFAC_DEBUG: - self.eigen_update_list = [item for item in update_ops] - with tf.control_dependencies(update_ops): - update_ops.append(tf.Print(tf.constant( - 0.), [tf.convert_to_tensor('computed factor eigen')])) - - return update_ops - - def apply_stats_eigen(self, eigen_list): - """ - apply the update using the eigen values of the stats - - :param eigen_list: ([TensorFlow Tensor]) The list of eigen values of the stats - :return: ([TensorFlow Tensor]) update operations - """ - update_ops = [] - if self.verbose > 1: - print(('updating %d eigenvalue/vectors' % len(eigen_list))) - for _, (tensor, mark) in enumerate(zip(eigen_list, self.eigen_update_list)): - stats_eigen_var = self.eigen_reverse_lookup[mark] - update_ops.append( - tf.assign(stats_eigen_var, tensor, use_locking=True)) - - with tf.control_dependencies(update_ops): - factor_step_op = tf.assign_add(self.factor_step, 1) - update_ops.append(factor_step_op) - if KFAC_DEBUG: - update_ops.append(tf.Print(tf.constant( - 0.), [tf.convert_to_tensor('updated kfac factors')])) - return update_ops - - def 
get_kfac_precond_updates(self, gradlist, varlist): - """ - return the KFAC updates - - :param gradlist: ([TensorFlow Tensor]) The gradients - :param varlist: ([TensorFlow Tensor]) The parameters - :return: ([TensorFlow Tensor]) the update list - """ - v_g = 0. - - assert len(self.stats) > 0 - assert len(self.stats_eigen) > 0 - assert len(self.factors) > 0 - counter = 0 - - grad_dict = {var: grad for grad, var in zip(gradlist, varlist)} - - for grad, var in zip(gradlist, varlist): - grad_reshape = False - - fprop_factored_fishers = self.stats[var]['fprop_concat_stats'] - bprop_factored_fishers = self.stats[var]['bprop_concat_stats'] - - if (len(fprop_factored_fishers) + len(bprop_factored_fishers)) > 0: - counter += 1 - grad_shape = grad.get_shape() - if len(grad.get_shape()) > 2: - # reshape conv kernel parameters - kernel_width = int(grad.get_shape()[0]) - kernel_height = int(grad.get_shape()[1]) - n_channels = int(grad.get_shape()[2]) - depth = int(grad.get_shape()[3]) - - if len(fprop_factored_fishers) > 1 and self._channel_fac: - # reshape conv kernel parameters into tensor - grad = tf.reshape(grad, [kernel_width * kernel_height, n_channels, depth]) - else: - # reshape conv kernel parameters into 2D grad - grad = tf.reshape(grad, [-1, depth]) - grad_reshape = True - elif len(grad.get_shape()) == 1: - # reshape bias or 1D parameters - - grad = tf.expand_dims(grad, 0) - grad_reshape = True - - if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias: - # use homogeneous coordinates only works for 2D grad. 
- # TODO: figure out how to factorize bias grad - # stack bias grad - var_assn_bias = self.stats[var]['assnBias'] - grad = tf.concat( - [grad, tf.expand_dims(grad_dict[var_assn_bias], 0)], 0) - - # project gradient to eigen space and reshape the eigenvalues - # for broadcasting - eig_vals = [] - - for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']): - eigen_vectors = self.stats_eigen[stats]['Q'] - eigen_values = detect_min_val(self.stats_eigen[stats][ - 'e'], var, name='act', debug=KFAC_DEBUG) - - eigen_vectors, eigen_values = factor_reshape(eigen_vectors, eigen_values, - grad, fac_idx=idx, f_type='act') - eig_vals.append(eigen_values) - grad = gmatmul(eigen_vectors, grad, transpose_a=True, reduce_dim=idx) - - for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']): - eigen_vectors = self.stats_eigen[stats]['Q'] - eigen_values = detect_min_val(self.stats_eigen[stats][ - 'e'], var, name='grad', debug=KFAC_DEBUG) - - eigen_vectors, eigen_values = factor_reshape(eigen_vectors, eigen_values, - grad, fac_idx=idx, f_type='grad') - eig_vals.append(eigen_values) - grad = gmatmul(grad, eigen_vectors, transpose_b=False, reduce_dim=idx) - - # whiten using eigenvalues - weight_decay_coeff = 0. - if var in self._weight_decay_dict: - weight_decay_coeff = self._weight_decay_dict[var] - if KFAC_DEBUG: - print(('weight decay coeff for %s is %f' % (var.name, weight_decay_coeff))) - - if self._factored_damping: - if KFAC_DEBUG: - print(('use factored damping for %s' % var.name)) - coeffs = 1. - num_factors = len(eig_vals) - # compute the ratio of two trace norm of the left and right - # KFac matrices, and their generalization - if len(eig_vals) == 1: - damping = self._epsilon + weight_decay_coeff - else: - damping = tf.pow( - self._epsilon + weight_decay_coeff, 1. 
/ num_factors) - eig_vals_tnorm_avg = [tf.reduce_mean( - tf.abs(e)) for e in eig_vals] - for eigen_val, e_tnorm in zip(eig_vals, eig_vals_tnorm_avg): - eig_tnorm_neg_list = [ - item for item in eig_vals_tnorm_avg if item != e_tnorm] - if len(eig_vals) == 1: - adjustment = 1. - elif len(eig_vals) == 2: - adjustment = tf.sqrt( - e_tnorm / eig_tnorm_neg_list[0]) - else: - eig_tnorm_neg_list_prod = reduce( - lambda x, y: x * y, eig_tnorm_neg_list) - adjustment = tf.pow( - tf.pow(e_tnorm, num_factors - 1.) / eig_tnorm_neg_list_prod, 1. / num_factors) - coeffs *= (eigen_val + adjustment * damping) - else: - coeffs = 1. - damping = (self._epsilon + weight_decay_coeff) - for eigen_val in eig_vals: - coeffs *= eigen_val - coeffs += damping - - grad /= coeffs - - # project gradient back to euclidean space - for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']): - eigen_vectors = self.stats_eigen[stats]['Q'] - grad = gmatmul(eigen_vectors, grad, transpose_a=False, reduce_dim=idx) - - for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']): - eigen_vectors = self.stats_eigen[stats]['Q'] - grad = gmatmul(grad, eigen_vectors, transpose_b=True, reduce_dim=idx) - - if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias: - # use homogeneous coordinates only works for 2D grad. 
- # TODO: figure out how to factorize bias grad - # un-stack bias grad - var_assn_bias = self.stats[var]['assnBias'] - c_plus_one = int(grad.get_shape()[0]) - grad_assn_bias = tf.reshape(tf.slice(grad, - begin=[ - c_plus_one - 1, 0], - size=[1, -1]), var_assn_bias.get_shape()) - grad_assn_weights = tf.slice(grad, - begin=[0, 0], - size=[c_plus_one - 1, -1]) - grad_dict[var_assn_bias] = grad_assn_bias - grad = grad_assn_weights - - if grad_reshape: - grad = tf.reshape(grad, grad_shape) - - grad_dict[var] = grad - - if self.verbose > 1: - print(('projecting %d gradient matrices' % counter)) - - for grad_1, var in zip(gradlist, varlist): - grad = grad_dict[var] - # clipping - if KFAC_DEBUG: - print(('apply clipping to %s' % var.name)) - tf.Print(grad, [tf.sqrt(tf.reduce_sum(tf.pow(grad, 2)))], "Euclidean norm of new grad") - local_vg = tf.reduce_sum(grad * grad_1 * (self._lr * self._lr)) - v_g += local_vg - - # rescale everything - if KFAC_DEBUG: - print('apply vFv clipping') - - scaling = tf.minimum(1., tf.sqrt(self._clip_kl / v_g)) - if KFAC_DEBUG: - scaling = tf.Print(scaling, [tf.convert_to_tensor( - 'clip: '), scaling, tf.convert_to_tensor(' vFv: '), v_g]) - with tf.control_dependencies([tf.assign(self.v_f_v, v_g)]): - updatelist = [grad_dict[var] for var in varlist] - for i, item in enumerate(updatelist): - updatelist[i] = scaling * item - - return updatelist - - @classmethod - def compute_gradients(cls, loss, var_list=None): - """ - compute the gradients from the loss and the parameters - - :param loss: ([TensorFlow Tensor]) The loss - :param var_list: ([TensorFlow Tensor]) The parameters - :return: ([TensorFlow Tensor]) the gradient - """ - varlist = var_list - if varlist is None: - varlist = tf.trainable_variables() - gradients = tf.gradients(loss, varlist) - - return [(a, b) for a, b in zip(gradients, varlist)] - - def apply_gradients_kfac(self, grads): - """ - apply the kfac gradient - - :param grads: ([TensorFlow Tensor]) the gradient - :return: 
([function], QueueRunner) Update functions, queue operation runner - """ - grad, varlist = list(zip(*grads)) - - if len(self.stats_eigen) == 0: - self.get_stats_eigen() - - queue_runner = None - # launch eigen-decomp on a queue thread - if self._async_eigen_decomp: - if self.verbose > 1: - print('Using async eigen decomposition') - # get a list of factor loading tensors - factor_ops_dummy = self.compute_stats_eigen() - - # define a queue for the list of factor loading tensors - queue = tf.FIFOQueue(1, [item.dtype for item in factor_ops_dummy], - shapes=[item.get_shape() for item in factor_ops_dummy]) - enqueue_op = tf.cond( - tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), tf.convert_to_tensor( - 0)), tf.greater_equal(self.stats_step, self._stats_accum_iter)), - lambda: queue.enqueue(self.compute_stats_eigen()), tf.no_op) - - def dequeue_op(): - return queue.dequeue() - - queue_runner = tf.train.QueueRunner(queue, [enqueue_op]) - - update_ops = [] - global_step_op = tf.assign_add(self.global_step, 1) - update_ops.append(global_step_op) - - with tf.control_dependencies([global_step_op]): - - # compute updates - assert self._update_stats_op is not None - update_ops.append(self._update_stats_op) - dependency_list = [] - if not self._async_eigen_decomp: - dependency_list.append(self._update_stats_op) - - with tf.control_dependencies(dependency_list): - def no_op_wrapper(): - return tf.group(*[tf.assign_add(self.cold_step, 1)]) - - if not self._async_eigen_decomp: - # synchronous eigen-decomp updates - update_factor_ops = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), - tf.convert_to_tensor(0)), - tf.greater_equal(self.stats_step, - self._stats_accum_iter)), - lambda: tf.group(*self.apply_stats_eigen(self.compute_stats_eigen())), - no_op_wrapper) - else: - # asynchronous eigen-decomp updates using queue - update_factor_ops = tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), - lambda: 
tf.cond(tf.equal(queue.size(), tf.convert_to_tensor(0)), - tf.no_op, - - lambda: tf.group( - *self.apply_stats_eigen(dequeue_op())), - ), - no_op_wrapper) - - update_ops.append(update_factor_ops) - - with tf.control_dependencies([update_factor_ops]): - def grad_op(): - return list(grad) - - def get_kfac_grad_op(): - return self.get_kfac_precond_updates(grad, varlist) - - u = tf.cond(tf.greater(self.factor_step, - tf.convert_to_tensor(0)), get_kfac_grad_op, grad_op) - - optim = tf.train.MomentumOptimizer( - self._lr * (1. - self._momentum), self._momentum) - - # optim = tf.train.AdamOptimizer(self._lr, epsilon=0.01) - - def optim_op(): - def update_optim_op(): - if self._full_stats_init: - return tf.cond(tf.greater(self.factor_step, tf.convert_to_tensor(0)), - lambda: optim.apply_gradients(list(zip(u, varlist))), tf.no_op) - else: - return optim.apply_gradients(list(zip(u, varlist))) - - if self._full_stats_init: - return tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), update_optim_op, - tf.no_op) - else: - return tf.cond(tf.greater_equal(self.sgd_step, self._cold_iter), update_optim_op, tf.no_op) - - update_ops.append(optim_op()) - - return tf.group(*update_ops), queue_runner - - def apply_gradients(self, grads): - """ - apply the gradient - - :param grads: ([TensorFlow Tensor]) the gradient - :return: (function, QueueRunner) train operation, queue operation runner - """ - cold_optim = tf.train.MomentumOptimizer(self._cold_lr, self._momentum) - - def _cold_sgd_start(): - sgd_grads, sgd_var = zip(*grads) - - if self.max_grad_norm is not None: - sgd_grads, _ = tf.clip_by_global_norm(sgd_grads, self.max_grad_norm) - - sgd_grads = list(zip(sgd_grads, sgd_var)) - - sgd_step_op = tf.assign_add(self.sgd_step, 1) - cold_optim_op = cold_optim.apply_gradients(sgd_grads) - if KFAC_DEBUG: - with tf.control_dependencies([sgd_step_op, cold_optim_op]): - sgd_step_op = tf.Print( - sgd_step_op, [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')]) - 
return tf.group(*[sgd_step_op, cold_optim_op]) - - # remove unused variables - grads = [(grad, var) for (grad, var) in grads if grad is not None] - - kfac_optim_op, queue_runner = self.apply_gradients_kfac(grads) - - def _warm_kfac_start(): - return kfac_optim_op - - return tf.cond(tf.greater(self.sgd_step, self._cold_iter), _warm_kfac_start, _cold_sgd_start), queue_runner - - def minimize(self, loss, loss_sampled, var_list=None): - """ - minimize the gradient loss - - :param loss: ([TensorFlow Tensor]) The loss - :param loss_sampled: ([TensorFlow Tensor]) the loss function output - :param var_list: ([TensorFlow Tensor]) The parameters - :return: (function, q_runner) train operation, queue operation runner - """ - grads = self.compute_gradients(loss, var_list=var_list) - self.compute_and_apply_stats(loss_sampled, var_list=var_list) - return self.apply_gradients(grads) diff --git a/stable_baselines/acktr/kfac_utils.py b/stable_baselines/acktr/kfac_utils.py deleted file mode 100644 index 512e21a2..00000000 --- a/stable_baselines/acktr/kfac_utils.py +++ /dev/null @@ -1,128 +0,0 @@ -import tensorflow as tf - - -def gmatmul(tensor_a, tensor_b, transpose_a=False, transpose_b=False, reduce_dim=None): - """ - Do a matrix multiplication with tensor 'a' and 'b', even when their shape do not match - - :param tensor_a: (TensorFlow Tensor) - :param tensor_b: (TensorFlow Tensor) - :param transpose_a: (bool) If 'a' needs transposing - :param transpose_b: (bool) If 'b' needs transposing - :param reduce_dim: (int) the multiplication over the dim - :return: (TensorFlow Tensor) a * b - """ - assert reduce_dim is not None - - # weird batch matmul - if len(tensor_a.get_shape()) == 2 and len(tensor_b.get_shape()) > 2: - # reshape reduce_dim to the left most dim in b - b_shape = tensor_b.get_shape() - if reduce_dim != 0: - b_dims = list(range(len(b_shape))) - b_dims.remove(reduce_dim) - b_dims.insert(0, reduce_dim) - tensor_b = tf.transpose(tensor_b, b_dims) - b_t_shape = 
tensor_b.get_shape() - tensor_b = tf.reshape(tensor_b, [int(b_shape[reduce_dim]), -1]) - result = tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, - transpose_b=transpose_b) - result = tf.reshape(result, b_t_shape) - if reduce_dim != 0: - b_dims = list(range(len(b_shape))) - b_dims.remove(0) - b_dims.insert(reduce_dim, 0) - result = tf.transpose(result, b_dims) - return result - - elif len(tensor_a.get_shape()) > 2 and len(tensor_b.get_shape()) == 2: - # reshape reduce_dim to the right most dim in a - a_shape = tensor_a.get_shape() - outter_dim = len(a_shape) - 1 - reduce_dim = len(a_shape) - reduce_dim - 1 - if reduce_dim != outter_dim: - a_dims = list(range(len(a_shape))) - a_dims.remove(reduce_dim) - a_dims.insert(outter_dim, reduce_dim) - tensor_a = tf.transpose(tensor_a, a_dims) - a_t_shape = tensor_a.get_shape() - tensor_a = tf.reshape(tensor_a, [-1, int(a_shape[reduce_dim])]) - result = tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, - transpose_b=transpose_b) - result = tf.reshape(result, a_t_shape) - if reduce_dim != outter_dim: - a_dims = list(range(len(a_shape))) - a_dims.remove(outter_dim) - a_dims.insert(reduce_dim, outter_dim) - result = tf.transpose(result, a_dims) - return result - - elif len(tensor_a.get_shape()) == 2 and len(tensor_b.get_shape()) == 2: - return tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b) - - assert False, 'something went wrong' - - -def clipout_neg(vec, threshold=1e-6): - """ - clip to 0 if input lower than threshold value - - :param vec: (TensorFlow Tensor) - :param threshold: (float) the cutoff threshold - :return: (TensorFlow Tensor) clipped input - """ - mask = tf.cast(vec > threshold, tf.float32) - return mask * vec - - -def detect_min_val(input_mat, var, threshold=1e-6, name='', debug=False): - """ - If debug is not set, will run clipout_neg. 
Else, will clip and print out odd eigen values - - :param input_mat: (TensorFlow Tensor) - :param var: (TensorFlow Tensor) variable - :param threshold: (float) the cutoff threshold - :param name: (str) the name of the variable - :param debug: (bool) debug function - :return: (TensorFlow Tensor) clipped tensor - """ - eigen_min = tf.reduce_min(input_mat) - eigen_max = tf.reduce_max(input_mat) - eigen_ratio = eigen_max / eigen_min - input_mat_clipped = clipout_neg(input_mat, threshold) - - if debug: - input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), - lambda: input_mat_clipped, lambda: tf.Print( - input_mat_clipped, - [tf.convert_to_tensor('odd ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), - eigen_min, eigen_max, eigen_ratio])) - - return input_mat_clipped - - -def factor_reshape(eigen_vectors, eigen_values, grad, fac_idx=0, f_type='act'): - """ - factor and reshape input eigen values - - :param eigen_vectors: ([TensorFlow Tensor]) eigen vectors - :param eigen_values: ([TensorFlow Tensor]) eigen values - :param grad: ([TensorFlow Tensor]) gradient - :param fac_idx: (int) index that should be factored - :param f_type: (str) function type to factor and reshape - :return: ([TensorFlow Tensor], [TensorFlow Tensor]) factored and reshaped eigen vectors - and eigen values - """ - grad_shape = grad.get_shape() - if f_type == 'act': - assert eigen_values.get_shape()[0] == grad_shape[fac_idx] - expanded_shape = [1, ] * len(grad_shape) - expanded_shape[fac_idx] = -1 - eigen_values = tf.reshape(eigen_values, expanded_shape) - if f_type == 'grad': - assert eigen_values.get_shape()[0] == grad_shape[len(grad_shape) - fac_idx - 1] - expanded_shape = [1, ] * len(grad_shape) - expanded_shape[len(grad_shape) - fac_idx - 1] = -1 - eigen_values = tf.reshape(eigen_values, expanded_shape) - - return eigen_vectors, eigen_values diff --git a/stable_baselines/acktr/run_atari.py b/stable_baselines/acktr/run_atari.py 
deleted file mode 100644 index 694eb811..00000000 --- a/stable_baselines/acktr/run_atari.py +++ /dev/null @@ -1,32 +0,0 @@ -from stable_baselines import logger, ACKTR -from stable_baselines.common.cmd_util import make_atari_env, atari_arg_parser -from stable_baselines.common.vec_env.vec_frame_stack import VecFrameStack -from stable_baselines.common.policies import CnnPolicy - - -def train(env_id, num_timesteps, seed, num_cpu): - """ - train an ACKTR model on atari - - :param env_id: (str) Environment ID - :param num_timesteps: (int) The total number of samples - :param seed: (int) The initial seed for training - :param num_cpu: (int) The number of cpu to train on - """ - env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4) - model = ACKTR(CnnPolicy, env, nprocs=num_cpu, seed=seed) - model.learn(total_timesteps=int(num_timesteps * 1.1)) - env.close() - - -def main(): - """ - Runs the test - """ - args = atari_arg_parser().parse_args() - logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32) - - -if __name__ == '__main__': - main() diff --git a/stable_baselines/bench/__init__.py b/stable_baselines/bench/__init__.py deleted file mode 100644 index 58a0727d..00000000 --- a/stable_baselines/bench/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from stable_baselines.bench.monitor import Monitor, load_results diff --git a/stable_baselines/bench/monitor.py b/stable_baselines/bench/monitor.py deleted file mode 100644 index e308fcd0..00000000 --- a/stable_baselines/bench/monitor.py +++ /dev/null @@ -1,208 +0,0 @@ -__all__ = ['Monitor', 'get_monitor_files', 'load_results'] - -import csv -import json -import os -import time -from glob import glob -from typing import Tuple, Dict, Any, List, Optional - -import gym -import pandas -import numpy as np - - -class Monitor(gym.Wrapper): - """ - A monitor wrapper for Gym environments, it is used to know the episode reward, length, time and other data. 
- - :param env: (gym.Env) The environment - :param filename: (Optional[str]) the location to save a log file, can be None for no log - :param allow_early_resets: (bool) allows the reset of the environment before it is done - :param reset_keywords: (tuple) extra keywords for the reset call, if extra parameters are needed at reset - :param info_keywords: (tuple) extra information to log, from the information return of environment.step - """ - EXT = "monitor.csv" - file_handler = None - - def __init__(self, - env: gym.Env, - filename: Optional[str], - allow_early_resets: bool = True, - reset_keywords=(), - info_keywords=()): - super(Monitor, self).__init__(env=env) - self.t_start = time.time() - if filename is None: - self.file_handler = None - self.logger = None - else: - if not filename.endswith(Monitor.EXT): - if os.path.isdir(filename): - filename = os.path.join(filename, Monitor.EXT) - else: - filename = filename + "." + Monitor.EXT - self.file_handler = open(filename, "wt") - self.file_handler.write('#%s\n' % json.dumps({"t_start": self.t_start, 'env_id': env.spec and env.spec.id})) - self.logger = csv.DictWriter(self.file_handler, - fieldnames=('r', 'l', 't') + reset_keywords + info_keywords) - self.logger.writeheader() - self.file_handler.flush() - - self.reset_keywords = reset_keywords - self.info_keywords = info_keywords - self.allow_early_resets = allow_early_resets - self.rewards = None - self.needs_reset = True - self.episode_rewards = [] - self.episode_lengths = [] - self.episode_times = [] - self.total_steps = 0 - self.current_reset_info = {} # extra info about the current episode, that was passed in during reset() - - def reset(self, **kwargs) -> np.ndarray: - """ - Calls the Gym environment reset. Can only be called if the environment is over, or if allow_early_resets is True - - :param kwargs: Extra keywords saved for the next episode. 
only if defined by reset_keywords - :return: (np.ndarray) the first observation of the environment - """ - if not self.allow_early_resets and not self.needs_reset: - raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, " - "wrap your env with Monitor(env, path, allow_early_resets=True)") - self.rewards = [] - self.needs_reset = False - for key in self.reset_keywords: - value = kwargs.get(key) - if value is None: - raise ValueError('Expected you to pass kwarg {} into reset'.format(key)) - self.current_reset_info[key] = value - return self.env.reset(**kwargs) - - def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, Dict[Any, Any]]: - """ - Step the environment with the given action - - :param action: (np.ndarray) the action - :return: (Tuple[np.ndarray, float, bool, Dict[Any, Any]]) observation, reward, done, information - """ - if self.needs_reset: - raise RuntimeError("Tried to step environment that needs reset") - observation, reward, done, info = self.env.step(action) - self.rewards.append(reward) - if done: - self.needs_reset = True - ep_rew = sum(self.rewards) - eplen = len(self.rewards) - ep_info = {"r": round(ep_rew, 6), "l": eplen, "t": round(time.time() - self.t_start, 6)} - for key in self.info_keywords: - ep_info[key] = info[key] - self.episode_rewards.append(ep_rew) - self.episode_lengths.append(eplen) - self.episode_times.append(time.time() - self.t_start) - ep_info.update(self.current_reset_info) - if self.logger: - self.logger.writerow(ep_info) - self.file_handler.flush() - info['episode'] = ep_info - self.total_steps += 1 - return observation, reward, done, info - - def close(self): - """ - Closes the environment - """ - super(Monitor, self).close() - if self.file_handler is not None: - self.file_handler.close() - - def get_total_steps(self) -> int: - """ - Returns the total number of timesteps - - :return: (int) - """ - return self.total_steps - - def get_episode_rewards(self) -> 
List[float]: - """ - Returns the rewards of all the episodes - - :return: ([float]) - """ - return self.episode_rewards - - def get_episode_lengths(self) -> List[int]: - """ - Returns the number of timesteps of all the episodes - - :return: ([int]) - """ - return self.episode_lengths - - def get_episode_times(self) -> List[float]: - """ - Returns the runtime in seconds of all the episodes - - :return: ([float]) - """ - return self.episode_times - - -class LoadMonitorResultsError(Exception): - """ - Raised when loading the monitor log fails. - """ - pass - - -def get_monitor_files(path: str) -> List[str]: - """ - get all the monitor files in the given path - - :param path: (str) the logging folder - :return: ([str]) the log files - """ - return glob(os.path.join(path, "*" + Monitor.EXT)) - - -def load_results(path: str) -> pandas.DataFrame: - """ - Load all Monitor logs from a given directory path matching ``*monitor.csv`` and ``*monitor.json`` - - :param path: (str) the directory path containing the log file(s) - :return: (pandas.DataFrame) the logged data - """ - # get both csv and (old) json files - monitor_files = (glob(os.path.join(path, "*monitor.json")) + get_monitor_files(path)) - if not monitor_files: - raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, path)) - data_frames = [] - headers = [] - for file_name in monitor_files: - with open(file_name, 'rt') as file_handler: - if file_name.endswith('csv'): - first_line = file_handler.readline() - assert first_line[0] == '#' - header = json.loads(first_line[1:]) - data_frame = pandas.read_csv(file_handler, index_col=None) - headers.append(header) - elif file_name.endswith('json'): # Deprecated json format - episodes = [] - lines = file_handler.readlines() - header = json.loads(lines[0]) - headers.append(header) - for line in lines[1:]: - episode = json.loads(line) - episodes.append(episode) - data_frame = pandas.DataFrame(episodes) - else: - assert 0, 'unreachable' - 
data_frame['t'] += header['t_start'] - data_frames.append(data_frame) - data_frame = pandas.concat(data_frames) - data_frame.sort_values('t', inplace=True) - data_frame.reset_index(inplace=True) - data_frame['t'] -= min(header['t_start'] for header in headers) - # data_frame.headers = headers # HACK to preserve backwards compatibility - return data_frame diff --git a/stable_baselines/common/__init__.py b/stable_baselines/common/__init__.py deleted file mode 100644 index 7087980e..00000000 --- a/stable_baselines/common/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# flake8: noqa F403 -from stable_baselines.common.console_util import fmt_row, fmt_item, colorize -from stable_baselines.common.dataset import Dataset -from stable_baselines.common.math_util import discount, discount_with_boundaries, explained_variance, \ - explained_variance_2d, flatten_arrays, unflatten_vector -from stable_baselines.common.misc_util import zipsame, set_global_seeds, boolean_flag -from stable_baselines.common.base_class import BaseRLModel, ActorCriticRLModel, OffPolicyRLModel, SetVerbosity, \ - TensorboardWriter -from stable_baselines.common.cmd_util import make_vec_env diff --git a/stable_baselines/common/atari_wrappers.py b/stable_baselines/common/atari_wrappers.py deleted file mode 100644 index 2a0488f3..00000000 --- a/stable_baselines/common/atari_wrappers.py +++ /dev/null @@ -1,313 +0,0 @@ -from collections import deque - -import numpy as np -import gym -from gym import spaces -import cv2 # pytype:disable=import-error -cv2.ocl.setUseOpenCL(False) - - -class NoopResetEnv(gym.Wrapper): - def __init__(self, env, noop_max=30): - """ - Sample initial states by taking random number of no-ops on reset. - No-op is assumed to be action 0. 
- - :param env: (Gym Environment) the environment to wrap - :param noop_max: (int) the maximum value of no-ops to run - """ - gym.Wrapper.__init__(self, env) - self.noop_max = noop_max - self.override_num_noops = None - self.noop_action = 0 - assert env.unwrapped.get_action_meanings()[0] == 'NOOP' - - def reset(self, **kwargs): - self.env.reset(**kwargs) - if self.override_num_noops is not None: - noops = self.override_num_noops - else: - noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) - assert noops > 0 - obs = None - for _ in range(noops): - obs, _, done, _ = self.env.step(self.noop_action) - if done: - obs = self.env.reset(**kwargs) - return obs - - def step(self, action): - return self.env.step(action) - - -class FireResetEnv(gym.Wrapper): - def __init__(self, env): - """ - Take action on reset for environments that are fixed until firing. - - :param env: (Gym Environment) the environment to wrap - """ - gym.Wrapper.__init__(self, env) - assert env.unwrapped.get_action_meanings()[1] == 'FIRE' - assert len(env.unwrapped.get_action_meanings()) >= 3 - - def reset(self, **kwargs): - self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(1) - if done: - self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(2) - if done: - self.env.reset(**kwargs) - return obs - - def step(self, action): - return self.env.step(action) - - -class EpisodicLifeEnv(gym.Wrapper): - def __init__(self, env): - """ - Make end-of-life == end-of-episode, but only reset on true game over. - Done by DeepMind for the DQN and co. since it helps value estimation. 
- - :param env: (Gym Environment) the environment to wrap - """ - gym.Wrapper.__init__(self, env) - self.lives = 0 - self.was_real_done = True - - def step(self, action): - obs, reward, done, info = self.env.step(action) - self.was_real_done = done - # check current lives, make loss of life terminal, - # then update lives to handle bonus lives - lives = self.env.unwrapped.ale.lives() - if 0 < lives < self.lives: - # for Qbert sometimes we stay in lives == 0 condtion for a few frames - # so its important to keep lives > 0, so that we only reset once - # the environment advertises done. - done = True - self.lives = lives - return obs, reward, done, info - - def reset(self, **kwargs): - """ - Calls the Gym environment reset, only when lives are exhausted. - This way all states are still reachable even though lives are episodic, - and the learner need not know about any of this behind-the-scenes. - - :param kwargs: Extra keywords passed to env.reset() call - :return: ([int] or [float]) the first observation of the environment - """ - if self.was_real_done: - obs = self.env.reset(**kwargs) - else: - # no-op step to advance from terminal/lost life state - obs, _, _, _ = self.env.step(0) - self.lives = self.env.unwrapped.ale.lives() - return obs - - -class MaxAndSkipEnv(gym.Wrapper): - def __init__(self, env, skip=4): - """ - Return only every `skip`-th frame (frameskipping) - - :param env: (Gym Environment) the environment - :param skip: (int) number of `skip`-th frame - """ - gym.Wrapper.__init__(self, env) - # most recent raw observations (for max pooling across time steps) - self._obs_buffer = np.zeros((2,) + env.observation_space.shape, dtype=env.observation_space.dtype) - self._skip = skip - - def step(self, action): - """ - Step the environment with the given action - Repeat action, sum reward, and max over last observations. 
- - :param action: ([int] or [float]) the action - :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information - """ - total_reward = 0.0 - done = None - for i in range(self._skip): - obs, reward, done, info = self.env.step(action) - if i == self._skip - 2: - self._obs_buffer[0] = obs - if i == self._skip - 1: - self._obs_buffer[1] = obs - total_reward += reward - if done: - break - # Note that the observation on the done=True frame - # doesn't matter - max_frame = self._obs_buffer.max(axis=0) - - return max_frame, total_reward, done, info - - def reset(self, **kwargs): - return self.env.reset(**kwargs) - - -class ClipRewardEnv(gym.RewardWrapper): - def __init__(self, env): - """ - clips the reward to {+1, 0, -1} by its sign. - - :param env: (Gym Environment) the environment - """ - gym.RewardWrapper.__init__(self, env) - - def reward(self, reward): - """ - Bin reward to {+1, 0, -1} by its sign. - - :param reward: (float) - """ - return np.sign(reward) - - -class WarpFrame(gym.ObservationWrapper): - def __init__(self, env): - """ - Warp frames to 84x84 as done in the Nature paper and later work. - - :param env: (Gym Environment) the environment - """ - gym.ObservationWrapper.__init__(self, env) - self.width = 84 - self.height = 84 - self.observation_space = spaces.Box(low=0, high=255, shape=(self.height, self.width, 1), - dtype=env.observation_space.dtype) - - def observation(self, frame): - """ - returns the current observation from a frame - - :param frame: ([int] or [float]) environment frame - :return: ([int] or [float]) the observation - """ - frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) - frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) - return frame[:, :, None] - - -class FrameStack(gym.Wrapper): - def __init__(self, env, n_frames): - """Stack n_frames last frames. - - Returns lazy array, which is much more memory efficient. 
- - See Also - -------- - stable_baselines.common.atari_wrappers.LazyFrames - - :param env: (Gym Environment) the environment - :param n_frames: (int) the number of frames to stack - """ - gym.Wrapper.__init__(self, env) - self.n_frames = n_frames - self.frames = deque([], maxlen=n_frames) - shp = env.observation_space.shape - self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * n_frames), - dtype=env.observation_space.dtype) - - def reset(self, **kwargs): - obs = self.env.reset(**kwargs) - for _ in range(self.n_frames): - self.frames.append(obs) - return self._get_ob() - - def step(self, action): - obs, reward, done, info = self.env.step(action) - self.frames.append(obs) - return self._get_ob(), reward, done, info - - def _get_ob(self): - assert len(self.frames) == self.n_frames - return LazyFrames(list(self.frames)) - - -class ScaledFloatFrame(gym.ObservationWrapper): - def __init__(self, env): - gym.ObservationWrapper.__init__(self, env) - self.observation_space = spaces.Box(low=0, high=1.0, shape=env.observation_space.shape, dtype=np.float32) - - def observation(self, observation): - # careful! This undoes the memory optimization, use - # with smaller replay buffers only. - return np.array(observation).astype(np.float32) / 255.0 - - -class LazyFrames(object): - def __init__(self, frames): - """ - This object ensures that common frames between the observations are only stored once. - It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay - buffers. - - This object should only be converted to np.ndarray before being passed to the model. 
- - :param frames: ([int] or [float]) environment frames - """ - self._frames = frames - self._out = None - - def _force(self): - if self._out is None: - self._out = np.concatenate(self._frames, axis=2) - self._frames = None - return self._out - - def __array__(self, dtype=None): - out = self._force() - if dtype is not None: - out = out.astype(dtype) - return out - - def __len__(self): - return len(self._force()) - - def __getitem__(self, i): - return self._force()[i] - - -def make_atari(env_id): - """ - Create a wrapped atari Environment - - :param env_id: (str) the environment ID - :return: (Gym Environment) the wrapped atari environment - """ - env = gym.make(env_id) - assert 'NoFrameskip' in env.spec.id - env = NoopResetEnv(env, noop_max=30) - env = MaxAndSkipEnv(env, skip=4) - return env - - -def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): - """ - Configure environment for DeepMind-style Atari. - - :param env: (Gym Environment) the atari environment - :param episode_life: (bool) wrap the episode life wrapper - :param clip_rewards: (bool) wrap the reward clipping wrapper - :param frame_stack: (bool) wrap the frame stacking wrapper - :param scale: (bool) wrap the scaling observation wrapper - :return: (Gym Environment) the wrapped atari environment - """ - if episode_life: - env = EpisodicLifeEnv(env) - if 'FIRE' in env.unwrapped.get_action_meanings(): - env = FireResetEnv(env) - env = WarpFrame(env) - if scale: - env = ScaledFloatFrame(env) - if clip_rewards: - env = ClipRewardEnv(env) - if frame_stack: - env = FrameStack(env, 4) - return env diff --git a/stable_baselines/common/base_class.py b/stable_baselines/common/base_class.py deleted file mode 100644 index 50431288..00000000 --- a/stable_baselines/common/base_class.py +++ /dev/null @@ -1,1190 +0,0 @@ -import os -import glob -import json -import zipfile -import warnings -from abc import ABC, abstractmethod -from collections import OrderedDict, deque -from 
typing import Union, List, Callable, Optional - -import gym -import cloudpickle -import numpy as np -import tensorflow as tf - -from stable_baselines.common.misc_util import set_global_seeds -from stable_baselines.common.save_util import data_to_json, json_to_data, params_to_bytes, bytes_to_params -from stable_baselines.common.policies import get_policy_from_name, ActorCriticPolicy -from stable_baselines.common.runners import AbstractEnvRunner -from stable_baselines.common.vec_env import (VecEnvWrapper, VecEnv, DummyVecEnv, - VecNormalize, unwrap_vec_normalize) -from stable_baselines.common.callbacks import BaseCallback, CallbackList, ConvertCallback -from stable_baselines import logger - - -class BaseRLModel(ABC): - """ - The base RL model - - :param policy: (BasePolicy) Policy object - :param env: (Gym environment) The environment to learn from - (if registered in Gym, can be str. Can be None for loading trained models) - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :param requires_vec_env: (bool) Does this model require a vectorized environment - :param policy_base: (BasePolicy) the base policy used by this method - :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). - If None (default), use random seed. Note that if you want completely deterministic - results, you must set `n_cpu_tf_sess` to 1. - :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations - If None, the number of cpu of the current machine will be used. 
- """ - - def __init__(self, policy, env, verbose=0, *, requires_vec_env, policy_base, - policy_kwargs=None, seed=None, n_cpu_tf_sess=None): - if isinstance(policy, str) and policy_base is not None: - self.policy = get_policy_from_name(policy_base, policy) - else: - self.policy = policy - self.env = env - self.verbose = verbose - self._requires_vec_env = requires_vec_env - self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs - self.observation_space = None - self.action_space = None - self.n_envs = None - self._vectorize_action = False - self.num_timesteps = 0 - self.graph = None - self.sess = None - self.params = None - self.seed = seed - self._param_load_ops = None - self.n_cpu_tf_sess = n_cpu_tf_sess - self.episode_reward = None - self.ep_info_buf = None - - if env is not None: - if isinstance(env, str): - if self.verbose >= 1: - print("Creating environment from the given name, wrapped in a DummyVecEnv.") - self.env = env = DummyVecEnv([lambda: gym.make(env)]) - - self.observation_space = env.observation_space - self.action_space = env.action_space - if requires_vec_env: - if isinstance(env, VecEnv): - self.n_envs = env.num_envs - else: - # The model requires a VecEnv - # wrap it in a DummyVecEnv to avoid error - self.env = DummyVecEnv([lambda: env]) - if self.verbose >= 1: - print("Wrapping the env in a DummyVecEnv.") - self.n_envs = 1 - else: - if isinstance(env, VecEnv): - if env.num_envs == 1: - self.env = _UnvecWrapper(env) - self._vectorize_action = True - else: - raise ValueError("Error: the model requires a non vectorized environment or a single vectorized" - " environment.") - self.n_envs = 1 - - # Get VecNormalize object if it exists - self._vec_normalize_env = unwrap_vec_normalize(self.env) - - def get_env(self): - """ - returns the current environment (can be None if not defined) - - :return: (Gym Environment) The current environment - """ - return self.env - - def get_vec_normalize_env(self) -> Optional[VecNormalize]: - """ - Return 
the ``VecNormalize`` wrapper of the training env - if it exists. - - :return: Optional[VecNormalize] The ``VecNormalize`` env. - """ - return self._vec_normalize_env - - def set_env(self, env): - """ - Checks the validity of the environment, and if it is coherent, set it as the current environment. - - :param env: (Gym Environment) The environment for learning a policy - """ - if env is None and self.env is None: - if self.verbose >= 1: - print("Loading a model without an environment, " - "this model cannot be trained until it has a valid environment.") - return - elif env is None: - raise ValueError("Error: trying to replace the current environment with None") - - # sanity checking the environment - assert self.observation_space == env.observation_space, \ - "Error: the environment passed must have at least the same observation space as the model was trained on." - assert self.action_space == env.action_space, \ - "Error: the environment passed must have at least the same action space as the model was trained on." - if self._requires_vec_env: - assert isinstance(env, VecEnv), \ - "Error: the environment passed is not a vectorized environment, however {} requires it".format( - self.__class__.__name__) - assert not self.policy.recurrent or self.n_envs == env.num_envs, \ - "Error: the environment passed must have the same number of environments as the model was trained on." \ - "This is due to the Lstm policy not being capable of changing the number of environments." - self.n_envs = env.num_envs - else: - # for models that dont want vectorized environment, check if they make sense and adapt them. 
- # Otherwise tell the user about this issue - if isinstance(env, VecEnv): - if env.num_envs == 1: - env = _UnvecWrapper(env) - self._vectorize_action = True - else: - raise ValueError("Error: the model requires a non vectorized environment or a single vectorized " - "environment.") - else: - self._vectorize_action = False - - self.n_envs = 1 - - self.env = env - self._vec_normalize_env = unwrap_vec_normalize(env) - - # Invalidated by environment change. - self.episode_reward = None - self.ep_info_buf = None - - def _init_num_timesteps(self, reset_num_timesteps=True): - """ - Initialize and resets num_timesteps (total timesteps since beginning of training) - if needed. Mainly used logging and plotting (tensorboard). - - :param reset_num_timesteps: (bool) Set it to false when continuing training - to not create new plotting curves in tensorboard. - :return: (bool) Whether a new tensorboard log needs to be created - """ - if reset_num_timesteps: - self.num_timesteps = 0 - - new_tb_log = self.num_timesteps == 0 - return new_tb_log - - @abstractmethod - def setup_model(self): - """ - Create all the functions and tensorflow graphs necessary to train the model - """ - pass - - def _init_callback(self, - callback: Union[None, Callable, List[BaseCallback], BaseCallback] - ) -> BaseCallback: - """ - :param callback: (Union[None, Callable, List[BaseCallback], BaseCallback]) - :return: (BaseCallback) - """ - # Convert a list of callbacks into a callback - if isinstance(callback, list): - callback = CallbackList(callback) - # Convert functional callback to object - if not isinstance(callback, BaseCallback): - callback = ConvertCallback(callback) - - callback.init_callback(self) - return callback - - def set_random_seed(self, seed: Optional[int]) -> None: - """ - :param seed: (Optional[int]) Seed for the pseudo-random generators. If None, - do not change the seeds. 
- """ - # Ignore if the seed is None - if seed is None: - return - # Seed python, numpy and tf random generator - set_global_seeds(seed) - if self.env is not None: - self.env.seed(seed) - # Seed the action space - # useful when selecting random actions - self.env.action_space.seed(seed) - self.action_space.seed(seed) - - def _setup_learn(self): - """ - Check the environment. - """ - if self.env is None: - raise ValueError("Error: cannot train the model without a valid environment, please set an environment with" - "set_env(self, env) method.") - if self.episode_reward is None: - self.episode_reward = np.zeros((self.n_envs,)) - if self.ep_info_buf is None: - self.ep_info_buf = deque(maxlen=100) - - @abstractmethod - def get_parameter_list(self): - """ - Get tensorflow Variables of model's parameters - - This includes all variables necessary for continuing training (saving / loading). - - :return: (list) List of tensorflow Variables - """ - pass - - def get_parameters(self): - """ - Get current model parameters as dictionary of variable name -> ndarray. - - :return: (OrderedDict) Dictionary of variable name -> ndarray of model's parameters. 
- """ - parameters = self.get_parameter_list() - parameter_values = self.sess.run(parameters) - return_dictionary = OrderedDict((param.name, value) for param, value in zip(parameters, parameter_values)) - return return_dictionary - - def _setup_load_operations(self): - """ - Create tensorflow operations for loading model parameters - """ - # Assume tensorflow graphs are static -> check - # that we only call this function once - if self._param_load_ops is not None: - raise RuntimeError("Parameter load operations have already been created") - # For each loadable parameter, create appropiate - # placeholder and an assign op, and store them to - # self.load_param_ops as dict of variable.name -> (placeholder, assign) - loadable_parameters = self.get_parameter_list() - # Use OrderedDict to store order for backwards compatibility with - # list-based params - self._param_load_ops = OrderedDict() - with self.graph.as_default(): - for param in loadable_parameters: - placeholder = tf.placeholder(dtype=param.dtype, shape=param.shape) - # param.name is unique (tensorflow variables have unique names) - self._param_load_ops[param.name] = (placeholder, param.assign(placeholder)) - - @abstractmethod - def _get_pretrain_placeholders(self): - """ - Return the placeholders needed for the pretraining: - - obs_ph: observation placeholder - - actions_ph will be population with an action from the environment - (from the expert dataset) - - deterministic_actions_ph: e.g., in the case of a Gaussian policy, - the mean. - - :return: ((tf.placeholder)) (obs_ph, actions_ph, deterministic_actions_ph) - """ - pass - - def pretrain(self, dataset, n_epochs=10, learning_rate=1e-4, - adam_epsilon=1e-8, val_interval=None): - """ - Pretrain a model using behavior cloning: - supervised learning given an expert dataset. - - NOTE: only Box and Discrete spaces are supported for now. 
- - :param dataset: (ExpertDataset) Dataset manager - :param n_epochs: (int) Number of iterations on the training set - :param learning_rate: (float) Learning rate - :param adam_epsilon: (float) the epsilon value for the adam optimizer - :param val_interval: (int) Report training and validation losses every n epochs. - By default, every 10th of the maximum number of epochs. - :return: (BaseRLModel) the pretrained model - """ - continuous_actions = isinstance(self.action_space, gym.spaces.Box) - discrete_actions = isinstance(self.action_space, gym.spaces.Discrete) - - assert discrete_actions or continuous_actions, 'Only Discrete and Box action spaces are supported' - - # Validate the model every 10% of the total number of iteration - if val_interval is None: - # Prevent modulo by zero - if n_epochs < 10: - val_interval = 1 - else: - val_interval = int(n_epochs / 10) - - with self.graph.as_default(): - with tf.variable_scope('pretrain', reuse=tf.AUTO_REUSE): - if continuous_actions: - obs_ph, actions_ph, deterministic_actions_ph = self._get_pretrain_placeholders() - loss = tf.reduce_mean(tf.square(actions_ph - deterministic_actions_ph)) - else: - obs_ph, actions_ph, actions_logits_ph = self._get_pretrain_placeholders() - # actions_ph has a shape if (n_batch,), we reshape it to (n_batch, 1) - # so no additional changes is needed in the dataloader - actions_ph = tf.expand_dims(actions_ph, axis=1) - one_hot_actions = tf.one_hot(actions_ph, self.action_space.n) - loss = tf.nn.softmax_cross_entropy_with_logits_v2( - logits=actions_logits_ph, - labels=tf.stop_gradient(one_hot_actions) - ) - loss = tf.reduce_mean(loss) - optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=adam_epsilon) - optim_op = optimizer.minimize(loss, var_list=self.params) - - self.sess.run(tf.global_variables_initializer()) - - if self.verbose > 0: - print("Pretraining with Behavior Cloning...") - - for epoch_idx in range(int(n_epochs)): - train_loss = 0.0 - # Full pass on the 
training set - for _ in range(len(dataset.train_loader)): - expert_obs, expert_actions = dataset.get_next_batch('train') - feed_dict = { - obs_ph: expert_obs, - actions_ph: expert_actions, - } - train_loss_, _ = self.sess.run([loss, optim_op], feed_dict) - train_loss += train_loss_ - - train_loss /= len(dataset.train_loader) - - if self.verbose > 0 and (epoch_idx + 1) % val_interval == 0: - val_loss = 0.0 - # Full pass on the validation set - for _ in range(len(dataset.val_loader)): - expert_obs, expert_actions = dataset.get_next_batch('val') - val_loss_, = self.sess.run([loss], {obs_ph: expert_obs, - actions_ph: expert_actions}) - val_loss += val_loss_ - - val_loss /= len(dataset.val_loader) - if self.verbose > 0: - print("==== Training progress {:.2f}% ====".format(100 * (epoch_idx + 1) / n_epochs)) - print('Epoch {}'.format(epoch_idx + 1)) - print("Training loss: {:.6f}, Validation loss: {:.6f}".format(train_loss, val_loss)) - print() - # Free memory - del expert_obs, expert_actions - if self.verbose > 0: - print("Pretraining done.") - return self - - @abstractmethod - def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="run", - reset_num_timesteps=True): - """ - Return a trained model. - - :param total_timesteps: (int) The total number of samples to train on - :param callback: (Union[callable, [callable], BaseCallback]) - function called at every steps with state of the algorithm. - It takes the local and global variables. If it returns False, training is aborted. - When the callback inherits from BaseCallback, you will have access - to additional stages of the training (training start/end), - please read the documentation for more details. - :param log_interval: (int) The number of timesteps before logging. 
- :param tb_log_name: (str) the name of the run for tensorboard log - :param reset_num_timesteps: (bool) whether or not to reset the current timestep number (used in logging) - :return: (BaseRLModel) the trained model - """ - pass - - @abstractmethod - def predict(self, observation, state=None, mask=None, deterministic=False): - """ - Get the model's action from an observation - - :param observation: (np.ndarray) the input observation - :param state: (np.ndarray) The last states (can be None, used in recurrent policies) - :param mask: (np.ndarray) The last masks (can be None, used in recurrent policies) - :param deterministic: (bool) Whether or not to return deterministic actions. - :return: (np.ndarray, np.ndarray) the model's action and the next state (used in recurrent policies) - """ - pass - - @abstractmethod - def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): - """ - If ``actions`` is ``None``, then get the model's action probability distribution from a given observation. - - Depending on the action space the output is: - - Discrete: probability for each possible action - - Box: mean and standard deviation of the action output - - However if ``actions`` is not ``None``, this function will return the probability that the given actions are - taken with the given parameters (observation, state, ...) on this model. For discrete action spaces, it - returns the probability mass; for continuous action spaces, the probability density. 
This is since the - probability mass will always be zero in continuous spaces, see http://blog.christianperone.com/2019/01/ - for a good explanation - - :param observation: (np.ndarray) the input observation - :param state: (np.ndarray) The last states (can be None, used in recurrent policies) - :param mask: (np.ndarray) The last masks (can be None, used in recurrent policies) - :param actions: (np.ndarray) (OPTIONAL) For calculating the likelihood that the given actions are chosen by - the model for each of the given parameters. Must have the same number of actions and observations. - (set to None to return the complete action probability distribution) - :param logp: (bool) (OPTIONAL) When specified with actions, returns probability in log-space. - This has no effect if actions is None. - :return: (np.ndarray) the model's (log) action probability - """ - pass - - def load_parameters(self, load_path_or_dict, exact_match=True): - """ - Load model parameters from a file or a dictionary - - Dictionary keys should be tensorflow variable names, which can be obtained - with ``get_parameters`` function. If ``exact_match`` is True, dictionary - should contain keys for all model's parameters, otherwise RunTimeError - is raised. If False, only variables included in the dictionary will be updated. - - This does not load agent's hyper-parameters. - - .. warning:: - This function does not update trainer/optimizer variables (e.g. momentum). - As such training after using this function may lead to less-than-optimal results. - - :param load_path_or_dict: (str or file-like or dict) Save parameter location - or dict of parameters as variable.name -> ndarrays to be loaded. - :param exact_match: (bool) If True, expects load dictionary to contain keys for - all variables in the model. If False, loads parameters only for variables - mentioned in the dictionary. Defaults to True. 
- """ - # Make sure we have assign ops - if self._param_load_ops is None: - self._setup_load_operations() - - if isinstance(load_path_or_dict, dict): - # Assume `load_path_or_dict` is dict of variable.name -> ndarrays we want to load - params = load_path_or_dict - elif isinstance(load_path_or_dict, list): - warnings.warn("Loading model parameters from a list. This has been replaced " + - "with parameter dictionaries with variable names and parameters. " + - "If you are loading from a file, consider re-saving the file.", - DeprecationWarning) - # Assume `load_path_or_dict` is list of ndarrays. - # Create param dictionary assuming the parameters are in same order - # as `get_parameter_list` returns them. - params = dict() - for i, param_name in enumerate(self._param_load_ops.keys()): - params[param_name] = load_path_or_dict[i] - else: - # Assume a filepath or file-like. - # Use existing deserializer to load the parameters. - # We only need the parameters part of the file, so - # only load that part. - _, params = BaseRLModel._load_from_file(load_path_or_dict, load_data=False) - params = dict(params) - - feed_dict = {} - param_update_ops = [] - # Keep track of not-updated variables - not_updated_variables = set(self._param_load_ops.keys()) - for param_name, param_value in params.items(): - placeholder, assign_op = self._param_load_ops[param_name] - feed_dict[placeholder] = param_value - # Create list of tf.assign operations for sess.run - param_update_ops.append(assign_op) - # Keep track which variables are updated - not_updated_variables.remove(param_name) - - # Check that we updated all parameters if exact_match=True - if exact_match and len(not_updated_variables) > 0: - raise RuntimeError("Load dictionary did not contain all variables. 
" + - "Missing variables: {}".format(", ".join(not_updated_variables))) - - self.sess.run(param_update_ops, feed_dict=feed_dict) - - @abstractmethod - def save(self, save_path, cloudpickle=False): - """ - Save the current parameters to file - - :param save_path: (str or file-like) The save location - :param cloudpickle: (bool) Use older cloudpickle format instead of zip-archives. - """ - raise NotImplementedError() - - @classmethod - @abstractmethod - def load(cls, load_path, env=None, custom_objects=None, **kwargs): - """ - Load the model from file - - :param load_path: (str or file-like) the saved parameter location - :param env: (Gym Environment) the new environment to run the loaded model on - (can be None if you only need prediction from a trained model) - :param custom_objects: (dict) Dictionary of objects to replace - upon loading. If a variable is present in this dictionary as a - key, it will not be deserialized and the corresponding item - will be used instead. Similar to custom_objects in - `keras.models.load_model`. Useful when you have an object in - file that can not be deserialized. 
- :param kwargs: extra arguments to change the model when loading - """ - raise NotImplementedError() - - @staticmethod - def _save_to_file_cloudpickle(save_path, data=None, params=None): - """Legacy code for saving models with cloudpickle - - :param save_path: (str or file-like) Where to store the model - :param data: (OrderedDict) Class parameters being stored - :param params: (OrderedDict) Model parameters being stored - """ - if isinstance(save_path, str): - _, ext = os.path.splitext(save_path) - if ext == "": - save_path += ".pkl" - - with open(save_path, "wb") as file_: - cloudpickle.dump((data, params), file_) - else: - # Here save_path is a file-like object, not a path - cloudpickle.dump((data, params), save_path) - - @staticmethod - def _save_to_file_zip(save_path, data=None, params=None): - """Save model to a .zip archive - - :param save_path: (str or file-like) Where to store the model - :param data: (OrderedDict) Class parameters being stored - :param params: (OrderedDict) Model parameters being stored - """ - # data/params can be None, so do not - # try to serialize them blindly - if data is not None: - serialized_data = data_to_json(data) - if params is not None: - serialized_params = params_to_bytes(params) - # We also have to store list of the parameters - # to store the ordering for OrderedDict. - # We can trust these to be strings as they - # are taken from the Tensorflow graph. - serialized_param_list = json.dumps( - list(params.keys()), - indent=4 - ) - - # Check postfix if save_path is a string - if isinstance(save_path, str): - _, ext = os.path.splitext(save_path) - if ext == "": - save_path += ".zip" - - # Create a zip-archive and write our objects - # there. 
This works when save_path - # is either str or a file-like - with zipfile.ZipFile(save_path, "w") as file_: - # Do not try to save "None" elements - if data is not None: - file_.writestr("data", serialized_data) - if params is not None: - file_.writestr("parameters", serialized_params) - file_.writestr("parameter_list", serialized_param_list) - - @staticmethod - def _save_to_file(save_path, data=None, params=None, cloudpickle=False): - """Save model to a zip archive or cloudpickle file. - - :param save_path: (str or file-like) Where to store the model - :param data: (OrderedDict) Class parameters being stored - :param params: (OrderedDict) Model parameters being stored - :param cloudpickle: (bool) Use old cloudpickle format - (stable-baselines<=2.7.0) instead of a zip archive. - """ - if cloudpickle: - BaseRLModel._save_to_file_cloudpickle(save_path, data, params) - else: - BaseRLModel._save_to_file_zip(save_path, data, params) - - @staticmethod - def _load_from_file_cloudpickle(load_path): - """Legacy code for loading older models stored with cloudpickle - - :param load_path: (str or file-like) where from to load the file - :return: (dict, OrderedDict) Class parameters and model parameters - """ - if isinstance(load_path, str): - if not os.path.exists(load_path): - if os.path.exists(load_path + ".pkl"): - load_path += ".pkl" - else: - raise ValueError("Error: the file {} could not be found".format(load_path)) - - with open(load_path, "rb") as file_: - data, params = cloudpickle.load(file_) - else: - # Here load_path is a file-like object, not a path - data, params = cloudpickle.load(load_path) - - return data, params - - @staticmethod - def _load_from_file(load_path, load_data=True, custom_objects=None): - """Load model data from a .zip archive - - :param load_path: (str or file-like) Where to load model from - :param load_data: (bool) Whether we should load and return data - (class parameters). 
Mainly used by `load_parameters` to - only load model parameters (weights). - :param custom_objects: (dict) Dictionary of objects to replace - upon loading. If a variable is present in this dictionary as a - key, it will not be deserialized and the corresponding item - will be used instead. Similar to custom_objects in - `keras.models.load_model`. Useful when you have an object in - file that can not be deserialized. - :return: (dict, OrderedDict) Class parameters and model parameters - """ - # Check if file exists if load_path is - # a string - if isinstance(load_path, str): - if not os.path.exists(load_path): - if os.path.exists(load_path + ".zip"): - load_path += ".zip" - else: - raise ValueError("Error: the file {} could not be found".format(load_path)) - - # Open the zip archive and load data. - try: - with zipfile.ZipFile(load_path, "r") as file_: - namelist = file_.namelist() - # If data or parameters is not in the - # zip archive, assume they were stored - # as None (_save_to_file allows this). - data = None - params = None - if "data" in namelist and load_data: - # Load class parameters and convert to string - # (Required for json library in Python 3.5) - json_data = file_.read("data").decode() - data = json_to_data(json_data, custom_objects=custom_objects) - - if "parameters" in namelist: - # Load parameter list and and parameters - parameter_list_json = file_.read("parameter_list").decode() - parameter_list = json.loads(parameter_list_json) - serialized_params = file_.read("parameters") - params = bytes_to_params( - serialized_params, parameter_list - ) - except zipfile.BadZipFile: - # load_path wasn't a zip file. Possibly a cloudpickle - # file. Show a warning and fall back to loading cloudpickle. - warnings.warn("It appears you are loading from a file with old format. " + - "Older cloudpickle format has been replaced with zip-archived " + - "models. 
Consider saving the model with new format.", - DeprecationWarning) - # Attempt loading with the cloudpickle format. - # If load_path is file-like, seek back to beginning of file - if not isinstance(load_path, str): - load_path.seek(0) - data, params = BaseRLModel._load_from_file_cloudpickle(load_path) - - return data, params - - @staticmethod - def _softmax(x_input): - """ - An implementation of softmax. - - :param x_input: (numpy float) input vector - :return: (numpy float) output vector - """ - x_exp = np.exp(x_input.T - np.max(x_input.T, axis=0)) - return (x_exp / x_exp.sum(axis=0)).T - - @staticmethod - def _is_vectorized_observation(observation, observation_space): - """ - For every observation type, detects and validates the shape, - then returns whether or not the observation is vectorized. - - :param observation: (np.ndarray) the input observation to validate - :param observation_space: (gym.spaces) the observation space - :return: (bool) whether the given observation is vectorized or not - """ - if isinstance(observation_space, gym.spaces.Box): - if observation.shape == observation_space.shape: - return False - elif observation.shape[1:] == observation_space.shape: - return True - else: - raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) + - "Box environment, please use {} ".format(observation_space.shape) + - "or (n_env, {}) for the observation shape." 
- .format(", ".join(map(str, observation_space.shape)))) - elif isinstance(observation_space, gym.spaces.Discrete): - if observation.shape == (): # A numpy array of a number, has shape empty tuple '()' - return False - elif len(observation.shape) == 1: - return True - else: - raise ValueError("Error: Unexpected observation shape {} for ".format(observation.shape) + - "Discrete environment, please use (1,) or (n_env, 1) for the observation shape.") - elif isinstance(observation_space, gym.spaces.MultiDiscrete): - if observation.shape == (len(observation_space.nvec),): - return False - elif len(observation.shape) == 2 and observation.shape[1] == len(observation_space.nvec): - return True - else: - raise ValueError("Error: Unexpected observation shape {} for MultiDiscrete ".format(observation.shape) + - "environment, please use ({},) or ".format(len(observation_space.nvec)) + - "(n_env, {}) for the observation shape.".format(len(observation_space.nvec))) - elif isinstance(observation_space, gym.spaces.MultiBinary): - if observation.shape == (observation_space.n,): - return False - elif len(observation.shape) == 2 and observation.shape[1] == observation_space.n: - return True - else: - raise ValueError("Error: Unexpected observation shape {} for MultiBinary ".format(observation.shape) + - "environment, please use ({},) or ".format(observation_space.n) + - "(n_env, {}) for the observation shape.".format(observation_space.n)) - else: - raise ValueError("Error: Cannot determine if the observation is vectorized with the space type {}." - .format(observation_space)) - - -class ActorCriticRLModel(BaseRLModel): - """ - The base class for Actor critic model - - :param policy: (BasePolicy) Policy object - :param env: (Gym environment) The environment to learn from - (if registered in Gym, can be str. 
Can be None for loading trained models) - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :param policy_base: (BasePolicy) the base policy used by this method (default=ActorCriticPolicy) - :param requires_vec_env: (bool) Does this model require a vectorized environment - :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). - If None (default), use random seed. Note that if you want completely deterministic - results, you must set `n_cpu_tf_sess` to 1. - :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations - If None, the number of cpu of the current machine will be used. - """ - - def __init__(self, policy, env, _init_setup_model, verbose=0, policy_base=ActorCriticPolicy, - requires_vec_env=False, policy_kwargs=None, seed=None, n_cpu_tf_sess=None): - super(ActorCriticRLModel, self).__init__(policy, env, verbose=verbose, requires_vec_env=requires_vec_env, - policy_base=policy_base, policy_kwargs=policy_kwargs, - seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) - - self.sess = None - self.initial_state = None - self.step = None - self.proba_step = None - self.params = None - self._runner = None - - def _make_runner(self) -> AbstractEnvRunner: - """Builds a new Runner. - - Lazily called whenever `self.runner` is accessed and `self._runner is None`. - """ - raise NotImplementedError("This model is not configured to use a Runner") - - @property - def runner(self) -> AbstractEnvRunner: - if self._runner is None: - self._runner = self._make_runner() - return self._runner - - def set_env(self, env): - self._runner = None # New environment invalidates `self._runner`. 
- super().set_env(env) - - @abstractmethod - def setup_model(self): - pass - - @abstractmethod - def learn(self, total_timesteps, callback=None, - log_interval=100, tb_log_name="run", reset_num_timesteps=True): - pass - - def predict(self, observation, state=None, mask=None, deterministic=False): - if state is None: - state = self.initial_state - if mask is None: - mask = [False for _ in range(self.n_envs)] - observation = np.array(observation) - vectorized_env = self._is_vectorized_observation(observation, self.observation_space) - - observation = observation.reshape((-1,) + self.observation_space.shape) - actions, _, states, _ = self.step(observation, state, mask, deterministic=deterministic) - - clipped_actions = actions - # Clip the actions to avoid out of bound error - if isinstance(self.action_space, gym.spaces.Box): - clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) - - if not vectorized_env: - if state is not None: - raise ValueError("Error: The environment must be vectorized when using recurrent policies.") - clipped_actions = clipped_actions[0] - - return clipped_actions, states - - def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): - if state is None: - state = self.initial_state - if mask is None: - mask = [False for _ in range(self.n_envs)] - observation = np.array(observation) - vectorized_env = self._is_vectorized_observation(observation, self.observation_space) - - observation = observation.reshape((-1,) + self.observation_space.shape) - actions_proba = self.proba_step(observation, state, mask) - - if len(actions_proba) == 0: # empty list means not implemented - warnings.warn("Warning: action probability is not implemented for {} action space. Returning None." 
- .format(type(self.action_space).__name__)) - return None - - if actions is not None: # comparing the action distribution, to given actions - prob = None - logprob = None - actions = np.array([actions]) - if isinstance(self.action_space, gym.spaces.Discrete): - actions = actions.reshape((-1,)) - assert observation.shape[0] == actions.shape[0], \ - "Error: batch sizes differ for actions and observations." - prob = actions_proba[np.arange(actions.shape[0]), actions] - - elif isinstance(self.action_space, gym.spaces.MultiDiscrete): - actions = actions.reshape((-1, len(self.action_space.nvec))) - assert observation.shape[0] == actions.shape[0], \ - "Error: batch sizes differ for actions and observations." - # Discrete action probability, over multiple categories - actions = np.swapaxes(actions, 0, 1) # swap axis for easier categorical split - prob = np.prod([proba[np.arange(act.shape[0]), act] - for proba, act in zip(actions_proba, actions)], axis=0) - - elif isinstance(self.action_space, gym.spaces.MultiBinary): - actions = actions.reshape((-1, self.action_space.n)) - assert observation.shape[0] == actions.shape[0], \ - "Error: batch sizes differ for actions and observations." - # Bernoulli action probability, for every action - prob = np.prod(actions_proba * actions + (1 - actions_proba) * (1 - actions), axis=1) - - elif isinstance(self.action_space, gym.spaces.Box): - actions = actions.reshape((-1, ) + self.action_space.shape) - mean, std = actions_proba - logstd = np.log(std) - - n_elts = np.prod(mean.shape[1:]) # first dimension is batch size - log_normalizer = n_elts / 2.0 * np.log(2 * np.pi) + np.sum(logstd, axis=1) - - # Diagonal Gaussian action probability, for every action - logprob = -np.sum(np.square(actions - mean) / (2 * np.square(std)), axis=1) - log_normalizer - - else: - warnings.warn("Warning: action_probability not implemented for {} actions space. Returning None." 
- .format(type(self.action_space).__name__)) - return None - - # Return in space (log or normal) requested by user, converting if necessary - if logp: - if logprob is None: - logprob = np.log(prob) - ret = logprob - else: - if prob is None: - prob = np.exp(logprob) - ret = prob - - # normalize action proba shape for the different gym spaces - ret = ret.reshape((-1, 1)) - else: - ret = actions_proba - - if not vectorized_env: - if state is not None: - raise ValueError("Error: The environment must be vectorized when using recurrent policies.") - ret = ret[0] - - return ret - - def get_parameter_list(self): - return self.params - - @abstractmethod - def save(self, save_path, cloudpickle=False): - pass - - @classmethod - def load(cls, load_path, env=None, custom_objects=None, **kwargs): - """ - Load the model from file - - :param load_path: (str or file-like) the saved parameter location - :param env: (Gym Environment) the new environment to run the loaded model on - (can be None if you only need prediction from a trained model) - :param custom_objects: (dict) Dictionary of objects to replace - upon loading. If a variable is present in this dictionary as a - key, it will not be deserialized and the corresponding item - will be used instead. Similar to custom_objects in - `keras.models.load_model`. Useful when you have an object in - file that can not be deserialized. - :param kwargs: extra arguments to change the model when loading - """ - data, params = cls._load_from_file(load_path, custom_objects=custom_objects) - - if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data['policy_kwargs']: - raise ValueError("The specified policy kwargs do not equal the stored policy kwargs. 
" - "Stored kwargs: {}, specified kwargs: {}".format(data['policy_kwargs'], - kwargs['policy_kwargs'])) - - model = cls(policy=data["policy"], env=None, _init_setup_model=False) # pytype: disable=not-instantiable - model.__dict__.update(data) - model.__dict__.update(kwargs) - model.set_env(env) - model.setup_model() - - model.load_parameters(params) - - return model - - -class OffPolicyRLModel(BaseRLModel): - """ - The base class for off policy RL model - - :param policy: (BasePolicy) Policy object - :param env: (Gym environment) The environment to learn from - (if registered in Gym, can be str. Can be None for loading trained models) - :param replay_buffer: (ReplayBuffer) the type of replay buffer - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :param requires_vec_env: (bool) Does this model require a vectorized environment - :param policy_base: (BasePolicy) the base policy used by this method - :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). - If None (default), use random seed. Note that if you want completely deterministic - results, you must set `n_cpu_tf_sess` to 1. - :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations - If None, the number of cpu of the current machine will be used. 
- """ - - def __init__(self, policy, env, replay_buffer=None, _init_setup_model=False, verbose=0, *, - requires_vec_env=False, policy_base=None, - policy_kwargs=None, seed=None, n_cpu_tf_sess=None): - super(OffPolicyRLModel, self).__init__(policy, env, verbose=verbose, requires_vec_env=requires_vec_env, - policy_base=policy_base, policy_kwargs=policy_kwargs, - seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) - - self.replay_buffer = replay_buffer - - def is_using_her(self) -> bool: - """ - Check if is using HER - - :return: (bool) Whether is using HER or not - """ - # Avoid circular import - from stable_baselines.her.replay_buffer import HindsightExperienceReplayWrapper - return isinstance(self.replay_buffer, HindsightExperienceReplayWrapper) - - def replay_buffer_add(self, obs_t, action, reward, obs_tp1, done, info): - """ - Add a new transition to the replay buffer - - :param obs_t: (np.ndarray) the last observation - :param action: ([float]) the action - :param reward: (float) the reward of the transition - :param obs_tp1: (np.ndarray) the new observation - :param done: (bool) is the episode done - :param info: (dict) extra values used to compute the reward when using HER - """ - # Pass info dict when using HER, as it can be used to compute the reward - kwargs = dict(info=info) if self.is_using_her() else {} - self.replay_buffer.add(obs_t, action, reward, obs_tp1, float(done), **kwargs) - - @abstractmethod - def setup_model(self): - pass - - @abstractmethod - def learn(self, total_timesteps, callback=None, - log_interval=100, tb_log_name="run", reset_num_timesteps=True, replay_wrapper=None): - pass - - @abstractmethod - def predict(self, observation, state=None, mask=None, deterministic=False): - pass - - @abstractmethod - def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): - pass - - @abstractmethod - def save(self, save_path, cloudpickle=False): - pass - - @classmethod - def load(cls, load_path, env=None, custom_objects=None, 
**kwargs): - """ - Load the model from file - - :param load_path: (str or file-like) the saved parameter location - :param env: (Gym Environment) the new environment to run the loaded model on - (can be None if you only need prediction from a trained model) - :param custom_objects: (dict) Dictionary of objects to replace - upon loading. If a variable is present in this dictionary as a - key, it will not be deserialized and the corresponding item - will be used instead. Similar to custom_objects in - `keras.models.load_model`. Useful when you have an object in - file that can not be deserialized. - :param kwargs: extra arguments to change the model when loading - """ - data, params = cls._load_from_file(load_path, custom_objects=custom_objects) - - if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data['policy_kwargs']: - raise ValueError("The specified policy kwargs do not equal the stored policy kwargs. " - "Stored kwargs: {}, specified kwargs: {}".format(data['policy_kwargs'], - kwargs['policy_kwargs'])) - - model = cls(policy=data["policy"], env=None, _init_setup_model=False) # pytype: disable=not-instantiable - model.__dict__.update(data) - model.__dict__.update(kwargs) - model.set_env(env) - model.setup_model() - - model.load_parameters(params) - - return model - - -class _UnvecWrapper(VecEnvWrapper): - def __init__(self, venv): - """ - Unvectorize a vectorized environment, for vectorized environment that only have one environment - - :param venv: (VecEnv) the vectorized environment to wrap - """ - super().__init__(venv) - assert venv.num_envs == 1, "Error: cannot unwrap a environment wrapper that has more than one environment." 
- - def seed(self, seed=None): - return self.venv.env_method('seed', seed) - - def __getattr__(self, attr): - if attr in self.__dict__: - return getattr(self, attr) - return getattr(self.venv, attr) - - def __set_attr__(self, attr, value): - if attr in self.__dict__: - setattr(self, attr, value) - else: - setattr(self.venv, attr, value) - - def compute_reward(self, achieved_goal, desired_goal, _info): - return float(self.venv.env_method('compute_reward', achieved_goal, desired_goal, _info)[0]) - - @staticmethod - def unvec_obs(obs): - """ - :param obs: (Union[np.ndarray, dict]) - :return: (Union[np.ndarray, dict]) - """ - if not isinstance(obs, dict): - return obs[0] - obs_ = OrderedDict() - for key in obs.keys(): - obs_[key] = obs[key][0] - del obs - return obs_ - - def reset(self): - return self.unvec_obs(self.venv.reset()) - - def step_async(self, actions): - self.venv.step_async([actions]) - - def step_wait(self): - obs, rewards, dones, information = self.venv.step_wait() - return self.unvec_obs(obs), float(rewards[0]), dones[0], information[0] - - def render(self, mode='human'): - return self.venv.render(mode=mode) - - -class SetVerbosity: - def __init__(self, verbose=0): - """ - define a region of code for certain level of verbosity - - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - """ - self.verbose = verbose - - def __enter__(self): - self.tf_level = os.environ.get('TF_CPP_MIN_LOG_LEVEL', '0') - self.log_level = logger.get_level() - self.gym_level = gym.logger.MIN_LEVEL - - if self.verbose <= 1: - os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - - if self.verbose <= 0: - logger.set_level(logger.DISABLED) - gym.logger.set_level(gym.logger.DISABLED) - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.verbose <= 1: - os.environ['TF_CPP_MIN_LOG_LEVEL'] = self.tf_level - - if self.verbose <= 0: - logger.set_level(self.log_level) - gym.logger.set_level(self.gym_level) - - -class TensorboardWriter: - def 
__init__(self, graph, tensorboard_log_path, tb_log_name, new_tb_log=True): - """ - Create a Tensorboard writer for a code segment, and saves it to the log directory as its own run - - :param graph: (Tensorflow Graph) the model graph - :param tensorboard_log_path: (str) the save path for the log (can be None for no logging) - :param tb_log_name: (str) the name of the run for tensorboard log - :param new_tb_log: (bool) whether or not to create a new logging folder for tensorbaord - """ - self.graph = graph - self.tensorboard_log_path = tensorboard_log_path - self.tb_log_name = tb_log_name - self.writer = None - self.new_tb_log = new_tb_log - - def __enter__(self): - if self.tensorboard_log_path is not None: - latest_run_id = self._get_latest_run_id() - if self.new_tb_log: - latest_run_id = latest_run_id + 1 - save_path = os.path.join(self.tensorboard_log_path, "{}_{}".format(self.tb_log_name, latest_run_id)) - self.writer = tf.summary.FileWriter(save_path, graph=self.graph) - return self.writer - - def _get_latest_run_id(self): - """ - returns the latest run number for the given log name and log path, - by finding the greatest number in the directories. 
- - :return: (int) latest run number - """ - max_run_id = 0 - for path in glob.glob("{}/{}_[0-9]*".format(self.tensorboard_log_path, self.tb_log_name)): - file_name = path.split(os.sep)[-1] - ext = file_name.split("_")[-1] - if self.tb_log_name == "_".join(file_name.split("_")[:-1]) and ext.isdigit() and int(ext) > max_run_id: - max_run_id = int(ext) - return max_run_id - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.writer is not None: - self.writer.add_graph(self.graph) - self.writer.flush() diff --git a/stable_baselines/common/bit_flipping_env.py b/stable_baselines/common/bit_flipping_env.py deleted file mode 100644 index 15a055dc..00000000 --- a/stable_baselines/common/bit_flipping_env.py +++ /dev/null @@ -1,116 +0,0 @@ -from collections import OrderedDict - -import numpy as np -from gym import GoalEnv, spaces - - -class BitFlippingEnv(GoalEnv): - """ - Simple bit flipping env, useful to test HER. - The goal is to flip all the bits to get a vector of ones. - In the continuous variant, if the ith action component has a value > 0, - then the ith bit will be flipped. 
- - :param n_bits: (int) Number of bits to flip - :param continuous: (bool) Whether to use the continuous actions version or not, - by default, it uses the discrete one - :param max_steps: (int) Max number of steps, by default, equal to n_bits - :param discrete_obs_space: (bool) Whether to use the discrete observation - version or not, by default, it uses the MultiBinary one - """ - def __init__(self, n_bits=10, continuous=False, max_steps=None, - discrete_obs_space=False): - super(BitFlippingEnv, self).__init__() - # The achieved goal is determined by the current state - # here, it is a special where they are equal - if discrete_obs_space: - # In the discrete case, the agent act on the binary - # representation of the observation - self.observation_space = spaces.Dict({ - 'observation': spaces.Discrete(2 ** n_bits - 1), - 'achieved_goal': spaces.Discrete(2 ** n_bits - 1), - 'desired_goal': spaces.Discrete(2 ** n_bits - 1) - }) - else: - self.observation_space = spaces.Dict({ - 'observation': spaces.MultiBinary(n_bits), - 'achieved_goal': spaces.MultiBinary(n_bits), - 'desired_goal': spaces.MultiBinary(n_bits) - }) - - self.obs_space = spaces.MultiBinary(n_bits) - - if continuous: - self.action_space = spaces.Box(-1, 1, shape=(n_bits,), dtype=np.float32) - else: - self.action_space = spaces.Discrete(n_bits) - self.continuous = continuous - self.discrete_obs_space = discrete_obs_space - self.state = None - self.desired_goal = np.ones((n_bits,)) - if max_steps is None: - max_steps = n_bits - self.max_steps = max_steps - self.current_step = 0 - self.reset() - - def convert_if_needed(self, state): - """ - Convert to discrete space if needed. - - :param state: (np.ndarray) - :return: (np.ndarray or int) - """ - if self.discrete_obs_space: - # The internal state is the binary representation of the - # observed one - return int(sum([state[i] * 2**i for i in range(len(state))])) - return state - - def _get_obs(self): - """ - Helper to create the observation. 
- - :return: (OrderedDict) - """ - return OrderedDict([ - ('observation', self.convert_if_needed(self.state.copy())), - ('achieved_goal', self.convert_if_needed(self.state.copy())), - ('desired_goal', self.convert_if_needed(self.desired_goal.copy())) - ]) - - def reset(self): - self.current_step = 0 - self.state = self.obs_space.sample() - return self._get_obs() - - def step(self, action): - if self.continuous: - self.state[action > 0] = 1 - self.state[action > 0] - else: - self.state[action] = 1 - self.state[action] - obs = self._get_obs() - reward = self.compute_reward(obs['achieved_goal'], obs['desired_goal'], None) - done = reward == 0 - self.current_step += 1 - # Episode terminate when we reached the goal or the max number of steps - info = {'is_success': done} - done = done or self.current_step >= self.max_steps - return obs, reward, done, info - - def compute_reward(self, - achieved_goal: np.ndarray, - desired_goal: np.ndarray, - _info) -> float: - # Deceptive reward: it is positive only when the goal is achieved - if self.discrete_obs_space: - return 0.0 if achieved_goal == desired_goal else -1.0 - return 0.0 if (achieved_goal == desired_goal).all() else -1.0 - - def render(self, mode='human'): - if mode == 'rgb_array': - return self.state.copy() - print(self.state) - - def close(self): - pass diff --git a/stable_baselines/common/buffers.py b/stable_baselines/common/buffers.py deleted file mode 100644 index ec9eb2ba..00000000 --- a/stable_baselines/common/buffers.py +++ /dev/null @@ -1,264 +0,0 @@ -import random -from typing import Optional, List, Union - -import numpy as np - -from stable_baselines.common.segment_tree import SumSegmentTree, MinSegmentTree -from stable_baselines.common.vec_env import VecNormalize - - -class ReplayBuffer(object): - def __init__(self, size: int): - """ - Implements a ring buffer (FIFO). - - :param size: (int) Max number of transitions to store in the buffer. When the buffer overflows the old - memories are dropped. 
- """ - self._storage = [] - self._maxsize = size - self._next_idx = 0 - - def __len__(self) -> int: - return len(self._storage) - - @property - def storage(self): - """[(Union[np.ndarray, int], Union[np.ndarray, int], float, Union[np.ndarray, int], bool)]: content of the replay buffer""" - return self._storage - - @property - def buffer_size(self) -> int: - """float: Max capacity of the buffer""" - return self._maxsize - - def can_sample(self, n_samples: int) -> bool: - """ - Check if n_samples samples can be sampled - from the buffer. - - :param n_samples: (int) - :return: (bool) - """ - return len(self) >= n_samples - - def is_full(self) -> int: - """ - Check whether the replay buffer is full or not. - - :return: (bool) - """ - return len(self) == self.buffer_size - - def add(self, obs_t, action, reward, obs_tp1, done): - """ - add a new transition to the buffer - - :param obs_t: (Union[np.ndarray, int]) the last observation - :param action: (Union[np.ndarray, int]) the action - :param reward: (float) the reward of the transition - :param obs_tp1: (Union[np.ndarray, int]) the current observation - :param done: (bool) is the episode done - """ - data = (obs_t, action, reward, obs_tp1, done) - - if self._next_idx >= len(self._storage): - self._storage.append(data) - else: - self._storage[self._next_idx] = data - self._next_idx = (self._next_idx + 1) % self._maxsize - - def extend(self, obs_t, action, reward, obs_tp1, done): - """ - add a new batch of transitions to the buffer - - :param obs_t: (Union[Tuple[Union[np.ndarray, int]], np.ndarray]) the last batch of observations - :param action: (Union[Tuple[Union[np.ndarray, int]]], np.ndarray]) the batch of actions - :param reward: (Union[Tuple[float], np.ndarray]) the batch of the rewards of the transition - :param obs_tp1: (Union[Tuple[Union[np.ndarray, int]], np.ndarray]) the current batch of observations - :param done: (Union[Tuple[bool], np.ndarray]) terminal status of the batch - - Note: uses the same names as 
.add to keep compatibility with named argument passing - but expects iterables and arrays with more than 1 dimensions - """ - for data in zip(obs_t, action, reward, obs_tp1, done): - if self._next_idx >= len(self._storage): - self._storage.append(data) - else: - self._storage[self._next_idx] = data - self._next_idx = (self._next_idx + 1) % self._maxsize - - @staticmethod - def _normalize_obs(obs: np.ndarray, - env: Optional[VecNormalize] = None) -> np.ndarray: - """ - Helper for normalizing the observation. - """ - if env is not None: - return env.normalize_obs(obs) - return obs - - @staticmethod - def _normalize_reward(reward: np.ndarray, - env: Optional[VecNormalize] = None) -> np.ndarray: - """ - Helper for normalizing the reward. - """ - if env is not None: - return env.normalize_reward(reward) - return reward - - def _encode_sample(self, idxes: Union[List[int], np.ndarray], env: Optional[VecNormalize] = None): - obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] - for i in idxes: - data = self._storage[i] - obs_t, action, reward, obs_tp1, done = data - obses_t.append(np.array(obs_t, copy=False)) - actions.append(np.array(action, copy=False)) - rewards.append(reward) - obses_tp1.append(np.array(obs_tp1, copy=False)) - dones.append(done) - return (self._normalize_obs(np.array(obses_t), env), - np.array(actions), - self._normalize_reward(np.array(rewards), env), - self._normalize_obs(np.array(obses_tp1), env), - np.array(dones)) - - def sample(self, batch_size: int, env: Optional[VecNormalize] = None, **_kwargs): - """ - Sample a batch of experiences. - - :param batch_size: (int) How many transitions to sample. 
- :param env: (Optional[VecNormalize]) associated gym VecEnv - to normalize the observations/rewards when sampling - :return: - - obs_batch: (np.ndarray) batch of observations - - act_batch: (numpy float) batch of actions executed given obs_batch - - rew_batch: (numpy float) rewards received as results of executing act_batch - - next_obs_batch: (np.ndarray) next set of observations seen after executing act_batch - - done_mask: (numpy bool) done_mask[i] = 1 if executing act_batch[i] resulted in the end of an episode - and 0 otherwise. - """ - idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] - return self._encode_sample(idxes, env=env) - - -class PrioritizedReplayBuffer(ReplayBuffer): - def __init__(self, size, alpha): - """ - Create Prioritized Replay buffer. - - See Also ReplayBuffer.__init__ - - :param size: (int) Max number of transitions to store in the buffer. When the buffer overflows the old memories - are dropped. - :param alpha: (float) how much prioritization is used (0 - no prioritization, 1 - full prioritization) - """ - super(PrioritizedReplayBuffer, self).__init__(size) - assert alpha >= 0 - self._alpha = alpha - - it_capacity = 1 - while it_capacity < size: - it_capacity *= 2 - - self._it_sum = SumSegmentTree(it_capacity) - self._it_min = MinSegmentTree(it_capacity) - self._max_priority = 1.0 - - def add(self, obs_t, action, reward, obs_tp1, done): - """ - add a new transition to the buffer - - :param obs_t: (Any) the last observation - :param action: ([float]) the action - :param reward: (float) the reward of the transition - :param obs_tp1: (Any) the current observation - :param done: (bool) is the episode done - """ - idx = self._next_idx - super().add(obs_t, action, reward, obs_tp1, done) - self._it_sum[idx] = self._max_priority ** self._alpha - self._it_min[idx] = self._max_priority ** self._alpha - - def extend(self, obs_t, action, reward, obs_tp1, done): - """ - add a new batch of transitions to the buffer - - 
:param obs_t: (Union[Tuple[Union[np.ndarray, int]], np.ndarray]) the last batch of observations - :param action: (Union[Tuple[Union[np.ndarray, int]]], np.ndarray]) the batch of actions - :param reward: (Union[Tuple[float], np.ndarray]) the batch of the rewards of the transition - :param obs_tp1: (Union[Tuple[Union[np.ndarray, int]], np.ndarray]) the current batch of observations - :param done: (Union[Tuple[bool], np.ndarray]) terminal status of the batch - - Note: uses the same names as .add to keep compatibility with named argument passing - but expects iterables and arrays with more than 1 dimensions - """ - idx = self._next_idx - super().extend(obs_t, action, reward, obs_tp1, done) - while idx != self._next_idx: - self._it_sum[idx] = self._max_priority ** self._alpha - self._it_min[idx] = self._max_priority ** self._alpha - idx = (idx + 1) % self._maxsize - - def _sample_proportional(self, batch_size): - mass = [] - total = self._it_sum.sum(0, len(self._storage) - 1) - # TODO(szymon): should we ensure no repeats? - mass = np.random.random(size=batch_size) * total - idx = self._it_sum.find_prefixsum_idx(mass) - return idx - - def sample(self, batch_size: int, beta: float = 0, env: Optional[VecNormalize] = None): - """ - Sample a batch of experiences. - - compared to ReplayBuffer.sample - it also returns importance weights and idxes - of sampled experiences. - - :param batch_size: (int) How many transitions to sample. 
- :param beta: (float) To what degree to use importance weights (0 - no corrections, 1 - full correction) - :param env: (Optional[VecNormalize]) associated gym VecEnv - to normalize the observations/rewards when sampling - :return: - - obs_batch: (np.ndarray) batch of observations - - act_batch: (numpy float) batch of actions executed given obs_batch - - rew_batch: (numpy float) rewards received as results of executing act_batch - - next_obs_batch: (np.ndarray) next set of observations seen after executing act_batch - - done_mask: (numpy bool) done_mask[i] = 1 if executing act_batch[i] resulted in the end of an episode - and 0 otherwise. - - weights: (numpy float) Array of shape (batch_size,) and dtype np.float32 denoting importance weight of - each sampled transition - - idxes: (numpy int) Array of shape (batch_size,) and dtype np.int32 idexes in buffer of sampled experiences - """ - assert beta > 0 - - idxes = self._sample_proportional(batch_size) - weights = [] - p_min = self._it_min.min() / self._it_sum.sum() - max_weight = (p_min * len(self._storage)) ** (-beta) - p_sample = self._it_sum[idxes] / self._it_sum.sum() - weights = (p_sample * len(self._storage)) ** (-beta) / max_weight - encoded_sample = self._encode_sample(idxes, env=env) - return tuple(list(encoded_sample) + [weights, idxes]) - - def update_priorities(self, idxes, priorities): - """ - Update priorities of sampled transitions. - - sets priority of transition at index idxes[i] in buffer - to priorities[i]. - - :param idxes: ([int]) List of idxes of sampled transitions - :param priorities: ([float]) List of updated priorities corresponding to transitions at the sampled idxes - denoted by variable `idxes`. 
- """ - assert len(idxes) == len(priorities) - assert np.min(priorities) > 0 - assert np.min(idxes) >= 0 - assert np.max(idxes) < len(self.storage) - self._it_sum[idxes] = priorities ** self._alpha - self._it_min[idxes] = priorities ** self._alpha - - self._max_priority = max(self._max_priority, np.max(priorities)) diff --git a/stable_baselines/common/callbacks.py b/stable_baselines/common/callbacks.py deleted file mode 100644 index 81f62ed5..00000000 --- a/stable_baselines/common/callbacks.py +++ /dev/null @@ -1,380 +0,0 @@ -import os -from abc import ABC -import warnings -import typing -from typing import Union, List, Dict, Any, Optional - -import gym -import numpy as np - -from stable_baselines.common.vec_env import VecEnv, sync_envs_normalization, DummyVecEnv -from stable_baselines.common.evaluation import evaluate_policy -from stable_baselines import logger - -if typing.TYPE_CHECKING: - from stable_baselines.common.base_class import BaseRLModel # pytype: disable=pyi-error - - -class BaseCallback(ABC): - """ - Base class for callback. 
- - :param verbose: (int) - """ - def __init__(self, verbose: int = 0): - super(BaseCallback, self).__init__() - # The RL model - self.model = None # type: Optional[BaseRLModel] - # An alias for self.model.get_env(), the environment used for training - self.training_env = None # type: Union[gym.Env, VecEnv, None] - # Number of time the callback was called - self.n_calls = 0 # type: int - # n_envs * n times env.step() was called - self.num_timesteps = 0 # type: int - self.verbose = verbose - self.locals = None # type: Optional[Dict[str, Any]] - self.globals = None # type: Optional[Dict[str, Any]] - self.logger = None # type: Optional[logger.Logger] - # Sometimes, for event callback, it is useful - # to have access to the parent object - self.parent = None # type: Optional[BaseCallback] - - # Type hint as string to avoid circular import - def init_callback(self, model: 'BaseRLModel') -> None: - """ - Initialize the callback by saving references to the - RL model and the training environment for convenience. 
- """ - self.model = model - self.training_env = model.get_env() - self.logger = logger.Logger.CURRENT - self._init_callback() - - def update_locals(self, locals_: Dict[str, Any]) -> None: - """ - Updates the local variables of the training process - - For reference to which variables are accessible, - check each individual algorithm's documentation - :param `locals_`: (Dict[str, Any]) current local variables - """ - self.locals.update(locals_) - - def _init_callback(self) -> None: - pass - - def on_training_start(self, locals_: Dict[str, Any], globals_: Dict[str, Any]) -> None: - # Those are reference and will be updated automatically - self.locals = locals_ - self.globals = globals_ - self._on_training_start() - - def _on_training_start(self) -> None: - pass - - def on_rollout_start(self) -> None: - self._on_rollout_start() - - def _on_rollout_start(self) -> None: - pass - - def _on_step(self) -> bool: - """ - :return: (bool) If the callback returns False, training is aborted early. - """ - return True - - def on_step(self) -> bool: - """ - This method will be called by the model after each call to `env.step()`. - - For child callback (of an `EventCallback`), this will be called - when the event is triggered. - - :return: (bool) If the callback returns False, training is aborted early. - """ - self.n_calls += 1 - self.num_timesteps = self.model.num_timesteps - - return self._on_step() - - def on_training_end(self) -> None: - self._on_training_end() - - def _on_training_end(self) -> None: - pass - - def on_rollout_end(self) -> None: - self._on_rollout_end() - - def _on_rollout_end(self) -> None: - pass - - -class EventCallback(BaseCallback): - """ - Base class for triggering callback on event. - - :param callback: (Optional[BaseCallback]) Callback that will be called - when an event is triggered. 
- :param verbose: (int) - """ - def __init__(self, callback: Optional[BaseCallback] = None, verbose: int = 0): - super(EventCallback, self).__init__(verbose=verbose) - self.callback = callback - # Give access to the parent - if callback is not None: - self.callback.parent = self - - def init_callback(self, model: 'BaseRLModel') -> None: - super(EventCallback, self).init_callback(model) - if self.callback is not None: - self.callback.init_callback(self.model) - - def _on_training_start(self) -> None: - if self.callback is not None: - self.callback.on_training_start(self.locals, self.globals) - - def _on_event(self) -> bool: - if self.callback is not None: - return self.callback.on_step() - return True - - def _on_step(self) -> bool: - return True - - -class CallbackList(BaseCallback): - """ - Class for chaining callbacks. - - :param callbacks: (List[BaseCallback]) A list of callbacks that will be called - sequentially. - """ - def __init__(self, callbacks: List[BaseCallback]): - super(CallbackList, self).__init__() - assert isinstance(callbacks, list) - self.callbacks = callbacks - - def _init_callback(self) -> None: - for callback in self.callbacks: - callback.init_callback(self.model) - - def _on_training_start(self) -> None: - for callback in self.callbacks: - callback.on_training_start(self.locals, self.globals) - - def _on_rollout_start(self) -> None: - for callback in self.callbacks: - callback.on_rollout_start() - - def _on_step(self) -> bool: - continue_training = True - for callback in self.callbacks: - # Return False (stop training) if at least one callback returns False - continue_training = callback.on_step() and continue_training - return continue_training - - def _on_rollout_end(self) -> None: - for callback in self.callbacks: - callback.on_rollout_end() - - def _on_training_end(self) -> None: - for callback in self.callbacks: - callback.on_training_end() - - -class CheckpointCallback(BaseCallback): - """ - Callback for saving a model every `save_freq` 
steps - - :param save_freq: (int) - :param save_path: (str) Path to the folder where the model will be saved. - :param name_prefix: (str) Common prefix to the saved models - """ - def __init__(self, save_freq: int, save_path: str, name_prefix='rl_model', verbose=0): - super(CheckpointCallback, self).__init__(verbose) - self.save_freq = save_freq - self.save_path = save_path - self.name_prefix = name_prefix - - def _init_callback(self) -> None: - # Create folder if needed - if self.save_path is not None: - os.makedirs(self.save_path, exist_ok=True) - - def _on_step(self) -> bool: - if self.n_calls % self.save_freq == 0: - path = os.path.join(self.save_path, '{}_{}_steps'.format(self.name_prefix, self.num_timesteps)) - self.model.save(path) - if self.verbose > 1: - print("Saving model checkpoint to {}".format(path)) - return True - - -class ConvertCallback(BaseCallback): - """ - Convert functional callback (old-style) to object. - - :param callback: (Callable) - :param verbose: (int) - """ - def __init__(self, callback, verbose=0): - super(ConvertCallback, self).__init__(verbose) - self.callback = callback - - def _on_step(self) -> bool: - if self.callback is not None: - return self.callback(self.locals, self.globals) - return True - - -class EvalCallback(EventCallback): - """ - Callback for evaluating an agent. - - :param eval_env: (Union[gym.Env, VecEnv]) The environment used for initialization - :param callback_on_new_best: (Optional[BaseCallback]) Callback to trigger - when there is a new best model according to the `mean_reward` - :param n_eval_episodes: (int) The number of episodes to test the agent - :param eval_freq: (int) Evaluate the agent every eval_freq call of the callback. - :param log_path: (str) Path to a folder where the evaluations (`evaluations.npz`) - will be saved. It will be updated at each evaluation. - :param best_model_save_path: (str) Path to a folder where the best model - according to performance on the eval env will be saved. 
- :param deterministic: (bool) Whether the evaluation should - use a stochastic or deterministic actions. - :param render: (bool) Whether to render or not the environment during evaluation - :param verbose: (int) - """ - def __init__(self, eval_env: Union[gym.Env, VecEnv], - callback_on_new_best: Optional[BaseCallback] = None, - n_eval_episodes: int = 5, - eval_freq: int = 10000, - log_path: Optional[str] = None, - best_model_save_path: Optional[str] = None, - deterministic: bool = True, - render: bool = False, - verbose: int = 1): - super(EvalCallback, self).__init__(callback_on_new_best, verbose=verbose) - self.n_eval_episodes = n_eval_episodes - self.eval_freq = eval_freq - self.best_mean_reward = -np.inf - self.last_mean_reward = -np.inf - self.deterministic = deterministic - self.render = render - - # Convert to VecEnv for consistency - if not isinstance(eval_env, VecEnv): - eval_env = DummyVecEnv([lambda: eval_env]) - - assert eval_env.num_envs == 1, "You must pass only one environment for evaluation" - - self.eval_env = eval_env - self.best_model_save_path = best_model_save_path - # Logs will be written in `evaluations.npz` - if log_path is not None: - log_path = os.path.join(log_path, 'evaluations') - self.log_path = log_path - self.evaluations_results = [] - self.evaluations_timesteps = [] - self.evaluations_length = [] - - def _init_callback(self): - # Does not work in some corner cases, where the wrapper is not the same - if not type(self.training_env) is type(self.eval_env): - warnings.warn("Training and eval env are not of the same type" - "{} != {}".format(self.training_env, self.eval_env)) - - # Create folders if needed - if self.best_model_save_path is not None: - os.makedirs(self.best_model_save_path, exist_ok=True) - if self.log_path is not None: - os.makedirs(os.path.dirname(self.log_path), exist_ok=True) - - def _on_step(self) -> bool: - - if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0: - # Sync training and eval env if there is 
VecNormalize - sync_envs_normalization(self.training_env, self.eval_env) - - episode_rewards, episode_lengths = evaluate_policy(self.model, self.eval_env, - n_eval_episodes=self.n_eval_episodes, - render=self.render, - deterministic=self.deterministic, - return_episode_rewards=True) - - if self.log_path is not None: - self.evaluations_timesteps.append(self.num_timesteps) - self.evaluations_results.append(episode_rewards) - self.evaluations_length.append(episode_lengths) - np.savez(self.log_path, timesteps=self.evaluations_timesteps, - results=self.evaluations_results, ep_lengths=self.evaluations_length) - - mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards) - mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths) - # Keep track of the last evaluation, useful for classes that derive from this callback - self.last_mean_reward = mean_reward - - if self.verbose > 0: - print("Eval num_timesteps={}, " - "episode_reward={:.2f} +/- {:.2f}".format(self.num_timesteps, mean_reward, std_reward)) - print("Episode length: {:.2f} +/- {:.2f}".format(mean_ep_length, std_ep_length)) - - if mean_reward > self.best_mean_reward: - if self.verbose > 0: - print("New best mean reward!") - if self.best_model_save_path is not None: - self.model.save(os.path.join(self.best_model_save_path, 'best_model')) - self.best_mean_reward = mean_reward - # Trigger callback if needed - if self.callback is not None: - return self._on_event() - - return True - - -class StopTrainingOnRewardThreshold(BaseCallback): - """ - Stop the training once a threshold in episodic reward - has been reached (i.e. when the model is good enough). - - It must be used with the `EvalCallback`. - - :param reward_threshold: (float) Minimum expected reward per episode - to stop training. 
- :param verbose: (int) - """ - def __init__(self, reward_threshold: float, verbose: int = 0): - super(StopTrainingOnRewardThreshold, self).__init__(verbose=verbose) - self.reward_threshold = reward_threshold - - def _on_step(self) -> bool: - assert self.parent is not None, ("`StopTrainingOnRewardThreshold` callback must be used " - "with an `EvalCallback`") - # Convert np.bool to bool, otherwise callback.on_step() is False won't work - continue_training = bool(self.parent.best_mean_reward < self.reward_threshold) - if self.verbose > 0 and not continue_training: - print("Stopping training because the mean reward {:.2f} " - " is above the threshold {}".format(self.parent.best_mean_reward, self.reward_threshold)) - return continue_training - - -class EveryNTimesteps(EventCallback): - """ - Trigger a callback every `n_steps` timesteps - - :param n_steps: (int) Number of timesteps between two trigger. - :param callback: (BaseCallback) Callback that will be called - when the event is triggered. 
- """ - def __init__(self, n_steps: int, callback: BaseCallback): - super(EveryNTimesteps, self).__init__(callback) - self.n_steps = n_steps - self.last_time_trigger = 0 - - def _on_step(self) -> bool: - if (self.num_timesteps - self.last_time_trigger) >= self.n_steps: - self.last_time_trigger = self.num_timesteps - return self._on_event() - return True diff --git a/stable_baselines/common/cg.py b/stable_baselines/common/cg.py deleted file mode 100644 index 15c0f952..00000000 --- a/stable_baselines/common/cg.py +++ /dev/null @@ -1,49 +0,0 @@ -import numpy as np - - -def conjugate_gradient(f_ax, b_vec, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): - """ - conjugate gradient calculation (Ax = b), bases on - https://epubs.siam.org/doi/book/10.1137/1.9781611971446 Demmel p 312 - - :param f_ax: (function) The function describing the Matrix A dot the vector x - (x being the input parameter of the function) - :param b_vec: (numpy float) vector b, where Ax = b - :param cg_iters: (int) the maximum number of iterations for converging - :param callback: (function) callback the values of x while converging - :param verbose: (bool) print extra information - :param residual_tol: (float) the break point if the residual is below this value - :return: (numpy float) vector x, where Ax = b - """ - first_basis_vect = b_vec.copy() # the first basis vector - residual = b_vec.copy() # the residual - x_var = np.zeros_like(b_vec) # vector x, where Ax = b - residual_dot_residual = residual.dot(residual) # L2 norm of the residual - - fmt_str = "%10i %10.3g %10.3g" - title_str = "%10s %10s %10s" - if verbose: - print(title_str % ("iter", "residual norm", "soln norm")) - - for i in range(cg_iters): - if callback is not None: - callback(x_var) - if verbose: - print(fmt_str % (i, residual_dot_residual, np.linalg.norm(x_var))) - z_var = f_ax(first_basis_vect) - v_var = residual_dot_residual / first_basis_vect.dot(z_var) - x_var += v_var * first_basis_vect - residual -= v_var * 
z_var - new_residual_dot_residual = residual.dot(residual) - mu_val = new_residual_dot_residual / residual_dot_residual - first_basis_vect = residual + mu_val * first_basis_vect - - residual_dot_residual = new_residual_dot_residual - if residual_dot_residual < residual_tol: - break - - if callback is not None: - callback(x_var) - if verbose: - print(fmt_str % (i + 1, residual_dot_residual, np.linalg.norm(x_var))) - return x_var diff --git a/stable_baselines/common/cmd_util.py b/stable_baselines/common/cmd_util.py deleted file mode 100644 index c5ff6633..00000000 --- a/stable_baselines/common/cmd_util.py +++ /dev/null @@ -1,204 +0,0 @@ -""" -Helpers for scripts like run_atari.py. -""" - -import os -import warnings - -import gym - -from stable_baselines import logger -from stable_baselines.bench import Monitor -from stable_baselines.common.misc_util import set_global_seeds -from stable_baselines.common.atari_wrappers import make_atari, wrap_deepmind -from stable_baselines.common.misc_util import mpi_rank_or_zero -from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv - - -def make_vec_env(env_id, n_envs=1, seed=None, start_index=0, - monitor_dir=None, wrapper_class=None, - env_kwargs=None, vec_env_cls=None, vec_env_kwargs=None): - """ - Create a wrapped, monitored `VecEnv`. - By default it uses a `DummyVecEnv` which is usually faster - than a `SubprocVecEnv`. - - :param env_id: (str or Type[gym.Env]) the environment ID or the environment class - :param n_envs: (int) the number of environments you wish to have in parallel - :param seed: (int) the initial seed for the random number generator - :param start_index: (int) start rank index - :param monitor_dir: (str) Path to a folder where the monitor files will be saved. - If None, no file will be written, however, the env will still be wrapped - in a Monitor wrapper to provide additional information about training. 
- :param wrapper_class: (gym.Wrapper or callable) Additional wrapper to use on the environment. - This can also be a function with single argument that wraps the environment in many things. - :param env_kwargs: (dict) Optional keyword argument to pass to the env constructor - :param vec_env_cls: (Type[VecEnv]) A custom `VecEnv` class constructor. Default: None. - :param vec_env_kwargs: (dict) Keyword arguments to pass to the `VecEnv` class constructor. - :return: (VecEnv) The wrapped environment - """ - env_kwargs = {} if env_kwargs is None else env_kwargs - vec_env_kwargs = {} if vec_env_kwargs is None else vec_env_kwargs - - def make_env(rank): - def _init(): - if isinstance(env_id, str): - env = gym.make(env_id) - if len(env_kwargs) > 0: - warnings.warn("No environment class was passed (only an env ID) so `env_kwargs` will be ignored") - else: - env = env_id(**env_kwargs) - if seed is not None: - env.seed(seed + rank) - env.action_space.seed(seed + rank) - # Wrap the env in a Monitor wrapper - # to have additional training information - monitor_path = os.path.join(monitor_dir, str(rank)) if monitor_dir is not None else None - # Create the monitor folder if needed - if monitor_path is not None: - os.makedirs(monitor_dir, exist_ok=True) - env = Monitor(env, filename=monitor_path) - # Optionally, wrap the environment with the provided wrapper - if wrapper_class is not None: - env = wrapper_class(env) - return env - return _init - - # No custom VecEnv is passed - if vec_env_cls is None: - # Default: use a DummyVecEnv - vec_env_cls = DummyVecEnv - - return vec_env_cls([make_env(i + start_index) for i in range(n_envs)], **vec_env_kwargs) - - -def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, - start_index=0, allow_early_resets=True, - start_method=None, use_subprocess=False): - """ - Create a wrapped, monitored VecEnv for Atari. 
- - :param env_id: (str) the environment ID - :param num_env: (int) the number of environment you wish to have in subprocesses - :param seed: (int) the initial seed for RNG - :param wrapper_kwargs: (dict) the parameters for wrap_deepmind function - :param start_index: (int) start rank index - :param allow_early_resets: (bool) allows early reset of the environment - :param start_method: (str) method used to start the subprocesses. - See SubprocVecEnv doc for more information - :param use_subprocess: (bool) Whether to use `SubprocVecEnv` or `DummyVecEnv` when - `num_env` > 1, `DummyVecEnv` is usually faster. Default: False - :return: (VecEnv) The atari environment - """ - if wrapper_kwargs is None: - wrapper_kwargs = {} - - def make_env(rank): - def _thunk(): - env = make_atari(env_id) - env.seed(seed + rank) - env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), - allow_early_resets=allow_early_resets) - return wrap_deepmind(env, **wrapper_kwargs) - return _thunk - set_global_seeds(seed) - - # When using one environment, no need to start subprocesses - if num_env == 1 or not use_subprocess: - return DummyVecEnv([make_env(i + start_index) for i in range(num_env)]) - - return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)], - start_method=start_method) - - -def make_mujoco_env(env_id, seed, allow_early_resets=True): - """ - Create a wrapped, monitored gym.Env for MuJoCo. - - :param env_id: (str) the environment ID - :param seed: (int) the initial seed for RNG - :param allow_early_resets: (bool) allows early reset of the environment - :return: (Gym Environment) The mujoco environment - """ - set_global_seeds(seed + 10000 * mpi_rank_or_zero()) - env = gym.make(env_id) - env = Monitor(env, os.path.join(logger.get_dir(), '0'), allow_early_resets=allow_early_resets) - env.seed(seed) - return env - - -def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True): - """ - Create a wrapped, monitored gym.Env for MuJoCo. 
- - :param env_id: (str) the environment ID - :param seed: (int) the initial seed for RNG - :param rank: (int) the rank of the environment (for logging) - :param allow_early_resets: (bool) allows early reset of the environment - :return: (Gym Environment) The robotic environment - """ - set_global_seeds(seed) - env = gym.make(env_id) - keys = ['observation', 'desired_goal'] - # TODO: remove try-except once most users are running modern Gym - try: # for modern Gym (>=0.15.4) - from gym.wrappers import FilterObservation, FlattenObservation - env = FlattenObservation(FilterObservation(env, keys)) - except ImportError: # for older gym (<=0.15.3) - from gym.wrappers import FlattenDictWrapper # pytype:disable=import-error - env = FlattenDictWrapper(env, keys) - env = Monitor( - env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), - info_keywords=('is_success',), allow_early_resets=allow_early_resets) - env.seed(seed) - return env - - -def arg_parser(): - """ - Create an empty argparse.ArgumentParser. - - :return: (ArgumentParser) - """ - import argparse - return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - -def atari_arg_parser(): - """ - Create an argparse.ArgumentParser for run_atari.py. - - :return: (ArgumentParser) parser {'--env': 'BreakoutNoFrameskip-v4', '--seed': 0, '--num-timesteps': int(1e7)} - """ - parser = arg_parser() - parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') - parser.add_argument('--seed', help='RNG seed', type=int, default=0) - parser.add_argument('--num-timesteps', type=int, default=int(1e7)) - return parser - - -def mujoco_arg_parser(): - """ - Create an argparse.ArgumentParser for run_mujoco.py. 
- - :return: (ArgumentParser) parser {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False} - """ - parser = arg_parser() - parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') - parser.add_argument('--seed', help='RNG seed', type=int, default=0) - parser.add_argument('--num-timesteps', type=int, default=int(1e6)) - parser.add_argument('--play', default=False, action='store_true') - return parser - - -def robotics_arg_parser(): - """ - Create an argparse.ArgumentParser for run_mujoco.py. - - :return: (ArgumentParser) parser {'--env': 'FetchReach-v0', '--seed': 0, '--num-timesteps': int(1e6)} - """ - parser = arg_parser() - parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0') - parser.add_argument('--seed', help='RNG seed', type=int, default=0) - parser.add_argument('--num-timesteps', type=int, default=int(1e6)) - return parser diff --git a/stable_baselines/common/console_util.py b/stable_baselines/common/console_util.py deleted file mode 100644 index c8b4c94c..00000000 --- a/stable_baselines/common/console_util.py +++ /dev/null @@ -1,78 +0,0 @@ -from __future__ import print_function - -import numpy as np - - -# ================================================================ -# Misc -# ================================================================ - - -def fmt_row(width, row, header=False): - """ - fits a list of items to at least a certain length - - :param width: (int) the minimum width of the string - :param row: ([Any]) a list of object you wish to get the string representation - :param header: (bool) whether or not to return the string as a header - :return: (str) the string representation of all the elements in 'row', of length >= 'width' - """ - out = " | ".join(fmt_item(x, width) for x in row) - if header: - out = out + "\n" + "-" * len(out) - return out - - -def fmt_item(item, min_width): - """ - fits items to a given string length - - :param item: (Any) the item 
you wish to get the string representation - :param min_width: (int) the minimum width of the string - :return: (str) the string representation of 'x' of length >= 'l' - """ - if isinstance(item, np.ndarray): - assert item.ndim == 0 - item = item.item() - if isinstance(item, (float, np.float32, np.float64)): - value = abs(item) - if (value < 1e-4 or value > 1e+4) and value > 0: - rep = "%7.2e" % item - else: - rep = "%7.5f" % item - else: - rep = str(item) - return " " * (min_width - len(rep)) + rep - - -COLOR_TO_NUM = dict( - gray=30, - red=31, - green=32, - yellow=33, - blue=34, - magenta=35, - cyan=36, - white=37, - crimson=38 -) - - -def colorize(string, color, bold=False, highlight=False): - """ - Colorize, bold and/or highlight a string for terminal print - - :param string: (str) input string - :param color: (str) the color, the lookup table is the dict at console_util.color2num - :param bold: (bool) if the string should be bold or not - :param highlight: (bool) if the string should be highlighted or not - :return: (str) the stylized output string - """ - attr = [] - num = COLOR_TO_NUM[color] - if highlight: - num += 10 - attr.append(str(num)) - if bold: - attr.append('1') - return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) diff --git a/stable_baselines/common/dataset.py b/stable_baselines/common/dataset.py deleted file mode 100644 index b1bb5f79..00000000 --- a/stable_baselines/common/dataset.py +++ /dev/null @@ -1,101 +0,0 @@ -import numpy as np - - -class Dataset(object): - def __init__(self, data_map, shuffle=True): - """ - Data loader that handles batches and shuffling. 
- WARNING: this will alter the given data_map ordering, as dicts are mutable - - :param data_map: (dict) the input data, where every column is a key - :param shuffle: (bool) Whether to shuffle or not the dataset - Important: this should be disabled for recurrent policies - """ - self.data_map = data_map - self.shuffle = shuffle - self.n_samples = next(iter(data_map.values())).shape[0] - self._next_id = 0 - if self.shuffle: - self.shuffle_dataset() - - def shuffle_dataset(self): - """ - Shuffles the data_map - """ - perm = np.arange(self.n_samples) - np.random.shuffle(perm) - - for key in self.data_map: - self.data_map[key] = self.data_map[key][perm] - - def next_batch(self, batch_size): - """ - returns a batch of data of a given size - - :param batch_size: (int) the size of the batch - :return: (dict) a batch of the input data of size 'batch_size' - """ - if self._next_id >= self.n_samples: - self._next_id = 0 - if self.shuffle: - self.shuffle_dataset() - - cur_id = self._next_id - cur_batch_size = min(batch_size, self.n_samples - self._next_id) - self._next_id += cur_batch_size - - data_map = dict() - for key in self.data_map: - data_map[key] = self.data_map[key][cur_id:cur_id + cur_batch_size] - return data_map - - def iterate_once(self, batch_size): - """ - generator that iterates over the dataset - - :param batch_size: (int) the size of the batch - :return: (dict) a batch of the input data of size 'batch_size' - """ - if self.shuffle: - self.shuffle_dataset() - - while self._next_id <= self.n_samples - batch_size: - yield self.next_batch(batch_size) - self._next_id = 0 - - def subset(self, num_elements, shuffle=True): - """ - Return a subset of the current dataset - - :param num_elements: (int) the number of element you wish to have in the subset - :param shuffle: (bool) Whether to shuffle or not the dataset - :return: (Dataset) a new subset of the current Dataset object - """ - data_map = dict() - for key in self.data_map: - data_map[key] = 
self.data_map[key][:num_elements] - return Dataset(data_map, shuffle) - - -def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): - """ - Iterates over arrays in batches, must provide either num_batches or batch_size, the other must be None. - - :param arrays: (tuple) a tuple of arrays - :param num_batches: (int) the number of batches, must be None is batch_size is defined - :param batch_size: (int) the size of the batch, must be None is num_batches is defined - :param shuffle: (bool) enable auto shuffle - :param include_final_partial_batch: (bool) add the last batch if not the same size as the batch_size - :return: (tuples) a tuple of a batch of the arrays - """ - assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' - arrays = tuple(map(np.asarray, arrays)) - n_samples = arrays[0].shape[0] - assert all(a.shape[0] == n_samples for a in arrays[1:]) - inds = np.arange(n_samples) - if shuffle: - np.random.shuffle(inds) - sections = np.arange(0, n_samples, batch_size)[1:] if num_batches is None else num_batches - for batch_inds in np.array_split(inds, sections): - if include_final_partial_batch or len(batch_inds) == batch_size: - yield tuple(a[batch_inds] for a in arrays) diff --git a/stable_baselines/common/distributions.py b/stable_baselines/common/distributions.py deleted file mode 100644 index f38dd657..00000000 --- a/stable_baselines/common/distributions.py +++ /dev/null @@ -1,513 +0,0 @@ -import numpy as np -import tensorflow as tf -from tensorflow.python.ops import math_ops -from gym import spaces - -from stable_baselines.common.tf_layers import linear - - -class ProbabilityDistribution(object): - """ - Base class for describing a probability distribution. 
- """ - def __init__(self): - super(ProbabilityDistribution, self).__init__() - - def flatparam(self): - """ - Return the direct probabilities - - :return: ([float]) the probabilities - """ - raise NotImplementedError - - def mode(self): - """ - Returns the probability - - :return: (Tensorflow Tensor) the deterministic action - """ - raise NotImplementedError - - def neglogp(self, x): - """ - returns the of the negative log likelihood - - :param x: (str) the labels of each index - :return: ([float]) The negative log likelihood of the distribution - """ - # Usually it's easier to define the negative logprob - raise NotImplementedError - - def kl(self, other): - """ - Calculates the Kullback-Leibler divergence from the given probability distribution - - :param other: ([float]) the distribution to compare with - :return: (float) the KL divergence of the two distributions - """ - raise NotImplementedError - - def entropy(self): - """ - Returns Shannon's entropy of the probability - - :return: (float) the entropy - """ - raise NotImplementedError - - def sample(self): - """ - returns a sample from the probability distribution - - :return: (Tensorflow Tensor) the stochastic action - """ - raise NotImplementedError - - def logp(self, x): - """ - returns the of the log likelihood - - :param x: (str) the labels of each index - :return: ([float]) The log likelihood of the distribution - """ - return - self.neglogp(x) - - -class ProbabilityDistributionType(object): - """ - Parametrized family of probability distributions - """ - - def probability_distribution_class(self): - """ - returns the ProbabilityDistribution class of this type - - :return: (Type ProbabilityDistribution) the probability distribution class associated - """ - raise NotImplementedError - - def proba_distribution_from_flat(self, flat): - """ - Returns the probability distribution from flat probabilities - flat: flattened vector of parameters of probability distribution - - :param flat: ([float]) the flat 
probabilities - :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated - """ - return self.probability_distribution_class()(flat) - - def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0): - """ - returns the probability distribution from latent values - - :param pi_latent_vector: ([float]) the latent pi values - :param vf_latent_vector: ([float]) the latent vf values - :param init_scale: (float) the initial scale of the distribution - :param init_bias: (float) the initial bias of the distribution - :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated - """ - raise NotImplementedError - - def param_shape(self): - """ - returns the shape of the input parameters - - :return: ([int]) the shape - """ - raise NotImplementedError - - def sample_shape(self): - """ - returns the shape of the sampling - - :return: ([int]) the shape - """ - raise NotImplementedError - - def sample_dtype(self): - """ - returns the type of the sampling - - :return: (type) the type - """ - raise NotImplementedError - - def param_placeholder(self, prepend_shape, name=None): - """ - returns the TensorFlow placeholder for the input parameters - - :param prepend_shape: ([int]) the prepend shape - :param name: (str) the placeholder name - :return: (TensorFlow Tensor) the placeholder - """ - return tf.placeholder(dtype=tf.float32, shape=prepend_shape + self.param_shape(), name=name) - - def sample_placeholder(self, prepend_shape, name=None): - """ - returns the TensorFlow placeholder for the sampling - - :param prepend_shape: ([int]) the prepend shape - :param name: (str) the placeholder name - :return: (TensorFlow Tensor) the placeholder - """ - return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape + self.sample_shape(), name=name) - - -class CategoricalProbabilityDistributionType(ProbabilityDistributionType): - def __init__(self, n_cat): - """ - The 
probability distribution type for categorical input - - :param n_cat: (int) the number of categories - """ - self.n_cat = n_cat - - def probability_distribution_class(self): - return CategoricalProbabilityDistribution - - def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0): - pdparam = linear(pi_latent_vector, 'pi', self.n_cat, init_scale=init_scale, init_bias=init_bias) - q_values = linear(vf_latent_vector, 'q', self.n_cat, init_scale=init_scale, init_bias=init_bias) - return self.proba_distribution_from_flat(pdparam), pdparam, q_values - - def param_shape(self): - return [self.n_cat] - - def sample_shape(self): - return [] - - def sample_dtype(self): - return tf.int64 - - -class MultiCategoricalProbabilityDistributionType(ProbabilityDistributionType): - def __init__(self, n_vec): - """ - The probability distribution type for multiple categorical input - - :param n_vec: ([int]) the vectors - """ - # Cast the variable because tf does not allow uint32 - self.n_vec = n_vec.astype(np.int32) - # Check that the cast was valid - assert (self.n_vec > 0).all(), "Casting uint32 to int32 was invalid" - - def probability_distribution_class(self): - return MultiCategoricalProbabilityDistribution - - def proba_distribution_from_flat(self, flat): - return MultiCategoricalProbabilityDistribution(self.n_vec, flat) - - def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0): - pdparam = linear(pi_latent_vector, 'pi', sum(self.n_vec), init_scale=init_scale, init_bias=init_bias) - q_values = linear(vf_latent_vector, 'q', sum(self.n_vec), init_scale=init_scale, init_bias=init_bias) - return self.proba_distribution_from_flat(pdparam), pdparam, q_values - - def param_shape(self): - return [sum(self.n_vec)] - - def sample_shape(self): - return [len(self.n_vec)] - - def sample_dtype(self): - return tf.int64 - - -class 
DiagGaussianProbabilityDistributionType(ProbabilityDistributionType): - def __init__(self, size): - """ - The probability distribution type for multivariate Gaussian input - - :param size: (int) the number of dimensions of the multivariate gaussian - """ - self.size = size - - def probability_distribution_class(self): - return DiagGaussianProbabilityDistribution - - def proba_distribution_from_flat(self, flat): - """ - returns the probability distribution from flat probabilities - - :param flat: ([float]) the flat probabilities - :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated - """ - return self.probability_distribution_class()(flat) - - def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0): - mean = linear(pi_latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) - logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) - pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) - q_values = linear(vf_latent_vector, 'q', self.size, init_scale=init_scale, init_bias=init_bias) - return self.proba_distribution_from_flat(pdparam), mean, q_values - - def param_shape(self): - return [2 * self.size] - - def sample_shape(self): - return [self.size] - - def sample_dtype(self): - return tf.float32 - - -class BernoulliProbabilityDistributionType(ProbabilityDistributionType): - def __init__(self, size): - """ - The probability distribution type for Bernoulli input - - :param size: (int) the number of dimensions of the Bernoulli distribution - """ - self.size = size - - def probability_distribution_class(self): - return BernoulliProbabilityDistribution - - def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, init_scale=1.0, init_bias=0.0): - pdparam = linear(pi_latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) - q_values = linear(vf_latent_vector, 'q', self.size, 
init_scale=init_scale, init_bias=init_bias) - return self.proba_distribution_from_flat(pdparam), pdparam, q_values - - def param_shape(self): - return [self.size] - - def sample_shape(self): - return [self.size] - - def sample_dtype(self): - return tf.int32 - - -class CategoricalProbabilityDistribution(ProbabilityDistribution): - def __init__(self, logits): - """ - Probability distributions from categorical input - - :param logits: ([float]) the categorical logits input - """ - self.logits = logits - super(CategoricalProbabilityDistribution, self).__init__() - - def flatparam(self): - return self.logits - - def mode(self): - return tf.argmax(self.logits, axis=-1) - - def neglogp(self, x): - # Note: we can't use sparse_softmax_cross_entropy_with_logits because - # the implementation does not allow second-order derivatives... - one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) - return tf.nn.softmax_cross_entropy_with_logits_v2( - logits=self.logits, - labels=tf.stop_gradient(one_hot_actions)) - - def kl(self, other): - a_0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) - a_1 = other.logits - tf.reduce_max(other.logits, axis=-1, keepdims=True) - exp_a_0 = tf.exp(a_0) - exp_a_1 = tf.exp(a_1) - z_0 = tf.reduce_sum(exp_a_0, axis=-1, keepdims=True) - z_1 = tf.reduce_sum(exp_a_1, axis=-1, keepdims=True) - p_0 = exp_a_0 / z_0 - return tf.reduce_sum(p_0 * (a_0 - tf.log(z_0) - a_1 + tf.log(z_1)), axis=-1) - - def entropy(self): - a_0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True) - exp_a_0 = tf.exp(a_0) - z_0 = tf.reduce_sum(exp_a_0, axis=-1, keepdims=True) - p_0 = exp_a_0 / z_0 - return tf.reduce_sum(p_0 * (tf.log(z_0) - a_0), axis=-1) - - def sample(self): - # Gumbel-max trick to sample - # a categorical distribution (see http://amid.fish/humble-gumbel) - uniform = tf.random_uniform(tf.shape(self.logits), dtype=self.logits.dtype) - return tf.argmax(self.logits - tf.log(-tf.log(uniform)), axis=-1) - - @classmethod 
- def fromflat(cls, flat): - """ - Create an instance of this from new logits values - - :param flat: ([float]) the categorical logits input - :return: (ProbabilityDistribution) the instance from the given categorical input - """ - return cls(flat) - - -class MultiCategoricalProbabilityDistribution(ProbabilityDistribution): - def __init__(self, nvec, flat): - """ - Probability distributions from multicategorical input - - :param nvec: ([int]) the sizes of the different categorical inputs - :param flat: ([float]) the categorical logits input - """ - self.flat = flat - self.categoricals = list(map(CategoricalProbabilityDistribution, tf.split(flat, nvec, axis=-1))) - super(MultiCategoricalProbabilityDistribution, self).__init__() - - def flatparam(self): - return self.flat - - def mode(self): - return tf.stack([p.mode() for p in self.categoricals], axis=-1) - - def neglogp(self, x): - return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))]) - - def kl(self, other): - return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)]) - - def entropy(self): - return tf.add_n([p.entropy() for p in self.categoricals]) - - def sample(self): - return tf.stack([p.sample() for p in self.categoricals], axis=-1) - - @classmethod - def fromflat(cls, flat): - """ - Create an instance of this from new logits values - - :param flat: ([float]) the multi categorical logits input - :return: (ProbabilityDistribution) the instance from the given multi categorical input - """ - raise NotImplementedError - - -class DiagGaussianProbabilityDistribution(ProbabilityDistribution): - def __init__(self, flat): - """ - Probability distributions from multivariate Gaussian input - - :param flat: ([float]) the multivariate Gaussian input data - """ - self.flat = flat - mean, logstd = tf.split(axis=len(flat.shape) - 1, num_or_size_splits=2, value=flat) - self.mean = mean - self.logstd = logstd - self.std = tf.exp(logstd) - 
super(DiagGaussianProbabilityDistribution, self).__init__() - - def flatparam(self): - return self.flat - - def mode(self): - # Bounds are taken into account outside this class (during training only) - return self.mean - - def neglogp(self, x): - return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ - + 0.5 * np.log(2.0 * np.pi) * tf.cast(tf.shape(x)[-1], tf.float32) \ - + tf.reduce_sum(self.logstd, axis=-1) - - def kl(self, other): - assert isinstance(other, DiagGaussianProbabilityDistribution) - return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / - (2.0 * tf.square(other.std)) - 0.5, axis=-1) - - def entropy(self): - return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) - - def sample(self): - # Bounds are taken into acount outside this class (during training only) - # Otherwise, it changes the distribution and breaks PPO2 for instance - return self.mean + self.std * tf.random_normal(tf.shape(self.mean), - dtype=self.mean.dtype) - - @classmethod - def fromflat(cls, flat): - """ - Create an instance of this from new multivariate Gaussian input - - :param flat: ([float]) the multivariate Gaussian input data - :return: (ProbabilityDistribution) the instance from the given multivariate Gaussian input data - """ - return cls(flat) - - -class BernoulliProbabilityDistribution(ProbabilityDistribution): - def __init__(self, logits): - """ - Probability distributions from Bernoulli input - - :param logits: ([float]) the Bernoulli input data - """ - self.logits = logits - self.probabilities = tf.sigmoid(logits) - super(BernoulliProbabilityDistribution, self).__init__() - - def flatparam(self): - return self.logits - - def mode(self): - return tf.round(self.probabilities) - - def neglogp(self, x): - return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, - labels=tf.cast(x, tf.float32)), - axis=-1) - - def kl(self, other): - return 
tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, - labels=self.probabilities), axis=-1) - \ - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, - labels=self.probabilities), axis=-1) - - def entropy(self): - return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, - labels=self.probabilities), axis=-1) - - def sample(self): - samples_from_uniform = tf.random_uniform(tf.shape(self.probabilities)) - return tf.cast(math_ops.less(samples_from_uniform, self.probabilities), tf.float32) - - @classmethod - def fromflat(cls, flat): - """ - Create an instance of this from new Bernoulli input - - :param flat: ([float]) the Bernoulli input data - :return: (ProbabilityDistribution) the instance from the given Bernoulli input data - """ - return cls(flat) - - -def make_proba_dist_type(ac_space): - """ - return an instance of ProbabilityDistributionType for the correct type of action space - - :param ac_space: (Gym Space) the input action space - :return: (ProbabilityDistributionType) the appropriate instance of a ProbabilityDistributionType - """ - if isinstance(ac_space, spaces.Box): - assert len(ac_space.shape) == 1, "Error: the action space must be a vector" - return DiagGaussianProbabilityDistributionType(ac_space.shape[0]) - elif isinstance(ac_space, spaces.Discrete): - return CategoricalProbabilityDistributionType(ac_space.n) - elif isinstance(ac_space, spaces.MultiDiscrete): - return MultiCategoricalProbabilityDistributionType(ac_space.nvec) - elif isinstance(ac_space, spaces.MultiBinary): - return BernoulliProbabilityDistributionType(ac_space.n) - else: - raise NotImplementedError("Error: probability distribution, not implemented for action space of type {}." 
- .format(type(ac_space)) + - " Must be of type Gym Spaces: Box, Discrete, MultiDiscrete or MultiBinary.") - - -def shape_el(tensor, index): - """ - get the shape of a TensorFlow Tensor element - - :param tensor: (TensorFlow Tensor) the input tensor - :param index: (int) the element - :return: ([int]) the shape - """ - maybe = tensor.get_shape()[index] - if maybe is not None: - return maybe - else: - return tf.shape(tensor)[index] diff --git a/stable_baselines/common/env_checker.py b/stable_baselines/common/env_checker.py deleted file mode 100644 index bba496b8..00000000 --- a/stable_baselines/common/env_checker.py +++ /dev/null @@ -1,222 +0,0 @@ -import warnings -from typing import Union - -import gym -from gym import spaces -import numpy as np - -from stable_baselines.common.vec_env import DummyVecEnv, VecCheckNan - - -def _enforce_array_obs(observation_space: spaces.Space) -> bool: - """ - Whether to check that the returned observation is a numpy array - it is not mandatory for `Dict` and `Tuple` spaces. - """ - return not isinstance(observation_space, (spaces.Dict, spaces.Tuple)) - - -def _check_image_input(observation_space: spaces.Box) -> None: - """ - Check that the input will be compatible with Stable-Baselines - when the observation is apparently an image. - """ - if observation_space.dtype != np.uint8: - warnings.warn("It seems that your observation is an image but the `dtype` " - "of your observation_space is not `np.uint8`. " - "If your observation is not an image, we recommend you to flatten the observation " - "to have only a 1D vector") - - if np.any(observation_space.low != 0) or np.any(observation_space.high != 255): - warnings.warn("It seems that your observation space is an image but the " - "upper and lower bounds are not in [0, 255]. " - "Because the CNN policy normalize automatically the observation " - "you may encounter issue if the values are not in that range." 
- ) - - if observation_space.shape[0] < 36 or observation_space.shape[1] < 36: - warnings.warn("The minimal resolution for an image is 36x36 for the default CnnPolicy. " - "You might need to use a custom `cnn_extractor` " - "cf https://stable-baselines.readthedocs.io/en/master/guide/custom_policy.html") - - -def _check_unsupported_obs_spaces(env: gym.Env, observation_space: spaces.Space) -> None: - """Emit warnings when the observation space used is not supported by Stable-Baselines.""" - - if isinstance(observation_space, spaces.Dict) and not isinstance(env, gym.GoalEnv): - warnings.warn("The observation space is a Dict but the environment is not a gym.GoalEnv " - "(cf https://github.com/openai/gym/blob/master/gym/core.py), " - "this is currently not supported by Stable Baselines " - "(cf https://github.com/hill-a/stable-baselines/issues/133), " - "you will need to use a custom policy. " - ) - - if isinstance(observation_space, spaces.Tuple): - warnings.warn("The observation space is a Tuple," - "this is currently not supported by Stable Baselines " - "(cf https://github.com/hill-a/stable-baselines/issues/133), " - "you will need to flatten the observation and maybe use a custom policy. " - ) - - -def _check_nan(env: gym.Env) -> None: - """Check for Inf and NaN using the VecWrapper.""" - vec_env = VecCheckNan(DummyVecEnv([lambda: env])) - for _ in range(10): - action = [env.action_space.sample()] - _, _, _, _ = vec_env.step(action) - - -def _check_obs(obs: Union[tuple, dict, np.ndarray, int], - observation_space: spaces.Space, - method_name: str) -> None: - """ - Check that the observation returned by the environment - correspond to the declared one. 
- """ - if not isinstance(observation_space, spaces.Tuple): - assert not isinstance(obs, tuple), ("The observation returned by the `{}()` " - "method should be a single value, not a tuple".format(method_name)) - - # The check for a GoalEnv is done by the base class - if isinstance(observation_space, spaces.Discrete): - assert isinstance(obs, int), "The observation returned by `{}()` method must be an int".format(method_name) - elif _enforce_array_obs(observation_space): - assert isinstance(obs, np.ndarray), ("The observation returned by `{}()` " - "method must be a numpy array".format(method_name)) - - assert observation_space.contains(obs), ("The observation returned by the `{}()` " - "method does not match the given observation space".format(method_name)) - - -def _check_returned_values(env: gym.Env, observation_space: spaces.Space, action_space: spaces.Space) -> None: - """ - Check the returned values by the env when calling `.reset()` or `.step()` methods. - """ - # because env inherits from gym.Env, we assume that `reset()` and `step()` methods exists - obs = env.reset() - - _check_obs(obs, observation_space, 'reset') - - # Sample a random action - action = action_space.sample() - data = env.step(action) - - assert len(data) == 4, "The `step()` method must return four values: obs, reward, done, info" - - # Unpack - obs, reward, done, info = data - - _check_obs(obs, observation_space, 'step') - - # We also allow int because the reward will be cast to float - assert isinstance(reward, (float, int)), "The reward returned by `step()` must be a float" - assert isinstance(done, bool), "The `done` signal must be a boolean" - assert isinstance(info, dict), "The `info` returned by `step()` must be a python dictionary" - - if isinstance(env, gym.GoalEnv): - # For a GoalEnv, the keys are checked at reset - assert reward == env.compute_reward(obs['achieved_goal'], obs['desired_goal'], info) - - -def _check_spaces(env: gym.Env) -> None: - """ - Check that the observation 
def _check_render(env: gym.Env, warn: bool = True, headless: bool = False) -> None:
    """
    Check the declared render modes and the `render()`/`close()`
    method of the environment.

    :param env: (gym.Env) The environment to check
    :param warn: (bool) Whether to output additional warnings
    :param headless: (bool) Whether to disable render modes
        that require a graphical interface. False by default.
    """
    render_modes = env.metadata.get('render.modes')
    if render_modes is None:
        # Nothing declared: nothing to exercise, we can only warn
        if warn:
            warnings.warn("No render modes was declared in the environment "
                          " (env.metadata['render.modes'] is None or not defined), "
                          "you may have trouble when calling `.render()`")
        return

    # Drop render modes that need a graphical interface (useful for CI)
    if headless and 'human' in render_modes:
        render_modes.remove('human')
    # Exercise every remaining declared render mode, then close the env
    for mode in render_modes:
        env.render(mode=mode)
    env.close()


def check_env(env: gym.Env, warn: bool = True, skip_render_check: bool = True) -> None:
    """
    Check that an environment follows Gym API.
    This is particularly useful when using a custom environment.
    Please take a look at https://github.com/openai/gym/blob/master/gym/core.py
    for more information about the API.

    It also optionally check that the environment is compatible with Stable-Baselines.

    :param env: (gym.Env) The Gym environment that will be checked
    :param warn: (bool) Whether to output additional warnings
        mainly related to the interaction with Stable Baselines
    :param skip_render_check: (bool) Whether to skip the checks for the render method.
        True by default (useful for the CI)
    """
    assert isinstance(env, gym.Env), ("Your environment must inherit from the gym.Env class "
                                      "cf https://github.com/openai/gym/blob/master/gym/core.py")

    # ============= Check the spaces (observation and action) ================
    _check_spaces(env)

    # Aliases for convenience
    obs_space = env.observation_space
    act_space = env.action_space

    # Warnings only: the environment may run but misbehave with Stable Baselines algorithms
    if warn:
        _check_unsupported_obs_spaces(env, obs_space)

        if isinstance(obs_space, spaces.Box):
            n_dims = len(obs_space.shape)
            # If image, check the low/high values, the type, channels and minimal shape
            if n_dims == 3:
                _check_image_input(obs_space)
            if n_dims not in [1, 3]:
                warnings.warn("Your observation has an unconventional shape (neither an image, nor a 1D vector). "
                              "We recommend you to flatten the observation "
                              "to have only a 1D vector")

        # Non-symmetric / non-normalized Box action spaces lead to hard-to-debug issues
        if isinstance(act_space, spaces.Box) and (
                np.any(np.abs(act_space.low) != np.abs(act_space.high))
                or np.any(np.abs(act_space.low) > 1)
                or np.any(np.abs(act_space.high) > 1)):
            warnings.warn("We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) "
                          "cf https://stable-baselines.readthedocs.io/en/master/guide/rl_tips.html")

    # ============ Check the returned values ===============
    _check_returned_values(env, obs_space, act_space)

    # ==== Check the render method and the declared render modes ====
    if not skip_render_check:
        _check_render(env, warn=warn)

    # The NaN check only works with numpy array observations
    if _enforce_array_obs(obs_space):
        _check_nan(env)
def evaluate_policy(
    model: "BaseRLModel",
    env: Union[gym.Env, VecEnv],
    n_eval_episodes: int = 10,
    deterministic: bool = True,
    render: bool = False,
    callback: Optional[Callable] = None,
    reward_threshold: Optional[float] = None,
    return_episode_rewards: bool = False,
) -> Union[Tuple[float, float], Tuple[List[float], List[int]]]:
    """
    Runs policy for ``n_eval_episodes`` episodes and returns average reward.
    This is made to work only with one env.

    :param model: (BaseRLModel) The RL agent you want to evaluate.
    :param env: (gym.Env or VecEnv) The gym environment. In the case of a ``VecEnv``
        this must contain only one environment.
    :param n_eval_episodes: (int) Number of episode to evaluate the agent
    :param deterministic: (bool) Whether to use deterministic or stochastic actions
    :param render: (bool) Whether to render the environment or not
    :param callback: (callable) callback function to do additional checks,
        called after each step. Receives this frame's ``locals()`` and ``globals()``.
    :param reward_threshold: (float) Minimum expected reward per episode,
        this will raise an error if the performance is not met
    :param return_episode_rewards: (bool) If True, a list of reward per episode
        will be returned instead of the mean.
    :return: (float, float) Mean reward per episode, std of reward per episode
        returns ([float], [int]) when ``return_episode_rewards`` is True
    """
    if isinstance(env, VecEnv):
        assert env.num_envs == 1, "You must pass only one environment when using this function"

    is_recurrent = model.policy.recurrent

    episode_rewards, episode_lengths = [], []
    for i in range(n_eval_episodes):
        # Avoid double reset, as VecEnv are reset automatically
        if not isinstance(env, VecEnv) or i == 0:
            obs = env.reset()
        # Because recurrent policies need the same observation space during training and evaluation, we need to pad
        # observation to match training shape. See https://github.com/hill-a/stable-baselines/issues/1015
        if is_recurrent:
            zero_completed_obs = np.zeros((model.n_envs,) + model.observation_space.shape)
            # Only the first slot carries a real observation; the rest stays zero
            zero_completed_obs[0, :] = obs
            obs = zero_completed_obs
        done, state = False, None
        episode_reward = 0.0
        episode_length = 0
        while not done:
            action, state = model.predict(obs, state=state, deterministic=deterministic)
            new_obs, reward, done, _info = env.step(action)
            if is_recurrent:
                # Write the new observation into the padded buffer in place
                obs[0, :] = new_obs
            else:
                obs = new_obs
            episode_reward += reward
            if callback is not None:
                # NOTE: callback observes this frame's locals(), so the local
                # variable names above are effectively part of the contract
                callback(locals(), globals())
            episode_length += 1
            if render:
                env.render()
        episode_rewards.append(episode_reward)
        episode_lengths.append(episode_length)
    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)
    if reward_threshold is not None:
        assert mean_reward > reward_threshold, "Mean reward below threshold: {:.2f} < {:.2f}".format(mean_reward, reward_threshold)
    if return_episode_rewards:
        return episode_rewards, episode_lengths
    return mean_reward, std_reward
class IdentityEnv(Env):
    def __init__(self,
                 dim: Optional[int] = None,
                 space: Optional[Space] = None,
                 ep_length: int = 100):
        """
        Identity environment for testing purposes: the observation is a sample
        of the action space, and the reward is 1 when the action matches it.

        :param dim: the size of the action and observation dimension you want
            to learn. Provide at most one of `dim` and `space`. If both are
            None, then initialization proceeds with `dim=1` and `space=None`.
        :param space: the action and observation space. Provide at most one of
            `dim` and `space`.
        :param ep_length: the length of each episode in timesteps
        """
        if space is None:
            if dim is None:
                dim = 1
            space = Discrete(dim)
        else:
            assert dim is None, "arguments for both 'dim' and 'space' provided: at most one allowed"

        self.action_space = self.observation_space = space
        self.ep_length = ep_length
        self.current_step = 0
        self.num_resets = -1  # Becomes 0 after __init__ exits.
        self.reset()

    def reset(self):
        # Start a new episode and draw a fresh target state
        self.current_step = 0
        self.num_resets += 1
        self._choose_next_state()
        return self.state

    def step(self, action):
        reward = self._get_reward(action)
        self._choose_next_state()
        self.current_step += 1
        done = self.current_step >= self.ep_length
        return self.state, reward, done, {}

    def _choose_next_state(self):
        # The target the agent must reproduce is a random sample of the space
        self.state = self.action_space.sample()

    def _get_reward(self, action):
        # 1 when the action exactly matches the current state, 0 otherwise
        return 1 if np.all(self.state == action) else 0

    def render(self, mode='human'):
        pass


class IdentityEnvBox(IdentityEnv):
    def __init__(self, low=-1, high=1, eps=0.05, ep_length=100):
        """
        Identity environment for testing purposes, with a continuous (Box)
        action/observation space and a tolerance on the matching reward.

        :param low: (float) the lower bound of the box dim
        :param high: (float) the upper bound of the box dim
        :param eps: (float) the epsilon bound for correct value
        :param ep_length: (int) the length of each episode in timesteps
        """
        space = Box(low=low, high=high, shape=(1,), dtype=np.float32)
        super().__init__(ep_length=ep_length, space=space)
        self.eps = eps

    # The ``step`` override that was here was a byte-identical duplicate of
    # IdentityEnv.step and has been removed; only the reward differs.

    def _get_reward(self, action):
        # Reward within a +/- eps tolerance band around the target state
        return 1 if (self.state - self.eps) <= action <= (self.state + self.eps) else 0


class IdentityEnvMultiDiscrete(IdentityEnv):
    def __init__(self, dim=1, ep_length=100):
        """
        Identity environment for testing purposes, with a MultiDiscrete space.

        :param dim: (int) the size of the dimensions you want to learn
        :param ep_length: (int) the length of each episode in timesteps
        """
        space = MultiDiscrete([dim, dim])
        super().__init__(ep_length=ep_length, space=space)


class IdentityEnvMultiBinary(IdentityEnv):
    def __init__(self, dim=1, ep_length=100):
        """
        Identity environment for testing purposes, with a MultiBinary space.

        :param dim: (int) the size of the dimensions you want to learn
        :param ep_length: (int) the length of each episode in timesteps
        """
        space = MultiBinary(dim)
        super().__init__(ep_length=ep_length, space=space)
def observation_input(ob_space, batch_size=None, name='Ob', scale=False):
    """
    Build observation input with encoding depending on the observation space type

    When using Box ob_space, the input will be normalized between [0, 1]
    on the bounds ob_space.low and ob_space.high
    (the previous docstring incorrectly said "[1, 0]").

    :param ob_space: (Gym Space) The observation space
    :param batch_size: (int) batch size for input
        (default is None, so that resulting input placeholder can take tensors with any batch size)
    :param name: (str) tensorflow variable name for input placeholder
    :param scale: (bool) whether or not to scale the input
    :return: (TensorFlow Tensor, TensorFlow Tensor) input_placeholder, processed_input_tensor
    """
    if isinstance(ob_space, Discrete):
        # One-hot encode the discrete observation
        observation_ph = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name)
        processed_observations = tf.cast(tf.one_hot(observation_ph, ob_space.n), tf.float32)
        return observation_ph, processed_observations

    elif isinstance(ob_space, Box):
        observation_ph = tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=ob_space.dtype, name=name)
        processed_observations = tf.cast(observation_ph, tf.float32)
        # rescale to [0, 1] if the bounds are defined (finite and non-degenerate)
        if (scale and
                not np.any(np.isinf(ob_space.low)) and not np.any(np.isinf(ob_space.high)) and
                np.any((ob_space.high - ob_space.low) != 0)):

            # equivalent to processed_observations / 255.0 when bounds are set to [0, 255]
            processed_observations = ((processed_observations - ob_space.low) / (ob_space.high - ob_space.low))
        return observation_ph, processed_observations

    elif isinstance(ob_space, MultiBinary):
        observation_ph = tf.placeholder(shape=(batch_size, ob_space.n), dtype=tf.int32, name=name)
        processed_observations = tf.cast(observation_ph, tf.float32)
        return observation_ph, processed_observations

    elif isinstance(ob_space, MultiDiscrete):
        # One-hot encode each sub-dimension independently, then concatenate
        observation_ph = tf.placeholder(shape=(batch_size, len(ob_space.nvec)), dtype=tf.int32, name=name)
        processed_observations = tf.concat([
            tf.cast(tf.one_hot(input_split, ob_space.nvec[i]), tf.float32) for i, input_split
            in enumerate(tf.split(observation_ph, len(ob_space.nvec), axis=-1))
        ], axis=-1)
        return observation_ph, processed_observations

    else:
        raise NotImplementedError("Error: the model does not support input space of type {}".format(
            type(ob_space).__name__))
def safe_mean(arr):
    """
    Compute the mean of an array if there is at least one element.
    For empty array, return nan. It is used for logging only.

    :param arr: (np.ndarray)
    :return: (float)
    """
    if len(arr) == 0:
        return np.nan
    return np.mean(arr)


def discount(vector, gamma):
    """
    computes discounted sums along 0th dimension of vector x.
    y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
    where k = len(x) - t - 1

    :param vector: (np.ndarray) the input vector
    :param gamma: (float) the discount value
    :return: (np.ndarray) the output vector
    """
    assert vector.ndim >= 1
    # Run the IIR filter y[n] = x[n] + gamma * y[n-1] over the reversed signal,
    # then reverse back: this is exactly the backward discounted recursion.
    reversed_result = scipy.signal.lfilter([1], [1, -gamma], vector[::-1], axis=0)
    return reversed_result[::-1]


def explained_variance(y_pred, y_true):
    """
    Computes fraction of variance that ypred explains about y.
    Returns 1 - Var[y-ypred] / Var[y]

    interpretation:
        ev=0  =>  might as well have predicted zero
        ev=1  =>  perfect prediction
        ev<0  =>  worse than just predicting zero

    :param y_pred: (np.ndarray) the prediction
    :param y_true: (np.ndarray) the expected value
    :return: (float) explained variance of ypred and y
    """
    assert y_true.ndim == 1 and y_pred.ndim == 1
    var_y = np.var(y_true)
    if var_y == 0:
        # Undefined when the target is constant
        return np.nan
    return 1 - np.var(y_true - y_pred) / var_y
def explained_variance_2d(y_pred, y_true):
    """
    Computes fraction of variance that ypred explains about y, for 2D arrays.
    Returns 1 - Var[y-ypred] / Var[y], computed independently for each column.

    interpretation:
        ev=0  =>  might as well have predicted zero
        ev=1  =>  perfect prediction
        ev<0  =>  worse than just predicting zero

    :param y_pred: (np.ndarray) the prediction
    :param y_true: (np.ndarray) the expected value
    :return: (np.ndarray) explained variance of ypred and y, one value per column
    """
    assert y_true.ndim == 2 and y_pred.ndim == 2
    var_y = np.var(y_true, axis=0)
    # Bug fix: the residual variance must also be taken per column (axis=0);
    # previously np.var() collapsed the residuals to a single scalar, which
    # mixed the columns together while var_y stayed per-column.
    explained_var = 1 - np.var(y_true - y_pred, axis=0) / var_y
    # Columns with (near) constant targets have undefined EV; report 0
    explained_var[var_y < 1e-10] = 0
    return explained_var


def flatten_arrays(arrs):
    """
    flattens a list of arrays down to 1D

    :param arrs: ([np.ndarray]) arrays
    :return: (np.ndarray) 1D flattened array
    """
    return np.concatenate([arr.flat for arr in arrs])


def unflatten_vector(vec, shapes):
    """
    reshape a flattened array back into a list of arrays

    :param vec: (np.ndarray) 1D array
    :param shapes: ([tuple]) the target shape of each output array
    :return: ([np.ndarray]) reshaped arrays
    """
    i = 0
    arrs = []
    for shape in shapes:
        # int() avoids a numpy integer leaking into the slice bounds
        size = int(np.prod(shape))
        arrs.append(vec[i:i + size].reshape(shape))
        i += size
    return arrs
def discount_with_boundaries(rewards, episode_starts, gamma):
    """
    computes discounted sums along 0th dimension of x (reward), while taking into account the start of each episode.
    y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
    where k = len(x) - t - 1

    :param rewards: (np.ndarray) the input vector (rewards)
    :param episode_starts: (np.ndarray) 1d array of bools/0-1 flags, indicating when a new episode has started
    :param gamma: (float) the discount factor
    :return: (np.ndarray) the output vector (discounted rewards)
    """
    discounted_rewards = np.zeros_like(rewards)
    n_samples = rewards.shape[0]
    discounted_rewards[n_samples - 1] = rewards[n_samples - 1]
    # Backward recursion; (1 - episode_starts[step + 1]) zeroes the bootstrap
    # term at episode boundaries so returns do not leak across episodes
    for step in range(n_samples - 2, -1, -1):
        discounted_rewards[step] = rewards[step] + gamma * discounted_rewards[step + 1] * (1 - episode_starts[step + 1])
    return discounted_rewards


def scale_action(action_space, action):
    """
    Rescale the action from [low, high] to [-1, 1]
    (no need for symmetric action space)

    :param action_space: (gym.spaces.box.Box)
    :param action: (np.ndarray)
    :return: (np.ndarray)
    """
    low, high = action_space.low, action_space.high
    return 2.0 * ((action - low) / (high - low)) - 1.0


def unscale_action(action_space, scaled_action):
    """
    Rescale the action from [-1, 1] to [low, high]
    (no need for symmetric action space)

    :param action_space: (gym.spaces.box.Box)
    :param scaled_action: (np.ndarray) action in the [-1, 1] range
    :return: (np.ndarray)
    """
    low, high = action_space.low, action_space.high
    return low + (0.5 * (scaled_action + 1.0) * (high - low))
def zipsame(*seqs):
    """
    Performes a zip function, but asserts that all zipped elements are of the same size

    :param seqs: a list of arrays that are zipped together
    :return: the zipped arguments
    """
    length = len(seqs[0])
    assert all(len(seq) == length for seq in seqs[1:])
    return zip(*seqs)


def set_global_seeds(seed):
    """
    set the seed for python random, tensorflow, numpy and gym spaces

    :param seed: (int) the seed
    """
    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # prng was removed in latest gym version
    if hasattr(gym.spaces, 'prng'):
        gym.spaces.prng.seed(seed)


def boolean_flag(parser, name, default=False, help_msg=None):
    """
    Add a boolean flag to argparse parser.

    :param parser: (argparse.Parser) parser to add the flag to
    :param name: (str) ``--<name>`` will enable the flag, while ``--no-<name>`` will disable it
    :param default: (bool) default value of the flag
    :param help_msg: (str) help string for the flag
    """
    dest = name.replace('-', '_')
    parser.add_argument("--" + name, action="store_true", default=default, dest=dest, help=help_msg)
    parser.add_argument("--no-" + name, action="store_false", dest=dest)


def mpi_rank_or_zero():
    """
    Return the MPI rank if mpi4py is installed. Otherwise, return 0.
    :return: (int)
    """
    try:
        # Bug fix: a plain ``import mpi4py`` does NOT load the ``MPI``
        # submodule, so ``mpi4py.MPI`` raised AttributeError (uncaught by
        # ``except ImportError``) when mpi4py was installed-but-unloaded.
        import mpi4py.MPI
        return mpi4py.MPI.COMM_WORLD.Get_rank()
    except ImportError:
        return 0


def flatten_lists(listoflists):
    """
    Flatten a python list of list

    :param listoflists: (list(list))
    :return: (list)
    """
    return [el for list_ in listoflists for el in list_]
class MpiAdam(object):
    def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None,
                 sess=None):
        """
        A parallel MPI implementation of the Adam optimizer for TensorFlow
        https://arxiv.org/abs/1412.6980

        :param var_list: ([TensorFlow Tensor]) the variables
        :param beta1: (float) Adam beta1 parameter
        :param beta2: (float) Adam beta2 parameter
        :param epsilon: (float) to help with preventing arithmetic issues
        :param scale_grad_by_procs: (bool) if the scaling should be done by processes
        :param comm: (MPI Communicators) if None, mpi4py.MPI.COMM_WORLD
        :param sess: (TensorFlow Session) if None, tf.get_default_session()
        """
        self.var_list = var_list
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.scale_grad_by_procs = scale_grad_by_procs
        # Total number of scalar parameters across all variables
        size = sum(tf_utils.numel(v) for v in var_list)
        # Exponential moving average of gradient values
        # "first moment estimate" m in the paper
        self.exp_avg = np.zeros(size, 'float32')
        # Exponential moving average of squared gradient values
        # "second raw moment estimate" v in the paper
        self.exp_avg_sq = np.zeros(size, 'float32')
        self.step = 0
        # TF helpers to read/write all variables as one flat vector
        self.setfromflat = tf_utils.SetFromFlat(var_list, sess=sess)
        self.getflat = tf_utils.GetFlat(var_list, sess=sess)
        self.comm = mpi4py.MPI.COMM_WORLD if comm is None else comm

    def update(self, local_grad, learning_rate):
        """
        update the values of the graph

        :param local_grad: (numpy float) the gradient
        :param learning_rate: (float) the learning_rate for the update
        """
        # Periodically verify all workers still hold identical parameters
        if self.step % 100 == 0:
            self.check_synced()
        local_grad = local_grad.astype('float32')
        # Sum the gradients over all MPI workers
        global_grad = np.zeros_like(local_grad)
        self.comm.Allreduce(local_grad, global_grad, op=mpi4py.MPI.SUM)
        if self.scale_grad_by_procs:
            # Turn the summed gradient into an average over workers
            global_grad /= self.comm.Get_size()

        self.step += 1
        # Learning rate with bias correction
        step_size = learning_rate * np.sqrt(1 - self.beta2 ** self.step) / (1 - self.beta1 ** self.step)
        # Decay the first and second moment running average coefficient
        self.exp_avg = self.beta1 * self.exp_avg + (1 - self.beta1) * global_grad
        self.exp_avg_sq = self.beta2 * self.exp_avg_sq + (1 - self.beta2) * (global_grad * global_grad)
        # Adam parameter delta, applied to the flat parameter vector
        step = (- step_size) * self.exp_avg / (np.sqrt(self.exp_avg_sq) + self.epsilon)
        self.setfromflat(self.getflat() + step)

    def sync(self):
        """
        syncronize the MPI threads
        """
        # Broadcast root's parameters to every worker
        theta = self.getflat()
        self.comm.Bcast(theta, root=0)
        self.setfromflat(theta)

    def check_synced(self):
        """
        confirm the MPI threads are synced
        """
        if self.comm.Get_rank() == 0:  # this is root
            theta = self.getflat()
            self.comm.Bcast(theta, root=0)
        else:
            thetalocal = self.getflat()
            thetaroot = np.empty_like(thetalocal)
            self.comm.Bcast(thetaroot, root=0)
            # Every worker must match root exactly
            assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal)


@tf_utils.in_session
def test_mpi_adam():
    """
    tests the MpiAdam object's functionality
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    a_var = tf.Variable(np.random.randn(3).astype('float32'))
    b_var = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a_var)) + tf.reduce_sum(tf.sin(b_var))

    learning_rate = 1e-2
    # Reference run: plain TF Adam on the same loss
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    do_update = tf_utils.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for step in range(10):
        print(step, do_update())

    # Re-seed and re-init so the MpiAdam run starts from the same point
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a_var, b_var]
    lossandgrad = tf_utils.function([], [loss, tf_utils.flatgrad(loss, var_list)], updates=[update_op])
    adam = MpiAdam(var_list)

    for step in range(10):
        loss, grad = lossandgrad()
        adam.update(grad, learning_rate)
        print(step, loss)


if __name__ == "__main__":
    # Run with mpirun -np 2 python <filename>
    test_mpi_adam()
def mpi_mean(arr, axis=0, comm=None, keepdims=False):
    """
    calculates the mean of an array, using MPI

    :param arr: (np.ndarray)
    :param axis: (int or tuple or list) the axis to run the means over
    :param comm: (MPI Communicators) if None, MPI.COMM_WORLD
    :param keepdims: (bool) keep the other dimensions intact
    :return: (np.ndarray or Number, float) the mean over ``axis`` across all
        MPI workers, and the total element count along ``axis``
    """
    arr = np.asarray(arr)
    assert arr.ndim > 0
    if comm is None:
        comm = MPI.COMM_WORLD
    xsum = arr.sum(axis=axis, keepdims=keepdims)
    size = xsum.size
    # Pack the local sum and the local count into one buffer so a single
    # Allreduce aggregates both across workers
    # NOTE(review): arr.shape[axis] only works for an integer axis despite the
    # docstring mentioning tuple/list -- confirm intended usage
    localsum = np.zeros(size + 1, arr.dtype)
    localsum[:size] = xsum.ravel()
    localsum[size] = arr.shape[axis]
    globalsum = np.zeros_like(localsum)
    comm.Allreduce(localsum, globalsum, op=MPI.SUM)
    return globalsum[:size].reshape(xsum.shape) / globalsum[size], globalsum[size]


def mpi_moments(arr, axis=0, comm=None, keepdims=False):
    """
    calculates the mean and std of an array, using MPI

    :param arr: (np.ndarray)
    :param axis: (int or tuple or list) the axis to run the moments over
    :param comm: (MPI Communicators) if None, MPI.COMM_WORLD
    :param keepdims: (bool) keep the other dimensions intact
    :return: (np.ndarray, np.ndarray, float) mean, std and total count
        over ``axis`` across all MPI workers
    """
    arr = np.asarray(arr)
    assert arr.ndim > 0
    # Global mean first, then the mean of squared deviations from it
    mean, count = mpi_mean(arr, axis=axis, comm=comm, keepdims=True)
    sqdiffs = np.square(arr - mean)
    meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True)
    assert count1 == count
    std = np.sqrt(meansqdiff)
    if not keepdims:
        # NOTE(review): this reshape also assumes an integer axis -- confirm
        newshape = mean.shape[:axis] + mean.shape[axis + 1:]
        mean = mean.reshape(newshape)
        std = std.reshape(newshape)
    return mean, std, count
class RunningMeanStd(object):
    def __init__(self, epsilon=1e-2, shape=()):
        """
        calulates the running mean and std of a data stream
        https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm

        :param epsilon: (float) helps with arithmetic issues
        :param shape: (tuple) the shape of the data stream's output
        """
        # Running sums are float64 TF variables so the statistics persist in
        # the graph (and survive checkpointing); both sumsq and count start at
        # epsilon to avoid division-by-zero / negative variance at startup
        self._sum = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(0.0),
            name="runningsum", trainable=False)
        self._sumsq = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(epsilon),
            name="runningsumsq", trainable=False)
        self._count = tf.get_variable(
            dtype=tf.float64,
            shape=(),
            initializer=tf.constant_initializer(epsilon),
            name="count", trainable=False)
        self.shape = shape

        # mean = E[x]; std = sqrt(max(E[x^2] - E[x]^2, 1e-2)) (floored for stability)
        self.mean = tf.cast(self._sum / self._count, tf.float32)
        self.std = tf.sqrt(tf.maximum(tf.cast(self._sumsq / self._count, tf.float32) - tf.square(self.mean),
                                      1e-2))

        # TF function that adds freshly aggregated (sum, sumsq, count) deltas
        # to the running variables
        newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
        newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
        newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
        self.incfiltparams = tf_util.function([newsum, newsumsq, newcount], [],
                                              updates=[tf.assign_add(self._sum, newsum),
                                                       tf.assign_add(self._sumsq, newsumsq),
                                                       tf.assign_add(self._count, newcount)])

    def update(self, data):
        """
        update the running mean and std

        :param data: (np.ndarray) the data
        """
        data = data.astype('float64')
        data_size = int(np.prod(self.shape))
        # Pack (sum, sum of squares, count) into one buffer so a single MPI
        # Allreduce aggregates the batch statistics of every worker
        totalvec = np.zeros(data_size * 2 + 1, 'float64')
        addvec = np.concatenate([data.sum(axis=0).ravel(), np.square(data).sum(axis=0).ravel(),
                                 np.array([len(data)], dtype='float64')])
        mpi4py.MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=mpi4py.MPI.SUM)
        self.incfiltparams(totalvec[0: data_size].reshape(self.shape),
                           totalvec[data_size: 2 * data_size].reshape(self.shape), totalvec[2 * data_size])


@tf_util.in_session
def test_dist():
    """
    test the running mean std
    """
    np.random.seed(0)
    p_1, p_2, p_3 = (np.random.randn(3, 1), np.random.randn(4, 1), np.random.randn(5, 1))
    q_1, q_2, q_3 = (np.random.randn(6, 1), np.random.randn(7, 1), np.random.randn(8, 1))

    # Requires exactly two MPI workers: rank 0 gets p_*, rank 1 gets q_*
    comm = mpi4py.MPI.COMM_WORLD
    assert comm.Get_size() == 2
    if comm.Get_rank() == 0:
        x_1, x_2, x_3 = p_1, p_2, p_3
    elif comm.Get_rank() == 1:
        x_1, x_2, x_3 = q_1, q_2, q_3
    else:
        assert False

    rms = RunningMeanStd(epsilon=0.0, shape=(1,))
    tf_util.initialize()

    rms.update(x_1)
    rms.update(x_2)
    rms.update(x_3)

    # The distributed statistics must equal those of the concatenated data
    bigvec = np.concatenate([p_1, p_2, p_3, q_1, q_2, q_3])

    def checkallclose(var_1, var_2):
        print(var_1, var_2)
        return np.allclose(var_1, var_2)

    assert checkallclose(
        bigvec.mean(axis=0),
        rms.mean.eval(),
    )
    assert checkallclose(
        bigvec.std(axis=0),
        rms.std.eval(),
    )


if __name__ == "__main__":
    # Run with mpirun -np 2 python <filename>
    test_dist()
class AdaptiveParamNoiseSpec(object):
    """
    Implements adaptive parameter noise

    :param initial_stddev: (float) the initial value for the standard deviation of the noise
    :param desired_action_stddev: (float) the desired value for the standard deviation of the noise
    :param adoption_coefficient: (float) the update coefficient for the standard deviation of the noise
    """

    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01):
        self.initial_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adoption_coefficient = adoption_coefficient
        # Current (adapted) stddev, updated by adapt()
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        """
        update the standard deviation for the parameter noise

        :param distance: (float) the noise distance applied to the parameters
        """
        if distance <= self.desired_action_stddev:
            # Not enough action noise yet: increase stddev
            self.current_stddev *= self.adoption_coefficient
        else:
            # Too much action noise: decrease stddev
            self.current_stddev /= self.adoption_coefficient

    def get_stats(self):
        """
        return the standard deviation for the parameter noise

        :return: (dict) the stats of the noise
        """
        return {'param_noise_stddev': self.current_stddev}

    def __repr__(self):
        fmt = 'AdaptiveParamNoiseSpec(initial_stddev={}, desired_action_stddev={}, adoption_coefficient={})'
        return fmt.format(self.initial_stddev, self.desired_action_stddev, self.adoption_coefficient)


class ActionNoise(ABC):
    """
    The action noise base class
    """

    def __init__(self):
        super(ActionNoise, self).__init__()

    def reset(self) -> None:
        """
        call end of episode reset for the noise
        """
        pass

    @abstractmethod
    def __call__(self) -> np.ndarray:
        raise NotImplementedError()


class NormalActionNoise(ActionNoise):
    """
    A Gaussian action noise

    :param mean: (float) the mean value of the noise
    :param sigma: (float) the scale of the noise (std here)
    """

    def __init__(self, mean, sigma):
        super().__init__()
        self._mu = mean
        self._sigma = sigma

    def __call__(self) -> np.ndarray:
        # Independent Gaussian sample on every call; no state to reset
        return np.random.normal(self._mu, self._sigma)

    def __repr__(self) -> str:
        return 'NormalActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma)


class OrnsteinUhlenbeckActionNoise(ActionNoise):
    """
    A Ornstein Uhlenbeck action noise, this is designed to approximate brownian motion with friction.

    Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab

    :param mean: (float) the mean of the noise
    :param sigma: (float) the scale of the noise
    :param theta: (float) the rate of mean reversion
    :param dt: (float) the timestep for the noise
    :param initial_noise: ([float]) the initial value for the noise output, (if None: 0)
    """

    def __init__(self, mean, sigma, theta=.15, dt=1e-2, initial_noise=None):
        super().__init__()
        self._theta = theta
        self._mu = mean
        self._sigma = sigma
        self._dt = dt
        self.initial_noise = initial_noise
        self.noise_prev = None
        self.reset()

    def __call__(self) -> np.ndarray:
        # Euler-Maruyama step: mean-reverting drift plus scaled Gaussian diffusion
        drift = self._theta * (self._mu - self.noise_prev) * self._dt
        diffusion = self._sigma * np.sqrt(self._dt) * np.random.normal(size=self._mu.shape)
        noise = self.noise_prev + drift + diffusion
        self.noise_prev = noise
        return noise

    def reset(self) -> None:
        """
        reset the Ornstein Uhlenbeck noise, to the initial position
        """
        self.noise_prev = self.initial_noise if self.initial_noise is not None else np.zeros_like(self._mu)

    def __repr__(self) -> str:
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma)
def nature_cnn(scaled_images, **kwargs):
    """
    CNN from Nature paper.

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
    :return: (TensorFlow Tensor) The CNN output layer
    """
    activ = tf.nn.relu
    # (scope, n_filters, filter_size, stride) for the three convolutional layers
    conv_specs = [('c1', 32, 8, 4), ('c2', 64, 4, 2), ('c3', 64, 3, 1)]
    extracted = scaled_images
    for scope, n_filters, filter_size, stride in conv_specs:
        extracted = activ(conv(extracted, scope, n_filters=n_filters, filter_size=filter_size,
                               stride=stride, init_scale=np.sqrt(2), **kwargs))
    extracted = conv_to_fc(extracted)
    return activ(linear(extracted, 'fc1', n_hidden=512, init_scale=np.sqrt(2)))


def mlp_extractor(flat_observations, net_arch, act_fun):
    """
    Constructs an MLP that receives observations as an input and outputs a latent
    representation for the policy and a value network.

    ``net_arch`` specifies the size of the hidden layers and how many are shared
    between the policy and the value network:

    1. Any number (zero allowed) of leading integers: sizes of the shared layers.
    2. An optional dict ``dict(vf=[...], pi=[...])`` describing the non-shared
       layers of the value and policy networks; a missing key means no
       non-shared layers for that network. Everything after the dict is ignored.

    Example: ``[55, dict(vf=[255, 255], pi=[128])]`` builds one shared layer of
    size 55, two value-only layers of size 255 and one policy-only layer of
    size 128. ``[128, 128]`` is a fully shared two-layer network.

    :param flat_observations: (tf.Tensor) The observations to base policy and value function on.
    :param net_arch: ([int or dict]) The specification of the policy and value networks.
        See above for details on its formatting.
    :param act_fun: (tf function) The activation function to use for the networks.
    :return: (tf.Tensor, tf.Tensor) latent_policy, latent_value of the specified network.
        If all layers are shared, then ``latent_policy == latent_value``
    """
    latent = flat_observations
    pi_only_layers = []  # sizes of the layers exclusive to the policy network
    vf_only_layers = []  # sizes of the layers exclusive to the value network

    # Build the shared stack; the first dict entry terminates it
    for idx, layer in enumerate(net_arch):
        if isinstance(layer, int):
            latent = act_fun(linear(latent, "shared_fc{}".format(idx), layer, init_scale=np.sqrt(2)))
            continue
        assert isinstance(layer, dict), "Error: the net_arch list can only contain ints and dicts"
        if 'pi' in layer:
            assert isinstance(layer['pi'], list), "Error: net_arch[-1]['pi'] must contain a list of integers."
            pi_only_layers = layer['pi']
        if 'vf' in layer:
            assert isinstance(layer['vf'], list), "Error: net_arch[-1]['vf'] must contain a list of integers."
            vf_only_layers = layer['vf']
        break  # From here on the network splits up in policy and value network

    # Build the two non-shared heads on top of the shared latent
    latent_policy = latent
    latent_value = latent
    for idx, (pi_size, vf_size) in enumerate(zip_longest(pi_only_layers, vf_only_layers)):
        if pi_size is not None:
            assert isinstance(pi_size, int), "Error: net_arch[-1]['pi'] must only contain integers."
            latent_policy = act_fun(linear(latent_policy, "pi_fc{}".format(idx), pi_size, init_scale=np.sqrt(2)))

        if vf_size is not None:
            assert isinstance(vf_size, int), "Error: net_arch[-1]['vf'] must only contain integers."
            latent_value = act_fun(linear(latent_value, "vf_fc{}".format(idx), vf_size, init_scale=np.sqrt(2)))

    return latent_policy, latent_value
self._processed_obs = observation_input(ob_space, n_batch, scale=scale) - else: - self._obs_ph, self._processed_obs = obs_phs - - self._action_ph = None - if add_action_ph: - self._action_ph = tf.placeholder(dtype=ac_space.dtype, shape=(n_batch,) + ac_space.shape, - name="action_ph") - self.sess = sess - self.reuse = reuse - self.ob_space = ob_space - self.ac_space = ac_space - - @property - def is_discrete(self): - """bool: is action space discrete.""" - return isinstance(self.ac_space, Discrete) - - @property - def initial_state(self): - """ - The initial state of the policy. For feedforward policies, None. For a recurrent policy, - a NumPy array of shape (self.n_env, ) + state_shape. - """ - assert not self.recurrent, "When using recurrent policies, you must overwrite `initial_state()` method" - return None - - @property - def obs_ph(self): - """tf.Tensor: placeholder for observations, shape (self.n_batch, ) + self.ob_space.shape.""" - return self._obs_ph - - @property - def processed_obs(self): - """tf.Tensor: processed observations, shape (self.n_batch, ) + self.ob_space.shape. - - The form of processing depends on the type of the observation space, and the parameters - whether scale is passed to the constructor; see observation_input for more information.""" - return self._processed_obs - - @property - def action_ph(self): - """tf.Tensor: placeholder for actions, shape (self.n_batch, ) + self.ac_space.shape.""" - return self._action_ph - - @staticmethod - def _kwargs_check(feature_extraction, kwargs): - """ - Ensure that the user is not passing wrong keywords - when using policy_kwargs. 
- - :param feature_extraction: (str) - :param kwargs: (dict) - """ - # When using policy_kwargs parameter on model creation, - # all keywords arguments must be consumed by the policy constructor except - # the ones for the cnn_extractor network (cf nature_cnn()), where the keywords arguments - # are not passed explicitly (using **kwargs to forward the arguments) - # that's why there should be not kwargs left when using the mlp_extractor - # (in that case the keywords arguments are passed explicitly) - if feature_extraction == 'mlp' and len(kwargs) > 0: - raise ValueError("Unknown keywords for policy: {}".format(kwargs)) - - @abstractmethod - def step(self, obs, state=None, mask=None): - """ - Returns the policy for a single step - - :param obs: ([float] or [int]) The current observation of the environment - :param state: ([float]) The last states (used in recurrent policies) - :param mask: ([float]) The last masks (used in recurrent policies) - :return: ([float], [float], [float], [float]) actions, values, states, neglogp - """ - raise NotImplementedError - - @abstractmethod - def proba_step(self, obs, state=None, mask=None): - """ - Returns the action probability for a single step - - :param obs: ([float] or [int]) The current observation of the environment - :param state: ([float]) The last states (used in recurrent policies) - :param mask: ([float]) The last masks (used in recurrent policies) - :return: ([float]) the action probability - """ - raise NotImplementedError - - -class ActorCriticPolicy(BasePolicy): - """ - Policy object that implements actor critic - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - 
:param reuse: (bool) If the policy is reusable or not - :param scale: (bool) whether or not to scale the input - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, scale=False): - super(ActorCriticPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, - scale=scale) - self._pdtype = make_proba_dist_type(ac_space) - self._policy = None - self._proba_distribution = None - self._value_fn = None - self._action = None - self._deterministic_action = None - - def _setup_init(self): - """Sets up the distributions, actions, and value.""" - with tf.variable_scope("output", reuse=True): - assert self.policy is not None and self.proba_distribution is not None and self.value_fn is not None - self._action = self.proba_distribution.sample() - self._deterministic_action = self.proba_distribution.mode() - self._neglogp = self.proba_distribution.neglogp(self.action) - if isinstance(self.proba_distribution, CategoricalProbabilityDistribution): - self._policy_proba = tf.nn.softmax(self.policy) - elif isinstance(self.proba_distribution, DiagGaussianProbabilityDistribution): - self._policy_proba = [self.proba_distribution.mean, self.proba_distribution.std] - elif isinstance(self.proba_distribution, BernoulliProbabilityDistribution): - self._policy_proba = tf.nn.sigmoid(self.policy) - elif isinstance(self.proba_distribution, MultiCategoricalProbabilityDistribution): - self._policy_proba = [tf.nn.softmax(categorical.flatparam()) - for categorical in self.proba_distribution.categoricals] - else: - self._policy_proba = [] # it will return nothing, as it is not implemented - self._value_flat = self.value_fn[:, 0] - - @property - def pdtype(self): - """ProbabilityDistributionType: type of the distribution for stochastic actions.""" - return self._pdtype - - @property - def policy(self): - """tf.Tensor: policy output, e.g. 
logits.""" - return self._policy - - @property - def proba_distribution(self): - """ProbabilityDistribution: distribution of stochastic actions.""" - return self._proba_distribution - - @property - def value_fn(self): - """tf.Tensor: value estimate, of shape (self.n_batch, 1)""" - return self._value_fn - - @property - def value_flat(self): - """tf.Tensor: value estimate, of shape (self.n_batch, )""" - return self._value_flat - - @property - def action(self): - """tf.Tensor: stochastic action, of shape (self.n_batch, ) + self.ac_space.shape.""" - return self._action - - @property - def deterministic_action(self): - """tf.Tensor: deterministic action, of shape (self.n_batch, ) + self.ac_space.shape.""" - return self._deterministic_action - - @property - def neglogp(self): - """tf.Tensor: negative log likelihood of the action sampled by self.action.""" - return self._neglogp - - @property - def policy_proba(self): - """tf.Tensor: parameters of the probability distribution. Depends on pdtype.""" - return self._policy_proba - - @abstractmethod - def step(self, obs, state=None, mask=None, deterministic=False): - """ - Returns the policy for a single step - - :param obs: ([float] or [int]) The current observation of the environment - :param state: ([float]) The last states (used in recurrent policies) - :param mask: ([float]) The last masks (used in recurrent policies) - :param deterministic: (bool) Whether or not to return deterministic actions. 
- :return: ([float], [float], [float], [float]) actions, values, states, neglogp - """ - raise NotImplementedError - - @abstractmethod - def value(self, obs, state=None, mask=None): - """ - Returns the value for a single step - - :param obs: ([float] or [int]) The current observation of the environment - :param state: ([float]) The last states (used in recurrent policies) - :param mask: ([float]) The last masks (used in recurrent policies) - :return: ([float]) The associated value of the action - """ - raise NotImplementedError - - -class RecurrentActorCriticPolicy(ActorCriticPolicy): - """ - Actor critic policy object uses a previous state in the computation for the current step. - NOTE: this class is not limited to recurrent neural network policies, - see https://github.com/hill-a/stable-baselines/issues/241 - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param state_shape: (tuple) shape of the per-environment state space. 
- :param reuse: (bool) If the policy is reusable or not - :param scale: (bool) whether or not to scale the input - """ - - recurrent = True - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, - state_shape, reuse=False, scale=False): - super(RecurrentActorCriticPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, - n_batch, reuse=reuse, scale=scale) - - with tf.variable_scope("input", reuse=False): - self._dones_ph = tf.placeholder(tf.float32, (n_batch, ), name="dones_ph") # (done t-1) - state_ph_shape = (self.n_env, ) + tuple(state_shape) - self._states_ph = tf.placeholder(tf.float32, state_ph_shape, name="states_ph") - - initial_state_shape = (self.n_env, ) + tuple(state_shape) - self._initial_state = np.zeros(initial_state_shape, dtype=np.float32) - - @property - def initial_state(self): - return self._initial_state - - @property - def dones_ph(self): - """tf.Tensor: placeholder for whether episode has terminated (done), shape (self.n_batch, ). - Internally used to reset the state before the next episode starts.""" - return self._dones_ph - - @property - def states_ph(self): - """tf.Tensor: placeholder for states, shape (self.n_env, ) + state_shape.""" - return self._states_ph - - @abstractmethod - def value(self, obs, state=None, mask=None): - """ - Cf base class doc. - """ - raise NotImplementedError - - -class LstmPolicy(RecurrentActorCriticPolicy): - """ - Policy object that implements actor critic, using LSTMs. 
- - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param n_lstm: (int) The number of LSTM cells (for recurrent policies) - :param reuse: (bool) If the policy is reusable or not - :param layers: ([int]) The size of the Neural network before the LSTM layer (if None, default to [64, 64]) - :param net_arch: (list) Specification of the actor-critic policy network architecture. Notation similar to the - format described in mlp_extractor but with additional support for a 'lstm' entry in the shared network part. - :param act_fun: (tf.func) the activation function to use in the neural network. - :param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction - :param layer_norm: (bool) Whether or not to use layer normalizing LSTMs - :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp") - :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - recurrent = True - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, layers=None, - net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn, layer_norm=False, feature_extraction="cnn", - **kwargs): - # state_shape = [n_lstm * 2] dim because of the cell and hidden states of the LSTM - super(LstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, - state_shape=(2 * n_lstm, ), reuse=reuse, - scale=(feature_extraction == "cnn")) - - self._kwargs_check(feature_extraction, kwargs) - - if net_arch is None: # Legacy mode - if layers is None: - layers = [64, 64] - else: - warnings.warn("The layers parameter is 
deprecated. Use the net_arch parameter instead.") - - with tf.variable_scope("model", reuse=reuse): - if feature_extraction == "cnn": - extracted_features = cnn_extractor(self.processed_obs, **kwargs) - else: - extracted_features = tf.layers.flatten(self.processed_obs) - for i, layer_size in enumerate(layers): - extracted_features = act_fun(linear(extracted_features, 'pi_fc' + str(i), n_hidden=layer_size, - init_scale=np.sqrt(2))) - input_sequence = batch_to_seq(extracted_features, self.n_env, n_steps) - masks = batch_to_seq(self.dones_ph, self.n_env, n_steps) - rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm, - layer_norm=layer_norm) - rnn_output = seq_to_batch(rnn_output) - value_fn = linear(rnn_output, 'vf', 1) - - self._proba_distribution, self._policy, self.q_value = \ - self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output) - - self._value_fn = value_fn - else: # Use the new net_arch parameter - if layers is not None: - warnings.warn("The new net_arch parameter overrides the deprecated layers parameter.") - if feature_extraction == "cnn": - raise NotImplementedError() - - with tf.variable_scope("model", reuse=reuse): - latent = tf.layers.flatten(self.processed_obs) - policy_only_layers = [] # Layer sizes of the network that only belongs to the policy network - value_only_layers = [] # Layer sizes of the network that only belongs to the value network - - # Iterate through the shared layers and build the shared parts of the network - lstm_layer_constructed = False - for idx, layer in enumerate(net_arch): - if isinstance(layer, int): # Check that this is a shared layer - layer_size = layer - latent = act_fun(linear(latent, "shared_fc{}".format(idx), layer_size, init_scale=np.sqrt(2))) - elif layer == "lstm": - if lstm_layer_constructed: - raise ValueError("The net_arch parameter must only contain one occurrence of 'lstm'!") - input_sequence = batch_to_seq(latent, self.n_env, n_steps) - masks = 
batch_to_seq(self.dones_ph, self.n_env, n_steps) - rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm, - layer_norm=layer_norm) - latent = seq_to_batch(rnn_output) - lstm_layer_constructed = True - else: - assert isinstance(layer, dict), "Error: the net_arch list can only contain ints and dicts" - if 'pi' in layer: - assert isinstance(layer['pi'], - list), "Error: net_arch[-1]['pi'] must contain a list of integers." - policy_only_layers = layer['pi'] - - if 'vf' in layer: - assert isinstance(layer['vf'], - list), "Error: net_arch[-1]['vf'] must contain a list of integers." - value_only_layers = layer['vf'] - break # From here on the network splits up in policy and value network - - # Build the non-shared part of the policy-network - latent_policy = latent - for idx, pi_layer_size in enumerate(policy_only_layers): - if pi_layer_size == "lstm": - raise NotImplementedError("LSTMs are only supported in the shared part of the policy network.") - assert isinstance(pi_layer_size, int), "Error: net_arch[-1]['pi'] must only contain integers." - latent_policy = act_fun( - linear(latent_policy, "pi_fc{}".format(idx), pi_layer_size, init_scale=np.sqrt(2))) - - # Build the non-shared part of the value-network - latent_value = latent - for idx, vf_layer_size in enumerate(value_only_layers): - if vf_layer_size == "lstm": - raise NotImplementedError("LSTMs are only supported in the shared part of the value function " - "network.") - assert isinstance(vf_layer_size, int), "Error: net_arch[-1]['vf'] must only contain integers." 
- latent_value = act_fun( - linear(latent_value, "vf_fc{}".format(idx), vf_layer_size, init_scale=np.sqrt(2))) - - if not lstm_layer_constructed: - raise ValueError("The net_arch parameter must contain at least one occurrence of 'lstm'!") - - self._value_fn = linear(latent_value, 'vf', 1) - # TODO: why not init_scale = 0.001 here like in the feedforward - self._proba_distribution, self._policy, self.q_value = \ - self.pdtype.proba_distribution_from_latent(latent_policy, latent_value) - self._setup_init() - - def step(self, obs, state=None, mask=None, deterministic=False): - if deterministic: - return self.sess.run([self.deterministic_action, self.value_flat, self.snew, self.neglogp], - {self.obs_ph: obs, self.states_ph: state, self.dones_ph: mask}) - else: - return self.sess.run([self.action, self.value_flat, self.snew, self.neglogp], - {self.obs_ph: obs, self.states_ph: state, self.dones_ph: mask}) - - def proba_step(self, obs, state=None, mask=None): - return self.sess.run(self.policy_proba, {self.obs_ph: obs, self.states_ph: state, self.dones_ph: mask}) - - def value(self, obs, state=None, mask=None): - return self.sess.run(self.value_flat, {self.obs_ph: obs, self.states_ph: state, self.dones_ph: mask}) - - -class FeedForwardPolicy(ActorCriticPolicy): - """ - Policy object that implements actor critic, using a feed forward neural network. 
- - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param layers: ([int]) (deprecated, use net_arch instead) The size of the Neural network for the policy - (if None, default to [64, 64]) - :param net_arch: (list) Specification of the actor-critic policy network architecture (see mlp_extractor - documentation for details). - :param act_fun: (tf.func) the activation function to use in the neural network. - :param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction - :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp") - :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None, net_arch=None, - act_fun=tf.tanh, cnn_extractor=nature_cnn, feature_extraction="cnn", **kwargs): - super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, - scale=(feature_extraction == "cnn")) - - self._kwargs_check(feature_extraction, kwargs) - - if layers is not None: - warnings.warn("Usage of the `layers` parameter is deprecated! 
Use net_arch instead " - "(it has a different semantics though).", DeprecationWarning) - if net_arch is not None: - warnings.warn("The new `net_arch` parameter overrides the deprecated `layers` parameter!", - DeprecationWarning) - - if net_arch is None: - if layers is None: - layers = [64, 64] - net_arch = [dict(vf=layers, pi=layers)] - - with tf.variable_scope("model", reuse=reuse): - if feature_extraction == "cnn": - pi_latent = vf_latent = cnn_extractor(self.processed_obs, **kwargs) - else: - pi_latent, vf_latent = mlp_extractor(tf.layers.flatten(self.processed_obs), net_arch, act_fun) - - self._value_fn = linear(vf_latent, 'vf', 1) - - self._proba_distribution, self._policy, self.q_value = \ - self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent, init_scale=0.01) - - self._setup_init() - - def step(self, obs, state=None, mask=None, deterministic=False): - if deterministic: - action, value, neglogp = self.sess.run([self.deterministic_action, self.value_flat, self.neglogp], - {self.obs_ph: obs}) - else: - action, value, neglogp = self.sess.run([self.action, self.value_flat, self.neglogp], - {self.obs_ph: obs}) - return action, value, self.initial_state, neglogp - - def proba_step(self, obs, state=None, mask=None): - return self.sess.run(self.policy_proba, {self.obs_ph: obs}) - - def value(self, obs, state=None, mask=None): - return self.sess.run(self.value_flat, {self.obs_ph: obs}) - - -class CnnPolicy(FeedForwardPolicy): - """ - Policy object that implements actor critic, using a CNN (the nature CNN) - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or 
not - :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs): - super(CnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, - feature_extraction="cnn", **_kwargs) - - -class CnnLstmPolicy(LstmPolicy): - """ - Policy object that implements actor critic, using LSTMs with a CNN feature extraction - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param n_lstm: (int) The number of LSTM cells (for recurrent policies) - :param reuse: (bool) If the policy is reusable or not - :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, **_kwargs): - super(CnnLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, - layer_norm=False, feature_extraction="cnn", **_kwargs) - - -class CnnLnLstmPolicy(LstmPolicy): - """ - Policy object that implements actor critic, using a layer normalized LSTMs with a CNN feature extraction - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param n_lstm: (int) The number of LSTM cells (for recurrent policies) - :param 
reuse: (bool) If the policy is reusable or not - :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, **_kwargs): - super(CnnLnLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, - layer_norm=True, feature_extraction="cnn", **_kwargs) - - -class MlpPolicy(FeedForwardPolicy): - """ - Policy object that implements actor critic, using a MLP (2 layers of 64) - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs): - super(MlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, - feature_extraction="mlp", **_kwargs) - - -class MlpLstmPolicy(LstmPolicy): - """ - Policy object that implements actor critic, using LSTMs with a MLP feature extraction - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param n_lstm: (int) The number of LSTM cells (for recurrent policies) - :param reuse: (bool) If the policy is reusable or not - :param 
kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, **_kwargs): - super(MlpLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, - layer_norm=False, feature_extraction="mlp", **_kwargs) - - -class MlpLnLstmPolicy(LstmPolicy): - """ - Policy object that implements actor critic, using a layer normalized LSTMs with a MLP feature extraction - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param n_lstm: (int) The number of LSTM cells (for recurrent policies) - :param reuse: (bool) If the policy is reusable or not - :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, **_kwargs): - super(MlpLnLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, - layer_norm=True, feature_extraction="mlp", **_kwargs) - - -_policy_registry = { - ActorCriticPolicy: { - "CnnPolicy": CnnPolicy, - "CnnLstmPolicy": CnnLstmPolicy, - "CnnLnLstmPolicy": CnnLnLstmPolicy, - "MlpPolicy": MlpPolicy, - "MlpLstmPolicy": MlpLstmPolicy, - "MlpLnLstmPolicy": MlpLnLstmPolicy, - } -} - - -def get_policy_from_name(base_policy_type, name): - """ - returns the registed policy from the base type and name - - :param base_policy_type: (BasePolicy) the base policy object - :param name: (str) the policy name - :return: (base_policy_type) the policy - """ - if base_policy_type not in _policy_registry: - raise 
ValueError("Error: the policy type {} is not registered!".format(base_policy_type)) - if name not in _policy_registry[base_policy_type]: - raise ValueError("Error: unknown policy type {}, the only registed policy type are: {}!" - .format(name, list(_policy_registry[base_policy_type].keys()))) - return _policy_registry[base_policy_type][name] - - -def register_policy(name, policy): - """ - returns the registed policy from the base type and name - - :param name: (str) the policy name - :param policy: (subclass of BasePolicy) the policy - """ - sub_class = None - for cls in BasePolicy.__subclasses__(): - if issubclass(policy, cls): - sub_class = cls - break - if sub_class is None: - raise ValueError("Error: the policy {} is not of any known subclasses of BasePolicy!".format(policy)) - - if sub_class not in _policy_registry: - _policy_registry[sub_class] = {} - if name in _policy_registry[sub_class]: - raise ValueError("Error: the name {} is alreay registered for a different policy, will not override." - .format(name)) - _policy_registry[sub_class][name] = policy diff --git a/stable_baselines/common/runners.py b/stable_baselines/common/runners.py deleted file mode 100644 index f5253e4d..00000000 --- a/stable_baselines/common/runners.py +++ /dev/null @@ -1,209 +0,0 @@ -from abc import ABC, abstractmethod -import typing -from typing import Union, Optional, Any - -import gym -import numpy as np - -from stable_baselines.common.callbacks import BaseCallback -from stable_baselines.common.vec_env import VecEnv - -if typing.TYPE_CHECKING: - from stable_baselines.common.base_class import BaseRLModel # pytype: disable=pyi-error - - -class AbstractEnvRunner(ABC): - def __init__(self, *, env: Union[gym.Env, VecEnv], model: 'BaseRLModel', n_steps: int): - """ - Collect experience by running `n_steps` in the environment. - Note: if this is a `VecEnv`, the total number of steps will - be `n_steps * n_envs`. 
- - :param env: (Union[gym.Env, VecEnv]) The environment to learn from - :param model: (BaseRLModel) The model to learn - :param n_steps: (int) The number of steps to run for each environment - """ - self.env = env - self.model = model - n_envs = env.num_envs - self.batch_ob_shape = (n_envs * n_steps,) + env.observation_space.shape - self.obs = np.zeros((n_envs,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) - self.obs[:] = env.reset() - self.n_steps = n_steps - self.states = model.initial_state # pytype: disable=attribute-error - self.dones = [False for _ in range(n_envs)] - self.callback = None # type: Optional[BaseCallback] - self.continue_training = True - self.n_envs = n_envs - - def run(self, callback: Optional[BaseCallback] = None) -> Any: - """ - Collect experience. - - :param callback: (Optional[BaseCallback]) The callback that will be called - at each environment step. - """ - self.callback = callback - self.continue_training = True - return self._run() - - @abstractmethod - def _run(self) -> Any: - """ - This method must be overwritten by child class. 
def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False, callback=None):
    """
    Compute target value using TD(lambda) estimator, and advantage with GAE(lambda)

    :param policy: (MLPPolicy) the policy
    :param env: (Gym Environment) the environment
    :param horizon: (int) the number of timesteps to run per batch
    :param reward_giver: (TransitionClassifier) the reward predicter from obsevation and action
    :param gail: (bool) Whether we are using this generator for standard trpo or with gail
    :param callback: (BaseCallback)
    :return: (dict) generator that returns a dict with the following keys:

        - observations: (np.ndarray) observations
        - rewards: (numpy float) rewards (if gail is used it is the predicted reward)
        - true_rewards: (numpy float) if gail is used it is the original reward
        - vpred: (numpy float) action logits
        - dones: (numpy bool) dones (is end of episode, used for logging)
        - episode_starts: (numpy bool)
            True if first timestep of an episode, used for GAE
        - actions: (np.ndarray) actions
        - nextvpred: (numpy float) next action logits
        - ep_rets: (float) cumulated current episode reward
        - ep_lens: (int) the length of the current episode
        - ep_true_rets: (float) the real environment reward
        - continue_training: (bool) Whether to continue training
            or stop early (triggered by the callback)
    """
    # Check when using GAIL
    assert not (gail and reward_giver is None), "You must pass a reward giver when using GAIL"

    # Initialize state variables
    step = 0
    action = env.action_space.sample()  # not used, just so we have the datatype
    observation = env.reset()

    cur_ep_ret = 0  # return in current episode
    current_it_len = 0  # len of current iteration
    current_ep_len = 0  # len of current episode
    cur_ep_true_ret = 0
    ep_true_rets = []
    ep_rets = []  # returns of completed episodes in this segment
    ep_lens = []  # Episode lengths

    # Initialize history arrays (fixed-size ring buffers of length `horizon`,
    # indexed by step % horizon below)
    observations = np.array([observation for _ in range(horizon)])
    true_rewards = np.zeros(horizon, 'float32')
    rewards = np.zeros(horizon, 'float32')
    vpreds = np.zeros(horizon, 'float32')
    episode_starts = np.zeros(horizon, 'bool')
    dones = np.zeros(horizon, 'bool')
    actions = np.array([action for _ in range(horizon)])
    states = policy.initial_state
    episode_start = True  # marks if we're on first timestep of an episode
    done = False

    callback.on_rollout_start()

    while True:
        # NOTE(review): policy.step appears to return batched outputs
        # (action[0]/vpred[0] indexed below) — confirm against the policy API.
        action, vpred, states, _ = policy.step(observation.reshape(-1, *observation.shape), states, done)
        # Slight weirdness here because we need value function at time T
        # before returning segment [0, T-1] so we get the correct
        # terminal value
        if step > 0 and step % horizon == 0:
            callback.update_locals(locals())
            callback.on_rollout_end()
            yield {
                "observations": observations,
                "rewards": rewards,
                "dones": dones,
                "episode_starts": episode_starts,
                "true_rewards": true_rewards,
                "vpred": vpreds,
                "actions": actions,
                # Bootstrap value is zeroed on episode boundaries
                "nextvpred": vpred[0] * (1 - episode_start),
                "ep_rets": ep_rets,
                "ep_lens": ep_lens,
                "ep_true_rets": ep_true_rets,
                "total_timestep": current_it_len,
                'continue_training': True
            }
            _, vpred, _, _ = policy.step(observation.reshape(-1, *observation.shape))
            # Be careful!!! if you change the downstream algorithm to aggregate
            # several of these batches, then be sure to do a deepcopy
            ep_rets = []
            ep_true_rets = []
            ep_lens = []
            # Reset current iteration length
            current_it_len = 0
            callback.on_rollout_start()
        i = step % horizon
        observations[i] = observation
        vpreds[i] = vpred[0]
        actions[i] = action[0]
        episode_starts[i] = episode_start

        clipped_action = action
        # Clip the actions to avoid out of bound error
        if isinstance(env.action_space, gym.spaces.Box):
            clipped_action = np.clip(action, env.action_space.low, env.action_space.high)

        if gail:
            # With GAIL the learning signal comes from the discriminator,
            # while the environment reward is kept as `true_reward` for logging
            reward = reward_giver.get_reward(observation, clipped_action[0])
            observation, true_reward, done, info = env.step(clipped_action[0])
        else:
            observation, reward, done, info = env.step(clipped_action[0])
            true_reward = reward

        if callback is not None:
            callback.update_locals(locals())
            if callback.on_step() is False:
                # We have to return everything so pytype does not complain
                yield {
                    "observations": observations,
                    "rewards": rewards,
                    "dones": dones,
                    "episode_starts": episode_starts,
                    "true_rewards": true_rewards,
                    "vpred": vpreds,
                    "actions": actions,
                    "nextvpred": vpred[0] * (1 - episode_start),
                    "ep_rets": ep_rets,
                    "ep_lens": ep_lens,
                    "ep_true_rets": ep_true_rets,
                    "total_timestep": current_it_len,
                    'continue_training': False
                }
                return

        rewards[i] = reward
        true_rewards[i] = true_reward
        dones[i] = done
        episode_start = done

        cur_ep_ret += reward
        cur_ep_true_ret += true_reward
        current_it_len += 1
        current_ep_len += 1
        if done:
            # Retrieve unnormalized reward if using Monitor wrapper
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                if not gail:
                    cur_ep_ret = maybe_ep_info['r']
                cur_ep_true_ret = maybe_ep_info['r']

            ep_rets.append(cur_ep_ret)
            ep_true_rets.append(cur_ep_true_ret)
            ep_lens.append(current_ep_len)
            cur_ep_ret = 0
            cur_ep_true_ret = 0
            current_ep_len = 0
            # VecEnvs auto-reset on done; plain gym envs must be reset manually
            if not isinstance(env, VecEnv):
                observation = env.reset()
        step += 1
class RunningMeanStd(object):
    """
    Tracks the running mean and variance of a data stream using the
    parallel variance combination algorithm:
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm

    :param epsilon: (float) helps with arithmetic issues
    :param shape: (tuple) the shape of the data stream's output
    """

    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, 'float64')
        self.var = np.ones(shape, 'float64')
        # Start with a tiny pseudo-count instead of zero to avoid division issues
        self.count = epsilon

    def update(self, arr):
        """Fold a batch of samples (first axis = batch) into the running moments."""
        self.update_from_moments(np.mean(arr, axis=0), np.var(arr, axis=0), arr.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Combine pre-computed batch moments with the accumulated moments."""
        total = self.count + batch_count
        diff = batch_mean - self.mean
        combined_mean = self.mean + diff * batch_count / total
        # Merge the two sums of squared deviations (Chan et al. update)
        combined_m2 = (self.var * self.count + batch_var * batch_count
                       + np.square(diff) * self.count * batch_count / total)
        self.mean = combined_mean
        self.var = combined_m2 / total
        self.count = total
def is_json_serializable(item):
    """
    Test if an object is serializable into JSON

    :param item: (object) The object to be tested for JSON serialization.
    :return: (bool) True if object is JSON serializable, false otherwise.
    """
    try:
        json.dumps(item)
    except TypeError:
        return False
    return True


def data_to_json(data):
    """
    Turn data (class parameters) into a JSON string for storing

    :param data: (Dict) Dictionary of class parameters to be
        stored. Items that are not JSON serializable will be
        pickled with Cloudpickle and stored as bytearray in
        the JSON file
    :return: (str) JSON string of the data serialized.
    """
    serializable_data = {}
    for key, value in data.items():
        if is_json_serializable(value):
            # Directly representable: store verbatim
            serializable_data[key] = value
            continue
        # Not JSON-friendly: cloudpickle to bytes, then base64 so the
        # result fits in a text file. The type name is stored alongside
        # so humans / other languages can tell what the blob was.
        encoded = base64.b64encode(cloudpickle.dumps(value)).decode()
        # ":"-wrapped keys cannot collide with the attribute names added below
        entry = {
            ":type:": str(type(value)),
            ":serialized:": encoded,
        }
        # Include one level of JSON-serializable attributes for inspectability
        # (no deeper, to avoid unbounded nesting). Some objects (e.g. numpy
        # scalars) have no __dict__ at all, hence the hasattr guard.
        if hasattr(value, "__dict__") or isinstance(value, dict):
            attr_items = value.items if isinstance(value, dict) else value.__dict__.items
            for attr_name, attr_value in attr_items():
                entry[attr_name] = attr_value if is_json_serializable(attr_value) else str(attr_value)
        serializable_data[key] = entry
    return json.dumps(serializable_data, indent=4)
def json_to_data(json_string, custom_objects=None):
    """
    Turn JSON serialization of class-parameters back into dictionary.

    :param json_string: (str) JSON serialization of the class-parameters
        that should be loaded.
    :param custom_objects: (dict) Dictionary of objects to replace
        upon loading. If a variable is present in this dictionary as a
        key, it will not be deserialized and the corresponding item
        will be used instead. Similar to custom_objects in
        `keras.models.load_model`. Useful when you have an object in
        file that can not be deserialized.
    :return: (dict) Loaded class parameters.
    :raises ValueError: if custom_objects is neither a dict nor None
    :raises RuntimeError: if a cloudpickled entry cannot be unpickled
    """
    if custom_objects is not None and not isinstance(custom_objects, dict):
        raise ValueError("custom_objects argument must be a dict or None")

    json_dict = json.loads(json_string)
    return_data = {}
    for key, item in json_dict.items():
        if custom_objects is not None and key in custom_objects.keys():
            # Caller-supplied replacement wins over whatever was stored
            return_data[key] = custom_objects[key]
        elif isinstance(item, dict) and ":serialized:" in item.keys():
            # The ":serialized:" marker means this entry was cloudpickled
            # and base64-encoded by data_to_json
            payload = item[":serialized:"]
            try:
                return_data[key] = cloudpickle.loads(base64.b64decode(payload.encode()))
            except pickle.UnpicklingError:
                raise RuntimeError(
                    "Could not deserialize object {}. ".format(key) +
                    "Consider using `custom_objects` argument to replace " +
                    "this object."
                )
        else:
            # Plain JSON value: take as-is
            return_data[key] = item
    return return_data
def params_to_bytes(params):
    """
    Turn params (OrderedDict of variable name -> ndarray) into
    serialized bytes for storing.

    Note: `numpy.savez` does not save the ordering.

    :param params: (OrderedDict) Dictionary mapping variable
        names to numpy arrays of the current parameters of the
        model.
    :return: (bytes) Bytes object of the serialized content.
    """
    # Write the npz archive into an in-memory buffer and hand back its bytes
    buffer = io.BytesIO()
    np.savez(buffer, **params)
    return buffer.getvalue()


def bytes_to_params(serialized_params, param_list):
    """
    Turn serialized parameters (bytes) back into OrderedDictionary.

    :param serialized_params: (byte) Serialized parameters
        with `numpy.savez`.
    :param param_list: (list) List of strings, representing
        the order of parameters in which they should be returned
    :return: (OrderedDict) Dictionary mapping variable name to
        numpy array of the parameters.
    """
    archive = np.load(io.BytesIO(serialized_params))
    # npz archives do not preserve insertion order, so re-impose the
    # ordering requested by the caller
    return OrderedDict((name, archive[name]) for name in param_list)
class Schedule(object):
    """Abstract schedule: maps an optimization timestep to a parameter value."""

    def value(self, step):
        """
        Value of the schedule for a given timestep

        :param step: (int) the timestep
        :return: (float) the output value for the given timestep
        """
        raise NotImplementedError


class ConstantSchedule(Schedule):
    """
    Value remains constant over time.

    :param value: (float) Constant value of the schedule
    """

    def __init__(self, value):
        self._value = value

    def value(self, step):
        # Same output no matter the timestep
        return self._value


def linear_interpolation(left, right, alpha):
    """
    Linear interpolation between `left` and `right`.

    :param left: (float) left boundary
    :param right: (float) right boundary
    :param alpha: (float) coeff in [0, 1]
    :return: (float)
    """
    return left + alpha * (right - left)


class PiecewiseSchedule(Schedule):
    """
    Piecewise schedule.

    :param endpoints: ([(int, int)])
        list of pairs `(time, value)` meaning that schedule should output
        `value` when `t==time`. All the values for time must be sorted in
        an increasing order. When t is between two times, e.g. `(time_a, value_a)`
        and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs
        `interpolation(value_a, value_b, alpha)` where alpha is a fraction of
        time passed between `time_a` and `time_b` for time `t`.
    :param interpolation: (lambda (float, float, float): float)
        a function that takes value to the left and to the right of t according
        to the `endpoints`. Alpha is the fraction of distance from left endpoint to
        right endpoint that t has covered. See linear_interpolation for example.
    :param outside_value: (float)
        if the value is requested outside of all the intervals specified in
        `endpoints` this value is returned. If None then AssertionError is
        raised when outside value is requested.
    """

    def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None):
        times = [point[0] for point in endpoints]
        # Endpoint times must already be sorted increasingly
        assert times == sorted(times)
        self._endpoints = endpoints
        self._interpolation = interpolation
        self._outside_value = outside_value

    def value(self, step):
        # Scan consecutive endpoint pairs for the segment containing `step`
        for (left_t, left), (right_t, right) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if left_t <= step < right_t:
                alpha = float(step - left_t) / (right_t - left_t)
                return self._interpolation(left, right, alpha)

        # `step` falls outside every segment: the fallback value must exist
        assert self._outside_value is not None
        return self._outside_value
class LinearSchedule(Schedule):
    """
    Linear interpolation between initial_p and final_p over
    schedule_timesteps. After this many timesteps pass final_p is
    returned.

    :param schedule_timesteps: (int) Number of timesteps for which to linearly anneal initial_p to final_p
    :param initial_p: (float) initial output value
    :param final_p: (float) final output value
    """

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, step):
        # Fraction of the annealing period elapsed, saturating at 1
        fraction = min(float(step) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)


def get_schedule_fn(value_schedule):
    """
    Transform (if needed) learning rate and clip range
    to callable.

    :param value_schedule: (callable or float)
    :return: (function)
    """
    if isinstance(value_schedule, (float, int)):
        # Plain number: wrap it in a constant function
        # (cast to float to avoid errors with ints)
        return constfn(float(value_schedule))
    assert callable(value_schedule)
    return value_schedule


def constfn(val):
    """
    Create a function that returns a constant
    It is useful for learning rate schedule (to avoid code duplication)

    :param val: (float)
    :return: (function)
    """
    return lambda _: val


# ================================================================
# Legacy scheduler used by A2C, AKCTR and ACER
# ================================================================

def constant(_):
    """
    Returns a constant value for the Scheduler

    :param _: ignored
    :return: (float) 1
    """
    return 1.
def linear_schedule(progress):
    """
    Returns a linear value for the Scheduler

    :param progress: (float) Current progress status (in [0, 1])
    :return: (float) 1 - progress
    """
    return 1 - progress


def middle_drop(progress):
    """
    Returns a linear value with a drop near the middle to a constant value for the Scheduler

    :param progress: (float) Current progress status (in [0, 1])
    :return: (float) 1 - progress if (1 - progress) >= 0.75 else 0.075
    """
    remaining = 1 - progress
    if remaining < 0.75:
        # Past three quarters of training: freeze at a tenth of the threshold
        return 0.75 * 0.1
    return remaining


def double_linear_con(progress):
    """
    Returns a linear value (x2) with a flattened tail for the Scheduler

    :param progress: (float) Current progress status (in [0, 1])
    :return: (float) 1 - progress*2 if (1 - progress*2) >= 0.125 else 0.125
    """
    # Decay twice as fast, then clamp at the floor value
    remaining = 1 - progress * 2
    if remaining < 0.125:
        return 0.125
    return remaining


def double_middle_drop(progress):
    """
    Returns a linear value with two drops near the middle to a constant value for the Scheduler

    :param progress: (float) Current progress status (in [0, 1])
    :return: (float) if 0.75 <= 1 - p: 1 - p, if 0.25 <= 1 - p < 0.75: 0.75, if 1 - p < 0.25: 0.125
    """
    remaining = 1 - progress
    if remaining >= 0.75:
        # First quarter of training: plain linear decay
        return remaining
    if remaining >= 0.25:
        # Middle stretch: first plateau
        return 0.75 * 0.1
    # Final stretch: second, lower plateau
    return 0.25 * 0.5


# Mapping from schedule name (as passed to Scheduler) to its curve function
SCHEDULES = {
    'linear': linear_schedule,
    'constant': constant,
    'double_linear_con': double_linear_con,
    'middle_drop': middle_drop,
    'double_middle_drop': double_middle_drop
}
class Scheduler(object):
    """
    Update a value every iteration, with a specific curve.

    This is a legacy version of schedules, originally defined
    in a2c/utils.py. Used by A2C, ACER and ACKTR algorithms.
    """

    def __init__(self, initial_value, n_values, schedule):
        """
        :param initial_value: (float) initial value
        :param n_values: (int) the total number of iterations
        :param schedule: (function) the curve you wish to follow for your value
        """
        self.step = 0.
        self.initial_value = initial_value
        self.nvalues = n_values
        # Resolve the curve by name from the module-level SCHEDULES table
        self.schedule = SCHEDULES[schedule]

    def value(self):
        """
        Update the Scheduler, and return the current value

        :return: (float) the current value
        """
        out = self.initial_value * self.schedule(self.step / self.nvalues)
        # Advance the internal counter as a side effect of reading the value
        self.step += 1.
        return out

    def value_steps(self, steps):
        """
        Get a value for a given step, without advancing the internal counter

        :param steps: (int) The current number of iterations
        :return: (float) the value for the current number of iterations
        """
        return self.initial_value * self.schedule(steps / self.nvalues)
class SegmentTree(object):
    def __init__(self, capacity, operation, neutral_element):
        """
        Build a Segment Tree data structure.

        https://en.wikipedia.org/wiki/Segment_tree

        Can be used as regular array that supports Index arrays, but with two
        important differences:

            a) setting item's value is slightly slower.
               It is O(lg capacity) instead of O(1).
            b) user has access to an efficient ( O(log segment size) )
               `reduce` operation which reduces `operation` over
               a contiguous subsequence of items in the array.

        :param capacity: (int) Total size of the array - must be a power of two.
        :param operation: (lambda (Any, Any): Any) operation for combining elements (eg. sum, max) must form a
            mathematical group together with the set of possible values for array elements (i.e. be associative)
        :param neutral_element: (Any) neutral element for the operation above. eg. float('-inf') for max and 0 for sum.
        """
        assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2."
        self._capacity = capacity
        # Flat heap layout: node i has children 2*i and 2*i+1;
        # the leaves (the user-visible array) live at indexes [capacity, 2*capacity)
        self._value = [neutral_element for _ in range(2 * capacity)]
        self._operation = operation
        self.neutral_element = neutral_element

    def _reduce_helper(self, start, end, node, node_start, node_end):
        # Recursively combine stored node values that exactly tile [start, end]
        if start == node_start and end == node_end:
            return self._value[node]
        mid = (node_start + node_end) // 2
        if end <= mid:
            # Query lies fully in the left child
            return self._reduce_helper(start, end, 2 * node, node_start, mid)
        else:
            if mid + 1 <= start:
                # Query lies fully in the right child
                return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end)
            else:
                # Query straddles both children: reduce each half and combine
                return self._operation(
                    self._reduce_helper(start, mid, 2 * node, node_start, mid),
                    self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end)
                )

    def reduce(self, start=0, end=None):
        """
        Returns result of applying `self.operation`
        to a contiguous subsequence of the array.

            self.operation(arr[start], operation(arr[start+1], operation(... arr[end])))

        :param start: (int) beginning of the subsequence
        :param end: (int) end of the subsequences (exclusive; None means the whole array,
            negative values count from the end)
        :return: (Any) result of reducing self.operation over the specified range of array elements.
        """
        if end is None:
            end = self._capacity
        if end < 0:
            end += self._capacity
        # Convert the exclusive bound to the inclusive bound used by the helper
        end -= 1
        return self._reduce_helper(start, end, 1, 0, self._capacity - 1)

    def __setitem__(self, idx, val):
        # indexes of the leaf; `idx` may be a scalar or an index array (vectorized update)
        # NOTE(review): the fancy-index writes below assume self._value is a numpy
        # array, which the subclasses set up in their __init__ — confirm before
        # using SegmentTree directly with the list storage created above.
        idxs = idx + self._capacity
        self._value[idxs] = val
        if isinstance(idxs, int):
            idxs = np.array([idxs])
        # go up one level in the tree and remove duplicate indexes
        idxs = unique(idxs // 2)
        while len(idxs) > 1 or idxs[0] > 0:
            # as long as there are non-zero indexes, update the corresponding values
            self._value[idxs] = self._operation(
                self._value[2 * idxs],
                self._value[2 * idxs + 1]
            )
            # go up one level in the tree and remove duplicate indexes
            idxs = unique(idxs // 2)

    def __getitem__(self, idx):
        # Leaf read: bounds-check then offset into the leaf section of the heap
        assert np.max(idx) < self._capacity
        assert 0 <= np.min(idx)
        return self._value[self._capacity + idx]
class SumSegmentTree(SegmentTree):
    def __init__(self, capacity):
        """
        Segment tree specialized for prefix sums / range sums.

        :param capacity: (int) Total size of the array - must be a power of two.
        """
        super(SumSegmentTree, self).__init__(
            capacity=capacity,
            operation=np.add,
            neutral_element=0.0
        )
        # Convert to ndarray storage so the base class's vectorized
        # __setitem__ / __getitem__ fancy indexing works
        self._value = np.array(self._value)

    def sum(self, start=0, end=None):
        """
        Returns arr[start] + ... + arr[end]

        :param start: (int) start position of the reduction (must be >= 0)
        :param end: (int) end position of the reduction (must be < len(arr), can be None for len(arr) - 1)
        :return: (Any) reduction of SumSegmentTree
        """
        return super(SumSegmentTree, self).reduce(start, end)

    def find_prefixsum_idx(self, prefixsum):
        """
        Find the highest index `i` in the array such that
            sum(arr[0] + arr[1] + ... + arr[i - 1]) <= prefixsum for each entry in prefixsum

        if array values are probabilities, this function
        allows to sample indexes according to the discrete
        probability efficiently.

        :param prefixsum: (np.ndarray) float upper bounds on the sum of array prefix
        :return: (np.ndarray) highest indexes satisfying the prefixsum constraint
        """
        if isinstance(prefixsum, float):
            prefixsum = np.array([prefixsum])
        assert 0 <= np.min(prefixsum)
        assert np.max(prefixsum) <= self.sum() + 1e-5
        assert isinstance(prefixsum[0], float)

        # Walk down from the root (heap index 1) to a leaf, for all queries in parallel
        idx = np.ones(len(prefixsum), dtype=int)
        cont = np.ones(len(prefixsum), dtype=bool)

        while np.any(cont):  # while not all nodes are leafs
            # descend to the left child of every still-active node
            idx[cont] = 2 * idx[cont]
            # if the left subtree's mass fits under the remaining prefixsum,
            # the target is in the right subtree: subtract the left mass...
            prefixsum_new = np.where(self._value[idx] <= prefixsum, prefixsum - self._value[idx], prefixsum)
            # ...and step to the right sibling (finished entries are left unchanged)
            idx = np.where(np.logical_or(self._value[idx] > prefixsum, np.logical_not(cont)), idx, idx + 1)
            prefixsum = prefixsum_new
            # entries whose idx is still an internal node keep descending
            cont = idx < self._capacity
        # Translate heap leaf indexes back to array positions
        return idx - self._capacity
class MinSegmentTree(SegmentTree):
    """Segment tree specialised for range-minimum queries."""

    def __init__(self, capacity):
        """
        :param capacity: (int) Total size of the array - must be a power of two.
        """
        # Combine leaves with np.minimum; +inf is the identity element for min
        super(MinSegmentTree, self).__init__(
            capacity=capacity,
            operation=np.minimum,
            neutral_element=float('inf')
        )
        # ndarray storage enables the base class's vectorized updates/queries
        self._value = np.array(self._value)

    def min(self, start=0, end=None):
        """
        Returns min(arr[start], ..., arr[end])

        :param start: (int) start position of the reduction (must be >= 0)
        :param end: (int) end position of the reduction (must be < len(arr), can be None for len(arr) - 1)
        :return: (Any) reduction of MinSegmentTree
        """
        return super(MinSegmentTree, self).reduce(start, end)
def ortho_init(scale=1.0):
    """
    Orthogonal initialization for the policy weights

    :param scale: (float) Scaling factor for the weights.
    :return: (function) an initialization function for the weights
    """

    def _ortho_init(shape, *_, **_kwargs):
        """Intialize weights as Orthogonal matrix.

        Orthogonal matrix initialization [1]_. For n-dimensional shapes where
        n > 2, the n-1 trailing axes are flattened. For convolutional layers, this
        corresponds to the fan-in, so this makes the initialization usable for
        both dense and convolutional layers.

        References
        ----------
        .. [1] Saxe, Andrew M., James L. McClelland, and Surya Ganguli.
               "Exact solutions to the nonlinear dynamics of learning in deep
               linear
        """
        shape = tuple(shape)
        if len(shape) == 2:
            # Dense weight matrix: use as-is
            flat_shape = shape
        elif len(shape) == 4:
            # Conv kernel (assumes NHWC): flatten the fan-in axes
            flat_shape = (np.prod(shape[:-1]), shape[-1])
        else:
            raise NotImplementedError
        noise = np.random.normal(0.0, 1.0, flat_shape)
        u, _, v = np.linalg.svd(noise, full_matrices=False)
        # SVD factors are orthonormal; keep whichever matches the target shape
        ortho = u if u.shape == flat_shape else v
        ortho = ortho.reshape(shape)
        return (scale * ortho[:shape[0], :shape[1]]).astype(np.float32)

    return _ortho_init
def mlp(input_tensor, layers, activ_fn=tf.nn.relu, layer_norm=False):
    """
    Create a multi-layer fully connected neural network.

    :param input_tensor: (tf.placeholder)
    :param layers: ([int]) Network architecture
    :param activ_fn: (tf.function) Activation function
    :param layer_norm: (bool) Whether to apply layer normalization or not
    :return: (tf.Tensor)
    """
    output = input_tensor
    # Stack one dense layer per entry, optionally layer-normalized, then activated
    for idx, layer_size in enumerate(layers):
        output = tf.layers.dense(output, layer_size, name='fc' + str(idx))
        if layer_norm:
            output = tf.contrib.layers.layer_norm(output, center=True, scale=True)
        output = activ_fn(output)
    return output
def conv(input_tensor, scope, *, n_filters, filter_size, stride,
         pad='VALID', init_scale=1.0, data_format='NHWC', one_dim_bias=False):
    """
    Creates a 2d convolutional layer for TensorFlow

    :param input_tensor: (TensorFlow Tensor) The input tensor for the convolution
    :param scope: (str) The TensorFlow variable scope
    :param n_filters: (int) The number of filters
    :param filter_size: (Union[int, [int], tuple]) The filter size for the squared kernel matrix,
        or the height and width of kernel filter if the input is a list or tuple
    :param stride: (int) The stride of the convolution
    :param pad: (str) The padding type ('VALID' or 'SAME')
    :param init_scale: (int) The initialization scale
    :param data_format: (str) The data format for the convolution weights ('NHWC' or 'NCHW')
    :param one_dim_bias: (bool) If the bias should be one dimentional or not
    :return: (TensorFlow Tensor) 2d convolutional layer
    """
    # Accept either a single int (square kernel) or an explicit (height, width) pair
    if isinstance(filter_size, list) or isinstance(filter_size, tuple):
        assert len(filter_size) == 2, \
            "Filter size must have 2 elements (height, width), {} were given".format(len(filter_size))
        filter_height = filter_size[0]
        filter_width = filter_size[1]
    else:
        filter_height = filter_size
        filter_width = filter_size
    # Channel axis, stride layout and broadcastable bias shape depend on the data format
    if data_format == 'NHWC':
        channel_ax = 3
        strides = [1, stride, stride, 1]
        bshape = [1, 1, 1, n_filters]
    elif data_format == 'NCHW':
        channel_ax = 1
        strides = [1, 1, stride, stride]
        bshape = [1, n_filters, 1, 1]
    else:
        raise NotImplementedError
    bias_var_shape = [n_filters] if one_dim_bias else [1, n_filters, 1, 1]
    n_input = input_tensor.get_shape()[channel_ax].value
    # Kernel variables are always stored in HWIO order
    wshape = [filter_height, filter_width, n_input, n_filters]
    with tf.variable_scope(scope):
        weight = tf.get_variable("w", wshape, initializer=ortho_init(init_scale))
        bias = tf.get_variable("b", bias_var_shape, initializer=tf.constant_initializer(0.0))
        if not one_dim_bias and data_format == 'NHWC':
            # Stored NCHW-style bias must be reshaped to broadcast over NHWC outputs
            bias = tf.reshape(bias, bshape)
        return bias + tf.nn.conv2d(input_tensor, weight, strides=strides, padding=pad, data_format=data_format)
layer_norm: (bool) Whether to apply Layer Normalization or not - :return: (TensorFlow Tensor) LSTM cell - """ - _, n_input = [v.value for v in input_tensor[0].get_shape()] - with tf.variable_scope(scope): - weight_x = tf.get_variable("wx", [n_input, n_hidden * 4], initializer=ortho_init(init_scale)) - weight_h = tf.get_variable("wh", [n_hidden, n_hidden * 4], initializer=ortho_init(init_scale)) - bias = tf.get_variable("b", [n_hidden * 4], initializer=tf.constant_initializer(0.0)) - - if layer_norm: - # Gain and bias of layer norm - gain_x = tf.get_variable("gx", [n_hidden * 4], initializer=tf.constant_initializer(1.0)) - bias_x = tf.get_variable("bx", [n_hidden * 4], initializer=tf.constant_initializer(0.0)) - - gain_h = tf.get_variable("gh", [n_hidden * 4], initializer=tf.constant_initializer(1.0)) - bias_h = tf.get_variable("bh", [n_hidden * 4], initializer=tf.constant_initializer(0.0)) - - gain_c = tf.get_variable("gc", [n_hidden], initializer=tf.constant_initializer(1.0)) - bias_c = tf.get_variable("bc", [n_hidden], initializer=tf.constant_initializer(0.0)) - - cell_state, hidden = tf.split(axis=1, num_or_size_splits=2, value=cell_state_hidden) - for idx, (_input, mask) in enumerate(zip(input_tensor, mask_tensor)): - cell_state = cell_state * (1 - mask) - hidden = hidden * (1 - mask) - if layer_norm: - gates = _ln(tf.matmul(_input, weight_x), gain_x, bias_x) \ - + _ln(tf.matmul(hidden, weight_h), gain_h, bias_h) + bias - else: - gates = tf.matmul(_input, weight_x) + tf.matmul(hidden, weight_h) + bias - in_gate, forget_gate, out_gate, cell_candidate = tf.split(axis=1, num_or_size_splits=4, value=gates) - in_gate = tf.nn.sigmoid(in_gate) - forget_gate = tf.nn.sigmoid(forget_gate) - out_gate = tf.nn.sigmoid(out_gate) - cell_candidate = tf.tanh(cell_candidate) - cell_state = forget_gate * cell_state + in_gate * cell_candidate - if layer_norm: - hidden = out_gate * tf.tanh(_ln(cell_state, gain_c, bias_c)) - else: - hidden = out_gate * tf.tanh(cell_state) - 
input_tensor[idx] = hidden - cell_state_hidden = tf.concat(axis=1, values=[cell_state, hidden]) - return input_tensor, cell_state_hidden - - -def _ln(input_tensor, gain, bias, epsilon=1e-5, axes=None): - """ - Apply layer normalisation. - - :param input_tensor: (TensorFlow Tensor) The input tensor for the Layer normalization - :param gain: (TensorFlow Tensor) The scale tensor for the Layer normalization - :param bias: (TensorFlow Tensor) The bias tensor for the Layer normalization - :param epsilon: (float) The epsilon value for floating point calculations - :param axes: (tuple, list or int) The axes to apply the mean and variance calculation - :return: (TensorFlow Tensor) a normalizing layer - """ - if axes is None: - axes = [1] - mean, variance = tf.nn.moments(input_tensor, axes=axes, keep_dims=True) - input_tensor = (input_tensor - mean) / tf.sqrt(variance + epsilon) - input_tensor = input_tensor * gain + bias - return input_tensor - - -def lnlstm(input_tensor, mask_tensor, cell_state, scope, n_hidden, init_scale=1.0): - """ - Creates a LSTM with Layer Normalization (lnlstm) cell for TensorFlow - - :param input_tensor: (TensorFlow Tensor) The input tensor for the LSTM cell - :param mask_tensor: (TensorFlow Tensor) The mask tensor for the LSTM cell - :param cell_state: (TensorFlow Tensor) The state tensor for the LSTM cell - :param scope: (str) The TensorFlow variable scope - :param n_hidden: (int) The number of hidden neurons - :param init_scale: (int) The initialization scale - :return: (TensorFlow Tensor) lnlstm cell - """ - return lstm(input_tensor, mask_tensor, cell_state, scope, n_hidden, init_scale, layer_norm=True) - - -def conv_to_fc(input_tensor): - """ - Reshapes a Tensor from a convolutional network to a Tensor for a fully connected network - - :param input_tensor: (TensorFlow Tensor) The convolutional input tensor - :return: (TensorFlow Tensor) The fully connected output tensor - """ - n_hidden = np.prod([v.value for v in 
input_tensor.get_shape()[1:]]) - input_tensor = tf.reshape(input_tensor, [-1, n_hidden]) - return input_tensor diff --git a/stable_baselines/common/tf_util.py b/stable_baselines/common/tf_util.py deleted file mode 100644 index ba78c042..00000000 --- a/stable_baselines/common/tf_util.py +++ /dev/null @@ -1,510 +0,0 @@ -import os -import collections -import functools -import multiprocessing -from typing import Set - -import numpy as np -import tensorflow as tf - - -def is_image(tensor): - """ - Check if a tensor has the shape of - a valid image for tensorboard logging. - Valid image: RGB, RGBD, GrayScale - - :param tensor: (np.ndarray or tf.placeholder) - :return: (bool) - """ - - return len(tensor.shape) == 3 and tensor.shape[-1] in [1, 3, 4] - - -def batch_to_seq(tensor_batch, n_batch, n_steps, flat=False): - """ - Transform a batch of Tensors, into a sequence of Tensors for recurrent policies - - :param tensor_batch: (TensorFlow Tensor) The input tensor to unroll - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param n_steps: (int) The number of steps to run for each environment - :param flat: (bool) If the input Tensor is flat - :return: (TensorFlow Tensor) sequence of Tensors for recurrent policies - """ - if flat: - tensor_batch = tf.reshape(tensor_batch, [n_batch, n_steps]) - else: - tensor_batch = tf.reshape(tensor_batch, [n_batch, n_steps, -1]) - return [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=n_steps, value=tensor_batch)] - - -def seq_to_batch(tensor_sequence, flat=False): - """ - Transform a sequence of Tensors, into a batch of Tensors for recurrent policies - - :param tensor_sequence: (TensorFlow Tensor) The input tensor to batch - :param flat: (bool) If the input Tensor is flat - :return: (TensorFlow Tensor) batch of Tensors for recurrent policies - """ - shape = tensor_sequence[0].get_shape().as_list() - if not flat: - assert len(shape) > 1 - n_hidden = tensor_sequence[0].get_shape()[-1].value - return 
tf.reshape(tf.concat(axis=1, values=tensor_sequence), [-1, n_hidden]) - else: - return tf.reshape(tf.stack(values=tensor_sequence, axis=1), [-1]) - - -def check_shape(tensors, shapes): - """ - Verifies the tensors match the given shape, will raise an error if the shapes do not match - - :param tensors: ([TensorFlow Tensor]) The tensors that should be checked - :param shapes: ([list]) The list of shapes for each tensor - """ - i = 0 - for (tensor, shape) in zip(tensors, shapes): - assert tensor.get_shape().as_list() == shape, "id " + str(i) + " shape " + str(tensor.get_shape()) + str(shape) - i += 1 - -# ================================================================ -# Mathematical utils -# ================================================================ - - -def huber_loss(tensor, delta=1.0): - """ - Reference: https://en.wikipedia.org/wiki/Huber_loss - - :param tensor: (TensorFlow Tensor) the input value - :param delta: (float) Huber loss delta value - :return: (TensorFlow Tensor) Huber loss output - """ - return tf.where( - tf.abs(tensor) < delta, - tf.square(tensor) * 0.5, - delta * (tf.abs(tensor) - 0.5 * delta) - ) - - -def sample(logits): - """ - Creates a sampling Tensor for non deterministic policies - when using categorical distribution. 
- It uses the Gumbel-max trick: http://amid.fish/humble-gumbel - - :param logits: (TensorFlow Tensor) The input probability for each action - :return: (TensorFlow Tensor) The sampled action - """ - noise = tf.random_uniform(tf.shape(logits)) - return tf.argmax(logits - tf.log(-tf.log(noise)), 1) - - -def calc_entropy(logits): - """ - Calculates the entropy of the output values of the network - - :param logits: (TensorFlow Tensor) The input probability for each action - :return: (TensorFlow Tensor) The Entropy of the output values of the network - """ - # Compute softmax - a_0 = logits - tf.reduce_max(logits, 1, keepdims=True) - exp_a_0 = tf.exp(a_0) - z_0 = tf.reduce_sum(exp_a_0, 1, keepdims=True) - p_0 = exp_a_0 / z_0 - return tf.reduce_sum(p_0 * (tf.log(z_0) - a_0), 1) - - -def mse(pred, target): - """ - Returns the Mean squared error between prediction and target - - :param pred: (TensorFlow Tensor) The predicted value - :param target: (TensorFlow Tensor) The target value - :return: (TensorFlow Tensor) The Mean squared error between prediction and target - """ - return tf.reduce_mean(tf.square(pred - target)) - - -def avg_norm(tensor): - """ - Return an average of the L2 normalization of the batch - - :param tensor: (TensorFlow Tensor) The input tensor - :return: (TensorFlow Tensor) Average L2 normalization of the batch - """ - return tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(tensor), axis=-1))) - - -def gradient_add(grad_1, grad_2, param, verbose=0): - """ - Sum two gradients - - :param grad_1: (TensorFlow Tensor) The first gradient - :param grad_2: (TensorFlow Tensor) The second gradient - :param param: (TensorFlow parameters) The trainable parameters - :param verbose: (int) verbosity level - :return: (TensorFlow Tensor) the sum of the gradients - """ - if verbose > 1: - print([grad_1, grad_2, param.name]) - if grad_1 is None and grad_2 is None: - return None - elif grad_1 is None: - return grad_2 - elif grad_2 is None: - return grad_1 - else: - return 
grad_1 + grad_2 - - -def q_explained_variance(q_pred, q_true): - """ - Calculates the explained variance of the Q value - - :param q_pred: (TensorFlow Tensor) The predicted Q value - :param q_true: (TensorFlow Tensor) The expected Q value - :return: (TensorFlow Tensor) the explained variance of the Q value - """ - _, var_y = tf.nn.moments(q_true, axes=[0, 1]) - _, var_pred = tf.nn.moments(q_true - q_pred, axes=[0, 1]) - check_shape([var_y, var_pred], [[]] * 2) - return 1.0 - (var_pred / var_y) - - -# ================================================================ -# Global session -# ================================================================ - - -def make_session(num_cpu=None, make_default=False, graph=None): - """ - Returns a session that will use CPU's only - - :param num_cpu: (int) number of CPUs to use for TensorFlow - :param make_default: (bool) if this should return an InteractiveSession or a normal Session - :param graph: (TensorFlow Graph) the graph of the session - :return: (TensorFlow session) - """ - if num_cpu is None: - num_cpu = int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count())) - tf_config = tf.ConfigProto( - allow_soft_placement=True, - inter_op_parallelism_threads=num_cpu, - intra_op_parallelism_threads=num_cpu) - # Prevent tensorflow from taking all the gpu memory - tf_config.gpu_options.allow_growth = True - if make_default: - return tf.InteractiveSession(config=tf_config, graph=graph) - else: - return tf.Session(config=tf_config, graph=graph) - - -def single_threaded_session(make_default=False, graph=None): - """ - Returns a session which will only use a single CPU - - :param make_default: (bool) if this should return an InteractiveSession or a normal Session - :param graph: (TensorFlow Graph) the graph of the session - :return: (TensorFlow session) - """ - return make_session(num_cpu=1, make_default=make_default, graph=graph) - - -def in_session(func): - """ - Wraps a function so that it is in a TensorFlow Session - - :param 
func: (function) the function to wrap - :return: (function) - """ - - @functools.wraps(func) - def newfunc(*args, **kwargs): - with tf.Session(): - func(*args, **kwargs) - - return newfunc - - -ALREADY_INITIALIZED = set() # type: Set[tf.Variable] - - -def initialize(sess=None): - """ - Initialize all the uninitialized variables in the global scope. - - :param sess: (TensorFlow Session) - """ - if sess is None: - sess = tf.get_default_session() - new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED - sess.run(tf.variables_initializer(new_variables)) - ALREADY_INITIALIZED.update(new_variables) - - -# ================================================================ -# Theano-like Function -# ================================================================ - - -def function(inputs, outputs, updates=None, givens=None): - """ - Take a bunch of tensorflow placeholders and expressions - computed based on those placeholders and produces f(inputs) -> outputs. Function f takes - values to be fed to the input's placeholders and produces the values of the expressions - in outputs. Just like a Theano function. - - Input values can be passed in the same order as inputs or can be provided as kwargs based - on placeholder name (passed to constructor or accessible via placeholder.op.name). - - Example: - >>> x = tf.placeholder(tf.int32, (), name="x") - >>> y = tf.placeholder(tf.int32, (), name="y") - >>> z = 3 * x + 2 * y - >>> lin = function([x, y], z, givens={y: 0}) - >>> with single_threaded_session(): - >>> initialize() - >>> assert lin(2) == 6 - >>> assert lin(x=3) == 9 - >>> assert lin(2, 2) == 10 - - :param inputs: (TensorFlow Tensor or Object with make_feed_dict) list of input arguments - :param outputs: (TensorFlow Tensor) list of outputs or a single output to be returned from function. Returned - value will also have the same shape. 
- :param updates: ([tf.Operation] or tf.Operation) - list of update functions or single update function that will be run whenever - the function is called. The return is ignored. - :param givens: (dict) the values known for the output - """ - if isinstance(outputs, list): - return _Function(inputs, outputs, updates, givens=givens) - elif isinstance(outputs, (dict, collections.OrderedDict)): - func = _Function(inputs, outputs.values(), updates, givens=givens) - return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), func(*args, **kwargs))) - else: - func = _Function(inputs, [outputs], updates, givens=givens) - return lambda *args, **kwargs: func(*args, **kwargs)[0] - - -class _Function(object): - def __init__(self, inputs, outputs, updates, givens): - """ - Theano like function - - :param inputs: (TensorFlow Tensor or Object with make_feed_dict) list of input arguments - :param outputs: (TensorFlow Tensor) list of outputs or a single output to be returned from function. Returned - value will also have the same shape. - :param updates: ([tf.Operation] or tf.Operation) - list of update functions or single update function that will be run whenever - the function is called. The return is ignored. 
- :param givens: (dict) the values known for the output - """ - for inpt in inputs: - if not hasattr(inpt, 'make_feed_dict') and not (isinstance(inpt, tf.Tensor) and len(inpt.op.inputs) == 0): - assert False, "inputs should all be placeholders, constants, or have a make_feed_dict method" - self.inputs = inputs - updates = updates or [] - self.update_group = tf.group(*updates) - self.outputs_update = list(outputs) + [self.update_group] - self.givens = {} if givens is None else givens - - @classmethod - def _feed_input(cls, feed_dict, inpt, value): - if hasattr(inpt, 'make_feed_dict'): - feed_dict.update(inpt.make_feed_dict(value)) - else: - feed_dict[inpt] = value - - def __call__(self, *args, sess=None, **kwargs): - assert len(args) <= len(self.inputs), "Too many arguments provided" - if sess is None: - sess = tf.get_default_session() - feed_dict = {} - # Update the args - for inpt, value in zip(self.inputs, args): - self._feed_input(feed_dict, inpt, value) - # Update feed dict with givens. 
- for inpt in self.givens: - feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) - results = sess.run(self.outputs_update, feed_dict=feed_dict, **kwargs)[:-1] - return results - - -# ================================================================ -# Flat vectors -# ================================================================ - - -def var_shape(tensor): - """ - get TensorFlow Tensor shape - - :param tensor: (TensorFlow Tensor) the input tensor - :return: ([int]) the shape - """ - out = tensor.get_shape().as_list() - assert all(isinstance(a, int) for a in out), \ - "shape function assumes that shape is fully known" - return out - - -def numel(tensor): - """ - get TensorFlow Tensor's number of elements - - :param tensor: (TensorFlow Tensor) the input tensor - :return: (int) the number of elements - """ - return intprod(var_shape(tensor)) - - -def intprod(tensor): - """ - calculates the product of all the elements in a list - - :param tensor: ([Number]) the list of elements - :return: (int) the product truncated - """ - return int(np.prod(tensor)) - - -def flatgrad(loss, var_list, clip_norm=None): - """ - calculates the gradient and flattens it - - :param loss: (float) the loss value - :param var_list: ([TensorFlow Tensor]) the variables - :param clip_norm: (float) clip the gradients (disabled if None) - :return: ([TensorFlow Tensor]) flattened gradient - """ - grads = tf.gradients(loss, var_list) - if clip_norm is not None: - grads = [tf.clip_by_norm(grad, clip_norm=clip_norm) for grad in grads] - return tf.concat(axis=0, values=[ - tf.reshape(grad if grad is not None else tf.zeros_like(v), [numel(v)]) - for (v, grad) in zip(var_list, grads) - ]) - - -class SetFromFlat(object): - def __init__(self, var_list, dtype=tf.float32, sess=None): - """ - Set the parameters from a flat vector - - :param var_list: ([TensorFlow Tensor]) the variables - :param dtype: (type) the type for the placeholder - :param sess: (TensorFlow Session) - """ - shapes = 
list(map(var_shape, var_list)) - total_size = np.sum([intprod(shape) for shape in shapes]) - - self.theta = theta = tf.placeholder(dtype, [total_size]) - start = 0 - assigns = [] - for (shape, _var) in zip(shapes, var_list): - size = intprod(shape) - assigns.append(tf.assign(_var, tf.reshape(theta[start:start + size], shape))) - start += size - self.operation = tf.group(*assigns) - self.sess = sess - - def __call__(self, theta): - if self.sess is None: - return tf.get_default_session().run(self.operation, feed_dict={self.theta: theta}) - else: - return self.sess.run(self.operation, feed_dict={self.theta: theta}) - - -class GetFlat(object): - def __init__(self, var_list, sess=None): - """ - Get the parameters as a flat vector - - :param var_list: ([TensorFlow Tensor]) the variables - :param sess: (TensorFlow Session) - """ - self.operation = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list]) - self.sess = sess - - def __call__(self): - if self.sess is None: - return tf.get_default_session().run(self.operation) - else: - return self.sess.run(self.operation) - - -# ================================================================ -# retrieving variables -# ================================================================ - - -def get_trainable_vars(name): - """ - returns the trainable variables - - :param name: (str) the scope - :return: ([TensorFlow Variable]) - """ - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) - - -def get_globals_vars(name): - """ - returns the trainable variables - - :param name: (str) the scope - :return: ([TensorFlow Variable]) - """ - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name) - - -def outer_scope_getter(scope, new_scope=""): - """ - remove a scope layer for the getter - - :param scope: (str) the layer to remove - :param new_scope: (str) optional replacement name - :return: (function (function, str, ``*args``, ``**kwargs``): Tensorflow Tensor) - """ - def _getter(getter, 
name, *args, **kwargs): - name = name.replace(scope + "/", new_scope, 1) - val = getter(name, *args, **kwargs) - return val - return _getter - - -# ================================================================ -# Logging -# ================================================================ - - -def total_episode_reward_logger(rew_acc, rewards, masks, writer, steps): - """ - calculates the cumulated episode reward, and prints to tensorflow log the output - - :param rew_acc: (np.array float) the total running reward - :param rewards: (np.array float) the rewards - :param masks: (np.array bool) the end of episodes - :param writer: (TensorFlow Session.writer) the writer to log to - :param steps: (int) the current timestep - :return: (np.array float) the updated total running reward - :return: (np.array float) the updated total running reward - """ - with tf.variable_scope("environment_info", reuse=True): - for env_idx in range(rewards.shape[0]): - dones_idx = np.sort(np.argwhere(masks[env_idx])) - - if len(dones_idx) == 0: - rew_acc[env_idx] += sum(rewards[env_idx]) - else: - rew_acc[env_idx] += sum(rewards[env_idx, :dones_idx[0, 0]]) - summary = tf.Summary(value=[tf.Summary.Value(tag="episode_reward", simple_value=rew_acc[env_idx])]) - writer.add_summary(summary, steps + dones_idx[0, 0]) - for k in range(1, len(dones_idx[:, 0])): - rew_acc[env_idx] = sum(rewards[env_idx, dones_idx[k - 1, 0]:dones_idx[k, 0]]) - summary = tf.Summary(value=[tf.Summary.Value(tag="episode_reward", simple_value=rew_acc[env_idx])]) - writer.add_summary(summary, steps + dones_idx[k, 0]) - rew_acc[env_idx] = sum(rewards[env_idx, dones_idx[-1, 0]:]) - - return rew_acc diff --git a/stable_baselines/common/tile_images.py b/stable_baselines/common/tile_images.py deleted file mode 100644 index 30d60fcd..00000000 --- a/stable_baselines/common/tile_images.py +++ /dev/null @@ -1,27 +0,0 @@ -import numpy as np - - -def tile_images(img_nhwc): - """ - Tile N images into one big PxQ image - (P,Q) are 
chosen to be as close as possible, and if N - is square, then P=Q. - - :param img_nhwc: (list) list or array of images, ndim=4 once turned into array. img nhwc - n = batch index, h = height, w = width, c = channel - :return: (numpy float) img_HWc, ndim=3 - """ - img_nhwc = np.asarray(img_nhwc) - n_images, height, width, n_channels = img_nhwc.shape - # new_height was named H before - new_height = int(np.ceil(np.sqrt(n_images))) - # new_width was named W before - new_width = int(np.ceil(float(n_images) / new_height)) - img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0] * 0 for _ in range(n_images, new_height * new_width)]) - # img_HWhwc - out_image = img_nhwc.reshape(new_height, new_width, height, width, n_channels) - # img_HhWwc - out_image = out_image.transpose(0, 2, 1, 3, 4) - # img_Hh_Ww_c - out_image = out_image.reshape(new_height * height, new_width * width, n_channels) - return out_image diff --git a/stable_baselines/common/vec_env/__init__.py b/stable_baselines/common/vec_env/__init__.py deleted file mode 100644 index e3343f5f..00000000 --- a/stable_baselines/common/vec_env/__init__.py +++ /dev/null @@ -1,52 +0,0 @@ -from typing import Union -from copy import deepcopy - -import gym - -# flake8: noqa F401 -from stable_baselines.common.vec_env.base_vec_env import AlreadySteppingError, NotSteppingError, VecEnv, VecEnvWrapper, \ - CloudpickleWrapper -from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv -from stable_baselines.common.vec_env.subproc_vec_env import SubprocVecEnv -from stable_baselines.common.vec_env.vec_frame_stack import VecFrameStack -from stable_baselines.common.vec_env.vec_normalize import VecNormalize -from stable_baselines.common.vec_env.vec_video_recorder import VecVideoRecorder -from stable_baselines.common.vec_env.vec_check_nan import VecCheckNan - - -def unwrap_vec_normalize(env: Union[gym.Env, VecEnv]) -> Union[VecNormalize, None]: - """ - :param env: (Union[gym.Env, VecEnv]) - :return: (VecNormalize) - """ - env_tmp = env 
- while isinstance(env_tmp, VecEnvWrapper): - if isinstance(env_tmp, VecNormalize): - return env_tmp - env_tmp = env_tmp.venv - return None - - -# Define here to avoid circular import -def sync_envs_normalization(env: Union[gym.Env, VecEnv], eval_env: Union[gym.Env, VecEnv]) -> None: - """ - Sync eval and train environments when using VecNormalize - - :param env: (Union[gym.Env, VecEnv])) - :param eval_env: (Union[gym.Env, VecEnv])) - """ - env_tmp, eval_env_tmp = env, eval_env - # Special case for the _UnvecWrapper - # Avoid circular import - from stable_baselines.common.base_class import _UnvecWrapper - if isinstance(env_tmp, _UnvecWrapper): - return - while isinstance(env_tmp, VecEnvWrapper): - if isinstance(env_tmp, VecNormalize): - # sync reward and observation scaling - eval_env_tmp.obs_rms = deepcopy(env_tmp.obs_rms) - eval_env_tmp.ret_rms = deepcopy(env_tmp.ret_rms) - env_tmp = env_tmp.venv - # Make pytype happy, in theory env and eval_env have the same type - assert isinstance(eval_env_tmp, VecEnvWrapper), "the second env differs from the first env" - eval_env_tmp = eval_env_tmp.venv diff --git a/stable_baselines/common/vec_env/base_vec_env.py b/stable_baselines/common/vec_env/base_vec_env.py deleted file mode 100644 index 46ee7be0..00000000 --- a/stable_baselines/common/vec_env/base_vec_env.py +++ /dev/null @@ -1,334 +0,0 @@ -from abc import ABC, abstractmethod -import inspect -import pickle -from typing import Sequence, Optional, List, Union - -import cloudpickle -import numpy as np - -from stable_baselines import logger -from stable_baselines.common.tile_images import tile_images - - -class AlreadySteppingError(Exception): - """ - Raised when an asynchronous step is running while - step_async() is called again. - """ - - def __init__(self): - msg = 'already running an async step' - Exception.__init__(self, msg) - - -class NotSteppingError(Exception): - """ - Raised when an asynchronous step is not running but - step_wait() is called. 
- """ - - def __init__(self): - msg = 'not running an async step' - Exception.__init__(self, msg) - - -class VecEnv(ABC): - """ - An abstract asynchronous, vectorized environment. - - :param num_envs: (int) the number of environments - :param observation_space: (Gym Space) the observation space - :param action_space: (Gym Space) the action space - """ - metadata = { - 'render.modes': ['human', 'rgb_array'] - } - - def __init__(self, num_envs, observation_space, action_space): - self.num_envs = num_envs - self.observation_space = observation_space - self.action_space = action_space - - @abstractmethod - def reset(self): - """ - Reset all the environments and return an array of - observations, or a tuple of observation arrays. - - If step_async is still doing work, that work will - be cancelled and step_wait() should not be called - until step_async() is invoked again. - - :return: ([int] or [float]) observation - """ - pass - - @abstractmethod - def step_async(self, actions): - """ - Tell all the environments to start taking a step - with the given actions. - Call step_wait() to get the results of the step. - - You should not call this if a step_async run is - already pending. - """ - pass - - @abstractmethod - def step_wait(self): - """ - Wait for the step taken with step_async(). - - :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information - """ - pass - - @abstractmethod - def close(self): - """ - Clean up the environment's resources. - """ - pass - - @abstractmethod - def get_attr(self, attr_name, indices=None): - """ - Return attribute from vectorized environment. - - :param attr_name: (str) The name of the attribute whose value to return - :param indices: (list,int) Indices of envs to get attribute from - :return: (list) List of values of 'attr_name' in all environments - """ - pass - - @abstractmethod - def set_attr(self, attr_name, value, indices=None): - """ - Set attribute inside vectorized environments. 
- - :param attr_name: (str) The name of attribute to assign new value - :param value: (obj) Value to assign to `attr_name` - :param indices: (list,int) Indices of envs to assign value - :return: (NoneType) - """ - pass - - @abstractmethod - def env_method(self, method_name, *method_args, indices=None, **method_kwargs): - """ - Call instance methods of vectorized environments. - - :param method_name: (str) The name of the environment method to invoke. - :param indices: (list,int) Indices of envs whose method to call - :param method_args: (tuple) Any positional arguments to provide in the call - :param method_kwargs: (dict) Any keyword arguments to provide in the call - :return: (list) List of items returned by the environment's method call - """ - pass - - @abstractmethod - def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]: - """ - Sets the random seeds for all environments, based on a given seed. - Each individual environment will still get its own seed, by incrementing the given seed. - - :param seed: (Optional[int]) The random seed. May be None for completely random seeding. - :return: (List[Union[None, int]]) Returns a list containing the seeds for each individual env. - Note that all list elements may be None, if the env does not return anything when being seeded. 
- """ - pass - - def step(self, actions): - """ - Step the environments with the given action - - :param actions: ([int] or [float]) the action - :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information - """ - self.step_async(actions) - return self.step_wait() - - def get_images(self) -> Sequence[np.ndarray]: - """ - Return RGB images from each environment - """ - raise NotImplementedError - - def render(self, mode: str = 'human'): - """ - Gym environment rendering - - :param mode: the rendering type - """ - try: - imgs = self.get_images() - except NotImplementedError: - logger.warn('Render not defined for {}'.format(self)) - return - - # Create a big image by tiling images from subprocesses - bigimg = tile_images(imgs) - if mode == 'human': - import cv2 # pytype:disable=import-error - cv2.imshow('vecenv', bigimg[:, :, ::-1]) - cv2.waitKey(1) - elif mode == 'rgb_array': - return bigimg - else: - raise NotImplementedError - - @property - def unwrapped(self): - if isinstance(self, VecEnvWrapper): - return self.venv.unwrapped - else: - return self - - def getattr_depth_check(self, name, already_found): - """Check if an attribute reference is being hidden in a recursive call to __getattr__ - - :param name: (str) name of attribute to check for - :param already_found: (bool) whether this attribute has already been found in a wrapper - :return: (str or None) name of module whose attribute is being shadowed, if any. - """ - if hasattr(self, name) and already_found: - return "{0}.{1}".format(type(self).__module__, type(self).__name__) - else: - return None - - def _get_indices(self, indices): - """ - Convert a flexibly-typed reference to environment indices to an implied list of indices. - - :param indices: (None,int,Iterable) refers to indices of envs. - :return: (list) the implied list of indices. 
- """ - if indices is None: - indices = range(self.num_envs) - elif isinstance(indices, int): - indices = [indices] - return indices - - -class VecEnvWrapper(VecEnv): - """ - Vectorized environment base class - - :param venv: (VecEnv) the vectorized environment to wrap - :param observation_space: (Gym Space) the observation space (can be None to load from venv) - :param action_space: (Gym Space) the action space (can be None to load from venv) - """ - - def __init__(self, venv, observation_space=None, action_space=None): - self.venv = venv - VecEnv.__init__(self, num_envs=venv.num_envs, observation_space=observation_space or venv.observation_space, - action_space=action_space or venv.action_space) - self.class_attributes = dict(inspect.getmembers(self.__class__)) - - def step_async(self, actions): - self.venv.step_async(actions) - - @abstractmethod - def reset(self): - pass - - @abstractmethod - def step_wait(self): - pass - - def seed(self, seed=None): - return self.venv.seed(seed) - - def close(self): - return self.venv.close() - - def render(self, mode: str = 'human'): - return self.venv.render(mode=mode) - - def get_images(self): - return self.venv.get_images() - - def get_attr(self, attr_name, indices=None): - return self.venv.get_attr(attr_name, indices) - - def set_attr(self, attr_name, value, indices=None): - return self.venv.set_attr(attr_name, value, indices) - - def env_method(self, method_name, *method_args, indices=None, **method_kwargs): - return self.venv.env_method(method_name, *method_args, indices=indices, **method_kwargs) - - def __getattr__(self, name): - """Find attribute from wrapped venv(s) if this wrapper does not have it. - Useful for accessing attributes from venvs which are wrapped with multiple wrappers - which have unique attributes of interest. 
- """ - blocked_class = self.getattr_depth_check(name, already_found=False) - if blocked_class is not None: - own_class = "{0}.{1}".format(type(self).__module__, type(self).__name__) - format_str = ("Error: Recursive attribute lookup for {0} from {1} is " - "ambiguous and hides attribute from {2}") - raise AttributeError(format_str.format(name, own_class, blocked_class)) - - return self.getattr_recursive(name) - - def _get_all_attributes(self): - """Get all (inherited) instance and class attributes - - :return: (dict) all_attributes - """ - all_attributes = self.__dict__.copy() - all_attributes.update(self.class_attributes) - return all_attributes - - def getattr_recursive(self, name): - """Recursively check wrappers to find attribute. - - :param name (str) name of attribute to look for - :return: (object) attribute - """ - all_attributes = self._get_all_attributes() - if name in all_attributes: # attribute is present in this wrapper - attr = getattr(self, name) - elif hasattr(self.venv, 'getattr_recursive'): - # Attribute not present, child is wrapper. Call getattr_recursive rather than getattr - # to avoid a duplicate call to getattr_depth_check. - attr = self.venv.getattr_recursive(name) - else: # attribute not present, child is an unwrapped VecEnv - attr = getattr(self.venv, name) - - return attr - - def getattr_depth_check(self, name, already_found): - """See base class. - - :return: (str or None) name of module whose attribute is being shadowed, if any. - """ - all_attributes = self._get_all_attributes() - if name in all_attributes and already_found: - # this venv's attribute is being hidden because of a higher venv. - shadowed_wrapper_class = "{0}.{1}".format(type(self).__module__, type(self).__name__) - elif name in all_attributes and not already_found: - # we have found the first reference to the attribute. Now check for duplicates. - shadowed_wrapper_class = self.venv.getattr_depth_check(name, True) - else: - # this wrapper does not have the attribute. 
Keep searching. - shadowed_wrapper_class = self.venv.getattr_depth_check(name, already_found) - - return shadowed_wrapper_class - - -class CloudpickleWrapper(object): - def __init__(self, var): - """ - Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) - - :param var: (Any) the variable you wish to wrap for pickling with cloudpickle - """ - self.var = var - - def __getstate__(self): - return cloudpickle.dumps(self.var) - - def __setstate__(self, obs): - self.var = cloudpickle.loads(obs) diff --git a/stable_baselines/common/vec_env/dummy_vec_env.py b/stable_baselines/common/vec_env/dummy_vec_env.py deleted file mode 100644 index 35785f2d..00000000 --- a/stable_baselines/common/vec_env/dummy_vec_env.py +++ /dev/null @@ -1,117 +0,0 @@ -from collections import OrderedDict -from typing import Sequence -from copy import deepcopy - -import numpy as np - -from stable_baselines.common.vec_env.base_vec_env import VecEnv -from stable_baselines.common.vec_env.util import copy_obs_dict, dict_to_obs, obs_space_info - - -class DummyVecEnv(VecEnv): - """ - Creates a simple vectorized wrapper for multiple environments, calling each environment in sequence on the current - Python process. This is useful for computationally simple environment such as ``cartpole-v1``, as the overhead of - multiprocess or multithread outweighs the environment computation time. This can also be used for RL methods that - require a vectorized environment, but that you want a single environments to train with. - - :param env_fns: ([callable]) A list of functions that will create the environments - (each callable returns a `Gym.Env` instance when called). 
- """ - - def __init__(self, env_fns): - self.envs = [fn() for fn in env_fns] - env = self.envs[0] - VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) - obs_space = env.observation_space - self.keys, shapes, dtypes = obs_space_info(obs_space) - - self.buf_obs = OrderedDict([ - (k, np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k])) - for k in self.keys]) - self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) - self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) - self.buf_infos = [{} for _ in range(self.num_envs)] - self.actions = None - self.metadata = env.metadata - - def step_async(self, actions): - self.actions = actions - - def step_wait(self): - for env_idx in range(self.num_envs): - obs, self.buf_rews[env_idx], self.buf_dones[env_idx], self.buf_infos[env_idx] =\ - self.envs[env_idx].step(self.actions[env_idx]) - if self.buf_dones[env_idx]: - # save final observation where user can get it, then reset - self.buf_infos[env_idx]['terminal_observation'] = obs - obs = self.envs[env_idx].reset() - self._save_obs(env_idx, obs) - return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), - deepcopy(self.buf_infos)) - - def seed(self, seed=None): - seeds = list() - for idx, env in enumerate(self.envs): - seeds.append(env.seed(seed + idx)) - return seeds - - def reset(self): - for env_idx in range(self.num_envs): - obs = self.envs[env_idx].reset() - self._save_obs(env_idx, obs) - return self._obs_from_buf() - - def close(self): - for env in self.envs: - env.close() - - def get_images(self) -> Sequence[np.ndarray]: - return [env.render(mode='rgb_array') for env in self.envs] - - def render(self, mode: str = 'human'): - """ - Gym environment rendering. If there are multiple environments then - they are tiled together in one image via `BaseVecEnv.render()`. - Otherwise (if `self.num_envs == 1`), we pass the render call directly to the - underlying environment. 
- - Therefore, some arguments such as `mode` will have values that are valid - only when `num_envs == 1`. - - :param mode: The rendering type. - """ - if self.num_envs == 1: - return self.envs[0].render(mode=mode) - else: - return super().render(mode=mode) - - def _save_obs(self, env_idx, obs): - for key in self.keys: - if key is None: - self.buf_obs[key][env_idx] = obs - else: - self.buf_obs[key][env_idx] = obs[key] - - def _obs_from_buf(self): - return dict_to_obs(self.observation_space, copy_obs_dict(self.buf_obs)) - - def get_attr(self, attr_name, indices=None): - """Return attribute from vectorized environment (see base class).""" - target_envs = self._get_target_envs(indices) - return [getattr(env_i, attr_name) for env_i in target_envs] - - def set_attr(self, attr_name, value, indices=None): - """Set attribute inside vectorized environments (see base class).""" - target_envs = self._get_target_envs(indices) - for env_i in target_envs: - setattr(env_i, attr_name, value) - - def env_method(self, method_name, *method_args, indices=None, **method_kwargs): - """Call instance methods of vectorized environments.""" - target_envs = self._get_target_envs(indices) - return [getattr(env_i, method_name)(*method_args, **method_kwargs) for env_i in target_envs] - - def _get_target_envs(self, indices): - indices = self._get_indices(indices) - return [self.envs[i] for i in indices] diff --git a/stable_baselines/common/vec_env/subproc_vec_env.py b/stable_baselines/common/vec_env/subproc_vec_env.py deleted file mode 100644 index a6b356f8..00000000 --- a/stable_baselines/common/vec_env/subproc_vec_env.py +++ /dev/null @@ -1,210 +0,0 @@ -import os -import multiprocessing -from collections import OrderedDict -from typing import Sequence - -import gym -import numpy as np - -from stable_baselines.common.vec_env.base_vec_env import VecEnv, CloudpickleWrapper - - -def _worker(remote, parent_remote, env_fn_wrapper): - parent_remote.close() - env = env_fn_wrapper.var() - while True: - 
try: - cmd, data = remote.recv() - if cmd == 'step': - observation, reward, done, info = env.step(data) - if done: - # save final observation where user can get it, then reset - info['terminal_observation'] = observation - observation = env.reset() - remote.send((observation, reward, done, info)) - elif cmd == 'seed': - remote.send(env.seed(data)) - elif cmd == 'reset': - observation = env.reset() - remote.send(observation) - elif cmd == 'render': - remote.send(env.render(data)) - elif cmd == 'close': - env.close() - remote.close() - break - elif cmd == 'get_spaces': - remote.send((env.observation_space, env.action_space)) - elif cmd == 'env_method': - method = getattr(env, data[0]) - remote.send(method(*data[1], **data[2])) - elif cmd == 'get_attr': - remote.send(getattr(env, data)) - elif cmd == 'set_attr': - remote.send(setattr(env, data[0], data[1])) - else: - raise NotImplementedError("`{}` is not implemented in the worker".format(cmd)) - except EOFError: - break - - -class SubprocVecEnv(VecEnv): - """ - Creates a multiprocess vectorized wrapper for multiple environments, distributing each environment to its own - process, allowing significant speed up when the environment is computationally complex. - - For performance reasons, if your environment is not IO bound, the number of environments should not exceed the - number of logical cores on your CPU. - - .. warning:: - - Only 'forkserver' and 'spawn' start methods are thread-safe, - which is important when TensorFlow sessions or other non thread-safe - libraries are used in the parent (see issue #217). However, compared to - 'fork' they incur a small start-up cost and have restrictions on - global variables. With those methods, users must wrap the code in an - ``if __name__ == "__main__":`` block. - For more information, see the multiprocessing documentation. - - :param env_fns: ([callable]) A list of functions that will create the environments - (each callable returns a `Gym.Env` instance when called). 
- :param start_method: (str) method used to start the subprocesses. - Must be one of the methods returned by multiprocessing.get_all_start_methods(). - Defaults to 'forkserver' on available platforms, and 'spawn' otherwise. - """ - - def __init__(self, env_fns, start_method=None): - self.waiting = False - self.closed = False - n_envs = len(env_fns) - - # In some cases (like on GitHub workflow machine when running tests), - # "forkserver" method results in an "connection error" (probably due to mpi) - # We allow to bypass the default start method if an environment variable - # is specified by the user - if start_method is None: - start_method = os.environ.get("DEFAULT_START_METHOD") - - # No DEFAULT_START_METHOD was specified, start_method may still be None - if start_method is None: - # Fork is not a thread safe method (see issue #217) - # but is more user friendly (does not require to wrap the code in - # a `if __name__ == "__main__":`) - forkserver_available = 'forkserver' in multiprocessing.get_all_start_methods() - start_method = 'forkserver' if forkserver_available else 'spawn' - ctx = multiprocessing.get_context(start_method) - - self.remotes, self.work_remotes = zip(*[ctx.Pipe(duplex=True) for _ in range(n_envs)]) - self.processes = [] - for work_remote, remote, env_fn in zip(self.work_remotes, self.remotes, env_fns): - args = (work_remote, remote, CloudpickleWrapper(env_fn)) - # daemon=True: if the main process crashes, we should not cause things to hang - process = ctx.Process(target=_worker, args=args, daemon=True) # pytype:disable=attribute-error - process.start() - self.processes.append(process) - work_remote.close() - - self.remotes[0].send(('get_spaces', None)) - observation_space, action_space = self.remotes[0].recv() - VecEnv.__init__(self, len(env_fns), observation_space, action_space) - - def step_async(self, actions): - for remote, action in zip(self.remotes, actions): - remote.send(('step', action)) - self.waiting = True - - def step_wait(self): 
- results = [remote.recv() for remote in self.remotes] - self.waiting = False - obs, rews, dones, infos = zip(*results) - return _flatten_obs(obs, self.observation_space), np.stack(rews), np.stack(dones), infos - - def seed(self, seed=None): - for idx, remote in enumerate(self.remotes): - remote.send(('seed', seed + idx)) - return [remote.recv() for remote in self.remotes] - - def reset(self): - for remote in self.remotes: - remote.send(('reset', None)) - obs = [remote.recv() for remote in self.remotes] - return _flatten_obs(obs, self.observation_space) - - def close(self): - if self.closed: - return - if self.waiting: - for remote in self.remotes: - remote.recv() - for remote in self.remotes: - remote.send(('close', None)) - for process in self.processes: - process.join() - self.closed = True - - def get_images(self) -> Sequence[np.ndarray]: - for pipe in self.remotes: - # gather images from subprocesses - # `mode` will be taken into account later - pipe.send(('render', 'rgb_array')) - imgs = [pipe.recv() for pipe in self.remotes] - return imgs - - def get_attr(self, attr_name, indices=None): - """Return attribute from vectorized environment (see base class).""" - target_remotes = self._get_target_remotes(indices) - for remote in target_remotes: - remote.send(('get_attr', attr_name)) - return [remote.recv() for remote in target_remotes] - - def set_attr(self, attr_name, value, indices=None): - """Set attribute inside vectorized environments (see base class).""" - target_remotes = self._get_target_remotes(indices) - for remote in target_remotes: - remote.send(('set_attr', (attr_name, value))) - for remote in target_remotes: - remote.recv() - - def env_method(self, method_name, *method_args, indices=None, **method_kwargs): - """Call instance methods of vectorized environments.""" - target_remotes = self._get_target_remotes(indices) - for remote in target_remotes: - remote.send(('env_method', (method_name, method_args, method_kwargs))) - return [remote.recv() for 
remote in target_remotes] - - def _get_target_remotes(self, indices): - """ - Get the connection object needed to communicate with the wanted - envs that are in subprocesses. - - :param indices: (None,int,Iterable) refers to indices of envs. - :return: ([multiprocessing.Connection]) Connection object to communicate between processes. - """ - indices = self._get_indices(indices) - return [self.remotes[i] for i in indices] - - -def _flatten_obs(obs, space): - """ - Flatten observations, depending on the observation space. - - :param obs: (list or tuple where X is dict, tuple or ndarray) observations. - A list or tuple of observations, one per environment. - Each environment observation may be a NumPy array, or a dict or tuple of NumPy arrays. - :return (OrderedDict, tuple or ndarray) flattened observations. - A flattened NumPy array or an OrderedDict or tuple of flattened numpy arrays. - Each NumPy array has the environment index as its first axis. - """ - assert isinstance(obs, (list, tuple)), "expected list or tuple of observations per environment" - assert len(obs) > 0, "need observations from at least one environment" - - if isinstance(space, gym.spaces.Dict): - assert isinstance(space.spaces, OrderedDict), "Dict space must have ordered subspaces" - assert isinstance(obs[0], dict), "non-dict observation for environment with Dict observation space" - return OrderedDict([(k, np.stack([o[k] for o in obs])) for k in space.spaces.keys()]) - elif isinstance(space, gym.spaces.Tuple): - assert isinstance(obs[0], tuple), "non-tuple observation for environment with Tuple observation space" - obs_len = len(space.spaces) - return tuple((np.stack([o[i] for o in obs]) for i in range(obs_len))) - else: - return np.stack(obs) diff --git a/stable_baselines/common/vec_env/util.py b/stable_baselines/common/vec_env/util.py deleted file mode 100644 index 03ce286d..00000000 --- a/stable_baselines/common/vec_env/util.py +++ /dev/null @@ -1,73 +0,0 @@ -""" -Helpers for dealing with 
vectorized environments. -""" - -from collections import OrderedDict - -import gym -import numpy as np - - -def copy_obs_dict(obs): - """ - Deep-copy a dict of numpy arrays. - - :param obs: (OrderedDict): a dict of numpy arrays. - :return (OrderedDict) a dict of copied numpy arrays. - """ - assert isinstance(obs, OrderedDict), "unexpected type for observations '{}'".format(type(obs)) - return OrderedDict([(k, np.copy(v)) for k, v in obs.items()]) - - -def dict_to_obs(space, obs_dict): - """ - Convert an internal representation raw_obs into the appropriate type - specified by space. - - :param space: (gym.spaces.Space) an observation space. - :param obs_dict: (OrderedDict) a dict of numpy arrays. - :return (ndarray, tuple or dict): returns an observation - of the same type as space. If space is Dict, function is identity; - if space is Tuple, converts dict to Tuple; otherwise, space is - unstructured and returns the value raw_obs[None]. - """ - if isinstance(space, gym.spaces.Dict): - return obs_dict - elif isinstance(space, gym.spaces.Tuple): - assert len(obs_dict) == len(space.spaces), "size of observation does not match size of observation space" - return tuple((obs_dict[i] for i in range(len(space.spaces)))) - else: - assert set(obs_dict.keys()) == {None}, "multiple observation keys for unstructured observation space" - return obs_dict[None] - - -def obs_space_info(obs_space): - """ - Get dict-structured information about a gym.Space. - - Dict spaces are represented directly by their dict of subspaces. - Tuple spaces are converted into a dict with keys indexing into the tuple. - Unstructured spaces are represented by {None: obs_space}. - - :param obs_space: (gym.spaces.Space) an observation space - :return (tuple) A tuple (keys, shapes, dtypes): - keys: a list of dict keys. - shapes: a dict mapping keys to shapes. - dtypes: a dict mapping keys to dtypes. 
- """ - if isinstance(obs_space, gym.spaces.Dict): - assert isinstance(obs_space.spaces, OrderedDict), "Dict space must have ordered subspaces" - subspaces = obs_space.spaces - elif isinstance(obs_space, gym.spaces.Tuple): - subspaces = {i: space for i, space in enumerate(obs_space.spaces)} - else: - assert not hasattr(obs_space, 'spaces'), "Unsupported structured space '{}'".format(type(obs_space)) - subspaces = {None: obs_space} - keys = [] - shapes = {} - dtypes = {} - for key, box in subspaces.items(): - keys.append(key) - shapes[key] = box.shape - dtypes[key] = box.dtype - return keys, shapes, dtypes diff --git a/stable_baselines/common/vec_env/vec_check_nan.py b/stable_baselines/common/vec_env/vec_check_nan.py deleted file mode 100644 index 6deda56c..00000000 --- a/stable_baselines/common/vec_env/vec_check_nan.py +++ /dev/null @@ -1,86 +0,0 @@ -import warnings - -import numpy as np - -from stable_baselines.common.vec_env.base_vec_env import VecEnvWrapper - - -class VecCheckNan(VecEnvWrapper): - """ - NaN and inf checking wrapper for vectorized environment, will raise a warning by default, - allowing you to know from what the NaN of inf originated from. - - :param venv: (VecEnv) the vectorized environment to wrap - :param raise_exception: (bool) Whether or not to raise a ValueError, instead of a UserWarning - :param warn_once: (bool) Whether or not to only warn once. 
- :param check_inf: (bool) Whether or not to check for +inf or -inf as well - """ - - def __init__(self, venv, raise_exception=False, warn_once=True, check_inf=True): - VecEnvWrapper.__init__(self, venv) - self.raise_exception = raise_exception - self.warn_once = warn_once - self.check_inf = check_inf - self._actions = None - self._observations = None - self._user_warned = False - - def step_async(self, actions): - self._check_val(async_step=True, actions=actions) - - self._actions = actions - self.venv.step_async(actions) - - def step_wait(self): - observations, rewards, news, infos = self.venv.step_wait() - - self._check_val(async_step=False, observations=observations, rewards=rewards, news=news) - - self._observations = observations - return observations, rewards, news, infos - - def reset(self): - observations = self.venv.reset() - self._actions = None - - self._check_val(async_step=False, observations=observations) - - self._observations = observations - return observations - - def _check_val(self, *, async_step, **kwargs): - # if warn and warn once and have warned once: then stop checking - if not self.raise_exception and self.warn_once and self._user_warned: - return - - found = [] - for name, val in kwargs.items(): - has_nan = np.any(np.isnan(val)) - has_inf = self.check_inf and np.any(np.isinf(val)) - if has_inf: - found.append((name, "inf")) - if has_nan: - found.append((name, "nan")) - - if found: - self._user_warned = True - msg = "" - for i, (name, type_val) in enumerate(found): - msg += "found {} in {}".format(type_val, name) - if i != len(found) - 1: - msg += ", " - - msg += ".\r\nOriginated from the " - - if not async_step: - if self._actions is None: - msg += "environment observation (at reset)" - else: - msg += "environment, Last given value was: \r\n\taction={}".format(self._actions) - else: - msg += "RL model, Last given value was: \r\n\tobservations={}".format(self._observations) - - if self.raise_exception: - raise ValueError(msg) - else: - 
warnings.warn(msg, UserWarning) diff --git a/stable_baselines/common/vec_env/vec_frame_stack.py b/stable_baselines/common/vec_env/vec_frame_stack.py deleted file mode 100644 index 044102eb..00000000 --- a/stable_baselines/common/vec_env/vec_frame_stack.py +++ /dev/null @@ -1,55 +0,0 @@ -import warnings - -import numpy as np -from gym import spaces - -from stable_baselines.common.vec_env.base_vec_env import VecEnvWrapper - - -class VecFrameStack(VecEnvWrapper): - """ - Frame stacking wrapper for vectorized environment - - :param venv: (VecEnv) the vectorized environment to wrap - :param n_stack: (int) Number of frames to stack - """ - - def __init__(self, venv, n_stack): - self.venv = venv - self.n_stack = n_stack - wrapped_obs_space = venv.observation_space - low = np.repeat(wrapped_obs_space.low, self.n_stack, axis=-1) - high = np.repeat(wrapped_obs_space.high, self.n_stack, axis=-1) - self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype) - observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) - VecEnvWrapper.__init__(self, venv, observation_space=observation_space) - - def step_wait(self): - observations, rewards, dones, infos = self.venv.step_wait() - last_ax_size = observations.shape[-1] - self.stackedobs = np.roll(self.stackedobs, shift=-last_ax_size, axis=-1) - for i, done in enumerate(dones): - if done: - if 'terminal_observation' in infos[i]: - old_terminal = infos[i]['terminal_observation'] - new_terminal = np.concatenate( - (self.stackedobs[i, ..., :-last_ax_size], old_terminal), axis=-1) - infos[i]['terminal_observation'] = new_terminal - else: - warnings.warn( - "VecFrameStack wrapping a VecEnv without terminal_observation info") - self.stackedobs[i] = 0 - self.stackedobs[..., -observations.shape[-1]:] = observations - return self.stackedobs, rewards, dones, infos - - def reset(self): - """ - Reset all environments - """ - obs = self.venv.reset() - self.stackedobs[...] 
= 0 - self.stackedobs[..., -obs.shape[-1]:] = obs - return self.stackedobs - - def close(self): - self.venv.close() diff --git a/stable_baselines/common/vec_env/vec_normalize.py b/stable_baselines/common/vec_env/vec_normalize.py deleted file mode 100644 index 6ab308b1..00000000 --- a/stable_baselines/common/vec_env/vec_normalize.py +++ /dev/null @@ -1,198 +0,0 @@ -import pickle -import warnings - -import numpy as np - -from stable_baselines.common.vec_env.base_vec_env import VecEnvWrapper -from stable_baselines.common.running_mean_std import RunningMeanStd - - -class VecNormalize(VecEnvWrapper): - """ - A moving average, normalizing wrapper for vectorized environment. - - It is pickleable which will save moving averages and configuration parameters. - The wrapped environment `venv` is not saved, and must be restored manually with - `set_venv` after being unpickled. - - :param venv: (VecEnv) the vectorized environment to wrap - :param training: (bool) Whether to update or not the moving average - :param norm_obs: (bool) Whether to normalize observation or not (default: True) - :param norm_reward: (bool) Whether to normalize rewards or not (default: True) - :param clip_obs: (float) Max absolute value for observation - :param clip_reward: (float) Max value absolute for discounted reward - :param gamma: (float) discount factor - :param epsilon: (float) To avoid division by zero - """ - - def __init__(self, venv, training=True, norm_obs=True, norm_reward=True, - clip_obs=10., clip_reward=10., gamma=0.99, epsilon=1e-8): - VecEnvWrapper.__init__(self, venv) - self.obs_rms = RunningMeanStd(shape=self.observation_space.shape) - self.ret_rms = RunningMeanStd(shape=()) - self.clip_obs = clip_obs - self.clip_reward = clip_reward - # Returns: discounted rewards - self.ret = np.zeros(self.num_envs) - self.gamma = gamma - self.epsilon = epsilon - self.training = training - self.norm_obs = norm_obs - self.norm_reward = norm_reward - self.old_obs = None - self.old_rews = None - - 
def __getstate__(self): - """ - Gets state for pickling. - - Excludes self.venv, as in general VecEnv's may not be pickleable.""" - state = self.__dict__.copy() - # these attributes are not pickleable - del state['venv'] - del state['class_attributes'] - # these attributes depend on the above and so we would prefer not to pickle - del state['ret'] - return state - - def __setstate__(self, state): - """ - Restores pickled state. - - User must call set_venv() after unpickling before using. - - :param state: (dict)""" - self.__dict__.update(state) - assert 'venv' not in state - self.venv = None - - def set_venv(self, venv): - """ - Sets the vector environment to wrap to venv. - - Also sets attributes derived from this such as `num_env`. - - :param venv: (VecEnv) - """ - if self.venv is not None: - raise ValueError("Trying to set venv of already initialized VecNormalize wrapper.") - VecEnvWrapper.__init__(self, venv) - if self.obs_rms.mean.shape != self.observation_space.shape: - raise ValueError("venv is incompatible with current statistics.") - self.ret = np.zeros(self.num_envs) - - def step_wait(self): - """ - Apply sequence of actions to sequence of environments - actions -> (observations, rewards, news) - - where 'news' is a boolean vector indicating whether each element is new. - """ - obs, rews, news, infos = self.venv.step_wait() - self.old_obs = obs - self.old_rews = rews - - if self.training: - self.obs_rms.update(obs) - obs = self.normalize_obs(obs) - - if self.training: - self._update_reward(rews) - rews = self.normalize_reward(rews) - - self.ret[news] = 0 - return obs, rews, news, infos - - def _update_reward(self, reward: np.ndarray) -> None: - """Update reward normalization statistics.""" - self.ret = self.ret * self.gamma + reward - self.ret_rms.update(self.ret) - - def normalize_obs(self, obs: np.ndarray) -> np.ndarray: - """ - Normalize observations using this VecNormalize's observations statistics. - Calling this method does not update statistics. 
- """ - if self.norm_obs: - obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon), - -self.clip_obs, - self.clip_obs) - return obs - - def normalize_reward(self, reward: np.ndarray) -> np.ndarray: - """ - Normalize rewards using this VecNormalize's rewards statistics. - Calling this method does not update statistics. - """ - if self.norm_reward: - reward = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon), - -self.clip_reward, self.clip_reward) - return reward - - def get_original_obs(self) -> np.ndarray: - """ - Returns an unnormalized version of the observations from the most recent - step or reset. - """ - return self.old_obs.copy() - - def get_original_reward(self) -> np.ndarray: - """ - Returns an unnormalized version of the rewards from the most recent step. - """ - return self.old_rews.copy() - - def reset(self): - """ - Reset all environments - """ - obs = self.venv.reset() - self.old_obs = obs - self.ret = np.zeros(self.num_envs) - if self.training: - self._update_reward(self.ret) - return self.normalize_obs(obs) - - @staticmethod - def load(load_path, venv): - """ - Loads a saved VecNormalize object. - - :param load_path: the path to load from. - :param venv: the VecEnv to wrap. - :return: (VecNormalize) - """ - with open(load_path, "rb") as file_handler: - vec_normalize = pickle.load(file_handler) - vec_normalize.set_venv(venv) - return vec_normalize - - def save(self, save_path): - with open(save_path, "wb") as file_handler: - pickle.dump(self, file_handler) - - def save_running_average(self, path): - """ - :param path: (str) path to log dir - - .. deprecated:: 2.9.0 - This function will be removed in a future version - """ - warnings.warn("Usage of `save_running_average` is deprecated. 
Please " - "use `save` or pickle instead.", DeprecationWarning) - for rms, name in zip([self.obs_rms, self.ret_rms], ['obs_rms', 'ret_rms']): - with open("{}/{}.pkl".format(path, name), 'wb') as file_handler: - pickle.dump(rms, file_handler) - - def load_running_average(self, path): - """ - :param path: (str) path to log dir - - .. deprecated:: 2.9.0 - This function will be removed in a future version - """ - warnings.warn("Usage of `load_running_average` is deprecated. Please " - "use `load` or pickle instead.", DeprecationWarning) - for name in ['obs_rms', 'ret_rms']: - with open("{}/{}.pkl".format(path, name), 'rb') as file_handler: - setattr(self, name, pickle.load(file_handler)) diff --git a/stable_baselines/common/vec_env/vec_video_recorder.py b/stable_baselines/common/vec_env/vec_video_recorder.py deleted file mode 100644 index 23478246..00000000 --- a/stable_baselines/common/vec_env/vec_video_recorder.py +++ /dev/null @@ -1,112 +0,0 @@ -import os - -from gym.wrappers.monitoring import video_recorder - -from stable_baselines import logger -from stable_baselines.common.vec_env.base_vec_env import VecEnvWrapper -from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv -from stable_baselines.common.vec_env.subproc_vec_env import SubprocVecEnv -from stable_baselines.common.vec_env.vec_frame_stack import VecFrameStack -from stable_baselines.common.vec_env.vec_normalize import VecNormalize - - -class VecVideoRecorder(VecEnvWrapper): - """ - Wraps a VecEnv or VecEnvWrapper object to record rendered image as mp4 video. - It requires ffmpeg or avconv to be installed on the machine. - - :param venv: (VecEnv or VecEnvWrapper) - :param video_folder: (str) Where to save videos - :param record_video_trigger: (func) Function that defines when to start recording. - The function takes the current number of step, - and returns whether we should start recording or not. 
- :param video_length: (int) Length of recorded videos - :param name_prefix: (str) Prefix to the video name - """ - - def __init__(self, venv, video_folder, record_video_trigger, - video_length=200, name_prefix='rl-video'): - - VecEnvWrapper.__init__(self, venv) - - self.env = venv - # Temp variable to retrieve metadata - temp_env = venv - - # Unwrap to retrieve metadata dict - # that will be used by gym recorder - while isinstance(temp_env, VecNormalize) or isinstance(temp_env, VecFrameStack): - temp_env = temp_env.venv - - if isinstance(temp_env, DummyVecEnv) or isinstance(temp_env, SubprocVecEnv): - metadata = temp_env.get_attr('metadata')[0] - else: - metadata = temp_env.metadata - - self.env.metadata = metadata - - self.record_video_trigger = record_video_trigger - self.video_recorder = None - - self.video_folder = os.path.abspath(video_folder) - # Create output folder if needed - os.makedirs(self.video_folder, exist_ok=True) - - self.name_prefix = name_prefix - self.step_id = 0 - self.video_length = video_length - - self.recording = False - self.recorded_frames = 0 - - def reset(self): - obs = self.venv.reset() - self.start_video_recorder() - return obs - - def start_video_recorder(self): - self.close_video_recorder() - - video_name = '{}-step-{}-to-step-{}'.format(self.name_prefix, self.step_id, - self.step_id + self.video_length) - base_path = os.path.join(self.video_folder, video_name) - self.video_recorder = video_recorder.VideoRecorder( - env=self.env, - base_path=base_path, - metadata={'step_id': self.step_id} - ) - - self.video_recorder.capture_frame() - self.recorded_frames = 1 - self.recording = True - - def _video_enabled(self): - return self.record_video_trigger(self.step_id) - - def step_wait(self): - obs, rews, dones, infos = self.venv.step_wait() - - self.step_id += 1 - if self.recording: - self.video_recorder.capture_frame() - self.recorded_frames += 1 - if self.recorded_frames > self.video_length: - logger.info("Saving video to ", 
self.video_recorder.path) - self.close_video_recorder() - elif self._video_enabled(): - self.start_video_recorder() - - return obs, rews, dones, infos - - def close_video_recorder(self): - if self.recording: - self.video_recorder.close() - self.recording = False - self.recorded_frames = 1 - - def close(self): - VecEnvWrapper.close(self) - self.close_video_recorder() - - def __del__(self): - self.close() diff --git a/stable_baselines/ddpg/__init__.py b/stable_baselines/ddpg/__init__.py deleted file mode 100644 index 1ce12c02..00000000 --- a/stable_baselines/ddpg/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from stable_baselines.common.noise import AdaptiveParamNoiseSpec, NormalActionNoise, OrnsteinUhlenbeckActionNoise -from stable_baselines.ddpg.ddpg import DDPG -from stable_baselines.ddpg.policies import MlpPolicy, CnnPolicy, LnMlpPolicy, LnCnnPolicy diff --git a/stable_baselines/ddpg/ddpg.py b/stable_baselines/ddpg/ddpg.py deleted file mode 100644 index 5e806f3c..00000000 --- a/stable_baselines/ddpg/ddpg.py +++ /dev/null @@ -1,1175 +0,0 @@ -from functools import reduce -import os -import time -from collections import deque -import pickle -import warnings - -import gym -import numpy as np -import tensorflow as tf -import tensorflow.contrib as tc -from mpi4py import MPI - -from stable_baselines import logger -from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter -from stable_baselines.common.vec_env import VecEnv -from stable_baselines.common.mpi_adam import MpiAdam -from stable_baselines.common.buffers import ReplayBuffer -from stable_baselines.common.math_util import unscale_action, scale_action -from stable_baselines.common.mpi_running_mean_std import RunningMeanStd -from stable_baselines.ddpg.policies import DDPGPolicy - - -def normalize(tensor, stats): - """ - normalize a tensor using a running mean and std - - :param tensor: (TensorFlow Tensor) the input tensor - :param stats: (RunningMeanStd) the running mean and std of 
the input to normalize - :return: (TensorFlow Tensor) the normalized tensor - """ - if stats is None: - return tensor - return (tensor - stats.mean) / stats.std - - -def denormalize(tensor, stats): - """ - denormalize a tensor using a running mean and std - - :param tensor: (TensorFlow Tensor) the normalized tensor - :param stats: (RunningMeanStd) the running mean and std of the input to normalize - :return: (TensorFlow Tensor) the restored tensor - """ - if stats is None: - return tensor - return tensor * stats.std + stats.mean - - -def reduce_std(tensor, axis=None, keepdims=False): - """ - get the standard deviation of a Tensor - - :param tensor: (TensorFlow Tensor) the input tensor - :param axis: (int or [int]) the axis to itterate the std over - :param keepdims: (bool) keep the other dimensions the same - :return: (TensorFlow Tensor) the std of the tensor - """ - return tf.sqrt(reduce_var(tensor, axis=axis, keepdims=keepdims)) - - -def reduce_var(tensor, axis=None, keepdims=False): - """ - get the variance of a Tensor - - :param tensor: (TensorFlow Tensor) the input tensor - :param axis: (int or [int]) the axis to itterate the variance over - :param keepdims: (bool) keep the other dimensions the same - :return: (TensorFlow Tensor) the variance of the tensor - """ - tensor_mean = tf.reduce_mean(tensor, axis=axis, keepdims=True) - devs_squared = tf.square(tensor - tensor_mean) - return tf.reduce_mean(devs_squared, axis=axis, keepdims=keepdims) - - -def get_target_updates(_vars, target_vars, tau, verbose=0): - """ - get target update operations - - :param _vars: ([TensorFlow Tensor]) the initial variables - :param target_vars: ([TensorFlow Tensor]) the target variables - :param tau: (float) the soft update coefficient (keep old values, between 0 and 1) - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :return: (TensorFlow Operation, TensorFlow Operation) initial update, soft update - """ - if verbose >= 2: - 
logger.info('setting up target updates ...') - soft_updates = [] - init_updates = [] - assert len(_vars) == len(target_vars) - for var, target_var in zip(_vars, target_vars): - if verbose >= 2: - logger.info(' {} <- {}'.format(target_var.name, var.name)) - init_updates.append(tf.assign(target_var, var)) - soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var)) - assert len(init_updates) == len(_vars) - assert len(soft_updates) == len(_vars) - return tf.group(*init_updates), tf.group(*soft_updates) - - -def get_perturbable_vars(scope): - """ - Get the trainable variables that can be perturbed when using - parameter noise. - - :param scope: (str) tensorflow scope of the variables - :return: ([tf.Variables]) - """ - return [var for var in tf_util.get_trainable_vars(scope) if 'LayerNorm' not in var.name] - - -def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev, verbose=0): - """ - Get the actor update, with noise. - - :param actor: (str) the actor - :param perturbed_actor: (str) the pertubed actor - :param param_noise_stddev: (float) the std of the parameter noise - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :return: (TensorFlow Operation) the update function - """ - assert len(tf_util.get_globals_vars(actor)) == len(tf_util.get_globals_vars(perturbed_actor)) - assert len(get_perturbable_vars(actor)) == len(get_perturbable_vars(perturbed_actor)) - - updates = [] - for var, perturbed_var in zip(tf_util.get_globals_vars(actor), tf_util.get_globals_vars(perturbed_actor)): - if var in get_perturbable_vars(actor): - if verbose >= 2: - logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name)) - # Add Gaussian noise to the parameter - updates.append(tf.assign(perturbed_var, - var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev))) - else: - if verbose >= 2: - logger.info(' {} <- {}'.format(perturbed_var.name, var.name)) - 
updates.append(tf.assign(perturbed_var, var)) - assert len(updates) == len(tf_util.get_globals_vars(actor)) - return tf.group(*updates) - - -class DDPG(OffPolicyRLModel): - """ - Deep Deterministic Policy Gradient (DDPG) model - - DDPG: https://arxiv.org/pdf/1509.02971.pdf - - :param policy: (DDPGPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) - :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) - :param gamma: (float) the discount factor - :param memory_policy: (ReplayBuffer) the replay buffer - (if None, default to baselines.deepq.replay_buffer.ReplayBuffer) - - .. deprecated:: 2.6.0 - This parameter will be removed in a future version - - :param eval_env: (Gym Environment) the evaluation environment (can be None) - :param nb_train_steps: (int) the number of training steps - :param nb_rollout_steps: (int) the number of rollout steps - :param nb_eval_steps: (int) the number of evaluation steps - :param param_noise: (AdaptiveParamNoiseSpec) the parameter noise type (can be None) - :param action_noise: (ActionNoise) the action noise type (can be None) - :param param_noise_adaption_interval: (int) apply param noise every N steps - :param tau: (float) the soft update coefficient (keep old values, between 0 and 1) - :param normalize_returns: (bool) should the critic output be normalized - :param enable_popart: (bool) enable pop-art normalization of the critic output - (https://arxiv.org/pdf/1602.07714.pdf), normalize_returns must be set to True. 
- :param normalize_observations: (bool) should the observation be normalized - :param batch_size: (int) the size of the batch for learning the policy - :param observation_range: (tuple) the bounding values for the observation - :param return_range: (tuple) the bounding values for the critic output - :param critic_l2_reg: (float) l2 regularizer coefficient - :param actor_lr: (float) the actor learning rate - :param critic_lr: (float) the critic learning rate - :param clip_norm: (float) clip the gradients (disabled if None) - :param reward_scale: (float) the value the reward should be scaled by - :param render: (bool) enable rendering of the environment - :param render_eval: (bool) enable rendering of the evaluation environment - :param memory_limit: (int) the max number of transitions to store, size of the replay buffer - - .. deprecated:: 2.6.0 - Use `buffer_size` instead. - - :param buffer_size: (int) the max number of transitions to store, size of the replay buffer - :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy) - This is not needed for DDPG normally but can help exploring when using HER + DDPG. - This hack was present in the original OpenAI Baselines repo (DDPG + HER) - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) - :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance - :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param full_tensorboard_log: (bool) enable additional logging when using tensorboard - WARNING: this logging can take a lot of space quickly - :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). - If None (default), use random seed. Note that if you want completely deterministic - results, you must set `n_cpu_tf_sess` to 1. 
- :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations - If None, the number of cpu of the current machine will be used. - """ - def __init__(self, policy, env, gamma=0.99, memory_policy=None, eval_env=None, nb_train_steps=50, - nb_rollout_steps=100, nb_eval_steps=100, param_noise=None, action_noise=None, - normalize_observations=False, tau=0.001, batch_size=128, param_noise_adaption_interval=50, - normalize_returns=False, enable_popart=False, observation_range=(-5., 5.), critic_l2_reg=0., - return_range=(-np.inf, np.inf), actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., - render=False, render_eval=False, memory_limit=None, buffer_size=50000, random_exploration=0.0, - verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, - full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1): - - super(DDPG, self).__init__(policy=policy, env=env, replay_buffer=None, - verbose=verbose, policy_base=DDPGPolicy, - requires_vec_env=False, policy_kwargs=policy_kwargs, - seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) - - # Parameters. 
- self.gamma = gamma - self.tau = tau - - # TODO: remove this param in v3.x.x - if memory_policy is not None: - warnings.warn("memory_policy will be removed in a future version (v3.x.x) " - "it is now ignored and replaced with ReplayBuffer", DeprecationWarning) - - if memory_limit is not None: - warnings.warn("memory_limit will be removed in a future version (v3.x.x) " - "use buffer_size instead", DeprecationWarning) - buffer_size = memory_limit - - self.normalize_observations = normalize_observations - self.normalize_returns = normalize_returns - self.action_noise = action_noise - self.param_noise = param_noise - self.return_range = return_range - self.observation_range = observation_range - self.actor_lr = actor_lr - self.critic_lr = critic_lr - self.clip_norm = clip_norm - self.enable_popart = enable_popart - self.reward_scale = reward_scale - self.batch_size = batch_size - self.critic_l2_reg = critic_l2_reg - self.eval_env = eval_env - self.render = render - self.render_eval = render_eval - self.nb_eval_steps = nb_eval_steps - self.param_noise_adaption_interval = param_noise_adaption_interval - self.nb_train_steps = nb_train_steps - self.nb_rollout_steps = nb_rollout_steps - self.memory_limit = memory_limit - self.buffer_size = buffer_size - self.tensorboard_log = tensorboard_log - self.full_tensorboard_log = full_tensorboard_log - self.random_exploration = random_exploration - - # init - self.graph = None - self.stats_sample = None - self.replay_buffer = None - self.policy_tf = None - self.target_init_updates = None - self.target_soft_updates = None - self.critic_loss = None - self.critic_grads = None - self.critic_optimizer = None - self.sess = None - self.stats_ops = None - self.stats_names = None - self.perturbed_actor_tf = None - self.perturb_policy_ops = None - self.perturb_adaptive_policy_ops = None - self.adaptive_policy_distance = None - self.actor_loss = None - self.actor_grads = None - self.actor_optimizer = None - self.old_std = None - self.old_mean 
= None - self.renormalize_q_outputs_op = None - self.obs_rms = None - self.ret_rms = None - self.target_policy = None - self.actor_tf = None - self.normalized_critic_tf = None - self.critic_tf = None - self.normalized_critic_with_actor_tf = None - self.critic_with_actor_tf = None - self.target_q = None - self.obs_train = None - self.action_train_ph = None - self.obs_target = None - self.action_target = None - self.obs_noise = None - self.action_noise_ph = None - self.obs_adapt_noise = None - self.action_adapt_noise = None - self.terminals_ph = None - self.rewards = None - self.actions = None - self.critic_target = None - self.param_noise_stddev = None - self.param_noise_actor = None - self.adaptive_param_noise_actor = None - self.params = None - self.summary = None - self.tb_seen_steps = None - - self.target_params = None - self.obs_rms_params = None - self.ret_rms_params = None - - if _init_setup_model: - self.setup_model() - - def _get_pretrain_placeholders(self): - policy = self.policy_tf - # Rescale - deterministic_action = unscale_action(self.action_space, self.actor_tf) - return policy.obs_ph, self.actions, deterministic_action - - def setup_model(self): - with SetVerbosity(self.verbose): - - assert isinstance(self.action_space, gym.spaces.Box), \ - "Error: DDPG cannot output a {} action space, only spaces.Box is supported.".format(self.action_space) - assert issubclass(self.policy, DDPGPolicy), "Error: the input policy for the DDPG model must be " \ - "an instance of DDPGPolicy." - - self.graph = tf.Graph() - with self.graph.as_default(): - self.set_random_seed(self.seed) - self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) - - self.replay_buffer = ReplayBuffer(self.buffer_size) - - with tf.variable_scope("input", reuse=False): - # Observation normalization. 
- if self.normalize_observations: - with tf.variable_scope('obs_rms'): - self.obs_rms = RunningMeanStd(shape=self.observation_space.shape) - else: - self.obs_rms = None - - # Return normalization. - if self.normalize_returns: - with tf.variable_scope('ret_rms'): - self.ret_rms = RunningMeanStd() - else: - self.ret_rms = None - - self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None, - **self.policy_kwargs) - - # Create target networks. - self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None, - **self.policy_kwargs) - self.obs_target = self.target_policy.obs_ph - self.action_target = self.target_policy.action_ph - - normalized_obs = tf.clip_by_value(normalize(self.policy_tf.processed_obs, self.obs_rms), - self.observation_range[0], self.observation_range[1]) - normalized_next_obs = tf.clip_by_value(normalize(self.target_policy.processed_obs, self.obs_rms), - self.observation_range[0], self.observation_range[1]) - - if self.param_noise is not None: - # Configure perturbed actor. - self.param_noise_actor = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, - None, **self.policy_kwargs) - self.obs_noise = self.param_noise_actor.obs_ph - self.action_noise_ph = self.param_noise_actor.action_ph - - # Configure separate copy for stddev adoption. - self.adaptive_param_noise_actor = self.policy(self.sess, self.observation_space, - self.action_space, 1, 1, None, - **self.policy_kwargs) - self.obs_adapt_noise = self.adaptive_param_noise_actor.obs_ph - self.action_adapt_noise = self.adaptive_param_noise_actor.action_ph - - # Inputs. 
- self.obs_train = self.policy_tf.obs_ph - self.action_train_ph = self.policy_tf.action_ph - self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') - self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') - self.actions = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions') - self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') - self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') - - # Create networks and core TF parts that are shared across setup parts. - with tf.variable_scope("model", reuse=False): - self.actor_tf = self.policy_tf.make_actor(normalized_obs) - self.normalized_critic_tf = self.policy_tf.make_critic(normalized_obs, self.actions) - self.normalized_critic_with_actor_tf = self.policy_tf.make_critic(normalized_obs, - self.actor_tf, - reuse=True) - # Noise setup - if self.param_noise is not None: - self._setup_param_noise(normalized_obs) - - with tf.variable_scope("target", reuse=False): - critic_target = self.target_policy.make_critic(normalized_next_obs, - self.target_policy.make_actor(normalized_next_obs)) - - with tf.variable_scope("loss", reuse=False): - self.critic_tf = denormalize( - tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), - self.ret_rms) - - self.critic_with_actor_tf = denormalize( - tf.clip_by_value(self.normalized_critic_with_actor_tf, - self.return_range[0], self.return_range[1]), - self.ret_rms) - - q_next_obs = denormalize(critic_target, self.ret_rms) - self.target_q = self.rewards + (1. - self.terminals_ph) * self.gamma * q_next_obs - - tf.summary.scalar('critic_target', tf.reduce_mean(self.critic_target)) - if self.full_tensorboard_log: - tf.summary.histogram('critic_target', self.critic_target) - - # Set up parts. 
- if self.normalize_returns and self.enable_popart: - self._setup_popart() - self._setup_stats() - self._setup_target_network_updates() - - with tf.variable_scope("input_info", reuse=False): - tf.summary.scalar('rewards', tf.reduce_mean(self.rewards)) - tf.summary.scalar('param_noise_stddev', tf.reduce_mean(self.param_noise_stddev)) - - if self.full_tensorboard_log: - tf.summary.histogram('rewards', self.rewards) - tf.summary.histogram('param_noise_stddev', self.param_noise_stddev) - if len(self.observation_space.shape) == 3 and self.observation_space.shape[0] in [1, 3, 4]: - tf.summary.image('observation', self.obs_train) - else: - tf.summary.histogram('observation', self.obs_train) - - with tf.variable_scope("Adam_mpi", reuse=False): - self._setup_actor_optimizer() - self._setup_critic_optimizer() - tf.summary.scalar('actor_loss', self.actor_loss) - tf.summary.scalar('critic_loss', self.critic_loss) - - self.params = tf_util.get_trainable_vars("model") \ - + tf_util.get_trainable_vars('noise/') + tf_util.get_trainable_vars('noise_adapt/') - - self.target_params = tf_util.get_trainable_vars("target") - self.obs_rms_params = [var for var in tf.global_variables() - if "obs_rms" in var.name] - self.ret_rms_params = [var for var in tf.global_variables() - if "ret_rms" in var.name] - - with self.sess.as_default(): - self._initialize(self.sess) - - self.summary = tf.summary.merge_all() - - def _setup_target_network_updates(self): - """ - set the target update operations - """ - init_updates, soft_updates = get_target_updates(tf_util.get_trainable_vars('model/'), - tf_util.get_trainable_vars('target/'), self.tau, - self.verbose) - self.target_init_updates = init_updates - self.target_soft_updates = soft_updates - - def _setup_param_noise(self, normalized_obs): - """ - Setup the parameter noise operations - - :param normalized_obs: (TensorFlow Tensor) the normalized observation - """ - assert self.param_noise is not None - - with tf.variable_scope("noise", reuse=False): - 
self.perturbed_actor_tf = self.param_noise_actor.make_actor(normalized_obs) - - with tf.variable_scope("noise_adapt", reuse=False): - adaptive_actor_tf = self.adaptive_param_noise_actor.make_actor(normalized_obs) - - with tf.variable_scope("noise_update_func", reuse=False): - if self.verbose >= 2: - logger.info('setting up param noise') - self.perturb_policy_ops = get_perturbed_actor_updates('model/pi/', 'noise/pi/', self.param_noise_stddev, - verbose=self.verbose) - - self.perturb_adaptive_policy_ops = get_perturbed_actor_updates('model/pi/', 'noise_adapt/pi/', - self.param_noise_stddev, - verbose=self.verbose) - self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) - - def _setup_actor_optimizer(self): - """ - setup the optimizer for the actor - """ - if self.verbose >= 2: - logger.info('setting up actor optimizer') - self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) - actor_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/pi/')] - actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) - if self.verbose >= 2: - logger.info(' actor shapes: {}'.format(actor_shapes)) - logger.info(' actor params: {}'.format(actor_nb_params)) - self.actor_grads = tf_util.flatgrad(self.actor_loss, tf_util.get_trainable_vars('model/pi/'), - clip_norm=self.clip_norm) - self.actor_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/pi/'), beta1=0.9, beta2=0.999, - epsilon=1e-08) - - def _setup_critic_optimizer(self): - """ - setup the optimizer for the critic - """ - if self.verbose >= 2: - logger.info('setting up critic optimizer') - normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), - self.return_range[0], self.return_range[1]) - self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) - if self.critic_l2_reg > 0.: - critic_reg_vars = [var for var in 
tf_util.get_trainable_vars('model/qf/') - if 'bias' not in var.name and 'qf_output' not in var.name and 'b' not in var.name] - if self.verbose >= 2: - for var in critic_reg_vars: - logger.info(' regularizing: {}'.format(var.name)) - logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) - critic_reg = tc.layers.apply_regularization( - tc.layers.l2_regularizer(self.critic_l2_reg), - weights_list=critic_reg_vars - ) - self.critic_loss += critic_reg - critic_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/qf/')] - critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) - if self.verbose >= 2: - logger.info(' critic shapes: {}'.format(critic_shapes)) - logger.info(' critic params: {}'.format(critic_nb_params)) - self.critic_grads = tf_util.flatgrad(self.critic_loss, tf_util.get_trainable_vars('model/qf/'), - clip_norm=self.clip_norm) - self.critic_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/qf/'), beta1=0.9, beta2=0.999, - epsilon=1e-08) - - def _setup_popart(self): - """ - setup pop-art normalization of the critic output - - See https://arxiv.org/pdf/1602.07714.pdf for details. - Preserving Outputs Precisely, while Adaptively Rescaling Targets”. 
- """ - self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') - new_std = self.ret_rms.std - self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') - new_mean = self.ret_rms.mean - - self.renormalize_q_outputs_op = [] - for out_vars in [[var for var in tf_util.get_trainable_vars('model/qf/') if 'qf_output' in var.name], - [var for var in tf_util.get_trainable_vars('target/qf/') if 'qf_output' in var.name]]: - assert len(out_vars) == 2 - # wieght and bias of the last layer - weight, bias = out_vars - assert 'kernel' in weight.name - assert 'bias' in bias.name - assert weight.get_shape()[-1] == 1 - assert bias.get_shape()[-1] == 1 - self.renormalize_q_outputs_op += [weight.assign(weight * self.old_std / new_std)] - self.renormalize_q_outputs_op += [bias.assign((bias * self.old_std + self.old_mean - new_mean) / new_std)] - - def _setup_stats(self): - """ - Setup the stat logger for DDPG. - """ - ops = [ - tf.reduce_mean(self.critic_tf), - reduce_std(self.critic_tf), - tf.reduce_mean(self.critic_with_actor_tf), - reduce_std(self.critic_with_actor_tf), - tf.reduce_mean(self.actor_tf), - reduce_std(self.actor_tf) - ] - names = [ - 'reference_Q_mean', - 'reference_Q_std', - 'reference_actor_Q_mean', - 'reference_actor_Q_std', - 'reference_action_mean', - 'reference_action_std' - ] - - if self.normalize_returns: - ops += [self.ret_rms.mean, self.ret_rms.std] - names += ['ret_rms_mean', 'ret_rms_std'] - - if self.normalize_observations: - ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)] - names += ['obs_rms_mean', 'obs_rms_std'] - - if self.param_noise: - ops += [tf.reduce_mean(self.perturbed_actor_tf), reduce_std(self.perturbed_actor_tf)] - names += ['reference_perturbed_action_mean', 'reference_perturbed_action_std'] - - self.stats_ops = ops - self.stats_names = names - - def _policy(self, obs, apply_noise=True, compute_q=True): - """ - Get the actions and critic output, from a given observation - - :param obs: 
([float] or [int]) the observation - :param apply_noise: (bool) enable the noise - :param compute_q: (bool) compute the critic output - :return: ([float], float) the action and critic value - """ - obs = np.array(obs).reshape((-1,) + self.observation_space.shape) - feed_dict = {self.obs_train: obs} - if self.param_noise is not None and apply_noise: - actor_tf = self.perturbed_actor_tf - feed_dict[self.obs_noise] = obs - else: - actor_tf = self.actor_tf - - if compute_q: - action, q_value = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) - else: - action = self.sess.run(actor_tf, feed_dict=feed_dict) - q_value = None - - action = action.flatten() - if self.action_noise is not None and apply_noise: - noise = self.action_noise() - action += noise - action = np.clip(action, -1, 1) - return action, q_value - - def _store_transition(self, obs, action, reward, next_obs, done, info): - """ - Store a transition in the replay buffer - - :param obs: ([float] or [int]) the last observation - :param action: ([float]) the action - :param reward: (float] the reward - :param next_obs: ([float] or [int]) the current observation - :param done: (bool) Whether the episode is over - :param info: (dict) extra values used to compute reward when using HER - """ - reward *= self.reward_scale - self.replay_buffer_add(obs, action, reward, next_obs, done, info) - if self.normalize_observations: - self.obs_rms.update(np.array([obs])) - - def _train_step(self, step, writer, log=False): - """ - run a step of training from batch - - :param step: (int) the current step iteration - :param writer: (TensorFlow Summary.writer) the writer for tensorboard - :param log: (bool) whether or not to log to metadata - :return: (float, float) critic loss, actor loss - """ - # Get a batch - obs, actions, rewards, next_obs, terminals = self.replay_buffer.sample(batch_size=self.batch_size, - env=self._vec_normalize_env) - # Reshape to match previous behavior and placeholder shape - 
rewards = rewards.reshape(-1, 1) - terminals = terminals.reshape(-1, 1) - - if self.normalize_returns and self.enable_popart: - old_mean, old_std, target_q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_q], - feed_dict={ - self.obs_target: next_obs, - self.rewards: rewards, - self.terminals_ph: terminals - }) - self.ret_rms.update(target_q.flatten()) - self.sess.run(self.renormalize_q_outputs_op, feed_dict={ - self.old_std: np.array([old_std]), - self.old_mean: np.array([old_mean]), - }) - - else: - target_q = self.sess.run(self.target_q, feed_dict={ - self.obs_target: next_obs, - self.rewards: rewards, - self.terminals_ph: terminals - }) - - # Get all gradients and perform a synced update. - ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss] - td_map = { - self.obs_train: obs, - self.actions: actions, - self.action_train_ph: actions, - self.rewards: rewards, - self.critic_target: target_q, - self.param_noise_stddev: 0 if self.param_noise is None else self.param_noise.current_stddev - } - if writer is not None: - # run loss backprop with summary if the step_id was not already logged (can happen with the right - # parameters as the step value is only an estimate) - if self.full_tensorboard_log and log and step not in self.tb_seen_steps: - run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - run_metadata = tf.RunMetadata() - summary, actor_grads, actor_loss, critic_grads, critic_loss = \ - self.sess.run([self.summary] + ops, td_map, options=run_options, run_metadata=run_metadata) - - writer.add_run_metadata(run_metadata, 'step%d' % step) - self.tb_seen_steps.append(step) - else: - summary, actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run([self.summary] + ops, - td_map) - writer.add_summary(summary, step) - else: - actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, td_map) - - self.actor_optimizer.update(actor_grads, learning_rate=self.actor_lr) - 
self.critic_optimizer.update(critic_grads, learning_rate=self.critic_lr) - - return critic_loss, actor_loss - - def _initialize(self, sess): - """ - initialize the model parameters and optimizers - - :param sess: (TensorFlow Session) the current TensorFlow session - """ - self.sess = sess - self.sess.run(tf.global_variables_initializer()) - self.actor_optimizer.sync() - self.critic_optimizer.sync() - self.sess.run(self.target_init_updates) - - def _update_target_net(self): - """ - run target soft update operation - """ - self.sess.run(self.target_soft_updates) - - def _get_stats(self): - """ - Get the mean and standard deviation of the model's inputs and outputs - - :return: (dict) the means and stds - """ - if self.stats_sample is None: - # Get a sample and keep that fixed for all further computations. - # This allows us to estimate the change in value for the same set of inputs. - obs, actions, rewards, next_obs, terminals = self.replay_buffer.sample(batch_size=self.batch_size, - env=self._vec_normalize_env) - self.stats_sample = { - 'obs': obs, - 'actions': actions, - 'rewards': rewards, - 'next_obs': next_obs, - 'terminals': terminals - } - - feed_dict = { - self.actions: self.stats_sample['actions'] - } - - for placeholder in [self.action_train_ph, self.action_target, self.action_adapt_noise, self.action_noise_ph]: - if placeholder is not None: - feed_dict[placeholder] = self.stats_sample['actions'] - - for placeholder in [self.obs_train, self.obs_target, self.obs_adapt_noise, self.obs_noise]: - if placeholder is not None: - feed_dict[placeholder] = self.stats_sample['obs'] - - values = self.sess.run(self.stats_ops, feed_dict=feed_dict) - - names = self.stats_names[:] - assert len(names) == len(values) - stats = dict(zip(names, values)) - - if self.param_noise is not None: - stats = {**stats, **self.param_noise.get_stats()} - - return stats - - def _adapt_param_noise(self): - """ - calculate the adaptation for the parameter noise - - :return: (float) the mean 
distance for the parameter noise - """ - if self.param_noise is None: - return 0. - - # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. - obs, *_ = self.replay_buffer.sample(batch_size=self.batch_size, env=self._vec_normalize_env) - self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={ - self.param_noise_stddev: self.param_noise.current_stddev, - }) - distance = self.sess.run(self.adaptive_policy_distance, feed_dict={ - self.obs_adapt_noise: obs, self.obs_train: obs, - self.param_noise_stddev: self.param_noise.current_stddev, - }) - - mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() - self.param_noise.adapt(mean_distance) - return mean_distance - - def _reset(self): - """ - Reset internal state after an episode is complete. - """ - if self.action_noise is not None: - self.action_noise.reset() - if self.param_noise is not None: - self.sess.run(self.perturb_policy_ops, feed_dict={ - self.param_noise_stddev: self.param_noise.current_stddev, - }) - - def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DDPG", - reset_num_timesteps=True, replay_wrapper=None): - - new_tb_log = self._init_num_timesteps(reset_num_timesteps) - callback = self._init_callback(callback) - - if replay_wrapper is not None: - self.replay_buffer = replay_wrapper(self.replay_buffer) - - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ - as writer: - self._setup_learn() - - # a list for tensorboard logging, to prevent logging with the same step number, if it already occured - self.tb_seen_steps = [] - - rank = MPI.COMM_WORLD.Get_rank() - - if self.verbose >= 2: - logger.log('Using agent with the following configuration:') - logger.log(str(self.__dict__.items())) - - eval_episode_rewards_history = deque(maxlen=100) - episode_rewards_history = deque(maxlen=100) - episode_successes = [] - - with self.sess.as_default(), 
self.graph.as_default(): - # Prepare everything. - self._reset() - obs = self.env.reset() - # Retrieve unnormalized observation for saving into the buffer - if self._vec_normalize_env is not None: - obs_ = self._vec_normalize_env.get_original_obs().squeeze() - eval_obs = None - if self.eval_env is not None: - eval_obs = self.eval_env.reset() - episode_reward = 0. - episode_step = 0 - episodes = 0 - step = 0 - total_steps = 0 - - start_time = time.time() - - epoch_episode_rewards = [] - epoch_episode_steps = [] - epoch_actor_losses = [] - epoch_critic_losses = [] - epoch_adaptive_distances = [] - eval_episode_rewards = [] - eval_qs = [] - epoch_actions = [] - epoch_qs = [] - epoch_episodes = 0 - epoch = 0 - - callback.on_training_start(locals(), globals()) - - while True: - for _ in range(log_interval): - callback.on_rollout_start() - # Perform rollouts. - for _ in range(self.nb_rollout_steps): - - if total_steps >= total_timesteps: - callback.on_training_end() - return self - - # Predict next action. - action, q_value = self._policy(obs, apply_noise=True, compute_q=True) - assert action.shape == self.env.action_space.shape - - # Execute next action. 
- if rank == 0 and self.render: - self.env.render() - - # Randomly sample actions from a uniform distribution - # with a probability self.random_exploration (used in HER + DDPG) - if np.random.rand() < self.random_exploration: - # actions sampled from action space are from range specific to the environment - # but algorithm operates on tanh-squashed actions therefore simple scaling is used - unscaled_action = self.action_space.sample() - action = scale_action(self.action_space, unscaled_action) - else: - # inferred actions need to be transformed to environment action_space before stepping - unscaled_action = unscale_action(self.action_space, action) - - new_obs, reward, done, info = self.env.step(unscaled_action) - - self.num_timesteps += 1 - callback.update_locals(locals()) - if callback.on_step() is False: - callback.on_training_end() - return self - - step += 1 - total_steps += 1 - if rank == 0 and self.render: - self.env.render() - - # Book-keeping. - epoch_actions.append(action) - epoch_qs.append(q_value) - - # Store only the unnormalized version - if self._vec_normalize_env is not None: - new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() - reward_ = self._vec_normalize_env.get_original_reward().squeeze() - else: - # Avoid changing the original ones - obs_, new_obs_, reward_ = obs, new_obs, reward - - self._store_transition(obs_, action, reward_, new_obs_, done, info) - obs = new_obs - # Save the unnormalized observation - if self._vec_normalize_env is not None: - obs_ = new_obs_ - - episode_reward += reward_ - episode_step += 1 - - if writer is not None: - ep_rew = np.array([reward_]).reshape((1, -1)) - ep_done = np.array([done]).reshape((1, -1)) - tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, - writer, self.num_timesteps) - - if done: - # Episode done. - epoch_episode_rewards.append(episode_reward) - episode_rewards_history.append(episode_reward) - epoch_episode_steps.append(episode_step) - episode_reward = 0. 
- episode_step = 0 - epoch_episodes += 1 - episodes += 1 - - maybe_is_success = info.get('is_success') - if maybe_is_success is not None: - episode_successes.append(float(maybe_is_success)) - - self._reset() - if not isinstance(self.env, VecEnv): - obs = self.env.reset() - - callback.on_rollout_end() - # Train. - epoch_actor_losses = [] - epoch_critic_losses = [] - epoch_adaptive_distances = [] - for t_train in range(self.nb_train_steps): - # Not enough samples in the replay buffer - if not self.replay_buffer.can_sample(self.batch_size): - break - - # Adapt param noise, if necessary. - if len(self.replay_buffer) >= self.batch_size and \ - t_train % self.param_noise_adaption_interval == 0: - distance = self._adapt_param_noise() - epoch_adaptive_distances.append(distance) - - # weird equation to deal with the fact the nb_train_steps will be different - # to nb_rollout_steps - step = (int(t_train * (self.nb_rollout_steps / self.nb_train_steps)) + - self.num_timesteps - self.nb_rollout_steps) - - critic_loss, actor_loss = self._train_step(step, writer, log=t_train == 0) - epoch_critic_losses.append(critic_loss) - epoch_actor_losses.append(actor_loss) - self._update_target_net() - - # Evaluate. - eval_episode_rewards = [] - eval_qs = [] - if self.eval_env is not None: - eval_episode_reward = 0. - for _ in range(self.nb_eval_steps): - if total_steps >= total_timesteps: - return self - - eval_action, eval_q = self._policy(eval_obs, apply_noise=False, compute_q=True) - unscaled_action = unscale_action(self.action_space, eval_action) - eval_obs, eval_r, eval_done, _ = self.eval_env.step(unscaled_action) - if self.render_eval: - self.eval_env.render() - eval_episode_reward += eval_r - - eval_qs.append(eval_q) - if eval_done: - if not isinstance(self.env, VecEnv): - eval_obs = self.eval_env.reset() - eval_episode_rewards.append(eval_episode_reward) - eval_episode_rewards_history.append(eval_episode_reward) - eval_episode_reward = 0. 
- - mpi_size = MPI.COMM_WORLD.Get_size() - - # Not enough samples in the replay buffer - if not self.replay_buffer.can_sample(self.batch_size): - continue - - # Log stats. - # XXX shouldn't call np.mean on variable length lists - duration = time.time() - start_time - stats = self._get_stats() - combined_stats = stats.copy() - combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) - combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) - combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) - combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) - combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) - combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) - combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) - if len(epoch_adaptive_distances) != 0: - combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) - combined_stats['total/duration'] = duration - combined_stats['total/steps_per_second'] = float(step) / float(duration) - combined_stats['total/episodes'] = episodes - combined_stats['rollout/episodes'] = epoch_episodes - combined_stats['rollout/actions_std'] = np.std(epoch_actions) - # Evaluation statistics. 
- if self.eval_env is not None: - combined_stats['eval/return'] = np.mean(eval_episode_rewards) - combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) - combined_stats['eval/Q'] = np.mean(eval_qs) - combined_stats['eval/episodes'] = len(eval_episode_rewards) - - def as_scalar(scalar): - """ - check and return the input if it is a scalar, otherwise raise ValueError - - :param scalar: (Any) the object to check - :return: (Number) the scalar if x is a scalar - """ - if isinstance(scalar, np.ndarray): - assert scalar.size == 1 - return scalar[0] - elif np.isscalar(scalar): - return scalar - else: - raise ValueError('expected scalar, got %s' % scalar) - - combined_stats_sums = MPI.COMM_WORLD.allreduce( - np.array([as_scalar(x) for x in combined_stats.values()])) - combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)} - - # Total statistics. - combined_stats['total/epochs'] = epoch + 1 - combined_stats['total/steps'] = step - - for key in sorted(combined_stats.keys()): - logger.record_tabular(key, combined_stats[key]) - if len(episode_successes) > 0: - logger.logkv("success rate", np.mean(episode_successes[-100:])) - logger.dump_tabular() - logger.info('') - logdir = logger.get_dir() - if rank == 0 and logdir: - if hasattr(self.env, 'get_state'): - with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as file_handler: - pickle.dump(self.env.get_state(), file_handler) - if self.eval_env and hasattr(self.eval_env, 'get_state'): - with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as file_handler: - pickle.dump(self.eval_env.get_state(), file_handler) - - def predict(self, observation, state=None, mask=None, deterministic=True): - observation = np.array(observation) - vectorized_env = self._is_vectorized_observation(observation, self.observation_space) - - observation = observation.reshape((-1,) + self.observation_space.shape) - actions, _, = self._policy(observation, apply_noise=not 
deterministic, compute_q=False) - actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape - actions = unscale_action(self.action_space, actions) # scale the output for the prediction - - if not vectorized_env: - actions = actions[0] - - return actions, None - - def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): - _ = np.array(observation) - - if actions is not None: - raise ValueError("Error: DDPG does not have action probabilities.") - - # here there are no action probabilities, as DDPG does not use a probability distribution - warnings.warn("Warning: action probability is meaningless for DDPG. Returning None") - return None - - def get_parameter_list(self): - return (self.params + - self.target_params + - self.obs_rms_params + - self.ret_rms_params) - - def save(self, save_path, cloudpickle=False): - data = { - "observation_space": self.observation_space, - "action_space": self.action_space, - "nb_eval_steps": self.nb_eval_steps, - "param_noise_adaption_interval": self.param_noise_adaption_interval, - "nb_train_steps": self.nb_train_steps, - "nb_rollout_steps": self.nb_rollout_steps, - "verbose": self.verbose, - "param_noise": self.param_noise, - "action_noise": self.action_noise, - "gamma": self.gamma, - "tau": self.tau, - "normalize_returns": self.normalize_returns, - "enable_popart": self.enable_popart, - "normalize_observations": self.normalize_observations, - "batch_size": self.batch_size, - "observation_range": self.observation_range, - "return_range": self.return_range, - "critic_l2_reg": self.critic_l2_reg, - "actor_lr": self.actor_lr, - "critic_lr": self.critic_lr, - "clip_norm": self.clip_norm, - "reward_scale": self.reward_scale, - "memory_limit": self.memory_limit, - "buffer_size": self.buffer_size, - "random_exploration": self.random_exploration, - "policy": self.policy, - "n_envs": self.n_envs, - "n_cpu_tf_sess": self.n_cpu_tf_sess, - "seed": self.seed, - 
"_vectorize_action": self._vectorize_action, - "policy_kwargs": self.policy_kwargs - } - - params_to_save = self.get_parameters() - - self._save_to_file(save_path, - data=data, - params=params_to_save, - cloudpickle=cloudpickle) - - @classmethod - def load(cls, load_path, env=None, custom_objects=None, **kwargs): - data, params = cls._load_from_file(load_path, custom_objects=custom_objects) - - if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data['policy_kwargs']: - raise ValueError("The specified policy kwargs do not equal the stored policy kwargs. " - "Stored kwargs: {}, specified kwargs: {}".format(data['policy_kwargs'], - kwargs['policy_kwargs'])) - - model = cls(None, env, _init_setup_model=False) - model.__dict__.update(data) - model.__dict__.update(kwargs) - model.set_env(env) - model.setup_model() - # Patch for version < v2.6.0, duplicated keys where saved - if len(params) > len(model.get_parameter_list()): - n_params = len(model.params) - n_target_params = len(model.target_params) - n_normalisation_params = len(model.obs_rms_params) + len(model.ret_rms_params) - # Check that the issue is the one from - # https://github.com/hill-a/stable-baselines/issues/363 - assert len(params) == 2 * (n_params + n_target_params) + n_normalisation_params,\ - "The number of parameter saved differs from the number of parameters"\ - " that should be loaded: {}!={}".format(len(params), len(model.get_parameter_list())) - # Remove duplicates - params_ = params[:n_params + n_target_params] - if n_normalisation_params > 0: - params_ += params[-n_normalisation_params:] - params = params_ - model.load_parameters(params) - - return model diff --git a/stable_baselines/ddpg/main.py b/stable_baselines/ddpg/main.py deleted file mode 100644 index 3e123278..00000000 --- a/stable_baselines/ddpg/main.py +++ /dev/null @@ -1,139 +0,0 @@ -import argparse -import time -import os - -import gym -import tensorflow as tf -import numpy as np -from mpi4py import MPI - -from 
stable_baselines import logger, bench -from stable_baselines.common.misc_util import set_global_seeds, boolean_flag -from stable_baselines.ddpg.policies import MlpPolicy, LnMlpPolicy -from stable_baselines.ddpg import DDPG -from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec, OrnsteinUhlenbeckActionNoise, NormalActionNoise - - -def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): - """ - run the training of DDPG - - :param env_id: (str) the environment ID - :param seed: (int) the initial random seed - :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'), can use multiple noise type by - seperating them with commas - :param layer_norm: (bool) use layer normalization - :param evaluation: (bool) enable evaluation of DDPG training - :param kwargs: (dict) extra keywords for the training.train function - """ - - # Configure things. - rank = MPI.COMM_WORLD.Get_rank() - if rank != 0: - logger.set_level(logger.DISABLED) - - # Create envs. - env = gym.make(env_id) - env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) - - if evaluation and rank == 0: - eval_env = gym.make(env_id) - eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval')) - env = bench.Monitor(env, None) - else: - eval_env = None - - # Parse noise_type - action_noise = None - param_noise = None - nb_actions = env.action_space.shape[-1] - for current_noise_type in noise_type.split(','): - current_noise_type = current_noise_type.strip() - if current_noise_type == 'none': - pass - elif 'adaptive-param' in current_noise_type: - _, stddev = current_noise_type.split('_') - param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) - elif 'normal' in current_noise_type: - _, stddev = current_noise_type.split('_') - action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) - elif 'ou' in current_noise_type: - _, stddev 
= current_noise_type.split('_') - action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions), - sigma=float(stddev) * np.ones(nb_actions)) - else: - raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) - - # Seed everything to make things reproducible. - seed = seed + 1000000 * rank - logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) - tf.reset_default_graph() - set_global_seeds(seed) - env.seed(seed) - if eval_env is not None: - eval_env.seed(seed) - - # Disable logging for rank != 0 to avoid noise. - start_time = 0 - if rank == 0: - start_time = time.time() - - if layer_norm: - policy = LnMlpPolicy - else: - policy = MlpPolicy - - num_timesteps = kwargs['num_timesteps'] - del kwargs['num_timesteps'] - - model = DDPG(policy=policy, env=env, eval_env=eval_env, param_noise=param_noise, - action_noise=action_noise, buffer_size=int(1e6), verbose=2, **kwargs) - model.learn(total_timesteps=num_timesteps) - env.close() - if eval_env is not None: - eval_env.close() - if rank == 0: - logger.info('total runtime: {}s'.format(time.time() - start_time)) - - -def parse_args(): - """ - parse the arguments for DDPG training - - :return: (dict) the arguments - """ - parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument('--env-id', type=str, default='HalfCheetah-v1') - boolean_flag(parser, 'render-eval', default=False) - boolean_flag(parser, 'layer-norm', default=True) - boolean_flag(parser, 'render', default=False) - boolean_flag(parser, 'normalize-returns', default=False) - boolean_flag(parser, 'normalize-observations', default=True) - parser.add_argument('--seed', help='RNG seed', type=int, default=0) - parser.add_argument('--critic-l2-reg', type=float, default=1e-2) - parser.add_argument('--batch-size', type=int, default=64) # per MPI worker - parser.add_argument('--actor-lr', type=float, default=1e-4) - parser.add_argument('--critic-lr', type=float, 
default=1e-3) - boolean_flag(parser, 'enable-popart', default=False) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--reward-scale', type=float, default=1.) - parser.add_argument('--clip-norm', type=float, default=None) - parser.add_argument('--nb-train-steps', type=int, default=50) # per epoch cycle and MPI worker - parser.add_argument('--nb-eval-steps', type=int, default=100) # per epoch cycle and MPI worker - parser.add_argument('--nb-rollout-steps', type=int, default=100) # per epoch cycle and MPI worker - # choices are adaptive-param_xx, ou_xx, normal_xx, none - parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') - parser.add_argument('--num-timesteps', type=int, default=int(1e6)) - boolean_flag(parser, 'evaluation', default=False) - args = parser.parse_args() - dict_args = vars(args) - return dict_args - - -if __name__ == '__main__': - args = parse_args() - if MPI.COMM_WORLD.Get_rank() == 0: - logger.configure() - # Run actual script. 
- run(**args) diff --git a/stable_baselines/ddpg/noise.py b/stable_baselines/ddpg/noise.py deleted file mode 100644 index 181cad37..00000000 --- a/stable_baselines/ddpg/noise.py +++ /dev/null @@ -1 +0,0 @@ -from stable_baselines.common.noise import NormalActionNoise, AdaptiveParamNoiseSpec, OrnsteinUhlenbeckActionNoise # pylint: disable=unused-import diff --git a/stable_baselines/ddpg/policies.py b/stable_baselines/ddpg/policies.py deleted file mode 100644 index 19ac6463..00000000 --- a/stable_baselines/ddpg/policies.py +++ /dev/null @@ -1,262 +0,0 @@ -import tensorflow as tf -from gym.spaces import Box - -from stable_baselines.common.policies import BasePolicy, nature_cnn, register_policy - - -class DDPGPolicy(BasePolicy): - """ - Policy object that implements a DDPG-like actor critic - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param scale: (bool) whether or not to scale the input - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, scale=False): - super(DDPGPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale, - add_action_ph=True) - assert isinstance(ac_space, Box), "Error: the action space must be of type gym.spaces.Box" - self.qvalue_fn = None - self.policy = None - - def make_actor(self, obs=None, reuse=False, scope="pi"): - """ - creates an actor object - - :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder) - :param reuse: (bool) whether or not to reuse parameters - :param scope: (str) the scope name of the actor - 
:return: (TensorFlow Tensor) the output tensor - """ - raise NotImplementedError - - def make_critic(self, obs=None, action=None, reuse=False, scope="qf"): - """ - creates a critic object - - :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder) - :param action: (TensorFlow Tensor) The action placeholder (can be None for default placeholder) - :param reuse: (bool) whether or not to reuse parameters - :param scope: (str) the scope name of the critic - :return: (TensorFlow Tensor) the output tensor - """ - raise NotImplementedError - - def step(self, obs, state=None, mask=None): - """ - Returns the policy for a single step - - :param obs: ([float] or [int]) The current observation of the environment - :param state: ([float]) The last states (used in recurrent policies) - :param mask: ([float]) The last masks (used in recurrent policies) - :return: ([float]) actions - """ - raise NotImplementedError - - def proba_step(self, obs, state=None, mask=None): - """ - Returns the action probability for a single step - - :param obs: ([float] or [int]) The current observation of the environment - :param state: ([float]) The last states (used in recurrent policies) - :param mask: ([float]) The last masks (used in recurrent policies) - :return: ([float]) the action probability - """ - raise NotImplementedError - - def value(self, obs, action, state=None, mask=None): - """ - Returns the value for a single step - - :param obs: ([float] or [int]) The current observation of the environment - :param action: ([float] or [int]) The taken action - :param state: ([float]) The last states (used in recurrent policies) - :param mask: ([float]) The last masks (used in recurrent policies) - :return: ([float]) The associated value of the action - """ - raise NotImplementedError - - -class FeedForwardPolicy(DDPGPolicy): - """ - Policy object that implements a DDPG-like actor critic, using a feed forward neural network. 
- - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param layers: ([int]) The size of the Neural network for the policy (if None, default to [64, 64]) - :param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction - :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp") - :param layer_norm: (bool) enable layer normalisation - :param act_fun: (tf.func) the activation function to use in the neural network. - :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None, - cnn_extractor=nature_cnn, feature_extraction="cnn", - layer_norm=False, act_fun=tf.nn.relu, **kwargs): - super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, - scale=(feature_extraction == "cnn")) - - self._kwargs_check(feature_extraction, kwargs) - self.layer_norm = layer_norm - self.feature_extraction = feature_extraction - self.cnn_kwargs = kwargs - self.cnn_extractor = cnn_extractor - self.reuse = reuse - self._qvalue = None - if layers is None: - layers = [64, 64] - self.layers = layers - - assert len(layers) >= 1, "Error: must have at least one hidden layer for the policy." 
- - self.activ = act_fun - - def make_actor(self, obs=None, reuse=False, scope="pi"): - if obs is None: - obs = self.processed_obs - - with tf.variable_scope(scope, reuse=reuse): - if self.feature_extraction == "cnn": - pi_h = self.cnn_extractor(obs, **self.cnn_kwargs) - else: - pi_h = tf.layers.flatten(obs) - for i, layer_size in enumerate(self.layers): - pi_h = tf.layers.dense(pi_h, layer_size, name='fc' + str(i)) - if self.layer_norm: - pi_h = tf.contrib.layers.layer_norm(pi_h, center=True, scale=True) - pi_h = self.activ(pi_h) - self.policy = tf.nn.tanh(tf.layers.dense(pi_h, self.ac_space.shape[0], name=scope, - kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, - maxval=3e-3))) - return self.policy - - def make_critic(self, obs=None, action=None, reuse=False, scope="qf"): - if obs is None: - obs = self.processed_obs - if action is None: - action = self.action_ph - - with tf.variable_scope(scope, reuse=reuse): - if self.feature_extraction == "cnn": - qf_h = self.cnn_extractor(obs, **self.cnn_kwargs) - else: - qf_h = tf.layers.flatten(obs) - for i, layer_size in enumerate(self.layers): - qf_h = tf.layers.dense(qf_h, layer_size, name='fc' + str(i)) - if self.layer_norm: - qf_h = tf.contrib.layers.layer_norm(qf_h, center=True, scale=True) - qf_h = self.activ(qf_h) - if i == 0: - qf_h = tf.concat([qf_h, action], axis=-1) - - # the name attribute is used in pop-art normalization - qvalue_fn = tf.layers.dense(qf_h, 1, name='qf_output', - kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, - maxval=3e-3)) - - self.qvalue_fn = qvalue_fn - self._qvalue = qvalue_fn[:, 0] - return self.qvalue_fn - - def step(self, obs, state=None, mask=None): - return self.sess.run(self.policy, {self.obs_ph: obs}) - - def proba_step(self, obs, state=None, mask=None): - return self.sess.run(self.policy, {self.obs_ph: obs}) - - def value(self, obs, action, state=None, mask=None): - return self.sess.run(self._qvalue, {self.obs_ph: obs, self.action_ph: action}) - - 
-class CnnPolicy(FeedForwardPolicy): - """ - Policy object that implements actor critic, using a CNN (the nature CNN) - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs): - super(CnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, - feature_extraction="cnn", **_kwargs) - - -class LnCnnPolicy(FeedForwardPolicy): - """ - Policy object that implements actor critic, using a CNN (the nature CNN), with layer normalisation - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs): - super(LnCnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, - feature_extraction="cnn", layer_norm=True, **_kwargs) - - -class MlpPolicy(FeedForwardPolicy): - """ - Policy object that implements actor critic, using a MLP (2 layers of 64) - - :param sess: 
(TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs): - super(MlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, - feature_extraction="mlp", **_kwargs) - - -class LnMlpPolicy(FeedForwardPolicy): - """ - Policy object that implements actor critic, using a MLP (2 layers of 64), with layer normalisation - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs): - super(LnMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, - feature_extraction="mlp", layer_norm=True, **_kwargs) - - -register_policy("CnnPolicy", CnnPolicy) -register_policy("LnCnnPolicy", LnCnnPolicy) -register_policy("MlpPolicy", MlpPolicy) -register_policy("LnMlpPolicy", LnMlpPolicy) diff --git a/stable_baselines/deepq/__init__.py b/stable_baselines/deepq/__init__.py deleted file 
mode 100644 index eda0c1e0..00000000 --- a/stable_baselines/deepq/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from stable_baselines.deepq.policies import MlpPolicy, CnnPolicy, LnMlpPolicy, LnCnnPolicy -from stable_baselines.deepq.build_graph import build_act, build_train # noqa -from stable_baselines.deepq.dqn import DQN -from stable_baselines.common.buffers import ReplayBuffer, PrioritizedReplayBuffer # noqa - - -def wrap_atari_dqn(env): - """ - wrap the environment in atari wrappers for DQN - - :param env: (Gym Environment) the environment - :return: (Gym Environment) the wrapped environment - """ - from stable_baselines.common.atari_wrappers import wrap_deepmind - return wrap_deepmind(env, frame_stack=True, scale=False) diff --git a/stable_baselines/deepq/build_graph.py b/stable_baselines/deepq/build_graph.py deleted file mode 100644 index 51453ec6..00000000 --- a/stable_baselines/deepq/build_graph.py +++ /dev/null @@ -1,467 +0,0 @@ -"""Deep Q learning graph - -The functions in this file can are used to create the following functions: - -======= act ======== - - Function to chose an action given an observation - - :param observation: (Any) Observation that can be feed into the output of make_obs_ph - :param stochastic: (bool) if set to False all the actions are always deterministic (default False) - :param update_eps_ph: (float) update epsilon a new value, if negative not update happens (default: no update) - :return: (TensorFlow Tensor) tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for - every element of the batch. 
- - -======= act (in case of parameter noise) ======== - - Function to chose an action given an observation - - :param observation: (Any) Observation that can be feed into the output of make_obs_ph - :param stochastic: (bool) if set to False all the actions are always deterministic (default False) - :param update_eps_ph: (float) update epsilon a new value, if negative not update happens - (default: no update) - :param reset_ph: (bool) reset the perturbed policy by sampling a new perturbation - :param update_param_noise_threshold_ph: (float) the desired threshold for the difference between - non-perturbed and perturbed policy - :param update_param_noise_scale_ph: (bool) whether or not to update the scale of the noise for the next time it is - re-perturbed - :return: (TensorFlow Tensor) tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for - every element of the batch. - - -======= train ======= - - Function that takes a transition (s,a,r,s') and optimizes Bellman equation's error: - - td_error = Q(s,a) - (r + gamma * max_a' Q(s', a')) - loss = huber_loss[td_error] - - :param obs_t: (Any) a batch of observations - :param action: (numpy int) actions that were selected upon seeing obs_t. dtype must be int32 and shape must be - (batch_size,) - :param reward: (numpy float) immediate reward attained after executing those actions dtype must be float32 and - shape must be (batch_size,) - :param obs_tp1: (Any) observations that followed obs_t - :param done: (numpy bool) 1 if obs_t was the last observation in the episode and 0 otherwise obs_tp1 gets ignored, - but must be of the valid shape. dtype must be float32 and shape must be (batch_size,) - :param weight: (numpy float) imporance weights for every element of the batch (gradient is multiplied by the - importance weight) dtype must be float32 and shape must be (batch_size,) - :return: (numpy float) td_error: a list of differences between Q(s,a) and the target in Bellman's equation. 
- dtype is float32 and shape is (batch_size,) - -======= update_target ======== - - copy the parameters from optimized Q function to the target Q function. - In Q learning we actually optimize the following error: - - Q(s,a) - (r + gamma * max_a' Q'(s', a')) - - Where Q' is lagging behind Q to stablize the learning. For example for Atari - - Q' is set to Q once every 10000 updates training steps. - -""" -import tensorflow as tf -from gym.spaces import MultiDiscrete - -from stable_baselines.common import tf_util - - -def scope_vars(scope, trainable_only=False): - """ - Get variables inside a scope - The scope can be specified as a string - - :param scope: (str or VariableScope) scope in which the variables reside. - :param trainable_only: (bool) whether or not to return only the variables that were marked as trainable. - :return: ([TensorFlow Tensor]) vars: list of variables in `scope`. - """ - return tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES, - scope=scope if isinstance(scope, str) else scope.name - ) - - -def scope_name(): - """ - Returns the name of current scope as a string, e.g. deepq/q_func - - :return: (str) the name of current scope - """ - return tf.get_variable_scope().name - - -def absolute_scope_name(relative_scope_name): - """ - Appends parent scope name to `relative_scope_name` - - :return: (str) the absolute name of the scope - """ - return scope_name() + "/" + relative_scope_name - - -def default_param_noise_filter(var): - """ - check whether or not a variable is perturbable or not - - :param var: (TensorFlow Tensor) the variable - :return: (bool) can be perturb - """ - if var not in tf.trainable_variables(): - # We never perturb non-trainable vars. - return False - if "fully_connected" in var.name: - # We perturb fully-connected layers. 
- return True - - # The remaining layers are likely conv or layer norm layers, which we do not wish to - # perturb (in the former case because they only extract features, in the latter case because - # we use them for normalization purposes). If you change your network, you will likely want - # to re-consider which layers to perturb and which to keep untouched. - return False - - -def build_act(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess): - """ - Creates the act function: - - :param q_func: (DQNPolicy) the policy - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param stochastic_ph: (TensorFlow Tensor) the stochastic placeholder - :param update_eps_ph: (TensorFlow Tensor) the update_eps placeholder - :param sess: (TensorFlow session) The current TensorFlow session - :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor) - act function to select and action given observation (See the top of the file for details), - A tuple containing the observation placeholder and the processed observation placeholder respectively. 
- """ - eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) - - policy = q_func(sess, ob_space, ac_space, 1, 1, None) - obs_phs = (policy.obs_ph, policy.processed_obs) - deterministic_actions = tf.argmax(policy.q_values, axis=1) - - batch_size = tf.shape(policy.obs_ph)[0] - n_actions = ac_space.nvec if isinstance(ac_space, MultiDiscrete) else ac_space.n - random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=n_actions, dtype=tf.int64) - chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps - stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) - - output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) - update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) - _act = tf_util.function(inputs=[policy.obs_ph, stochastic_ph, update_eps_ph], - outputs=output_actions, - givens={update_eps_ph: -1.0, stochastic_ph: True}, - updates=[update_eps_expr]) - - def act(obs, stochastic=True, update_eps=-1): - return _act(obs, stochastic, update_eps) - - return act, obs_phs - - -def build_act_with_param_noise(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess, - param_noise_filter_func=None): - """ - Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905): - - :param q_func: (DQNPolicy) the policy - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param stochastic_ph: (TensorFlow Tensor) the stochastic placeholder - :param update_eps_ph: (TensorFlow Tensor) the update_eps placeholder - :param sess: (TensorFlow session) The current TensorFlow session - :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a - variable should be perturbed. Only applicable if param_noise is True. 
If set to None, default_param_noise_filter - is used by default. - :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor) - act function to select and action given observation (See the top of the file for details), - A tuple containing the observation placeholder and the processed observation placeholder respectively. - """ - if param_noise_filter_func is None: - param_noise_filter_func = default_param_noise_filter - - update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold") - update_param_noise_scale_ph = tf.placeholder(tf.bool, (), name="update_param_noise_scale") - reset_ph = tf.placeholder(tf.bool, (), name="reset") - - eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) - param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01), - trainable=False) - param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05), - trainable=False) - - # Unmodified Q. - policy = q_func(sess, ob_space, ac_space, 1, 1, None) - obs_phs = (policy.obs_ph, policy.processed_obs) - - # Perturbable Q used for the actual rollout. - with tf.variable_scope("perturbed_model", reuse=False): - perturbable_policy = q_func(sess, ob_space, ac_space, 1, 1, None, obs_phs=obs_phs) - - def perturb_vars(original_scope, perturbed_scope): - """ - We have to wrap this code into a function due to the way tf.cond() works. - - See https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for a more detailed - discussion. - - :param original_scope: (str or VariableScope) the original scope. - :param perturbed_scope: (str or VariableScope) the perturbed scope. 
- :return: (TensorFlow Operation) - """ - all_vars = scope_vars(absolute_scope_name(original_scope)) - all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope)) - assert len(all_vars) == len(all_perturbed_vars) - perturb_ops = [] - for var, perturbed_var in zip(all_vars, all_perturbed_vars): - if param_noise_filter_func(perturbed_var): - # Perturb this variable. - operation = tf.assign(perturbed_var, - var + tf.random_normal(shape=tf.shape(var), mean=0., - stddev=param_noise_scale)) - else: - # Do not perturb, just assign. - operation = tf.assign(perturbed_var, var) - perturb_ops.append(operation) - assert len(perturb_ops) == len(all_vars) - return tf.group(*perturb_ops) - - # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy - # of the network and measures the effect of that perturbation in action space. If the perturbation - # is too big, reduce scale of perturbation, otherwise increase. - with tf.variable_scope("adaptive_model", reuse=False): - adaptive_policy = q_func(sess, ob_space, ac_space, 1, 1, None, obs_phs=obs_phs) - perturb_for_adaption = perturb_vars(original_scope="model", perturbed_scope="adaptive_model/model") - kl_loss = tf.reduce_sum( - tf.nn.softmax(policy.q_values) * - (tf.log(tf.nn.softmax(policy.q_values)) - tf.log(tf.nn.softmax(adaptive_policy.q_values))), - axis=-1) - mean_kl = tf.reduce_mean(kl_loss) - - def update_scale(): - """ - update the scale expression - - :return: (TensorFlow Tensor) the updated scale expression - """ - with tf.control_dependencies([perturb_for_adaption]): - update_scale_expr = tf.cond(mean_kl < param_noise_threshold, - lambda: param_noise_scale.assign(param_noise_scale * 1.01), - lambda: param_noise_scale.assign(param_noise_scale / 1.01), - ) - return update_scale_expr - - # Functionality to update the threshold for parameter space noise. 
- update_param_noise_thres_expr = param_noise_threshold.assign( - tf.cond(update_param_noise_threshold_ph >= 0, lambda: update_param_noise_threshold_ph, - lambda: param_noise_threshold)) - - # Put everything together. - perturbed_deterministic_actions = tf.argmax(perturbable_policy.q_values, axis=1) - deterministic_actions = tf.argmax(policy.q_values, axis=1) - batch_size = tf.shape(policy.obs_ph)[0] - n_actions = ac_space.nvec if isinstance(ac_space, MultiDiscrete) else ac_space.n - random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=n_actions, dtype=tf.int64) - chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps - perturbed_stochastic_actions = tf.where(chose_random, random_actions, perturbed_deterministic_actions) - stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) - - perturbed_output_actions = tf.cond(stochastic_ph, lambda: perturbed_stochastic_actions, - lambda: deterministic_actions) - output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) - update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) - updates = [ - update_eps_expr, - tf.cond(reset_ph, lambda: perturb_vars(original_scope="model", perturbed_scope="perturbed_model/model"), - lambda: tf.group(*[])), - tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)), - update_param_noise_thres_expr, - ] - - _act = tf_util.function(inputs=[policy.obs_ph, stochastic_ph, update_eps_ph], - outputs=output_actions, - givens={update_eps_ph: -1.0, stochastic_ph: True}, - updates=[update_eps_expr]) - - _perturbed_act = tf_util.function( - inputs=[policy.obs_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, - update_param_noise_scale_ph], - outputs=perturbed_output_actions, - givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, 
update_param_noise_threshold_ph: False, - update_param_noise_scale_ph: False}, - updates=updates) - - def act(obs, reset=None, update_param_noise_threshold=None, update_param_noise_scale=None, stochastic=True, - update_eps=-1): - """ - get the action from the current observation - - :param obs: (Any) Observation that can be feed into the output of make_obs_ph - :param reset: (bool) reset the perturbed policy by sampling a new perturbation - :param update_param_noise_threshold: (float) the desired threshold for the difference between - non-perturbed and perturbed policy - :param update_param_noise_scale: (bool) whether or not to update the scale of the noise for the next time - it is re-perturbed - :param stochastic: (bool) if set to False all the actions are always deterministic (default False) - :param update_eps: (float) update epsilon a new value, if negative not update happens - (default: no update) - :return: (TensorFlow Tensor) tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be - performed for every element of the batch. - """ - if reset is None or update_param_noise_threshold is None or update_param_noise_scale is None: - return _act(obs, stochastic, update_eps) - else: - return _perturbed_act(obs, stochastic, update_eps, reset, update_param_noise_threshold, - update_param_noise_scale) - - return act, obs_phs - - -def build_train(q_func, ob_space, ac_space, optimizer, sess, grad_norm_clipping=None, - gamma=1.0, double_q=True, scope="deepq", reuse=None, - param_noise=False, param_noise_filter_func=None, full_tensorboard_log=False): - """ - Creates the train function: - - :param q_func: (DQNPolicy) the policy - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param reuse: (bool) whether or not to reuse the graph variables - :param optimizer: (tf.train.Optimizer) optimizer to use for the Q-learning objective. 
- :param sess: (TensorFlow session) The current TensorFlow session - :param grad_norm_clipping: (float) clip gradient norms to this value. If None no clipping is performed. - :param gamma: (float) discount rate. - :param double_q: (bool) if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a - good idea to keep it enabled. - :param scope: (str or VariableScope) optional scope for variable_scope. - :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. - :param param_noise: (bool) whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) - :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a - variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter - is used by default. - :param full_tensorboard_log: (bool) enable additional logging when using tensorboard - WARNING: this logging can take a lot of space quickly - - :return: (tuple) - - act: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) function to select and action given - observation. See the top of the file for details. - train: (function (Any, numpy float, numpy float, Any, numpy bool, numpy float): numpy float) - optimize the error in Bellman's equation. See the top of the file for details. - update_target: (function) copy the parameters from optimized Q function to the target Q function. - See the top of the file for details. 
- step_model: (DQNPolicy) Policy for evaluation - """ - n_actions = ac_space.nvec if isinstance(ac_space, MultiDiscrete) else ac_space.n - with tf.variable_scope("input", reuse=reuse): - stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") - update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") - - with tf.variable_scope(scope, reuse=reuse): - if param_noise: - act_f, obs_phs = build_act_with_param_noise(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess, - param_noise_filter_func=param_noise_filter_func) - else: - act_f, obs_phs = build_act(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess) - - # q network evaluation - with tf.variable_scope("step_model", reuse=True, custom_getter=tf_util.outer_scope_getter("step_model")): - step_model = q_func(sess, ob_space, ac_space, 1, 1, None, reuse=True, obs_phs=obs_phs) - q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/model") - # target q network evaluation - - with tf.variable_scope("target_q_func", reuse=False): - target_policy = q_func(sess, ob_space, ac_space, 1, 1, None, reuse=False) - target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, - scope=tf.get_variable_scope().name + "/target_q_func") - - # compute estimate of best possible value starting from state at t + 1 - double_q_values = None - double_obs_ph = target_policy.obs_ph - if double_q: - with tf.variable_scope("double_q", reuse=True, custom_getter=tf_util.outer_scope_getter("double_q")): - double_policy = q_func(sess, ob_space, ac_space, 1, 1, None, reuse=True) - double_q_values = double_policy.q_values - double_obs_ph = double_policy.obs_ph - - with tf.variable_scope("loss", reuse=reuse): - # set up placeholders - act_t_ph = tf.placeholder(tf.int32, [None], name="action") - rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") - done_mask_ph = tf.placeholder(tf.float32, [None], name="done") - importance_weights_ph = 
tf.placeholder(tf.float32, [None], name="weight") - - # q scores for actions which we know were selected in the given state. - q_t_selected = tf.reduce_sum(step_model.q_values * tf.one_hot(act_t_ph, n_actions), axis=1) - - # compute estimate of best possible value starting from state at t + 1 - if double_q: - q_tp1_best_using_online_net = tf.argmax(double_q_values, axis=1) - q_tp1_best = tf.reduce_sum(target_policy.q_values * tf.one_hot(q_tp1_best_using_online_net, n_actions), axis=1) - else: - q_tp1_best = tf.reduce_max(target_policy.q_values, axis=1) - q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best - - # compute RHS of bellman equation - q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked - - # compute the error (potentially clipped) - td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) - errors = tf_util.huber_loss(td_error) - weighted_error = tf.reduce_mean(importance_weights_ph * errors) - - tf.summary.scalar("td_error", tf.reduce_mean(td_error)) - tf.summary.scalar("loss", weighted_error) - - if full_tensorboard_log: - tf.summary.histogram("td_error", td_error) - - # update_target_fn will be called periodically to copy Q network to target Q network - update_target_expr = [] - for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), - sorted(target_q_func_vars, key=lambda v: v.name)): - update_target_expr.append(var_target.assign(var)) - update_target_expr = tf.group(*update_target_expr) - - # compute optimization op (potentially with gradient clipping) - gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) - if grad_norm_clipping is not None: - for i, (grad, var) in enumerate(gradients): - if grad is not None: - gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) - - with tf.variable_scope("input_info", reuse=False): - tf.summary.scalar('rewards', tf.reduce_mean(rew_t_ph)) - tf.summary.scalar('importance_weights', tf.reduce_mean(importance_weights_ph)) - - if full_tensorboard_log: - 
tf.summary.histogram('rewards', rew_t_ph) - tf.summary.histogram('importance_weights', importance_weights_ph) - if tf_util.is_image(obs_phs[0]): - tf.summary.image('observation', obs_phs[0]) - elif len(obs_phs[0].shape) == 1: - tf.summary.histogram('observation', obs_phs[0]) - - optimize_expr = optimizer.apply_gradients(gradients) - - summary = tf.summary.merge_all() - - # Create callable functions - train = tf_util.function( - inputs=[ - obs_phs[0], - act_t_ph, - rew_t_ph, - target_policy.obs_ph, - double_obs_ph, - done_mask_ph, - importance_weights_ph - ], - outputs=[summary, td_error], - updates=[optimize_expr] - ) - update_target = tf_util.function([], [], updates=[update_target_expr]) - - return act_f, train, update_target, step_model diff --git a/stable_baselines/deepq/dqn.py b/stable_baselines/deepq/dqn.py deleted file mode 100644 index 8deaf5a0..00000000 --- a/stable_baselines/deepq/dqn.py +++ /dev/null @@ -1,401 +0,0 @@ -from functools import partial - -import tensorflow as tf -import numpy as np -import gym - -from stable_baselines import logger -from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter -from stable_baselines.common.vec_env import VecEnv -from stable_baselines.common.schedules import LinearSchedule -from stable_baselines.common.buffers import ReplayBuffer, PrioritizedReplayBuffer -from stable_baselines.deepq.build_graph import build_train -from stable_baselines.deepq.policies import DQNPolicy - - -class DQN(OffPolicyRLModel): - """ - The DQN model class. - DQN paper: https://arxiv.org/abs/1312.5602 - Dueling DQN: https://arxiv.org/abs/1511.06581 - Double-Q Learning: https://arxiv.org/abs/1509.06461 - Prioritized Experience Replay: https://arxiv.org/abs/1511.05952 - - :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) 
- :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) - :param gamma: (float) discount factor - :param learning_rate: (float) learning rate for adam optimizer - :param buffer_size: (int) size of the replay buffer - :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is - annealed - :param exploration_final_eps: (float) final value of random action probability - :param exploration_initial_eps: (float) initial value of random action probability - :param train_freq: (int) update the model every `train_freq` steps. set to None to disable printing - :param batch_size: (int) size of a batched sampled from replay buffer for training - :param double_q: (bool) Whether to enable Double-Q learning or not. - :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts - :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps. - :param prioritized_replay: (bool) if True prioritized replay buffer will be used. - :param prioritized_replay_alpha: (float)alpha parameter for prioritized replay buffer. - It determines how much prioritization is used, with alpha=0 corresponding to the uniform case. - :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer - :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial - value to 1.0. If set to None equals to max_timesteps. - :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities. - :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy. 
- :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) - :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance - :param full_tensorboard_log: (bool) enable additional logging when using tensorboard - WARNING: this logging can take a lot of space quickly - :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). - If None (default), use random seed. Note that if you want completely deterministic - results, you must set `n_cpu_tf_sess` to 1. - :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations - If None, the number of cpu of the current machine will be used. - """ - def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1, - exploration_final_eps=0.02, exploration_initial_eps=1.0, train_freq=1, batch_size=32, double_q=True, - learning_starts=1000, target_network_update_freq=500, prioritized_replay=False, - prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, - prioritized_replay_eps=1e-6, param_noise=False, - n_cpu_tf_sess=None, verbose=0, tensorboard_log=None, - _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None): - - # TODO: replay_buffer refactoring - super(DQN, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy, - requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) - - self.param_noise = param_noise - self.learning_starts = learning_starts - self.train_freq = train_freq - self.prioritized_replay = prioritized_replay - self.prioritized_replay_eps = prioritized_replay_eps - self.batch_size = batch_size - self.target_network_update_freq = target_network_update_freq - self.prioritized_replay_alpha = prioritized_replay_alpha - 
self.prioritized_replay_beta0 = prioritized_replay_beta0 - self.prioritized_replay_beta_iters = prioritized_replay_beta_iters - self.exploration_final_eps = exploration_final_eps - self.exploration_initial_eps = exploration_initial_eps - self.exploration_fraction = exploration_fraction - self.buffer_size = buffer_size - self.learning_rate = learning_rate - self.gamma = gamma - self.tensorboard_log = tensorboard_log - self.full_tensorboard_log = full_tensorboard_log - self.double_q = double_q - - self.graph = None - self.sess = None - self._train_step = None - self.step_model = None - self.update_target = None - self.act = None - self.proba_step = None - self.replay_buffer = None - self.beta_schedule = None - self.exploration = None - self.params = None - self.summary = None - - if _init_setup_model: - self.setup_model() - - def _get_pretrain_placeholders(self): - policy = self.step_model - return policy.obs_ph, tf.placeholder(tf.int32, [None]), policy.q_values - - def setup_model(self): - - with SetVerbosity(self.verbose): - assert not isinstance(self.action_space, gym.spaces.Box), \ - "Error: DQN cannot output a gym.spaces.Box action space." - - # If the policy is wrap in functool.partial (e.g. to disable dueling) - # unwrap it to check the class type - if isinstance(self.policy, partial): - test_policy = self.policy.func - else: - test_policy = self.policy - assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \ - "an instance of DQNPolicy." 
- - self.graph = tf.Graph() - with self.graph.as_default(): - self.set_random_seed(self.seed) - self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) - - optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) - - self.act, self._train_step, self.update_target, self.step_model = build_train( - q_func=partial(self.policy, **self.policy_kwargs), - ob_space=self.observation_space, - ac_space=self.action_space, - optimizer=optimizer, - gamma=self.gamma, - grad_norm_clipping=10, - param_noise=self.param_noise, - sess=self.sess, - full_tensorboard_log=self.full_tensorboard_log, - double_q=self.double_q - ) - self.proba_step = self.step_model.proba_step - self.params = tf_util.get_trainable_vars("deepq") - - # Initialize the parameters and copy them to the target network. - tf_util.initialize(self.sess) - self.update_target(sess=self.sess) - - self.summary = tf.summary.merge_all() - - def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", - reset_num_timesteps=True, replay_wrapper=None): - - new_tb_log = self._init_num_timesteps(reset_num_timesteps) - callback = self._init_callback(callback) - - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ - as writer: - self._setup_learn() - - # Create the replay buffer - if self.prioritized_replay: - self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha) - if self.prioritized_replay_beta_iters is None: - prioritized_replay_beta_iters = total_timesteps - else: - prioritized_replay_beta_iters = self.prioritized_replay_beta_iters - self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, - initial_p=self.prioritized_replay_beta0, - final_p=1.0) - else: - self.replay_buffer = ReplayBuffer(self.buffer_size) - self.beta_schedule = None - - if replay_wrapper is not None: - assert not self.prioritized_replay, "Prioritized replay buffer is not supported by 
HER" - self.replay_buffer = replay_wrapper(self.replay_buffer) - - # Create the schedule for exploration starting from 1. - self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps), - initial_p=self.exploration_initial_eps, - final_p=self.exploration_final_eps) - - episode_rewards = [0.0] - episode_successes = [] - - callback.on_training_start(locals(), globals()) - callback.on_rollout_start() - - reset = True - obs = self.env.reset() - # Retrieve unnormalized observation for saving into the buffer - if self._vec_normalize_env is not None: - obs_ = self._vec_normalize_env.get_original_obs().squeeze() - - for _ in range(total_timesteps): - # Take action and update exploration to the newest value - kwargs = {} - if not self.param_noise: - update_eps = self.exploration.value(self.num_timesteps) - update_param_noise_threshold = 0. - else: - update_eps = 0. - # Compute the threshold such that the KL divergence between perturbed and non-perturbed - # policy is comparable to eps-greedy exploration with eps = exploration.value(t). - # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 - # for detailed explanation. - update_param_noise_threshold = \ - -np.log(1. 
- self.exploration.value(self.num_timesteps) + - self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) - kwargs['reset'] = reset - kwargs['update_param_noise_threshold'] = update_param_noise_threshold - kwargs['update_param_noise_scale'] = True - with self.sess.as_default(): - action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] - env_action = action - reset = False - new_obs, rew, done, info = self.env.step(env_action) - - self.num_timesteps += 1 - - # Stop training if return value is False - callback.update_locals(locals()) - if callback.on_step() is False: - break - - # Store only the unnormalized version - if self._vec_normalize_env is not None: - new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() - reward_ = self._vec_normalize_env.get_original_reward().squeeze() - else: - # Avoid changing the original ones - obs_, new_obs_, reward_ = obs, new_obs, rew - # Store transition in the replay buffer. - self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info) - obs = new_obs - # Save the unnormalized observation - if self._vec_normalize_env is not None: - obs_ = new_obs_ - - if writer is not None: - ep_rew = np.array([reward_]).reshape((1, -1)) - ep_done = np.array([done]).reshape((1, -1)) - tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, - self.num_timesteps) - - episode_rewards[-1] += reward_ - if done: - maybe_is_success = info.get('is_success') - if maybe_is_success is not None: - episode_successes.append(float(maybe_is_success)) - if not isinstance(self.env, VecEnv): - obs = self.env.reset() - episode_rewards.append(0.0) - reset = True - - # Do not train if the warmup phase is not over - # or if there are not enough samples in the replay buffer - can_sample = self.replay_buffer.can_sample(self.batch_size) - if can_sample and self.num_timesteps > self.learning_starts \ - and self.num_timesteps % self.train_freq == 0: - - callback.on_rollout_end() - # Minimize 
the error in Bellman's equation on a batch sampled from replay buffer. - # pytype:disable=bad-unpacking - if self.prioritized_replay: - assert self.beta_schedule is not None, \ - "BUG: should be LinearSchedule when self.prioritized_replay True" - experience = self.replay_buffer.sample(self.batch_size, - beta=self.beta_schedule.value(self.num_timesteps), - env=self._vec_normalize_env) - (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience - else: - obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size, - env=self._vec_normalize_env) - weights, batch_idxes = np.ones_like(rewards), None - # pytype:enable=bad-unpacking - - if writer is not None: - # run loss backprop with summary, but once every 100 steps save the metadata - # (memory, compute time, ...) - if (1 + self.num_timesteps) % 100 == 0: - run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - run_metadata = tf.RunMetadata() - summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, - dones, weights, sess=self.sess, options=run_options, - run_metadata=run_metadata) - writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps) - else: - summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, - dones, weights, sess=self.sess) - writer.add_summary(summary, self.num_timesteps) - else: - _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, - sess=self.sess) - - if self.prioritized_replay: - new_priorities = np.abs(td_errors) + self.prioritized_replay_eps - assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) - self.replay_buffer.update_priorities(batch_idxes, new_priorities) - - callback.on_rollout_start() - - if can_sample and self.num_timesteps > self.learning_starts and \ - self.num_timesteps % self.target_network_update_freq == 0: - # Update target network periodically. 
- self.update_target(sess=self.sess) - - if len(episode_rewards[-101:-1]) == 0: - mean_100ep_reward = -np.inf - else: - mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) - - num_episodes = len(episode_rewards) - if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: - logger.record_tabular("steps", self.num_timesteps) - logger.record_tabular("episodes", num_episodes) - if len(episode_successes) > 0: - logger.logkv("success rate", np.mean(episode_successes[-100:])) - logger.record_tabular("mean 100 episode reward", mean_100ep_reward) - logger.record_tabular("% time spent exploring", - int(100 * self.exploration.value(self.num_timesteps))) - logger.dump_tabular() - - callback.on_training_end() - return self - - def predict(self, observation, state=None, mask=None, deterministic=True): - observation = np.array(observation) - vectorized_env = self._is_vectorized_observation(observation, self.observation_space) - - observation = observation.reshape((-1,) + self.observation_space.shape) - with self.sess.as_default(): - actions, _, _ = self.step_model.step(observation, deterministic=deterministic) - - if not vectorized_env: - actions = actions[0] - - return actions, None - - def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): - observation = np.array(observation) - vectorized_env = self._is_vectorized_observation(observation, self.observation_space) - - observation = observation.reshape((-1,) + self.observation_space.shape) - actions_proba = self.proba_step(observation, state, mask) - - if actions is not None: # comparing the action distribution, to given actions - actions = np.array([actions]) - assert isinstance(self.action_space, gym.spaces.Discrete) - actions = actions.reshape((-1,)) - assert observation.shape[0] == actions.shape[0], "Error: batch sizes differ for actions and observations." 
- actions_proba = actions_proba[np.arange(actions.shape[0]), actions] - # normalize action proba shape - actions_proba = actions_proba.reshape((-1, 1)) - if logp: - actions_proba = np.log(actions_proba) - - if not vectorized_env: - if state is not None: - raise ValueError("Error: The environment must be vectorized when using recurrent policies.") - actions_proba = actions_proba[0] - - return actions_proba - - def get_parameter_list(self): - return self.params - - def save(self, save_path, cloudpickle=False): - # params - data = { - "double_q": self.double_q, - "param_noise": self.param_noise, - "learning_starts": self.learning_starts, - "train_freq": self.train_freq, - "prioritized_replay": self.prioritized_replay, - "prioritized_replay_eps": self.prioritized_replay_eps, - "batch_size": self.batch_size, - "target_network_update_freq": self.target_network_update_freq, - "prioritized_replay_alpha": self.prioritized_replay_alpha, - "prioritized_replay_beta0": self.prioritized_replay_beta0, - "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters, - "exploration_final_eps": self.exploration_final_eps, - "exploration_fraction": self.exploration_fraction, - "learning_rate": self.learning_rate, - "gamma": self.gamma, - "verbose": self.verbose, - "observation_space": self.observation_space, - "action_space": self.action_space, - "policy": self.policy, - "n_envs": self.n_envs, - "n_cpu_tf_sess": self.n_cpu_tf_sess, - "seed": self.seed, - "_vectorize_action": self._vectorize_action, - "policy_kwargs": self.policy_kwargs - } - - params_to_save = self.get_parameters() - - self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle) diff --git a/stable_baselines/deepq/experiments/enjoy_cartpole.py b/stable_baselines/deepq/experiments/enjoy_cartpole.py deleted file mode 100644 index 90991234..00000000 --- a/stable_baselines/deepq/experiments/enjoy_cartpole.py +++ /dev/null @@ -1,36 +0,0 @@ -import argparse - -import gym - -from 
def main(args):
    """
    Run a trained DQN agent on CartPole and print per-episode rewards.

    :param args: (argparse.Namespace) the parsed command-line arguments
    """
    env = gym.make("CartPole-v0")
    model = DQN.load("cartpole_model.zip", env)

    while True:
        obs, done = env.reset(), False
        total_reward = 0
        while not done:
            if not args.no_render:
                env.render()
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            total_reward += reward
        print("Episode reward", total_reward)
        # --no-render is only used for automatic testing: stop after one episode
        if args.no_render:
            break
def main():
    """
    Train DQN on an Atari environment with the standard DeepMind wrappers.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    args = parser.parse_args()

    logger.configure()
    set_global_seeds(args.seed)

    # Atari preprocessing: monitor first, then the DeepMind-style DQN wrappers
    env = wrap_atari_dqn(bench.Monitor(make_atari(args.env), logger.get_dir()))

    # Dueling architecture is toggled through the policy, not the model
    policy_fn = partial(CnnPolicy, dueling=args.dueling == 1)

    model = DQN(
        env=env,
        policy=policy_fn,
        learning_rate=1e-4,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
    )
    model.learn(total_timesteps=args.num_timesteps)

    env.close()
def callback(lcl, _glb):
    """
    Logging/early-stopping callback: decide whether training should continue.

    :param lcl: (dict) the local variables of the learn() loop
    :param _glb: (dict) the global variables (unused)
    :return: (bool) True to keep training, False once the task is solved
    """
    # Mean reward over the last 100 *completed* episodes (the final entry is
    # the in-progress episode, hence the [-101:-1] slice)
    recent_rewards = lcl['episode_rewards'][-101:-1]
    if not recent_rewards:
        mean_100ep_reward = -np.inf
    else:
        mean_100ep_reward = round(float(np.mean(recent_rewards)), 1)

    # CartPole-v0 counts as solved at an average reward of 199
    solved = lcl['self'].num_timesteps > 100 and mean_100ep_reward >= 199
    return not solved
the input arguments - """ - env = gym.make("MountainCar-v0") - - # using layer norm policy here is important for parameter space noise! - model = DQN( - policy="LnMlpPolicy", - env=env, - learning_rate=1e-3, - buffer_size=50000, - exploration_fraction=0.1, - exploration_final_eps=0.1, - param_noise=True, - policy_kwargs=dict(layers=[64]) - ) - model.learn(total_timesteps=args.max_timesteps) - - print("Saving model to mountaincar_model.zip") - model.save("mountaincar_model") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Train DQN on MountainCar") - parser.add_argument('--max-timesteps', default=100000, type=int, help="Maximum number of timesteps") - args = parser.parse_args() - main(args) diff --git a/stable_baselines/deepq/policies.py b/stable_baselines/deepq/policies.py deleted file mode 100644 index 3a2dfec1..00000000 --- a/stable_baselines/deepq/policies.py +++ /dev/null @@ -1,254 +0,0 @@ -import tensorflow as tf -import tensorflow.contrib.layers as tf_layers -import numpy as np -from gym.spaces import Discrete - -from stable_baselines.common.policies import BasePolicy, nature_cnn, register_policy - - -class DQNPolicy(BasePolicy): - """ - Policy object that implements a DQN policy - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param scale: (bool) whether or not to scale the input - :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for observation placeholder - and the processed observation placeholder respectively - :param dueling: (bool) if true double the output MLP to compute a 
baseline for action scores - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, scale=False, - obs_phs=None, dueling=True): - # DQN policies need an override for the obs placeholder, due to the architecture of the code - super(DQNPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale, - obs_phs=obs_phs) - assert isinstance(ac_space, Discrete), "Error: the action space for DQN must be of type gym.spaces.Discrete" - self.n_actions = ac_space.n - self.value_fn = None - self.q_values = None - self.dueling = dueling - - def _setup_init(self): - """ - Set up action probability - """ - with tf.variable_scope("output", reuse=True): - assert self.q_values is not None - self.policy_proba = tf.nn.softmax(self.q_values) - - def step(self, obs, state=None, mask=None, deterministic=True): - """ - Returns the q_values for a single step - - :param obs: (np.ndarray float or int) The current observation of the environment - :param state: (np.ndarray float) The last states (used in recurrent policies) - :param mask: (np.ndarray float) The last masks (used in recurrent policies) - :param deterministic: (bool) Whether or not to return deterministic actions. - :return: (np.ndarray int, np.ndarray float, np.ndarray float) actions, q_values, states - """ - raise NotImplementedError - - def proba_step(self, obs, state=None, mask=None): - """ - Returns the action probability for a single step - - :param obs: (np.ndarray float or int) The current observation of the environment - :param state: (np.ndarray float) The last states (used in recurrent policies) - :param mask: (np.ndarray float) The last masks (used in recurrent policies) - :return: (np.ndarray float) the action probability - """ - raise NotImplementedError - - -class FeedForwardPolicy(DQNPolicy): - """ - Policy object that implements a DQN policy, using a feed forward neural network. 
- - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param layers: ([int]) The size of the Neural network for the policy (if None, default to [64, 64]) - :param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction - :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp") - :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for observation placeholder - and the processed observation placeholder respectively - :param layer_norm: (bool) enable layer normalisation - :param dueling: (bool) if true double the output MLP to compute a baseline for action scores - :param act_fun: (tf.func) the activation function to use in the neural network. 
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None,
                 cnn_extractor=nature_cnn, feature_extraction="cnn",
                 obs_phs=None, layer_norm=False, dueling=True, act_fun=tf.nn.relu, **kwargs):
        # Build the Q-network graph: feature extractor -> MLP head producing one
        # score per action, plus an optional dueling state-value head.
        # Inputs are scaled only for image (CNN) observations.
        super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps,
                                                n_batch, dueling=dueling, reuse=reuse,
                                                scale=(feature_extraction == "cnn"), obs_phs=obs_phs)

        # Reject extra kwargs that are not consumed by the chosen feature extractor
        self._kwargs_check(feature_extraction, kwargs)

        if layers is None:
            layers = [64, 64]

        with tf.variable_scope("model", reuse=reuse):
            with tf.variable_scope("action_value"):
                if feature_extraction == "cnn":
                    extracted_features = cnn_extractor(self.processed_obs, **kwargs)
                    action_out = extracted_features
                else:
                    # MLP path: flatten the observation before the dense layers
                    extracted_features = tf.layers.flatten(self.processed_obs)
                    action_out = extracted_features
                for layer_size in layers:
                    # Dense layer without activation so layer-norm (if enabled)
                    # is applied before the non-linearity
                    action_out = tf_layers.fully_connected(action_out, num_outputs=layer_size, activation_fn=None)
                    if layer_norm:
                        action_out = tf_layers.layer_norm(action_out, center=True, scale=True)
                    action_out = act_fun(action_out)

                # One raw (advantage) score per discrete action
                action_scores = tf_layers.fully_connected(action_out, num_outputs=self.n_actions, activation_fn=None)

            if self.dueling:
                # Dueling head: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
                with tf.variable_scope("state_value"):
                    state_out = extracted_features
                    for layer_size in layers:
                        state_out = tf_layers.fully_connected(state_out, num_outputs=layer_size, activation_fn=None)
                        if layer_norm:
                            state_out = tf_layers.layer_norm(state_out, center=True, scale=True)
                        state_out = act_fun(state_out)
                    state_score = tf_layers.fully_connected(state_out, num_outputs=1, activation_fn=None)
                # Center the advantages so V and A are identifiable
                action_scores_mean = tf.reduce_mean(action_scores, axis=1)
                action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, axis=1)
                q_out = state_score + action_scores_centered
            else:
                q_out = action_scores

        self.q_values = q_out
        # Exposes self.policy_proba (softmax over the Q-values)
        self._setup_init()
step(self, obs, state=None, mask=None, deterministic=True): - q_values, actions_proba = self.sess.run([self.q_values, self.policy_proba], {self.obs_ph: obs}) - if deterministic: - actions = np.argmax(q_values, axis=1) - else: - # Unefficient sampling - # TODO: replace the loop - # maybe with Gumbel-max trick ? (http://amid.fish/humble-gumbel) - actions = np.zeros((len(obs),), dtype=np.int64) - for action_idx in range(len(obs)): - actions[action_idx] = np.random.choice(self.n_actions, p=actions_proba[action_idx]) - - return actions, q_values, None - - def proba_step(self, obs, state=None, mask=None): - return self.sess.run(self.policy_proba, {self.obs_ph: obs}) - - -class CnnPolicy(FeedForwardPolicy): - """ - Policy object that implements DQN policy, using a CNN (the nature CNN) - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for observation placeholder - and the processed observation placeholder respectively - :param dueling: (bool) if true double the output MLP to compute a baseline for action scores - :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, - reuse=False, obs_phs=None, dueling=True, **_kwargs): - super(CnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, - feature_extraction="cnn", obs_phs=obs_phs, dueling=dueling, - layer_norm=False, **_kwargs) - - -class LnCnnPolicy(FeedForwardPolicy): - """ - Policy object that 
implements DQN policy, using a CNN (the nature CNN), with layer normalisation - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for observation placeholder - and the processed observation placeholder respectively - :param dueling: (bool) if true double the output MLP to compute a baseline for action scores - :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, - reuse=False, obs_phs=None, dueling=True, **_kwargs): - super(LnCnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, - feature_extraction="cnn", obs_phs=obs_phs, dueling=dueling, - layer_norm=True, **_kwargs) - - -class MlpPolicy(FeedForwardPolicy): - """ - Policy object that implements DQN policy, using a MLP (2 layers of 64) - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for observation placeholder - and the processed observation placeholder respectively - :param dueling: (bool) if 
class LnMlpPolicy(FeedForwardPolicy):
    """
    Policy object that implements DQN policy, using a MLP (2 layers of 64), with layer normalisation

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for observation placeholder
        and the processed observation placeholder respectively
    :param dueling: (bool) if true double the output MLP to compute a baseline for action scores
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
                 reuse=False, obs_phs=None, dueling=True, **_kwargs):
        # Bug fix: forward the caller's `dueling` flag instead of hard-coding True,
        # so LnMlpPolicy(dueling=False) is honoured, consistent with the other
        # policy classes (CnnPolicy, LnCnnPolicy, MlpPolicy).
        super(LnMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                          feature_extraction="mlp", obs_phs=obs_phs,
                                          layer_norm=True, dueling=dueling, **_kwargs)
def logsigmoid(input_tensor):
    """
    Equivalent to tf.log(tf.sigmoid(a)), computed via the numerically stable
    identity log(sigmoid(x)) = -softplus(-x) (avoids log of a value that
    underflows to 0 for very negative x).

    :param input_tensor: (tf.Tensor)
    :return: (tf.Tensor)
    """
    return -tf.nn.softplus(-input_tensor)
    def __init__(self, observation_space, action_space, hidden_size,
                 entcoeff=0.001, scope="adversary", normalize=True):
        """
        Reward regression from observations and transitions.

        Builds the GAIL discriminator graph: a shared-weight MLP scores
        generator (policy) transitions against expert transitions; the policy
        reward is derived from the generator logits.

        :param observation_space: (gym.spaces)
        :param action_space: (gym.spaces)
        :param hidden_size: ([int]) the hidden dimension for the MLP
        :param entcoeff: (float) the entropy loss weight
        :param scope: (str) tensorflow variable scope
        :param normalize: (bool) Whether to normalize the reward or not
        """
        # TODO: support images properly (using a CNN)
        self.scope = scope
        self.observation_shape = observation_space.shape
        self.actions_shape = action_space.shape

        if isinstance(action_space, gym.spaces.Box):
            # Continuous action space
            self.discrete_actions = False
            self.n_actions = action_space.shape[0]
        elif isinstance(action_space, gym.spaces.Discrete):
            # Discrete actions are one-hot encoded in build_graph()
            self.n_actions = action_space.n
            self.discrete_actions = True
        else:
            raise ValueError('Action space not supported: {}'.format(action_space))

        self.hidden_size = hidden_size
        self.normalize = normalize
        # Running mean/std filter; created lazily inside build_graph() when normalize=True
        self.obs_rms = None

        # Placeholders
        self.generator_obs_ph = tf.placeholder(observation_space.dtype, (None,) + self.observation_shape,
                                               name="observations_ph")
        self.generator_acs_ph = tf.placeholder(action_space.dtype, (None,) + self.actions_shape,
                                               name="actions_ph")
        self.expert_obs_ph = tf.placeholder(observation_space.dtype, (None,) + self.observation_shape,
                                            name="expert_observations_ph")
        self.expert_acs_ph = tf.placeholder(action_space.dtype, (None,) + self.actions_shape,
                                            name="expert_actions_ph")
        # Build graph: the expert branch reuses the generator branch's variables
        generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False)
        expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True)
        # Build accuracy: generator samples should score < 0.5, expert samples > 0.5
        generator_acc = tf.reduce_mean(tf.cast(tf.nn.sigmoid(generator_logits) < 0.5, tf.float32))
        expert_acc = tf.reduce_mean(tf.cast(tf.nn.sigmoid(expert_logits) > 0.5, tf.float32))
        # Build regression loss
        # let x = logits, z = targets.
        # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
        # Generator transitions are labeled 0, expert transitions 1
        generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=generator_logits,
                                                                 labels=tf.zeros_like(generator_logits))
        generator_loss = tf.reduce_mean(generator_loss)
        expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=expert_logits, labels=tf.ones_like(expert_logits))
        expert_loss = tf.reduce_mean(expert_loss)
        # Build entropy loss (negative sign: the entropy acts as a bonus/regularizer)
        logits = tf.concat([generator_logits, expert_logits], 0)
        entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
        entropy_loss = -entcoeff * entropy
        # Loss + Accuracy terms
        self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc]
        self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc"]
        self.total_loss = generator_loss + expert_loss + entropy_loss
        # Build Reward for policy: -log(1 - D(s, a)), with epsilon for numerical stability
        self.reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8)
        var_list = self.get_trainable_variables()
        # Single function returning all losses plus the flattened gradient of the total loss
        self.lossandgrad = tf_util.function(
            [self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph],
            self.losses + [tf_util.flatgrad(self.total_loss, var_list)])
self.n_actions) - actions_ph = tf.cast(one_hot_actions, tf.float32) - else: - actions_ph = acs_ph - - _input = tf.concat([obs, actions_ph], axis=1) # concatenate the two input -> form a transition - p_h1 = tf.contrib.layers.fully_connected(_input, self.hidden_size, activation_fn=tf.nn.tanh) - p_h2 = tf.contrib.layers.fully_connected(p_h1, self.hidden_size, activation_fn=tf.nn.tanh) - logits = tf.contrib.layers.fully_connected(p_h2, 1, activation_fn=tf.identity) - return logits - - def get_trainable_variables(self): - """ - Get all the trainable variables from the graph - - :return: ([tf.Tensor]) the variables - """ - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - - def get_reward(self, obs, actions): - """ - Predict the reward using the observation and action - - :param obs: (tf.Tensor or np.ndarray) the observation - :param actions: (tf.Tensor or np.ndarray) the action - :return: (np.ndarray) the reward - """ - sess = tf.get_default_session() - if len(obs.shape) == 1: - obs = np.expand_dims(obs, 0) - if len(actions.shape) == 1: - actions = np.expand_dims(actions, 0) - elif len(actions.shape) == 0: - # one discrete action - actions = np.expand_dims(actions, 0) - - feed_dict = {self.generator_obs_ph: obs, self.generator_acs_ph: actions} - reward = sess.run(self.reward_op, feed_dict) - return reward diff --git a/stable_baselines/gail/dataset/__init__.py b/stable_baselines/gail/dataset/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/stable_baselines/gail/dataset/dataset.py b/stable_baselines/gail/dataset/dataset.py deleted file mode 100644 index 676b8875..00000000 --- a/stable_baselines/gail/dataset/dataset.py +++ /dev/null @@ -1,371 +0,0 @@ -import queue -import time -from multiprocessing import Queue, Process - -import cv2 # pytype:disable=import-error -import numpy as np -from joblib import Parallel, delayed - -from stable_baselines import logger - - -class ExpertDataset(object): - """ - Dataset for using 
class ExpertDataset(object):
    """
    Dataset for using behavior cloning or GAIL.

    The structure of the expert dataset is a dict, saved as an ".npz" archive.
    The dictionary contains the keys 'actions', 'episode_returns', 'rewards', 'obs' and 'episode_starts'.
    The corresponding values have data concatenated across episode: the first axis is the timestep,
    the remaining axes index into the data. In case of images, 'obs' contains the relative path to
    the images, to enable space saving from image compression.

    :param expert_path: (str) The path to trajectory data (.npz file). Mutually exclusive with traj_data.
    :param traj_data: (dict) Trajectory data, in format described above. Mutually exclusive with expert_path.
    :param train_fraction: (float) the train validation split (0 to 1)
        for pre-training using behavior cloning (BC)
    :param batch_size: (int) the minibatch size for behavior cloning
    :param traj_limitation: (int) the number of trajectory to use (if -1, load all)
    :param randomize: (bool) if the dataset should be shuffled
    :param verbose: (int) Verbosity
    :param sequential_preprocessing: (bool) Do not use subprocess to preprocess
        the data (slower but use less memory for the CI)
    """
    # Attributes excluded when pickling: they hold DataLoader instances
    # (worker processes + multiprocessing queues) that cannot be pickled.
    EXCLUDED_KEYS = {'dataloader', 'train_loader', 'val_loader'}

    def __init__(self, expert_path=None, traj_data=None, train_fraction=0.7, batch_size=64,
                 traj_limitation=-1, randomize=True, verbose=1, sequential_preprocessing=False):
        # Exactly one of traj_data / expert_path must be provided.
        if traj_data is not None and expert_path is not None:
            raise ValueError("Cannot specify both 'traj_data' and 'expert_path'")
        if traj_data is None and expert_path is None:
            raise ValueError("Must specify one of 'traj_data' or 'expert_path'")
        if traj_data is None:
            traj_data = np.load(expert_path, allow_pickle=True)

        if verbose > 0:
            for key, val in traj_data.items():
                print(key, val.shape)

        # Array of bool where episode_starts[i] = True for each new episode
        episode_starts = traj_data['episode_starts']

        traj_limit_idx = len(traj_data['obs'])

        if traj_limitation > 0:
            n_episodes = 0
            # Retrieve the index corresponding
            # to the traj_limitation trajectory
            for idx, episode_start in enumerate(episode_starts):
                n_episodes += int(episode_start)
                if n_episodes == (traj_limitation + 1):
                    traj_limit_idx = idx - 1

        observations = traj_data['obs'][:traj_limit_idx]
        actions = traj_data['actions'][:traj_limit_idx]

        # obs, actions: shape (N * L, ) + S
        # where N = # episodes, L = episode length
        # and S is the environment observation/action space.
        # S = (1, ) for discrete space
        # Flatten to (N * L, prod(S))
        if len(observations.shape) > 2:
            observations = np.reshape(observations, [-1, np.prod(observations.shape[1:])])
        if len(actions.shape) > 2:
            actions = np.reshape(actions, [-1, np.prod(actions.shape[1:])])

        # Random permutation so the train/validation split is unbiased.
        indices = np.random.permutation(len(observations)).astype(np.int64)

        # Train/Validation split when using behavior cloning
        train_indices = indices[:int(train_fraction * len(indices))]
        val_indices = indices[int(train_fraction * len(indices)):]

        assert len(train_indices) > 0, "No sample for the training set"
        assert len(val_indices) > 0, "No sample for the validation set"

        self.observations = observations
        self.actions = actions

        self.returns = traj_data['episode_returns'][:traj_limit_idx]
        self.avg_ret = sum(self.returns) / len(self.returns)
        self.std_ret = np.std(np.array(self.returns))
        self.verbose = verbose

        assert len(self.observations) == len(self.actions), "The number of actions and observations differ " \
                                                            "please check your expert dataset"
        # num_traj is -1 when traj_limitation is -1 (load all episodes).
        self.num_traj = min(traj_limitation, np.sum(episode_starts))
        self.num_transition = len(self.observations)
        self.randomize = randomize
        self.sequential_preprocessing = sequential_preprocessing

        # The GAIL dataloader is created lazily via init_dataloader().
        self.dataloader = None
        self.train_loader = DataLoader(train_indices, self.observations, self.actions, batch_size,
                                       shuffle=self.randomize, start_process=False,
                                       sequential=sequential_preprocessing)
        self.val_loader = DataLoader(val_indices, self.observations, self.actions, batch_size,
                                     shuffle=self.randomize, start_process=False,
                                     sequential=sequential_preprocessing)

        if self.verbose >= 1:
            self.log_info()

    def init_dataloader(self, batch_size):
        """
        Initialize the dataloader used by GAIL.

        :param batch_size: (int)
        """
        indices = np.random.permutation(len(self.observations)).astype(np.int64)
        self.dataloader = DataLoader(indices, self.observations, self.actions, batch_size,
                                     shuffle=self.randomize, start_process=False,
                                     sequential=self.sequential_preprocessing)

    def __del__(self):
        # Exit processes if needed: dropping the DataLoader references lets
        # DataLoader.__del__ terminate any preprocessing worker process.
        for key in self.EXCLUDED_KEYS:
            if self.__dict__.get(key) is not None:
                del self.__dict__[key]

    def __getstate__(self):
        """
        Gets state for pickling.

        Excludes processes that are not pickleable
        """
        # Remove processes in order to pickle the dataset.
        return {key: val for key, val in self.__dict__.items() if key not in self.EXCLUDED_KEYS}

    def __setstate__(self, state):
        """
        Restores pickled state.

        init_dataloader() must be called
        after unpickling before using it with GAIL.

        :param state: (dict)
        """
        self.__dict__.update(state)
        for excluded_key in self.EXCLUDED_KEYS:
            assert excluded_key not in state
        # Loaders are not restored; they must be rebuilt by the caller.
        self.dataloader = None
        self.train_loader = None
        self.val_loader = None

    def log_info(self):
        """
        Log the information of the dataset.
        """
        logger.log("Total trajectories: {}".format(self.num_traj))
        logger.log("Total transitions: {}".format(self.num_transition))
        logger.log("Average returns: {}".format(self.avg_ret))
        logger.log("Std for returns: {}".format(self.std_ret))

    def get_next_batch(self, split=None):
        """
        Get the batch from the dataset.

        :param split: (str) the type of data split (can be None, 'train', 'val')
        :return: (np.ndarray, np.ndarray) inputs and labels
        """
        dataloader = {
            None: self.dataloader,
            'train': self.train_loader,
            'val': self.val_loader
        }[split]

        if dataloader.process is None:
            dataloader.start_process()
        try:
            return next(dataloader)
        except StopIteration:
            # Epoch exhausted: re-create the iterator to restart from the top.
            dataloader = iter(dataloader)
            return next(dataloader)

    def plot(self):
        """
        Show histogram plotting of the episode returns
        """
        # Isolate dependency since it is only used for plotting and also since
        # different matplotlib backends have further dependencies themselves.
        import matplotlib.pyplot as plt
        plt.hist(self.returns)
        plt.show()
class DataLoader(object):
    """
    A custom dataloader to preprocessing observations (including images)
    and feed them to the network.

    Original code for the dataloader from https://github.com/araffin/robotics-rl-srl
    (MIT licence)
    Authors: Antonin Raffin, René Traoré, Ashley Hill

    :param indices: ([int]) list of observations indices
    :param observations: (np.ndarray) observations or images path
    :param actions: (np.ndarray) actions
    :param batch_size: (int) Number of samples per minibatch
    :param n_workers: (int) number of preprocessing worker (for loading the images)
    :param infinite_loop: (bool) whether to have an iterator that can be reset
    :param max_queue_len: (int) Max number of minibatches that can be preprocessed at the same time
    :param shuffle: (bool) Shuffle the minibatch after each epoch
    :param start_process: (bool) Start the preprocessing process (default: True)
    :param backend: (str) joblib backend (one of 'multiprocessing', 'sequential', 'threading'
        or 'loky' in newest versions)
    :param sequential: (bool) Do not use subprocess to preprocess the data
        (slower but use less memory for the CI)
    :param partial_minibatch: (bool) Allow partial minibatches (minibatches with a number of element
        lesser than the batch_size)
    """

    def __init__(self, indices, observations, actions, batch_size, n_workers=1,
                 infinite_loop=True, max_queue_len=1, shuffle=False,
                 start_process=True, backend='threading', sequential=False, partial_minibatch=True):
        super(DataLoader, self).__init__()
        self.n_workers = n_workers
        self.infinite_loop = infinite_loop
        self.indices = indices
        # Keep a pristine copy so __iter__ can reset after an in-place shuffle.
        self.original_indices = indices.copy()
        self.n_minibatches = len(indices) // batch_size
        # Add a partial minibatch, for instance
        # when there is not enough samples
        if partial_minibatch and len(indices) % batch_size > 0:
            self.n_minibatches += 1
        self.batch_size = batch_size
        self.observations = observations
        self.actions = actions
        self.shuffle = shuffle
        self.queue = Queue(max_queue_len)
        self.process = None
        # If observations are strings, they are paths to images on disk.
        self.load_images = isinstance(observations[0], str)
        self.backend = backend
        self.sequential = sequential
        self.start_idx = 0
        if start_process:
            self.start_process()

    def start_process(self):
        """Start preprocessing process"""
        # Skip if in sequential mode
        if self.sequential:
            return
        self.process = Process(target=self._run)
        # Make it a daemon, so it will be deleted at the same time
        # of the main process
        self.process.daemon = True
        self.process.start()

    @property
    def _minibatch_indices(self):
        """
        Current minibatch indices given the current pointer
        (start_idx) and the minibatch size
        :return: (np.ndarray) 1D array of indices
        """
        return self.indices[self.start_idx:self.start_idx + self.batch_size]

    def sequential_next(self):
        """
        Sequential version of the pre-processing.

        :return: (np.ndarray, np.ndarray) observations and actions minibatch
        """
        # BUGFIX: was `>`, which yielded one spurious *empty* minibatch when
        # len(indices) is an exact multiple of batch_size.
        if self.start_idx >= len(self.indices):
            raise StopIteration

        if self.start_idx == 0:
            if self.shuffle:
                # Shuffle indices
                np.random.shuffle(self.indices)

        obs = self.observations[self._minibatch_indices]
        if self.load_images:
            obs = np.concatenate([self._make_batch_element(image_path) for image_path in obs],
                                 axis=0)

        actions = self.actions[self._minibatch_indices]
        self.start_idx += self.batch_size
        return obs, actions

    def _run(self):
        start = True
        with Parallel(n_jobs=self.n_workers, batch_size="auto", backend=self.backend) as parallel:
            while start or self.infinite_loop:
                start = False

                if self.shuffle:
                    np.random.shuffle(self.indices)

                for minibatch_idx in range(self.n_minibatches):

                    self.start_idx = minibatch_idx * self.batch_size

                    obs = self.observations[self._minibatch_indices]
                    if self.load_images:
                        if self.n_workers <= 1:
                            obs = [self._make_batch_element(image_path)
                                   for image_path in obs]

                        else:
                            obs = parallel(delayed(self._make_batch_element)(image_path)
                                           for image_path in obs)

                        obs = np.concatenate(obs, axis=0)

                    actions = self.actions[self._minibatch_indices]

                    self.queue.put((obs, actions))

                    # Free memory
                    del obs

                # None is the end-of-epoch sentinel consumed by __next__.
                self.queue.put(None)

    @classmethod
    def _make_batch_element(cls, image_path):
        """
        Process one element.

        :param image_path: (str) path to an image
        :return: (np.ndarray) the image with a leading batch axis of 1
        :raises ValueError: if the image cannot be loaded
        """
        # cv2.IMREAD_UNCHANGED is needed to load
        # grey and RGBa images
        image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
        # BUGFIX: cv2.imread returns None on failure; check it *before*
        # touching image.shape, otherwise a missing file raised
        # AttributeError instead of the intended ValueError.
        if image is None:
            raise ValueError("Tried to load {}, but it was not found".format(image_path))
        # Grey image: add an explicit channel axis
        if len(image.shape) == 2:
            image = image[:, :, np.newaxis]

        # Convert from BGR to RGB
        if image.shape[-1] == 3:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = image.reshape((1,) + image.shape)
        return image

    def __len__(self):
        return self.n_minibatches

    def __iter__(self):
        # Reset the epoch: pointer back to zero, indices to original order.
        self.start_idx = 0
        self.indices = self.original_indices.copy()
        return self

    def __next__(self):
        if self.sequential:
            return self.sequential_next()

        if self.process is None:
            raise ValueError("You must call .start_process() before using the dataloader")
        while True:
            try:
                val = self.queue.get_nowait()
                break
            except queue.Empty:
                # Busy-wait with a short sleep to avoid blocking the main thread.
                time.sleep(0.001)
                continue
        if val is None:
            raise StopIteration
        return val

    def __del__(self):
        if self.process is not None:
            self.process.terminate()
def generate_expert_traj(model, save_path=None, env=None, n_timesteps=0,
                         n_episodes=100, image_folder='recorded_images'):
    """
    Train expert controller (if needed) and record expert trajectories.

    .. note::

        only Box and Discrete spaces are supported for now.

    :param model: (RL model or callable) The expert model, if it needs to be trained,
        then you need to pass ``n_timesteps > 0``.
    :param save_path: (str) Path without the extension where the expert dataset will be saved
        (ex: 'expert_cartpole' -> creates 'expert_cartpole.npz').
        If not specified, it will not save, and just return the generated expert trajectories.
        This parameter must be specified for image-based environments.
    :param env: (gym.Env) The environment, if not defined then it tries to use the model
        environment.
    :param n_timesteps: (int) Number of training timesteps
    :param n_episodes: (int) Number of trajectories (episodes) to record
    :param image_folder: (str) When using images, folder that will be used to record images.
    :return: (dict) the generated expert trajectories.
    """

    # Retrieve the environment using the RL model
    if env is None and isinstance(model, BaseRLModel):
        env = model.get_env()

    assert env is not None, "You must set the env in the model or pass it to the function."

    is_vec_env = False
    if isinstance(env, VecEnv) and not isinstance(env, _UnvecWrapper):
        is_vec_env = True
        if env.num_envs > 1:
            warnings.warn("You are using multiple envs, only the data from the first one will be recorded.")

    # Sanity check
    assert (isinstance(env.observation_space, spaces.Box) or
            isinstance(env.observation_space, spaces.Discrete)), "Observation space type not supported"

    assert (isinstance(env.action_space, spaces.Box) or
            isinstance(env.action_space, spaces.Discrete)), "Action space type not supported"

    # Check if we need to record images: 3D uint8 observations with
    # 1/3/4 channels are treated as images and saved to disk.
    obs_space = env.observation_space
    record_images = len(obs_space.shape) == 3 and obs_space.shape[-1] in [1, 3, 4] \
                    and obs_space.dtype == np.uint8
    if record_images and save_path is None:
        warnings.warn("Observations are images but no save path was specified, so will save in numpy archive; "
                      "this can lead to higher memory usage.")
        record_images = False

    if not record_images and len(obs_space.shape) == 3 and obs_space.dtype == np.uint8:
        warnings.warn("The observations looks like images (shape = {}) "
                      "but the number of channel > 4, so it will be saved in the numpy archive "
                      "which can lead to high memory usage".format(obs_space.shape))

    image_ext = 'jpg'
    if record_images:
        # We save images as jpg or png, that have only 3/4 color channels
        if isinstance(env, VecFrameStack) and env.n_stack == 4:
            # assert env.n_stack < 5, "The current data recorder does no support"\
            #     "VecFrameStack with n_stack > 4"
            image_ext = 'png'

        # Images are stored next to the dataset archive.
        folder_path = os.path.dirname(save_path)
        image_folder = os.path.join(folder_path, image_folder)
        os.makedirs(image_folder, exist_ok=True)
        print("=" * 10)
        print("Images will be recorded to {}/".format(image_folder))
        print("Image shape: {}".format(obs_space.shape))
        print("=" * 10)

    # Optionally train the expert before recording.
    if n_timesteps > 0 and isinstance(model, BaseRLModel):
        model.learn(n_timesteps)

    actions = []
    observations = []
    rewards = []
    episode_returns = np.zeros((n_episodes,))
    episode_starts = []

    ep_idx = 0
    obs = env.reset()
    episode_starts.append(True)
    reward_sum = 0.0
    idx = 0
    # state and mask for recurrent policies
    state, mask = None, None

    if is_vec_env:
        mask = [True for _ in range(env.num_envs)]

    while ep_idx < n_episodes:
        # With a VecEnv, only the first sub-environment is recorded.
        obs_ = obs[0] if is_vec_env else obs
        if record_images:
            image_path = os.path.join(image_folder, "{}.{}".format(idx, image_ext))
            # Convert from RGB to BGR
            # which is the format OpenCV expect
            if obs_.shape[-1] == 3:
                obs_ = cv2.cvtColor(obs_, cv2.COLOR_RGB2BGR)
            cv2.imwrite(image_path, obs_)
            # Store the relative path instead of the pixels to save memory.
            observations.append(image_path)
        else:
            observations.append(obs_)

        if isinstance(model, BaseRLModel):
            action, state = model.predict(obs, state=state, mask=mask)
        else:
            # The expert may be a plain callable obs -> action.
            action = model(obs)

        obs, reward, done, _ = env.step(action)

        # Use only first env
        if is_vec_env:
            mask = [done[0] for _ in range(env.num_envs)]
            action = np.array([action[0]])
            reward = np.array([reward[0]])
            done = np.array([done[0]])

        actions.append(action)
        rewards.append(reward)
        # episode_starts is appended *after* the step, so entry i+1 marks
        # whether step i ended an episode; the last entry is dropped below.
        episode_starts.append(done)
        # NOTE(review): with a VecEnv, reward is a length-1 ndarray here, so
        # reward_sum becomes an ndarray too — episode_returns[ep_idx] still
        # accepts it, but confirm this is intended.
        reward_sum += reward
        idx += 1
        if done:
            if not is_vec_env:
                # A VecEnv resets automatically; a plain env does not.
                obs = env.reset()
                # Reset the state in case of a recurrent policy
                state = None

            episode_returns[ep_idx] = reward_sum
            reward_sum = 0.0
            ep_idx += 1

    # Stack the recorded data into flat arrays of shape (T, ...) expected
    # by ExpertDataset.
    if isinstance(env.observation_space, spaces.Box) and not record_images:
        observations = np.concatenate(observations).reshape((-1,) + env.observation_space.shape)
    elif isinstance(env.observation_space, spaces.Discrete):
        observations = np.array(observations).reshape((-1, 1))
    elif record_images:
        observations = np.array(observations)

    if isinstance(env.action_space, spaces.Box):
        actions = np.concatenate(actions).reshape((-1,) + env.action_space.shape)
    elif isinstance(env.action_space, spaces.Discrete):
        actions = np.array(actions).reshape((-1, 1))

    rewards = np.array(rewards)
    episode_starts = np.array(episode_starts[:-1])

    assert len(observations) == len(actions)

    # pytype: disable=attribute-error
    numpy_dict = {
        'actions': actions,
        'obs': observations,
        'rewards': rewards,
        'episode_returns': episode_returns,
        'episode_starts': episode_starts
    }

    for key, val in numpy_dict.items():
        print(key, val.shape)
    # pytype: enable=attribute-error

    if save_path is not None:
        np.savez(save_path, **numpy_dict)

    env.close()

    return numpy_dict
class GAIL(TRPO):
    """
    Generative Adversarial Imitation Learning (GAIL).

    GAIL is implemented as TRPO with an adversarial discriminator providing
    the reward signal; this subclass only stores the GAIL-specific settings
    and flips the ``using_gail`` switch before building the TRPO graph.

    .. warning::

        Images are not yet handled properly by the current implementation

    :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param expert_dataset: (ExpertDataset) the dataset manager
    :param hidden_size_adversary: (int) the hidden dimension for the discriminator MLP
    :param adversary_entcoeff: (float) entropy coefficient for the discriminator
    :param g_step: (int) number of steps to train policy in each epoch
    :param d_step: (int) number of steps to train discriminator in each epoch
    :param d_stepsize: (float) the reward giver stepsize
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance

    Remaining keyword arguments (gamma, timesteps_per_batch, max_kl, cg_iters,
    lam, entcoeff, cg_damping, vf_stepsize, vf_iters, full_tensorboard_log, ...)
    are forwarded unchanged to the TRPO constructor.
    """

    def __init__(self, policy, env, expert_dataset=None,
                 hidden_size_adversary=100, adversary_entcoeff=1e-3,
                 g_step=3, d_step=1, d_stepsize=3e-4, verbose=0,
                 _init_setup_model=True, **kwargs):
        # Defer graph construction: the GAIL-specific attributes below must
        # exist before setup_model() runs.
        super().__init__(policy, env, verbose=verbose, _init_setup_model=False, **kwargs)
        self.using_gail = True
        self.expert_dataset = expert_dataset
        self.hidden_size_adversary = hidden_size_adversary
        self.adversary_entcoeff = adversary_entcoeff
        self.g_step = g_step
        self.d_step = d_step
        self.d_stepsize = d_stepsize

        if _init_setup_model:
            self.setup_model()

    def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="GAIL",
              reset_num_timesteps=True):
        # Training without demonstrations makes no sense for GAIL.
        assert self.expert_dataset is not None, "You must pass an expert dataset to GAIL for training"
        return super().learn(total_timesteps, callback=callback, log_interval=log_interval,
                             tb_log_name=tb_log_name, reset_num_timesteps=reset_num_timesteps)
class HER(BaseRLModel):
    """
    Hindsight Experience Replay (HER) https://arxiv.org/abs/1707.01495

    :param policy: (BasePolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param model_class: (OffPolicyRLModel) The off policy RL model to apply Hindsight Experience Replay
        currently supported: DQN, DDPG, SAC
    :param n_sampled_goal: (int) number of artificial goals sampled per real transition
    :param goal_selection_strategy: (GoalSelectionStrategy or str) strategy used
        to pick the substituted goals ('future', 'final', 'episode' or 'random')
    """

    def __init__(self, policy, env, model_class, n_sampled_goal=4,
                 goal_selection_strategy='future', *args, **kwargs):

        assert not isinstance(env, VecEnvWrapper), "HER does not support VecEnvWrapper"

        super().__init__(policy=policy, env=env, verbose=kwargs.get('verbose', 0),
                         policy_base=None, requires_vec_env=False)

        self.model_class = model_class
        self.replay_wrapper = None
        # Save dict observation space (used for checks at loading time)
        if env is not None:
            self.observation_space = env.observation_space
            self.action_space = env.action_space

        # Convert string to GoalSelectionStrategy object
        if isinstance(goal_selection_strategy, str):
            assert goal_selection_strategy in KEY_TO_GOAL_STRATEGY.keys(), "Unknown goal selection strategy"
            goal_selection_strategy = KEY_TO_GOAL_STRATEGY[goal_selection_strategy]

        self.n_sampled_goal = n_sampled_goal
        self.goal_selection_strategy = goal_selection_strategy

        if self.env is not None:
            self._create_replay_wrapper(self.env)

        assert issubclass(model_class, OffPolicyRLModel), \
            "Error: HER only works with Off policy model (such as DDPG, SAC, TD3 and DQN)."

        # HER delegates all actual learning to the wrapped off-policy model.
        self.model = self.model_class(policy, self.env, *args, **kwargs)
        # Patch to support saving/loading: the inner model's save path goes
        # through HER so the HER-specific metadata is included.
        self.model._save_to_file = self._save_to_file

    def _create_replay_wrapper(self, env):
        """
        Wrap the environment in a HERGoalEnvWrapper
        if needed and create the replay buffer wrapper.

        :param env: (gym.GoalEnv or HERGoalEnvWrapper)
        """
        if not isinstance(env, HERGoalEnvWrapper):
            env = HERGoalEnvWrapper(env)

        self.env = env
        # NOTE: we cannot do that check directly with VecEnv
        # maybe we can try calling `compute_reward()` ?
        # assert isinstance(self.env, gym.GoalEnv), "HER only supports gym.GoalEnv"

        # Partially-applied constructor: the inner model will call this with
        # its own replay buffer to get the HER wrapper around it.
        self.replay_wrapper = functools.partial(HindsightExperienceReplayWrapper,
                                                n_sampled_goal=self.n_sampled_goal,
                                                goal_selection_strategy=self.goal_selection_strategy,
                                                wrapped_env=self.env)

    def set_env(self, env):
        assert not isinstance(env, VecEnvWrapper), "HER does not support VecEnvWrapper"
        super().set_env(env)
        self._create_replay_wrapper(self.env)
        self.model.set_env(self.env)

    def get_env(self):
        return self.env

    def get_parameter_list(self):
        return self.model.get_parameter_list()

    def __getattr__(self, attr):
        """
        Forward unknown attribute lookups to the wrapped RL model.

        :param attr: (str)
        :return: (Any)
        """
        # NOTE(review): __getattr__ is only invoked after normal lookup fails,
        # so the self.__dict__ branch below is effectively dead — TODO confirm.
        if attr in self.__dict__:
            return getattr(self, attr)
        return getattr(self.model, attr)

    def __set_attr__(self, attr, value):
        # NOTE(review): this is not the real dunder (__setattr__), so Python
        # never calls it on attribute assignment; it only works if invoked
        # explicitly — confirm whether this was intended.
        if attr in self.__dict__:
            setattr(self, attr, value)
        else:
            setattr(self.model, attr, value)

    def _get_pretrain_placeholders(self):
        return self.model._get_pretrain_placeholders()

    def setup_model(self):
        # Nothing to build here: the wrapped model builds its own graph.
        pass

    def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="HER",
              reset_num_timesteps=True):
        return self.model.learn(total_timesteps, callback=callback, log_interval=log_interval,
                                tb_log_name=tb_log_name, reset_num_timesteps=reset_num_timesteps,
                                replay_wrapper=self.replay_wrapper)

    def _check_obs(self, observation):
        """
        Convert a dict observation to the flat array the inner model expects.

        :param observation: (dict) GoalEnv-style observation
        :return: (np.ndarray or [np.ndarray]) flattened observation
        """
        if isinstance(observation, dict):
            if self.env is not None:
                if len(observation['observation'].shape) > 1:
                    # Batched (vectorized) observation: unvec then wrap in a list.
                    observation = _UnvecWrapper.unvec_obs(observation)
                    return [self.env.convert_dict_to_obs(observation)]
                return self.env.convert_dict_to_obs(observation)
            else:
                raise ValueError("You must either pass an env to HER or wrap your env using HERGoalEnvWrapper")
        # NOTE(review): non-dict observations are returned unchanged.
        return observation

    def predict(self, observation, state=None, mask=None, deterministic=True):
        return self.model.predict(self._check_obs(observation), state, mask, deterministic)

    def action_probability(self, observation, state=None, mask=None, actions=None, logp=False):
        return self.model.action_probability(self._check_obs(observation), state, mask, actions, logp)

    def _save_to_file(self, save_path, data=None, params=None, cloudpickle=False):
        # HACK to save the replay wrapper
        # or better to save only the replay strategy and its params?
        # it will not work with VecEnv
        data['n_sampled_goal'] = self.n_sampled_goal
        data['goal_selection_strategy'] = self.goal_selection_strategy
        data['model_class'] = self.model_class
        data['her_obs_space'] = self.observation_space
        data['her_action_space'] = self.action_space
        super()._save_to_file(save_path, data, params, cloudpickle=cloudpickle)

    def save(self, save_path, cloudpickle=False):
        # Saving goes through the inner model, whose _save_to_file was
        # patched in __init__ to include the HER metadata above.
        self.model.save(save_path, cloudpickle=cloudpickle)

    @classmethod
    def load(cls, load_path, env=None, custom_objects=None, **kwargs):
        data, _ = cls._load_from_file(load_path, custom_objects=custom_objects)

        if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data['policy_kwargs']:
            raise ValueError("The specified policy kwargs do not equal the stored policy kwargs. "
                             "Stored kwargs: {}, specified kwargs: {}".format(data['policy_kwargs'],
                                                                              kwargs['policy_kwargs']))

        # Rebuild the HER wrapper first, then load the inner model into it.
        model = cls(policy=data["policy"], env=env, model_class=data['model_class'],
                    n_sampled_goal=data['n_sampled_goal'],
                    goal_selection_strategy=data['goal_selection_strategy'],
                    _init_setup_model=False)
        model.__dict__['observation_space'] = data['her_obs_space']
        model.__dict__['action_space'] = data['her_action_space']
        model.model = data['model_class'].load(load_path, model.get_env(), **kwargs)
        model.model._save_to_file = model._save_to_file
        return model
- """ - # Select a goal that was achieved - # after the current step, in the same episode - FUTURE = 0 - # Select the goal that was achieved - # at the end of the episode - FINAL = 1 - # Select a goal that was achieved in the episode - EPISODE = 2 - # Select a goal that was achieved - # at some point in the training procedure - # (and that is present in the replay buffer) - RANDOM = 3 - - -# For convenience -# that way, we can use string to select a strategy -KEY_TO_GOAL_STRATEGY = { - 'future': GoalSelectionStrategy.FUTURE, - 'final': GoalSelectionStrategy.FINAL, - 'episode': GoalSelectionStrategy.EPISODE, - 'random': GoalSelectionStrategy.RANDOM -} - - -class HindsightExperienceReplayWrapper(object): - """ - Wrapper around a replay buffer in order to use HER. - This implementation is inspired by to the one found in https://github.com/NervanaSystems/coach/. - - :param replay_buffer: (ReplayBuffer) - :param n_sampled_goal: (int) The number of artificial transitions to generate for each actual transition - :param goal_selection_strategy: (GoalSelectionStrategy) The method that will be used to generate - the goals for the artificial transitions. 
class HindsightExperienceReplayWrapper(object):
    """
    Wrapper around a replay buffer in order to use HER.
    This implementation is inspired by to the one found in https://github.com/NervanaSystems/coach/.

    :param replay_buffer: (ReplayBuffer)
    :param n_sampled_goal: (int) The number of artificial transitions to generate for each actual transition
    :param goal_selection_strategy: (GoalSelectionStrategy) The method that will be used to generate
        the goals for the artificial transitions.
    :param wrapped_env: (HERGoalEnvWrapper) the GoalEnv wrapped using HERGoalEnvWrapper,
        that enables to convert observation to dict, and vice versa
    """

    def __init__(self, replay_buffer, n_sampled_goal, goal_selection_strategy, wrapped_env):
        super(HindsightExperienceReplayWrapper, self).__init__()

        assert isinstance(goal_selection_strategy, GoalSelectionStrategy), "Invalid goal selection strategy," \
                                                                           "please use one of {}".format(
            list(GoalSelectionStrategy))

        self.n_sampled_goal = n_sampled_goal
        self.goal_selection_strategy = goal_selection_strategy
        self.env = wrapped_env
        # Buffer for storing transitions of the current episode
        self.episode_transitions = []
        self.replay_buffer = replay_buffer

    def add(self, obs_t, action, reward, obs_tp1, done, info):
        """
        add a new transition to the buffer

        :param obs_t: (np.ndarray) the last observation
        :param action: ([float]) the action
        :param reward: (float) the reward of the transition
        :param obs_tp1: (np.ndarray) the new observation
        :param done: (bool) is the episode done
        :param info: (dict) extra values used to compute reward
        """
        assert self.replay_buffer is not None
        # Update current episode buffer
        self.episode_transitions.append((obs_t, action, reward, obs_tp1, done, info))
        if done:
            # Add transitions (and imagined ones) to buffer only when an episode is over
            self._store_episode()
            # Reset episode buffer
            self.episode_transitions = []

    def sample(self, *args, **kwargs):
        # Sampling is delegated unchanged to the underlying replay buffer.
        return self.replay_buffer.sample(*args, **kwargs)

    def can_sample(self, n_samples):
        """
        Check if n_samples samples can be sampled
        from the buffer.

        :param n_samples: (int)
        :return: (bool)
        """
        return self.replay_buffer.can_sample(n_samples)

    def __len__(self):
        return len(self.replay_buffer)

    def _sample_achieved_goal(self, episode_transitions, transition_idx):
        """
        Sample an achieved goal according to the sampling strategy.

        :param episode_transitions: ([tuple]) a list of all the transitions in the current episode
        :param transition_idx: (int) the transition to start sampling from
        :return: (np.ndarray) an achieved goal
        """
        if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE:
            # Sample a goal that was observed in the same episode after the current step
            # (the transition_idx == last-step case is excluded by the caller's break)
            selected_idx = np.random.choice(np.arange(transition_idx + 1, len(episode_transitions)))
            selected_transition = episode_transitions[selected_idx]
        elif self.goal_selection_strategy == GoalSelectionStrategy.FINAL:
            # Choose the goal achieved at the end of the episode
            selected_transition = episode_transitions[-1]
        elif self.goal_selection_strategy == GoalSelectionStrategy.EPISODE:
            # Random goal achieved during the episode
            selected_idx = np.random.choice(np.arange(len(episode_transitions)))
            selected_transition = episode_transitions[selected_idx]
        elif self.goal_selection_strategy == GoalSelectionStrategy.RANDOM:
            # Random goal achieved, from the entire replay buffer
            selected_idx = np.random.choice(np.arange(len(self.replay_buffer)))
            selected_transition = self.replay_buffer.storage[selected_idx]
        else:
            raise ValueError("Invalid goal selection strategy,"
                             "please use one of {}".format(list(GoalSelectionStrategy)))
        # The achieved goal is read from the *observation* (index 0) of the
        # selected transition, after converting it back to a dict.
        return self.env.convert_obs_to_dict(selected_transition[0])['achieved_goal']

    def _sample_achieved_goals(self, episode_transitions, transition_idx):
        """
        Sample a batch of achieved goals according to the sampling strategy.

        :param episode_transitions: ([tuple]) list of the transitions in the current episode
        :param transition_idx: (int) the transition to start sampling from
        :return: (np.ndarray) an achieved goal
        """
        return [
            self._sample_achieved_goal(episode_transitions, transition_idx)
            for _ in range(self.n_sampled_goal)
        ]

    def _store_episode(self):
        """
        Sample artificial goals and store transition of the current
        episode in the replay buffer.
        This method is called only after each end of episode.
        """
        # For each transition in the last episode,
        # create a set of artificial transitions
        for transition_idx, transition in enumerate(self.episode_transitions):

            obs_t, action, reward, obs_tp1, done, info = transition

            # Add to the replay buffer (the real, un-relabeled transition)
            self.replay_buffer.add(obs_t, action, reward, obs_tp1, done)

            # We cannot sample a goal from the future in the last step of an episode
            if (transition_idx == len(self.episode_transitions) - 1 and
                    self.goal_selection_strategy == GoalSelectionStrategy.FUTURE):
                break

            # Sampled n goals per transition, where n is `n_sampled_goal`
            # this is called k in the paper
            sampled_goals = self._sample_achieved_goals(self.episode_transitions, transition_idx)
            # For each sampled goals, store a new transition
            for goal in sampled_goals:
                # Copy transition to avoid modifying the original one
                obs, action, reward, next_obs, done, info = copy.deepcopy(transition)

                # Convert concatenated obs to dict, so we can update the goals
                obs_dict, next_obs_dict = map(self.env.convert_obs_to_dict, (obs, next_obs))

                # Update the desired goal in the transition
                obs_dict['desired_goal'] = goal
                next_obs_dict['desired_goal'] = goal

                # Update the reward according to the new desired goal
                reward = self.env.compute_reward(next_obs_dict['achieved_goal'], goal, info)
                # Can we use achieved_goal == desired_goal?
                done = False

                # Transform back to ndarrays
                obs, next_obs = map(self.env.convert_dict_to_obs, (obs_dict, next_obs_dict))

                # Add artificial transition to the replay buffer
                self.replay_buffer.add(obs, action, reward, next_obs, done)
spaces.MultiBinary): - total_dim = self.obs_dim + 2 * self.goal_dim - self.observation_space = spaces.MultiBinary(total_dim) - - elif isinstance(self.spaces[0], spaces.Box): - lows = np.concatenate([space.low for space in self.spaces]) - highs = np.concatenate([space.high for space in self.spaces]) - self.observation_space = spaces.Box(lows, highs, dtype=np.float32) - - elif isinstance(self.spaces[0], spaces.Discrete): - dimensions = [env.observation_space.spaces[key].n for key in KEY_ORDER] - self.observation_space = spaces.MultiDiscrete(dimensions) - - else: - raise NotImplementedError("{} space is not supported".format(type(self.spaces[0]))) - - def convert_dict_to_obs(self, obs_dict): - """ - :param obs_dict: (dict) - :return: (np.ndarray) - """ - # Note: achieved goal is not removed from the observation - # this is helpful to have a revertible transformation - if isinstance(self.observation_space, spaces.MultiDiscrete): - # Special case for multidiscrete - return np.concatenate([[int(obs_dict[key])] for key in KEY_ORDER]) - return np.concatenate([obs_dict[key] for key in KEY_ORDER]) - - def convert_obs_to_dict(self, observations): - """ - Inverse operation of convert_dict_to_obs - - :param observations: (np.ndarray) - :return: (OrderedDict) - """ - return OrderedDict([ - ('observation', observations[:self.obs_dim]), - ('achieved_goal', observations[self.obs_dim:self.obs_dim + self.goal_dim]), - ('desired_goal', observations[self.obs_dim + self.goal_dim:]), - ]) - - def step(self, action): - obs, reward, done, info = self.env.step(action) - return self.convert_dict_to_obs(obs), reward, done, info - - def seed(self, seed=None): - return self.env.seed(seed) - - def reset(self): - return self.convert_dict_to_obs(self.env.reset()) - - def compute_reward(self, achieved_goal, desired_goal, info): - return self.env.compute_reward(achieved_goal, desired_goal, info) - - def render(self, mode='human'): - return self.env.render(mode) - - def close(self): - return 
self.env.close() diff --git a/stable_baselines/logger.py b/stable_baselines/logger.py deleted file mode 100644 index e094acce..00000000 --- a/stable_baselines/logger.py +++ /dev/null @@ -1,745 +0,0 @@ -import os -import sys -import shutil -import json -import time -import datetime -import tempfile -import warnings -from collections import defaultdict -from typing import Optional - -import tensorflow as tf -from tensorflow.python import pywrap_tensorflow -from tensorflow.core.util import event_pb2 -from tensorflow.python.util import compat - -from stable_baselines.common.misc_util import mpi_rank_or_zero - -DEBUG = 10 -INFO = 20 -WARN = 30 -ERROR = 40 - -DISABLED = 50 - - -class KVWriter(object): - """ - Key Value writer - """ - def writekvs(self, kvs): - """ - write a dictionary to file - - :param kvs: (dict) - """ - raise NotImplementedError - - -class SeqWriter(object): - """ - sequence writer - """ - def writeseq(self, seq): - """ - write an array to file - - :param seq: (list) - """ - raise NotImplementedError - - -class HumanOutputFormat(KVWriter, SeqWriter): - def __init__(self, filename_or_file): - """ - log to a file, in a human readable format - - :param filename_or_file: (str or File) the file to write the log to - """ - if isinstance(filename_or_file, str): - self.file = open(filename_or_file, 'wt') - self.own_file = True - else: - assert hasattr(filename_or_file, 'write'), 'Expected file or str, got {}'.format(filename_or_file) - self.file = filename_or_file - self.own_file = False - - def writekvs(self, kvs): - # Create strings for printing - key2str = {} - for (key, val) in sorted(kvs.items()): - if isinstance(val, float): - valstr = '%-8.3g' % (val,) - else: - valstr = str(val) - key2str[self._truncate(key)] = self._truncate(valstr) - - # Find max widths - if len(key2str) == 0: - warnings.warn('Tried to write empty key-value dict') - return - else: - keywidth = max(map(len, key2str.keys())) - valwidth = max(map(len, key2str.values())) - - # Write out 
the data - dashes = '-' * (keywidth + valwidth + 7) - lines = [dashes] - for (key, val) in sorted(key2str.items()): - lines.append('| %s%s | %s%s |' % ( - key, - ' ' * (keywidth - len(key)), - val, - ' ' * (valwidth - len(val)), - )) - lines.append(dashes) - self.file.write('\n'.join(lines) + '\n') - - # Flush the output to the file - self.file.flush() - - @classmethod - def _truncate(cls, string): - return string[:20] + '...' if len(string) > 23 else string - - def writeseq(self, seq): - seq = list(seq) - for (i, elem) in enumerate(seq): - self.file.write(elem) - if i < len(seq) - 1: # add space unless this is the last one - self.file.write(' ') - self.file.write('\n') - self.file.flush() - - def close(self): - """ - closes the file - """ - if self.own_file: - self.file.close() - - -class JSONOutputFormat(KVWriter): - def __init__(self, filename): - """ - log to a file, in the JSON format - - :param filename: (str) the file to write the log to - """ - self.file = open(filename, 'wt') - - def writekvs(self, kvs): - for key, value in sorted(kvs.items()): - if hasattr(value, 'dtype'): - if value.shape == () or len(value) == 1: # pytype: disable=attribute-error - # if value is a dimensionless numpy array or of length 1, serialize as a float - kvs[key] = float(value) - else: - # otherwise, a value is a numpy array, serialize as a list or nested lists - kvs[key] = value.tolist() # pytype: disable=attribute-error - self.file.write(json.dumps(kvs) + '\n') - self.file.flush() - - def close(self): - """ - closes the file - """ - self.file.close() - - -class CSVOutputFormat(KVWriter): - def __init__(self, filename): - """ - log to a file, in a CSV format - - :param filename: (str) the file to write the log to - """ - self.file = open(filename, 'w+t') - self.keys = [] - self.sep = ',' - - def writekvs(self, kvs): - # Add our current row to the history - extra_keys = kvs.keys() - self.keys - if extra_keys: - self.keys.extend(extra_keys) - self.file.seek(0) - lines = 
self.file.readlines() - self.file.seek(0) - for (i, key) in enumerate(self.keys): - if i > 0: - self.file.write(',') - self.file.write(key) - self.file.write('\n') - for line in lines[1:]: - self.file.write(line[:-1]) - self.file.write(self.sep * len(extra_keys)) - self.file.write('\n') - for i, key in enumerate(self.keys): - if i > 0: - self.file.write(',') - value = kvs.get(key) - if value is not None: - self.file.write(str(value)) - self.file.write('\n') - self.file.flush() - - def close(self): - """ - closes the file - """ - self.file.close() - - -def summary_val(key, value): - """ - :param key: (str) - :param value: (float) - """ - kwargs = {'tag': key, 'simple_value': float(value)} - return tf.Summary.Value(**kwargs) - - -def valid_float_value(value): - """ - Returns True if the value can be successfully cast into a float - - :param value: (Any) the value to check - :return: (bool) - """ - try: - float(value) - return True - except TypeError: - return False - - -class TensorBoardOutputFormat(KVWriter): - def __init__(self, folder): - """ - Dumps key/value pairs into TensorBoard's numeric format. - - :param folder: (str) the folder to write the log to - """ - os.makedirs(folder, exist_ok=True) - self.dir = folder - self.step = 1 - prefix = 'events' - path = os.path.join(os.path.abspath(folder), prefix) - self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) # type: pywrap_tensorflow.EventsWriter - - def writekvs(self, kvs): - summary = tf.Summary(value=[summary_val(k, v) for k, v in kvs.items() if valid_float_value(v)]) - event = event_pb2.Event(wall_time=time.time(), summary=summary) - event.step = self.step # is there any reason why you'd want to specify the step? 
- if self.writer is None: - raise ValueError("Attempt to write after close().") - self.writer.WriteEvent(event) - self.writer.Flush() - self.step += 1 - - def close(self): - """ - closes the file - """ - if self.writer: - self.writer.Close() - self.writer = None - - -def make_output_format(_format, ev_dir, log_suffix=''): - """ - return a logger for the requested format - - :param _format: (str) the requested format to log to ('stdout', 'log', 'json', 'csv' or 'tensorboard') - :param ev_dir: (str) the logging directory - :param log_suffix: (str) the suffix for the log file - :return: (KVWrite) the logger - """ - os.makedirs(ev_dir, exist_ok=True) - if _format == 'stdout': - return HumanOutputFormat(sys.stdout) - elif _format == 'log': - return HumanOutputFormat(os.path.join(ev_dir, 'log%s.txt' % log_suffix)) - elif _format == 'json': - return JSONOutputFormat(os.path.join(ev_dir, 'progress%s.json' % log_suffix)) - elif _format == 'csv': - return CSVOutputFormat(os.path.join(ev_dir, 'progress%s.csv' % log_suffix)) - elif _format == 'tensorboard': - return TensorBoardOutputFormat(os.path.join(ev_dir, 'tb%s' % log_suffix)) - else: - raise ValueError('Unknown format specified: %s' % (_format,)) - - -# ================================================================ -# API -# ================================================================ - -def logkv(key, val): - """ - Log a value of some diagnostic - Call this once for each diagnostic quantity, each iteration - If called many times, last value will be used. - - :param key: (Any) save to log this key - :param val: (Any) save to log this value - """ - Logger.CURRENT.logkv(key, val) - - -def logkv_mean(key, val): - """ - The same as logkv(), but if called many times, values averaged. 
- - :param key: (Any) save to log this key - :param val: (Number) save to log this value - """ - Logger.CURRENT.logkv_mean(key, val) - - -def logkvs(key_values): - """ - Log a dictionary of key-value pairs - - :param key_values: (dict) the list of keys and values to save to log - """ - for key, value in key_values.items(): - logkv(key, value) - - -def dumpkvs(): - """ - Write all of the diagnostics from the current iteration - """ - Logger.CURRENT.dumpkvs() - - -def getkvs(): - """ - get the key values logs - - :return: (dict) the logged values - """ - return Logger.CURRENT.name2val - - -def log(*args, level=INFO): - """ - Write the sequence of args, with no separators, - to the console and output files (if you've configured an output file). - - level: int. (see logger.py docs) If the global logger level is higher than - the level argument here, don't print to stdout. - - :param args: (list) log the arguments - :param level: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) - """ - Logger.CURRENT.log(*args, level=level) - - -def debug(*args): - """ - Write the sequence of args, with no separators, - to the console and output files (if you've configured an output file). - Using the DEBUG level. - - :param args: (list) log the arguments - """ - log(*args, level=DEBUG) - - -def info(*args): - """ - Write the sequence of args, with no separators, - to the console and output files (if you've configured an output file). - Using the INFO level. - - :param args: (list) log the arguments - """ - log(*args, level=INFO) - - -def warn(*args): - """ - Write the sequence of args, with no separators, - to the console and output files (if you've configured an output file). - Using the WARN level. - - :param args: (list) log the arguments - """ - log(*args, level=WARN) - - -def error(*args): - """ - Write the sequence of args, with no separators, - to the console and output files (if you've configured an output file). - Using the ERROR level. 
- - :param args: (list) log the arguments - """ - log(*args, level=ERROR) - - -def set_level(level): - """ - Set logging threshold on current logger. - - :param level: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) - """ - Logger.CURRENT.set_level(level) - - -def get_level(): - """ - Get logging threshold on current logger. - :return: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) - """ - return Logger.CURRENT.level - - -def get_dir(): - """ - Get directory that log files are being written to. - will be None if there is no output directory (i.e., if you didn't call start) - - :return: (str) the logging directory - """ - return Logger.CURRENT.get_dir() - - -record_tabular = logkv -dump_tabular = dumpkvs - - -class ProfileKV: - def __init__(self, name): - """ - Usage: - with logger.ProfileKV("interesting_scope"): - code - - :param name: (str) the profiling name - """ - self.name = "wait_" + name - - def __enter__(self): - self.start_time = time.time() - - def __exit__(self, _type, value, traceback): - Logger.CURRENT.name2val[self.name] += time.time() - self.start_time - - -def profile(name): - """ - Usage: - @profile("my_func") - def my_func(): code - - :param name: (str) the profiling name - :return: (function) the wrapped function - """ - def decorator_with_name(func): - def func_wrapper(*args, **kwargs): - with ProfileKV(name): - return func(*args, **kwargs) - - return func_wrapper - - return decorator_with_name - - -# ================================================================ -# Backend -# ================================================================ - -class Logger(object): - # A logger with no output files. 
(See right below class definition) - # So that you can still log to the terminal without setting up any output files - DEFAULT = None # type: Optional["Logger"] - # Current logger being used by the free functions above - CURRENT = None # type: Optional["Logger"] - - def __init__(self, folder, output_formats): - """ - the logger class - - :param folder: (str) the logging location - :param output_formats: ([str]) the list of output format - """ - self.name2val = defaultdict(float) # values this iteration - self.name2cnt = defaultdict(int) - self.level = INFO - self.dir = folder - self.output_formats = output_formats - - # Logging API, forwarded - # ---------------------------------------- - def logkv(self, key, val): - """ - Log a value of some diagnostic - Call this once for each diagnostic quantity, each iteration - If called many times, last value will be used. - - :param key: (Any) save to log this key - :param val: (Any) save to log this value - """ - self.name2val[key] = val - - def logkv_mean(self, key, val): - """ - The same as logkv(), but if called many times, values averaged. - - :param key: (Any) save to log this key - :param val: (Number) save to log this value - """ - if val is None: - self.name2val[key] = None - return - oldval, cnt = self.name2val[key], self.name2cnt[key] - self.name2val[key] = oldval * cnt / (cnt + 1) + val / (cnt + 1) - self.name2cnt[key] = cnt + 1 - - def dumpkvs(self): - """ - Write all of the diagnostics from the current iteration - """ - if self.level == DISABLED: - return - for fmt in self.output_formats: - if isinstance(fmt, KVWriter): - fmt.writekvs(self.name2val) - self.name2val.clear() - self.name2cnt.clear() - - def log(self, *args, level=INFO): - """ - Write the sequence of args, with no separators, - to the console and output files (if you've configured an output file). - - level: int. (see logger.py docs) If the global logger level is higher than - the level argument here, don't print to stdout. 
- - :param args: (list) log the arguments - :param level: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) - """ - if self.level <= level: - self._do_log(args) - - # Configuration - # ---------------------------------------- - def set_level(self, level): - """ - Set logging threshold on current logger. - - :param level: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) - """ - self.level = level - - def get_dir(self): - """ - Get directory that log files are being written to. - will be None if there is no output directory (i.e., if you didn't call start) - - :return: (str) the logging directory - """ - return self.dir - - def close(self): - """ - closes the file - """ - for fmt in self.output_formats: - fmt.close() - - # Misc - # ---------------------------------------- - def _do_log(self, args): - """ - log to the requested format outputs - - :param args: (list) the arguments to log - """ - for fmt in self.output_formats: - if isinstance(fmt, SeqWriter): - fmt.writeseq(map(str, args)) - - -Logger.DEFAULT = Logger.CURRENT = Logger(folder=None, output_formats=[HumanOutputFormat(sys.stdout)]) - - -def configure(folder=None, format_strs=None): - """ - configure the current logger - - :param folder: (str) the save location (if None, $OPENAI_LOGDIR, if still None, tempdir/openai-[date & time]) - :param format_strs: (list) the output logging format - (if None, $OPENAI_LOG_FORMAT, if still None, ['stdout', 'log', 'csv']) - """ - if folder is None: - folder = os.getenv('OPENAI_LOGDIR') - if folder is None: - folder = os.path.join(tempfile.gettempdir(), datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) - assert isinstance(folder, str) - os.makedirs(folder, exist_ok=True) - rank = mpi_rank_or_zero() - - log_suffix = '' - if format_strs is None: - if rank == 0: - format_strs = os.getenv('OPENAI_LOG_FORMAT', 'stdout,log,csv').split(',') - else: - log_suffix = "-rank%03i" % rank - format_strs = 
os.getenv('OPENAI_LOG_FORMAT_MPI', 'log').split(',') - format_strs = filter(None, format_strs) - output_formats = [make_output_format(f, folder, log_suffix) for f in format_strs] - - Logger.CURRENT = Logger(folder=folder, output_formats=output_formats) - log('Logging to %s' % folder) - - -def reset(): - """ - reset the current logger - """ - if Logger.CURRENT is not Logger.DEFAULT: - Logger.CURRENT.close() - Logger.CURRENT = Logger.DEFAULT - log('Reset logger') - - -class ScopedConfigure(object): - def __init__(self, folder=None, format_strs=None): - """ - Class for using context manager while logging - - usage: - with ScopedConfigure(folder=None, format_strs=None): - {code} - - :param folder: (str) the logging folder - :param format_strs: ([str]) the list of output logging format - """ - self.dir = folder - self.format_strs = format_strs - self.prevlogger = None - - def __enter__(self): - self.prevlogger = Logger.CURRENT - configure(folder=self.dir, format_strs=self.format_strs) - - def __exit__(self, *args): - Logger.CURRENT.close() - Logger.CURRENT = self.prevlogger - - -# ================================================================ - -def _demo(): - """ - tests for the logger module - """ - info("hi") - debug("shouldn't appear") - set_level(DEBUG) - debug("should appear") - folder = "/tmp/testlogging" - if os.path.exists(folder): - shutil.rmtree(folder) - configure(folder=folder) - logkv("a", 3) - logkv("b", 2.5) - dumpkvs() - logkv("b", -2.5) - logkv("a", 5.5) - dumpkvs() - info("^^^ should see a = 5.5") - logkv_mean("b", -22.5) - logkv_mean("b", -44.4) - logkv("a", 5.5) - dumpkvs() - with ScopedConfigure(None, None): - info("^^^ should see b = 33.3") - - with ScopedConfigure("/tmp/test-logger/", ["json"]): - logkv("b", -2.5) - dumpkvs() - - reset() - logkv("a", "longasslongasslongasslongasslongasslongassvalue") - dumpkvs() - warn("hey") - error("oh") - logkvs({"test": 1}) - - -# ================================================================ -# Readers 
-# ================================================================ - -def read_json(fname): - """ - read a json file using pandas - - :param fname: (str) the file path to read - :return: (pandas DataFrame) the data in the json - """ - import pandas - data = [] - with open(fname, 'rt') as file_handler: - for line in file_handler: - data.append(json.loads(line)) - return pandas.DataFrame(data) - - -def read_csv(fname): - """ - read a csv file using pandas - - :param fname: (str) the file path to read - :return: (pandas DataFrame) the data in the csv - """ - import pandas - return pandas.read_csv(fname, index_col=None, comment='#') - - -def read_tb(path): - """ - read a tensorboard output - - :param path: (str) a tensorboard file OR a directory, where we will find all TB files of the form events. - :return: (pandas DataFrame) the tensorboad data - """ - import pandas - import numpy as np - from glob import glob - # from collections import defaultdict - import tensorflow as tf - if os.path.isdir(path): - fnames = glob(os.path.join(path, "events.*")) - elif os.path.basename(path).startswith("events."): - fnames = [path] - else: - raise NotImplementedError("Expected tensorboard file or directory containing them. 
Got %s" % path) - tag2pairs = defaultdict(list) - maxstep = 0 - for fname in fnames: - for summary in tf.train.summary_iterator(fname): - if summary.step > 0: - for value in summary.summary.value: - pair = (summary.step, value.simple_value) - tag2pairs[value.tag].append(pair) - maxstep = max(summary.step, maxstep) - data = np.empty((maxstep, len(tag2pairs))) - data[:] = np.nan - tags = sorted(tag2pairs.keys()) - for (colidx, tag) in enumerate(tags): - pairs = tag2pairs[tag] - for (step, value) in pairs: - data[step - 1, colidx] = value - return pandas.DataFrame(data, columns=tags) - - -if __name__ == "__main__": - _demo() diff --git a/stable_baselines/ppo1/__init__.py b/stable_baselines/ppo1/__init__.py deleted file mode 100644 index e04efd2a..00000000 --- a/stable_baselines/ppo1/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from stable_baselines.ppo1.pposgd_simple import PPO1 diff --git a/stable_baselines/ppo1/experiments/train_cartpole.py b/stable_baselines/ppo1/experiments/train_cartpole.py deleted file mode 100644 index e3d2fa30..00000000 --- a/stable_baselines/ppo1/experiments/train_cartpole.py +++ /dev/null @@ -1,9 +0,0 @@ -""" -Simple test to check that PPO1 is running with no errors (see issue #50) -""" -from stable_baselines import PPO1 - - -if __name__ == '__main__': - model = PPO1('MlpPolicy', 'CartPole-v1', schedule='linear', verbose=0) - model.learn(total_timesteps=1000) diff --git a/stable_baselines/ppo1/pposgd_simple.py b/stable_baselines/ppo1/pposgd_simple.py deleted file mode 100644 index b570df26..00000000 --- a/stable_baselines/ppo1/pposgd_simple.py +++ /dev/null @@ -1,370 +0,0 @@ -import time -from collections import deque - -import gym -import numpy as np -import tensorflow as tf -from mpi4py import MPI - -from stable_baselines.common import Dataset, explained_variance, fmt_row, zipsame, ActorCriticRLModel, SetVerbosity, \ - TensorboardWriter -from stable_baselines import logger -import stable_baselines.common.tf_util as tf_util -from 
stable_baselines.common.tf_util import total_episode_reward_logger -from stable_baselines.common.policies import ActorCriticPolicy -from stable_baselines.common.mpi_adam import MpiAdam -from stable_baselines.common.mpi_moments import mpi_moments -from stable_baselines.common.misc_util import flatten_lists -from stable_baselines.common.runners import traj_segment_generator -from stable_baselines.trpo_mpi.utils import add_vtarg_and_adv - - -class PPO1(ActorCriticRLModel): - """ - Proximal Policy Optimization algorithm (MPI version). - Paper: https://arxiv.org/abs/1707.06347 - - :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) - :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...) - :param timesteps_per_actorbatch: (int) timesteps per actor per update - :param clip_param: (float) clipping parameter epsilon - :param entcoeff: (float) the entropy loss weight - :param optim_epochs: (float) the optimizer's number of epochs - :param optim_stepsize: (float) the optimizer's stepsize - :param optim_batchsize: (int) the optimizer's the batch size - :param gamma: (float) discount factor - :param lam: (float) advantage estimation - :param adam_epsilon: (float) the epsilon value for the adam optimizer - :param schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', - 'double_linear_con', 'middle_drop' or 'double_middle_drop') - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) - :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance - :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param full_tensorboard_log: (bool) enable additional logging when using tensorboard - WARNING: this logging can take a lot of space quickly - 
:param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). - If None (default), use random seed. Note that if you want completely deterministic - results, you must set `n_cpu_tf_sess` to 1. - :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations - If None, the number of cpu of the current machine will be used. - """ - def __init__(self, policy, env, gamma=0.99, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, - optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, lam=0.95, adam_epsilon=1e-5, - schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True, - policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1): - - super().__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False, - _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs, - seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) - - self.gamma = gamma - self.timesteps_per_actorbatch = timesteps_per_actorbatch - self.clip_param = clip_param - self.entcoeff = entcoeff - self.optim_epochs = optim_epochs - self.optim_stepsize = optim_stepsize - self.optim_batchsize = optim_batchsize - self.lam = lam - self.adam_epsilon = adam_epsilon - self.schedule = schedule - self.tensorboard_log = tensorboard_log - self.full_tensorboard_log = full_tensorboard_log - - self.graph = None - self.sess = None - self.policy_pi = None - self.loss_names = None - self.lossandgrad = None - self.adam = None - self.assign_old_eq_new = None - self.compute_losses = None - self.params = None - self.step = None - self.proba_step = None - self.initial_state = None - self.summary = None - - if _init_setup_model: - self.setup_model() - - def _get_pretrain_placeholders(self): - policy = self.policy_pi - action_ph = policy.pdtype.sample_placeholder([None]) - if isinstance(self.action_space, gym.spaces.Discrete): - return policy.obs_ph, action_ph, policy.policy - return policy.obs_ph, action_ph, policy.deterministic_action - - def 
setup_model(self): - with SetVerbosity(self.verbose): - - self.graph = tf.Graph() - with self.graph.as_default(): - self.set_random_seed(self.seed) - self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) - - # Construct network for new policy - self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, - None, reuse=False, **self.policy_kwargs) - - # Network for old policy - with tf.variable_scope("oldpi", reuse=False): - old_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, - None, reuse=False, **self.policy_kwargs) - - with tf.variable_scope("loss", reuse=False): - # Target advantage function (if applicable) - atarg = tf.placeholder(dtype=tf.float32, shape=[None]) - - # Empirical return - ret = tf.placeholder(dtype=tf.float32, shape=[None]) - - # learning rate multiplier, updated with schedule - lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) - - # Annealed cliping parameter epislon - clip_param = self.clip_param * lrmult - - obs_ph = self.policy_pi.obs_ph - action_ph = self.policy_pi.pdtype.sample_placeholder([None]) - - kloldnew = old_pi.proba_distribution.kl(self.policy_pi.proba_distribution) - ent = self.policy_pi.proba_distribution.entropy() - meankl = tf.reduce_mean(kloldnew) - meanent = tf.reduce_mean(ent) - pol_entpen = (-self.entcoeff) * meanent - - # pnew / pold - ratio = tf.exp(self.policy_pi.proba_distribution.logp(action_ph) - - old_pi.proba_distribution.logp(action_ph)) - - # surrogate from conservative policy iteration - surr1 = ratio * atarg - surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg - - # PPO's pessimistic surrogate (L^CLIP) - pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) - vf_loss = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret)) - total_loss = pol_surr + pol_entpen + vf_loss - losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] - self.loss_names = ["pol_surr", 
"pol_entpen", "vf_loss", "kl", "ent"] - - tf.summary.scalar('entropy_loss', pol_entpen) - tf.summary.scalar('policy_gradient_loss', pol_surr) - tf.summary.scalar('value_function_loss', vf_loss) - tf.summary.scalar('approximate_kullback-leibler', meankl) - tf.summary.scalar('clip_factor', clip_param) - tf.summary.scalar('loss', total_loss) - - self.params = tf_util.get_trainable_vars("model") - - self.assign_old_eq_new = tf_util.function( - [], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in - zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model"))]) - - with tf.variable_scope("Adam_mpi", reuse=False): - self.adam = MpiAdam(self.params, epsilon=self.adam_epsilon, sess=self.sess) - - with tf.variable_scope("input_info", reuse=False): - tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) - tf.summary.scalar('learning_rate', tf.reduce_mean(self.optim_stepsize)) - tf.summary.scalar('advantage', tf.reduce_mean(atarg)) - tf.summary.scalar('clip_range', tf.reduce_mean(self.clip_param)) - - if self.full_tensorboard_log: - tf.summary.histogram('discounted_rewards', ret) - tf.summary.histogram('learning_rate', self.optim_stepsize) - tf.summary.histogram('advantage', atarg) - tf.summary.histogram('clip_range', self.clip_param) - if tf_util.is_image(self.observation_space): - tf.summary.image('observation', obs_ph) - else: - tf.summary.histogram('observation', obs_ph) - - self.step = self.policy_pi.step - self.proba_step = self.policy_pi.proba_step - self.initial_state = self.policy_pi.initial_state - - tf_util.initialize(sess=self.sess) - - self.summary = tf.summary.merge_all() - - self.lossandgrad = tf_util.function([obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], - [self.summary, tf_util.flatgrad(total_loss, self.params)] + losses) - self.compute_losses = tf_util.function([obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], - losses) - - def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="PPO1", - 
reset_num_timesteps=True): - - new_tb_log = self._init_num_timesteps(reset_num_timesteps) - callback = self._init_callback(callback) - - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ - as writer: - self._setup_learn() - - assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \ - "an instance of common.policies.ActorCriticPolicy." - - with self.sess.as_default(): - self.adam.sync() - callback.on_training_start(locals(), globals()) - - # Prepare for rollouts - seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_actorbatch, - callback=callback) - - episodes_so_far = 0 - timesteps_so_far = 0 - iters_so_far = 0 - t_start = time.time() - - # rolling buffer for episode lengths - len_buffer = deque(maxlen=100) - # rolling buffer for episode rewards - reward_buffer = deque(maxlen=100) - - while True: - if timesteps_so_far >= total_timesteps: - break - - if self.schedule == 'constant': - cur_lrmult = 1.0 - elif self.schedule == 'linear': - cur_lrmult = max(1.0 - float(timesteps_so_far) / total_timesteps, 0) - else: - raise NotImplementedError - - logger.log("********** Iteration %i ************" % iters_so_far) - - seg = seg_gen.__next__() - - # Stop training early (triggered by the callback) - if not seg.get('continue_training', True): # pytype: disable=attribute-error - break - - add_vtarg_and_adv(seg, self.gamma, self.lam) - - # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) - observations, actions = seg["observations"], seg["actions"] - atarg, tdlamret = seg["adv"], seg["tdlamret"] - - # true_rew is the reward without discount - if writer is not None: - total_episode_reward_logger(self.episode_reward, - seg["true_rewards"].reshape((self.n_envs, -1)), - seg["dones"].reshape((self.n_envs, -1)), - writer, self.num_timesteps) - - # predicted value function before udpate - vpredbefore = seg["vpred"] 
- - # standardized advantage function estimate - atarg = (atarg - atarg.mean()) / atarg.std() - dataset = Dataset(dict(ob=observations, ac=actions, atarg=atarg, vtarg=tdlamret), - shuffle=not self.policy.recurrent) - optim_batchsize = self.optim_batchsize or observations.shape[0] - - # set old parameter values to new parameter values - self.assign_old_eq_new(sess=self.sess) - logger.log("Optimizing...") - logger.log(fmt_row(13, self.loss_names)) - - # Here we do a bunch of optimization epochs over the data - for k in range(self.optim_epochs): - # list of tuples, each of which gives the loss for a minibatch - losses = [] - for i, batch in enumerate(dataset.iterate_once(optim_batchsize)): - steps = (self.num_timesteps + - k * optim_batchsize + - int(i * (optim_batchsize / len(dataset.data_map)))) - if writer is not None: - # run loss backprop with summary, but once every 10 runs save the metadata - # (memory, compute time, ...) - if self.full_tensorboard_log and (1 + k) % 10 == 0: - run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - run_metadata = tf.RunMetadata() - summary, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"], - batch["atarg"], batch["vtarg"], - cur_lrmult, sess=self.sess, - options=run_options, - run_metadata=run_metadata) - writer.add_run_metadata(run_metadata, 'step%d' % steps) - else: - summary, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"], - batch["atarg"], batch["vtarg"], - cur_lrmult, sess=self.sess) - writer.add_summary(summary, steps) - else: - _, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"], - batch["atarg"], batch["vtarg"], cur_lrmult, - sess=self.sess) - - self.adam.update(grad, self.optim_stepsize * cur_lrmult) - losses.append(newlosses) - logger.log(fmt_row(13, np.mean(losses, axis=0))) - - logger.log("Evaluating losses...") - losses = [] - for batch in dataset.iterate_once(optim_batchsize): - newlosses = self.compute_losses(batch["ob"], 
batch["ob"], batch["ac"], batch["atarg"], - batch["vtarg"], cur_lrmult, sess=self.sess) - losses.append(newlosses) - mean_losses, _, _ = mpi_moments(losses, axis=0) - logger.log(fmt_row(13, mean_losses)) - for (loss_val, name) in zipsame(mean_losses, self.loss_names): - logger.record_tabular("loss_" + name, loss_val) - logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) - - # local values - lrlocal = (seg["ep_lens"], seg["ep_rets"]) - - # list of tuples - listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) - lens, rews = map(flatten_lists, zip(*listoflrpairs)) - len_buffer.extend(lens) - reward_buffer.extend(rews) - if len(len_buffer) > 0: - logger.record_tabular("EpLenMean", np.mean(len_buffer)) - logger.record_tabular("EpRewMean", np.mean(reward_buffer)) - logger.record_tabular("EpThisIter", len(lens)) - episodes_so_far += len(lens) - current_it_timesteps = MPI.COMM_WORLD.allreduce(seg["total_timestep"]) - timesteps_so_far += current_it_timesteps - self.num_timesteps += current_it_timesteps - iters_so_far += 1 - logger.record_tabular("EpisodesSoFar", episodes_so_far) - logger.record_tabular("TimestepsSoFar", self.num_timesteps) - logger.record_tabular("TimeElapsed", time.time() - t_start) - if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0: - logger.dump_tabular() - callback.on_training_end() - return self - - def save(self, save_path, cloudpickle=False): - data = { - "gamma": self.gamma, - "timesteps_per_actorbatch": self.timesteps_per_actorbatch, - "clip_param": self.clip_param, - "entcoeff": self.entcoeff, - "optim_epochs": self.optim_epochs, - "optim_stepsize": self.optim_stepsize, - "optim_batchsize": self.optim_batchsize, - "lam": self.lam, - "adam_epsilon": self.adam_epsilon, - "schedule": self.schedule, - "verbose": self.verbose, - "policy": self.policy, - "observation_space": self.observation_space, - "action_space": self.action_space, - "n_envs": self.n_envs, - "n_cpu_tf_sess": self.n_cpu_tf_sess, - "seed": self.seed, 
- "_vectorize_action": self._vectorize_action, - "policy_kwargs": self.policy_kwargs - } - - params_to_save = self.get_parameters() - - self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle) diff --git a/stable_baselines/ppo1/run_atari.py b/stable_baselines/ppo1/run_atari.py deleted file mode 100644 index 692f9678..00000000 --- a/stable_baselines/ppo1/run_atari.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -import os - -from mpi4py import MPI - -from stable_baselines.common import set_global_seeds -from stable_baselines import bench, logger, PPO1 -from stable_baselines.common.atari_wrappers import make_atari, wrap_deepmind -from stable_baselines.common.cmd_util import atari_arg_parser -from stable_baselines.common.policies import CnnPolicy - - -def train(env_id, num_timesteps, seed): - """ - Train PPO1 model for Atari environments, for testing purposes - - :param env_id: (str) Environment ID - :param num_timesteps: (int) The total number of samples - :param seed: (int) The initial seed for training - """ - rank = MPI.COMM_WORLD.Get_rank() - - if rank == 0: - logger.configure() - else: - logger.configure(format_strs=[]) - workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() - set_global_seeds(workerseed) - env = make_atari(env_id) - - env = bench.Monitor(env, logger.get_dir() and - os.path.join(logger.get_dir(), str(rank))) - env.seed(workerseed) - - env = wrap_deepmind(env) - env.seed(workerseed) - - model = PPO1(CnnPolicy, env, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4, - optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2) - model.learn(total_timesteps=num_timesteps) - env.close() - del env - - -def main(): - """ - Runs the test - """ - args = atari_arg_parser().parse_args() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - - -if __name__ == '__main__': - main() diff --git a/stable_baselines/ppo1/run_mujoco.py 
b/stable_baselines/ppo1/run_mujoco.py deleted file mode 100644 index 77bfd065..00000000 --- a/stable_baselines/ppo1/run_mujoco.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env python3 - -from stable_baselines.ppo1 import PPO1 -from stable_baselines.common.policies import MlpPolicy -from stable_baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser -from stable_baselines import logger - - -def train(env_id, num_timesteps, seed): - """ - Train PPO1 model for the Mujoco environment, for testing purposes - - :param env_id: (str) Environment ID - :param num_timesteps: (int) The total number of samples - :param seed: (int) The initial seed for training - """ - env = make_mujoco_env(env_id, seed) - model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, - optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear') - model.learn(total_timesteps=num_timesteps) - env.close() - - -def main(): - """ - Runs the test - """ - args = mujoco_arg_parser().parse_args() - logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - - -if __name__ == '__main__': - main() diff --git a/stable_baselines/ppo1/run_robotics.py b/stable_baselines/ppo1/run_robotics.py deleted file mode 100644 index aeab9aaa..00000000 --- a/stable_baselines/ppo1/run_robotics.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 - -from mpi4py import MPI -import mujoco_py # pytype:disable=import-error - -from stable_baselines.common import set_global_seeds -from stable_baselines.common.policies import MlpPolicy -from stable_baselines.common.cmd_util import make_robotics_env, robotics_arg_parser -from stable_baselines.ppo1 import PPO1 - - -def train(env_id, num_timesteps, seed): - """ - Train PPO1 model for Robotics environment, for testing purposes - - :param env_id: (str) Environment ID - :param num_timesteps: (int) The total number of samples - :param seed: (int) The initial seed for training 
- """ - - rank = MPI.COMM_WORLD.Get_rank() - with mujoco_py.ignore_mujoco_warnings(): - workerseed = seed + 10000 * rank - set_global_seeds(workerseed) - env = make_robotics_env(env_id, workerseed, rank=rank) - - model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=5, - optim_stepsize=3e-4, optim_batchsize=256, gamma=0.99, lam=0.95, schedule='linear') - model.learn(total_timesteps=num_timesteps) - env.close() - - -def main(): - """ - Runs the test - """ - args = robotics_arg_parser().parse_args() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - - -if __name__ == '__main__': - main() diff --git a/stable_baselines/ppo2/__init__.py b/stable_baselines/ppo2/__init__.py deleted file mode 100644 index 6eb9f827..00000000 --- a/stable_baselines/ppo2/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from stable_baselines.ppo2.ppo2 import PPO2 diff --git a/stable_baselines/ppo2/ppo2.py b/stable_baselines/ppo2/ppo2.py deleted file mode 100644 index 2f533db1..00000000 --- a/stable_baselines/ppo2/ppo2.py +++ /dev/null @@ -1,541 +0,0 @@ -import time - -import gym -import numpy as np -import tensorflow as tf - -from stable_baselines import logger -from stable_baselines.common import explained_variance, ActorCriticRLModel, tf_util, SetVerbosity, TensorboardWriter -from stable_baselines.common.runners import AbstractEnvRunner -from stable_baselines.common.policies import ActorCriticPolicy, RecurrentActorCriticPolicy -from stable_baselines.common.schedules import get_schedule_fn -from stable_baselines.common.tf_util import total_episode_reward_logger -from stable_baselines.common.math_util import safe_mean - - -class PPO2(ActorCriticRLModel): - """ - Proximal Policy Optimization algorithm (GPU version). - Paper: https://arxiv.org/abs/1707.06347 - - :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...) 
- :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) - :param gamma: (float) Discount factor - :param n_steps: (int) The number of steps to run for each environment per update - (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel) - :param ent_coef: (float) Entropy coefficient for the loss calculation - :param learning_rate: (float or callable) The learning rate, it can be a function - :param vf_coef: (float) Value function coefficient for the loss calculation - :param max_grad_norm: (float) The maximum value for the gradient clipping - :param lam: (float) Factor for trade-off of bias vs variance for Generalized Advantage Estimator - :param nminibatches: (int) Number of training minibatches per update. For recurrent policies, - the number of environments run in parallel should be a multiple of nminibatches. - :param noptepochs: (int) Number of epoch when optimizing the surrogate - :param cliprange: (float or callable) Clipping parameter, it can be a function - :param cliprange_vf: (float or callable) Clipping parameter for the value function, it can be a function. - This is a parameter specific to the OpenAI implementation. If None is passed (default), - then `cliprange` (that is used for the policy) will be used. - IMPORTANT: this clipping depends on the reward scaling. - To deactivate value function clipping (and recover the original PPO implementation), - you have to pass a negative value (e.g. -1). 
- :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) - :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance - :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param full_tensorboard_log: (bool) enable additional logging when using tensorboard - WARNING: this logging can take a lot of space quickly - :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). - If None (default), use random seed. Note that if you want completely deterministic - results, you must set `n_cpu_tf_sess` to 1. - :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations - If None, the number of cpu of the current machine will be used. - """ - def __init__(self, policy, env, gamma=0.99, n_steps=128, ent_coef=0.01, learning_rate=2.5e-4, vf_coef=0.5, - max_grad_norm=0.5, lam=0.95, nminibatches=4, noptepochs=4, cliprange=0.2, cliprange_vf=None, - verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, - full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None): - - self.learning_rate = learning_rate - self.cliprange = cliprange - self.cliprange_vf = cliprange_vf - self.n_steps = n_steps - self.ent_coef = ent_coef - self.vf_coef = vf_coef - self.max_grad_norm = max_grad_norm - self.gamma = gamma - self.lam = lam - self.nminibatches = nminibatches - self.noptepochs = noptepochs - self.tensorboard_log = tensorboard_log - self.full_tensorboard_log = full_tensorboard_log - - self.action_ph = None - self.advs_ph = None - self.rewards_ph = None - self.old_neglog_pac_ph = None - self.old_vpred_ph = None - self.learning_rate_ph = None - self.clip_range_ph = None - self.entropy = None - self.vf_loss = None - self.pg_loss = None - self.approxkl = None - self.clipfrac = None - self._train = None - self.loss_names = None - 
self.train_model = None - self.act_model = None - self.value = None - self.n_batch = None - self.summary = None - - super().__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True, - _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs, - seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) - - if _init_setup_model: - self.setup_model() - - def _make_runner(self): - return Runner(env=self.env, model=self, n_steps=self.n_steps, - gamma=self.gamma, lam=self.lam) - - def _get_pretrain_placeholders(self): - policy = self.act_model - if isinstance(self.action_space, gym.spaces.Discrete): - return policy.obs_ph, self.action_ph, policy.policy - return policy.obs_ph, self.action_ph, policy.deterministic_action - - def setup_model(self): - with SetVerbosity(self.verbose): - - assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO2 model must be " \ - "an instance of common.policies.ActorCriticPolicy." - - self.n_batch = self.n_envs * self.n_steps - - self.graph = tf.Graph() - with self.graph.as_default(): - self.set_random_seed(self.seed) - self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) - - n_batch_step = None - n_batch_train = None - if issubclass(self.policy, RecurrentActorCriticPolicy): - assert self.n_envs % self.nminibatches == 0, "For recurrent policies, "\ - "the number of environments run in parallel should be a multiple of nminibatches." 
- n_batch_step = self.n_envs - n_batch_train = self.n_batch // self.nminibatches - - act_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, - n_batch_step, reuse=False, **self.policy_kwargs) - with tf.variable_scope("train_model", reuse=True, - custom_getter=tf_util.outer_scope_getter("train_model")): - train_model = self.policy(self.sess, self.observation_space, self.action_space, - self.n_envs // self.nminibatches, self.n_steps, n_batch_train, - reuse=True, **self.policy_kwargs) - - with tf.variable_scope("loss", reuse=False): - self.action_ph = train_model.pdtype.sample_placeholder([None], name="action_ph") - self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph") - self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph") - self.old_neglog_pac_ph = tf.placeholder(tf.float32, [None], name="old_neglog_pac_ph") - self.old_vpred_ph = tf.placeholder(tf.float32, [None], name="old_vpred_ph") - self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") - self.clip_range_ph = tf.placeholder(tf.float32, [], name="clip_range_ph") - - neglogpac = train_model.proba_distribution.neglogp(self.action_ph) - self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy()) - - vpred = train_model.value_flat - - # Value function clipping: not present in the original PPO - if self.cliprange_vf is None: - # Default behavior (legacy from OpenAI baselines): - # use the same clipping as for the policy - self.clip_range_vf_ph = self.clip_range_ph - self.cliprange_vf = self.cliprange - elif isinstance(self.cliprange_vf, (float, int)) and self.cliprange_vf < 0: - # Original PPO implementation: no value function clipping - self.clip_range_vf_ph = None - else: - # Last possible behavior: clipping range - # specific to the value function - self.clip_range_vf_ph = tf.placeholder(tf.float32, [], name="clip_range_vf_ph") - - if self.clip_range_vf_ph is None: - # No clipping - vpred_clipped = 
train_model.value_flat - else: - # Clip the different between old and new value - # NOTE: this depends on the reward scaling - vpred_clipped = self.old_vpred_ph + \ - tf.clip_by_value(train_model.value_flat - self.old_vpred_ph, - - self.clip_range_vf_ph, self.clip_range_vf_ph) - - vf_losses1 = tf.square(vpred - self.rewards_ph) - vf_losses2 = tf.square(vpred_clipped - self.rewards_ph) - self.vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) - - ratio = tf.exp(self.old_neglog_pac_ph - neglogpac) - pg_losses = -self.advs_ph * ratio - pg_losses2 = -self.advs_ph * tf.clip_by_value(ratio, 1.0 - self.clip_range_ph, 1.0 + - self.clip_range_ph) - self.pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) - self.approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - self.old_neglog_pac_ph)) - self.clipfrac = tf.reduce_mean(tf.cast(tf.greater(tf.abs(ratio - 1.0), - self.clip_range_ph), tf.float32)) - loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef - - tf.summary.scalar('entropy_loss', self.entropy) - tf.summary.scalar('policy_gradient_loss', self.pg_loss) - tf.summary.scalar('value_function_loss', self.vf_loss) - tf.summary.scalar('approximate_kullback-leibler', self.approxkl) - tf.summary.scalar('clip_factor', self.clipfrac) - tf.summary.scalar('loss', loss) - - with tf.variable_scope('model'): - self.params = tf.trainable_variables() - if self.full_tensorboard_log: - for var in self.params: - tf.summary.histogram(var.name, var) - grads = tf.gradients(loss, self.params) - if self.max_grad_norm is not None: - grads, _grad_norm = tf.clip_by_global_norm(grads, self.max_grad_norm) - grads = list(zip(grads, self.params)) - trainer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph, epsilon=1e-5) - self._train = trainer.apply_gradients(grads) - - self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] - - with tf.variable_scope("input_info", reuse=False): - 
tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) - tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) - tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) - tf.summary.scalar('clip_range', tf.reduce_mean(self.clip_range_ph)) - if self.clip_range_vf_ph is not None: - tf.summary.scalar('clip_range_vf', tf.reduce_mean(self.clip_range_vf_ph)) - - tf.summary.scalar('old_neglog_action_probability', tf.reduce_mean(self.old_neglog_pac_ph)) - tf.summary.scalar('old_value_pred', tf.reduce_mean(self.old_vpred_ph)) - - if self.full_tensorboard_log: - tf.summary.histogram('discounted_rewards', self.rewards_ph) - tf.summary.histogram('learning_rate', self.learning_rate_ph) - tf.summary.histogram('advantage', self.advs_ph) - tf.summary.histogram('clip_range', self.clip_range_ph) - tf.summary.histogram('old_neglog_action_probability', self.old_neglog_pac_ph) - tf.summary.histogram('old_value_pred', self.old_vpred_ph) - if tf_util.is_image(self.observation_space): - tf.summary.image('observation', train_model.obs_ph) - else: - tf.summary.histogram('observation', train_model.obs_ph) - - self.train_model = train_model - self.act_model = act_model - self.step = act_model.step - self.proba_step = act_model.proba_step - self.value = act_model.value - self.initial_state = act_model.initial_state - tf.global_variables_initializer().run(session=self.sess) # pylint: disable=E1101 - - self.summary = tf.summary.merge_all() - - def _train_step(self, learning_rate, cliprange, obs, returns, masks, actions, values, neglogpacs, update, - writer, states=None, cliprange_vf=None): - """ - Training of PPO2 Algorithm - - :param learning_rate: (float) learning rate - :param cliprange: (float) Clipping factor - :param obs: (np.ndarray) The current observation of the environment - :param returns: (np.ndarray) the rewards - :param masks: (np.ndarray) The last masks for done episodes (used in recurent policies) - :param actions: (np.ndarray) the 
actions - :param values: (np.ndarray) the values - :param neglogpacs: (np.ndarray) Negative Log-likelihood probability of Actions - :param update: (int) the current step iteration - :param writer: (TensorFlow Summary.writer) the writer for tensorboard - :param states: (np.ndarray) For recurrent policies, the internal state of the recurrent model - :return: policy gradient loss, value function loss, policy entropy, - approximation of kl divergence, updated clipping range, training update operation - :param cliprange_vf: (float) Clipping factor for the value function - """ - advs = returns - values - advs = (advs - advs.mean()) / (advs.std() + 1e-8) - td_map = {self.train_model.obs_ph: obs, self.action_ph: actions, - self.advs_ph: advs, self.rewards_ph: returns, - self.learning_rate_ph: learning_rate, self.clip_range_ph: cliprange, - self.old_neglog_pac_ph: neglogpacs, self.old_vpred_ph: values} - if states is not None: - td_map[self.train_model.states_ph] = states - td_map[self.train_model.dones_ph] = masks - - if cliprange_vf is not None and cliprange_vf >= 0: - td_map[self.clip_range_vf_ph] = cliprange_vf - - if states is None: - update_fac = max(self.n_batch // self.nminibatches // self.noptepochs, 1) - else: - update_fac = max(self.n_batch // self.nminibatches // self.noptepochs // self.n_steps, 1) - - if writer is not None: - # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...) 
- if self.full_tensorboard_log and (1 + update) % 10 == 0: - run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - run_metadata = tf.RunMetadata() - summary, policy_loss, value_loss, policy_entropy, approxkl, clipfrac, _ = self.sess.run( - [self.summary, self.pg_loss, self.vf_loss, self.entropy, self.approxkl, self.clipfrac, self._train], - td_map, options=run_options, run_metadata=run_metadata) - writer.add_run_metadata(run_metadata, 'step%d' % (update * update_fac)) - else: - summary, policy_loss, value_loss, policy_entropy, approxkl, clipfrac, _ = self.sess.run( - [self.summary, self.pg_loss, self.vf_loss, self.entropy, self.approxkl, self.clipfrac, self._train], - td_map) - writer.add_summary(summary, (update * update_fac)) - else: - policy_loss, value_loss, policy_entropy, approxkl, clipfrac, _ = self.sess.run( - [self.pg_loss, self.vf_loss, self.entropy, self.approxkl, self.clipfrac, self._train], td_map) - - return policy_loss, value_loss, policy_entropy, approxkl, clipfrac - - def set_env(self,env): - super().set_env(env) - self.n_batch = self.n_envs * self.n_steps - - def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2", - reset_num_timesteps=True): - # Transform to callable if needed - self.learning_rate = get_schedule_fn(self.learning_rate) - self.cliprange = get_schedule_fn(self.cliprange) - cliprange_vf = get_schedule_fn(self.cliprange_vf) - - new_tb_log = self._init_num_timesteps(reset_num_timesteps) - callback = self._init_callback(callback) - - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ - as writer: - self._setup_learn() - - t_first_start = time.time() - n_updates = total_timesteps // self.n_batch - - callback.on_training_start(locals(), globals()) - - for update in range(1, n_updates + 1): - assert self.n_batch % self.nminibatches == 0, ("The number of minibatches (`nminibatches`) " - "is not a factor of the total number of samples " - 
"collected per rollout (`n_batch`), " - "some samples won't be used." - ) - batch_size = self.n_batch // self.nminibatches - t_start = time.time() - frac = 1.0 - (update - 1.0) / n_updates - lr_now = self.learning_rate(frac) - cliprange_now = self.cliprange(frac) - cliprange_vf_now = cliprange_vf(frac) - - callback.on_rollout_start() - # true_reward is the reward without discount - rollout = self.runner.run(callback) - # Unpack - obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout - - callback.on_rollout_end() - - # Early stopping due to the callback - if not self.runner.continue_training: - break - - self.ep_info_buf.extend(ep_infos) - mb_loss_vals = [] - if states is None: # nonrecurrent version - update_fac = max(self.n_batch // self.nminibatches // self.noptepochs, 1) - inds = np.arange(self.n_batch) - for epoch_num in range(self.noptepochs): - np.random.shuffle(inds) - for start in range(0, self.n_batch, batch_size): - timestep = self.num_timesteps // update_fac + ((epoch_num * - self.n_batch + start) // batch_size) - end = start + batch_size - mbinds = inds[start:end] - slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) - mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, writer=writer, - update=timestep, cliprange_vf=cliprange_vf_now)) - else: # recurrent version - update_fac = max(self.n_batch // self.nminibatches // self.noptepochs // self.n_steps, 1) - assert self.n_envs % self.nminibatches == 0 - env_indices = np.arange(self.n_envs) - flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps) - envs_per_batch = batch_size // self.n_steps - for epoch_num in range(self.noptepochs): - np.random.shuffle(env_indices) - for start in range(0, self.n_envs, envs_per_batch): - timestep = self.num_timesteps // update_fac + ((epoch_num * - self.n_envs + start) // envs_per_batch) - end = start + envs_per_batch - mb_env_inds = env_indices[start:end] - 
mb_flat_inds = flat_indices[mb_env_inds].ravel() - slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) - mb_states = states[mb_env_inds] - mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, update=timestep, - writer=writer, states=mb_states, - cliprange_vf=cliprange_vf_now)) - - loss_vals = np.mean(mb_loss_vals, axis=0) - t_now = time.time() - fps = int(self.n_batch / (t_now - t_start)) - - if writer is not None: - total_episode_reward_logger(self.episode_reward, - true_reward.reshape((self.n_envs, self.n_steps)), - masks.reshape((self.n_envs, self.n_steps)), - writer, self.num_timesteps) - - if self.verbose >= 1 and (update % log_interval == 0 or update == 1): - explained_var = explained_variance(values, returns) - logger.logkv("serial_timesteps", update * self.n_steps) - logger.logkv("n_updates", update) - logger.logkv("total_timesteps", self.num_timesteps) - logger.logkv("fps", fps) - logger.logkv("explained_variance", float(explained_var)) - if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: - logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) - logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) - logger.logkv('time_elapsed', t_start - t_first_start) - for (loss_val, loss_name) in zip(loss_vals, self.loss_names): - logger.logkv(loss_name, loss_val) - logger.dumpkvs() - - callback.on_training_end() - return self - - def save(self, save_path, cloudpickle=False): - data = { - "gamma": self.gamma, - "n_steps": self.n_steps, - "vf_coef": self.vf_coef, - "ent_coef": self.ent_coef, - "max_grad_norm": self.max_grad_norm, - "learning_rate": self.learning_rate, - "lam": self.lam, - "nminibatches": self.nminibatches, - "noptepochs": self.noptepochs, - "cliprange": self.cliprange, - "cliprange_vf": self.cliprange_vf, - "verbose": self.verbose, - "policy": self.policy, - "observation_space": self.observation_space, - "action_space": 
self.action_space, - "n_envs": self.n_envs, - "n_cpu_tf_sess": self.n_cpu_tf_sess, - "seed": self.seed, - "_vectorize_action": self._vectorize_action, - "policy_kwargs": self.policy_kwargs - } - - params_to_save = self.get_parameters() - - self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle) - - -class Runner(AbstractEnvRunner): - def __init__(self, *, env, model, n_steps, gamma, lam): - """ - A runner to learn the policy of an environment for a model - - :param env: (Gym environment) The environment to learn from - :param model: (Model) The model to learn - :param n_steps: (int) The number of steps to run for each environment - :param gamma: (float) Discount factor - :param lam: (float) Factor for trade-off of bias vs variance for Generalized Advantage Estimator - """ - super().__init__(env=env, model=model, n_steps=n_steps) - self.lam = lam - self.gamma = gamma - - def _run(self): - """ - Run a learning step of the model - - :return: - - observations: (np.ndarray) the observations - - rewards: (np.ndarray) the rewards - - masks: (numpy bool) whether an episode is over or not - - actions: (np.ndarray) the actions - - values: (np.ndarray) the value function output - - negative log probabilities: (np.ndarray) - - states: (np.ndarray) the internal states of the recurrent policies - - infos: (dict) the extra information of the model - """ - # mb stands for minibatch - mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], [] - mb_states = self.states - ep_infos = [] - for _ in range(self.n_steps): - actions, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones) # pytype: disable=attribute-error - mb_obs.append(self.obs.copy()) - mb_actions.append(actions) - mb_values.append(values) - mb_neglogpacs.append(neglogpacs) - mb_dones.append(self.dones) - clipped_actions = actions - # Clip the actions to avoid out of bound error - if isinstance(self.env.action_space, 
gym.spaces.Box): - clipped_actions = np.clip(actions, self.env.action_space.low, self.env.action_space.high) - self.obs[:], rewards, self.dones, infos = self.env.step(clipped_actions) - - self.model.num_timesteps += self.n_envs - - if self.callback is not None: - # Abort training early - self.callback.update_locals(locals()) - if self.callback.on_step() is False: - self.continue_training = False - # Return dummy values - return [None] * 9 - - for info in infos: - maybe_ep_info = info.get('episode') - if maybe_ep_info is not None: - ep_infos.append(maybe_ep_info) - mb_rewards.append(rewards) - # batch of steps to batch of rollouts - mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype) - mb_rewards = np.asarray(mb_rewards, dtype=np.float32) - mb_actions = np.asarray(mb_actions) - mb_values = np.asarray(mb_values, dtype=np.float32) - mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32) - mb_dones = np.asarray(mb_dones, dtype=np.bool) - last_values = self.model.value(self.obs, self.states, self.dones) # pytype: disable=attribute-error - # discount/bootstrap off value fn - mb_advs = np.zeros_like(mb_rewards) - true_reward = np.copy(mb_rewards) - last_gae_lam = 0 - for step in reversed(range(self.n_steps)): - if step == self.n_steps - 1: - nextnonterminal = 1.0 - self.dones - nextvalues = last_values - else: - nextnonterminal = 1.0 - mb_dones[step + 1] - nextvalues = mb_values[step + 1] - delta = mb_rewards[step] + self.gamma * nextvalues * nextnonterminal - mb_values[step] - mb_advs[step] = last_gae_lam = delta + self.gamma * self.lam * nextnonterminal * last_gae_lam - mb_returns = mb_advs + mb_values - - mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs, true_reward = \ - map(swap_and_flatten, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs, true_reward)) - - return mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs, mb_states, ep_infos, true_reward - - -# obs, returns, masks, actions, values, neglogpacs, states 
= runner.run() -def swap_and_flatten(arr): - """ - swap and then flatten axes 0 and 1 - - :param arr: (np.ndarray) - :return: (np.ndarray) - """ - shape = arr.shape - return arr.swapaxes(0, 1).reshape(shape[0] * shape[1], *shape[2:]) diff --git a/stable_baselines/ppo2/run_atari.py b/stable_baselines/ppo2/run_atari.py deleted file mode 100644 index bc026aca..00000000 --- a/stable_baselines/ppo2/run_atari.py +++ /dev/null @@ -1,48 +0,0 @@ -from stable_baselines import PPO2, logger -from stable_baselines.common.cmd_util import make_atari_env, atari_arg_parser -from stable_baselines.common.vec_env import VecFrameStack -from stable_baselines.common.policies import CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy, MlpPolicy - - -def train(env_id, num_timesteps, seed, policy, - n_envs=8, nminibatches=4, n_steps=128): - """ - Train PPO2 model for atari environment, for testing purposes - - :param env_id: (str) the environment id string - :param num_timesteps: (int) the number of timesteps to run - :param seed: (int) Used to seed the random generator. - :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) - :param n_envs: (int) Number of parallel environments - :param nminibatches: (int) Number of training minibatches per update. For recurrent policies, - the number of environments run in parallel should be a multiple of nminibatches. - :param n_steps: (int) The number of steps to run for each environment per update - (i.e. 
batch size is n_steps * n_env where n_env is number of environment copies running in parallel) - """ - - env = VecFrameStack(make_atari_env(env_id, n_envs, seed), 4) - policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy, 'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy] - model = PPO2(policy=policy, env=env, n_steps=n_steps, nminibatches=nminibatches, - lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01, - learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1, verbose=1) - model.learn(total_timesteps=num_timesteps) - - env.close() - # Free memory - del model - - -def main(): - """ - Runs the test - """ - parser = atari_arg_parser() - parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm', 'mlp'], default='cnn') - args = parser.parse_args() - logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy) - - -if __name__ == '__main__': - main() diff --git a/stable_baselines/ppo2/run_mujoco.py b/stable_baselines/ppo2/run_mujoco.py deleted file mode 100644 index a99efbca..00000000 --- a/stable_baselines/ppo2/run_mujoco.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -import numpy as np -import gym - -from stable_baselines.common.cmd_util import mujoco_arg_parser -from stable_baselines import bench, logger -from stable_baselines.common import set_global_seeds -from stable_baselines.common.vec_env.vec_normalize import VecNormalize -from stable_baselines.ppo2 import PPO2 -from stable_baselines.common.policies import MlpPolicy -from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv - - -def train(env_id, num_timesteps, seed): - """ - Train PPO2 model for Mujoco environment, for testing purposes - - :param env_id: (str) the environment id string - :param num_timesteps: (int) the number of timesteps to run - :param seed: (int) Used to seed the random generator. 
- """ - def make_env(): - env_out = gym.make(env_id) - env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) - return env_out - - env = DummyVecEnv([make_env]) - env = VecNormalize(env) - - set_global_seeds(seed) - policy = MlpPolicy - model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32, lam=0.95, gamma=0.99, noptepochs=10, - ent_coef=0.0, learning_rate=3e-4, cliprange=0.2) - model.learn(total_timesteps=num_timesteps) - - return model, env - - -def main(): - """ - Runs the test - """ - args = mujoco_arg_parser().parse_args() - logger.configure() - model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - - if args.play: - logger.log("Running trained model") - obs = np.zeros((env.num_envs,) + env.observation_space.shape) - obs[:] = env.reset() - while True: - actions = model.step(obs)[0] - obs[:] = env.step(actions)[0] - env.render() - - -if __name__ == '__main__': - main() diff --git a/stable_baselines/py.typed b/stable_baselines/py.typed deleted file mode 100644 index e69de29b..00000000 diff --git a/stable_baselines/results_plotter.py b/stable_baselines/results_plotter.py deleted file mode 100644 index 5631f7a6..00000000 --- a/stable_baselines/results_plotter.py +++ /dev/null @@ -1,146 +0,0 @@ -import numpy as np -import matplotlib -import matplotlib.pyplot as plt - -from stable_baselines.bench.monitor import load_results - -# matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode -plt.rcParams['svg.fonttype'] = 'none' - -X_TIMESTEPS = 'timesteps' -X_EPISODES = 'episodes' -X_WALLTIME = 'walltime_hrs' -POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] -EPISODES_WINDOW = 100 -COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', - 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', - 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] - - -def rolling_window(array, window): - """ - apply a rolling 
window to a np.ndarray - - :param array: (np.ndarray) the input Array - :param window: (int) length of the rolling window - :return: (np.ndarray) rolling window on the input array - """ - shape = array.shape[:-1] + (array.shape[-1] - window + 1, window) - strides = array.strides + (array.strides[-1],) - return np.lib.stride_tricks.as_strided(array, shape=shape, strides=strides) - - -def window_func(var_1, var_2, window, func): - """ - apply a function to the rolling window of 2 arrays - - :param var_1: (np.ndarray) variable 1 - :param var_2: (np.ndarray) variable 2 - :param window: (int) length of the rolling window - :param func: (numpy function) function to apply on the rolling window on variable 2 (such as np.mean) - :return: (np.ndarray, np.ndarray) the rolling output with applied function - """ - var_2_window = rolling_window(var_2, window) - function_on_var2 = func(var_2_window, axis=-1) - return var_1[window - 1:], function_on_var2 - - -def ts2xy(timesteps, xaxis): - """ - Decompose a timesteps variable to x ans ys - - :param timesteps: (Pandas DataFrame) the input data - :param xaxis: (str) the axis for the x and y output - (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') - :return: (np.ndarray, np.ndarray) the x and y output - """ - if xaxis == X_TIMESTEPS: - x_var = np.cumsum(timesteps.l.values) - y_var = timesteps.r.values - elif xaxis == X_EPISODES: - x_var = np.arange(len(timesteps)) - y_var = timesteps.r.values - elif xaxis == X_WALLTIME: - x_var = timesteps.t.values / 3600. 
- y_var = timesteps.r.values - else: - raise NotImplementedError - return x_var, y_var - - -def plot_curves(xy_list, xaxis, title): - """ - plot the curves - - :param xy_list: ([(np.ndarray, np.ndarray)]) the x and y coordinates to plot - :param xaxis: (str) the axis for the x and y output - (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') - :param title: (str) the title of the plot - """ - - plt.figure(figsize=(8, 2)) - maxx = max(xy[0][-1] for xy in xy_list) - minx = 0 - for (i, (x, y)) in enumerate(xy_list): - color = COLORS[i] - plt.scatter(x, y, s=2) - # Do not plot the smoothed curve at all if the timeseries is shorter than window size. - if x.shape[0] >= EPISODES_WINDOW: - # Compute and plot rolling mean with window of size EPISODE_WINDOW - x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) - plt.plot(x, y_mean, color=color) - plt.xlim(minx, maxx) - plt.title(title) - plt.xlabel(xaxis) - plt.ylabel("Episode Rewards") - plt.tight_layout() - - -def plot_results(dirs, num_timesteps, xaxis, task_name): - """ - plot the results - - :param dirs: ([str]) the save location of the results to plot - :param num_timesteps: (int or None) only plot the points below this value - :param xaxis: (str) the axis for the x and y output - (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') - :param task_name: (str) the title of the task to plot - """ - - tslist = [] - for folder in dirs: - timesteps = load_results(folder) - if num_timesteps is not None: - timesteps = timesteps[timesteps.l.cumsum() <= num_timesteps] - tslist.append(timesteps) - xy_list = [ts2xy(timesteps_item, xaxis) for timesteps_item in tslist] - plot_curves(xy_list, xaxis, task_name) - - -def main(): - """ - Example usage in jupyter-notebook - - .. 
code-block:: python - - from stable_baselines import results_plotter - %matplotlib inline - results_plotter.plot_results(["./log"], 10e6, results_plotter.X_TIMESTEPS, "Breakout") - - Here ./log is a directory containing the monitor.csv files - """ - import argparse - import os - parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--dirs', help='List of log directories', nargs='*', default=['./log']) - parser.add_argument('--num_timesteps', type=int, default=int(10e6)) - parser.add_argument('--xaxis', help='Varible on X-axis', default=X_TIMESTEPS) - parser.add_argument('--task_name', help='Title of plot', default='Breakout') - args = parser.parse_args() - args.dirs = [os.path.abspath(folder) for folder in args.dirs] - plot_results(args.dirs, args.num_timesteps, args.xaxis, args.task_name) - plt.show() - - -if __name__ == '__main__': - main() diff --git a/stable_baselines/sac/__init__.py b/stable_baselines/sac/__init__.py deleted file mode 100644 index e727402f..00000000 --- a/stable_baselines/sac/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from stable_baselines.sac.sac import SAC -from stable_baselines.sac.policies import MlpPolicy, CnnPolicy, LnMlpPolicy, LnCnnPolicy diff --git a/stable_baselines/sac/policies.py b/stable_baselines/sac/policies.py deleted file mode 100644 index 7cc61e7d..00000000 --- a/stable_baselines/sac/policies.py +++ /dev/null @@ -1,351 +0,0 @@ -import tensorflow as tf -import numpy as np -from gym.spaces import Box - -from stable_baselines.common.policies import BasePolicy, nature_cnn, register_policy -from stable_baselines.common.tf_layers import mlp - -EPS = 1e-6 # Avoid NaN (prevents division by zero or log of zero) -# CAP the standard deviation of the actor -LOG_STD_MAX = 2 -LOG_STD_MIN = -20 - - -def gaussian_likelihood(input_, mu_, log_std): - """ - Helper to computer log likelihood of a gaussian. - Here we assume this is a Diagonal Gaussian. 
- - :param input_: (tf.Tensor) - :param mu_: (tf.Tensor) - :param log_std: (tf.Tensor) - :return: (tf.Tensor) - """ - pre_sum = -0.5 * (((input_ - mu_) / (tf.exp(log_std) + EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi)) - return tf.reduce_sum(pre_sum, axis=1) - - -def gaussian_entropy(log_std): - """ - Compute the entropy for a diagonal Gaussian distribution. - - :param log_std: (tf.Tensor) Log of the standard deviation - :return: (tf.Tensor) - """ - return tf.reduce_sum(log_std + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1) - - -def clip_but_pass_gradient(input_, lower=-1., upper=1.): - clip_up = tf.cast(input_ > upper, tf.float32) - clip_low = tf.cast(input_ < lower, tf.float32) - return input_ + tf.stop_gradient((upper - input_) * clip_up + (lower - input_) * clip_low) - - -def apply_squashing_func(mu_, pi_, logp_pi): - """ - Squash the output of the Gaussian distribution - and account for that in the log probability - The squashed mean is also returned for using - deterministic actions. - - :param mu_: (tf.Tensor) Mean of the gaussian - :param pi_: (tf.Tensor) Output of the policy before squashing - :param logp_pi: (tf.Tensor) Log probability before squashing - :return: ([tf.Tensor]) - """ - # Squash the output - deterministic_policy = tf.tanh(mu_) - policy = tf.tanh(pi_) - # OpenAI Variation: - # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range. 
- # logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - policy ** 2, lower=0, upper=1) + EPS), axis=1) - # Squash correction (from original implementation) - logp_pi -= tf.reduce_sum(tf.log(1 - policy ** 2 + EPS), axis=1) - return deterministic_policy, policy, logp_pi - - -class SACPolicy(BasePolicy): - """ - Policy object that implements a SAC-like actor critic - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param scale: (bool) whether or not to scale the input - """ - - def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, scale=False): - super(SACPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale) - assert isinstance(ac_space, Box), "Error: the action space must be of type gym.spaces.Box" - - self.qf1 = None - self.qf2 = None - self.value_fn = None - self.policy = None - self.deterministic_policy = None - self.act_mu = None - self.std = None - - def make_actor(self, obs=None, reuse=False, scope="pi"): - """ - Creates an actor object - - :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder) - :param reuse: (bool) whether or not to reuse parameters - :param scope: (str) the scope name of the actor - :return: (TensorFlow Tensor) the output tensor - """ - raise NotImplementedError - - def make_critics(self, obs=None, action=None, reuse=False, - scope="values_fn", create_vf=True, create_qf=True): - """ - Creates the two Q-Values approximator along with the Value function - - :param obs: (TensorFlow Tensor) The observation 
placeholder (can be None for default placeholder) - :param action: (TensorFlow Tensor) The action placeholder - :param reuse: (bool) whether or not to reuse parameters - :param scope: (str) the scope name - :param create_vf: (bool) Whether to create Value fn or not - :param create_qf: (bool) Whether to create Q-Values fn or not - :return: ([tf.Tensor]) Mean, action and log probability - """ - raise NotImplementedError - - def step(self, obs, state=None, mask=None, deterministic=False): - """ - Returns the policy for a single step - - :param obs: ([float] or [int]) The current observation of the environment - :param state: ([float]) The last states (used in recurrent policies) - :param mask: ([float]) The last masks (used in recurrent policies) - :param deterministic: (bool) Whether or not to return deterministic actions. - :return: ([float]) actions - """ - raise NotImplementedError - - def proba_step(self, obs, state=None, mask=None): - """ - Returns the action probability params (mean, std) for a single step - - :param obs: ([float] or [int]) The current observation of the environment - :param state: ([float]) The last states (used in recurrent policies) - :param mask: ([float]) The last masks (used in recurrent policies) - :return: ([float], [float]) - """ - raise NotImplementedError - - -class FeedForwardPolicy(SACPolicy): - """ - Policy object that implements a DDPG-like actor critic, using a feed forward neural network. 
- - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param layers: ([int]) The size of the Neural network for the policy (if None, default to [64, 64]) - :param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction - :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp") - :param layer_norm: (bool) enable layer normalisation - :param reg_weight: (float) Regularization loss weight for the policy parameters - :param act_fun: (tf.func) the activation function to use in the neural network. - :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, layers=None, - cnn_extractor=nature_cnn, feature_extraction="cnn", reg_weight=0.0, - layer_norm=False, act_fun=tf.nn.relu, **kwargs): - super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, - reuse=reuse, scale=(feature_extraction == "cnn")) - - self._kwargs_check(feature_extraction, kwargs) - self.layer_norm = layer_norm - self.feature_extraction = feature_extraction - self.cnn_kwargs = kwargs - self.cnn_extractor = cnn_extractor - self.reuse = reuse - if layers is None: - layers = [64, 64] - self.layers = layers - self.reg_loss = None - self.reg_weight = reg_weight - self.entropy = None - - assert len(layers) >= 1, "Error: must have at least one hidden layer for the policy." 
- - self.activ_fn = act_fun - - def make_actor(self, obs=None, reuse=False, scope="pi"): - if obs is None: - obs = self.processed_obs - - with tf.variable_scope(scope, reuse=reuse): - if self.feature_extraction == "cnn": - pi_h = self.cnn_extractor(obs, **self.cnn_kwargs) - else: - pi_h = tf.layers.flatten(obs) - - pi_h = mlp(pi_h, self.layers, self.activ_fn, layer_norm=self.layer_norm) - - self.act_mu = mu_ = tf.layers.dense(pi_h, self.ac_space.shape[0], activation=None) - # Important difference with SAC and other algo such as PPO: - # the std depends on the state, so we cannot use stable_baselines.common.distribution - log_std = tf.layers.dense(pi_h, self.ac_space.shape[0], activation=None) - - # Regularize policy output (not used for now) - # reg_loss = self.reg_weight * 0.5 * tf.reduce_mean(log_std ** 2) - # reg_loss += self.reg_weight * 0.5 * tf.reduce_mean(mu ** 2) - # self.reg_loss = reg_loss - - # OpenAI Variation to cap the standard deviation - # activation = tf.tanh # for log_std - # log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) - # Original Implementation - log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX) - - self.std = std = tf.exp(log_std) - # Reparameterization trick - pi_ = mu_ + tf.random_normal(tf.shape(mu_)) * std - logp_pi = gaussian_likelihood(pi_, mu_, log_std) - self.entropy = gaussian_entropy(log_std) - # MISSING: reg params for log and mu - # Apply squashing and account for it in the probability - deterministic_policy, policy, logp_pi = apply_squashing_func(mu_, pi_, logp_pi) - self.policy = policy - self.deterministic_policy = deterministic_policy - - return deterministic_policy, policy, logp_pi - - def make_critics(self, obs=None, action=None, reuse=False, scope="values_fn", - create_vf=True, create_qf=True): - if obs is None: - obs = self.processed_obs - - with tf.variable_scope(scope, reuse=reuse): - if self.feature_extraction == "cnn": - critics_h = self.cnn_extractor(obs, **self.cnn_kwargs) - 
else: - critics_h = tf.layers.flatten(obs) - - if create_vf: - # Value function - with tf.variable_scope('vf', reuse=reuse): - vf_h = mlp(critics_h, self.layers, self.activ_fn, layer_norm=self.layer_norm) - value_fn = tf.layers.dense(vf_h, 1, name="vf") - self.value_fn = value_fn - - if create_qf: - # Concatenate preprocessed state and action - qf_h = tf.concat([critics_h, action], axis=-1) - - # Double Q values to reduce overestimation - with tf.variable_scope('qf1', reuse=reuse): - qf1_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm) - qf1 = tf.layers.dense(qf1_h, 1, name="qf1") - - with tf.variable_scope('qf2', reuse=reuse): - qf2_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm) - qf2 = tf.layers.dense(qf2_h, 1, name="qf2") - - self.qf1 = qf1 - self.qf2 = qf2 - - return self.qf1, self.qf2, self.value_fn - - def step(self, obs, state=None, mask=None, deterministic=False): - if deterministic: - return self.sess.run(self.deterministic_policy, {self.obs_ph: obs}) - return self.sess.run(self.policy, {self.obs_ph: obs}) - - def proba_step(self, obs, state=None, mask=None): - return self.sess.run([self.act_mu, self.std], {self.obs_ph: obs}) - - -class CnnPolicy(FeedForwardPolicy): - """ - Policy object that implements actor critic, using a CNN (the nature CNN) - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, **_kwargs): - 
super(CnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, - feature_extraction="cnn", **_kwargs) - - -class LnCnnPolicy(FeedForwardPolicy): - """ - Policy object that implements actor critic, using a CNN (the nature CNN), with layer normalisation - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, **_kwargs): - super(LnCnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, - feature_extraction="cnn", layer_norm=True, **_kwargs) - - -class MlpPolicy(FeedForwardPolicy): - """ - Policy object that implements actor critic, using a MLP (2 layers of 64) - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, **_kwargs): - super(MlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, - feature_extraction="mlp", 
**_kwargs) - - -class LnMlpPolicy(FeedForwardPolicy): - """ - Policy object that implements actor critic, using a MLP (2 layers of 64), with layer normalisation - - :param sess: (TensorFlow session) The current TensorFlow session - :param ob_space: (Gym Space) The observation space of the environment - :param ac_space: (Gym Space) The action space of the environment - :param n_env: (int) The number of environments to run - :param n_steps: (int) The number of steps to run for each environment - :param n_batch: (int) The number of batch to run (n_envs * n_steps) - :param reuse: (bool) If the policy is reusable or not - :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction - """ - - def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, **_kwargs): - super(LnMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse, - feature_extraction="mlp", layer_norm=True, **_kwargs) - - -register_policy("CnnPolicy", CnnPolicy) -register_policy("LnCnnPolicy", LnCnnPolicy) -register_policy("MlpPolicy", MlpPolicy) -register_policy("LnMlpPolicy", LnMlpPolicy) diff --git a/stable_baselines/sac/sac.py b/stable_baselines/sac/sac.py deleted file mode 100644 index f466af76..00000000 --- a/stable_baselines/sac/sac.py +++ /dev/null @@ -1,574 +0,0 @@ -import time -import warnings - -import numpy as np -import tensorflow as tf - -from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter -from stable_baselines.common.vec_env import VecEnv -from stable_baselines.common.math_util import safe_mean, unscale_action, scale_action -from stable_baselines.common.schedules import get_schedule_fn -from stable_baselines.common.buffers import ReplayBuffer -from stable_baselines.sac.policies import SACPolicy -from stable_baselines import logger - - -class SAC(OffPolicyRLModel): - """ - Soft Actor-Critic (SAC) - Off-Policy Maximum Entropy Deep Reinforcement Learning with a 
Stochastic Actor, - This implementation borrows code from original implementation (https://github.com/haarnoja/sac) - from OpenAI Spinning Up (https://github.com/openai/spinningup) and from the Softlearning repo - (https://github.com/rail-berkeley/softlearning/) - Paper: https://arxiv.org/abs/1801.01290 - Introduction to SAC: https://spinningup.openai.com/en/latest/algorithms/sac.html - - :param policy: (SACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...) - :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) - :param gamma: (float) the discount factor - :param learning_rate: (float or callable) learning rate for adam optimizer, - the same learning rate will be used for all networks (Q-Values, Actor and Value function) - it can be a function of the current progress (from 1 to 0) - :param buffer_size: (int) size of the replay buffer - :param batch_size: (int) Minibatch size for each gradient update - :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1) - :param ent_coef: (str or float) Entropy regularization coefficient. (Equivalent to - inverse of reward scale in the original SAC paper.) Controlling exploration/exploitation trade-off. - Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value) - :param train_freq: (int) Update the model every `train_freq` steps. - :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts - :param target_update_interval: (int) update the target network every `target_network_update_freq` steps. - :param gradient_steps: (int) How many gradient update after each step - :param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto') - :param action_noise: (ActionNoise) the action noise type (None by default), this can help - for hard exploration problem. Cf DDPG for the different action noise type. 
- :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy) - This is not needed for SAC normally but can help exploring when using HER + SAC. - This hack was present in the original OpenAI Baselines repo (DDPG + HER) - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) - :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance - :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param full_tensorboard_log: (bool) enable additional logging when using tensorboard - Note: this has no effect on SAC logging for now - :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). - If None (default), use random seed. Note that if you want completely deterministic - results, you must set `n_cpu_tf_sess` to 1. - :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations - If None, the number of cpu of the current machine will be used. 
    """

    def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=50000,
                 learning_starts=100, train_freq=1, batch_size=64,
                 tau=0.005, ent_coef='auto', target_update_interval=1,
                 gradient_steps=1, target_entropy='auto', action_noise=None,
                 random_exploration=0.0, verbose=0, tensorboard_log=None,
                 _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False,
                 seed=None, n_cpu_tf_sess=None):

        super(SAC, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose,
                                  policy_base=SACPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs,
                                  seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.tau = tau
        # In the original paper, same learning rate is used for all networks
        # self.policy_lr = learning_rate
        # self.qf_lr = learning_rate
        # self.vf_lr = learning_rate
        # Entropy coefficient / Entropy temperature
        # Inverse of the reward scale
        self.ent_coef = ent_coef
        self.target_update_interval = target_update_interval
        self.gradient_steps = gradient_steps
        self.gamma = gamma
        self.action_noise = action_noise
        self.random_exploration = random_exploration

        # The attributes below are filled in by setup_model(); they are kept
        # as None here so the object is introspectable before the TF graph exists.
        self.value_fn = None
        self.graph = None
        self.replay_buffer = None
        self.sess = None
        self.tensorboard_log = tensorboard_log
        self.verbose = verbose
        self.params = None
        self.summary = None
        self.policy_tf = None
        self.target_entropy = target_entropy
        self.full_tensorboard_log = full_tensorboard_log

        self.obs_target = None
        self.target_policy = None
        self.actions_ph = None
        self.rewards_ph = None
        self.terminals_ph = None
        self.observations_ph = None
        self.action_target = None
        self.next_observations_ph = None
        self.value_target = None
        self.step_ops = None
        self.target_update_op = None
        self.infos_names = None
        self.entropy = None
        self.target_params = None
        self.learning_rate_ph = None
        self.processed_obs_ph = None
        self.processed_next_obs_ph = None
        self.log_ent_coef = None

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        """
        Return (observation placeholder, action placeholder, deterministic action op)
        used by the behavior-cloning pretraining helper of the base class.
        """
        policy = self.policy_tf
        # Rescale
        deterministic_action = unscale_action(self.action_space, self.deterministic_action)
        return policy.obs_ph, self.actions_ph, deterministic_action

    def setup_model(self):
        """
        Build the full TensorFlow graph for SAC: policy and target networks,
        the double-Q and value critics, all losses, the optimizers, the polyak
        target-update ops, and the tensorboard summaries.

        NOTE: variable-scope names ("input", "model", "target", "loss",
        "model/pi", "model/values_fn", "target/values_fn") are load-bearing:
        they are used below to collect trainable variables for the optimizers
        and for saving/loading, so they must not be changed.
        """
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy = self.policy(self.sess, self.observation_space, self.action_space,
                                                     **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    # first return value corresponds to deterministic actions
                    # policy_out corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of actions taken by the policy
                    self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(self.processed_obs_ph)
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(self.policy_tf.entropy)
                    # Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1, qf2, value_fn = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph,
                                                                     create_qf=True, create_vf=True)
                    qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(self.processed_obs_ph,
                                                                    policy_out, create_qf=True, create_vf=False,
                                                                    reuse=True)

                    # Target entropy is used when learning the entropy coefficient
                    if self.target_entropy == 'auto':
                        # automatically set target entropy if needed
                        self.target_entropy = -np.prod(self.action_space.shape).astype(np.float32)
                    else:
                        # Force conversion
                        # this will also throw an error for unexpected string
                        self.target_entropy = float(self.target_entropy)

                    # The entropy coefficient or entropy can be learned automatically
                    # see Automating Entropy Adjustment for Maximum Entropy RL section
                    # of https://arxiv.org/abs/1812.05905
                    if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'):
                        # Default initial value of ent_coef when learned
                        init_value = 1.0
                        if '_' in self.ent_coef:
                            # e.g. ent_coef='auto_0.1' sets the initial value to 0.1
                            init_value = float(self.ent_coef.split('_')[1])
                            assert init_value > 0., "The initial value of ent_coef must be greater than 0"

                        self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32,
                                                            initializer=np.log(init_value).astype(np.float32))
                        self.ent_coef = tf.exp(self.log_ent_coef)
                    else:
                        # Force conversion to float
                        # this will throw an error if a malformed string (different from 'auto')
                        # is passed
                        self.ent_coef = float(self.ent_coef)

                with tf.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target = self.target_policy.make_critics(self.processed_next_obs_ph,
                                                                         create_qf=False, create_vf=True)
                    self.value_target = value_target

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                    # Target for Q value regression
                    q_backup = tf.stop_gradient(
                        self.rewards_ph +
                        (1 - self.terminals_ph) * self.gamma * self.value_target
                    )

                    # Compute Q-Function loss
                    # TODO: test with huber loss (it would avoid too high values)
                    qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1) ** 2)
                    qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2) ** 2)

                    # Compute the entropy temperature loss
                    # it is used when the entropy coefficient is learned
                    ent_coef_loss, entropy_optimizer = None, None
                    if not isinstance(self.ent_coef, float):
                        ent_coef_loss = -tf.reduce_mean(
                            self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy))
                        entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the Gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss

                    # Target for value fn regression
                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.
                    v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi)
                    value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup) ** 2)

                    values_losses = qf1_loss + qf2_loss + value_loss

                    # Policy train op
                    # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                    policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(policy_loss, var_list=tf_util.get_trainable_vars('model/pi'))

                    # Value train op
                    value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    values_params = tf_util.get_trainable_vars('model/values_fn')

                    source_params = tf_util.get_trainable_vars("model/values_fn")
                    target_params = tf_util.get_trainable_vars("target/values_fn")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.assign(target, (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                    # and we first need to compute the policy action before computing q values losses
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(values_losses, var_list=values_params)

                        self.infos_names = ['policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy']
                        # All ops to call during one training step
                        self.step_ops = [policy_loss, qf1_loss, qf2_loss,
                                         value_loss, qf1, qf2, value_fn, logp_pi,
                                         self.entropy, policy_train_op, train_values_op]

                        # Add entropy coefficient optimization operation if needed
                        if ent_coef_loss is not None:
                            with tf.control_dependencies([train_values_op]):
                                ent_coef_op = entropy_optimizer.minimize(ent_coef_loss, var_list=self.log_ent_coef)
                                self.infos_names += ['ent_coef_loss', 'ent_coef']
                                self.step_ops += [ent_coef_op, ent_coef_loss, self.ent_coef]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('value_loss', value_loss)
                    tf.summary.scalar('entropy', self.entropy)
                    if ent_coef_loss is not None:
                        tf.summary.scalar('ent_coef_loss', ent_coef_loss)
                        tf.summary.scalar('ent_coef', self.ent_coef)

                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars("target/values_fn")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()

    def _train_step(self, step, writer, learning_rate):
        """
        Run one gradient step on a minibatch sampled from the replay buffer.

        :param step: (int) current timestep, used as the tensorboard x-axis
        :param writer: (tf.summary.FileWriter or None) if not None, summaries are logged
        :param learning_rate: (float) learning rate fed to all optimizers for this step
        :return: (tuple) (policy_loss, qf1_loss, qf2_loss, value_loss, entropy)
            plus (ent_coef_loss, ent_coef) when the entropy coefficient is learned
        """
        # Sample a batch from the replay buffer
        batch = self.replay_buffer.sample(self.batch_size, env=self._vec_normalize_env)
        batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch

        feed_dict = {
            self.observations_ph: batch_obs,
            self.actions_ph: batch_actions,
            self.next_observations_ph: batch_next_obs,
            self.rewards_ph: batch_rewards.reshape(self.batch_size, -1),
            self.terminals_ph: batch_dones.reshape(self.batch_size, -1),
            self.learning_rate_ph: learning_rate
        }

        # out  = [policy_loss, qf1_loss, qf2_loss,
        #         value_loss, qf1, qf2, value_fn, logp_pi,
        #         self.entropy, policy_train_op, train_values_op]

        # Do one gradient step
        # and optionally compute log for tensorboard
        if writer is not None:
            out = self.sess.run([self.summary] + self.step_ops, feed_dict)
            summary = out.pop(0)
            writer.add_summary(summary, step)
        else:
            out = self.sess.run(self.step_ops, feed_dict)

        # Unpack to monitor losses and entropy
        policy_loss, qf1_loss, qf2_loss, value_loss, *values = out
        # qf1, qf2, value_fn, logp_pi, entropy, *_ = values
        entropy = values[4]

        if self.log_ent_coef is not None:
            # ent_coef_op appended (ent_coef_loss, ent_coef) at the end of step_ops
            ent_coef_loss, ent_coef = values[-2:]
            return policy_loss, qf1_loss, qf2_loss, value_loss, entropy, ent_coef_loss, ent_coef

        return policy_loss, qf1_loss, qf2_loss, value_loss, entropy

    def learn(self, total_timesteps, callback=None,
              log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None):
        """
        Train the SAC model: collect transitions from the environment into the
        replay buffer and, every `train_freq` steps, perform `gradient_steps`
        updates of the policy, critics and (via polyak averaging) the target network.

        :param total_timesteps: (int) number of environment steps to run
        :param callback: (BaseCallback or callable or None) called at every step
        :param log_interval: (int or None) log every `log_interval` episodes (None disables logging)
        :param tb_log_name: (str) run name for tensorboard
        :param reset_num_timesteps: (bool) whether to reset the timestep counter
        :param replay_wrapper: (callable or None) optional wrapper applied to the replay buffer (e.g. HER)
        :return: (SAC) the trained model (self)
        """
        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            # Retrieve unnormalized observation for saving into the buffer
            if self._vec_normalize_env is not None:
                obs_ = self._vec_normalize_env.get_original_obs().squeeze()

            n_updates = 0
            infos_values = []

            callback.on_training_start(locals(), globals())
            callback.on_rollout_start()

            for step in range(total_timesteps):
                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration:
                    # actions sampled from action space are from range specific to the environment
                    # but algorithm operates on tanh-squashed actions therefore simple scaling is used
                    unscaled_action = self.env.action_space.sample()
                    action = scale_action(self.action_space, unscaled_action)
                else:
                    action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # inferred actions need to be transformed to environment action_space before stepping
                    unscaled_action = unscale_action(self.action_space, action)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(unscaled_action)

                self.num_timesteps += 1

                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                callback.update_locals(locals())
                if callback.on_step() is False:
                    break

                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs().squeeze()
                    reward_ = self._vec_normalize_env.get_original_reward().squeeze()
                else:
                    # Avoid changing the original ones
                    obs_, new_obs_, reward_ = obs, new_obs, reward

                # Store transition in the replay buffer.
                self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info)
                obs = new_obs
                # Save the unnormalized observation
                if self._vec_normalize_env is not None:
                    obs_ = new_obs_

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    self.ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward_]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    tf_util.total_episode_reward_logger(self.episode_reward, ep_reward,
                                                        ep_done, writer, self.num_timesteps)

                if self.num_timesteps % self.train_freq == 0:
                    callback.on_rollout_end()

                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                                or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step + grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                    callback.on_rollout_start()

                episode_rewards[-1] += reward_
                if done:
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                # subtract 1 as we appended a new term just now
                num_episodes = len(episode_rewards) - 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and num_episodes % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                        logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                        logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate", np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            callback.on_training_end()
            return self

    def action_probability(self, observation, state=None, mask=None, actions=None, logp=False):
        """
        SAC cannot expose action probabilities: the Gaussian policy is squashed
        by a tanh, so there is no closed-form distribution to return. Raises
        ValueError when `actions` is given, otherwise warns and returns None.
        """
        if actions is not None:
            raise ValueError("Error: SAC does not have action probabilities.")

        warnings.warn("Even though SAC has a Gaussian policy, it cannot return a distribution as it "
                      "is squashed by a tanh before being scaled and outputed.")

        return None

    def predict(self, observation, state=None, mask=None, deterministic=True):
        """
        Get the model's action for an observation.

        :param observation: (np.ndarray) the input observation (single or batched)
        :param state: unused (kept for API compatibility with recurrent policies)
        :param mask: unused (kept for API compatibility with recurrent policies)
        :param deterministic: (bool) whether to use the deterministic policy output
        :return: (np.ndarray, None) the action(s) in environment scale, and None (no recurrent state)
        """
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(observation, self.observation_space)

        observation = observation.reshape((-1,) + self.observation_space.shape)
        actions = self.policy_tf.step(observation, deterministic=deterministic)
        actions = actions.reshape((-1,) + self.action_space.shape)  # reshape to the correct action shape
        actions = unscale_action(self.action_space, actions)  # scale the output for the prediction

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    def get_parameter_list(self):
        """Return the list of all TF variables to save: model and target-network parameters."""
        return (self.params +
                self.target_params)

    def save(self, save_path, cloudpickle=False):
        """
        Save model hyperparameters and network weights to `save_path`.

        :param save_path: (str or file-like) where to store the model
        :param cloudpickle: (bool) whether to use cloudpickle for serialization
        """
        data = {
            "learning_rate": self.learning_rate,
            "buffer_size": self.buffer_size,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "batch_size": self.batch_size,
            "tau": self.tau,
            # a learned (tensor) ent_coef cannot be pickled, so fall back to 'auto'
            "ent_coef": self.ent_coef if isinstance(self.ent_coef, float) else 'auto',
            "target_entropy": self.target_entropy,
            # Should we also store the replay buffer?
            # this may lead to high memory usage
            # with all transition inside
            # "replay_buffer": self.replay_buffer
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "n_cpu_tf_sess": self.n_cpu_tf_sess,
            "seed": self.seed,
            "action_noise": self.action_noise,
            "random_exploration": self.random_exploration,
            "_vectorize_action": self._vectorize_action,
            "policy_kwargs": self.policy_kwargs
        }

        params_to_save = self.get_parameters()

        self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)


# --- stable_baselines/td3/__init__.py (separate file in the original diff) ---
from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines.td3.td3 import TD3
from stable_baselines.td3.policies import MlpPolicy, CnnPolicy, LnMlpPolicy, LnCnnPolicy
import tensorflow as tf
from gym.spaces import Box

from stable_baselines.common.policies import BasePolicy, nature_cnn, register_policy
from stable_baselines.common.tf_layers import mlp


class TD3Policy(BasePolicy):
    """
    Policy object that implements a TD3-like actor critic

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param scale: (bool) whether or not to scale the input
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, scale=False):
        super(TD3Policy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale)
        assert isinstance(ac_space, Box), "Error: the action space must be of type gym.spaces.Box"

        # Filled in by make_critics() / make_actor() in concrete subclasses.
        self.qf1 = None
        self.qf2 = None
        self.policy = None

    def make_actor(self, obs=None, reuse=False, scope="pi"):
        """
        Creates an actor object

        :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder)
        :param reuse: (bool) whether or not to reuse parameters
        :param scope: (str) the scope name of the actor
        :return: (TensorFlow Tensor) the output tensor
        """
        raise NotImplementedError

    def make_critics(self, obs=None, action=None, reuse=False,
                     scope="qvalues_fn"):
        """
        Creates the two Q-Values approximator

        NOTE(review): the default scope here ("qvalues_fn") differs from the
        default used by the concrete FeedForwardPolicy implementation
        ("values_fn") -- confirm which name callers rely on before unifying.

        :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder)
        :param action: (TensorFlow Tensor) The action placeholder
        :param reuse: (bool) whether or not to reuse parameters
        :param scope: (str) the scope name
        :return: ([tf.Tensor]) the output tensors of the two Q-value networks (qf1, qf2)
        """
        raise NotImplementedError

    def step(self, obs, state=None, mask=None):
        """
        Returns the policy for a single step

        :param obs: ([float] or [int]) The current observation of the environment
        :param state: ([float]) The last states (used in recurrent policies)
        :param mask: ([float]) The last masks (used in recurrent policies)
        :return: ([float]) actions
        """
        raise NotImplementedError

    def proba_step(self, obs, state=None, mask=None):
        """
        Returns the policy for a single step

        TD3 has a deterministic policy, so this simply delegates to step().

        :param obs: ([float] or [int]) The current observation of the environment
        :param state: ([float]) The last states (used in recurrent policies)
        :param mask: ([float]) The last masks (used in recurrent policies)
        :return: ([float]) actions
        """
        return self.step(obs, state, mask)


class FeedForwardPolicy(TD3Policy):
    """
    Policy object that implements a DDPG-like actor critic, using a feed forward neural network.

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param layers: ([int]) The size of the Neural network for the policy (if None, default to [64, 64])
    :param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction
    :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp")
    :param layer_norm: (bool) enable layer normalisation
    :param act_fun: (tf.func) the activation function to use in the neural network.
    :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, layers=None,
                 cnn_extractor=nature_cnn, feature_extraction="cnn",
                 layer_norm=False, act_fun=tf.nn.relu, **kwargs):
        super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                                reuse=reuse, scale=(feature_extraction == "cnn"))

        self._kwargs_check(feature_extraction, kwargs)
        self.layer_norm = layer_norm
        self.feature_extraction = feature_extraction
        self.cnn_kwargs = kwargs
        self.cnn_extractor = cnn_extractor
        self.reuse = reuse
        if layers is None:
            layers = [64, 64]
        self.layers = layers

        assert len(layers) >= 1, "Error: must have at least one hidden layer for the policy."

        self.activ_fn = act_fun

    def make_actor(self, obs=None, reuse=False, scope="pi"):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                pi_h = tf.layers.flatten(obs)

            pi_h = mlp(pi_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)

            # tanh keeps the deterministic action in [-1, 1]; callers rescale to env bounds
            self.policy = policy = tf.layers.dense(pi_h, self.ac_space.shape[0], activation=tf.tanh)

        return policy

    def make_critics(self, obs=None, action=None, reuse=False, scope="values_fn"):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                critics_h = tf.layers.flatten(obs)

            # Concatenate preprocessed state and action
            qf_h = tf.concat([critics_h, action], axis=-1)

            # Double Q values to reduce overestimation
            with tf.variable_scope('qf1', reuse=reuse):
                qf1_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                qf1 = tf.layers.dense(qf1_h, 1, name="qf1")

            with tf.variable_scope('qf2', reuse=reuse):
                qf2_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                qf2 = tf.layers.dense(qf2_h, 1, name="qf2")

            self.qf1 = qf1
            self.qf2 = qf2

        return self.qf1, self.qf2

    def step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy, {self.obs_ph: obs})


class CnnPolicy(FeedForwardPolicy):
    """
    Policy object that implements actor critic, using a CNN (the nature CNN)

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, **_kwargs):
        super(CnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                        feature_extraction="cnn", **_kwargs)


class LnCnnPolicy(FeedForwardPolicy):
    """
    Policy object that implements actor critic, using a CNN (the nature CNN), with layer normalisation

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, **_kwargs):
        super(LnCnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                          feature_extraction="cnn", layer_norm=True, **_kwargs)


class MlpPolicy(FeedForwardPolicy):
    """
    Policy object that implements actor critic, using a MLP (2 layers of 64)

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, **_kwargs):
        super(MlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                        feature_extraction="mlp", **_kwargs)


class LnMlpPolicy(FeedForwardPolicy):
    """
    Policy object that implements actor critic, using a MLP (2 layers of 64), with layer normalisation

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, **_kwargs):
        super(LnMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                          feature_extraction="mlp", layer_norm=True, **_kwargs)


register_policy("CnnPolicy", CnnPolicy)
register_policy("LnCnnPolicy", LnCnnPolicy)
register_policy("MlpPolicy", MlpPolicy)
register_policy("LnMlpPolicy", LnMlpPolicy)


# --- stable_baselines/td3/td3.py (separate file in the original diff; the TD3
# class itself is truncated past the end of this chunk and is left untouched) ---
import time
import warnings

import numpy as np
import tensorflow as tf

from stable_baselines import logger
from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter
from stable_baselines.common.vec_env import VecEnv
from stable_baselines.common.math_util import safe_mean, unscale_action, scale_action
from stable_baselines.common.schedules import get_schedule_fn
from stable_baselines.common.buffers import ReplayBuffer
from stable_baselines.td3.policies import TD3Policy
- :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) - :param gamma: (float) the discount factor - :param learning_rate: (float or callable) learning rate for adam optimizer, - the same learning rate will be used for all networks (Q-Values and Actor networks) - it can be a function of the current progress (from 1 to 0) - :param buffer_size: (int) size of the replay buffer - :param batch_size: (int) Minibatch size for each gradient update - :param tau: (float) the soft update coefficient ("polyak update" of the target networks, between 0 and 1) - :param policy_delay: (int) Policy and target networks will only be updated once every policy_delay steps - per training steps. The Q values will be updated policy_delay more often (update every training step). - :param action_noise: (ActionNoise) the action noise type. Cf DDPG for the different action noise type. - :param target_policy_noise: (float) Standard deviation of Gaussian noise added to target policy - (smoothing noise) - :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise. - :param train_freq: (int) Update the model every `train_freq` steps. - :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts - :param gradient_steps: (int) How many gradient update after each step - :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy) - This is not needed for TD3 normally but can help exploring when using HER + TD3. 
- This hack was present in the original OpenAI Baselines repo (DDPG + HER) - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) - :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance - :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param full_tensorboard_log: (bool) enable additional logging when using tensorboard - Note: this has no effect on TD3 logging for now - :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). - If None (default), use random seed. Note that if you want completely deterministic - results, you must set `n_cpu_tf_sess` to 1. - :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations - If None, the number of cpu of the current machine will be used. - """ - def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=50000, - learning_starts=100, train_freq=100, gradient_steps=100, batch_size=128, - tau=0.005, policy_delay=2, action_noise=None, - target_policy_noise=0.2, target_noise_clip=0.5, - random_exploration=0.0, verbose=0, tensorboard_log=None, - _init_setup_model=True, policy_kwargs=None, - full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None): - - super(TD3, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, - policy_base=TD3Policy, requires_vec_env=False, policy_kwargs=policy_kwargs, - seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) - - self.buffer_size = buffer_size - self.learning_rate = learning_rate - self.learning_starts = learning_starts - self.train_freq = train_freq - self.batch_size = batch_size - self.tau = tau - self.gradient_steps = gradient_steps - self.gamma = gamma - self.action_noise = action_noise - self.random_exploration = random_exploration - self.policy_delay = policy_delay - self.target_noise_clip = 
target_noise_clip - self.target_policy_noise = target_policy_noise - - self.graph = None - self.replay_buffer = None - self.sess = None - self.tensorboard_log = tensorboard_log - self.verbose = verbose - self.params = None - self.summary = None - self.policy_tf = None - self.full_tensorboard_log = full_tensorboard_log - - self.obs_target = None - self.target_policy_tf = None - self.actions_ph = None - self.rewards_ph = None - self.terminals_ph = None - self.observations_ph = None - self.action_target = None - self.next_observations_ph = None - self.step_ops = None - self.target_ops = None - self.infos_names = None - self.target_params = None - self.learning_rate_ph = None - self.processed_obs_ph = None - self.processed_next_obs_ph = None - self.policy_out = None - self.policy_train_op = None - self.policy_loss = None - - if _init_setup_model: - self.setup_model() - - def _get_pretrain_placeholders(self): - policy = self.policy_tf - # Rescale - policy_out = unscale_action(self.action_space, self.policy_out) - return policy.obs_ph, self.actions_ph, policy_out - - def setup_model(self): - with SetVerbosity(self.verbose): - self.graph = tf.Graph() - with self.graph.as_default(): - self.set_random_seed(self.seed) - self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) - - self.replay_buffer = ReplayBuffer(self.buffer_size) - - with tf.variable_scope("input", reuse=False): - # Create policy and target TF objects - self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, - **self.policy_kwargs) - self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space, - **self.policy_kwargs) - - # Initialize Placeholders - self.observations_ph = self.policy_tf.obs_ph - # Normalized observation for pixels - self.processed_obs_ph = self.policy_tf.processed_obs - self.next_observations_ph = self.target_policy_tf.obs_ph - self.processed_next_obs_ph = self.target_policy_tf.processed_obs - self.action_target = 
self.target_policy_tf.action_ph - self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') - self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') - self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, - name='actions') - self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") - - with tf.variable_scope("model", reuse=False): - # Create the policy - self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph) - # Use two Q-functions to improve performance by reducing overestimation bias - qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph) - # Q value when following the current policy - qf1_pi, _ = self.policy_tf.make_critics(self.processed_obs_ph, - policy_out, reuse=True) - - with tf.variable_scope("target", reuse=False): - # Create target networks - target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph) - # Target policy smoothing, by adding clipped noise to target actions - target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise) - target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) - # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh) - noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1) - # Q values when following the target policy - qf1_target, qf2_target = self.target_policy_tf.make_critics(self.processed_next_obs_ph, - noisy_target_action) - - with tf.variable_scope("loss", reuse=False): - # Take the min of the two target Q-Values (clipped Double-Q Learning) - min_qf_target = tf.minimum(qf1_target, qf2_target) - - # Targets for Q value regression - q_backup = tf.stop_gradient( - self.rewards_ph + - (1 - self.terminals_ph) * self.gamma * min_qf_target - ) - - # Compute Q-Function loss - qf1_loss = tf.reduce_mean((q_backup - qf1) ** 2) - qf2_loss = 
tf.reduce_mean((q_backup - qf2) ** 2) - - qvalues_losses = qf1_loss + qf2_loss - - # Policy loss: maximise q value - self.policy_loss = policy_loss = -tf.reduce_mean(qf1_pi) - - # Policy train op - # will be called only every n training steps, - # where n is the policy delay - policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) - policy_train_op = policy_optimizer.minimize(policy_loss, var_list=tf_util.get_trainable_vars('model/pi')) - self.policy_train_op = policy_train_op - - # Q Values optimizer - qvalues_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) - qvalues_params = tf_util.get_trainable_vars('model/values_fn/') - - # Q Values and policy target params - source_params = tf_util.get_trainable_vars("model/") - target_params = tf_util.get_trainable_vars("target/") - - # Polyak averaging for target variables - self.target_ops = [ - tf.assign(target, (1 - self.tau) * target + self.tau * source) - for target, source in zip(target_params, source_params) - ] - - # Initializing target to match source variables - target_init_op = [ - tf.assign(target, source) - for target, source in zip(target_params, source_params) - ] - - train_values_op = qvalues_optimizer.minimize(qvalues_losses, var_list=qvalues_params) - - self.infos_names = ['qf1_loss', 'qf2_loss'] - # All ops to call during one training step - self.step_ops = [qf1_loss, qf2_loss, - qf1, qf2, train_values_op] - - # Monitor losses and entropy in tensorboard - tf.summary.scalar('policy_loss', policy_loss) - tf.summary.scalar('qf1_loss', qf1_loss) - tf.summary.scalar('qf2_loss', qf2_loss) - tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) - - # Retrieve parameters that must be saved - self.params = tf_util.get_trainable_vars("model") - self.target_params = tf_util.get_trainable_vars("target/") - - # Initialize Variables and target network - with self.sess.as_default(): - self.sess.run(tf.global_variables_initializer()) - 
self.sess.run(target_init_op) - - self.summary = tf.summary.merge_all() - - def _train_step(self, step, writer, learning_rate, update_policy): - # Sample a batch from the replay buffer - batch = self.replay_buffer.sample(self.batch_size, env=self._vec_normalize_env) - batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch - - feed_dict = { - self.observations_ph: batch_obs, - self.actions_ph: batch_actions, - self.next_observations_ph: batch_next_obs, - self.rewards_ph: batch_rewards.reshape(self.batch_size, -1), - self.terminals_ph: batch_dones.reshape(self.batch_size, -1), - self.learning_rate_ph: learning_rate - } - - step_ops = self.step_ops - if update_policy: - # Update policy and target networks - step_ops = step_ops + [self.policy_train_op, self.target_ops, self.policy_loss] - - # Do one gradient step - # and optionally compute log for tensorboard - if writer is not None: - out = self.sess.run([self.summary] + step_ops, feed_dict) - summary = out.pop(0) - writer.add_summary(summary, step) - else: - out = self.sess.run(step_ops, feed_dict) - - # Unpack to monitor losses - qf1_loss, qf2_loss, *_values = out - - return qf1_loss, qf2_loss - - def learn(self, total_timesteps, callback=None, - log_interval=4, tb_log_name="TD3", reset_num_timesteps=True, replay_wrapper=None): - - new_tb_log = self._init_num_timesteps(reset_num_timesteps) - callback = self._init_callback(callback) - - if replay_wrapper is not None: - self.replay_buffer = replay_wrapper(self.replay_buffer) - - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ - as writer: - - self._setup_learn() - - # Transform to callable if needed - self.learning_rate = get_schedule_fn(self.learning_rate) - # Initial learning rate - current_lr = self.learning_rate(1) - - start_time = time.time() - episode_rewards = [0.0] - episode_successes = [] - if self.action_noise is not None: - self.action_noise.reset() - obs = 
self.env.reset() - # Retrieve unnormalized observation for saving into the buffer - if self._vec_normalize_env is not None: - obs_ = self._vec_normalize_env.get_original_obs().squeeze() - n_updates = 0 - infos_values = [] - - callback.on_training_start(locals(), globals()) - callback.on_rollout_start() - - for step in range(total_timesteps): - # Before training starts, randomly sample actions - # from a uniform distribution for better exploration. - # Afterwards, use the learned policy - # if random_exploration is set to 0 (normal setting) - if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration: - # actions sampled from action space are from range specific to the environment - # but algorithm operates on tanh-squashed actions therefore simple scaling is used - unscaled_action = self.env.action_space.sample() - action = scale_action(self.action_space, unscaled_action) - else: - action = self.policy_tf.step(obs[None]).flatten() - # Add noise to the action, as the policy - # is deterministic, this is required for exploration - if self.action_noise is not None: - action = np.clip(action + self.action_noise(), -1, 1) - # Rescale from [-1, 1] to the correct bounds - unscaled_action = unscale_action(self.action_space, action) - - assert action.shape == self.env.action_space.shape - - new_obs, reward, done, info = self.env.step(unscaled_action) - - self.num_timesteps += 1 - - # Only stop training if return value is False, not when it is None. This is for backwards - # compatibility with callbacks that have no return statement. 
- callback.update_locals(locals()) - if callback.on_step() is False: - break - - # Store only the unnormalized version - if self._vec_normalize_env is not None: - new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() - reward_ = self._vec_normalize_env.get_original_reward().squeeze() - else: - # Avoid changing the original ones - obs_, new_obs_, reward_ = obs, new_obs, reward - - # Store transition in the replay buffer. - self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info) - obs = new_obs - # Save the unnormalized observation - if self._vec_normalize_env is not None: - obs_ = new_obs_ - - # Retrieve reward and episode length if using Monitor wrapper - maybe_ep_info = info.get('episode') - if maybe_ep_info is not None: - self.ep_info_buf.extend([maybe_ep_info]) - - if writer is not None: - # Write reward per episode to tensorboard - ep_reward = np.array([reward_]).reshape((1, -1)) - ep_done = np.array([done]).reshape((1, -1)) - tf_util.total_episode_reward_logger(self.episode_reward, ep_reward, - ep_done, writer, self.num_timesteps) - - if self.num_timesteps % self.train_freq == 0: - callback.on_rollout_end() - - mb_infos_vals = [] - # Update policy, critics and target networks - for grad_step in range(self.gradient_steps): - # Break if the warmup phase is not over - # or if there are not enough samples in the replay buffer - if not self.replay_buffer.can_sample(self.batch_size) \ - or self.num_timesteps < self.learning_starts: - break - n_updates += 1 - # Compute current learning_rate - frac = 1.0 - step / total_timesteps - current_lr = self.learning_rate(frac) - # Update policy and critics (q functions) - # Note: the policy is updated less frequently than the Q functions - # this is controlled by the `policy_delay` parameter - mb_infos_vals.append( - self._train_step(step, writer, current_lr, (step + grad_step) % self.policy_delay == 0)) - - # Log losses and entropy, useful for monitor training - if len(mb_infos_vals) > 0: - infos_values 
= np.mean(mb_infos_vals, axis=0) - - callback.on_rollout_start() - - episode_rewards[-1] += reward_ - if done: - if self.action_noise is not None: - self.action_noise.reset() - if not isinstance(self.env, VecEnv): - obs = self.env.reset() - episode_rewards.append(0.0) - - maybe_is_success = info.get('is_success') - if maybe_is_success is not None: - episode_successes.append(float(maybe_is_success)) - - if len(episode_rewards[-101:-1]) == 0: - mean_reward = -np.inf - else: - mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) - - # substract 1 as we appended a new term just now - num_episodes = len(episode_rewards) - 1 - # Display training infos - if self.verbose >= 1 and done and log_interval is not None and num_episodes % log_interval == 0: - fps = int(step / (time.time() - start_time)) - logger.logkv("episodes", num_episodes) - logger.logkv("mean 100 episode reward", mean_reward) - if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: - logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) - logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) - logger.logkv("n_updates", n_updates) - logger.logkv("current_lr", current_lr) - logger.logkv("fps", fps) - logger.logkv('time_elapsed', int(time.time() - start_time)) - if len(episode_successes) > 0: - logger.logkv("success rate", np.mean(episode_successes[-100:])) - if len(infos_values) > 0: - for (name, val) in zip(self.infos_names, infos_values): - logger.logkv(name, val) - logger.logkv("total timesteps", self.num_timesteps) - logger.dumpkvs() - # Reset infos: - infos_values = [] - - callback.on_training_end() - return self - - def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): - _ = np.array(observation) - - if actions is not None: - raise ValueError("Error: TD3 does not have action probabilities.") - - # here there are no action probabilities, as DDPG does not use a probability distribution - 
warnings.warn("Warning: action probability is meaningless for TD3. Returning None") - return None - - def predict(self, observation, state=None, mask=None, deterministic=True): - observation = np.array(observation) - vectorized_env = self._is_vectorized_observation(observation, self.observation_space) - - observation = observation.reshape((-1,) + self.observation_space.shape) - actions = self.policy_tf.step(observation) - - if self.action_noise is not None and not deterministic: - actions = np.clip(actions + self.action_noise(), -1, 1) - - actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape - actions = unscale_action(self.action_space, actions) # scale the output for the prediction - - if not vectorized_env: - actions = actions[0] - - return actions, None - - def get_parameter_list(self): - return (self.params + - self.target_params) - - def save(self, save_path, cloudpickle=False): - data = { - "learning_rate": self.learning_rate, - "buffer_size": self.buffer_size, - "learning_starts": self.learning_starts, - "train_freq": self.train_freq, - "batch_size": self.batch_size, - "tau": self.tau, - # Should we also store the replay buffer? 
- # this may lead to high memory usage - # with all transition inside - # "replay_buffer": self.replay_buffer - "policy_delay": self.policy_delay, - "target_noise_clip": self.target_noise_clip, - "target_policy_noise": self.target_policy_noise, - "gamma": self.gamma, - "verbose": self.verbose, - "observation_space": self.observation_space, - "action_space": self.action_space, - "policy": self.policy, - "n_envs": self.n_envs, - "n_cpu_tf_sess": self.n_cpu_tf_sess, - "seed": self.seed, - "action_noise": self.action_noise, - "random_exploration": self.random_exploration, - "_vectorize_action": self._vectorize_action, - "policy_kwargs": self.policy_kwargs - } - - params_to_save = self.get_parameters() - - self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle) diff --git a/stable_baselines/trpo_mpi/__init__.py b/stable_baselines/trpo_mpi/__init__.py deleted file mode 100644 index 601d7f88..00000000 --- a/stable_baselines/trpo_mpi/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from stable_baselines.trpo_mpi.trpo_mpi import TRPO diff --git a/stable_baselines/trpo_mpi/run_atari.py b/stable_baselines/trpo_mpi/run_atari.py deleted file mode 100644 index dab662b2..00000000 --- a/stable_baselines/trpo_mpi/run_atari.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -import os - -from mpi4py import MPI - -from stable_baselines.common import set_global_seeds -from stable_baselines import bench, logger, TRPO -from stable_baselines.common.atari_wrappers import make_atari, wrap_deepmind -from stable_baselines.common.cmd_util import atari_arg_parser -from stable_baselines.common.policies import CnnPolicy - - -def train(env_id, num_timesteps, seed): - """ - Train TRPO model for the atari environment, for testing purposes - - :param env_id: (str) Environment ID - :param num_timesteps: (int) The total number of samples - :param seed: (int) The initial seed for training - """ - rank = MPI.COMM_WORLD.Get_rank() - - if rank == 0: - logger.configure() - 
else: - logger.configure(format_strs=[]) - - workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() - set_global_seeds(workerseed) - env = make_atari(env_id) - - env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) - env.seed(workerseed) - - env = wrap_deepmind(env) - env.seed(workerseed) - - model = TRPO(CnnPolicy, env, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, entcoeff=0.0, - gamma=0.98, lam=1, vf_iters=3, vf_stepsize=1e-4) - model.learn(total_timesteps=int(num_timesteps * 1.1)) - env.close() - # Free memory - del env - - -def main(): - """ - Runs the test - """ - args = atari_arg_parser().parse_args() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - - -if __name__ == "__main__": - main() diff --git a/stable_baselines/trpo_mpi/run_mujoco.py b/stable_baselines/trpo_mpi/run_mujoco.py deleted file mode 100644 index de303dba..00000000 --- a/stable_baselines/trpo_mpi/run_mujoco.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -# noinspection PyUnresolvedReferences -from mpi4py import MPI - -from stable_baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser -from stable_baselines.common.policies import MlpPolicy -from stable_baselines import logger -from stable_baselines.trpo_mpi import TRPO -import stable_baselines.common.tf_util as tf_util - - -def train(env_id, num_timesteps, seed): - """ - Train TRPO model for the mujoco environment, for testing purposes - - :param env_id: (str) Environment ID - :param num_timesteps: (int) The total number of samples - :param seed: (int) The initial seed for training - """ - with tf_util.single_threaded_session(): - rank = MPI.COMM_WORLD.Get_rank() - if rank == 0: - logger.configure() - else: - logger.configure(format_strs=[]) - logger.set_level(logger.DISABLED) - workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() - - env = make_mujoco_env(env_id, workerseed) - model = TRPO(MlpPolicy, env, timesteps_per_batch=1024, 
max_kl=0.01, cg_iters=10, cg_damping=0.1, entcoeff=0.0, - gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3) - model.learn(total_timesteps=num_timesteps) - env.close() - - -def main(): - """ - Runs the test - """ - args = mujoco_arg_parser().parse_args() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) - - -if __name__ == '__main__': - main() diff --git a/stable_baselines/trpo_mpi/trpo_mpi.py b/stable_baselines/trpo_mpi/trpo_mpi.py deleted file mode 100644 index 7d95356d..00000000 --- a/stable_baselines/trpo_mpi/trpo_mpi.py +++ /dev/null @@ -1,538 +0,0 @@ -import time -from contextlib import contextmanager -from collections import deque - -import gym -from mpi4py import MPI -import tensorflow as tf -import numpy as np - -import stable_baselines.common.tf_util as tf_util -from stable_baselines.common.tf_util import total_episode_reward_logger -from stable_baselines.common import explained_variance, zipsame, dataset, fmt_row, colorize, ActorCriticRLModel, \ - SetVerbosity, TensorboardWriter -from stable_baselines import logger -from stable_baselines.common.mpi_adam import MpiAdam -from stable_baselines.common.cg import conjugate_gradient -from stable_baselines.common.policies import ActorCriticPolicy -from stable_baselines.common.misc_util import flatten_lists -from stable_baselines.common.runners import traj_segment_generator -from stable_baselines.trpo_mpi.utils import add_vtarg_and_adv - - -class TRPO(ActorCriticRLModel): - """ - Trust Region Policy Optimization (https://arxiv.org/abs/1502.05477) - - :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...) 
- :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) - :param gamma: (float) the discount value - :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon) - :param max_kl: (float) the Kullback-Leibler loss threshold - :param cg_iters: (int) the number of iterations for the conjugate gradient calculation - :param lam: (float) GAE factor - :param entcoeff: (float) the weight for the entropy loss - :param cg_damping: (float) the compute gradient dampening factor - :param vf_stepsize: (float) the value function stepsize - :param vf_iters: (int) the value function's number iterations for learning - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) - :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance - :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param full_tensorboard_log: (bool) enable additional logging when using tensorboard - WARNING: this logging can take a lot of space quickly - :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). - If None (default), use random seed. Note that if you want completely deterministic - results, you must set `n_cpu_tf_sess` to 1. - :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations - If None, the number of cpu of the current machine will be used. 
- """ - def __init__(self, policy, env, gamma=0.99, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, lam=0.98, - entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, verbose=0, tensorboard_log=None, - _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, - seed=None, n_cpu_tf_sess=1): - super(TRPO, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False, - _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs, - seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) - - self.using_gail = False - self.timesteps_per_batch = timesteps_per_batch - self.cg_iters = cg_iters - self.cg_damping = cg_damping - self.gamma = gamma - self.lam = lam - self.max_kl = max_kl - self.vf_iters = vf_iters - self.vf_stepsize = vf_stepsize - self.entcoeff = entcoeff - self.tensorboard_log = tensorboard_log - self.full_tensorboard_log = full_tensorboard_log - - # GAIL Params - self.hidden_size_adversary = 100 - self.adversary_entcoeff = 1e-3 - self.expert_dataset = None - self.g_step = 1 - self.d_step = 1 - self.d_stepsize = 3e-4 - - self.graph = None - self.sess = None - self.policy_pi = None - self.loss_names = None - self.assign_old_eq_new = None - self.compute_losses = None - self.compute_lossandgrad = None - self.compute_fvp = None - self.compute_vflossandgrad = None - self.d_adam = None - self.vfadam = None - self.get_flat = None - self.set_from_flat = None - self.timed = None - self.allmean = None - self.nworkers = None - self.rank = None - self.reward_giver = None - self.step = None - self.proba_step = None - self.initial_state = None - self.params = None - self.summary = None - - if _init_setup_model: - self.setup_model() - - def _get_pretrain_placeholders(self): - policy = self.policy_pi - action_ph = policy.pdtype.sample_placeholder([None]) - if isinstance(self.action_space, gym.spaces.Discrete): - return policy.obs_ph, action_ph, policy.policy - return policy.obs_ph, action_ph, policy.deterministic_action - - def 
setup_model(self): - # prevent import loops - from stable_baselines.gail.adversary import TransitionClassifier - - with SetVerbosity(self.verbose): - - assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \ - "an instance of common.policies.ActorCriticPolicy." - - self.nworkers = MPI.COMM_WORLD.Get_size() - self.rank = MPI.COMM_WORLD.Get_rank() - np.set_printoptions(precision=3) - - self.graph = tf.Graph() - with self.graph.as_default(): - self.set_random_seed(self.seed) - self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) - - if self.using_gail: - self.reward_giver = TransitionClassifier(self.observation_space, self.action_space, - self.hidden_size_adversary, - entcoeff=self.adversary_entcoeff) - - # Construct network for new policy - self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, - None, reuse=False, **self.policy_kwargs) - - # Network for old policy - with tf.variable_scope("oldpi", reuse=False): - old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, - None, reuse=False, **self.policy_kwargs) - - with tf.variable_scope("loss", reuse=False): - atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) - ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return - - observation = self.policy_pi.obs_ph - action = self.policy_pi.pdtype.sample_placeholder([None]) - - kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution) - ent = self.policy_pi.proba_distribution.entropy() - meankl = tf.reduce_mean(kloldnew) - meanent = tf.reduce_mean(ent) - entbonus = self.entcoeff * meanent - - vferr = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret)) - - # advantage * pnew / pold - ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) - - old_policy.proba_distribution.logp(action)) - surrgain = tf.reduce_mean(ratio * 
atarg) - - optimgain = surrgain + entbonus - losses = [optimgain, meankl, entbonus, surrgain, meanent] - self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] - - dist = meankl - - all_var_list = tf_util.get_trainable_vars("model") - var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name] - vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name] - - self.get_flat = tf_util.GetFlat(var_list, sess=self.sess) - self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) - - klgrads = tf.gradients(dist, var_list) - flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") - shapes = [var.get_shape().as_list() for var in var_list] - start = 0 - tangents = [] - for shape in shapes: - var_size = tf_util.intprod(shape) - tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape)) - start += var_size - gvp = tf.add_n([tf.reduce_sum(grad * tangent) - for (grad, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 - # Fisher vector products - fvp = tf_util.flatgrad(gvp, var_list) - - tf.summary.scalar('entropy_loss', meanent) - tf.summary.scalar('policy_gradient_loss', optimgain) - tf.summary.scalar('value_function_loss', surrgain) - tf.summary.scalar('approximate_kullback-leibler', meankl) - tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent) - - self.assign_old_eq_new = \ - tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in - zipsame(tf_util.get_globals_vars("oldpi"), - tf_util.get_globals_vars("model"))]) - self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg], losses) - self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg], - fvp) - self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret], - tf_util.flatgrad(vferr, vf_var_list)) - - @contextmanager - def timed(msg): - if self.rank == 
0 and self.verbose >= 1: - print(colorize(msg, color='magenta')) - start_time = time.time() - yield - print(colorize("done in {:.3f} seconds".format((time.time() - start_time)), - color='magenta')) - else: - yield - - def allmean(arr): - assert isinstance(arr, np.ndarray) - out = np.empty_like(arr) - MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) - out /= self.nworkers - return out - - tf_util.initialize(sess=self.sess) - - th_init = self.get_flat() - MPI.COMM_WORLD.Bcast(th_init, root=0) - self.set_from_flat(th_init) - - with tf.variable_scope("Adam_mpi", reuse=False): - self.vfadam = MpiAdam(vf_var_list, sess=self.sess) - if self.using_gail: - self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess) - self.d_adam.sync() - self.vfadam.sync() - - with tf.variable_scope("input_info", reuse=False): - tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) - tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize)) - tf.summary.scalar('advantage', tf.reduce_mean(atarg)) - tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl)) - - if self.full_tensorboard_log: - tf.summary.histogram('discounted_rewards', ret) - tf.summary.histogram('learning_rate', self.vf_stepsize) - tf.summary.histogram('advantage', atarg) - tf.summary.histogram('kl_clip_range', self.max_kl) - if tf_util.is_image(self.observation_space): - tf.summary.image('observation', observation) - else: - tf.summary.histogram('observation', observation) - - self.timed = timed - self.allmean = allmean - - self.step = self.policy_pi.step - self.proba_step = self.policy_pi.proba_step - self.initial_state = self.policy_pi.initial_state - - self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi") - if self.using_gail: - self.params.extend(self.reward_giver.get_trainable_variables()) - - self.summary = tf.summary.merge_all() - - self.compute_lossandgrad = \ - tf_util.function([observation, old_policy.obs_ph, action, atarg, ret], - 
[self.summary, tf_util.flatgrad(optimgain, var_list)] + losses) - - def _initialize_dataloader(self): - """Initialize dataloader.""" - batchsize = self.timesteps_per_batch // self.d_step - self.expert_dataset.init_dataloader(batchsize) - - def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="TRPO", - reset_num_timesteps=True): - - new_tb_log = self._init_num_timesteps(reset_num_timesteps) - callback = self._init_callback(callback) - - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ - as writer: - self._setup_learn() - - with self.sess.as_default(): - callback.on_training_start(locals(), globals()) - - seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_batch, - reward_giver=self.reward_giver, - gail=self.using_gail, callback=callback) - - episodes_so_far = 0 - timesteps_so_far = 0 - iters_so_far = 0 - t_start = time.time() - len_buffer = deque(maxlen=40) # rolling buffer for episode lengths - reward_buffer = deque(maxlen=40) # rolling buffer for episode rewards - - true_reward_buffer = None - if self.using_gail: - true_reward_buffer = deque(maxlen=40) - - self._initialize_dataloader() - - # Stats not used for now - # TODO: replace with normal tb logging - #  g_loss_stats = Stats(loss_names) - # d_loss_stats = Stats(reward_giver.loss_name) - # ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"]) - - while True: - if timesteps_so_far >= total_timesteps: - break - - logger.log("********** Iteration %i ************" % iters_so_far) - - def fisher_vector_product(vec): - return self.allmean(self.compute_fvp(vec, *fvpargs, sess=self.sess)) + self.cg_damping * vec - - # ------------------ Update G ------------------ - logger.log("Optimizing Policy...") - # g_step = 1 when not using GAIL - mean_losses = None - vpredbefore = None - tdlamret = None - observation = None - action = None - seg = None - for k in range(self.g_step): - with 
self.timed("sampling"): - seg = seg_gen.__next__() - - # Stop training early (triggered by the callback) - if not seg.get('continue_training', True): # pytype: disable=attribute-error - break - - add_vtarg_and_adv(seg, self.gamma, self.lam) - # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) - observation, action = seg["observations"], seg["actions"] - atarg, tdlamret = seg["adv"], seg["tdlamret"] - - vpredbefore = seg["vpred"] # predicted value function before update - atarg = (atarg - atarg.mean()) / (atarg.std() + 1e-8) # standardized advantage function estimate - - # true_rew is the reward without discount - if writer is not None: - total_episode_reward_logger(self.episode_reward, - seg["true_rewards"].reshape( - (self.n_envs, -1)), - seg["dones"].reshape((self.n_envs, -1)), - writer, self.num_timesteps) - - args = seg["observations"], seg["observations"], seg["actions"], atarg - # Subsampling: see p40-42 of John Schulman thesis - # http://joschu.net/docs/thesis.pdf - fvpargs = [arr[::5] for arr in args] - - self.assign_old_eq_new(sess=self.sess) - - with self.timed("computegrad"): - steps = self.num_timesteps + (k + 1) * (seg["total_timestep"] / self.g_step) - run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - run_metadata = tf.RunMetadata() if self.full_tensorboard_log else None - # run loss backprop with summary, and save the metadata (memory, compute time, ...) 
- if writer is not None: - summary, grad, *lossbefore = self.compute_lossandgrad(*args, tdlamret, sess=self.sess, - options=run_options, - run_metadata=run_metadata) - if self.full_tensorboard_log: - writer.add_run_metadata(run_metadata, 'step%d' % steps) - writer.add_summary(summary, steps) - else: - _, grad, *lossbefore = self.compute_lossandgrad(*args, tdlamret, sess=self.sess, - options=run_options, - run_metadata=run_metadata) - - lossbefore = self.allmean(np.array(lossbefore)) - grad = self.allmean(grad) - if np.allclose(grad, 0): - logger.log("Got zero gradient. not updating") - else: - with self.timed("conjugate_gradient"): - stepdir = conjugate_gradient(fisher_vector_product, grad, cg_iters=self.cg_iters, - verbose=self.rank == 0 and self.verbose >= 1) - assert np.isfinite(stepdir).all() - shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) - # abs(shs) to avoid taking square root of negative values - lagrange_multiplier = np.sqrt(abs(shs) / self.max_kl) - # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) - fullstep = stepdir / lagrange_multiplier - expectedimprove = grad.dot(fullstep) - surrbefore = lossbefore[0] - stepsize = 1.0 - thbefore = self.get_flat() - for _ in range(10): - thnew = thbefore + fullstep * stepsize - self.set_from_flat(thnew) - mean_losses = surr, kl_loss, *_ = self.allmean( - np.array(self.compute_losses(*args, sess=self.sess))) - improve = surr - surrbefore - logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) - if not np.isfinite(mean_losses).all(): - logger.log("Got non-finite value of losses -- bad!") - elif kl_loss > self.max_kl * 1.5: - logger.log("violated KL constraint. shrinking step.") - elif improve < 0: - logger.log("surrogate didn't improve. 
shrinking step.") - else: - logger.log("Stepsize OK!") - break - stepsize *= .5 - else: - logger.log("couldn't compute a good step") - self.set_from_flat(thbefore) - if self.nworkers > 1 and iters_so_far % 20 == 0: - # list of tuples - paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), self.vfadam.getflat().sum())) - assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) - - for (loss_name, loss_val) in zip(self.loss_names, mean_losses): - logger.record_tabular(loss_name, loss_val) - - with self.timed("vf"): - for _ in range(self.vf_iters): - # NOTE: for recurrent policies, use shuffle=False? - for (mbob, mbret) in dataset.iterbatches((seg["observations"], seg["tdlamret"]), - include_final_partial_batch=False, - batch_size=128, - shuffle=True): - grad = self.allmean(self.compute_vflossandgrad(mbob, mbob, mbret, sess=self.sess)) - self.vfadam.update(grad, self.vf_stepsize) - - # Stop training early (triggered by the callback) - if not seg.get('continue_training', True): # pytype: disable=attribute-error - break - - logger.record_tabular("explained_variance_tdlam_before", - explained_variance(vpredbefore, tdlamret)) - - if self.using_gail: - # ------------------ Update D ------------------ - logger.log("Optimizing Discriminator...") - logger.log(fmt_row(13, self.reward_giver.loss_name)) - assert len(observation) == self.timesteps_per_batch - batch_size = self.timesteps_per_batch // self.d_step - - # NOTE: uses only the last g step for observation - d_losses = [] # list of tuples, each of which gives the loss for a minibatch - # NOTE: for recurrent policies, use shuffle=False? 
- for ob_batch, ac_batch in dataset.iterbatches((observation, action), - include_final_partial_batch=False, - batch_size=batch_size, - shuffle=True): - ob_expert, ac_expert = self.expert_dataset.get_next_batch() - # update running mean/std for reward_giver - if self.reward_giver.normalize: - self.reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0)) - - # Reshape actions if needed when using discrete actions - if isinstance(self.action_space, gym.spaces.Discrete): - if len(ac_batch.shape) == 2: - ac_batch = ac_batch[:, 0] - if len(ac_expert.shape) == 2: - ac_expert = ac_expert[:, 0] - *newlosses, grad = self.reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) - self.d_adam.update(self.allmean(grad), self.d_stepsize) - d_losses.append(newlosses) - logger.log(fmt_row(13, np.mean(d_losses, axis=0))) - - # lr: lengths and rewards - lr_local = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local values - list_lr_pairs = MPI.COMM_WORLD.allgather(lr_local) # list of tuples - lens, rews, true_rets = map(flatten_lists, zip(*list_lr_pairs)) - true_reward_buffer.extend(true_rets) - else: - # lr: lengths and rewards - lr_local = (seg["ep_lens"], seg["ep_rets"]) # local values - list_lr_pairs = MPI.COMM_WORLD.allgather(lr_local) # list of tuples - lens, rews = map(flatten_lists, zip(*list_lr_pairs)) - len_buffer.extend(lens) - reward_buffer.extend(rews) - - if len(len_buffer) > 0: - logger.record_tabular("EpLenMean", np.mean(len_buffer)) - logger.record_tabular("EpRewMean", np.mean(reward_buffer)) - if self.using_gail: - logger.record_tabular("EpTrueRewMean", np.mean(true_reward_buffer)) - logger.record_tabular("EpThisIter", len(lens)) - episodes_so_far += len(lens) - current_it_timesteps = MPI.COMM_WORLD.allreduce(seg["total_timestep"]) - timesteps_so_far += current_it_timesteps - self.num_timesteps += current_it_timesteps - iters_so_far += 1 - - logger.record_tabular("EpisodesSoFar", episodes_so_far) - 
logger.record_tabular("TimestepsSoFar", self.num_timesteps) - logger.record_tabular("TimeElapsed", time.time() - t_start) - - if self.verbose >= 1 and self.rank == 0: - logger.dump_tabular() - - callback.on_training_end() - return self - - def save(self, save_path, cloudpickle=False): - data = { - "gamma": self.gamma, - "timesteps_per_batch": self.timesteps_per_batch, - "max_kl": self.max_kl, - "cg_iters": self.cg_iters, - "lam": self.lam, - "entcoeff": self.entcoeff, - "cg_damping": self.cg_damping, - "vf_stepsize": self.vf_stepsize, - "vf_iters": self.vf_iters, - "hidden_size_adversary": self.hidden_size_adversary, - "adversary_entcoeff": self.adversary_entcoeff, - "expert_dataset": self.expert_dataset, - "g_step": self.g_step, - "d_step": self.d_step, - "d_stepsize": self.d_stepsize, - "using_gail": self.using_gail, - "verbose": self.verbose, - "policy": self.policy, - "observation_space": self.observation_space, - "action_space": self.action_space, - "n_envs": self.n_envs, - "n_cpu_tf_sess": self.n_cpu_tf_sess, - "seed": self.seed, - "_vectorize_action": self._vectorize_action, - "policy_kwargs": self.policy_kwargs - } - - params_to_save = self.get_parameters() - - self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle) diff --git a/stable_baselines/trpo_mpi/utils.py b/stable_baselines/trpo_mpi/utils.py deleted file mode 100644 index 8661edcb..00000000 --- a/stable_baselines/trpo_mpi/utils.py +++ /dev/null @@ -1,23 +0,0 @@ -import numpy as np - - -def add_vtarg_and_adv(seg, gamma, lam): - """ - Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) - - :param seg: (dict) the current segment of the trajectory (see traj_segment_generator return for more information) - :param gamma: (float) Discount factor - :param lam: (float) GAE factor - """ - # last element is only used for last vtarg, but we already zeroed it if last new = 1 - episode_starts = np.append(seg["episode_starts"], False) - vpred = 
np.append(seg["vpred"], seg["nextvpred"]) - rew_len = len(seg["rewards"]) - seg["adv"] = np.empty(rew_len, 'float32') - rewards = seg["rewards"] - lastgaelam = 0 - for step in reversed(range(rew_len)): - nonterminal = 1 - float(episode_starts[step + 1]) - delta = rewards[step] + gamma * vpred[step + 1] * nonterminal - vpred[step] - seg["adv"][step] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam - seg["tdlamret"] = seg["adv"] + seg["vpred"] diff --git a/stable_baselines/version.txt b/stable_baselines/version.txt deleted file mode 100644 index af104faf..00000000 --- a/stable_baselines/version.txt +++ /dev/null @@ -1 +0,0 @@ -2.10.3a0 diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_0deterministic.py b/tests/test_0deterministic.py deleted file mode 100644 index 3e87cf54..00000000 --- a/tests/test_0deterministic.py +++ /dev/null @@ -1,60 +0,0 @@ -import pytest -import numpy as np - -from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, PPO1, PPO2, SAC, TRPO, TD3 -from stable_baselines.common.noise import NormalActionNoise - -N_STEPS_TRAINING = 300 -SEED = 0 - - -# Weird stuff: TD3 would fail if another algorithm is tested before -# with n_cpu_tf_sess > 1 -@pytest.mark.xfail(reason="TD3 deterministic randomly fail when run with others...", strict=False) -def test_deterministic_td3(): - results = [[], []] - rewards = [[], []] - kwargs = {'n_cpu_tf_sess': 1} - env_id = 'Pendulum-v0' - kwargs.update({'action_noise': NormalActionNoise(0.0, 0.1)}) - - for i in range(2): - model = TD3('MlpPolicy', env_id, seed=SEED, **kwargs) - model.learn(N_STEPS_TRAINING) - env = model.get_env() - obs = env.reset() - for _ in range(20): - action, _ = model.predict(obs, deterministic=True) - obs, reward, _, _ = env.step(action) - results[i].append(action) - rewards[i].append(reward) - # without the extended tolerance, test fails for unknown reasons on Github... 
- assert np.allclose(results[0], results[1], rtol=1e-2), results - assert np.allclose(rewards[0], rewards[1], rtol=1e-2), rewards - - -@pytest.mark.parametrize("algo", [A2C, ACKTR, ACER, DDPG, DQN, PPO1, PPO2, SAC, TRPO]) -def test_deterministic_training_common(algo): - results = [[], []] - rewards = [[], []] - kwargs = {'n_cpu_tf_sess': 1} - if algo in [DDPG, TD3, SAC]: - env_id = 'Pendulum-v0' - kwargs.update({'action_noise': NormalActionNoise(0.0, 0.1)}) - else: - env_id = 'CartPole-v1' - if algo == DQN: - kwargs.update({'learning_starts': 100}) - - for i in range(2): - model = algo('MlpPolicy', env_id, seed=SEED, **kwargs) - model.learn(N_STEPS_TRAINING) - env = model.get_env() - obs = env.reset() - for _ in range(20): - action, _ = model.predict(obs, deterministic=False) - obs, reward, _, _ = env.step(action) - results[i].append(action) - rewards[i].append(reward) - assert sum(results[0]) == sum(results[1]), results - assert sum(rewards[0]) == sum(rewards[1]), rewards diff --git a/tests/test_a2c.py b/tests/test_a2c.py deleted file mode 100644 index c2f1f09c..00000000 --- a/tests/test_a2c.py +++ /dev/null @@ -1,26 +0,0 @@ -import os - -import pytest -import gym - -from stable_baselines import A2C -from stable_baselines.common import make_vec_env -from stable_baselines.common.vec_env import DummyVecEnv - - -def test_a2c_update_n_batch_on_load(tmp_path): - env = make_vec_env("CartPole-v1", n_envs=2) - model = A2C("MlpPolicy", env, n_steps=10) - - model.learn(total_timesteps=100) - model.save(os.path.join(str(tmp_path), "a2c_cartpole.zip")) - - del model - - model = A2C.load(os.path.join(str(tmp_path), "a2c_cartpole.zip")) - test_env = DummyVecEnv([lambda: gym.make("CartPole-v1")]) - - model.set_env(test_env) - assert model.n_batch == 10 - model.learn(100) - os.remove(os.path.join(str(tmp_path), "a2c_cartpole.zip")) diff --git a/tests/test_a2c_conv.py b/tests/test_a2c_conv.py deleted file mode 100644 index 59de1e04..00000000 --- a/tests/test_a2c_conv.py +++ 
/dev/null @@ -1,40 +0,0 @@ -import gym -import numpy as np -import tensorflow as tf - -from stable_baselines.common.tf_layers import conv -from stable_baselines.common.input import observation_input - - -ENV_ID = 'BreakoutNoFrameskip-v4' -SEED = 3 - - -def test_conv_kernel(): - """Test convolution kernel with various input formats.""" - filter_size_1 = 4 # The size of squared filter for the first layer - filter_size_2 = (3, 5) # The size of non-squared filter for the second layer - target_shape_1 = [2, 52, 40, 32] # The desired shape of the first layer - target_shape_2 = [2, 13, 9, 32] # The desired shape of the second layer - kwargs = {} - n_envs = 1 - n_steps = 2 - n_batch = n_envs * n_steps - scale = False - env = gym.make(ENV_ID) - ob_space = env.observation_space - - with tf.Graph().as_default(): - _, scaled_images = observation_input(ob_space, n_batch, scale=scale) - activ = tf.nn.relu - layer_1 = activ(conv(scaled_images, 'c1', n_filters=32, filter_size=filter_size_1, - stride=4, init_scale=np.sqrt(2), **kwargs)) - layer_2 = activ(conv(layer_1, 'c2', n_filters=32, filter_size=filter_size_2, - stride=4, init_scale=np.sqrt(2), **kwargs)) - assert layer_1.shape == target_shape_1, \ - "The shape of layer based on the squared kernel matrix is not correct. " \ - "The current shape is {} and the desired shape is {}".format(layer_1.shape, target_shape_1) - assert layer_2.shape == target_shape_2, \ - "The shape of layer based on the non-squared kernel matrix is not correct. 
" \ - "The current shape is {} and the desired shape is {}".format(layer_2.shape, target_shape_2) - env.close() diff --git a/tests/test_action_scaling.py b/tests/test_action_scaling.py deleted file mode 100644 index 5d6ca20f..00000000 --- a/tests/test_action_scaling.py +++ /dev/null @@ -1,45 +0,0 @@ -import pytest -import numpy as np - -from stable_baselines import DDPG, TD3, SAC -from stable_baselines.common.identity_env import IdentityEnvBox - -ROLLOUT_STEPS = 100 - -MODEL_LIST = [ - (DDPG, dict(nb_train_steps=0, nb_rollout_steps=ROLLOUT_STEPS)), - (TD3, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=0)), - (SAC, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=0)), - (TD3, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=ROLLOUT_STEPS)), - (SAC, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=ROLLOUT_STEPS)) -] - - -@pytest.mark.parametrize("model_class, model_kwargs", MODEL_LIST) -def test_buffer_actions_scaling(model_class, model_kwargs): - """ - Test if actions are scaled to tanh co-domain before being put in a buffer - for algorithms that use tanh-squashing, i.e., DDPG, TD3, SAC - - :param model_class: (BaseRLModel) A RL Model - :param model_kwargs: (dict) Dictionary containing named arguments to the given algorithm - """ - - # check random and inferred actions as they possibly have different flows - for random_coeff in [0.0, 1.0]: - - env = IdentityEnvBox(-2000, 1000) - - model = model_class("MlpPolicy", env, seed=1, random_exploration=random_coeff, **model_kwargs) - model.learn(total_timesteps=ROLLOUT_STEPS) - - assert hasattr(model, 'replay_buffer') - - buffer = model.replay_buffer - - assert buffer.can_sample(ROLLOUT_STEPS) - - _, actions, _, _, _ = buffer.sample(ROLLOUT_STEPS) - - assert not np.any(actions > np.ones_like(actions)) - assert not np.any(actions < -np.ones_like(actions)) diff --git a/tests/test_action_space.py b/tests/test_action_space.py deleted file mode 100644 index eefe0dab..00000000 --- a/tests/test_action_space.py +++ 
/dev/null @@ -1,60 +0,0 @@ -import pytest -import numpy as np - -from stable_baselines import A2C, PPO1, PPO2, TRPO -from stable_baselines.common.identity_env import IdentityEnvMultiBinary, IdentityEnvMultiDiscrete -from stable_baselines.common.vec_env import DummyVecEnv -from stable_baselines.common.evaluation import evaluate_policy - -MODEL_LIST = [ - A2C, - PPO1, - PPO2, - TRPO -] - - -@pytest.mark.slow -@pytest.mark.parametrize("model_class", MODEL_LIST) -def test_identity_multidiscrete(model_class): - """ - Test if the algorithm (with a given policy) - can learn an identity transformation (i.e. return observation as an action) - with a multidiscrete action space - - :param model_class: (BaseRLModel) A RL Model - """ - env = DummyVecEnv([lambda: IdentityEnvMultiDiscrete(10)]) - - model = model_class("MlpPolicy", env) - model.learn(total_timesteps=1000) - evaluate_policy(model, env, n_eval_episodes=5) - obs = env.reset() - - assert np.array(model.action_probability(obs)).shape == (2, 1, 10), \ - "Error: action_probability not returning correct shape" - assert np.prod(model.action_probability(obs, actions=env.action_space.sample()).shape) == 1, \ - "Error: not scalar probability" - - -@pytest.mark.slow -@pytest.mark.parametrize("model_class", MODEL_LIST) -def test_identity_multibinary(model_class): - """ - Test if the algorithm (with a given policy) - can learn an identity transformation (i.e. 
return observation as an action) - with a multibinary action space - - :param model_class: (BaseRLModel) A RL Model - """ - env = DummyVecEnv([lambda: IdentityEnvMultiBinary(10)]) - - model = model_class("MlpPolicy", env) - model.learn(total_timesteps=1000) - evaluate_policy(model, env, n_eval_episodes=5) - obs = env.reset() - - assert model.action_probability(obs).shape == (1, 10), \ - "Error: action_probability not returning correct shape" - assert np.prod(model.action_probability(obs, actions=env.action_space.sample()).shape) == 1, \ - "Error: not scalar probability" diff --git a/tests/test_atari.py b/tests/test_atari.py deleted file mode 100644 index fa1b11a1..00000000 --- a/tests/test_atari.py +++ /dev/null @@ -1,99 +0,0 @@ -import pytest - -from stable_baselines import bench, logger -from stable_baselines.deepq import DQN, wrap_atari_dqn, CnnPolicy -from stable_baselines.common import set_global_seeds -from stable_baselines.common.atari_wrappers import make_atari -import stable_baselines.a2c.run_atari as a2c_atari -import stable_baselines.acer.run_atari as acer_atari -import stable_baselines.acktr.run_atari as acktr_atari -import stable_baselines.ppo1.run_atari as ppo1_atari -import stable_baselines.ppo2.run_atari as ppo2_atari -import stable_baselines.trpo_mpi.run_atari as trpo_atari - - -ENV_ID = 'BreakoutNoFrameskip-v4' -SEED = 3 -NUM_TIMESTEPS = 300 -NUM_CPU = 2 - - -@pytest.mark.slow -@pytest.mark.parametrize("policy", ['cnn', 'lstm', 'lnlstm']) -def test_a2c(policy): - """ - test A2C on atari - - :param policy: (str) the policy to test for A2C - """ - a2c_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED, - policy=policy, lr_schedule='constant', num_env=NUM_CPU) - - -@pytest.mark.slow -@pytest.mark.parametrize("policy", ['cnn', 'lstm']) -def test_acer(policy): - """ - test ACER on atari - - :param policy: (str) the policy to test for ACER - """ - acer_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED, - policy=policy, 
lr_schedule='constant', num_cpu=NUM_CPU) - - -@pytest.mark.slow -def test_acktr(): - """ - test ACKTR on atari - """ - acktr_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED, num_cpu=NUM_CPU) - - -@pytest.mark.slow -def test_deepq(): - """ - test DeepQ on atari - """ - logger.configure() - set_global_seeds(SEED) - env = make_atari(ENV_ID) - env = bench.Monitor(env, logger.get_dir()) - env = wrap_atari_dqn(env) - - model = DQN(env=env, policy=CnnPolicy, learning_rate=1e-4, buffer_size=10000, exploration_fraction=0.1, - exploration_final_eps=0.01, train_freq=4, learning_starts=100, target_network_update_freq=100, - gamma=0.99, prioritized_replay=True, prioritized_replay_alpha=0.6) - model.learn(total_timesteps=NUM_TIMESTEPS) - - env.close() - del model, env - - -@pytest.mark.slow -def test_ppo1(): - """ - test PPO1 on atari - """ - ppo1_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED) - - -@pytest.mark.slow -@pytest.mark.parametrize("policy", ['cnn', 'lstm', 'lnlstm', 'mlp']) -def test_ppo2(policy): - """ - test PPO2 on atari - - :param policy: (str) the policy to test for PPO2 - """ - ppo2_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, - seed=SEED, policy=policy, n_envs=NUM_CPU, - nminibatches=NUM_CPU, n_steps=16) - - -@pytest.mark.slow -def test_trpo(): - """ - test TRPO on atari - """ - trpo_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED) diff --git a/tests/test_auto_vec_detection.py b/tests/test_auto_vec_detection.py deleted file mode 100644 index 1796657f..00000000 --- a/tests/test_auto_vec_detection.py +++ /dev/null @@ -1,64 +0,0 @@ -import pytest -import numpy as np - -from stable_baselines import A2C, ACER, ACKTR, DDPG, DQN, PPO1, PPO2, SAC, TRPO, TD3 -from stable_baselines.common.vec_env import DummyVecEnv -from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox, IdentityEnvMultiBinary, \ - IdentityEnvMultiDiscrete -from stable_baselines.common.evaluation import 
evaluate_policy - - -def check_shape(make_env, model_class, shape_1, shape_2): - model = model_class(policy="MlpPolicy", env=DummyVecEnv([make_env])) - - env0 = make_env() - env1 = DummyVecEnv([make_env]) - - for env, expected_shape in [(env0, shape_1), (env1, shape_2)]: - def callback(locals_, _globals): - assert np.array(locals_['action']).shape == expected_shape - evaluate_policy(model, env, n_eval_episodes=5, callback=callback) - - -@pytest.mark.slow -@pytest.mark.parametrize("model_class", [A2C, ACER, ACKTR, DQN, PPO1, PPO2, TRPO]) -def test_identity(model_class): - """ - test the Disrete environment vectorisation detection - - :param model_class: (BaseRLModel) the RL model - """ - check_shape(lambda: IdentityEnv(dim=10), model_class, (), (1,)) - - -@pytest.mark.slow -@pytest.mark.parametrize("model_class", [A2C, DDPG, PPO1, PPO2, SAC, TRPO, TD3]) -def test_identity_box(model_class): - """ - test the Box environment vectorisation detection - - :param model_class: (BaseRLModel) the RL model - """ - check_shape(lambda: IdentityEnvBox(eps=0.5), model_class, (1,), (1, 1)) - - -@pytest.mark.slow -@pytest.mark.parametrize("model_class", [A2C, PPO1, PPO2, TRPO]) -def test_identity_multi_binary(model_class): - """ - test the MultiBinary environment vectorisation detection - - :param model_class: (BaseRLModel) the RL model - """ - check_shape(lambda: IdentityEnvMultiBinary(dim=10), model_class, (10,), (1, 10)) - - -@pytest.mark.slow -@pytest.mark.parametrize("model_class", [A2C, PPO1, PPO2, TRPO]) -def test_identity_multi_discrete(model_class): - """ - test the MultiDiscrete environment vectorisation detection - - :param model_class: (BaseRLModel) the RL model - """ - check_shape(lambda: IdentityEnvMultiDiscrete(dim=10), model_class, (2,), (1, 2)) diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py deleted file mode 100644 index debaacaa..00000000 --- a/tests/test_callbacks.py +++ /dev/null @@ -1,128 +0,0 @@ -import os -import shutil - -import pytest - -from 
stable_baselines import A2C, ACKTR, ACER, DQN, DDPG, PPO1, PPO2, SAC, TD3, TRPO -from stable_baselines.common import make_vec_env -from stable_baselines.common.callbacks import (CallbackList, CheckpointCallback, EvalCallback, - EveryNTimesteps, StopTrainingOnRewardThreshold, BaseCallback) - - -LOG_FOLDER = './logs/callbacks/' - - -class CustomCallback(BaseCallback): - """ - Callback to check that every method was called once at least - """ - def __init__(self): - super(CustomCallback, self).__init__() - self.calls = { - 'training_start': False, - 'rollout_start': False, - 'step': False, - 'rollout_end': False, - 'training_end': False, - } - - def _on_training_start(self): - self.calls['training_start'] = True - - def _on_rollout_start(self): - self.calls['rollout_start'] = True - - def _on_step(self): - self.calls['step'] = True - return True - - def _on_rollout_end(self): - self.calls['rollout_end'] = True - - def _on_training_end(self): - self.calls['training_end'] = True - - def validate(self, allowed_failures): - for allowed_failure in allowed_failures: - self.calls[allowed_failure] = True - assert all(self.calls.values()) - - -@pytest.mark.parametrize("model_class", [A2C, ACER, ACKTR, DQN, DDPG, PPO1, PPO2, SAC, TD3, TRPO]) -def test_callbacks(model_class): - - env_id = 'Pendulum-v0' - if model_class in [ACER, DQN]: - env_id = 'CartPole-v1' - - allowed_failures = [] - # Number of training timesteps is too short - # otherwise, the training would take too long, or would require - # custom parameter per algorithm - if model_class in [PPO1, DQN, TRPO]: - allowed_failures = ['rollout_end'] - - # Create RL model - model = model_class('MlpPolicy', env_id) - - checkpoint_callback = CheckpointCallback(save_freq=500, save_path=LOG_FOLDER) - - # For testing: use the same training env - eval_env = model.get_env() - # Stop training if the performance is good enough - callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1) - - eval_callback = 
EvalCallback(eval_env, callback_on_new_best=callback_on_best, - best_model_save_path=LOG_FOLDER, - log_path=LOG_FOLDER, eval_freq=100) - - # Equivalent to the `checkpoint_callback` - # but here in an event-driven manner - checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=LOG_FOLDER, - name_prefix='event') - event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event) - - callback = CallbackList([checkpoint_callback, eval_callback, event_callback]) - - model.learn(500, callback=callback) - model.learn(200, callback=None) - custom_callback = CustomCallback() - model.learn(200, callback=custom_callback) - # Check that every called were executed - custom_callback.validate(allowed_failures=allowed_failures) - # Transform callback into a callback list automatically - custom_callback = CustomCallback() - model.learn(500, callback=[checkpoint_callback, eval_callback, custom_callback]) - # Check that every called were executed - custom_callback.validate(allowed_failures=allowed_failures) - - # Automatic wrapping, old way of doing callbacks - model.learn(200, callback=lambda _locals, _globals: True) - - # Cleanup - if os.path.exists(LOG_FOLDER): - shutil.rmtree(LOG_FOLDER) - - -def test_recurrent_eval_callback(): - env_id = 'Pendulum-v0' - - # Create envs - env = make_vec_env(env_id, n_envs=4) - eval_env = make_vec_env(env_id, n_envs=1) - - # Create RL model - model = PPO2('MlpLstmPolicy', env) - - # Stop training if the performance is good enough - callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1) - - eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, - best_model_save_path=LOG_FOLDER, - log_path=LOG_FOLDER, eval_freq=100) - - model.learn(300, callback=eval_callback) - - # Cleanup - if os.path.exists(LOG_FOLDER): - shutil.rmtree(LOG_FOLDER) diff --git a/tests/test_common.py b/tests/test_common.py deleted file mode 100644 index 7a8ce446..00000000 --- a/tests/test_common.py +++ /dev/null 
@@ -1,32 +0,0 @@ -from contextlib import contextmanager -import sys - - -def _assert_eq(left, right): - assert left == right, '{} != {}'.format(left, right) - - -def _assert_neq(left, right): - assert left != right, '{} == {}'.format(left, right) - - -@contextmanager -def _maybe_disable_mpi(mpi_disabled): - """A context that can temporarily remove the mpi4py import. - - Useful for testing whether non-MPI algorithms work as intended when - mpi4py isn't installed. - - Args: - disable_mpi (bool): If True, then this context temporarily removes - the mpi4py import from `sys.modules` - """ - if mpi_disabled and "mpi4py" in sys.modules: - temp = sys.modules["mpi4py"] - try: - sys.modules["mpi4py"] = None - yield - finally: - sys.modules["mpi4py"] = temp - else: - yield diff --git a/tests/test_continuous.py b/tests/test_continuous.py deleted file mode 100644 index 307e6adb..00000000 --- a/tests/test_continuous.py +++ /dev/null @@ -1,172 +0,0 @@ -import subprocess -import os - -import gym -import pytest -import numpy as np - -from stable_baselines import A2C, ACKTR, SAC, DDPG, PPO1, PPO2, TRPO, TD3 -# TODO: add support for continuous actions -# from stable_baselines.acer import ACER -from stable_baselines.common.vec_env import DummyVecEnv -from stable_baselines.common.identity_env import IdentityEnvBox -from stable_baselines.ddpg import AdaptiveParamNoiseSpec, NormalActionNoise -from stable_baselines.common.evaluation import evaluate_policy -from tests.test_common import _assert_eq - - -N_EVAL_EPISODES = 20 -NUM_TIMESTEPS = 300 - -MODEL_LIST = [ - A2C, - # ACER, - ACKTR, - DDPG, - PPO1, - PPO2, - SAC, - TD3, - TRPO -] - - -@pytest.mark.slow -@pytest.mark.parametrize("model_class", MODEL_LIST) -def test_model_manipulation(request, model_class): - """ - Test if the algorithm can be loaded and saved without any issues, the environment switching - works and that the action prediction works - - :param model_class: (BaseRLModel) A model - """ - model_fname = None - try: - env = 
DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)]) - - # create and train - model = model_class(policy="MlpPolicy", env=env, seed=0) - model.learn(total_timesteps=NUM_TIMESTEPS) - - env.reset() - observations = np.concatenate([env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0) - selected_actions, _ = model.predict(observations, deterministic=True) - - # saving - model_fname = './test_model_{}.zip'.format(request.node.name) - model.save(model_fname) - - del model, env - - # loading - model = model_class.load(model_fname) - - # check if model still selects the same actions - new_selected_actions, _ = model.predict(observations, deterministic=True) - assert np.allclose(selected_actions, new_selected_actions, 1e-4) - - # changing environment (note: this can be done at loading) - env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)]) - model.set_env(env) - - obs = env.reset() - with pytest.warns(None) as record: - act_prob = model.action_probability(obs) - - if model_class in [DDPG, SAC, TD3]: - # check that only one warning was raised - assert len(record) == 1, "No warning was raised for {}".format(model_class) - assert act_prob is None, "Error: action_probability should be None for {}".format(model_class) - else: - assert act_prob[0].shape == (1, 1) and act_prob[1].shape == (1, 1), \ - "Error: action_probability not returning correct shape" - - # test action probability for given (obs, action) pair - # must return zero and raise a warning or raise an exception if not defined - env = model.get_env() - obs = env.reset() - observations = np.array([obs for _ in range(10)]) - observations = np.squeeze(observations) - observations = observations.reshape((-1, 1)) - actions = np.array([env.action_space.sample() for _ in range(10)]) - - if model_class in [DDPG, SAC, TD3]: - with pytest.raises(ValueError): - model.action_probability(observations, actions=actions) - else: - actions_probas = model.action_probability(observations, actions=actions) - assert 
actions_probas.shape == (len(actions), 1), actions_probas.shape - assert np.all(actions_probas >= 0), actions_probas - actions_logprobas = model.action_probability(observations, actions=actions, logp=True) - assert np.allclose(actions_probas, np.exp(actions_logprobas)), (actions_probas, actions_logprobas) - - # learn post loading - model.learn(total_timesteps=100) - - # predict new values - evaluate_policy(model, env, n_eval_episodes=N_EVAL_EPISODES) - - # Free memory - del model, env - - finally: - if model_fname is not None and os.path.exists(model_fname): - os.remove(model_fname) - - -def test_ddpg(): - args = ['--env-id', 'Pendulum-v0', '--num-timesteps', 300, '--noise-type', 'ou_0.01'] - args = list(map(str, args)) - return_code = subprocess.call(['python', '-m', 'stable_baselines.ddpg.main'] + args) - _assert_eq(return_code, 0) - - -def test_ddpg_eval_env(): - """ - Additional test to check that everything is working when passing - an eval env. - """ - eval_env = gym.make("Pendulum-v0") - model = DDPG("MlpPolicy", "Pendulum-v0", nb_rollout_steps=5, - nb_train_steps=2, nb_eval_steps=10, - eval_env=eval_env, verbose=0) - model.learn(NUM_TIMESTEPS) - - -def test_ddpg_normalization(): - """ - Test that observations and returns normalizations are properly saved and loaded. 
- """ - param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05, desired_action_stddev=0.05) - model = DDPG('MlpPolicy', 'Pendulum-v0', memory_limit=50000, normalize_observations=True, - normalize_returns=True, nb_rollout_steps=128, nb_train_steps=1, - batch_size=64, param_noise=param_noise) - model.learn(NUM_TIMESTEPS) - obs_rms_params = model.sess.run(model.obs_rms_params) - ret_rms_params = model.sess.run(model.ret_rms_params) - model.save('./test_ddpg.zip') - - loaded_model = DDPG.load('./test_ddpg.zip') - obs_rms_params_2 = loaded_model.sess.run(loaded_model.obs_rms_params) - ret_rms_params_2 = loaded_model.sess.run(loaded_model.ret_rms_params) - - for param, param_loaded in zip(obs_rms_params + ret_rms_params, - obs_rms_params_2 + ret_rms_params_2): - assert np.allclose(param, param_loaded) - - del model, loaded_model - - if os.path.exists("./test_ddpg.zip"): - os.remove("./test_ddpg.zip") - - -def test_ddpg_popart(): - """ - Test DDPG with pop-art normalization - """ - n_actions = 1 - action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)) - model = DDPG('MlpPolicy', 'Pendulum-v0', memory_limit=50000, normalize_observations=True, - normalize_returns=True, nb_rollout_steps=128, nb_train_steps=1, - batch_size=64, action_noise=action_noise, enable_popart=True) - model.learn(NUM_TIMESTEPS) diff --git a/tests/test_custom_policy.py b/tests/test_custom_policy.py deleted file mode 100644 index 725d88ff..00000000 --- a/tests/test_custom_policy.py +++ /dev/null @@ -1,147 +0,0 @@ -import os - -import gym -import pytest -import tensorflow as tf - -from stable_baselines import A2C, ACER, ACKTR, DQN, PPO1, PPO2, TRPO, SAC, DDPG -from stable_baselines.common.policies import FeedForwardPolicy -from stable_baselines.common.vec_env import DummyVecEnv -from stable_baselines.deepq.policies import FeedForwardPolicy as DQNPolicy -from stable_baselines.ddpg.policies import FeedForwardPolicy as DDPGPolicy -from stable_baselines.sac.policies 
import FeedForwardPolicy as SACPolicy - -N_TRIALS = 100 - - -class CustomCommonPolicy(FeedForwardPolicy): - def __init__(self, *args, **kwargs): - # Default value - if 'net_arch' not in kwargs: - kwargs['net_arch'] = [8, dict(vf=[8, 8], pi=[8, 8])] - super(CustomCommonPolicy, self).__init__(*args, **kwargs, - feature_extraction="mlp") - - -class CustomDQNPolicy(DQNPolicy): - def __init__(self, *args, **kwargs): - # Default value - if 'layers' not in kwargs: - kwargs['layers'] = [8, 8] - super(CustomDQNPolicy, self).__init__(*args, **kwargs, - feature_extraction="mlp") - - -class CustomDDPGPolicy(DDPGPolicy): - def __init__(self, *args, **kwargs): - # Default value - if 'layers' not in kwargs: - kwargs['layers'] = [8, 8] - super(CustomDDPGPolicy, self).__init__(*args, **kwargs, - feature_extraction="mlp") - - -class CustomSACPolicy(SACPolicy): - def __init__(self, *args, **kwargs): - # Default value - if 'layers' not in kwargs: - kwargs['layers'] = [8, 8] - super(CustomSACPolicy, self).__init__(*args, **kwargs, - feature_extraction="mlp") - - -# MODEL_CLASS, POLICY_CLASS, POLICY_KWARGS -MODEL_DICT = { - 'a2c': (A2C, CustomCommonPolicy, dict(act_fun=tf.nn.relu, net_arch=[12, dict(vf=[16], pi=[8])])), - 'acer': (ACER, CustomCommonPolicy, dict(act_fun=tf.nn.relu)), - 'acktr': (ACKTR, CustomCommonPolicy, dict(act_fun=tf.nn.relu)), - 'dqn': (DQN, CustomDQNPolicy, dict(layers=[4, 4], dueling=False)), - 'ddpg': (DDPG, CustomDDPGPolicy, dict(layers=[16, 16], layer_norm=False)), - 'ppo1': (PPO1, CustomCommonPolicy, dict(act_fun=tf.nn.relu, net_arch=[8, 4])), - 'ppo2': (PPO2, CustomCommonPolicy, dict(act_fun=tf.nn.relu, net_arch=[4, 4])), - 'sac': (SAC, CustomSACPolicy, dict(layers=[16, 16])), - 'trpo': (TRPO, CustomCommonPolicy, dict(act_fun=tf.nn.relu)), -} - - -@pytest.mark.parametrize("model_name", MODEL_DICT.keys()) -def test_custom_policy(request, model_name): - """ - Test if the algorithm (with a custom policy) can be loaded and saved without any issues. 
- :param model_name: (str) A RL model - """ - - try: - model_class, policy, _ = MODEL_DICT[model_name] - env = 'MountainCarContinuous-v0' if model_name in ['ddpg', 'sac'] else 'CartPole-v1' - - # create and train - model = model_class(policy, env) - model.learn(total_timesteps=100) - - env = model.get_env() - # predict and measure the acc reward - obs = env.reset() - for _ in range(N_TRIALS): - action, _ = model.predict(obs) - # Test action probability method - if model_name not in ['ddpg', 'sac']: - model.action_probability(obs) - obs, _, _, _ = env.step(action) - # saving - model_fname = './test_model_{}.zip'.format(request.node.name) - model.save(model_fname) - del model, env - # loading - _ = model_class.load(model_fname, policy=policy) - - finally: - if os.path.exists(model_fname): - os.remove(model_fname) - - -@pytest.mark.parametrize("model_name", MODEL_DICT.keys()) -def test_custom_policy_kwargs(request, model_name): - """ - Test if the algorithm (with a custom policy) can be loaded and saved without any issues. 
- :param model_name: (str) A RL model - """ - - model_fname = './test_model_{}.zip'.format(request.node.name) - - try: - model_class, policy, policy_kwargs = MODEL_DICT[model_name] - env = 'MountainCarContinuous-v0' if model_name in ['ddpg', 'sac'] else 'CartPole-v1' - - # Should raise an error when a wrong keyword is passed - with pytest.raises(ValueError): - model_class(policy, env, policy_kwargs=dict(this_throws_error='maybe')) - - # create and train - model = model_class(policy, env, policy_kwargs=policy_kwargs) - model.learn(total_timesteps=100) - - model.save(model_fname) - del model - - # loading - - env = DummyVecEnv([lambda: gym.make(env)]) - - # Load with specifying policy_kwargs - model = model_class.load(model_fname, policy=policy, env=env, policy_kwargs=policy_kwargs) - model.learn(total_timesteps=100) - del model - - # Load without specifying policy_kwargs - model = model_class.load(model_fname, policy=policy, env=env) - model.learn(total_timesteps=100) - del model - - # Load with different wrong policy_kwargs - with pytest.raises(ValueError): - _ = model_class.load(model_fname, policy=policy, env=env, policy_kwargs=dict(wrong="kwargs")) - - finally: - if os.path.exists(model_fname): - os.remove(model_fname) diff --git a/tests/test_deepq.py b/tests/test_deepq.py deleted file mode 100644 index 421dc5ad..00000000 --- a/tests/test_deepq.py +++ /dev/null @@ -1,26 +0,0 @@ -from stable_baselines.deepq.experiments.train_cartpole import main as train_cartpole -from stable_baselines.deepq.experiments.enjoy_cartpole import main as enjoy_cartpole -from stable_baselines.deepq.experiments.train_mountaincar import main as train_mountaincar -from stable_baselines.deepq.experiments.enjoy_mountaincar import main as enjoy_mountaincar - - -class DummyObject(object): - """ - Dummy object to create fake Parsed Arguments object - """ - pass - - -args = DummyObject() -args.no_render = True -args.max_timesteps = 200 - - -def test_cartpole(): - train_cartpole(args) - 
enjoy_cartpole(args) - - -def test_mountaincar(): - train_mountaincar(args) - enjoy_mountaincar(args) diff --git a/tests/test_distri.py b/tests/test_distri.py deleted file mode 100644 index d3be3626..00000000 --- a/tests/test_distri.py +++ /dev/null @@ -1,68 +0,0 @@ -import numpy as np -import tensorflow as tf - -import stable_baselines.common.tf_util as tf_util -from stable_baselines.common.distributions import DiagGaussianProbabilityDistributionType,\ - CategoricalProbabilityDistributionType, \ - MultiCategoricalProbabilityDistributionType, BernoulliProbabilityDistributionType - - -@tf_util.in_session -def test_probtypes(): - """ - test probability distribution types - """ - np.random.seed(0) - - pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) - diag_gauss = DiagGaussianProbabilityDistributionType(pdparam_diag_gauss.size // 2) - validate_probtype(diag_gauss, pdparam_diag_gauss) - - pdparam_categorical = np.array([-.2, .3, .5]) - categorical = CategoricalProbabilityDistributionType(pdparam_categorical.size) - validate_probtype(categorical, pdparam_categorical) - - nvec = np.array([1, 2, 3]) - pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1]) - multicategorical = MultiCategoricalProbabilityDistributionType(nvec) - validate_probtype(multicategorical, pdparam_multicategorical) - - pdparam_bernoulli = np.array([-.2, .3, .5]) - bernoulli = BernoulliProbabilityDistributionType(pdparam_bernoulli.size) - validate_probtype(bernoulli, pdparam_bernoulli) - - -def validate_probtype(probtype, pdparam): - """ - validate probability distribution types - - :param probtype: (ProbabilityDistributionType) the type to validate - :param pdparam: ([float]) the flat probabilities to test - """ - number_samples = 100000 - # Check to see if mean negative log likelihood == differential entropy - mval = np.repeat(pdparam[None, :], number_samples, axis=0) - mval_ph = probtype.param_placeholder([number_samples]) - xval_ph = 
probtype.sample_placeholder([number_samples]) - proba_distribution = probtype.proba_distribution_from_flat(mval_ph) - calcloglik = tf_util.function([xval_ph, mval_ph], proba_distribution.logp(xval_ph)) - calcent = tf_util.function([mval_ph], proba_distribution.entropy()) - xval = tf.get_default_session().run(proba_distribution.sample(), feed_dict={mval_ph: mval}) - logliks = calcloglik(xval, mval) - entval_ll = - logliks.mean() - entval_ll_stderr = logliks.std() / np.sqrt(number_samples) - entval = calcent(mval).mean() - assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas - - # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] - mval2_ph = probtype.param_placeholder([number_samples]) - pd2 = probtype.proba_distribution_from_flat(mval2_ph) - tmp = pdparam + np.random.randn(pdparam.size) * 0.1 - mval2 = np.repeat(tmp[None, :], number_samples, axis=0) - calckl = tf_util.function([mval_ph, mval2_ph], proba_distribution.kl(pd2)) - klval = calckl(mval, mval2).mean() - logliks = calcloglik(xval, mval2) - klval_ll = - entval - logliks.mean() - klval_ll_stderr = logliks.std() / np.sqrt(number_samples) - assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas - print('ok on', probtype, pdparam) diff --git a/tests/test_envs.py b/tests/test_envs.py deleted file mode 100644 index d436f180..00000000 --- a/tests/test_envs.py +++ /dev/null @@ -1,149 +0,0 @@ -import pytest -import gym -from gym import spaces -import numpy as np - -from stable_baselines.common.env_checker import check_env -from stable_baselines.common.bit_flipping_env import BitFlippingEnv -from stable_baselines.common.identity_env import (IdentityEnv, IdentityEnvBox, - IdentityEnvMultiBinary, IdentityEnvMultiDiscrete,) - -ENV_CLASSES = [BitFlippingEnv, IdentityEnv, IdentityEnvBox, IdentityEnvMultiBinary, - IdentityEnvMultiDiscrete] - - -@pytest.mark.parametrize("env_id", ['CartPole-v0', 'Pendulum-v0', 'BreakoutNoFrameskip-v4']) -def test_env(env_id): - """ - Check that 
environmnent integrated in Gym pass the test. - - :param env_id: (str) - """ - env = gym.make(env_id) - with pytest.warns(None) as record: - check_env(env) - - # Pendulum-v0 will produce a warning because the action space is - # in [-2, 2] and not [-1, 1] - if env_id == 'Pendulum-v0': - assert len(record) == 1 - else: - # The other environments must pass without warning - assert len(record) == 0 - - -@pytest.mark.parametrize("env_class", ENV_CLASSES) -def test_custom_envs(env_class): - env = env_class() - check_env(env) - - -def test_high_dimension_action_space(): - """ - Test for continuous action space - with more than one action. - """ - env = gym.make('Pendulum-v0') - # Patch the action space - env.action_space = spaces.Box(low=-1, high=1, shape=(20,), dtype=np.float32) - # Patch to avoid error - def patched_step(_action): - return env.observation_space.sample(), 0.0, False, {} - env.step = patched_step - check_env(env) - - -@pytest.mark.parametrize("new_obs_space", [ - # Small image - spaces.Box(low=0, high=255, shape=(32, 32, 3), dtype=np.uint8), - # Range not in [0, 255] - spaces.Box(low=0, high=1, shape=(64, 64, 3), dtype=np.uint8), - # Wrong dtype - spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.float32), - # Not an image, it should be a 1D vector - spaces.Box(low=-1, high=1, shape=(64, 3), dtype=np.float32), - # Tuple space is not supported by SB - spaces.Tuple([spaces.Discrete(5), spaces.Discrete(10)]), - # Dict space is not supported by SB when env is not a GoalEnv - spaces.Dict({"position": spaces.Discrete(5)}), -]) -def test_non_default_spaces(new_obs_space): - env = gym.make('BreakoutNoFrameskip-v4') - env.observation_space = new_obs_space - # Patch methods to avoid errors - env.reset = new_obs_space.sample - - def patched_step(_action): - return new_obs_space.sample(), 0.0, False, {} - - env.step = patched_step - with pytest.warns(UserWarning): - check_env(env) - - -def check_reset_assert_error(env, new_reset_return): - """ - Helper to 
check that the error is caught. - :param env: (gym.Env) - :param new_reset_return: (Any) - """ - - def wrong_reset(): - return new_reset_return - - # Patch the reset method with a wrong one - env.reset = wrong_reset - with pytest.raises(AssertionError): - check_env(env) - - -def test_common_failures_reset(): - """ - Test that common failure cases of the `reset_method` are caught - """ - env = IdentityEnvBox() - # Return an observation that does not match the observation_space - check_reset_assert_error(env, np.ones((3,))) - # The observation is not a numpy array - check_reset_assert_error(env, 1) - - # Return not only the observation - check_reset_assert_error(env, (env.observation_space.sample(), False)) - - -def check_step_assert_error(env, new_step_return=()): - """ - Helper to check that the error is caught. - :param env: (gym.Env) - :param new_step_return: (tuple) - """ - - def wrong_step(_action): - return new_step_return - - # Patch the step method with a wrong one - env.step = wrong_step - with pytest.raises(AssertionError): - check_env(env) - - -def test_common_failures_step(): - """ - Test that common failure cases of the `step` method are caught - """ - env = IdentityEnvBox() - - # Wrong shape for the observation - check_step_assert_error(env, (np.ones((4,)), 1.0, False, {})) - # Obs is not a numpy array - check_step_assert_error(env, (1, 1.0, False, {})) - - # Return a wrong reward - check_step_assert_error(env, (env.observation_space.sample(), np.ones(1), False, {})) - - # Info dict is not returned - check_step_assert_error(env, (env.observation_space.sample(), 0.0, False)) - - # Done is not a boolean - check_step_assert_error(env, (env.observation_space.sample(), 0.0, 3.0, {})) - check_step_assert_error(env, (env.observation_space.sample(), 0.0, 1, {})) diff --git a/tests/test_gail.py b/tests/test_gail.py deleted file mode 100644 index 803cb8b7..00000000 --- a/tests/test_gail.py +++ /dev/null @@ -1,170 +0,0 @@ -import os -import shutil - -import gym 
-import numpy as np -import pytest - -from stable_baselines import (A2C, ACER, ACKTR, GAIL, DDPG, DQN, PPO1, PPO2, - TD3, TRPO, SAC) -from stable_baselines.common.cmd_util import make_atari_env -from stable_baselines.common.vec_env import VecFrameStack, DummyVecEnv -from stable_baselines.common.evaluation import evaluate_policy -from stable_baselines.common.callbacks import CheckpointCallback -from stable_baselines.gail import ExpertDataset, generate_expert_traj - - -EXPERT_PATH_PENDULUM = "stable_baselines/gail/dataset/expert_pendulum.npz" -EXPERT_PATH_DISCRETE = "stable_baselines/gail/dataset/expert_cartpole.npz" - - -@pytest.mark.parametrize("expert_env", [('Pendulum-v0', EXPERT_PATH_PENDULUM, True), - ('CartPole-v1', EXPERT_PATH_DISCRETE, False)]) -def test_gail(tmp_path, expert_env): - env_id, expert_path, load_from_memory = expert_env - env = gym.make(env_id) - - traj_data = None - if load_from_memory: - traj_data = np.load(expert_path) - expert_path = None - dataset = ExpertDataset(traj_data=traj_data, expert_path=expert_path, traj_limitation=10, - sequential_preprocessing=True) - - # Note: train for 1M steps to have a working policy - model = GAIL('MlpPolicy', env, adversary_entcoeff=0.0, lam=0.92, max_kl=0.001, - expert_dataset=dataset, hidden_size_adversary=64, verbose=0) - - model.learn(300) - model.save(str(tmp_path / "GAIL-{}".format(env_id))) - model = model.load(str(tmp_path / "GAIL-{}".format(env_id)), env=env) - model.learn(300) - - evaluate_policy(model, env, n_eval_episodes=5) - del dataset, model - - -@pytest.mark.parametrize("generate_env", [ - (SAC, 'MlpPolicy', 'Pendulum-v0', 1, 10), - (DQN, 'MlpPolicy', 'CartPole-v1', 1, 10), - (A2C, 'MlpLstmPolicy', 'Pendulum-v0', 1, 10), - (A2C, 'MlpLstmPolicy', 'CartPole-v1', 1, 10), - (A2C, 'CnnPolicy', 'BreakoutNoFrameskip-v4', 8, 1), - ]) -def test_generate(tmp_path, generate_env): - model, policy, env_name, n_env, n_episodes = generate_env - - if n_env > 1: - env = make_atari_env(env_name, 
num_env=n_env, seed=0) - model = model(policy, env, verbose=0) - else: - model = model(policy, env_name, verbose=0) - - dataset = generate_expert_traj(model, str(tmp_path / 'expert'), n_timesteps=300, n_episodes=n_episodes, - image_folder=str(tmp_path / 'test_recorded_images')) - - assert set(dataset.keys()).issuperset(['actions', 'obs', 'rewards', 'episode_returns', 'episode_starts']) - assert sum(dataset['episode_starts']) == n_episodes - assert len(dataset['episode_returns']) == n_episodes - n_timesteps = len(dataset['episode_starts']) - for key, val in dataset.items(): - if key != 'episode_returns': - assert val.shape[0] == n_timesteps, "inconsistent number of timesteps at '{}'".format(key) - - dataset_loaded = np.load(str(tmp_path / 'expert.npz'), allow_pickle=True) - assert dataset.keys() == dataset_loaded.keys() - for key in dataset.keys(): - assert (dataset[key] == dataset_loaded[key]).all(), "different data at '{}'".format(key) - # Cleanup folder - if os.path.isdir(str(tmp_path / 'test_recorded_images')): - shutil.rmtree(str(tmp_path / 'test_recorded_images')) - - -def test_generate_callable(tmp_path): - """ - Test generating expert trajectories with a callable. - """ - env = gym.make("CartPole-v1") - # Here the expert is a random agent - def dummy_expert(_obs): - return env.action_space.sample() - generate_expert_traj(dummy_expert, tmp_path / 'dummy_expert_cartpole', env, n_timesteps=0, n_episodes=10) - -def test_pretrain_twice(tmp_path): - """ - Test pretraining twice in the same execution. 
- """ - dataset = ExpertDataset(expert_path=EXPERT_PATH_PENDULUM, traj_limitation=10, - sequential_preprocessing=True, verbose=0) - model = PPO2("MlpPolicy", "Pendulum-v0") - model.pretrain(dataset, n_epochs=5) - model.pretrain(dataset, n_epochs=5) - del dataset, model - -@pytest.mark.xfail(reason="Not Enough Memory", strict=False) -def test_pretrain_images(tmp_path): - env = make_atari_env("PongNoFrameskip-v4", num_env=1, seed=0) - env = VecFrameStack(env, n_stack=3) - model = PPO2('CnnPolicy', env) - generate_expert_traj(model, str(tmp_path / 'expert_pong'), n_timesteps=0, n_episodes=1, - image_folder=str(tmp_path / 'pretrain_recorded_images')) - - expert_path = str(tmp_path / 'expert_pong.npz') - dataset = ExpertDataset(expert_path=expert_path, traj_limitation=1, batch_size=32, - sequential_preprocessing=True) - model.pretrain(dataset, n_epochs=2) - - shutil.rmtree(str(tmp_path / 'pretrain_recorded_images')) - env.close() - del dataset, model, env - - -def test_gail_callback(tmp_path): - dataset = ExpertDataset(expert_path=EXPERT_PATH_PENDULUM, traj_limitation=10, - sequential_preprocessing=True, verbose=0) - model = GAIL("MlpPolicy", "Pendulum-v0", dataset) - checkpoint_callback = CheckpointCallback(save_freq=150, save_path=str(tmp_path / 'logs/gail/'), name_prefix='gail') - model.learn(total_timesteps=301, callback=checkpoint_callback) - shutil.rmtree(str(tmp_path / 'logs/gail/')) - del dataset, model - - -@pytest.mark.parametrize("model_class", [A2C, ACKTR, GAIL, DDPG, PPO1, PPO2, SAC, TD3, TRPO]) -def test_behavior_cloning_box(tmp_path, model_class): - """ - Behavior cloning with continuous actions. 
- """ - dataset = ExpertDataset(expert_path=EXPERT_PATH_PENDULUM, traj_limitation=10, - sequential_preprocessing=True, verbose=0) - model = model_class("MlpPolicy", "Pendulum-v0") - model.pretrain(dataset, n_epochs=5) - model.save(str(tmp_path / "test-pretrain")) - del dataset, model - - -@pytest.mark.parametrize("model_class", [A2C, ACER, ACKTR, DQN, GAIL, PPO1, PPO2, TRPO]) -def test_behavior_cloning_discrete(tmp_path, model_class): - dataset = ExpertDataset(expert_path=EXPERT_PATH_DISCRETE, traj_limitation=10, - sequential_preprocessing=True, verbose=0) - model = model_class("MlpPolicy", "CartPole-v1") - model.pretrain(dataset, n_epochs=5) - model.save(str(tmp_path / "test-pretrain")) - del dataset, model - - -def test_dataset_param_validation(): - with pytest.raises(ValueError): - ExpertDataset() - - traj_data = np.load(EXPERT_PATH_PENDULUM) - with pytest.raises(ValueError): - ExpertDataset(traj_data=traj_data, expert_path=EXPERT_PATH_PENDULUM) - - -def test_generate_vec_env_non_image_observation(): - env = DummyVecEnv([lambda: gym.make('CartPole-v1')] * 2) - - model = PPO2('MlpPolicy', env) - model.learn(total_timesteps=300) - - generate_expert_traj(model, save_path='.', n_timesteps=0, n_episodes=5) diff --git a/tests/test_her.py b/tests/test_her.py deleted file mode 100644 index 127a799f..00000000 --- a/tests/test_her.py +++ /dev/null @@ -1,119 +0,0 @@ -import os - -import pytest - -from stable_baselines import HER, DQN, SAC, DDPG, TD3 -from stable_baselines.her import GoalSelectionStrategy, HERGoalEnvWrapper -from stable_baselines.her.replay_buffer import KEY_TO_GOAL_STRATEGY -from stable_baselines.common.bit_flipping_env import BitFlippingEnv -from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize - -N_BITS = 10 - - -def model_predict(model, env, n_steps, additional_check=None): - """ - Test helper - :param model: (rl model) - :param env: (gym.Env) - :param n_steps: (int) - :param additional_check: (callable) - """ - obs = env.reset() - for _ 
in range(n_steps): - action, _ = model.predict(obs) - obs, reward, done, _ = env.step(action) - - if additional_check is not None: - additional_check(obs, action, reward, done) - - if done: - obs = env.reset() - - -@pytest.mark.parametrize('goal_selection_strategy', list(GoalSelectionStrategy)) -@pytest.mark.parametrize('model_class', [DQN, SAC, DDPG, TD3]) -@pytest.mark.parametrize('discrete_obs_space', [False, True]) -def test_her(model_class, goal_selection_strategy, discrete_obs_space): - env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3], - max_steps=N_BITS, discrete_obs_space=discrete_obs_space) - - # Take random actions 10% of the time - kwargs = {'random_exploration': 0.1} if model_class in [DDPG, SAC, TD3] else {} - model = HER('MlpPolicy', env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy, - verbose=0, **kwargs) - model.learn(150) - - -@pytest.mark.parametrize('model_class', [DDPG, SAC, DQN, TD3]) -def test_long_episode(model_class): - """ - Check that the model does not break when the replay buffer is still empty - after the first rollout (because the episode is not over). 
- """ - # n_bits > nb_rollout_steps - n_bits = 10 - env = BitFlippingEnv(n_bits, continuous=model_class in [DDPG, SAC, TD3], - max_steps=n_bits) - kwargs = {} - if model_class == DDPG: - kwargs['nb_rollout_steps'] = 9 # < n_bits - elif model_class in [DQN, SAC, TD3]: - kwargs['batch_size'] = 8 # < n_bits - kwargs['learning_starts'] = 0 - - model = HER('MlpPolicy', env, model_class, n_sampled_goal=4, goal_selection_strategy='future', - verbose=0, **kwargs) - model.learn(100) - - -@pytest.mark.parametrize('goal_selection_strategy', [list(KEY_TO_GOAL_STRATEGY.keys())[0]]) -@pytest.mark.parametrize('model_class', [DQN, SAC, DDPG, TD3]) -def test_model_manipulation(model_class, goal_selection_strategy): - env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS) - env = DummyVecEnv([lambda: env]) - - model = HER('MlpPolicy', env, model_class, n_sampled_goal=3, goal_selection_strategy=goal_selection_strategy, - verbose=0) - model.learn(150) - - model_predict(model, env, n_steps=20, additional_check=None) - - model.save('./test_her.zip') - del model - - # NOTE: HER does not support VecEnvWrapper yet - with pytest.raises(AssertionError): - model = HER.load('./test_her.zip', env=VecNormalize(env)) - - model = HER.load('./test_her.zip') - - # Check that the model raises an error when the env - # is not wrapped (or no env passed to the model) - with pytest.raises(ValueError): - model.predict(env.reset()) - - env_ = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS) - env_ = HERGoalEnvWrapper(env_) - - model_predict(model, env_, n_steps=20, additional_check=None) - - model.set_env(env) - model.learn(150) - - model_predict(model, env_, n_steps=20, additional_check=None) - - assert model.n_sampled_goal == 3 - - del model - - env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS) - model = HER.load('./test_her', env=env) - model.learn(150) - - model_predict(model, env_, 
n_steps=20, additional_check=None) - - assert model.n_sampled_goal == 3 - - if os.path.isfile('./test_her.zip'): - os.remove('./test_her.zip') diff --git a/tests/test_identity.py b/tests/test_identity.py deleted file mode 100644 index 0ee44cc3..00000000 --- a/tests/test_identity.py +++ /dev/null @@ -1,136 +0,0 @@ -import pytest -import numpy as np - -from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, SAC, PPO1, PPO2, TD3, TRPO -from stable_baselines.ddpg import NormalActionNoise -from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox -from stable_baselines.common.vec_env import DummyVecEnv -from stable_baselines.common.evaluation import evaluate_policy - - -# Hyperparameters for learning identity for each RL model -LEARN_FUNC_DICT = { - "a2c": lambda e: A2C( - policy="MlpPolicy", - learning_rate=1e-3, - n_steps=4, - gamma=0.4, - ent_coef=0.0, - env=e, - seed=0, - ).learn(total_timesteps=4000), - "acer": lambda e: ACER( - policy="MlpPolicy", - env=e, - seed=0, - n_steps=4, - replay_ratio=1, - ent_coef=0.0, - ).learn(total_timesteps=4000), - "acktr": lambda e: ACKTR( - policy="MlpPolicy", env=e, seed=0, learning_rate=5e-4, ent_coef=0.0, n_steps=4 - ).learn(total_timesteps=4000), - "dqn": lambda e: DQN( - policy="MlpPolicy", - batch_size=32, - gamma=0.1, - learning_starts=0, - exploration_final_eps=0.05, - exploration_fraction=0.1, - env=e, - seed=0, - ).learn(total_timesteps=4000), - "ppo1": lambda e: PPO1( - policy="MlpPolicy", - env=e, - seed=0, - lam=0.5, - entcoeff=0.0, - optim_batchsize=16, - gamma=0.4, - optim_stepsize=1e-3, - ).learn(total_timesteps=3000), - "ppo2": lambda e: PPO2( - policy="MlpPolicy", - env=e, - seed=0, - learning_rate=1.5e-3, - lam=0.8, - ent_coef=0.0, - gamma=0.4, - ).learn(total_timesteps=3000), - "trpo": lambda e: TRPO( - policy="MlpPolicy", - env=e, - gamma=0.4, - seed=0, - max_kl=0.05, - lam=0.7, - timesteps_per_batch=256, - ).learn(total_timesteps=4000), -} - - -@pytest.mark.slow -@pytest.mark.parametrize( 
- "model_name", ["a2c", "acer", "acktr", "dqn", "ppo1", "ppo2", "trpo"] -) -def test_identity_discrete(model_name): - """ - Test if the algorithm (with a given policy) - can learn an identity transformation (i.e. return observation as an action) - - :param model_name: (str) Name of the RL model - """ - env = DummyVecEnv([lambda: IdentityEnv(10)]) - - model = LEARN_FUNC_DICT[model_name](env) - evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90) - - obs = env.reset() - assert model.action_probability(obs).shape == ( - 1, - 10, - ), "Error: action_probability not returning correct shape" - action = env.action_space.sample() - action_prob = model.action_probability(obs, actions=action) - assert np.prod(action_prob.shape) == 1, "Error: not scalar probability" - action_logprob = model.action_probability(obs, actions=action, logp=True) - assert np.allclose(action_prob, np.exp(action_logprob)), ( - action_prob, - action_logprob, - ) - - # Free memory - del model, env - - -@pytest.mark.slow -@pytest.mark.parametrize("model_class", [DDPG, TD3, SAC]) -def test_identity_continuous(model_class): - """ - Test if the algorithm (with a given policy) - can learn an identity transformation (i.e. 
return observation as an action) - """ - env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)]) - - n_steps = {SAC: 700, TD3: 500, DDPG: 2000}[model_class] - - kwargs = dict(seed=0, gamma=0.95, buffer_size=1e5) - if model_class in [DDPG, TD3]: - n_actions = 1 - action_noise = NormalActionNoise( - mean=np.zeros(n_actions), sigma=0.05 * np.ones(n_actions) - ) - kwargs["action_noise"] = action_noise - - if model_class == DDPG: - kwargs["actor_lr"] = 1e-3 - kwargs["batch_size"] = 100 - - model = model_class("MlpPolicy", env, **kwargs) - model.learn(total_timesteps=n_steps) - - evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90) - # Free memory - del model, env diff --git a/tests/test_load_parameters.py b/tests/test_load_parameters.py deleted file mode 100644 index f1f5c7b1..00000000 --- a/tests/test_load_parameters.py +++ /dev/null @@ -1,136 +0,0 @@ -import os -from io import BytesIO - -import pytest -import numpy as np - -from stable_baselines import A2C, ACER, ACKTR, DQN, PPO1, PPO2, TRPO -from stable_baselines.common.identity_env import IdentityEnv -from stable_baselines.common.vec_env import DummyVecEnv - -MODEL_LIST = [ - A2C, - ACER, - ACKTR, - DQN, - PPO1, - PPO2, - TRPO, -] - - -@pytest.mark.parametrize("model_class", MODEL_LIST) -def test_load_parameters(request, model_class): - """ - Test if ``load_parameters`` loads given parameters correctly (the model actually changes) - and that the backwards compatability with a list of params works - - :param model_class: (BaseRLModel) A RL model - """ - env = DummyVecEnv([lambda: IdentityEnv(10)]) - - # create model - model = model_class(policy="MlpPolicy", env=env) - - # test action probability for given (obs, action) pair - env = model.get_env() - obs = env.reset() - observations = np.array([obs for _ in range(10)]) - observations = np.squeeze(observations) - - actions = np.array([env.action_space.sample() for _ in range(10)]) - original_actions_probas = model.action_probability(observations, 
actions=actions) - - # Get dictionary of current parameters - params = model.get_parameters() - # Modify all parameters to be random values - random_params = dict((param_name, np.random.random(size=param.shape)) for param_name, param in params.items()) - # Update model parameters with the new zeroed values - model.load_parameters(random_params) - # Get new action probas - new_actions_probas = model.action_probability(observations, actions=actions) - - # Check that at least some action probabilities are different now - assert not np.any(np.isclose(original_actions_probas, new_actions_probas)), "Action probabilities did not change " \ - "after changing model parameters." - # Also check that new parameters are there (they should be random_params) - new_params = model.get_parameters() - comparisons = [np.all(np.isclose(new_params[key], random_params[key])) for key in random_params.keys()] - assert all(comparisons), "Parameters of model are not the same as provided ones." - - # Now test the backwards compatibility with params being a list instead of a dict. - # Get the ordering of parameters. - tf_param_list = model.get_parameter_list() - # Make random parameters negative to make sure the results should be different from - # previous random values - random_param_list = [-np.random.random(size=tf_param.shape) for tf_param in tf_param_list] - model.load_parameters(random_param_list) - - # Compare results against the previous load - new_actions_probas_list = model.action_probability(observations, actions=actions) - assert not np.any(np.isclose(new_actions_probas, new_actions_probas_list)), "Action probabilities did not " \ - "change after changing model " \ - "parameters (list)." - - # Test file/file-like object loading for load_parameters. 
- # Save whatever is stored in model now, assign random parameters, - # load parameters from file with load_parameters and check if original probabilities - # are restored - original_actions_probas = model.action_probability(observations, actions=actions) - model_fname = './test_model_{}.zip'.format(request.node.name) - - try: - # Save model to a file and file-like buffer - # (partly copy/paste from test_save) - model.save(model_fname) - b_io = BytesIO() - model.save(b_io) - model_bytes = b_io.getvalue() - b_io.close() - - random_params = dict((param_name, np.random.random(size=param.shape)) for param_name, param in params.items()) - model.load_parameters(random_params) - # Previous tests confirm that load_parameters works, - # so just right into testing loading from file - model.load_parameters(model_fname) - new_actions_probas = model.action_probability(observations, actions=actions) - assert np.all(np.isclose(original_actions_probas, new_actions_probas)), "Action probabilities changed " \ - "after load_parameters from a file." - # Reset with random parameters again - model.load_parameters(random_params) - # Now load from file-like (copy/paste from test_save) - b_io = BytesIO(model_bytes) - model.load_parameters(b_io) - b_io.close() - new_actions_probas = model.action_probability(observations, actions=actions) - assert np.all(np.isclose(original_actions_probas, new_actions_probas)), "Action probabilities changed after" \ - "load_parameters from a file-like." 
- finally: - if os.path.exists(model_fname): - os.remove(model_fname) - - # Test `exact_match` functionality of load_parameters - original_actions_probas = model.action_probability(observations, actions=actions) - # Create dictionary with one variable name missing - truncated_random_params = dict((param_name, np.random.random(size=param.shape)) - for param_name, param in params.items()) - # Remove some element - _ = truncated_random_params.pop(list(truncated_random_params.keys())[0]) - # With exact_match=True, this should be an expection - with pytest.raises(RuntimeError): - model.load_parameters(truncated_random_params, exact_match=True) - # Make sure we did not update model regardless - new_actions_probas = model.action_probability(observations, actions=actions) - assert np.all(np.isclose(original_actions_probas, new_actions_probas)), "Action probabilities changed " \ - "after load_parameters raised " \ - "RunTimeError (exact_match=True)." - - # With False, this should be fine - model.load_parameters(truncated_random_params, exact_match=False) - # Also check that results changed, again - new_actions_probas = model.action_probability(observations, actions=actions) - assert not np.any(np.isclose(original_actions_probas, new_actions_probas)), "Action probabilities did not " \ - "change after changing model " \ - "parameters (exact_match=False)." 
- - del model, env diff --git a/tests/test_log_prob.py b/tests/test_log_prob.py deleted file mode 100644 index 3fbaf90b..00000000 --- a/tests/test_log_prob.py +++ /dev/null @@ -1,22 +0,0 @@ -import pytest -import numpy as np - -from stable_baselines import A2C, ACKTR, PPO1, PPO2, TRPO -from stable_baselines.common.identity_env import IdentityEnvBox - - -class Helper: - @staticmethod - def proba_vals(obs, state, mask): - # Return fixed mean, std - return np.array([-0.4]), np.array([[0.1]]) - - -@pytest.mark.parametrize("model_class", [A2C, ACKTR, PPO1, PPO2, TRPO]) -def test_log_prob_calcuation(model_class): - model = model_class("MlpPolicy", IdentityEnvBox()) - # Fixed mean/std - model.proba_step = Helper.proba_vals - # Check that the log probability is the one expected for the given mean/std - logprob = model.action_probability(observation=np.array([[0.5], [0.5]]), actions=0.2, logp=True) - assert np.allclose(logprob, np.array([-16.616353440210627])), "Calculation failed for {}".format(model_class) diff --git a/tests/test_logger.py b/tests/test_logger.py deleted file mode 100644 index 3b9ab569..00000000 --- a/tests/test_logger.py +++ /dev/null @@ -1,52 +0,0 @@ -import pytest -import numpy as np - -from stable_baselines.logger import make_output_format, read_tb, read_csv, read_json, _demo -from .test_common import _maybe_disable_mpi - - -KEY_VALUES = { - "test": 1, - "b": -3.14, - "8": 9.9, - "l": [1, 2], - "a": np.array([1, 2, 3]), - "f": np.array(1), - "g": np.array([[[1]]]), -} -LOG_DIR = '/tmp/openai_baselines/' - - -def test_main(): - """ - Dry-run python -m stable_baselines.logger - """ - _demo() - - -@pytest.mark.parametrize('_format', ['tensorboard', 'stdout', 'log', 'json', 'csv']) -@pytest.mark.parametrize('mpi_disabled', [False, True]) -def test_make_output(_format, mpi_disabled): - """ - test make output - - :param _format: (str) output format - """ - with _maybe_disable_mpi(mpi_disabled): - writer = make_output_format(_format, LOG_DIR) - 
writer.writekvs(KEY_VALUES) - if _format == 'tensorboard': - read_tb(LOG_DIR) - elif _format == "csv": - read_csv(LOG_DIR + 'progress.csv') - elif _format == 'json': - read_json(LOG_DIR + 'progress.json') - writer.close() - - -def test_make_output_fail(): - """ - test value error on logger - """ - with pytest.raises(ValueError): - make_output_format('dummy_format', LOG_DIR) diff --git a/tests/test_lstm_policy.py b/tests/test_lstm_policy.py deleted file mode 100644 index 1f9e15f3..00000000 --- a/tests/test_lstm_policy.py +++ /dev/null @@ -1,132 +0,0 @@ -import os - -from gym.envs.classic_control import CartPoleEnv -from gym.wrappers.time_limit import TimeLimit -from gym import spaces -import numpy as np -import pytest - -from stable_baselines import A2C, ACER, ACKTR, PPO2, bench -from stable_baselines.common.policies import MlpLstmPolicy, LstmPolicy -from stable_baselines.common.vec_env import SubprocVecEnv -from stable_baselines.common.vec_env.vec_normalize import VecNormalize -from stable_baselines.common.math_util import safe_mean -from stable_baselines.common.evaluation import evaluate_policy - - -class CustomLSTMPolicy1(LstmPolicy): - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=128, reuse=False, **_kwargs): - super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, net_arch=[8, 'lstm', 8], - layer_norm=False, feature_extraction="mlp", **_kwargs) - - -class CustomLSTMPolicy2(LstmPolicy): - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=64, reuse=False, **_kwargs): - super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, - net_arch=['lstm', 8], layer_norm=True, feature_extraction="mlp", **_kwargs) - - -class CustomLSTMPolicy3(LstmPolicy): - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=64, reuse=False, **_kwargs): - super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, - net_arch=[8, 
'lstm'], layer_norm=False, feature_extraction="mlp", **_kwargs) - - -class CustomLSTMPolicy4(LstmPolicy): - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=64, reuse=False, **_kwargs): - super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, - net_arch=[8, 'lstm', dict(vf=[5, 10], pi=[10])], - layer_norm=True, feature_extraction="mlp", **_kwargs) - - -class CartPoleNoVelEnv(CartPoleEnv): - """Variant of CartPoleEnv with velocity information removed. This task requires memory to solve.""" - - def __init__(self): - super(CartPoleNoVelEnv, self).__init__() - high = np.array([ - self.x_threshold * 2, - self.theta_threshold_radians * 2, - ]) - self.observation_space = spaces.Box(-high, high, dtype=np.float32) - - @staticmethod - def _pos_obs(full_obs): - xpos, _xvel, thetapos, _thetavel = full_obs - return xpos, thetapos - - def reset(self): - full_obs = super().reset() - return CartPoleNoVelEnv._pos_obs(full_obs) - - def step(self, action): - full_obs, rew, done, info = super().step(action) - return CartPoleNoVelEnv._pos_obs(full_obs), rew, done, info - - -N_TRIALS = 100 -NUM_ENVS = 16 -NUM_EPISODES_FOR_SCORE = 10 - -MODELS = [A2C, ACER, ACKTR, PPO2] -LSTM_POLICIES = [MlpLstmPolicy, CustomLSTMPolicy1, CustomLSTMPolicy2, CustomLSTMPolicy3, CustomLSTMPolicy4] - - -@pytest.mark.parametrize("model_class", MODELS) -@pytest.mark.parametrize("policy", LSTM_POLICIES) -def test_lstm_policy(request, model_class, policy): - model_fname = './test_model_{}.zip'.format(request.node.name) - - try: - # create and train - if model_class == PPO2: - model = model_class(policy, 'CartPole-v1', nminibatches=1, n_steps=4) - else: - model = model_class(policy, 'CartPole-v1', n_steps=4) - model.learn(total_timesteps=15) - - env = model.get_env() - evaluate_policy(model, env, n_eval_episodes=10) - # saving - model.save(model_fname) - del model, env - # loading - _ = model_class.load(model_fname, policy=policy) - - finally: - if 
os.path.exists(model_fname): - os.remove(model_fname) - - -@pytest.mark.expensive -def test_lstm_train(): - """Test that LSTM models are able to achieve >=150 (out of 500) reward on CartPoleNoVelEnv. - - This environment requires memory to perform well in.""" - def make_env(i): - env = CartPoleNoVelEnv() - env = TimeLimit(env, max_episode_steps=500) - env = bench.Monitor(env, None, allow_early_resets=True) - env.seed(i) - return env - - env = SubprocVecEnv([lambda: make_env(i) for i in range(NUM_ENVS)]) - env = VecNormalize(env) - model = PPO2(MlpLstmPolicy, env, n_steps=128, nminibatches=NUM_ENVS, lam=0.95, gamma=0.99, - noptepochs=10, ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, verbose=1) - - eprewmeans = [] - def reward_callback(local, _): - nonlocal eprewmeans - eprewmeans.append(safe_mean([ep_info['r'] for ep_info in local['ep_info_buf']])) - - model.learn(total_timesteps=100000, callback=reward_callback) - - # Maximum episode reward is 500. - # In CartPole-v1, a non-recurrent policy can easily get >= 450. - # In CartPoleNoVelEnv, a non-recurrent policy doesn't get more than ~50. - # LSTM policies can reach above 400, but it varies a lot between runs; consistently get >=150. - # See PR #244 for more detailed benchmarks. 
- - average_reward = sum(eprewmeans[-NUM_EPISODES_FOR_SCORE:]) / NUM_EPISODES_FOR_SCORE - assert average_reward >= 150, "Mean reward below 150; per-episode rewards {}".format(average_reward) diff --git a/tests/test_math_util.py b/tests/test_math_util.py deleted file mode 100644 index 584ba98d..00000000 --- a/tests/test_math_util.py +++ /dev/null @@ -1,81 +0,0 @@ -import tensorflow as tf -import numpy as np -from gym.spaces.box import Box - -from stable_baselines.common.math_util import discount_with_boundaries, scale_action, unscale_action - - -def test_discount_with_boundaries(): - """ - test the discount_with_boundaries function - """ - gamma = 0.9 - rewards = np.array([1.0, 2.0, 3.0, 4.0], 'float32') - episode_starts = [1.0, 0.0, 0.0, 1.0] - discounted_rewards = discount_with_boundaries(rewards, episode_starts, gamma) - assert np.allclose(discounted_rewards, [1 + gamma * 2 + gamma ** 2 * 3, 2 + gamma * 3, 3, 4]) - return - - -def test_scaling_action(): - """ - test scaling of scalar, 1d and 2d vectors of finite non-NaN real numbers to and from tanh co-domain (per component) - """ - test_ranges = [(-1, 1), (-10, 10), (-10, 5), (-10, 0), (-10, -5), (0, 10), (5, 10)] - - # scalars - for (range_low, range_high) in test_ranges: - check_scaled_actions_from_range(range_low, range_high, scalar=True) - - # 1d vectors: wrapped scalars - for test_range in test_ranges: - check_scaled_actions_from_range(*test_range) - - # 2d vectors: all combinations of ranges above - for (r1_low, r1_high) in test_ranges: - for (r2_low, r2_high) in test_ranges: - check_scaled_actions_from_range(np.array([r1_low, r2_low], dtype=np.float), - np.array([r1_high, r2_high], dtype=np.float)) - - -def check_scaled_actions_from_range(low, high, scalar=False): - """ - helper method which creates dummy action space spanning between respective components of low and high - and then checks scaling to and from tanh co-domain for low, middle and high value from that action space - :param low: (np.ndarray), 
(int) or (float) - :param high: (np.ndarray), (int) or (float) - :param scalar: (bool) Whether consider scalar range or wrap it into 1d vector - """ - - if scalar and (isinstance(low, float) or isinstance(low, int)): - ones = 1. - action_space = Box(low, high, shape=(1,)) - else: - low = np.atleast_1d(low) - high = np.atleast_1d(high) - ones = np.ones_like(low) - action_space = Box(low, high) - - mid = 0.5 * (low + high) - - expected_mapping = [(low, -ones), (mid, 0. * ones), (high, ones)] - - for (not_scaled, scaled) in expected_mapping: - assert np.allclose(scale_action(action_space, not_scaled), scaled) - assert np.allclose(unscale_action(action_space, scaled), not_scaled) - - -def test_batch_shape_invariant_to_scaling(): - """ - test that scaling deals well with batches as tensors and numpy matrices in terms of shape - """ - action_space = Box(np.array([-10., -5., -1.]), np.array([10., 3., 2.])) - - tensor = tf.constant(1., shape=[2, 3]) - matrix = np.ones((2, 3)) - - assert scale_action(action_space, tensor).shape == (2, 3) - assert scale_action(action_space, matrix).shape == (2, 3) - - assert unscale_action(action_space, tensor).shape == (2, 3) - assert unscale_action(action_space, matrix).shape == (2, 3) diff --git a/tests/test_monitor.py b/tests/test_monitor.py deleted file mode 100644 index f69561b4..00000000 --- a/tests/test_monitor.py +++ /dev/null @@ -1,87 +0,0 @@ -import uuid -import json -import os - -import pandas -import gym - -from stable_baselines.bench import Monitor -from stable_baselines.bench.monitor import get_monitor_files, load_results - - -def test_monitor(): - """ - test the monitor wrapper - """ - env = gym.make("CartPole-v1") - env.seed(0) - mon_file = "/tmp/stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()) - menv = Monitor(env, mon_file) - menv.reset() - for _ in range(1000): - _, _, done, _ = menv.step(0) - if done: - menv.reset() - - file_handler = open(mon_file, 'rt') - - firstline = file_handler.readline() - assert 
firstline.startswith('#') - metadata = json.loads(firstline[1:]) - assert metadata['env_id'] == "CartPole-v1" - assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata" - - last_logline = pandas.read_csv(file_handler, index_col=None) - assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline" - file_handler.close() - os.remove(mon_file) - - -def test_monitor_load_results(tmp_path): - """ - test load_results on log files produced by the monitor wrapper - """ - tmp_path = str(tmp_path) - env1 = gym.make("CartPole-v1") - env1.seed(0) - monitor_file1 = os.path.join(tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4())) - monitor_env1 = Monitor(env1, monitor_file1) - - monitor_files = get_monitor_files(tmp_path) - assert len(monitor_files) == 1 - assert monitor_file1 in monitor_files - - monitor_env1.reset() - episode_count1 = 0 - for _ in range(1000): - _, _, done, _ = monitor_env1.step(monitor_env1.action_space.sample()) - if done: - episode_count1 += 1 - monitor_env1.reset() - - results_size1 = len(load_results(os.path.join(tmp_path)).index) - assert results_size1 == episode_count1 - - env2 = gym.make("CartPole-v1") - env2.seed(0) - monitor_file2 = os.path.join(tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4())) - monitor_env2 = Monitor(env2, monitor_file2) - monitor_files = get_monitor_files(tmp_path) - assert len(monitor_files) == 2 - assert monitor_file1 in monitor_files - assert monitor_file2 in monitor_files - - monitor_env2.reset() - episode_count2 = 0 - for _ in range(1000): - _, _, done, _ = monitor_env2.step(monitor_env2.action_space.sample()) - if done: - episode_count2 += 1 - monitor_env2.reset() - - results_size2 = len(load_results(os.path.join(tmp_path)).index) - - assert results_size2 == (results_size1 + episode_count2) - - os.remove(monitor_file1) - os.remove(monitor_file2) diff --git a/tests/test_mpi_adam.py b/tests/test_mpi_adam.py deleted file mode 
100644 index f7484ac7..00000000 --- a/tests/test_mpi_adam.py +++ /dev/null @@ -1,24 +0,0 @@ -import subprocess - -import pytest - -from .test_common import _assert_eq - - -def test_mpi_adam(): - """Test RunningMeanStd object for MPI""" - # Test will be run in CI before pytest is run - pytest.skip() - return_code = subprocess.call(['mpirun', '--allow-run-as-root', '-np', '2', - 'python', '-m', 'stable_baselines.common.mpi_adam']) - _assert_eq(return_code, 0) - - -def test_mpi_adam_ppo1(): - """Running test for ppo1""" - # Test will be run in CI before pytest is run - pytest.skip() - return_code = subprocess.call(['mpirun', '--allow-run-as-root', '-np', '2', - 'python', '-m', - 'stable_baselines.ppo1.experiments.train_cartpole']) - _assert_eq(return_code, 0) diff --git a/tests/test_multiple_learn.py b/tests/test_multiple_learn.py deleted file mode 100644 index e4bed8db..00000000 --- a/tests/test_multiple_learn.py +++ /dev/null @@ -1,61 +0,0 @@ -import pytest - -from stable_baselines import A2C, ACER, ACKTR, PPO2 -from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox -from stable_baselines.common.vec_env import DummyVecEnv - -# TODO: Fix multiple-learn on commented-out models (Issue #619). -MODEL_LIST = [ - A2C, - ACER, - ACKTR, - PPO2, - - # MPI-based models, which use traj_segment_generator instead of Runner. - # - # PPO1, - # TRPO, - - # Off-policy models, which don't use Runner but reset every .learn() anyways. - # - # DDPG, - # SAC, - # TD3, -] - - -@pytest.mark.parametrize("model_class", MODEL_LIST) -def test_model_multiple_learn_no_reset(model_class): - """Check that when we call learn multiple times, we don't unnecessarily - reset the environment. 
- """ - if model_class is ACER: - def make_env(): - return IdentityEnv(ep_length=1e10, dim=2) - else: - def make_env(): - return IdentityEnvBox(ep_length=1e10) - env = make_env() - venv = DummyVecEnv([lambda: env]) - model = model_class(policy="MlpPolicy", env=venv) - _check_reset_count(model, env) - - # Try again following a `set_env`. - env = make_env() - venv = DummyVecEnv([lambda: env]) - assert env.num_resets == 0 - - model.set_env(venv) - _check_reset_count(model, env) - - -def _check_reset_count(model, env: IdentityEnv): - assert env.num_resets == 0 - _prev_runner = None - for _ in range(2): - model.learn(total_timesteps=300) - # Lazy constructor for Runner fires upon the first call to learn. - assert env.num_resets == 1 - if _prev_runner is not None: - assert _prev_runner is model.runner, "Runner shouldn't change" - _prev_runner = model.runner diff --git a/tests/test_no_mpi.py b/tests/test_no_mpi.py deleted file mode 100644 index 09755ff2..00000000 --- a/tests/test_no_mpi.py +++ /dev/null @@ -1,21 +0,0 @@ -import sys - -from .test_common import _maybe_disable_mpi - - -def test_no_mpi_no_crash(): - with _maybe_disable_mpi(True): - # Temporarily delete previously imported stable baselines - old_modules = {} - sb_modules = [name for name in sys.modules.keys() - if name.startswith('stable_baselines')] - for name in sb_modules: - old_modules[name] = sys.modules.pop(name) - - # Re-import (with mpi disabled) - import stable_baselines - del stable_baselines # appease Codacy - - # Restore old version of stable baselines (with MPI imported) - for name, mod in old_modules.items(): - sys.modules[name] = mod diff --git a/tests/test_ppo2.py b/tests/test_ppo2.py deleted file mode 100644 index ced0dc52..00000000 --- a/tests/test_ppo2.py +++ /dev/null @@ -1,47 +0,0 @@ -import os - -import pytest -import gym - -from stable_baselines import PPO2 -from stable_baselines.common import make_vec_env -from stable_baselines.common.vec_env import DummyVecEnv - - 
-@pytest.mark.parametrize("cliprange", [0.2, lambda x: 0.1 * x]) -@pytest.mark.parametrize("cliprange_vf", [None, 0.2, lambda x: 0.3 * x, -1.0]) -def test_clipping(tmp_path, cliprange, cliprange_vf): - """Test the different clipping (policy and vf)""" - model = PPO2( - "MlpPolicy", - "CartPole-v1", - cliprange=cliprange, - cliprange_vf=cliprange_vf, - noptepochs=2, - n_steps=64, - ).learn(100) - save_path = os.path.join(str(tmp_path), "ppo2_clip.zip") - model.save(save_path) - env = model.get_env() - model = PPO2.load(save_path, env=env) - model.learn(100) - - if os.path.exists(save_path): - os.remove(save_path) - - -def test_ppo2_update_n_batch_on_load(tmp_path): - env = make_vec_env("CartPole-v1", n_envs=2) - model = PPO2("MlpPolicy", env, n_steps=10, nminibatches=1) - save_path = os.path.join(str(tmp_path), "ppo2_cartpole.zip") - - model.learn(total_timesteps=100) - model.save(save_path) - - del model - - model = PPO2.load(save_path) - test_env = DummyVecEnv([lambda: gym.make("CartPole-v1")]) - - model.set_env(test_env) - model.learn(total_timesteps=100) diff --git a/tests/test_replay_buffer.py b/tests/test_replay_buffer.py deleted file mode 100644 index 996fcecc..00000000 --- a/tests/test_replay_buffer.py +++ /dev/null @@ -1,72 +0,0 @@ -import numpy as np - -from stable_baselines.common.buffers import ReplayBuffer, PrioritizedReplayBuffer - - -def test_extend_uniform(): - nvals = 16 - states = [np.random.rand(2, 2) for _ in range(nvals)] - actions = [np.random.rand(2) for _ in range(nvals)] - rewards = [np.random.rand() for _ in range(nvals)] - newstate = [np.random.rand(2, 2) for _ in range(nvals)] - done = [np.random.randint(0, 2) for _ in range(nvals)] - - size = 32 - baseline = ReplayBuffer(size) - ext = ReplayBuffer(size) - for data in zip(states, actions, rewards, newstate, done): - baseline.add(*data) - - states, actions, rewards, newstates, done = map( - np.array, [states, actions, rewards, newstate, done]) - - ext.extend(states, actions, rewards, 
newstates, done) - assert len(baseline) == len(ext) - - # Check buffers have same values - for i in range(nvals): - for j in range(5): - condition = (baseline.storage[i][j] == ext.storage[i][j]) - if isinstance(condition, np.ndarray): - # for obs, obs_t1 - assert np.all(condition) - else: - # for done, reward action - assert condition - - -def test_extend_prioritized(): - nvals = 16 - states = [np.random.rand(2, 2) for _ in range(nvals)] - actions = [np.random.rand(2) for _ in range(nvals)] - rewards = [np.random.rand() for _ in range(nvals)] - newstate = [np.random.rand(2, 2) for _ in range(nvals)] - done = [np.random.randint(0, 2) for _ in range(nvals)] - - size = 32 - alpha = 0.99 - baseline = PrioritizedReplayBuffer(size, alpha) - ext = PrioritizedReplayBuffer(size, alpha) - for data in zip(states, actions, rewards, newstate, done): - baseline.add(*data) - - states, actions, rewards, newstates, done = map( - np.array, [states, actions, rewards, newstate, done]) - - ext.extend(states, actions, rewards, newstates, done) - assert len(baseline) == len(ext) - - # Check buffers have same values - for i in range(nvals): - for j in range(5): - condition = (baseline.storage[i][j] == ext.storage[i][j]) - if isinstance(condition, np.ndarray): - # for obs, obs_t1 - assert np.all(condition) - else: - # for done, reward action - assert condition - - # assert priorities - assert (baseline._it_min._value == ext._it_min._value).all() - assert (baseline._it_sum._value == ext._it_sum._value).all() diff --git a/tests/test_save.py b/tests/test_save.py deleted file mode 100644 index 2a87a36c..00000000 --- a/tests/test_save.py +++ /dev/null @@ -1,209 +0,0 @@ -import os -from io import BytesIO -import json -import zipfile - -import pytest -import numpy as np - -from stable_baselines import A2C, ACER, ACKTR, DQN, PPO1, PPO2, TRPO -from stable_baselines.common.identity_env import IdentityEnv -from stable_baselines.common.vec_env import DummyVecEnv -from 
stable_baselines.common.evaluation import evaluate_policy -from stable_baselines.common.policies import MlpPolicy, FeedForwardPolicy - -N_EVAL_EPISODES = 5 - -MODEL_LIST = [ - A2C, - ACER, - ACKTR, - DQN, - PPO1, - PPO2, - TRPO, -] - -STORE_METHODS = [ - "path", - "file-like" -] - -STORE_FORMAT = [ - "zip", - "cloudpickle" -] - - -@pytest.mark.slow -@pytest.mark.parametrize("model_class", MODEL_LIST) -@pytest.mark.parametrize("storage_method", STORE_METHODS) -@pytest.mark.parametrize("store_format", STORE_FORMAT) -def test_model_manipulation(request, model_class, storage_method, store_format): - """ - Test if the algorithm (with a given policy) can be loaded and saved without any issues, the environment switching - works and that the action prediction works - - :param model_class: (BaseRLModel) A RL model - :param storage_method: (str) Should file be saved to a file ("path") or to a buffer - ("file-like") - :param store_format: (str) Save format, either "zip" or "cloudpickle". - """ - - # Use postfix ".model" so we can remove the file later - model_fname = './test_model_{}.model'.format(request.node.name) - store_as_cloudpickle = store_format == "cloudpickle" - - kwargs = dict(seed=0, gamma=0.4) - if model_class in [DQN]: - kwargs["learning_starts"] = 0 - kwargs["exploration_final_eps"] = 0.05 - - if model_class == PPO1: - kwargs["entcoeff"] = 0.0 - kwargs["optim_batchsize"] = 4 - kwargs["timesteps_per_actorbatch"] = 4 - - if model_class in [A2C, ACKTR, PPO2]: - kwargs["n_steps"] = 4 - kwargs["ent_coef"] = 0.0 - - if model_class in [TRPO]: - kwargs["timesteps_per_batch"] = 4 - - try: - env = DummyVecEnv([lambda: IdentityEnv(10)]) - - # create and train - model = model_class(policy="MlpPolicy", env=env, **kwargs) - model.learn(total_timesteps=15) - - env.envs[0].action_space.seed(0) - mean_reward, _ = evaluate_policy(model, env, deterministic=True, - n_eval_episodes=N_EVAL_EPISODES) - - # test action probability for given (obs, action) pair - env = model.get_env() - 
obs = env.reset() - observations = np.array([env.step([env.action_space.sample()])[0] for _ in range(10)]) - observations = np.squeeze(observations) - selected_actions, _ = model.predict(observations, deterministic=True) - - actions = np.array([env.action_space.sample() for _ in range(10)]) - actions_probas = model.action_probability(observations, actions=actions) - assert actions_probas.shape == (len(actions), 1), actions_probas.shape - assert actions_probas.min() >= 0, actions_probas.min() - assert actions_probas.max() <= 1, actions_probas.max() - - # saving - if storage_method == "path": # saving to a path - model.save(model_fname, cloudpickle=store_as_cloudpickle) - else: # saving to a file-like object (BytesIO in this case) - b_io = BytesIO() - model.save(b_io, cloudpickle=store_as_cloudpickle) - model_bytes = b_io.getvalue() - b_io.close() - - del model, env - - # loading - if storage_method == "path": # loading from path - model = model_class.load(model_fname) - else: - b_io = BytesIO(model_bytes) # loading from file-like object (BytesIO in this case) - model = model_class.load(b_io) - b_io.close() - - # changing environment (note: this can be done at loading) - env = DummyVecEnv([lambda: IdentityEnv(10)]) - model.set_env(env) - - # check if model still selects the same actions - new_selected_actions, _ = model.predict(observations, deterministic=True) - assert np.allclose(selected_actions, new_selected_actions, 1e-4) - - # learn post loading - model.learn(total_timesteps=15) - - # predict new values - evaluate_policy(model, env, n_eval_episodes=N_EVAL_EPISODES) - - del model, env - - finally: - if os.path.exists(model_fname): - os.remove(model_fname) - - -class CustomMlpPolicy(FeedForwardPolicy): - """A dummy "custom" policy to test out custom_objects""" - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs): - super(CustomMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, - n_batch, reuse, 
feature_extraction="mlp", - **_kwargs) - - -@pytest.mark.parametrize("model_class", MODEL_LIST) -def test_save_custom_objects(request, model_class): - """ - Test feeding custom_objects in model.load(...) function - """ - # Skip DQN (not an actor-critic policy) - if model_class == DQN: - return - - model_fname = './test_model_{}.zip'.format(request.node.name) - - try: - env = DummyVecEnv([lambda: IdentityEnv(10)]) - - # Create and save model with default MLP policy - model = model_class(policy=MlpPolicy, env=env) - model.save(model_fname) - - del model, env - - # Corrupt "policy" serialization in the file - data_file = zipfile.ZipFile(model_fname, "r") - # Load all data (can't just update one file in the archive) - parameter_list = data_file.read("parameter_list") - parameters = data_file.read("parameters") - class_data = json.loads(data_file.read("data").decode()) - data_file.close() - - # Corrupt serialization of the "policy" - class_data["policy"][":serialized:"] = ( - "Adding this should break serialization" + - class_data["policy"][":serialized:"] - ) - - # And dump everything back to the model file - data_file = zipfile.ZipFile(model_fname, "w") - data_file.writestr("data", json.dumps(class_data)) - data_file.writestr("parameter_list", parameter_list) - data_file.writestr("parameters", parameters) - data_file.close() - - # Try loading the model. This should - # result in an error - with pytest.raises(RuntimeError): - model = model_class.load(model_fname) - - # Load model with custom objects ("custom" MlpPolicy) - # and it should work fine. - # Note: We could load model with just vanilla - # MlpPolicy, too. 
- model = model_class.load( - model_fname, - custom_objects={ - "policy": CustomMlpPolicy - } - ) - - # Make sure we loaded custom MLP policy - assert model.policy == CustomMlpPolicy - del model - - finally: - if os.path.exists(model_fname): - os.remove(model_fname) diff --git a/tests/test_schedules.py b/tests/test_schedules.py deleted file mode 100644 index 433a6e79..00000000 --- a/tests/test_schedules.py +++ /dev/null @@ -1,58 +0,0 @@ -import numpy as np - -from stable_baselines.common.schedules import ConstantSchedule, PiecewiseSchedule, LinearSchedule - - -def test_piecewise_schedule(): - """ - test PiecewiseSchedule - """ - piecewise_sched = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], - outside_value=500) - - assert np.isclose(piecewise_sched.value(-10), 500) - assert np.isclose(piecewise_sched.value(0), 150) - assert np.isclose(piecewise_sched.value(5), 200) - assert np.isclose(piecewise_sched.value(9), 80) - assert np.isclose(piecewise_sched.value(50), 50) - assert np.isclose(piecewise_sched.value(80), 50) - assert np.isclose(piecewise_sched.value(150), 0) - assert np.isclose(piecewise_sched.value(175), -25) - assert np.isclose(piecewise_sched.value(201), 500) - assert np.isclose(piecewise_sched.value(500), 500) - - assert np.isclose(piecewise_sched.value(200 - 1e-10), -50) - - -def test_constant_schedule(): - """ - test ConstantSchedule - """ - constant_sched = ConstantSchedule(5) - for i in range(-100, 100): - assert np.isclose(constant_sched.value(i), 5) - - -def test_linear_schedule(): - """ - test LinearSchedule - """ - linear_sched = LinearSchedule(schedule_timesteps=100, initial_p=0.2, final_p=0.8) - assert np.isclose(linear_sched.value(50), 0.5) - assert np.isclose(linear_sched.value(0), 0.2) - assert np.isclose(linear_sched.value(100), 0.8) - - linear_sched = LinearSchedule(schedule_timesteps=100, initial_p=0.8, final_p=0.2) - assert np.isclose(linear_sched.value(50), 0.5) - assert np.isclose(linear_sched.value(0), 0.8) 
- assert np.isclose(linear_sched.value(100), 0.2) - - linear_sched = LinearSchedule(schedule_timesteps=100, initial_p=-0.6, final_p=0.2) - assert np.isclose(linear_sched.value(50), -0.2) - assert np.isclose(linear_sched.value(0), -0.6) - assert np.isclose(linear_sched.value(100), 0.2) - - linear_sched = LinearSchedule(schedule_timesteps=100, initial_p=0.2, final_p=-0.6) - assert np.isclose(linear_sched.value(50), -0.2) - assert np.isclose(linear_sched.value(0), 0.2) - assert np.isclose(linear_sched.value(100), -0.6) diff --git a/tests/test_segment_tree.py b/tests/test_segment_tree.py deleted file mode 100644 index db0c5bc8..00000000 --- a/tests/test_segment_tree.py +++ /dev/null @@ -1,194 +0,0 @@ -import numpy as np - -from stable_baselines.common.segment_tree import SumSegmentTree, MinSegmentTree - - -def test_tree_set(): - """ - test Segment Tree data structure - """ - tree = SumSegmentTree(4) - - tree[np.array([2, 3])] = [1.0, 3.0] - - assert np.isclose(tree.sum(), 4.0) - assert np.isclose(tree.sum(0, 2), 0.0) - assert np.isclose(tree.sum(0, 3), 1.0) - assert np.isclose(tree.sum(2, 3), 1.0) - assert np.isclose(tree.sum(2, -1), 1.0) - assert np.isclose(tree.sum(2, 4), 4.0) - - tree = SumSegmentTree(4) - tree[2] = 1.0 - tree[3] = 3.0 - - assert np.isclose(tree.sum(), 4.0) - assert np.isclose(tree.sum(0, 2), 0.0) - assert np.isclose(tree.sum(0, 3), 1.0) - assert np.isclose(tree.sum(2, 3), 1.0) - assert np.isclose(tree.sum(2, -1), 1.0) - assert np.isclose(tree.sum(2, 4), 4.0) - - -def test_tree_set_overlap(): - """ - test Segment Tree data structure - """ - tree = SumSegmentTree(4) - - tree[np.array([2])] = 1.0 - tree[np.array([2])] = 3.0 - - assert np.isclose(tree.sum(), 3.0) - assert np.isclose(tree.sum(2, 3), 3.0) - assert np.isclose(tree.sum(2, -1), 3.0) - assert np.isclose(tree.sum(2, 4), 3.0) - assert np.isclose(tree.sum(1, 2), 0.0) - - tree = SumSegmentTree(4) - - tree[2] = 1.0 - tree[2] = 3.0 - - assert np.isclose(tree.sum(), 3.0) - assert 
np.isclose(tree.sum(2, 3), 3.0) - assert np.isclose(tree.sum(2, -1), 3.0) - assert np.isclose(tree.sum(2, 4), 3.0) - assert np.isclose(tree.sum(1, 2), 0.0) - - -def test_prefixsum_idx(): - """ - test Segment Tree data structure - """ - tree = SumSegmentTree(4) - - tree[2] = 1.0 - tree[3] = 3.0 - - assert tree.find_prefixsum_idx(0.0) == 2 - assert tree.find_prefixsum_idx(0.5) == 2 - assert tree.find_prefixsum_idx(0.99) == 2 - assert tree.find_prefixsum_idx(1.01) == 3 - assert tree.find_prefixsum_idx(3.00) == 3 - assert tree.find_prefixsum_idx(4.00) == 3 - assert np.all(tree.find_prefixsum_idx([0.0, 0.5, 0.99, 1.01, 3.00, 4.00]) == [2, 2, 2, 3, 3, 3]) - - tree = SumSegmentTree(4) - - tree[np.array([2, 3])] = [1.0, 3.0] - - assert tree.find_prefixsum_idx(0.0) == 2 - assert tree.find_prefixsum_idx(0.5) == 2 - assert tree.find_prefixsum_idx(0.99) == 2 - assert tree.find_prefixsum_idx(1.01) == 3 - assert tree.find_prefixsum_idx(3.00) == 3 - assert tree.find_prefixsum_idx(4.00) == 3 - assert np.all(tree.find_prefixsum_idx([0.0, 0.5, 0.99, 1.01, 3.00, 4.00]) == [2, 2, 2, 3, 3, 3]) - - -def test_prefixsum_idx2(): - """ - test Segment Tree data structure - """ - tree = SumSegmentTree(4) - - tree[np.array([0, 1, 2, 3])] = [0.5, 1.0, 1.0, 3.0] - - assert tree.find_prefixsum_idx(0.00) == 0 - assert tree.find_prefixsum_idx(0.55) == 1 - assert tree.find_prefixsum_idx(0.99) == 1 - assert tree.find_prefixsum_idx(1.51) == 2 - assert tree.find_prefixsum_idx(3.00) == 3 - assert tree.find_prefixsum_idx(5.50) == 3 - - tree = SumSegmentTree(4) - - tree[0] = 0.5 - tree[1] = 1.0 - tree[2] = 1.0 - tree[3] = 3.0 - - assert tree.find_prefixsum_idx(0.00) == 0 - assert tree.find_prefixsum_idx(0.55) == 1 - assert tree.find_prefixsum_idx(0.99) == 1 - assert tree.find_prefixsum_idx(1.51) == 2 - assert tree.find_prefixsum_idx(3.00) == 3 - assert tree.find_prefixsum_idx(5.50) == 3 - - -def test_max_interval_tree(): - """ - test Segment Tree data structure - """ - tree = MinSegmentTree(4) - - tree[0] 
= 1.0 - tree[2] = 0.5 - tree[3] = 3.0 - - assert np.isclose(tree.min(), 0.5) - assert np.isclose(tree.min(0, 2), 1.0) - assert np.isclose(tree.min(0, 3), 0.5) - assert np.isclose(tree.min(0, -1), 0.5) - assert np.isclose(tree.min(2, 4), 0.5) - assert np.isclose(tree.min(3, 4), 3.0) - - tree[2] = 0.7 - - assert np.isclose(tree.min(), 0.7) - assert np.isclose(tree.min(0, 2), 1.0) - assert np.isclose(tree.min(0, 3), 0.7) - assert np.isclose(tree.min(0, -1), 0.7) - assert np.isclose(tree.min(2, 4), 0.7) - assert np.isclose(tree.min(3, 4), 3.0) - - tree[2] = 4.0 - - assert np.isclose(tree.min(), 1.0) - assert np.isclose(tree.min(0, 2), 1.0) - assert np.isclose(tree.min(0, 3), 1.0) - assert np.isclose(tree.min(0, -1), 1.0) - assert np.isclose(tree.min(2, 4), 3.0) - assert np.isclose(tree.min(2, 3), 4.0) - assert np.isclose(tree.min(2, -1), 4.0) - assert np.isclose(tree.min(3, 4), 3.0) - - tree = MinSegmentTree(4) - - tree[np.array([0, 2, 3])] = [1.0, 0.5, 3.0] - - assert np.isclose(tree.min(), 0.5) - assert np.isclose(tree.min(0, 2), 1.0) - assert np.isclose(tree.min(0, 3), 0.5) - assert np.isclose(tree.min(0, -1), 0.5) - assert np.isclose(tree.min(2, 4), 0.5) - assert np.isclose(tree.min(3, 4), 3.0) - - tree[np.array([2])] = 0.7 - - assert np.isclose(tree.min(), 0.7) - assert np.isclose(tree.min(0, 2), 1.0) - assert np.isclose(tree.min(0, 3), 0.7) - assert np.isclose(tree.min(0, -1), 0.7) - assert np.isclose(tree.min(2, 4), 0.7) - assert np.isclose(tree.min(3, 4), 3.0) - - tree[np.array([2])] = 4.0 - - assert np.isclose(tree.min(), 1.0) - assert np.isclose(tree.min(0, 2), 1.0) - assert np.isclose(tree.min(0, 3), 1.0) - assert np.isclose(tree.min(0, -1), 1.0) - assert np.isclose(tree.min(2, 4), 3.0) - assert np.isclose(tree.min(2, 3), 4.0) - assert np.isclose(tree.min(2, -1), 4.0) - assert np.isclose(tree.min(3, 4), 3.0) - - -if __name__ == '__main__': - test_tree_set() - test_tree_set_overlap() - test_prefixsum_idx() - test_prefixsum_idx2() - test_max_interval_tree() 
diff --git a/tests/test_tensorboard.py b/tests/test_tensorboard.py deleted file mode 100644 index ef6d7a91..00000000 --- a/tests/test_tensorboard.py +++ /dev/null @@ -1,51 +0,0 @@ -import os -import shutil - -import pytest - -from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, PPO1, PPO2, SAC, TD3, TRPO - -TENSORBOARD_DIR = '/tmp/tb_dir/' - -if os.path.isdir(TENSORBOARD_DIR): - shutil.rmtree(TENSORBOARD_DIR) - -MODEL_DICT = { - 'a2c': (A2C, 'CartPole-v1'), - 'acer': (ACER, 'CartPole-v1'), - 'acktr': (ACKTR, 'CartPole-v1'), - 'dqn': (DQN, 'CartPole-v1'), - 'ddpg': (DDPG, 'Pendulum-v0'), - 'ppo1': (PPO1, 'CartPole-v1'), - 'ppo2': (PPO2, 'CartPole-v1'), - 'sac': (SAC, 'Pendulum-v0'), - 'td3': (TD3, 'Pendulum-v0'), - 'trpo': (TRPO, 'CartPole-v1'), -} - -N_STEPS = 300 - - -@pytest.mark.parametrize("model_name", MODEL_DICT.keys()) -def test_tensorboard(model_name): - logname = model_name.upper() - algo, env_id = MODEL_DICT[model_name] - model = algo('MlpPolicy', env_id, verbose=1, tensorboard_log=TENSORBOARD_DIR) - model.learn(N_STEPS) - model.learn(N_STEPS, reset_num_timesteps=False) - - assert os.path.isdir(TENSORBOARD_DIR + logname + "_1") - assert not os.path.isdir(TENSORBOARD_DIR + logname + "_2") - - -@pytest.mark.parametrize("model_name", MODEL_DICT.keys()) -def test_multiple_runs(model_name): - logname = "tb_multiple_runs_" + model_name - algo, env_id = MODEL_DICT[model_name] - model = algo('MlpPolicy', env_id, verbose=1, tensorboard_log=TENSORBOARD_DIR) - model.learn(N_STEPS, tb_log_name=logname) - model.learn(N_STEPS, tb_log_name=logname) - - assert os.path.isdir(TENSORBOARD_DIR + logname + "_1") - # Check that the log dir name increments correctly - assert os.path.isdir(TENSORBOARD_DIR + logname + "_2") diff --git a/tests/test_tf_util.py b/tests/test_tf_util.py deleted file mode 100644 index d71374da..00000000 --- a/tests/test_tf_util.py +++ /dev/null @@ -1,61 +0,0 @@ -# tests for tf_util -import numpy as np -import tensorflow as tf - -from 
stable_baselines.common.tf_util import function, initialize, single_threaded_session, is_image - - -def test_function(): - """ - test the function function in tf_util - """ - with tf.Graph().as_default(): - x_ph = tf.placeholder(tf.int32, (), name="x") - y_ph = tf.placeholder(tf.int32, (), name="y") - z_ph = 3 * x_ph + 2 * y_ph - linear_fn = function([x_ph, y_ph], z_ph, givens={y_ph: 0}) - - with single_threaded_session(): - initialize() - - assert linear_fn(2) == 6 - assert linear_fn(2, 2) == 10 - - -def test_multikwargs(): - """ - test the function function in tf_util - """ - with tf.Graph().as_default(): - x_ph = tf.placeholder(tf.int32, (), name="x") - with tf.variable_scope("other"): - x2_ph = tf.placeholder(tf.int32, (), name="x") - z_ph = 3 * x_ph + 2 * x2_ph - - linear_fn = function([x_ph, x2_ph], z_ph, givens={x2_ph: 0}) - with single_threaded_session(): - initialize() - assert linear_fn(2) == 6 - assert linear_fn(2, 2) == 10 - - -def test_image_detection(): - rgb = (32, 64, 3) - gray = (43, 23, 1) - rgbd = (12, 32, 4) - invalid_1 = (32, 12) - invalid_2 = (12, 32, 6) - - # TF checks - for shape in (rgb, gray, rgbd): - assert is_image(tf.placeholder(tf.uint8, shape=shape)) - - for shape in (invalid_1, invalid_2): - assert not is_image(tf.placeholder(tf.uint8, shape=shape)) - - # Numpy checks - for shape in (rgb, gray, rgbd): - assert is_image(np.ones(shape)) - - for shape in (invalid_1, invalid_2): - assert not is_image(np.ones(shape)) diff --git a/tests/test_utils.py b/tests/test_utils.py deleted file mode 100644 index d83f2d4a..00000000 --- a/tests/test_utils.py +++ /dev/null @@ -1,79 +0,0 @@ -import os -import shutil - -import pytest -import gym - -from stable_baselines import A2C -from stable_baselines.bench.monitor import Monitor -from stable_baselines.common.evaluation import evaluate_policy -from stable_baselines.common.cmd_util import make_vec_env -from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv - - 
-@pytest.mark.parametrize("env_id", ['CartPole-v1', lambda: gym.make('CartPole-v1')]) -@pytest.mark.parametrize("n_envs", [1, 2]) -@pytest.mark.parametrize("vec_env_cls", [None, SubprocVecEnv]) -@pytest.mark.parametrize("wrapper_class", [None, gym.wrappers.TimeLimit]) -def test_make_vec_env(env_id, n_envs, vec_env_cls, wrapper_class): - env = make_vec_env(env_id, n_envs, vec_env_cls=vec_env_cls, - wrapper_class=wrapper_class, monitor_dir=None, seed=0) - - assert env.num_envs == n_envs - - if vec_env_cls is None: - assert isinstance(env, DummyVecEnv) - if wrapper_class is not None: - assert isinstance(env.envs[0], wrapper_class) - else: - assert isinstance(env.envs[0], Monitor) - else: - assert isinstance(env, SubprocVecEnv) - # Kill subprocesses - env.close() - - -def test_custom_vec_env(): - """ - Stand alone test for a special case (passing a custom VecEnv class) to avoid doubling the number of tests. - """ - monitor_dir = 'logs/test_make_vec_env/' - env = make_vec_env('CartPole-v1', n_envs=1, - monitor_dir=monitor_dir, seed=0, - vec_env_cls=SubprocVecEnv, vec_env_kwargs={'start_method': None}) - - assert env.num_envs == 1 - assert isinstance(env, SubprocVecEnv) - assert os.path.isdir('logs/test_make_vec_env/') - # Kill subprocess - env.close() - # Cleanup folder - shutil.rmtree(monitor_dir) - - # This should fail because DummyVecEnv does not have any keyword argument - with pytest.raises(TypeError): - make_vec_env('CartPole-v1', n_envs=1, vec_env_kwargs={'dummy': False}) - - -def test_evaluate_policy(): - model = A2C('MlpPolicy', 'Pendulum-v0', seed=0) - n_steps_per_episode, n_eval_episodes = 200, 2 - model.n_callback_calls = 0 - - def dummy_callback(locals_, _globals): - locals_['model'].n_callback_calls += 1 - - _, episode_lengths = evaluate_policy(model, model.get_env(), n_eval_episodes, deterministic=True, - render=False, callback=dummy_callback, reward_threshold=None, - return_episode_rewards=True) - - n_steps = sum(episode_lengths) - assert n_steps == 
n_steps_per_episode * n_eval_episodes - assert n_steps == model.n_callback_calls - - # Reaching a mean reward of zero is impossible with the Pendulum env - with pytest.raises(AssertionError): - evaluate_policy(model, model.get_env(), n_eval_episodes, reward_threshold=0.0) - - episode_rewards, _ = evaluate_policy(model, model.get_env(), n_eval_episodes, return_episode_rewards=True) - assert len(episode_rewards) == n_eval_episodes diff --git a/tests/test_vec_check_nan.py b/tests/test_vec_check_nan.py deleted file mode 100644 index 93b2cf4d..00000000 --- a/tests/test_vec_check_nan.py +++ /dev/null @@ -1,71 +0,0 @@ -import gym -from gym import spaces -import numpy as np - -from stable_baselines.common.vec_env import DummyVecEnv, VecCheckNan - - -class NanAndInfEnv(gym.Env): - """Custom Environment that raised NaNs and Infs""" - metadata = {'render.modes': ['human']} - - def __init__(self): - super(NanAndInfEnv, self).__init__() - self.action_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64) - self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64) - - @staticmethod - def step(action): - if np.all(np.array(action) > 0): - obs = float('NaN') - elif np.all(np.array(action) < 0): - obs = float('inf') - else: - obs = 0 - return [obs], 0.0, False, {} - - @staticmethod - def reset(): - return [0.0] - - def render(self, mode='human', close=False): - pass - - -def test_check_nan(): - """Test VecCheckNan Object""" - - env = DummyVecEnv([NanAndInfEnv]) - env = VecCheckNan(env, raise_exception=True) - - env.step([[0]]) - - try: - env.step([[float('NaN')]]) - except ValueError: - pass - else: - assert False - - try: - env.step([[float('inf')]]) - except ValueError: - pass - else: - assert False - - try: - env.step([[-1]]) - except ValueError: - pass - else: - assert False - - try: - env.step([[1]]) - except ValueError: - pass - else: - assert False - - env.step(np.array([[0, 1], [0, 1]])) diff --git 
a/tests/test_vec_envs.py b/tests/test_vec_envs.py deleted file mode 100644 index 99442467..00000000 --- a/tests/test_vec_envs.py +++ /dev/null @@ -1,339 +0,0 @@ -import collections -import functools -import itertools -import multiprocessing - -import pytest -import gym -import numpy as np - -from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv, VecNormalize, VecFrameStack - -N_ENVS = 3 -VEC_ENV_CLASSES = [DummyVecEnv, SubprocVecEnv] -VEC_ENV_WRAPPERS = [None, VecNormalize, VecFrameStack] - - -class CustomGymEnv(gym.Env): - def __init__(self, space): - """ - Custom gym environment for testing purposes - """ - self.action_space = space - self.observation_space = space - self.current_step = 0 - self.ep_length = 4 - - def reset(self): - self.current_step = 0 - self._choose_next_state() - return self.state - - def step(self, action): - reward = 1 - self._choose_next_state() - self.current_step += 1 - done = self.current_step >= self.ep_length - return self.state, reward, done, {} - - def _choose_next_state(self): - self.state = self.observation_space.sample() - - def render(self, mode='human'): - if mode == 'rgb_array': - return np.zeros((4, 4, 3)) - - def seed(self, seed=None): - pass - - @staticmethod - def custom_method(dim_0=1, dim_1=1): - """ - Dummy method to test call to custom method - from VecEnv - - :param dim_0: (int) - :param dim_1: (int) - :return: (np.ndarray) - """ - return np.ones((dim_0, dim_1)) - - -@pytest.mark.parametrize('vec_env_class', VEC_ENV_CLASSES) -@pytest.mark.parametrize('vec_env_wrapper', VEC_ENV_WRAPPERS) -def test_vecenv_custom_calls(vec_env_class, vec_env_wrapper): - """Test access to methods/attributes of vectorized environments""" - def make_env(): - return CustomGymEnv(gym.spaces.Box(low=np.zeros(2), high=np.ones(2))) - vec_env = vec_env_class([make_env for _ in range(N_ENVS)]) - - if vec_env_wrapper is not None: - if vec_env_wrapper == VecFrameStack: - vec_env = vec_env_wrapper(vec_env, n_stack=2) - else: - vec_env 
= vec_env_wrapper(vec_env) - - # Test seed method - vec_env.seed(0) - # Test render method call - # vec_env.render() # we need a X server to test the "human" mode - vec_env.render(mode='rgb_array') - - env_method_results = vec_env.env_method('custom_method', 1, indices=None, dim_1=2) - setattr_results = [] - # Set current_step to an arbitrary value - for env_idx in range(N_ENVS): - setattr_results.append(vec_env.set_attr('current_step', env_idx, indices=env_idx)) - # Retrieve the value for each environment - getattr_results = vec_env.get_attr('current_step') - - assert len(env_method_results) == N_ENVS - assert len(setattr_results) == N_ENVS - assert len(getattr_results) == N_ENVS - - for env_idx in range(N_ENVS): - assert (env_method_results[env_idx] == np.ones((1, 2))).all() - assert setattr_results[env_idx] is None - assert getattr_results[env_idx] == env_idx - - # Call env_method on a subset of the VecEnv - env_method_subset = vec_env.env_method('custom_method', 1, indices=[0, 2], dim_1=3) - assert (env_method_subset[0] == np.ones((1, 3))).all() - assert (env_method_subset[1] == np.ones((1, 3))).all() - assert len(env_method_subset) == 2 - - # Test to change value for all the environments - setattr_result = vec_env.set_attr('current_step', 42, indices=None) - getattr_result = vec_env.get_attr('current_step') - assert setattr_result is None - assert getattr_result == [42 for _ in range(N_ENVS)] - - # Additional tests for setattr that does not affect all the environments - vec_env.reset() - setattr_result = vec_env.set_attr('current_step', 12, indices=[0, 1]) - getattr_result = vec_env.get_attr('current_step') - getattr_result_subset = vec_env.get_attr('current_step', indices=[0, 1]) - assert setattr_result is None - assert getattr_result == [12 for _ in range(2)] + [0 for _ in range(N_ENVS - 2)] - assert getattr_result_subset == [12, 12] - assert vec_env.get_attr('current_step', indices=[0, 2]) == [12, 0] - - vec_env.reset() - # Change value only for first and 
last environment - setattr_result = vec_env.set_attr('current_step', 12, indices=[0, -1]) - getattr_result = vec_env.get_attr('current_step') - assert setattr_result is None - assert getattr_result == [12] + [0 for _ in range(N_ENVS - 2)] + [12] - assert vec_env.get_attr('current_step', indices=[-1]) == [12] - - vec_env.close() - - -class StepEnv(gym.Env): - def __init__(self, max_steps): - """Gym environment for testing that terminal observation is inserted - correctly.""" - self.action_space = gym.spaces.Discrete(2) - self.observation_space = gym.spaces.Box(np.array([0]), np.array([999]), - dtype='int') - self.max_steps = max_steps - self.current_step = 0 - - def reset(self): - self.current_step = 0 - return np.array([self.current_step], dtype='int') - - def step(self, action): - prev_step = self.current_step - self.current_step += 1 - done = self.current_step >= self.max_steps - return np.array([prev_step], dtype='int'), 0.0, done, {} - - -@pytest.mark.parametrize('vec_env_class', VEC_ENV_CLASSES) -@pytest.mark.parametrize('vec_env_wrapper', VEC_ENV_WRAPPERS) -def test_vecenv_terminal_obs(vec_env_class, vec_env_wrapper): - """Test that 'terminal_observation' gets added to info dict upon - termination.""" - step_nums = [i + 5 for i in range(N_ENVS)] - vec_env = vec_env_class([functools.partial(StepEnv, n) for n in step_nums]) - - if vec_env_wrapper is not None: - if vec_env_wrapper == VecFrameStack: - vec_env = vec_env_wrapper(vec_env, n_stack=2) - else: - vec_env = vec_env_wrapper(vec_env) - - zero_acts = np.zeros((N_ENVS,), dtype='int') - prev_obs_b = vec_env.reset() - for step_num in range(1, max(step_nums) + 1): - obs_b, _, done_b, info_b = vec_env.step(zero_acts) - assert len(obs_b) == N_ENVS - assert len(done_b) == N_ENVS - assert len(info_b) == N_ENVS - env_iter = zip(prev_obs_b, obs_b, done_b, info_b, step_nums) - for prev_obs, obs, done, info, final_step_num in env_iter: - assert done == (step_num == final_step_num) - if not done: - assert 
'terminal_observation' not in info - else: - terminal_obs = info['terminal_observation'] - - # do some rough ordering checks that should work for all - # wrappers, including VecNormalize - assert np.all(prev_obs < terminal_obs) - assert np.all(obs < prev_obs) - - if not isinstance(vec_env, VecNormalize): - # more precise tests that we can't do with VecNormalize - # (which changes observation values) - assert np.all(prev_obs + 1 == terminal_obs) - assert np.all(obs == 0) - - prev_obs_b = obs_b - - vec_env.close() - - -SPACES = collections.OrderedDict([ - ('discrete', gym.spaces.Discrete(2)), - ('multidiscrete', gym.spaces.MultiDiscrete([2, 3])), - ('multibinary', gym.spaces.MultiBinary(3)), - ('continuous', gym.spaces.Box(low=np.zeros(2), high=np.ones(2))), -]) - - -def check_vecenv_spaces(vec_env_class, space, obs_assert): - """Helper method to check observation spaces in vectorized environments.""" - def make_env(): - return CustomGymEnv(space) - - vec_env = vec_env_class([make_env for _ in range(N_ENVS)]) - obs = vec_env.reset() - obs_assert(obs) - - dones = [False] * N_ENVS - while not any(dones): - actions = [vec_env.action_space.sample() for _ in range(N_ENVS)] - obs, _rews, dones, _infos = vec_env.step(actions) - obs_assert(obs) - vec_env.close() - - -def check_vecenv_obs(obs, space): - """Helper method to check observations from multiple environments each belong to - the appropriate observation space.""" - assert obs.shape[0] == N_ENVS - for value in obs: - assert space.contains(value) - - -@pytest.mark.parametrize('vec_env_class,space', itertools.product(VEC_ENV_CLASSES, SPACES.values())) -def test_vecenv_single_space(vec_env_class, space): - def obs_assert(obs): - return check_vecenv_obs(obs, space) - - check_vecenv_spaces(vec_env_class, space, obs_assert) - - -class _UnorderedDictSpace(gym.spaces.Dict): - """Like DictSpace, but returns an unordered dict when sampling.""" - def sample(self): - return dict(super().sample()) - - 
-@pytest.mark.parametrize('vec_env_class', VEC_ENV_CLASSES) -def test_vecenv_dict_spaces(vec_env_class): - """Test dictionary observation spaces with vectorized environments.""" - space = gym.spaces.Dict(SPACES) - - def obs_assert(obs): - assert isinstance(obs, collections.OrderedDict) - assert obs.keys() == space.spaces.keys() - for key, values in obs.items(): - check_vecenv_obs(values, space.spaces[key]) - - check_vecenv_spaces(vec_env_class, space, obs_assert) - - unordered_space = _UnorderedDictSpace(SPACES) - # Check that vec_env_class can accept unordered dict observations (and convert to OrderedDict) - check_vecenv_spaces(vec_env_class, unordered_space, obs_assert) - - -@pytest.mark.parametrize('vec_env_class', VEC_ENV_CLASSES) -def test_vecenv_tuple_spaces(vec_env_class): - """Test tuple observation spaces with vectorized environments.""" - space = gym.spaces.Tuple(tuple(SPACES.values())) - - def obs_assert(obs): - assert isinstance(obs, tuple) - assert len(obs) == len(space.spaces) - for values, inner_space in zip(obs, space.spaces): - check_vecenv_obs(values, inner_space) - - return check_vecenv_spaces(vec_env_class, space, obs_assert) - - -def test_subproc_start_method(): - start_methods = [None] - # Only test thread-safe methods. Others may deadlock tests! 
(gh/428) - # safe_methods = {'forkserver', 'spawn'} - safe_methods = {'spawn'} - available_methods = multiprocessing.get_all_start_methods() - start_methods += list(safe_methods.intersection(available_methods)) - space = gym.spaces.Discrete(2) - - def obs_assert(obs): - return check_vecenv_obs(obs, space) - - for start_method in start_methods: - vec_env_class = functools.partial(SubprocVecEnv, start_method=start_method) - check_vecenv_spaces(vec_env_class, space, obs_assert) - - with pytest.raises(ValueError, match="cannot find context for 'illegal_method'"): - vec_env_class = functools.partial(SubprocVecEnv, start_method='illegal_method') - check_vecenv_spaces(vec_env_class, space, obs_assert) - - -class CustomWrapperA(VecNormalize): - def __init__(self, venv): - VecNormalize.__init__(self, venv) - self.var_a = 'a' - - -class CustomWrapperB(VecNormalize): - def __init__(self, venv): - VecNormalize.__init__(self, venv) - self.var_b = 'b' - - def func_b(self): - return self.var_b - - def name_test(self): - return self.__class__ - - -class CustomWrapperBB(CustomWrapperB): - def __init__(self, venv): - CustomWrapperB.__init__(self, venv) - self.var_bb = 'bb' - - -def test_vecenv_wrapper_getattr(): - def make_env(): - return CustomGymEnv(gym.spaces.Box(low=np.zeros(2), high=np.ones(2))) - vec_env = DummyVecEnv([make_env for _ in range(N_ENVS)]) - wrapped = CustomWrapperA(CustomWrapperBB(vec_env)) - assert wrapped.var_a == 'a' - assert wrapped.var_b == 'b' - assert wrapped.var_bb == 'bb' - assert wrapped.func_b() == 'b' - assert wrapped.name_test() == CustomWrapperBB - - double_wrapped = CustomWrapperA(CustomWrapperB(wrapped)) - dummy = double_wrapped.var_a # should not raise as it is directly defined here - with pytest.raises(AttributeError): # should raise due to ambiguity - dummy = double_wrapped.var_b - with pytest.raises(AttributeError): # should raise as does not exist - dummy = double_wrapped.nonexistent_attribute - del dummy # keep linter happy diff --git 
a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py deleted file mode 100644 index 0a98a02d..00000000 --- a/tests/test_vec_normalize.py +++ /dev/null @@ -1,194 +0,0 @@ -import subprocess - -import gym -import numpy as np -import pytest - -from stable_baselines import DDPG, DQN, SAC, TD3 -from stable_baselines.common.running_mean_std import RunningMeanStd -from stable_baselines.common.vec_env import (DummyVecEnv, VecNormalize, VecFrameStack, - sync_envs_normalization, unwrap_vec_normalize) -from .test_common import _assert_eq - -ENV_ID = 'Pendulum-v0' - - -def make_env(): - return gym.make(ENV_ID) - - -def test_runningmeanstd(): - """Test RunningMeanStd object""" - for (x_1, x_2, x_3) in [ - (np.random.randn(3), np.random.randn(4), np.random.randn(5)), - (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2))]: - rms = RunningMeanStd(epsilon=0.0, shape=x_1.shape[1:]) - - x_cat = np.concatenate([x_1, x_2, x_3], axis=0) - moments_1 = [x_cat.mean(axis=0), x_cat.var(axis=0)] - rms.update(x_1) - rms.update(x_2) - rms.update(x_3) - moments_2 = [rms.mean, rms.var] - - assert np.allclose(moments_1, moments_2) - - -def check_rms_equal(rmsa, rmsb): - assert np.all(rmsa.mean == rmsb.mean) - assert np.all(rmsa.var == rmsb.var) - assert np.all(rmsa.count == rmsb.count) - - -def check_vec_norm_equal(norma, normb): - assert norma.observation_space == normb.observation_space - assert norma.action_space == normb.action_space - assert norma.num_envs == normb.num_envs - - check_rms_equal(norma.obs_rms, normb.obs_rms) - check_rms_equal(norma.ret_rms, normb.ret_rms) - assert norma.clip_obs == normb.clip_obs - assert norma.clip_reward == normb.clip_reward - assert norma.norm_obs == normb.norm_obs - assert norma.norm_reward == normb.norm_reward - - assert np.all(norma.ret == normb.ret) - assert norma.gamma == normb.gamma - assert norma.epsilon == normb.epsilon - assert norma.training == normb.training - - -def test_vec_env(tmpdir): - """Test VecNormalize Object""" - 
clip_obs = 0.5 - clip_reward = 5.0 - - orig_venv = DummyVecEnv([make_env]) - norm_venv = VecNormalize(orig_venv, norm_obs=True, norm_reward=True, clip_obs=clip_obs, clip_reward=clip_reward) - _, done = norm_venv.reset(), [False] - while not done[0]: - actions = [norm_venv.action_space.sample()] - obs, rew, done, _ = norm_venv.step(actions) - assert np.max(np.abs(obs)) <= clip_obs - assert np.max(np.abs(rew)) <= clip_reward - - path = str(tmpdir.join("vec_normalize")) - norm_venv.save(path) - deserialized = VecNormalize.load(path, venv=orig_venv) - check_vec_norm_equal(norm_venv, deserialized) - - -def _make_warmstart_cartpole(): - """Warm-start VecNormalize by stepping through CartPole""" - venv = DummyVecEnv([lambda: gym.make("CartPole-v1")]) - venv = VecNormalize(venv) - venv.reset() - venv.get_original_obs() - - for _ in range(100): - actions = [venv.action_space.sample()] - venv.step(actions) - return venv - - -def test_get_original(): - venv = _make_warmstart_cartpole() - for _ in range(3): - actions = [venv.action_space.sample()] - obs, rewards, _, _ = venv.step(actions) - obs = obs[0] - orig_obs = venv.get_original_obs()[0] - rewards = rewards[0] - orig_rewards = venv.get_original_reward()[0] - - assert np.all(orig_rewards == 1) - assert orig_obs.shape == obs.shape - assert orig_rewards.dtype == rewards.dtype - assert not np.array_equal(orig_obs, obs) - assert not np.array_equal(orig_rewards, rewards) - np.testing.assert_allclose(venv.normalize_obs(orig_obs), obs) - np.testing.assert_allclose(venv.normalize_reward(orig_rewards), rewards) - - -def test_normalize_external(): - venv = _make_warmstart_cartpole() - - rewards = np.array([1, 1]) - norm_rewards = venv.normalize_reward(rewards) - assert norm_rewards.shape == rewards.shape - # Episode return is almost always >= 1 in CartPole. So reward should shrink. - assert np.all(norm_rewards < 1) - - # Don't have any guarantees on obs normalization, except shape, really. 
- obs = np.array([0, 0, 0, 0]) - norm_obs = venv.normalize_obs(obs) - assert obs.shape == norm_obs.shape - - -@pytest.mark.parametrize("model_class", [DDPG, DQN, SAC, TD3]) -def test_offpolicy_normalization(model_class): - if model_class == DQN: - env = DummyVecEnv([lambda: gym.make('CartPole-v1')]) - else: - env = DummyVecEnv([make_env]) - env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.) - - model = model_class('MlpPolicy', env, verbose=1) - model.learn(total_timesteps=1000) - # Check getter - assert isinstance(model.get_vec_normalize_env(), VecNormalize) - - -def test_sync_vec_normalize(): - env = DummyVecEnv([make_env]) - - assert unwrap_vec_normalize(env) is None - - env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.) - - assert isinstance(unwrap_vec_normalize(env), VecNormalize) - - env = VecFrameStack(env, 1) - - assert isinstance(unwrap_vec_normalize(env), VecNormalize) - - eval_env = DummyVecEnv([make_env]) - eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.) 
- eval_env = VecFrameStack(eval_env, 1) - - env.reset() - # Initialize running mean - for _ in range(100): - env.step([env.action_space.sample()]) - - obs = env.reset() - original_obs = env.get_original_obs() - dummy_rewards = np.random.rand(10) - # Normalization must be different - assert not np.allclose(obs, eval_env.normalize_obs(original_obs)) - - sync_envs_normalization(env, eval_env) - - # Now they must be synced - assert np.allclose(obs, eval_env.normalize_obs(original_obs)) - assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards)) - - -def test_mpi_runningmeanstd(): - """Test RunningMeanStd object for MPI""" - # Test will be run in CI before pytest is run - pytest.skip() - return_code = subprocess.call(['mpirun', '--allow-run-as-root', '-np', '2', - 'python', '-m', 'stable_baselines.common.mpi_running_mean_std']) - _assert_eq(return_code, 0) - - -def test_mpi_moments(): - """ - test running mean std function - """ - # Test will be run in CI before pytest is run - pytest.skip() - subprocess.check_call(['mpirun', '--allow-run-as-root', '-np', '3', 'python', '-c', - 'from stable_baselines.common.mpi_moments ' - 'import _helper_runningmeanstd; _helper_runningmeanstd()']) diff --git a/stable_baselines/deepq/experiments/__init__.py b/txt.txt similarity index 100% rename from stable_baselines/deepq/experiments/__init__.py rename to txt.txt