From 401ce0d6574541ab7435340a93f07dd30226b142 Mon Sep 17 00:00:00 2001 From: Tomasz Panek Date: Tue, 5 Nov 2019 12:20:59 +0100 Subject: [PATCH 001/428] Fix issue with incompatible python packages versions (boto and dateutil) --- docker/dockerfiles/rl_coach/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/dockerfiles/rl_coach/Dockerfile b/docker/dockerfiles/rl_coach/Dockerfile index 19fbfcae..881d6373 100644 --- a/docker/dockerfiles/rl_coach/Dockerfile +++ b/docker/dockerfiles/rl_coach/Dockerfile @@ -25,7 +25,7 @@ RUN mkdir /robo RUN mkdir /robo/container # install dependencies -RUN pip install -U sagemaker-python-sdk/ awscli ipython pandas "urllib3==1.22" "pyyaml==3.13" +RUN pip install -U sagemaker-python-sdk/ awscli ipython pandas "urllib3==1.22" "pyyaml==3.13" "python-dateutil==2.8.0" # set command -CMD (cd rl_coach; ipython rl_deepracer_coach_robomaker.py) \ No newline at end of file +CMD (cd rl_coach; ipython rl_deepracer_coach_robomaker.py) From 034d97c3b65331f08a525d1aa447150a83ceaaad Mon Sep 17 00:00:00 2001 From: Dan Jarvis Date: Mon, 11 Nov 2019 14:20:26 -0500 Subject: [PATCH 002/428] Fix path to work on Mac OSX Root level folders have restricted access, which might be one reason why this didn't work. This change makes the `rl_coach` target consistent with the `robomaker` target. 
--- docker/docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 0b9b0740..8c4fa355 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -24,7 +24,7 @@ services: - '//var/run/docker.sock:/var/run/docker.sock' - '../deepracer/sagemaker-python-sdk:/deepracer/sagemaker-python-sdk' - '../deepracer/rl_coach:/deepracer/rl_coach' - - '/robo/container:/robo/container' + - './volumes/robo/container:/robo/container' depends_on: - minio robomaker: From 4326efd6d05086162f6f16767864fcdf30a6aab2 Mon Sep 17 00:00:00 2001 From: Dan Jarvis Date: Mon, 11 Nov 2019 15:00:10 -0500 Subject: [PATCH 003/428] Add helpful error messages when expected commands are missing ``` $ ./start.sh minio is up-to-date Starting rl_coach ... done robomaker is up-to-date waiting for containers to start up... Error: skip showing sagemaker logs because gnome-terminal is not installed. This is normal if you are on a different OS to Ubuntu. Error: vncviewer is not present on the PATH. Make sure you install it and add it to the PATH. ``` --- scripts/training/start.sh | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 52e599da..5d55ec0a 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -8,8 +8,24 @@ echo 'waiting for containers to start up...' #sleep for 20 seconds to allow the containers to start sleep 15 -echo 'attempting to pull up sagemaker logs...' -gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }')" +if ! [ -x "$(command -v gnome-terminal)" ]; +then + echo 'Error: skip showing sagemaker logs because gnome-terminal is not installed. This is normal if you are on a different OS to Ubuntu.' +else + echo 'attempting to pull up sagemaker logs...' 
+ gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }')" +fi -echo 'attempting to open vnc viewer...' -gnome-terminal -x sh -c "!!; vncviewer localhost:8080" \ No newline at end of file +if ! [ -x "$(command -v gnome-terminal)" ]; +then + if ! [ -x "$(command -v vncviewer)" ]; + then + echo 'Error: vncviewer is not present on the PATH. Make sure you install it and add it to the PATH.' + else + echo 'attempting to open vnc viewer...' + vncviewer localhost:8080 + fi +else + echo 'attempting to open vnc viewer...' + gnome-terminal -x sh -c "!!; vncviewer localhost:8080" +fi \ No newline at end of file From 92b5d712c6dbd131be142e9e76e5e8035193a51f Mon Sep 17 00:00:00 2001 From: AbdElrhman Mohamed Date: Thu, 21 Nov 2019 20:21:06 +0200 Subject: [PATCH 004/428] Check for existence of rl-deepracer-sagemaker --- scripts/training/set-last-run-to-pretrained.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/training/set-last-run-to-pretrained.sh b/scripts/training/set-last-run-to-pretrained.sh index c4a13e9c..27070ac1 100755 --- a/scripts/training/set-last-run-to-pretrained.sh +++ b/scripts/training/set-last-run-to-pretrained.sh @@ -1,4 +1,12 @@ #!/usr/bin/env bash +Folder=rl-deepracer-sagemaker +if [ -d ../../docker/volumes/minio/bucket/rl-deepracer-sagemaker ]; +then + echo "Folder $Folder exist." + rm -rf ../../docker/volumes/minio/bucket/rl-deepracer-pretrained + mv ../../docker/volumes/minio/bucket/rl-deepracer-sagemaker ../../docker/volumes/minio/bucket/rl-deepracer-pretrained + echo "Done." 
-rm -rf ../../docker/volumes/minio/bucket/rl-deepracer-pretrained -mv ../../docker/volumes/minio/bucket/rl-deepracer-sagemaker ../../docker/volumes/minio/bucket/rl-deepracer-pretrained \ No newline at end of file +else + echo "Folder $Folder does not exist" +fi From bf27ef620475f84d62836e8bd09fb7263edf60c3 Mon Sep 17 00:00:00 2001 From: Alex Schultz Date: Tue, 26 Nov 2019 06:20:13 -0500 Subject: [PATCH 005/428] adding docker file used to create log analysis container --- docker/dockerfiles/log-analysis/Dockerfile | 25 +++++++++++++++++++ .../dockerfiles/log-analysis/requirements.txt | 11 ++++++++ 2 files changed, 36 insertions(+) create mode 100644 docker/dockerfiles/log-analysis/Dockerfile create mode 100644 docker/dockerfiles/log-analysis/requirements.txt diff --git a/docker/dockerfiles/log-analysis/Dockerfile b/docker/dockerfiles/log-analysis/Dockerfile new file mode 100644 index 00000000..362f0da4 --- /dev/null +++ b/docker/dockerfiles/log-analysis/Dockerfile @@ -0,0 +1,25 @@ +FROM nvcr.io/nvidia/tensorflow:19.06-py2 + +LABEL maintainer="alex.c.schultz@gmail.com" \ + description="Log Analysis for DeepRacer Training Run" \ + version=1.0 + +# Container Dependency Setup +RUN apt-get update +RUN apt-get upgrade -y +RUN apt-get install software-properties-common libsm6 libxext6 libxrender-dev git wget python3-pip -y +RUN pip3 install virtualenv +RUN virtualenv venv +WORKDIR /workspace/venv +RUN mkdir -p /workspace/venv/data /workspace/venv/logs + +# Install common pip packages +WORKDIR /workspace/venv +COPY requirements.txt ./ +RUN ls -lrt +RUN . /workspace/venv/bin/activate && pip install -r requirements.txt + +EXPOSE 8888 +VOLUME ["/workspace/venv/data", "/workspace/venv/logs", "/root/.aws"] +CMD . 
/workspace/venv/bin/activate && jupyter lab --ip=0.0.0.0 --port=8888 --allow-root + diff --git a/docker/dockerfiles/log-analysis/requirements.txt b/docker/dockerfiles/log-analysis/requirements.txt new file mode 100644 index 00000000..e93ed28a --- /dev/null +++ b/docker/dockerfiles/log-analysis/requirements.txt @@ -0,0 +1,11 @@ +jupyterlab +matplotlib +numpy +opencv-python +scipy +pandas +sklearn +shapely +boto3 +awscli +plotly From c8d24f6e8fd61156ce38733daccd19fe5145c006 Mon Sep 17 00:00:00 2001 From: Alex Schultz Date: Tue, 26 Nov 2019 06:50:38 -0500 Subject: [PATCH 006/428] fixing path for volume mount --- docker/docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 8c4fa355..0b9b0740 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -24,7 +24,7 @@ services: - '//var/run/docker.sock:/var/run/docker.sock' - '../deepracer/sagemaker-python-sdk:/deepracer/sagemaker-python-sdk' - '../deepracer/rl_coach:/deepracer/rl_coach' - - './volumes/robo/container:/robo/container' + - '/robo/container:/robo/container' depends_on: - minio robomaker: From 586a7399f19441adce391f2f83abdf2021932e8f Mon Sep 17 00:00:00 2001 From: Alex Schultz Date: Tue, 26 Nov 2019 06:52:32 -0500 Subject: [PATCH 007/428] Revert "Fix `rl_coach` Docker volume path to work on Mac OSX" From 266c330419d73d185fad6d645add382007b6a57c Mon Sep 17 00:00:00 2001 From: Alex Lenk Date: Wed, 25 Dec 2019 21:28:43 +0100 Subject: [PATCH 008/428] Added --http_proxy, --https_proxy, --no_proxy as command line args to init.sh in order to build rl_coach with proxy and add the parameters to ./docker/.env file. 
--- init.sh | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/init.sh b/init.sh index 2a63272e..e7dd383b 100755 --- a/init.sh +++ b/init.sh @@ -21,10 +21,23 @@ ln -s deepracer/rl_coach/rl_deepracer_coach_robomaker.py rl_deepracer_coach_robo # replace the contents of the rl_deepracer_coach_robomaker.py file with the gpu specific version (this is also where you can edit the hyperparameters) # TODO this file should be genrated from a gui before running training -cat overrides/rl_deepracer_coach_robomaker.py > rl_deepracer_coach_robomaker.py +cat overrides/rl_deepracer_coach_robomaker.py > rl_deepracer_coach_robomaker.py + +#set proxys if required +for arg in "$@"; +do + IFS='=' read -ra part <<< "$arg" + if [ "${part[0]}" == "--http_proxy" ] || [ "${part[0]}" == "--https_proxy" ] || [ "${part[0]}" == "--no_proxy" ]; then + var=${part[0]:2}=${part[1]} + envs=$'\n'"${var}${envs}" + args="${args} --build-arg ${var}" + fi +done + +echo -e "$envs" >> ./docker/.env # build rl-coach image with latest code from crr0004's repo -docker build -f ./docker/dockerfiles/rl_coach/Dockerfile -t aschu/rl_coach deepracer/ +docker build ${args} -f ./docker/dockerfiles/rl_coach/Dockerfile -t aschu/rl_coach deepracer/ # copy reward function and model-metadata files to bucket cp deepracer/custom_files/* docker/volumes/minio/bucket/custom_files/ @@ -35,4 +48,4 @@ docker network ls | grep -q $SAGEMAKER_NW if [ $? 
-ne 0 ] then docker network create $SAGEMAKER_NW -fi +fi \ No newline at end of file From 966dffb6370521c0eb80c6de70bcdd104d8b0beb Mon Sep 17 00:00:00 2001 From: Alex Lenk Date: Thu, 26 Dec 2019 16:30:54 +0100 Subject: [PATCH 009/428] Removed unnecessary proxy settings in .env file --- init.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/init.sh b/init.sh index e7dd383b..d1add86c 100755 --- a/init.sh +++ b/init.sh @@ -29,13 +29,10 @@ do IFS='=' read -ra part <<< "$arg" if [ "${part[0]}" == "--http_proxy" ] || [ "${part[0]}" == "--https_proxy" ] || [ "${part[0]}" == "--no_proxy" ]; then var=${part[0]:2}=${part[1]} - envs=$'\n'"${var}${envs}" args="${args} --build-arg ${var}" fi done -echo -e "$envs" >> ./docker/.env - # build rl-coach image with latest code from crr0004's repo docker build ${args} -f ./docker/dockerfiles/rl_coach/Dockerfile -t aschu/rl_coach deepracer/ @@ -48,4 +45,4 @@ docker network ls | grep -q $SAGEMAKER_NW if [ $? -ne 0 ] then docker network create $SAGEMAKER_NW -fi \ No newline at end of file +fi From bc05c321b61842e1c5a8c27b6d198d73cf52a4e5 Mon Sep 17 00:00:00 2001 From: Alex Lenk Date: Thu, 26 Dec 2019 17:00:33 +0100 Subject: [PATCH 010/428] Added additional check to ./scripts/training/start.sh and ./scripts/evaluation/start.sh to avoid errors when using a machine that has ubuntu desktop installed but training is executed via ssh. --- scripts/evaluation/start.sh | 16 ++++++++++++---- scripts/training/start.sh | 37 ++++++++++++++++++++++--------------- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/scripts/evaluation/start.sh b/scripts/evaluation/start.sh index d788af5c..e9e694f0 100755 --- a/scripts/evaluation/start.sh +++ b/scripts/evaluation/start.sh @@ -11,8 +11,16 @@ echo 'waiting for containers to start up...' #sleep for 20 seconds to allow the containers to start sleep 15 -echo 'attempting to pull up sagemaker logs...' 
-gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }')" +if xhost >& /dev/null; +then + echo "Display exists, using gnome-terminal for logs and starting vncviewer." -echo 'attempting to open vnc viewer...' -gnome-terminal -x sh -c "!!; vncviewer localhost:8080" + echo 'attempting to pull up sagemaker logs...' + gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }')" + + echo 'attempting to open vnc viewer...' + gnome-terminal -x sh -c "!!; vncviewer localhost:8080" +else + echo "No display. Falling back to CLI mode." + docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }') +fi diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 5d55ec0a..501a8e80 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -8,24 +8,31 @@ echo 'waiting for containers to start up...' #sleep for 20 seconds to allow the containers to start sleep 15 -if ! [ -x "$(command -v gnome-terminal)" ]; +if xhost >& /dev/null; then - echo 'Error: skip showing sagemaker logs because gnome-terminal is not installed. This is normal if you are on a different OS to Ubuntu.' -else - echo 'attempting to pull up sagemaker logs...' - gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }')" -fi + echo "Display exists, using gnome-terminal for logs and starting vncviewer." + if ! [ -x "$(command -v gnome-terminal)" ]; + then + echo 'Error: skip showing sagemaker logs because gnome-terminal is not installed. This is normal if you are on a different OS to Ubuntu.' + else + echo 'attempting to pull up sagemaker logs...' + gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }')" + fi -if ! [ -x "$(command -v gnome-terminal)" ]; -then - if ! [ -x "$(command -v vncviewer)" ]; + if ! [ -x "$(command -v gnome-terminal)" ]; then - echo 'Error: vncviewer is not present on the PATH. Make sure you install it and add it to the PATH.' 
+ if ! [ -x "$(command -v vncviewer)" ]; + then + echo 'Error: vncviewer is not present on the PATH. Make sure you install it and add it to the PATH.' + else + echo 'attempting to open vnc viewer...' + vncviewer localhost:8080 + fi else echo 'attempting to open vnc viewer...' - vncviewer localhost:8080 + gnome-terminal -x sh -c "!!; vncviewer localhost:8080" fi -else - echo 'attempting to open vnc viewer...' - gnome-terminal -x sh -c "!!; vncviewer localhost:8080" -fi \ No newline at end of file +else + echo "No display. Falling back to CLI mode." + docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }') +fi From 66a22d9ecb77f6d04896d56d2b2b64c869fbf7ed Mon Sep 17 00:00:00 2001 From: larsll <59617571+larsll@users.noreply.github.com> Date: Sun, 19 Jan 2020 20:13:05 +0100 Subject: [PATCH 011/428] Initial setup for Azure (#1) * Altering docker setup; adjusted images etc. * Fixing paths in the upload script * Reading in Minio/Azure keys from environment and not from file * Enabling log analysis * New location of custom_files * Copy over all tracks into log-analysis * Analysis and workbook shared folder * Ignore VS Code configuration * Increasing trials! * Tweaking evaluation * Fixing dockerfile to include CUDA9 and CUDNN * Adding more packages to log-analysis * Update upload-snapshot.sh * Updating to have hyperparameters externally * Moving freq. changed customization into sep files. * Moving the adjusted robomaker script. * Creating the patch for metrics * Adding submodules as proper submodules * Moved log-analysis submodule to correct commit * Fixes during testing * Reverting change to Dockerfile * Documentation and environment variables. * Update to ensure we find the env file * activate.sh needs to be run with source * Updated link * Fixing link * Merging * Creating alias based scripts in activate.sh * Added usage information. 
--- .gitignore | 11 +- .gitmodules | 8 + README.md | 244 ++---------------- activate.sh | 83 ++++++ aws-deepracer-workshops | 1 + deepracer | 1 + defaults/deepracer_racetrack_env.py.patch | 33 +++ defaults/hyperparameters.json | 15 ++ .../rl_deepracer_coach_robomaker.py | 110 ++++---- defaults/template-run.env | 10 + docker/.env | 13 +- docker/docker-compose.yml | 37 ++- .../deepracer_robomaker/Dockerfile | 60 +++++ docker/dockerfiles/log-analysis/Dockerfile | 10 +- .../dockerfiles/log-analysis/requirements.txt | 3 + init.sh | 33 ++- scripts/evaluation/start.sh | 14 +- scripts/log-analysis/start.sh | 5 +- scripts/log-analysis/stop.sh | 2 +- scripts/training/start.sh | 4 +- scripts/training/upload-snapshot.sh | 33 ++- 21 files changed, 407 insertions(+), 323 deletions(-) create mode 100644 .gitmodules create mode 100644 activate.sh create mode 160000 aws-deepracer-workshops create mode 160000 deepracer create mode 100644 defaults/deepracer_racetrack_env.py.patch create mode 100644 defaults/hyperparameters.json rename {overrides => defaults}/rl_deepracer_coach_robomaker.py (56%) create mode 100644 defaults/template-run.env create mode 100644 docker/dockerfiles/deepracer_robomaker/Dockerfile diff --git a/.gitignore b/.gitignore index 0df9f2eb..db124a68 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ - -aws-deepracer-workshops/ - -deepracer/ - +.vscode/ +custom_files/ +analysis/ docker/volumes/ +recording/ +recording +current-run.env diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..47b24b2d --- /dev/null +++ b/.gitmodules @@ -0,0 +1,8 @@ +[submodule "deepracer"] + path = deepracer + url = https://github.com/crr0004/deepracer.git +[submodule "aws-deepracer-workshops"] + path = aws-deepracer-workshops + url = https://github.com/breadcentric/aws-deepracer-workshops.git + branch = enhance-log-analysis + diff --git a/README.md b/README.md index 2830428a..b5d0b83a 100644 --- a/README.md +++ b/README.md @@ -1,235 +1,27 @@ -# 
DeepRacer-For-Dummies -Provides a quick and easy way to get up and running with a local deepracer training environment using Docker Compose. -This repo just creates a wrapper around the amazing work done by Chris found here: https://github.com/crr0004/deepracer -Please refer to his repo to understand more about what's going on under the covers. +# DeepRacer-For-Azure +Provides a quick and easy way to get up and running with a DeepRacer training environment in Azure, using the [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu). -# Video Instructions -[![Video Instructions](https://img.youtube.com/vi/CFNcKmtVRSI/0.jpg)](https://www.youtube.com/watch?v=CFNcKmtVRSI) +This repo is an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer) -# Getting Started +Please refer to Chris' repo to understand more about what's going on under the covers. ---- -#### Prerequisites +Main differences to the work done by Alex is: +* Local S3 instance (minio) is now using an Azure Storage Account / Blob Storage as a back-end. This allows for access between sesssions using e.g. Storage Explorer (https://azure.microsoft.com/en-us/features/storage-explorer/). +* Robomaker and Log Analysis containers are extended with required drivers to enable Tensorflow to use the GPU. +* Configuration has been reorganized : + * `custom_files/hyperparameters.json` stores the runtime hyperparameters, which logically belongs together with the model_metadata.json and rewards.py files. + * `current-run.env` contains user session configuration (pretraining, track etc.) as well as information about where to upload your model (S3 bucket and prefix). + * `docker/.env` remains the home for more static configuration. This is not expected to change between sessions. 
+* Uses the Azure temporary drive on `/mnt` to store robomaker files (checkpoints, logs); these will be deleted between runs, but provides ~300GB of 'free' storage as long as the VM is running. Archiving of logs and additional checkpoint files required if desired. -* This project is specifically built to run on Ubuntu 18.04 with an **Nvidia GPU**. It is assumed you already have **CUDA/CUDNN** installed and configured. +## Installation -* You also need to have **Docker** installed as well as the **Nvidia-Docker** runtime. +A step by step [installation guide](https://github.com/larsll/deepracer-for-azure/wiki/Install-DeepRacer-in-Azure) is available. -* You should have an AWS account with the **AWS cli** installed. The credentials should be located in your home directory (~/.aws/credentials) +TODO: Create an end-to-end installation script. -* ensure you have **vncviewer** installed +## Usage -#### NOTE: If you already have these prerequisites setup then you can simply run the init.sh script described in the **Initialization** section. If you are setting everything up for the first time, then the information provided here can help you to get your environment ready to use this repo. +Before every session run `source activate.sh` to ensure that the environment variables are set correctly. This also creates a set of aliases/commands that makes it easier to operate the setup. - -#### Local Environment Setup - -If you are running Windows and would like to use this repo, you will need to modify the process to get everything to run on Windows (not recommended as you will not be able to take advantage of the GPU during training) Many users have found it useful to dual-boot (Windows/Linux). There are many tutorials online for how to do this. You can follow the instructions provided below as guidance. 
- -##### * Installing Ubuntu 18.04 with Windows 10 - -https://medium.com/bigdatarepublic/dual-boot-windows-and-linux-aa281c3c01f9 - -When it gets to the Disk Management part, to make space for your Ubuntu installation, followed this guide and specifically look at the 2nd method (MiniTool Partition Wizard): - -https://win10faq.com/shrink-partition-windows-10/?source=post_page--------------------------- - -======= -##### * Installing the AWS CLI - - pip install -U awscli - -Then Follow this: https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html - -##### * Installing Docker-ce (steps from https://docs.docker.com/install/linux/docker-ce/ubuntu/ ) - - sudo apt-get remove docker docker-engine docker.io containerd runc - sudo apt-get update - - sudo apt-get install \ - apt-transport-https \ - ca-certificates \ - curl \ - gnupg-agent \ - software-properties-common - - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - - sudo apt-key fingerprint 0EBFCD88 - - sudo add-apt-repository \ - "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) \ - stable" - - sudo apt-get update - sudo apt-get install docker-ce docker-ce-cli containerd.io - -Verify docker works - - sudo docker run hello-world - -##### 3. Installing Docker-compose (from https://docs.docker.com/compose/install/#install-compose ) - - curl -L https://github.com/docker/compose/releases/download/1.24.1/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose - sudo chmod +x /usr/local/bin/docker-compose - -Verify installation - - docker-compose --version - -###### NOTE: You can also choose to install docker-compose via another package manager (i.e. pip or conda), but if you do, make sure to do so in a virtual env. Many OS’s have python system packages that conflict with docker-compose dependencies. 
###### - -Additionally, make sure your user-id can run docker without sudo (from https://docs.docker.com/install/linux/linux-postinstall/ ) - - sudo groupadd docker - sudo usermod -aG docker $USER - -Log out and log back in so that your group membership is re-evaluated. - -And configure Docker to start on boot. - - sudo systemctl enable docker - -##### * Preparing for nvidia-docker - -The NVIDIA Container Toolkit allows users to build and run GPU accelerated Docker containers. -Nvidia-docker essentially exposes the GPU to the containers to use: https://github.com/NVIDIA/nvidia-docker - -You may want to note what you have installed currently. - - sudo apt list --installed | grep nvidia - -Then prepare for clean installation of Nvidia drivers. - - sudo apt-get purge nvidia* - -##### Installing nvidia-docker runtime (from https://github.com/NVIDIA/nvidia-docker/wiki/Installation-(version-2.0) ) - - distribution=$(. /etc/os-release;echo $ID$VERSION_ID) - curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - - curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list - sudo apt-get update - sudo apt-get install nvidia-docker2 - sudo pkill -SIGHUP dockerd - -##### * Installing the proper nvidia drivers - -Check for driver version here according to your GPU(s): https://www.nvidia.com/Download/index.aspx?lang=en-us -In the dropdown for OS, choose “show all OS’s” to see if there are Ubuntu specific choices. Otherwise choose Linux. -If you get a dropdown for “cuda toolkit”, choose 10.0) - - sudo add-apt-repository ppa:graphics-drivers - sudo apt-get update - sudo apt install nvidia-driver-410 && sudo reboot - -###### NOTE: 410 is a driver version that is compatible with the GPU I selected on the Nvidia website. 
###### - -Verify the driver installation: - - nvidia-smi - nvcc --version - -##### * Installing VNC viewer on your local machine - -This doc is straight forward: https://www.techspot.com/downloads/5760-vnc-viewer.html - -##### * Installing the Nvidia deep learning libraries (CUDA/CUDNN) for GPU hardware: - -This guide goes through how to install CUDA & CUDNN : https://medium.com/@zhanwenchen/install-cuda-and-cudnn-for-tensorflow-gpu-on-ubuntu-79306e4ac04e - -###### NOTE: You can apparently use Anaconda instead to install CUDA/CUDNN. I have not tried this, however some users have and have reported that this method is much easier. If you use this approach, you will need to first install Anaconda. Once installed you can then use the conda package manager to install the desired versions of CUDA and cuDNN. The following installation configuration has been reported to work together successfully ###### - -##### Downloading Anaconda - - sudo apt-get update -y && sudo apt-get upgrade -y - cd /tmp/ - sudo wget https://repo.anaconda.com/archive/Anaconda3-2019.03-Linux-x86_64.sh - -##### Installing Anaconda - - bash Anaconda3-2019.03-Linux-x86_64.sh - "yes" for using the default directory location - “yes” for running conda init - -##### Activating Anaconda - - source ~/.bashrc - -##### Verifying the conda package manager works - - conda list - -##### Installing CUDA/CUDNN - - conda install cudnn==7.3.1 && conda install -c fragcolor cuda10.0 - - -#### Initialization (After all prerequisites have been installed) - - -##### 11. Run Init.sh from this repo (refer to the rest of this doc for script details) - -In a command prompt, simply run "./init.sh". -This will set everything up so you can run the deepracer local training environment. - - -**init.sh** performs these steps so you don't have to do them manually: -1. Clones Chris's repo: https://github.com/crr0004/deepracer.git -2. Does a mkdir -p ~/.sagemaker && cp config.yaml ~/.sagemaker -3. 
Sets the image name in rl_deepracer_coach_robomaker.py to "crr0004/sagemaker-rl-tensorflow:nvidia” -4. Also sets the instance_type in rl_deepracer_coach_robomaker.py to “local_gpu” -5. Copies the reward.py and model-metadata files into your Minio bucket - - -To start or stop the local deepracer training, use the scripts found in the scripts directory. - -Here is a brief overview of the available scripts: - -#### Scripts - -* training - * start.sh - * starts the whole environment using docker compose - * it will also open a terminal window where you can monitor the log output from the sagemaker training directory - * it will also automatically open vncviewer so you can watch the training happening in Gazebo - * stop.sh - * stops the whole environment - * automatically finds and stops the training container which was started from the sagemaker container - * upload-snapshot.sh - * uploads a specific snapshot to S3 in AWS. If no checkpoint is provided, it attempts to retrieve the latest snapshot - * set-last-run-to-pretrained.sh - * renames the last training run directory from ***rl-deepracer-sagemaker*** to ***rl-deepracer-pretrained*** so that you can use it as a starting point for a new training run. - * delete-last-run.sh - * (WARNING: this script deletes files on your system. I take no responsibility for any resulting actions by running this script. Please look at what the script is doing before running it so that you understand) - * deletes the last training run including all of the snapshots and log files. You will need sudo to run this command. 
- - -* evaluation - * start.sh - * starts the whole environment using docker compose to run an evaluation run - * it will also open a terminal window where you can monitor the log output from the sagemaker training directory - * it will also automatically open vncviewer so you can watch the training happening in Gazebo - * stop.sh - * stops the whole environment - * automatically finds and stops the training container which was started from the sagemaker container - -* log-analysis - * start.sh - * starts a container with Nvidia-Docker running jupyter labs with the log analysis notebooks which were originally provided by AWS and then extended by Tomasz Ptak - * the logs from robomaker are automatically mounted in the container so you don't have to move any files around - * in order to get to the container, look at the log output from when it starts. You need to grab the URL including the token query parameter and then paste it into the brower at **localhost:8888**. - * stop.sh - * stops the log-analysis container - - -#### Hyperparameters - -You can modify training hyperparameters from the file **rl_deepracer_coach_robomaker.py**. - -#### Action Space & Reward Function - -The action-space and reward function files are located in the **deepracer-for-dummies/docker/volumes/minio/bucket/custom_files** directory - -#### Track Selection - -The track selection is controled via an environment variable in the **.env** file located in the **deepracer-for-dummies/docker** directory \ No newline at end of file +Ensure that the configuration files are uploaded into the bucket `dr-upload-local-custom-files`. Start a training with `dr-start-local-training`. 
diff --git a/activate.sh b/activate.sh new file mode 100644 index 00000000..5226a486 --- /dev/null +++ b/activate.sh @@ -0,0 +1,83 @@ +#!/bin/bash +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +# create directory structure for docker volumes +sudo mkdir -p /mnt/deepracer /mnt/deepracer/recording +sudo chown -R $(id -u):$(id -g) /mnt/deepracer + +if [[ -f "$DIR/current-run.env" ]] +then + export $(grep -v '^#' current-run.env | xargs) +else + echo "File current-run.env does not exist." + exit 1 +fi + +export AZ_ACCESS_KEY_ID=$(aws --profile $AZURE_S3_PROFILE configure get aws_access_key_id | xargs) +export AZ_SECRET_ACCESS_KEY=$(aws --profile $AZURE_S3_PROFILE configure get aws_secret_access_key | xargs) + +function dr-upload-local-custom-files { + eval $(cat $DIR/docker/.env | grep 'MODEL_S3_BUCKET\|MODEL_CUSTOM_FILES_S3_PREFIX' | xargs ) + eval CUSTOM_TARGET=$(echo s3://$MODEL_S3_BUCKET/$MODEL_CUSTOM_FILES_S3_PREFIX) + ROBOMAKER_COMMAND="" docker-compose -f $DIR/docker/docker-compose.yml up -d minio + echo "Uploading files to $CUSTOM_TARGET" + aws --profile $AZURE_S3_PROFILE --endpoint-url http://localhost:9000 s3 sync custom_files/ s3://$MODEL_S3_BUCKET/$MODEL_CUSTOM_FILES_S3_PREFIX +} + +function dr-start-local-training { + bash -c "cd $DIR/scripts/training && ./start.sh" +} + +function dr-stop-local-training { + ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/training && ./stop.sh" +} + +function dr-start-local-evaluation { + bash -c "cd $DIR/scripts/evaluation && ./start.sh" +} + +function dr-stop-local-evaluation { + ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/evaluation && ./stop.sh" +} + +function dr-start-loganalysis { + ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/log-analysis && ./start.sh" +} + +function dr-stop-loganalysis { + eval LOG_ANALYSIS_ID=$(docker ps | awk ' /log-analysis/ { print $1 }') + if [ -n "$LOG_ANALYSIS_ID" ]; then + ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/log-analysis && ./stop.sh" + else + echo 
"Log-analysis is not running." + fi + +} + +function dr-logs-sagemaker { + eval SAGEMAKER_ID=$(docker ps | awk ' /sagemaker/ { print $1 }') + if [ -n "$SAGEMAKER_ID" ]; then + docker logs -f $SAGEMAKER_ID + else + echo "Sagemaker is not running." + fi +} + +function dr-logs-robomaker { + eval ROBOMAKER_ID=$(docker ps | awk ' /robomaker/ { print $1 }') + if [ -n "$ROBOMAKER_ID" ]; then + docker logs -f $ROBOMAKER_ID + else + echo "Robomaker is not running." + fi +} + +function dr-logs-loganalysis { + eval LOG_ANALYSIS_ID=$(docker ps | awk ' /log-analysis/ { print $1 }') + if [ -n "$LOG_ANALYSIS_ID" ]; then + docker logs -f $LOG_ANALYSIS_ID + else + echo "Log-analysis is not running." + fi + +} \ No newline at end of file diff --git a/aws-deepracer-workshops b/aws-deepracer-workshops new file mode 160000 index 00000000..757fa87f --- /dev/null +++ b/aws-deepracer-workshops @@ -0,0 +1 @@ +Subproject commit 757fa87f2b5246a5ce451158dfdaedc99a3927b0 diff --git a/deepracer b/deepracer new file mode 160000 index 00000000..86605d39 --- /dev/null +++ b/deepracer @@ -0,0 +1 @@ +Subproject commit 86605d39c19923fe85fc96b54b720122ecb75d03 diff --git a/defaults/deepracer_racetrack_env.py.patch b/defaults/deepracer_racetrack_env.py.patch new file mode 100644 index 00000000..57e78d8a --- /dev/null +++ b/defaults/deepracer_racetrack_env.py.patch @@ -0,0 +1,33 @@ +diff --git a/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py b/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py +index 569b33c..9f43d62 100644 +--- a/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py ++++ b/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py +@@ -467,7 +467,7 @@ 
class DeepRacerRacetrackEnv(gym.Env): + # Update metrics based on job type + if self.job_type == TRAINING_JOB: + self.send_reward_to_cloudwatch(self.reward_in_episode) +- self.update_training_metrics() ++ self.update_training_metrics(progress) + self.write_metrics_to_s3() + if self.is_training_done(): + self.cancel_simulation_job() +@@ -485,13 +485,18 @@ class DeepRacerRacetrackEnv(gym.Env): + eval_metric['trial'] = int(self.number_of_trials) + self.metrics.append(eval_metric) + +- def update_training_metrics(self): ++ def update_training_metrics(self, progress = 1): + training_metric = {} + training_metric['reward_score'] = int(round(self.reward_in_episode)) + training_metric['metric_time'] = int(round(time.time() * 1000)) + training_metric['start_time'] = int(round(self.simulation_start_time * 1000)) + training_metric['elapsed_time_in_milliseconds'] = int(round((time.time() - self.simulation_start_time) * 1000)) + training_metric['episode'] = int(self.episodes) ++ training_metric['completion_percentage'] = int(progress) ++ if int(progress) == 100: ++ training_metric['episode_status'] = "Lap complete" ++ else: ++ training_metric['episode_status'] = "Off track" + self.metrics.append(training_metric) + + def write_metrics_to_s3(self): diff --git a/defaults/hyperparameters.json b/defaults/hyperparameters.json new file mode 100644 index 00000000..25ad617f --- /dev/null +++ b/defaults/hyperparameters.json @@ -0,0 +1,15 @@ +{ + "batch_size": 64, + "beta_entropy": 0.01, + "discount_factor": 0.995, + "e_greedy_value": 1.0, + "epsilon_steps": 10000, + "exploration_type": "categorical", + "loss_type": "huber", + "lr": 0.0003, + "num_episodes_between_training": 20, + "num_epochs": 10, + "stack_size": 1, + "term_cond_avg_score": 350.0, + "term_cond_max_episodes": 1000 + } \ No newline at end of file diff --git a/overrides/rl_deepracer_coach_robomaker.py b/defaults/rl_deepracer_coach_robomaker.py similarity index 56% rename from overrides/rl_deepracer_coach_robomaker.py rename 
to defaults/rl_deepracer_coach_robomaker.py index 9c96c7c6..43b53bd5 100644 --- a/overrides/rl_deepracer_coach_robomaker.py +++ b/defaults/rl_deepracer_coach_robomaker.py @@ -9,6 +9,8 @@ import glob import re import subprocess +import json +import io from IPython.display import Markdown from time import gmtime, strftime sys.path.append("common") @@ -16,31 +18,48 @@ from sagemaker.rl import RLEstimator, RLToolkit, RLFramework from markdown_helper import * - +def str2bool(v): + return v.lower() in ("yes", "true", "t", "1") # S3 bucket boto_session = boto3.session.Session( - aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", "minio"), + aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", "minio"), aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", "miniokey"), region_name=os.environ.get("AWS_REGION", "us-east-1")) s3Client = boto_session.resource("s3", use_ssl=False, -endpoint_url=os.environ.get("S3_ENDPOINT_URL", "http://127.0.0.1:9000")) - -sage_session = sagemaker.local.LocalSession(boto_session=boto_session, s3_client=s3Client) -s3_bucket = os.environ.get("MODEL_S3_BUCKET", "bucket") #sage_session.default_bucket() -s3_output_path = 's3://{}/'.format(s3_bucket) # SDK appends the job name and output folder + endpoint_url=os.environ.get("S3_ENDPOINT_URL", "http://127.0.0.1:9000")) +s3Client_c = boto_session.client("s3", use_ssl=False, + endpoint_url=os.environ.get("S3_ENDPOINT_URL", "http://127.0.0.1:9000")) + +sage_session = sagemaker.local.LocalSession( + boto_session=boto_session, s3_client=s3Client) +# sage_session.default_bucket() +s3_bucket = os.environ.get("MODEL_S3_BUCKET", "bucket") +pretrained = str2bool(os.environ.get("PRETRAINED", False)) +s3_pretrained_bucket = os.environ.get("PRETRAINED_S3_BUCKET", "bucket") +s3_pretrained_prefix = os.environ.get( + "PRETRAINED_S3_PREFIX", "rl-deepracer-pretrained") +# SDK appends the job name and output folder +s3_output_path = 's3://{}/'.format(s3_bucket) + +# Hyperparameters +hyperparameter_file = 
os.environ.get( + "HYPERPARAMETER_FILE_S3_KEY", "custom_files/hyperparameters.json") # ### Define Variables # We define variables such as the job prefix for the training jobs and s3_prefix for storing metadata required for synchronization between the training and simulation jobs -job_name_prefix = 'rl-deepracer' # this should be MODEL_S3_PREFIX, but that already ends with "-sagemaker" +# this should be MODEL_S3_PREFIX, but that already ends with "-sagemaker" +job_name_prefix = 'rl-deepracer' # create unique job name tm = gmtime() -job_name = s3_prefix = job_name_prefix + "-sagemaker"#-" + strftime("%y%m%d-%H%M%S", tm) #Ensure S3 prefix contains SageMaker -s3_prefix_robomaker = job_name_prefix + "-robomaker"#-" + strftime("%y%m%d-%H%M%S", tm) #Ensure that the S3 prefix contains the keyword 'robomaker' +# -" + strftime("%y%m%d-%H%M%S", tm) #Ensure S3 prefix contains SageMaker +job_name = s3_prefix = job_name_prefix + "-sagemaker" +# -" + strftime("%y%m%d-%H%M%S", tm) #Ensure that the S3 prefix contains the keyword 'robomaker' +s3_prefix_robomaker = job_name_prefix + "-robomaker" # Duration of job in seconds (5 hours) @@ -49,8 +68,10 @@ aws_region = sage_session.boto_region_name if aws_region not in ["us-west-2", "us-east-1", "eu-west-1"]: - raise Exception("This notebook uses RoboMaker which is available only in US East (N. Virginia), US West (Oregon) and EU (Ireland). Please switch to one of these regions.") -print("Model checkpoints and other metadata will be stored at: {}{}".format(s3_output_path, job_name)) + raise Exception( + "This notebook uses RoboMaker which is available only in US East (N. Virginia), US West (Oregon) and EU (Ireland). 
Please switch to one of these regions.") +print("Model checkpoints and other metadata will be stored at: {}{}".format( + s3_output_path, job_name)) s3_location = "s3://%s/%s" % (s3_bucket, s3_prefix) @@ -61,13 +82,13 @@ # Training> Name=main_level/agent, Worker=0, Episode=19, Total reward=-102.88, Steps=19019, Training iteration=1 {'Name': 'reward-training', 'Regex': '^Training>.*Total reward=(.*?),'}, - + # Policy training> Surrogate loss=-0.32664725184440613, KL divergence=7.255815035023261e-06, Entropy=2.83156156539917, training epoch=0, learning_rate=0.00025 {'Name': 'ppo-surrogate-loss', 'Regex': '^Policy training>.*Surrogate loss=(.*?),'}, - {'Name': 'ppo-entropy', + {'Name': 'ppo-entropy', 'Regex': '^Policy training>.*Entropy=(.*?),'}, - + # Testing> Name=main_level/agent, Worker=0, Episode=19, Total reward=1359.12, Steps=20015, Training iteration=2 {'Name': 'reward-testing', 'Regex': '^Testing>.*Total reward=(.*?),'}, @@ -75,7 +96,7 @@ # We use the RLEstimator for training RL jobs. -# +# # 1. Specify the source directory which has the environment file, preset and training code. # 2. Specify the entry point as the training code # 3. Specify the choice of RL toolkit and framework. This automatically resolves to the ECR path for the RL Container. @@ -94,6 +115,28 @@ image_name = "crr0004/sagemaker-rl-tensorflow:{}".format( "nvidia" if gpu_available else "console") +# Prepare hyperparameters +hyperparameters_core = { + "s3_bucket": s3_bucket, + "s3_prefix": s3_prefix, + "aws_region": aws_region, + "model_metadata_s3_key": "s3://{}/custom_files/model_metadata.json".format(s3_bucket), + "RLCOACH_PRESET": RLCOACH_PRESET +} + +if pretrained == True: + hyperparameters_core['pretrained_s3_bucket'] = "{}".format( + s3_pretrained_bucket) + hyperparameters_core['pretrained_s3_prefix'] = s3_pretrained_prefix + +# Downloading the hyperparameter file from our local bucket. 
+hyperparameter_data = io.BytesIO() +s3Client_c.download_fileobj( + s3_bucket, hyperparameter_file, hyperparameter_data) +hyperparameters_nn = json.loads(hyperparameter_data.getvalue().decode("utf-8")) +hyperparameters = {**hyperparameters_core, **hyperparameters_nn} +print("Configured following hyperparameters") +print(hyperparameters) estimator = RLEstimator(entry_point="training_worker.py", source_dir='src', dependencies=["common/sagemaker_rl"], @@ -101,40 +144,19 @@ toolkit_version='0.11', framework=RLFramework.TENSORFLOW, sagemaker_session=sage_session, - #bypass sagemaker SDK validation of the role + # bypass sagemaker SDK validation of the role role="aaa/", train_instance_type=instance_type, train_instance_count=1, output_path=s3_output_path, base_job_name=job_name_prefix, image_name=image_name, - train_max_run=job_duration_in_seconds, # Maximum runtime in seconds - hyperparameters={"s3_bucket": s3_bucket, - "s3_prefix": s3_prefix, - "aws_region": aws_region, - "model_metadata_s3_key": "s3://{}/custom_files/model_metadata.json".format(s3_bucket), - "RLCOACH_PRESET": RLCOACH_PRESET, - "batch_size": 64, - "num_epochs": 10, - "stack_size" : 1, - "lr" : 0.00035, - "exploration_type" : "categorical", - "e_greedy_value" : 0.05, - "epsilon_steps" : 10000, - "beta_entropy" : 0.01, - "discount_factor" : 0.999, - "loss_type": "mean squared error", - "num_episodes_between_training" : 20, - "term_cond_max_episodes" : 100000, - "term_cond_avg_score" : 100000 - #"pretrained_s3_bucket": "{}".format(s3_bucket), - #"pretrained_s3_prefix": "rl-deepracer-pretrained" - # "loss_type": "mean squared error", - }, - metric_definitions = metric_definitions, + train_max_run=job_duration_in_seconds, # Maximum runtime in seconds + hyperparameters=hyperparameters, + metric_definitions=metric_definitions, s3_client=s3Client - #subnets=default_subnets, # Required for VPC mode - #security_group_ids=default_security_groups, # Required for VPC mode - ) + # subnets=default_subnets, # Required 
for VPC mode + # security_group_ids=default_security_groups, # Required for VPC mode + ) estimator.fit(job_name=job_name, wait=False) diff --git a/defaults/template-run.env b/defaults/template-run.env new file mode 100644 index 00000000..af689ef7 --- /dev/null +++ b/defaults/template-run.env @@ -0,0 +1,10 @@ +WORLD_NAME=Vegas_track +CHANGE_START_POSITION=True +PRETRAINED=True +PRETRAINED_S3_PREFIX=rl-sagemaker-pretrained +PRETRAINED_S3_BUCKET=bucket +UPLOAD_S3_PROFILE=default +UPLOAD_S3_BUCKET=aws-deepracer-mybucketidinreal +UPLOAD_S3_PREFIX=DeepRacer-SageMaker-RoboMaker-comm-prefix +UPLOAD_MODEL_NAME=mymodelname +AZURE_S3_PROFILE=azure \ No newline at end of file diff --git a/docker/.env b/docker/.env index da93ce80..61bc534d 100644 --- a/docker/.env +++ b/docker/.env @@ -1,4 +1,4 @@ -WORLD_NAME=AWS_track +WORLD_NAME=Vegas_track LOCAL_ENV_VAR_JSON_PATH=env_vars.json MINIO_ACCESS_KEY=minio MINIO_SECRET_KEY=miniokey @@ -10,6 +10,9 @@ ROS_AWS_REGION=us-east-1 AWS_REGION=us-east-1 MODEL_S3_PREFIX=rl-deepracer-sagemaker MODEL_S3_BUCKET=bucket +PRETRAINED=True +PRETRAINED_S3_PREFIX=rl-deepracer-pretrained +PRETRAINED_S3_BUCKET=bucket LOCAL=True MARKOV_PRESET_FILE=deepracer.py XAUTHORITY=/root/.Xauthority @@ -21,14 +24,18 @@ SAGEMAKER_SHARED_S3_PREFIX=rl-deepracer-sagemaker SAGEMAKER_SHARED_S3_BUCKET=bucket TRAINING_JOB_ARN=aaa METRICS_S3_BUCKET=bucket -METRICS_S3_OBJECT_KEY=custom_files/metric.json +METRICS_S3_OBJECT_KEY=metrics/metric.json ROBOMAKER_RUN_TYPE=distributed_training +CHANGE_START_POSITION=True TARGET_REWARD_SCORE=100000 NUMBER_OF_EPISODES=20000 ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID=aaa AWS_ROBOMAKER_SIMULATION_JOB_ID=aaa +MODEL_CUSTOM_FILES_S3_PREFIX=custom_files/ MODEL_METADATA_FILE_S3_KEY=custom_files/model_metadata.json +HYPERPARAMETER_FILE_S3_KEY=custom_files/hyperparameters.json REWARD_FILE_S3_KEY=custom_files/reward.py BUNDLE_CURRENT_PREFIX=/app/robomaker-deepracer/simulation_ws/ GPU_AVAILABLE=True -NUMBER_OF_TRIALS=5 \ No newline at end of file 
+NUMBER_OF_TRIALS=6 +SM_NUM_GPU=1 diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 0b9b0740..b4868935 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -11,13 +11,34 @@ services: ports: - "9000:9000" container_name: minio - command: server /data + command: gateway azure + restart: unless-stopped + environment: + - MINIO_ACCESS_KEY=${AZ_ACCESS_KEY_ID} + - MINIO_SECRET_KEY=${AZ_SECRET_ACCESS_KEY} + - AWS_ACCESS_KEY_ID=${AZ_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${AZ_SECRET_ACCESS_KEY} + minio-log: + image: minio/minio + ports: + - "9001:9001" + container_name: minio-log + command: server --address :9001 /data volumes: - - ./volumes/minio:/data + - /mnt/deepracer/robo/checkpoint:/data restart: unless-stopped - env_file: .env + environment: + - MINIO_ACCESS_KEY=${AZ_ACCESS_KEY_ID} + - MINIO_SECRET_KEY=${AZ_SECRET_ACCESS_KEY} rl_coach: image: aschu/rl_coach + environment: + - WORLD_NAME + - PRETRAINED + - PRETRAINED_S3_PREFIX + - PRETRAINED_S3_BUCKET + - AWS_ACCESS_KEY_ID=${AZ_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${AZ_SECRET_ACCESS_KEY} env_file: .env container_name: rl_coach volumes: @@ -28,15 +49,21 @@ services: depends_on: - minio robomaker: - image: crr0004/deepracer_robomaker:console + image: larsll/deepracer_robomaker:latest command: ["${ROBOMAKER_COMMAND}"] volumes: - ../deepracer/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src:/app/robomaker-deepracer/simulation_ws/src - - ./volumes/robo/checkpoint:/root/.ros/ + - /mnt/deepracer/robo/checkpoint:/root/.ros/ + - /mnt/deepracer/recording:/mnt/recording ports: - "8080:5900" container_name: robomaker restart: unless-stopped + environment: + - CHANGE_START_POSITION + - WORLD_NAME + - AWS_ACCESS_KEY_ID=${AZ_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${AZ_SECRET_ACCESS_KEY} env_file: .env depends_on: - rl_coach diff --git a/docker/dockerfiles/deepracer_robomaker/Dockerfile b/docker/dockerfiles/deepracer_robomaker/Dockerfile new file mode 100644 
index 00000000..fcc850c9 --- /dev/null +++ b/docker/dockerfiles/deepracer_robomaker/Dockerfile @@ -0,0 +1,60 @@ + +FROM crr0004/deepracer_robomaker:console +LABEL maintainer "Lars Ludvigsen " + +RUN apt-get update && apt-get install -y --no-install-recommends \ +ca-certificates apt-transport-https gnupg-curl && \ + NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ + NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ + apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ + echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ + echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ + echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ + apt-get purge --auto-remove -y gnupg-curl && \ +rm -rf /var/lib/apt/lists/* + +ENV CUDA_VERSION 9.0.176 +ENV CUDNN_VERSION 7.6.4.38 +ENV CUDA_PKG_VERSION 9-0=$CUDA_VERSION-1 + +# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a +RUN apt-get update && apt-get install -y --no-install-recommends \ + cuda-cudart-$CUDA_PKG_VERSION \ + && ln -s cuda-9.0 /usr/local/cuda + +# Required for nvidia-docker v1 +LABEL com.nvidia.volumes.needed="nvidia_driver" +LABEL com.nvidia.cuda.version="${CUDA_VERSION}" +LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" + +RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ + echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf + +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} +ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 + +# nvidia-container-runtime +ENV NVIDIA_VISIBLE_DEVICES all +ENV 
NVIDIA_DRIVER_CAPABILITIES compute,utility +ENV NVIDIA_REQUIRE_CUDA "cuda>=9.0 " + +ENV NCCL_VERSION 2.4.8 + +RUN apt-get install -y --no-install-recommends \ + cuda-libraries-$CUDA_PKG_VERSION \ + cuda-cublas-9-0=9.0.176.4-1 \ + cuda-cusolver-$CUDA_PKG_VERSION \ + libnccl2=$NCCL_VERSION-1+cuda9.0 && \ + apt-mark hold libnccl2 + +RUN apt-get install -y --no-install-recommends \ + libcudnn7=$CUDNN_VERSION-1+cuda9.0 && \ + apt-mark hold libcudnn7 && \ + rm -rf /var/lib/apt/lists/* + + +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} +ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda-9.0/targets/x86_64-linux/lib + +RUN pip install tensorflow-gpu==1.11.0 diff --git a/docker/dockerfiles/log-analysis/Dockerfile b/docker/dockerfiles/log-analysis/Dockerfile index 362f0da4..f8806ad7 100644 --- a/docker/dockerfiles/log-analysis/Dockerfile +++ b/docker/dockerfiles/log-analysis/Dockerfile @@ -1,6 +1,6 @@ -FROM nvcr.io/nvidia/tensorflow:19.06-py2 +FROM tensorflow/tensorflow:latest-gpu -LABEL maintainer="alex.c.schultz@gmail.com" \ +LABEL maintainer="lars@ludvig.no" \ description="Log Analysis for DeepRacer Training Run" \ version=1.0 @@ -9,9 +9,9 @@ RUN apt-get update RUN apt-get upgrade -y RUN apt-get install software-properties-common libsm6 libxext6 libxrender-dev git wget python3-pip -y RUN pip3 install virtualenv -RUN virtualenv venv +RUN virtualenv /workspace/venv WORKDIR /workspace/venv -RUN mkdir -p /workspace/venv/data /workspace/venv/logs +RUN mkdir -p /workspace/venv/data /workspace/venv/logs /workspace/venv/workbook # Install common pip packages WORKDIR /workspace/venv @@ -20,6 +20,6 @@ RUN ls -lrt RUN . /workspace/venv/bin/activate && pip install -r requirements.txt EXPOSE 8888 -VOLUME ["/workspace/venv/data", "/workspace/venv/logs", "/root/.aws"] +VOLUME ["/workspace/venv/data", "/workspace/venv/logs", "/root/.aws", "/workspace/venv/workbook"] CMD . 
/workspace/venv/bin/activate && jupyter lab --ip=0.0.0.0 --port=8888 --allow-root diff --git a/docker/dockerfiles/log-analysis/requirements.txt b/docker/dockerfiles/log-analysis/requirements.txt index e93ed28a..d1eb298f 100644 --- a/docker/dockerfiles/log-analysis/requirements.txt +++ b/docker/dockerfiles/log-analysis/requirements.txt @@ -9,3 +9,6 @@ shapely boto3 awscli plotly +tensorflow==1.15.0 +Pillow +python-resize-image \ No newline at end of file diff --git a/init.sh b/init.sh index 2a63272e..19bdfca1 100755 --- a/init.sh +++ b/init.sh @@ -1,33 +1,44 @@ #!/usr/bin/env bash +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +cd $DIR # create directory structure for docker volumes -mkdir -p docker/volumes/minio/bucket/custom_files \ - docker/volumes/robo/checkpoint +sudo mkdir -p /mnt/deepracer /mnt/deepracer/recording /mnt/deepracer/robo/checkpoint +sudo chown -R $(id -u):$(id -g) /mnt/deepracer +mkdir -p $DIR/docker/volumes # create symlink to current user's home .aws directory # NOTE: AWS cli must be installed for this to work # https://docs.aws.amazon.com/cli/latest/userguide/install-linux-al2017.html -ln -s $(eval echo "~${USER}")/.aws docker/volumes/ +ln -s $(eval echo "~${USER}")/.aws $DIR/docker/volumes/ # grab local training deepracer repo from crr0004 and log analysis repo from vreadcentric -git clone --recurse-submodules https://github.com/crr0004/deepracer.git +# Now as submodules! +# git clone --recurse-submodules https://github.com/crr0004/deepracer.git +# git clone https://github.com/breadcentric/aws-deepracer-workshops.git && cd aws-deepracer-workshops && git checkout enhance-log-analysis && cd .. +git submodule init && git submodule update -git clone https://github.com/breadcentric/aws-deepracer-workshops.git && cd aws-deepracer-workshops && git checkout enhance-log-analysis && cd .. 
+ln -sf ./aws-deepracer-workshops/log-analysis ./docker/volumes/log-analysis +cp deepracer/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/deepracer_simulation/routes/* docker/volumes/log-analysis/tracks/ -ln -s ../../aws-deepracer-workshops/log-analysis ./docker/volumes/log-analysis +# copy rewardfunctions +mkdir -p custom_files analysis +cp deepracer/custom_files/* custom_files/ +cp defaults/hyperparameters.json custom_files/ # setup symlink to rl-coach config file -ln -s deepracer/rl_coach/rl_deepracer_coach_robomaker.py rl_deepracer_coach_robomaker.py +ln -f defaults/rl_deepracer_coach_robomaker.py deepracer/rl_coach/rl_deepracer_coach_robomaker.py +cd deepracer/ && patch simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py < ../defaults/deepracer_racetrack_env.py.patch && cd .. # replace the contents of the rl_deepracer_coach_robomaker.py file with the gpu specific version (this is also where you can edit the hyperparameters) # TODO this file should be genrated from a gui before running training -cat overrides/rl_deepracer_coach_robomaker.py > rl_deepracer_coach_robomaker.py +cp defaults/template-run.env current-run.env # build rl-coach image with latest code from crr0004's repo docker build -f ./docker/dockerfiles/rl_coach/Dockerfile -t aschu/rl_coach deepracer/ - -# copy reward function and model-metadata files to bucket -cp deepracer/custom_files/* docker/volumes/minio/bucket/custom_files/ +docker build -f ./docker/dockerfiles/deepracer_robomaker/Dockerfile -t larsll/deepracer_robomaker +docker build -f ./docker/dockerfiles/log-analysis/Dockerfile -t larsll/log-analysis +docker pull crr0004/sagemaker-rl-tensorflow:nvidia # create the network sagemaker-local if it doesn't exit SAGEMAKER_NW='sagemaker-local' diff --git a/scripts/evaluation/start.sh b/scripts/evaluation/start.sh index d788af5c..278cd9e8 100755 --- a/scripts/evaluation/start.sh +++ 
b/scripts/evaluation/start.sh @@ -1,6 +1,6 @@ # set evaluation specific environment variables export ROBOMAKER_COMMAND="./run.sh build evaluation.launch" -export METRICS_S3_OBJECT_KEY=custom_files/eval_metrics.json +export METRICS_S3_OBJECT_KEY=metrics/eval_metrics.json export NUMBER_OF_TRIALS=5 docker-compose -f ../../docker/docker-compose.yml up -d @@ -11,8 +11,10 @@ echo 'waiting for containers to start up...' #sleep for 20 seconds to allow the containers to start sleep 15 -echo 'attempting to pull up sagemaker logs...' -gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }')" - -echo 'attempting to open vnc viewer...' -gnome-terminal -x sh -c "!!; vncviewer localhost:8080" +if ! [ -x "$(command -v gnome-terminal)" ]; +then + docker logs -f robomaker +else + echo 'attempting to pull up robomaker logs...' + gnome-terminal -x sh -c "!!; docker logs -f docker logs -f robomaker" +fi diff --git a/scripts/log-analysis/start.sh b/scripts/log-analysis/start.sh index c0730070..2d8148f0 100755 --- a/scripts/log-analysis/start.sh +++ b/scripts/log-analysis/start.sh @@ -3,5 +3,6 @@ nvidia-docker run --rm -it -p "8888:8888" \ -v `pwd`/../../docker/volumes/log-analysis:/workspace/venv/data \ -v `pwd`/../../docker/volumes/.aws:/root/.aws \ --v `pwd`/../../docker/volumes/robo/checkpoint/log:/workspace/venv/logs \ - aschu/log-analysis +-v /mnt/deepracer/robo/checkpoint/log:/workspace/venv/logs \ +-v `pwd`/../../analysis:/workspace/venv/workbook \ + larsll/log-analysis diff --git a/scripts/log-analysis/stop.sh b/scripts/log-analysis/stop.sh index e9c3cab3..f69a56c2 100755 --- a/scripts/log-analysis/stop.sh +++ b/scripts/log-analysis/stop.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash docker stop $(docker ps | awk ' /analysis/ { print $1 }') -docker rm $(docker ps -a | awk ' /analysis/ { print $1 }') +#docker rm $(docker ps -a | awk ' /analysis/ { print $1 }') diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 5d55ec0a..59124b1e 
100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -6,11 +6,11 @@ docker-compose -f ../../docker/docker-compose.yml up -d echo 'waiting for containers to start up...' #sleep for 20 seconds to allow the containers to start -sleep 15 +sleep 20 if ! [ -x "$(command -v gnome-terminal)" ]; then - echo 'Error: skip showing sagemaker logs because gnome-terminal is not installed. This is normal if you are on a different OS to Ubuntu.' + docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }') else echo 'attempting to pull up sagemaker logs...' gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }')" diff --git a/scripts/training/upload-snapshot.sh b/scripts/training/upload-snapshot.sh index 99a767e9..51c25c09 100755 --- a/scripts/training/upload-snapshot.sh +++ b/scripts/training/upload-snapshot.sh @@ -1,10 +1,13 @@ -#!/usr/bin/env bash +#!/bin/bash -S3_BUCKET={replace with your own S3 bucket name} +S3_BUCKET=${UPLOAD_S3_BUCKET} +S3_PREFIX=${UPLOAD_S3_PREFIX} -S3_PREFIX={replace with your own S3 prefix} - -MODEL_DIR=$(pwd)/../../docker/volumes/minio/bucket/rl-deepracer-sagemaker/model/ +WORK_DIR=/mnt/deepracer +MODEL_DIR=${WORK_DIR}/rl-deepracer-sagemaker/model/ +MODEL_REWARD=$(pwd)/../../custom_files/reward.py +MODEL_HYPER=$(pwd)/../../custom_files/hyperparameters.json +MODEL_NAME=$UPLOAD_MODEL_NAME display_usage() { echo -e "\nUsage:\n./upload-snapshot.sh -c checkpoint \n" @@ -47,7 +50,8 @@ else echo "Checkpoint supplied: ["${CHECKPOINT}"]" fi -mkdir -p checkpoint +mkdir -p $WORK_DIR/tmp_upload && rm -rf $WORK_DIR/tmp_upload/* + MODEL_FILE=$MODEL_DIR"model_"$CHECKPOINT".pb" METADATA_FILE=$MODEL_DIR"model_metadata.json" @@ -56,19 +60,19 @@ if test ! -f "$MODEL_FILE"; then echo "$MODEL_FILE doesn't exist" return 1 else - cp $MODEL_FILE checkpoint/ + cp $MODEL_FILE $WORK_DIR/tmp_upload/ fi if test ! 
-f "$METADATA_FILE"; then echo "$METADATA_FILE doesn't exist" return 1 else - cp $METADATA_FILE checkpoint/ + cp $METADATA_FILE $WORK_DIR/tmp_upload/ fi for i in $( find $MODEL_DIR -type f -name $CHECKPOINT"*" ); do - cp $i checkpoint/ + cp $i $WORK_DIR/tmp_upload/ done ls ${MODEL_DIR}${CHECKPOINT}_Step-*.ckpt.index | xargs -n 1 basename | sed 's/[.][^ ]*//' @@ -76,16 +80,19 @@ ls ${MODEL_DIR}${CHECKPOINT}_Step-*.ckpt.index | xargs -n 1 basename | sed 's/[. CONTENT=$(ls ${MODEL_DIR}${CHECKPOINT}_Step-*.ckpt.index | xargs -n 1 basename | sed 's/[.][^ ]*//') echo ${CONTENT} -echo 'model_checkpoint_path: "'${CONTENT}'.ckpt"' > checkpoint/checkpoint +echo 'model_checkpoint_path: "'${CONTENT}'.ckpt"' > $WORK_DIR/tmp_upload/checkpoint # # upload files to s3 -for filename in checkpoint/*; do +for filename in $WORK_DIR/tmp_upload/*; do aws s3 cp $filename s3://$S3_BUCKET/$S3_PREFIX/model/ done +aws s3 cp $MODEL_HYPER s3://$S3_BUCKET/$S3_PREFIX/ip/ +# tar -czvf $WORK_DIR/$MODEL_NAME-${CHECKPOINT}-checkpoint.tar.gz $WORK_DIR/checkpoint/* -tar -czvf ${CHECKPOINT}-checkpoint.tar.gz checkpoint/* +# # upload meta-data +aws s3 cp $METADATA_FILE s3://$S3_BUCKET/model-metadata/$MODEL_NAME/ +aws s3 cp $MODEL_REWARD s3://$S3_BUCKET/reward-functions/$MODEL_NAME/reward_function.py -rm -rf checkpoint echo 'done uploading model!' From 92a97fcd36342de15468ab58d13c46c80ee5f12b Mon Sep 17 00:00:00 2001 From: larsll <59617571+larsll@users.noreply.github.com> Date: Sun, 19 Jan 2020 20:30:21 +0100 Subject: [PATCH 012/428] Merge upstream changes (#2) * Added --http_proxy, --https_proxy, --no_proxy as command line args to init.sh in order to build rl_coach with proxy and add the parameters to ./docker/.env file. * Removed unnecessary proxy settings in .env file * Added additional check to ./scripts/training/start.sh and ./scripts/evaluation/start.sh to avoid errors when using a machine that has ubuntu desktop installed but training is executed via ssh. 
Co-authored-by: Alex Lenk Co-authored-by: Alex Schultz --- init.sh | 12 +++++++++++- scripts/evaluation/start.sh | 16 +++++++++++----- scripts/training/start.sh | 37 ++++++++++++++++++++++--------------- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/init.sh b/init.sh index 19bdfca1..1452f2c0 100755 --- a/init.sh +++ b/init.sh @@ -34,8 +34,18 @@ cd deepracer/ && patch simulation/aws-robomaker-sample-application-deepracer/sim # TODO this file should be genrated from a gui before running training cp defaults/template-run.env current-run.env +#set proxys if required +for arg in "$@"; +do + IFS='=' read -ra part <<< "$arg" + if [ "${part[0]}" == "--http_proxy" ] || [ "${part[0]}" == "--https_proxy" ] || [ "${part[0]}" == "--no_proxy" ]; then + var=${part[0]:2}=${part[1]} + args="${args} --build-arg ${var}" + fi +done + # build rl-coach image with latest code from crr0004's repo -docker build -f ./docker/dockerfiles/rl_coach/Dockerfile -t aschu/rl_coach deepracer/ +docker build ${args} -f ./docker/dockerfiles/rl_coach/Dockerfile -t aschu/rl_coach deepracer/ docker build -f ./docker/dockerfiles/deepracer_robomaker/Dockerfile -t larsll/deepracer_robomaker docker build -f ./docker/dockerfiles/log-analysis/Dockerfile -t larsll/log-analysis docker pull crr0004/sagemaker-rl-tensorflow:nvidia diff --git a/scripts/evaluation/start.sh b/scripts/evaluation/start.sh index 278cd9e8..b0813437 100755 --- a/scripts/evaluation/start.sh +++ b/scripts/evaluation/start.sh @@ -11,10 +11,16 @@ echo 'waiting for containers to start up...' #sleep for 20 seconds to allow the containers to start sleep 15 -if ! [ -x "$(command -v gnome-terminal)" ]; +if xhost >& /dev/null; then - docker logs -f robomaker -else - echo 'attempting to pull up robomaker logs...' - gnome-terminal -x sh -c "!!; docker logs -f docker logs -f robomaker" + echo "Display exists, using gnome-terminal for logs and starting vncviewer." + + echo 'attempting to pull up sagemaker logs...' 
+ gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }')" + + echo 'attempting to open vnc viewer...' + gnome-terminal -x sh -c "!!; vncviewer localhost:8080" +else + echo "No display. Falling back to CLI mode." + docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }') fi diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 59124b1e..824ab420 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -8,24 +8,31 @@ echo 'waiting for containers to start up...' #sleep for 20 seconds to allow the containers to start sleep 20 -if ! [ -x "$(command -v gnome-terminal)" ]; +if xhost >& /dev/null; then - docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }') -else - echo 'attempting to pull up sagemaker logs...' - gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }')" -fi + echo "Display exists, using gnome-terminal for logs and starting vncviewer." + if ! [ -x "$(command -v gnome-terminal)" ]; + then + echo 'Error: skip showing sagemaker logs because gnome-terminal is not installed. This is normal if you are on a different OS to Ubuntu.' + else + echo 'attempting to pull up sagemaker logs...' + gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }')" + fi -if ! [ -x "$(command -v gnome-terminal)" ]; -then - if ! [ -x "$(command -v vncviewer)" ]; + if ! [ -x "$(command -v gnome-terminal)" ]; then - echo 'Error: vncviewer is not present on the PATH. Make sure you install it and add it to the PATH.' + if ! [ -x "$(command -v vncviewer)" ]; + then + echo 'Error: vncviewer is not present on the PATH. Make sure you install it and add it to the PATH.' + else + echo 'attempting to open vnc viewer...' + vncviewer localhost:8080 + fi else echo 'attempting to open vnc viewer...' - vncviewer localhost:8080 + gnome-terminal -x sh -c "!!; vncviewer localhost:8080" fi -else - echo 'attempting to open vnc viewer...' 
- gnome-terminal -x sh -c "!!; vncviewer localhost:8080" -fi \ No newline at end of file +else +    echo "No display. Falling back to CLI mode." +    docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }') +fi From ff71f1464f52cc80005c6c4c5336c6bbb3e8e96c Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 19 Jan 2020 19:58:29 +0000 Subject: [PATCH 013/428] Fix activate.sh to be runnable from any location. --- activate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/activate.sh b/activate.sh index 5226a486..a72472ac 100644 --- a/activate.sh +++ b/activate.sh @@ -7,7 +7,7 @@ sudo chown -R $(id -u):$(id -g) /mnt/deepracer if [[ -f "$DIR/current-run.env" ]] then - export $(grep -v '^#' current-run.env | xargs) + export $(grep -v '^#' $DIR/current-run.env | xargs) else echo "File current-run.env does not exist." exit 1 From 43a87e44dac61c717b36d734af0a89a8b7501906 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Mon, 20 Jan 2020 20:05:49 +0000 Subject: [PATCH 014/428] Initial refactoring and fixes for AWS Compatibility --- activate.sh | 13 ++++---- defaults/rl_deepracer_coach_robomaker.py | 11 +++++-- docker/.env | 2 +- docker/docker-compose.yml | 38 +++++++++++++++++------- init.sh | 8 ++--- 5 files changed, 48 insertions(+), 24 deletions(-) diff --git a/activate.sh b/activate.sh index a72472ac..92da6849 100644 --- a/activate.sh +++ b/activate.sh @@ -3,7 +3,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" # create directory structure for docker volumes sudo mkdir -p /mnt/deepracer /mnt/deepracer/recording -sudo chown -R $(id -u):$(id -g) /mnt/deepracer +sudo chown $(id -u):$(id -g) /mnt/deepracer if [[ -f "$DIR/current-run.env" ]] then @@ -13,15 +13,14 @@ else exit 1 fi -export AZ_ACCESS_KEY_ID=$(aws --profile $AZURE_S3_PROFILE configure get aws_access_key_id | xargs) -export AZ_SECRET_ACCESS_KEY=$(aws --profile $AZURE_S3_PROFILE configure get aws_secret_access_key | xargs) +export
LOCAL_ACCESS_KEY_ID=$(aws --profile $LOCAL_S3_PROFILE configure get aws_access_key_id | xargs) +export LOCAL_SECRET_ACCESS_KEY=$(aws --profile $LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) function dr-upload-local-custom-files { - eval $(cat $DIR/docker/.env | grep 'MODEL_S3_BUCKET\|MODEL_CUSTOM_FILES_S3_PREFIX' | xargs ) - eval CUSTOM_TARGET=$(echo s3://$MODEL_S3_BUCKET/$MODEL_CUSTOM_FILES_S3_PREFIX) + eval CUSTOM_TARGET=$(echo s3://$LOCAL_S3_BUCKET/$LOCAL_S3_CUSTOM_FILES_PREFIX/) ROBOMAKER_COMMAND="" docker-compose -f $DIR/docker/docker-compose.yml up -d minio echo "Uploading files to $CUSTOM_TARGET" - aws --profile $AZURE_S3_PROFILE --endpoint-url http://localhost:9000 s3 sync custom_files/ s3://$MODEL_S3_BUCKET/$MODEL_CUSTOM_FILES_S3_PREFIX + aws --profile $LOCAL_S3_PROFILE s3 sync custom_files/ $CUSTOM_TARGET } function dr-start-local-training { @@ -80,4 +79,4 @@ function dr-logs-loganalysis { echo "Log-analysis is not running." fi -} \ No newline at end of file +} diff --git a/defaults/rl_deepracer_coach_robomaker.py b/defaults/rl_deepracer_coach_robomaker.py index 43b53bd5..c619644e 100644 --- a/defaults/rl_deepracer_coach_robomaker.py +++ b/defaults/rl_deepracer_coach_robomaker.py @@ -26,9 +26,16 @@ def str2bool(v): aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", "minio"), aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", "miniokey"), region_name=os.environ.get("AWS_REGION", "us-east-1")) -s3Client = boto_session.resource("s3", use_ssl=False, + +endpoint_url = os.environ.get("S3_ENDPOINT_URL", "") + +if endpoint_url == "": + s3Client = boto_session.resource("s3") + s3Client_c = boto_session.client("s3") +else: + s3Client = boto_session.resource("s3", use_ssl=False, endpoint_url=os.environ.get("S3_ENDPOINT_URL", "http://127.0.0.1:9000")) -s3Client_c = boto_session.client("s3", use_ssl=False, + s3Client_c = boto_session.client("s3", use_ssl=False, endpoint_url=os.environ.get("S3_ENDPOINT_URL", "http://127.0.0.1:9000")) 
sage_session = sagemaker.local.LocalSession( diff --git a/docker/.env b/docker/.env index 61bc534d..b0aa3e9d 100644 --- a/docker/.env +++ b/docker/.env @@ -5,7 +5,7 @@ MINIO_SECRET_KEY=miniokey AWS_ACCESS_KEY_ID=minio AWS_SECRET_ACCESS_KEY=miniokey AWS_DEFAULT_REGION=us-east-1 -S3_ENDPOINT_URL=http://minio:9000 +#S3_ENDPOINT_URL=http://minio:9000 ROS_AWS_REGION=us-east-1 AWS_REGION=us-east-1 MODEL_S3_PREFIX=rl-deepracer-sagemaker diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index b4868935..f3fffe32 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -14,10 +14,10 @@ services: command: gateway azure restart: unless-stopped environment: - - MINIO_ACCESS_KEY=${AZ_ACCESS_KEY_ID} - - MINIO_SECRET_KEY=${AZ_SECRET_ACCESS_KEY} - - AWS_ACCESS_KEY_ID=${AZ_ACCESS_KEY_ID} - - AWS_SECRET_ACCESS_KEY=${AZ_SECRET_ACCESS_KEY} + - MINIO_ACCESS_KEY=${LOCAL_ACCESS_KEY_ID} + - MINIO_SECRET_KEY=${LOCAL_SECRET_ACCESS_KEY} + - AWS_ACCESS_KEY_ID=${LOCAL_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${LOCAL_SECRET_ACCESS_KEY} minio-log: image: minio/minio ports: @@ -28,8 +28,8 @@ services: - /mnt/deepracer/robo/checkpoint:/data restart: unless-stopped environment: - - MINIO_ACCESS_KEY=${AZ_ACCESS_KEY_ID} - - MINIO_SECRET_KEY=${AZ_SECRET_ACCESS_KEY} + - MINIO_ACCESS_KEY=${LOCAL_ACCESS_KEY_ID} + - MINIO_SECRET_KEY=${LOCAL_SECRET_ACCESS_KEY} rl_coach: image: aschu/rl_coach environment: @@ -37,8 +37,17 @@ services: - PRETRAINED - PRETRAINED_S3_PREFIX - PRETRAINED_S3_BUCKET - - AWS_ACCESS_KEY_ID=${AZ_ACCESS_KEY_ID} - - AWS_SECRET_ACCESS_KEY=${AZ_SECRET_ACCESS_KEY} + - MODEL_S3_PREFIX=${LOCAL_S3_MODEL_PREFIX} + - MODEL_S3_BUCKET=${LOCAL_S3_BUCKET} + - S3_ENDPOINT_URL=${LOCAL_S3_ENDPOINT_URL} + - MODEL_METADATA_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/model_metadata.json + - REWARD_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/reward.py + - METRICS_S3_BUCKET=${LOCAL_S3_BUCKET} + - 
HYPERPARAMETER_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/hyperparameters.json + - SAGEMAKER_SHARED_S3_PREFIX=${LOCAL_S3_MODEL_PREFIX} + - SAGEMAKER_SHARED_S3_BUCKET=${LOCAL_S3_BUCKET} + - AWS_ACCESS_KEY_ID=${LOCAL_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${LOCAL_SECRET_ACCESS_KEY} env_file: .env container_name: rl_coach volumes: @@ -62,8 +71,17 @@ services: environment: - CHANGE_START_POSITION - WORLD_NAME - - AWS_ACCESS_KEY_ID=${AZ_ACCESS_KEY_ID} - - AWS_SECRET_ACCESS_KEY=${AZ_SECRET_ACCESS_KEY} + - MODEL_S3_PREFIX=${LOCAL_S3_MODEL_PREFIX} + - MODEL_S3_BUCKET=${LOCAL_S3_BUCKET} + - S3_ENDPOINT_URL=${LOCAL_S3_ENDPOINT_URL} + - MODEL_METADATA_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/model_metadata.json + - REWARD_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/reward.py + - METRICS_S3_BUCKET=${LOCAL_S3_BUCKET} + - HYPERPARAMETER_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/hyperparameters.json + - SAGEMAKER_SHARED_S3_PREFIX=${LOCAL_S3_MODEL_PREFIX} + - SAGEMAKER_SHARED_S3_BUCKET=${LOCAL_S3_BUCKET} + - AWS_ACCESS_KEY_ID=${LOCAL_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${LOCAL_SECRET_ACCESS_KEY} env_file: .env depends_on: - rl_coach diff --git a/init.sh b/init.sh index 1452f2c0..45f5753e 100755 --- a/init.sh +++ b/init.sh @@ -18,7 +18,7 @@ ln -s $(eval echo "~${USER}")/.aws $DIR/docker/volumes/ # git clone https://github.com/breadcentric/aws-deepracer-workshops.git && cd aws-deepracer-workshops && git checkout enhance-log-analysis && cd .. 
git submodule init && git submodule update -ln -sf ./aws-deepracer-workshops/log-analysis ./docker/volumes/log-analysis +ln -sf $DIR/aws-deepracer-workshops/log-analysis $DIR/docker/volumes/log-analysis cp deepracer/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/deepracer_simulation/routes/* docker/volumes/log-analysis/tracks/ # copy rewardfunctions @@ -45,9 +45,9 @@ do done # build rl-coach image with latest code from crr0004's repo -docker build ${args} -f ./docker/dockerfiles/rl_coach/Dockerfile -t aschu/rl_coach deepracer/ -docker build -f ./docker/dockerfiles/deepracer_robomaker/Dockerfile -t larsll/deepracer_robomaker -docker build -f ./docker/dockerfiles/log-analysis/Dockerfile -t larsll/log-analysis +docker build ${args} -f $DIR/docker/dockerfiles/rl_coach/Dockerfile -t aschu/rl_coach deepracer/ +docker build $DIR/docker/dockerfiles/deepracer_robomaker/Dockerfile -t larsll/deepracer_robomaker deepracer/ +docker build $DIR/docker/dockerfiles/log-analysis/Dockerfile -t larsll/log-analysis deepracer/ docker pull crr0004/sagemaker-rl-tensorflow:nvidia # create the network sagemaker-local if it doesn't exit From 58cf5f8a8ad8c935d604cbb824e63cbad3ded50b Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Mon, 20 Jan 2020 20:28:38 +0000 Subject: [PATCH 015/428] Further changes to be compatible with AWS --- activate.sh | 14 ++++++++++++-- docker/.env | 3 +-- docker/docker-compose-azure.yml | 23 +++++++++++++++++++++++ docker/docker-compose.yml | 16 ---------------- 4 files changed, 36 insertions(+), 20 deletions(-) create mode 100644 docker/docker-compose-azure.yml diff --git a/activate.sh b/activate.sh index 92da6849..335226d3 100644 --- a/activate.sh +++ b/activate.sh @@ -16,11 +16,21 @@ fi export LOCAL_ACCESS_KEY_ID=$(aws --profile $LOCAL_S3_PROFILE configure get aws_access_key_id | xargs) export LOCAL_SECRET_ACCESS_KEY=$(aws --profile $LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) +if [ $CLOUD = "Azure" ] +then + 
ENDPOINT="--endpoint-url http://localhost:9000" + +fi +export LOCAL_PROFILE_ENDPOINT_URL="--profile $LOCAL_S3_PROFILE $ENDPOINT" + function dr-upload-local-custom-files { + if [ $CLOUD = "Azure" ] + then + ROBOMAKER_COMMAND="" docker-compose -f $DIR/docker/docker-compose.yml -f $DIR/docker/docker-compose-azure.yml up -d minio + fi eval CUSTOM_TARGET=$(echo s3://$LOCAL_S3_BUCKET/$LOCAL_S3_CUSTOM_FILES_PREFIX/) - ROBOMAKER_COMMAND="" docker-compose -f $DIR/docker/docker-compose.yml up -d minio echo "Uploading files to $CUSTOM_TARGET" - aws --profile $LOCAL_S3_PROFILE s3 sync custom_files/ $CUSTOM_TARGET + aws $LOCAL_PROFILE_ENDPOINT_URL s3 sync custom_files/ $CUSTOM_TARGET } function dr-start-local-training { diff --git a/docker/.env b/docker/.env index b0aa3e9d..62c654ae 100644 --- a/docker/.env +++ b/docker/.env @@ -10,7 +10,7 @@ ROS_AWS_REGION=us-east-1 AWS_REGION=us-east-1 MODEL_S3_PREFIX=rl-deepracer-sagemaker MODEL_S3_BUCKET=bucket -PRETRAINED=True +PRETRAINED=False PRETRAINED_S3_PREFIX=rl-deepracer-pretrained PRETRAINED_S3_BUCKET=bucket LOCAL=True @@ -31,7 +31,6 @@ TARGET_REWARD_SCORE=100000 NUMBER_OF_EPISODES=20000 ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID=aaa AWS_ROBOMAKER_SIMULATION_JOB_ID=aaa -MODEL_CUSTOM_FILES_S3_PREFIX=custom_files/ MODEL_METADATA_FILE_S3_KEY=custom_files/model_metadata.json HYPERPARAMETER_FILE_S3_KEY=custom_files/hyperparameters.json REWARD_FILE_S3_KEY=custom_files/reward.py diff --git a/docker/docker-compose-azure.yml b/docker/docker-compose-azure.yml new file mode 100644 index 00000000..6534a89d --- /dev/null +++ b/docker/docker-compose-azure.yml @@ -0,0 +1,23 @@ +version: '3.7' + +services: + minio: + image: minio/minio + ports: + - "9000:9000" + container_name: minio + command: gateway azure + restart: unless-stopped + environment: + - MINIO_ACCESS_KEY=${LOCAL_ACCESS_KEY_ID} + - MINIO_SECRET_KEY=${LOCAL_SECRET_ACCESS_KEY} + - AWS_ACCESS_KEY_ID=${LOCAL_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${LOCAL_SECRET_ACCESS_KEY} + rl_coach: + 
environment: + - S3_ENDPOINT_URL=http://minio:9000 + depends_on: + - minio + robomaker: + environment: + - S3_ENDPOINT_URL=http://minio:9000 diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index f3fffe32..048e186a 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -6,18 +6,6 @@ networks: name: sagemaker-local services: - minio: - image: minio/minio - ports: - - "9000:9000" - container_name: minio - command: gateway azure - restart: unless-stopped - environment: - - MINIO_ACCESS_KEY=${LOCAL_ACCESS_KEY_ID} - - MINIO_SECRET_KEY=${LOCAL_SECRET_ACCESS_KEY} - - AWS_ACCESS_KEY_ID=${LOCAL_ACCESS_KEY_ID} - - AWS_SECRET_ACCESS_KEY=${LOCAL_SECRET_ACCESS_KEY} minio-log: image: minio/minio ports: @@ -39,7 +27,6 @@ services: - PRETRAINED_S3_BUCKET - MODEL_S3_PREFIX=${LOCAL_S3_MODEL_PREFIX} - MODEL_S3_BUCKET=${LOCAL_S3_BUCKET} - - S3_ENDPOINT_URL=${LOCAL_S3_ENDPOINT_URL} - MODEL_METADATA_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/model_metadata.json - REWARD_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/reward.py - METRICS_S3_BUCKET=${LOCAL_S3_BUCKET} @@ -55,8 +42,6 @@ services: - '../deepracer/sagemaker-python-sdk:/deepracer/sagemaker-python-sdk' - '../deepracer/rl_coach:/deepracer/rl_coach' - '/robo/container:/robo/container' - depends_on: - - minio robomaker: image: larsll/deepracer_robomaker:latest command: ["${ROBOMAKER_COMMAND}"] @@ -73,7 +58,6 @@ services: - WORLD_NAME - MODEL_S3_PREFIX=${LOCAL_S3_MODEL_PREFIX} - MODEL_S3_BUCKET=${LOCAL_S3_BUCKET} - - S3_ENDPOINT_URL=${LOCAL_S3_ENDPOINT_URL} - MODEL_METADATA_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/model_metadata.json - REWARD_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/reward.py - METRICS_S3_BUCKET=${LOCAL_S3_BUCKET} From 1a94e895f29babde41302643cc6aa179562273b4 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Mon, 20 Jan 2020 20:33:51 +0000 Subject: [PATCH 016/428] Updating the template file with new environment names --- defaults/template-run.env | 8 ++++++-- 1 file 
changed, 6 insertions(+), 2 deletions(-) diff --git a/defaults/template-run.env b/defaults/template-run.env index af689ef7..dd93028e 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -1,10 +1,14 @@ +CLOUD=Azure WORLD_NAME=Vegas_track CHANGE_START_POSITION=True -PRETRAINED=True +PRETRAINED=False PRETRAINED_S3_PREFIX=rl-sagemaker-pretrained PRETRAINED_S3_BUCKET=bucket UPLOAD_S3_PROFILE=default UPLOAD_S3_BUCKET=aws-deepracer-mybucketidinreal UPLOAD_S3_PREFIX=DeepRacer-SageMaker-RoboMaker-comm-prefix UPLOAD_MODEL_NAME=mymodelname -AZURE_S3_PROFILE=azure \ No newline at end of file +LOCAL_S3_PROFILE=default +LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker +LOCAL_S3_BUCKET=bucket +LOCAL_S3_CUSTOM_FILES_PREFIX=custom_files From 573142aaa1a48591a97843f26ad892750addcfc7 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 23 Jan 2020 13:57:38 +0000 Subject: [PATCH 017/428] Creation of prep script and use of Docker Hub --- README.md | 1 + docker/docker-compose.yml | 4 +- .../deepracer_robomaker/Dockerfile | 3 +- docker/dockerfiles/log-analysis/Dockerfile | 7 +- docker/dockerfiles/rl_coach/Dockerfile | 33 ++++---- init.sh | 20 +++-- prepare.sh | 75 +++++++++++++++++++ scripts/log-analysis/start.sh | 2 +- 8 files changed, 115 insertions(+), 30 deletions(-) create mode 100755 prepare.sh diff --git a/README.md b/README.md index b5d0b83a..5d471ad0 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ Main differences to the work done by Alex is: * `current-run.env` contains user session configuration (pretraining, track etc.) as well as information about where to upload your model (S3 bucket and prefix). * `docker/.env` remains the home for more static configuration. This is not expected to change between sessions. * Uses the Azure temporary drive on `/mnt` to store robomaker files (checkpoints, logs); these will be deleted between runs, but provides ~300GB of 'free' storage as long as the VM is running. 
Archiving of logs and additional checkpoint files required if desired. +* Robomaker, RL Coach and Log Analysis Docker images are now available as downloads in [Docker Hub](https://hub.docker.com/search?q=larsll%2Fdeepracer&type=image), which reduces the time to build a new VM. ## Installation diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index b4868935..3577801d 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -31,7 +31,7 @@ services: - MINIO_ACCESS_KEY=${AZ_ACCESS_KEY_ID} - MINIO_SECRET_KEY=${AZ_SECRET_ACCESS_KEY} rl_coach: - image: aschu/rl_coach + image: larsll/deepracer-rlcoach environment: - WORLD_NAME - PRETRAINED @@ -49,7 +49,7 @@ services: depends_on: - minio robomaker: - image: larsll/deepracer_robomaker:latest + image: larsll/deepracer-robomaker:latest command: ["${ROBOMAKER_COMMAND}"] volumes: - ../deepracer/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src:/app/robomaker-deepracer/simulation_ws/src diff --git a/docker/dockerfiles/deepracer_robomaker/Dockerfile b/docker/dockerfiles/deepracer_robomaker/Dockerfile index fcc850c9..a1822701 100644 --- a/docker/dockerfiles/deepracer_robomaker/Dockerfile +++ b/docker/dockerfiles/deepracer_robomaker/Dockerfile @@ -11,8 +11,7 @@ ca-certificates apt-transport-https gnupg-curl && \ echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get purge --auto-remove -y gnupg-curl && \ -rm -rf /var/lib/apt/lists/* + apt-get purge --auto-remove -y gnupg-curl ENV CUDA_VERSION 9.0.176 ENV CUDNN_VERSION 7.6.4.38 diff --git a/docker/dockerfiles/log-analysis/Dockerfile b/docker/dockerfiles/log-analysis/Dockerfile index f8806ad7..23f204ff 100644 --- 
a/docker/dockerfiles/log-analysis/Dockerfile +++ b/docker/dockerfiles/log-analysis/Dockerfile @@ -5,9 +5,10 @@ LABEL maintainer="lars@ludvig.no" \ version=1.0 # Container Dependency Setup -RUN apt-get update -RUN apt-get upgrade -y -RUN apt-get install software-properties-common libsm6 libxext6 libxrender-dev git wget python3-pip -y +RUN apt-get update && apt-get upgrade -y && \ + apt-get install software-properties-common libsm6 libxext6 libxrender-dev git wget python3-pip -y && \ + rm -rf /var/lib/apt/lists/* + RUN pip3 install virtualenv RUN virtualenv /workspace/venv WORKDIR /workspace/venv diff --git a/docker/dockerfiles/rl_coach/Dockerfile b/docker/dockerfiles/rl_coach/Dockerfile index 881d6373..7f175d86 100644 --- a/docker/dockerfiles/rl_coach/Dockerfile +++ b/docker/dockerfiles/rl_coach/Dockerfile @@ -1,31 +1,30 @@ FROM python:3.7.3-stretch +LABEL maintainer "Lars Ludvigsen " # install docker -RUN apt-get update -RUN apt-get -y install apt-transport-https ca-certificates curl gnupg2 software-properties-common +RUN apt-get update && apt-get install -y \ + apt-transport-https \ + ca-certificates \ + curl \ + gnupg-agent \ + software-properties-common RUN curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add - RUN add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable" - -RUN apt-get update -RUN apt-get -y install docker-ce - -# add required deepracer directories to the container -RUN mkdir /deepracer -RUN mkdir /deepracer/rl_coach -RUN mkdir /deepracer/sagemaker-python-sdk -WORKDIR /deepracer -ADD rl_coach rl_coach -ADD sagemaker-python-sdk sagemaker-python-sdk +RUN apt-get update && apt-get install -y docker-ce-cli && rm -rf /var/lib/apt/lists/* # create sagemaker configuration RUN mkdir /root/.sagemaker -COPY config.yaml /root/.sagemaker/config.yaml +COPY deepracer/config.yaml /root/.sagemaker/config.yaml +RUN mkdir /robo && mkdir /robo/container -RUN mkdir /robo -RUN mkdir /robo/container +# 
add required deepracer directories to the container +# RUN mkdir -p /deepracer && mkdir -p /deepracer/rl_coach && mkdir -p /deepracer/sagemaker-python-sdk +WORKDIR /deepracer +ADD deepracer/rl_coach /deepracer/rl_coach +ADD deepracer/sagemaker-python-sdk /deepracer/sagemaker-python-sdk # install dependencies -RUN pip install -U sagemaker-python-sdk/ awscli ipython pandas "urllib3==1.22" "pyyaml==3.13" "python-dateutil==2.8.0" +RUN pip install -U /deepracer/sagemaker-python-sdk/ awscli ipython pandas "urllib3==1.22" "pyyaml==3.13" "python-dateutil==2.8.0" # set command CMD (cd rl_coach; ipython rl_deepracer_coach_robomaker.py) diff --git a/init.sh b/init.sh index 1452f2c0..3410d5f8 100755 --- a/init.sh +++ b/init.sh @@ -1,4 +1,11 @@ #!/usr/bin/env bash +GPUS=$(docker run --gpus all nvidia/cuda:10.2-base nvidia-smi "-L" | awk '/GPU .:/' | wc -l) +if [ $? -ne 0 ] || [ $GPUS -eq 0 ] +then + echo "No GPU detected in docker. Please check setup". + exit 1 +fi + DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" cd $DIR @@ -18,7 +25,7 @@ ln -s $(eval echo "~${USER}")/.aws $DIR/docker/volumes/ # git clone https://github.com/breadcentric/aws-deepracer-workshops.git && cd aws-deepracer-workshops && git checkout enhance-log-analysis && cd .. 
git submodule init && git submodule update -ln -sf ./aws-deepracer-workshops/log-analysis ./docker/volumes/log-analysis +ln -sf $DIR/aws-deepracer-workshops/log-analysis $DIR/docker/volumes/log-analysis cp deepracer/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/deepracer_simulation/routes/* docker/volumes/log-analysis/tracks/ # copy rewardfunctions @@ -44,10 +51,13 @@ do fi done -# build rl-coach image with latest code from crr0004's repo -docker build ${args} -f ./docker/dockerfiles/rl_coach/Dockerfile -t aschu/rl_coach deepracer/ -docker build -f ./docker/dockerfiles/deepracer_robomaker/Dockerfile -t larsll/deepracer_robomaker -docker build -f ./docker/dockerfiles/log-analysis/Dockerfile -t larsll/log-analysis +# Download docker images. Change to build statements if locally built images are desired. +# docker build ${args} -f ./docker/dockerfiles/rl_coach/Dockerfile -t larsll/deepracer-rlcoach ./ +# docker build ./docker/dockerfiles/deepracer_robomaker/ -t larsll/deepracer-robomaker +# docker build ./docker/dockerfiles/log-analysis/ -t larsll/deepracer-loganalysis +docker pull larsll/deepracer-rlcoach +docker pull larsll/deepracer-robomaker +docker pull larsll/deepracer-loganalysis docker pull crr0004/sagemaker-rl-tensorflow:nvidia # create the network sagemaker-local if it doesn't exit diff --git a/prepare.sh b/prepare.sh new file mode 100755 index 00000000..0afbd4e0 --- /dev/null +++ b/prepare.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +## Do I have a GPU +GPUS=$(lspci | awk '/NVIDIA/ && /3D controller/' | wc -l) +if [ $? -ne 0 ] || [ $GPUS -eq 0 ] +then + echo "No NVIDIA GPU detected. Exiting". + exit 1 +fi + +## Do I have an additional disk? 
+ +ADDL_DISK=$(lsblk | awk '/^sdc/ {print $1}') +ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}') + +if [ -n $ADDL_DISK ] && [ -z $ADDL_PART] +then + echo "Found $ADDL_DISK, preparing it for use" + echo -e "g\nn\np\n1\n\n\nw\n" | sudo fdisk /dev/$ADDL_DISK + ADDL_DEVICE=$(echo "/dev/"$ADDL_DISK"1") + sudo mkfs.ext4 $ADDL_DEVICE + sudo mkdir -p /var/lib/docker + echo "$ADDL_DEVICE /var/lib/docker auto rw,user,auto 0 0" | sudo tee -a /etc/fstab + mount /var/lib/docker + if [ $? -ne 0 ] + then + echo "Error during preparing of additional disk. Exiting." + exit 1 + fi +elif [ -n $ADDL_DISK ] && [ -n $ADDL_PART] +then + echo "Found $ADDL_DISK - $ADDL_PART already mounted, taking no action." + +else + echo "Did not find $ADDL_DISK, taking no action." +fi + +## Adding Nvidia Drivers +sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub +sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list' +sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda_learn.list' +sudo bash -c 'apt update && apt install -y nvidia-driver-440 cuda-minimal-build-10-2 -o Dpkg::Options::="--force-overwrite"' + +## Adding AWSCli and JQ +sudo apt-get install -y awscli jq + +## Installing Docker +sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ +$(lsb_release -cs) \ +stable" +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - +sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ +$(lsb_release -cs) \ +stable" +sudo apt-get update +sudo apt-get install -y docker-ce docker-ce-cli containerd.io + +distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - +curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list + +sudo apt-get update && sudo apt-get install -y nvidia-docker2 nvidia-container-toolkit nvidia-container-runtime +jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' /etc/docker/daemon.json | sudo tee /etc/docker/daemon.json +sudo systemctl enable docker +sudo systemctl restart docker + +## Ensure user can run docker +sudo usermod -a -G docker $(id -un) + +## Installing Docker Compose +sudo curl -L https://github.com/docker/compose/releases/download/1.25.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose +sudo chmod +x /usr/local/bin/docker-compose + +## Reboot to load driver +sudo reboot diff --git a/scripts/log-analysis/start.sh b/scripts/log-analysis/start.sh index 2d8148f0..2fb17e03 100755 --- a/scripts/log-analysis/start.sh +++ b/scripts/log-analysis/start.sh @@ -5,4 +5,4 @@ nvidia-docker run --rm -it -p "8888:8888" \ -v `pwd`/../../docker/volumes/.aws:/root/.aws \ -v /mnt/deepracer/robo/checkpoint/log:/workspace/venv/logs \ -v `pwd`/../../analysis:/workspace/venv/workbook \ - larsll/log-analysis + larsll/deepracer-loganalysis From 541607ca72b58da4e8b87363aad7d7f37698ee69 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 23 Jan 2020 15:06:24 +0000 Subject: [PATCH 018/428] Using environment variable to control Docker Compose --- activate.sh | 11 +++++++---- prepare.sh | 4 ++-- scripts/evaluation/start.sh | 2 +- scripts/evaluation/stop.sh | 4 ++-- scripts/training/start.sh | 3 +-- scripts/training/stop.sh | 4 ++-- 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/activate.sh b/activate.sh index 335226d3..9cdf75a4 100644 --- a/activate.sh +++ b/activate.sh @@ -16,17 +16,20 @@ fi export LOCAL_ACCESS_KEY_ID=$(aws --profile $LOCAL_S3_PROFILE configure get 
aws_access_key_id | xargs) export LOCAL_SECRET_ACCESS_KEY=$(aws --profile $LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) -if [ $CLOUD = "Azure" ] +if [[ "${CLOUD,,}" == "azure" ]]; then ENDPOINT="--endpoint-url http://localhost:9000" - + COMPOSE_FILE="$DIR/docker/docker-compose.yml:$DIR/docker/docker-compose-azure.yml" +else + COMPOSE_FILE="$DIR/docker/docker-compose.yml" fi +export COMPOSE_FILE export LOCAL_PROFILE_ENDPOINT_URL="--profile $LOCAL_S3_PROFILE $ENDPOINT" function dr-upload-local-custom-files { - if [ $CLOUD = "Azure" ] + if [[ "${CLOUD,,}" == "azure" ]]; then - ROBOMAKER_COMMAND="" docker-compose -f $DIR/docker/docker-compose.yml -f $DIR/docker/docker-compose-azure.yml up -d minio + ROBOMAKER_COMMAND="" docker-compose $COMPOSE_FILES up -d minio fi eval CUSTOM_TARGET=$(echo s3://$LOCAL_S3_BUCKET/$LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Uploading files to $CUSTOM_TARGET" diff --git a/prepare.sh b/prepare.sh index 0afbd4e0..05de9634 100755 --- a/prepare.sh +++ b/prepare.sh @@ -29,10 +29,10 @@ then fi elif [ -n $ADDL_DISK ] && [ -n $ADDL_PART] then - echo "Found $ADDL_DISK - $ADDL_PART already mounted, taking no action." + echo "Found $ADDL_DISK - $ADDL_PART already mounted. Installing into present drive/directory structure." else - echo "Did not find $ADDL_DISK, taking no action." + echo "Did not find $ADDL_DISK. Installing into present drive/directory structure." fi ## Adding Nvidia Drivers diff --git a/scripts/evaluation/start.sh b/scripts/evaluation/start.sh index b0813437..3aeb6ca4 100755 --- a/scripts/evaluation/start.sh +++ b/scripts/evaluation/start.sh @@ -3,7 +3,7 @@ export ROBOMAKER_COMMAND="./run.sh build evaluation.launch" export METRICS_S3_OBJECT_KEY=metrics/eval_metrics.json export NUMBER_OF_TRIALS=5 -docker-compose -f ../../docker/docker-compose.yml up -d +docker-compose up -d echo 'waiting for containers to start up...' 
diff --git a/scripts/evaluation/stop.sh b/scripts/evaluation/stop.sh index bb1776fd..3cc4e387 100755 --- a/scripts/evaluation/stop.sh +++ b/scripts/evaluation/stop.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -docker-compose -f ../../docker/docker-compose.yml down +docker-compose down docker stop $(docker ps | awk ' /sagemaker/ { print $1 }') -docker rm $(docker ps -a | awk ' /sagemaker/ { print $1 }') \ No newline at end of file +docker rm $(docker ps -a | awk ' /sagemaker/ { print $1 }') diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 824ab420..d94b431e 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -1,8 +1,7 @@ #!/usr/bin/env bash export ROBOMAKER_COMMAND="./run.sh build distributed_training.launch" - -docker-compose -f ../../docker/docker-compose.yml up -d +docker-compose up -d echo 'waiting for containers to start up...' #sleep for 20 seconds to allow the containers to start diff --git a/scripts/training/stop.sh b/scripts/training/stop.sh index bb1776fd..3cc4e387 100755 --- a/scripts/training/stop.sh +++ b/scripts/training/stop.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -docker-compose -f ../../docker/docker-compose.yml down +docker-compose down docker stop $(docker ps | awk ' /sagemaker/ { print $1 }') -docker rm $(docker ps -a | awk ' /sagemaker/ { print $1 }') \ No newline at end of file +docker rm $(docker ps -a | awk ' /sagemaker/ { print $1 }') From 74a6f605b17cf926dc559456d802a318e33eee0b Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 23 Jan 2020 17:37:09 +0000 Subject: [PATCH 019/428] Moving binaries --- init.sh => bin/init.sh | 0 prepare.sh => bin/prepare.sh | 38 ++++++++++++++++++++++++++--- bin/runonce.sh | 46 ++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 3 deletions(-) rename init.sh => bin/init.sh (100%) rename prepare.sh => bin/prepare.sh (71%) create mode 100755 bin/runonce.sh diff --git a/init.sh b/bin/init.sh similarity index 100% rename from init.sh rename 
to bin/init.sh diff --git a/prepare.sh b/bin/prepare.sh similarity index 71% rename from prepare.sh rename to bin/prepare.sh index 05de9634..1ef1e482 100755 --- a/prepare.sh +++ b/bin/prepare.sh @@ -1,5 +1,7 @@ #!/bin/bash +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + ## Do I have a GPU GPUS=$(lspci | awk '/NVIDIA/ && /3D controller/' | wc -l) if [ $? -ne 0 ] || [ $GPUS -eq 0 ] @@ -8,7 +10,7 @@ then exit 1 fi -## Do I have an additional disk? +## Do I have an additional disk for Docker images - looking for /dev/sdc (Azure) ADDL_DISK=$(lsblk | awk '/^sdc/ {print $1}') ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}') @@ -35,6 +37,34 @@ else echo "Did not find $ADDL_DISK. Installing into present drive/directory structure." fi + +## Do I have an ephemeral disk / temporary storage for runtime output - looking for /dev/nvme0n1 (AWS)? + +ADDL_DISK=$(lsblk | awk '/^nvme0n1/ {print $1}') +ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}') + +if [ -n $ADDL_DISK ] && [ -z $ADDL_PART] +then + echo "Found $ADDL_DISK, preparing it for use" + echo -e "g\nn\np\n1\n\n\nw\n" | sudo fdisk /dev/$ADDL_DISK + ADDL_DEVICE=$(echo "/dev/"$ADDL_DISK"p1") + sudo mkfs.ext4 $ADDL_DEVICE + sudo mkdir -p /mnt + echo "$ADDL_DEVICE /mnt auto rw,user,auto 0 0" | sudo tee -a /etc/fstab + mount /mnt + if [ $? -ne 0 ] + then + echo "Error during preparing of temporary disk. Exiting." + exit 1 + fi +elif [ -n $ADDL_DISK ] && [ -n $ADDL_PART] +then + echo "Found $ADDL_DISK - $ADDL_PART already mounted, taking no action." + +else + echo "Did not find $ADDL_DISK, taking no action." 
+fi + ## Adding Nvidia Drivers sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list' @@ -69,7 +99,9 @@ sudo usermod -a -G docker $(id -un) ## Installing Docker Compose sudo curl -L https://github.com/docker/compose/releases/download/1.25.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose -sudo chmod +x /usr/local/bin/docker-compose +sudo chmod +x /usr/local/bin/docker-compos -## Reboot to load driver +## Reboot to load driver -- continue install +cd $DIR +./runonce.sh init.sh sudo reboot diff --git a/bin/runonce.sh b/bin/runonce.sh new file mode 100755 index 00000000..dfc72f75 --- /dev/null +++ b/bin/runonce.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +if [[ $# -eq 0 ]]; then + echo "Schedules a command to be run after the next reboot." + echo "Usage: $(basename $0) " + echo " $(basename $0) -p " + echo " $(basename $0) -r " +else + REMOVE=0 + COMMAND=${!#} + SCRIPTPATH=$PATH + + while getopts ":r:p:" optionName; do + case "$optionName" in + r) REMOVE=1; COMMAND=$OPTARG;; + p) SCRIPTPATH=$OPTARG;; + esac + done + + SCRIPT="${HOME}/.$(basename $0)_$(echo $COMMAND | sed 's/[^a-zA-Z0-9_]/_/g')" + + if [[ ! 
-f $SCRIPT ]]; then + echo "PATH=$SCRIPTPATH" >> $SCRIPT + echo "cd $(pwd)" >> $SCRIPT + echo "logger -t $(basename $0) -p local3.info \"COMMAND=$COMMAND ; USER=\$(whoami) ($(logname)) ; PWD=$(pwd) ; PATH=\$PATH\"" >> $SCRIPT + echo "$COMMAND | logger -t $(basename $0) -p local3.info" >> $SCRIPT + echo "$0 -r \"$(echo $COMMAND | sed 's/\"/\\\"/g')\"" >> $SCRIPT + chmod +x $SCRIPT + fi + + CRONTAB="${HOME}/.$(basename $0)_temp_crontab_$RANDOM" + ENTRY="@reboot $SCRIPT" + + echo "$(crontab -l 2>/dev/null)" | grep -v "$ENTRY" | grep -v "^# DO NOT EDIT THIS FILE - edit the master and reinstall.$" | grep -v "^# ([^ ]* installed on [^)]*)$" | grep -v "^# (Cron version [^$]*\$[^$]*\$)$" > $CRONTAB + + if [[ $REMOVE -eq 0 ]]; then + echo "$ENTRY" >> $CRONTAB + fi + + crontab $CRONTAB + rm $CRONTAB + + if [[ $REMOVE -ne 0 ]]; then + rm $SCRIPT + fi +fi From 82f6e6220a1d1967b61496802cd4177f4782ed6f Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 23 Jan 2020 18:58:04 +0100 Subject: [PATCH 020/428] Updates to be more generic (Azure + AWS) --- README.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 5d471ad0..cd1b6457 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,26 @@ -# DeepRacer-For-Azure -Provides a quick and easy way to get up and running with a DeepRacer training environment in Azure, using the [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu). +# DeepRacer-For-Cloud +Provides a quick and easy way to get up and running with a DeepRacer training environment in Azure or AWS, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerating Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing). 
This repo is an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer) Please refer to Chris' repo to understand more about what's going on under the covers. Main differences to the work done by Alex is: -* Local S3 instance (minio) is now using an Azure Storage Account / Blob Storage as a back-end. This allows for access between sesssions using e.g. Storage Explorer (https://azure.microsoft.com/en-us/features/storage-explorer/). -* Robomaker and Log Analysis containers are extended with required drivers to enable Tensorflow to use the GPU. +* Runtime S3 storage is setup to fit the connected cloud platform: + * Azure: Local 'virtual' S3 instance (minio) is now using an Azure Storage Account / Blob Storage as a back-end. This allows for access between sesssions using e.g. Storage Explorer (https://azure.microsoft.com/en-us/features/storage-explorer/). + * AWS: Directly connects to a real S3 bucket. +* Robomaker and Log Analysis containers are extended with required drivers to enable Tensorflow to use the GPU. Containers are all pre-compiled and available from Docker Hub. * Configuration has been reorganized : * `custom_files/hyperparameters.json` stores the runtime hyperparameters, which logically belongs together with the model_metadata.json and rewards.py files. * `current-run.env` contains user session configuration (pretraining, track etc.) as well as information about where to upload your model (S3 bucket and prefix). * `docker/.env` remains the home for more static configuration. This is not expected to change between sessions. -* Uses the Azure temporary drive on `/mnt` to store robomaker files (checkpoints, logs); these will be deleted between runs, but provides ~300GB of 'free' storage as long as the VM is running. Archiving of logs and additional checkpoint files required if desired. 
+* Runtime storage: Uses `/mnt` to store robomaker files (checkpoints, logs); depending on setup these will normally be deleted between runs, but Azure and AWS provides 200+ GB free storage which is very suitable for this purpuse. Archiving of logs and additional checkpoint files required if desired. + * Azure: Uses the normal temporary drive which is mounted on /mnt by default. + * AWS: Preparation scripts mounts the ephemeral drive on /mnt * Robomaker, RL Coach and Log Analysis Docker images are now available as downloads in [Docker Hub](https://hub.docker.com/search?q=larsll%2Fdeepracer&type=image), which reduces the time to build a new VM. + + ## Installation A step by step [installation guide](https://github.com/larsll/deepracer-for-azure/wiki/Install-DeepRacer-in-Azure) is available. From 67c91f0c75a852497bfa0fec6edd2ce837f28e20 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 23 Jan 2020 19:08:41 +0100 Subject: [PATCH 021/428] Adding Requirements --- README.md | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index cd1b6457..4409502d 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # DeepRacer-For-Cloud -Provides a quick and easy way to get up and running with a DeepRacer training environment in Azure or AWS, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerating Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing). +Provides a quick and easy way to get up and running with a DeepRacer training environment in Azure or AWS, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing). 
This repo is an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer) @@ -17,10 +17,28 @@ Main differences to the work done by Alex is: * Runtime storage: Uses `/mnt` to store robomaker files (checkpoints, logs); depending on setup these will normally be deleted between runs, but Azure and AWS provides 200+ GB free storage which is very suitable for this purpuse. Archiving of logs and additional checkpoint files required if desired. * Azure: Uses the normal temporary drive which is mounted on /mnt by default. * AWS: Preparation scripts mounts the ephemeral drive on /mnt -* Robomaker, RL Coach and Log Analysis Docker images are now available as downloads in [Docker Hub](https://hub.docker.com/search?q=larsll%2Fdeepracer&type=image), which reduces the time to build a new VM. - - - +* Robomaker, RL Coach and Log Analysis Docker images are now available as downloads in [Docker Hub](https://hub.docker.com/search?q=larsll%2Fdeepracer&type=image), which reduces the time to build a new VM. Log analysis is not downloaded by default to reduce required disk space. + +## Requirements + +Depending on your needs as well as specific needs of the cloud platform you can configure your VM to your liking. + +AWS: +* EC2 instance of type G3, G4, P2 or P3. + * Ubuntu 18.04 + * Minimum 30 GB, preferred 40 GB of OS disk. + * Ephemeral Drive connected + * Minimum 8 GB GPU-RAM + * Recommended at least 6 VCPUs. +* S3 bucket. Preferrably in same region as EC2 instance. + +Azure: +* N-Series VM that comes with NVIDIA Graphics Adapter. + * Ubuntu 18.04 + * Standard 30 GB OS drive is sufficient to get started. + * Recommended to add an additional 32 GB data disk if you want to use the Log Analysis container. +* Storage Account with one Blob container configured for Access Key authentication. 
+ ## Installation A step by step [installation guide](https://github.com/larsll/deepracer-for-azure/wiki/Install-DeepRacer-in-Azure) is available. From ab49fafd110a4fd7fc61d359e3685060533313c9 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 23 Jan 2020 19:15:33 +0100 Subject: [PATCH 022/428] Further installation instructions --- README.md | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 4409502d..b490ae7a 100644 --- a/README.md +++ b/README.md @@ -24,26 +24,35 @@ Main differences to the work done by Alex is: Depending on your needs as well as specific needs of the cloud platform you can configure your VM to your liking. AWS: -* EC2 instance of type G3, G4, P2 or P3. +* EC2 instance of type G3, G4, P2 or P3 - recommendation is g4dn.2xlarge * Ubuntu 18.04 * Minimum 30 GB, preferred 40 GB of OS disk. * Ephemeral Drive connected * Minimum 8 GB GPU-RAM - * Recommended at least 6 VCPUs. + * Recommended at least 6 VCPUs * S3 bucket. Preferrably in same region as EC2 instance. Azure: -* N-Series VM that comes with NVIDIA Graphics Adapter. +* N-Series VM that comes with NVIDIA Graphics Adapter - recommendation is NC6_Standard * Ubuntu 18.04 * Standard 30 GB OS drive is sufficient to get started. * Recommended to add an additional 32 GB data disk if you want to use the Log Analysis container. + * Minimum 8 GB GPU-RAM + * Recommended at least 6 VCPUs * Storage Account with one Blob container configured for Access Key authentication. ## Installation -A step by step [installation guide](https://github.com/larsll/deepracer-for-azure/wiki/Install-DeepRacer-in-Azure) is available. +A step by step [installation guide](https://github.com/larsll/deepracer-for-azure/wiki/Install-DeepRacer-in-Azure) for manual installation in Azure is available. + +The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine. 
+ + git clone https://github.com/larsll/deepracer-for-azure.git + cd deepracer-for-azure && ./bin/prepare.sh + +This will prepare the VM by partitioning additional drives as well as installing all prerequisites. After a reboot it will continuee to run `./bin/init.sh` setting up the full repository and downloading the core Docker images. -TODO: Create an end-to-end installation script. +TODO: Setup of environment. ## Usage From 16c75f8f0bff0b8127fc737d642b33b1b9f37081 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 23 Jan 2020 18:20:38 +0000 Subject: [PATCH 023/428] Altering init.sh --- bin/init.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/init.sh b/bin/init.sh index 3410d5f8..35bad396 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -57,7 +57,7 @@ done # docker build ./docker/dockerfiles/log-analysis/ -t larsll/deepracer-loganalysis docker pull larsll/deepracer-rlcoach docker pull larsll/deepracer-robomaker -docker pull larsll/deepracer-loganalysis +# docker pull larsll/deepracer-loganalysis docker pull crr0004/sagemaker-rl-tensorflow:nvidia # create the network sagemaker-local if it doesn't exit From 972c7109336076d140b8b2f1e9355dc28b9e05fc Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 23 Jan 2020 19:25:09 +0100 Subject: [PATCH 024/428] Moving Repo to new URL --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b490ae7a..cb56f635 100644 --- a/README.md +++ b/README.md @@ -43,11 +43,11 @@ Azure: ## Installation -A step by step [installation guide](https://github.com/larsll/deepracer-for-azure/wiki/Install-DeepRacer-in-Azure) for manual installation in Azure is available. +A step by step [installation guide](https://github.com/larsll/deepracer-for-cloud/wiki/Install-DeepRacer-in-Azure) for manual installation in Azure is available. 
The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine. - git clone https://github.com/larsll/deepracer-for-azure.git + git clone https://github.com/larsll/deepracer-for-cloud.git cd deepracer-for-azure && ./bin/prepare.sh This will prepare the VM by partitioning additional drives as well as installing all prerequisites. After a reboot it will continuee to run `./bin/init.sh` setting up the full repository and downloading the core Docker images. From 00bccd81341a5db6bb2318b1c7d7ab0532cb232c Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Fri, 24 Jan 2020 10:22:07 +0100 Subject: [PATCH 025/428] Fixing prepare and init scripts to allow for automatic provisioning in AWS (#8) * Adding sleep after fdisk, fixing typo * No automount of AWS ephemeral drive * Fixing reboot sequence * Fixing script path * Fixing init script to handle changed path --- activate.sh | 3 +++ bin/init.sh | 25 +++++++++++++------------ bin/prepare.sh | 32 +++++++++++++++----------------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/activate.sh b/activate.sh index 9cdf75a4..81ef0b20 100644 --- a/activate.sh +++ b/activate.sh @@ -2,6 +2,9 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" # create directory structure for docker volumes +if ! (mount | grep /mnt > /dev/null); then + mount /mnt +fi sudo mkdir -p /mnt/deepracer /mnt/deepracer/recording sudo chown $(id -u):$(id -g) /mnt/deepracer diff --git a/bin/init.sh b/bin/init.sh index 35bad396..7f1881b1 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -6,18 +6,19 @@ then exit 1 fi -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -cd $DIR +INSTALL_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." 
>/dev/null 2>&1 && pwd )" +cd $INSTALL_DIR # create directory structure for docker volumes +mount /mnt sudo mkdir -p /mnt/deepracer /mnt/deepracer/recording /mnt/deepracer/robo/checkpoint sudo chown -R $(id -u):$(id -g) /mnt/deepracer -mkdir -p $DIR/docker/volumes +mkdir -p $INSTALL_DIR/docker/volumes # create symlink to current user's home .aws directory # NOTE: AWS cli must be installed for this to work # https://docs.aws.amazon.com/cli/latest/userguide/install-linux-al2017.html -ln -s $(eval echo "~${USER}")/.aws $DIR/docker/volumes/ +ln -s $(eval echo "~${USER}")/.aws $INSTALL_DIR/docker/volumes/ # grab local training deepracer repo from crr0004 and log analysis repo from vreadcentric # Now as submodules! @@ -25,21 +26,21 @@ ln -s $(eval echo "~${USER}")/.aws $DIR/docker/volumes/ # git clone https://github.com/breadcentric/aws-deepracer-workshops.git && cd aws-deepracer-workshops && git checkout enhance-log-analysis && cd .. git submodule init && git submodule update -ln -sf $DIR/aws-deepracer-workshops/log-analysis $DIR/docker/volumes/log-analysis -cp deepracer/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/deepracer_simulation/routes/* docker/volumes/log-analysis/tracks/ +ln -sf $INSTALL_DIR/aws-deepracer-workshops/log-analysis $INSTALL_DIR/docker/volumes/log-analysis +cp $INSTALL_DIR/deepracer/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/deepracer_simulation/routes/* docker/volumes/log-analysis/tracks/ # copy rewardfunctions -mkdir -p custom_files analysis -cp deepracer/custom_files/* custom_files/ -cp defaults/hyperparameters.json custom_files/ +mkdir -p $INSTALL_DIR/custom_files $INSTALL_DIR/analysis +cp $INSTALL_DIR/deepracer/custom_files/* $INSTALL_DIR/custom_files/ +cp $INSTALL_DIR/defaults/hyperparameters.json $INSTALL_DIR/custom_files/ # setup symlink to rl-coach config file -ln -f defaults/rl_deepracer_coach_robomaker.py deepracer/rl_coach/rl_deepracer_coach_robomaker.py -cd deepracer/ && patch 
simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py < ../defaults/deepracer_racetrack_env.py.patch && cd .. +ln -f $INSTALL_DIR/defaults/rl_deepracer_coach_robomaker.py $INSTALL_DIR/deepracer/rl_coach/rl_deepracer_coach_robomaker.py +cd $INSTALL_DIR/deepracer/ && patch simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py < ../defaults/deepracer_racetrack_env.py.patch && cd .. # replace the contents of the rl_deepracer_coach_robomaker.py file with the gpu specific version (this is also where you can edit the hyperparameters) # TODO this file should be genrated from a gui before running training -cp defaults/template-run.env current-run.env +cp $INSTALL_DIR/defaults/template-run.env $INSTALL_DIR/current-run.env #set proxys if required for arg in "$@"; diff --git a/bin/prepare.sh b/bin/prepare.sh index 1ef1e482..aca41f89 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -4,7 +4,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ## Do I have a GPU GPUS=$(lspci | awk '/NVIDIA/ && /3D controller/' | wc -l) -if [ $? -ne 0 ] || [ $GPUS -eq 0 ] +if [ $? -ne 0 ] || [ $GPUS -eq 0 ]; then echo "No NVIDIA GPU detected. Exiting". 
exit 1 @@ -15,21 +15,22 @@ fi ADDL_DISK=$(lsblk | awk '/^sdc/ {print $1}') ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}') -if [ -n $ADDL_DISK ] && [ -z $ADDL_PART] +if [ -n $ADDL_DISK ] && [ -z $ADDL_PART]; then echo "Found $ADDL_DISK, preparing it for use" echo -e "g\nn\np\n1\n\n\nw\n" | sudo fdisk /dev/$ADDL_DISK + sleep 1s ADDL_DEVICE=$(echo "/dev/"$ADDL_DISK"1") sudo mkfs.ext4 $ADDL_DEVICE sudo mkdir -p /var/lib/docker - echo "$ADDL_DEVICE /var/lib/docker auto rw,user,auto 0 0" | sudo tee -a /etc/fstab + echo "$ADDL_DEVICE /var/lib/docker ext4 rw,user,auto 0 0" | sudo tee -a /etc/fstab mount /var/lib/docker if [ $? -ne 0 ] then echo "Error during preparing of additional disk. Exiting." exit 1 fi -elif [ -n $ADDL_DISK ] && [ -n $ADDL_PART] +elif [ -n $ADDL_DISK ] && [ -n $ADDL_PART]; then echo "Found $ADDL_DISK - $ADDL_PART already mounted. Installing into present drive/directory structure." @@ -43,21 +44,22 @@ fi ADDL_DISK=$(lsblk | awk '/^nvme0n1/ {print $1}') ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}') -if [ -n $ADDL_DISK ] && [ -z $ADDL_PART] +if [ -n $ADDL_DISK ] && [ -z $ADDL_PART]; then echo "Found $ADDL_DISK, preparing it for use" echo -e "g\nn\np\n1\n\n\nw\n" | sudo fdisk /dev/$ADDL_DISK + sleep 1s ADDL_DEVICE=$(echo "/dev/"$ADDL_DISK"p1") sudo mkfs.ext4 $ADDL_DEVICE sudo mkdir -p /mnt - echo "$ADDL_DEVICE /mnt auto rw,user,auto 0 0" | sudo tee -a /etc/fstab + echo "$ADDL_DEVICE /mnt ext4 rw,user,noauto 0 0" | sudo tee -a /etc/fstab mount /mnt if [ $? -ne 0 ] then echo "Error during preparing of temporary disk. Exiting." exit 1 fi -elif [ -n $ADDL_DISK ] && [ -n $ADDL_PART] +elif [ -n $ADDL_DISK ] && [ -n $ADDL_PART]; then echo "Found $ADDL_DISK - $ADDL_PART already mounted, taking no action." 
@@ -75,15 +77,9 @@ sudo bash -c 'apt update && apt install -y nvidia-driver-440 cuda-minimal-build- sudo apt-get install -y awscli jq ## Installing Docker -sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ -$(lsb_release -cs) \ -stable" curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - -sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ -$(lsb_release -cs) \ -stable" -sudo apt-get update -sudo apt-get install -y docker-ce docker-ce-cli containerd.io +sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" +sudo apt-get update && apt-get install -y docker-ce docker-ce-cli containerd.io distribution=$(. /etc/os-release;echo $ID$VERSION_ID) curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - @@ -99,9 +95,11 @@ sudo usermod -a -G docker $(id -un) ## Installing Docker Compose sudo curl -L https://github.com/docker/compose/releases/download/1.25.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose -sudo chmod +x /usr/local/bin/docker-compos +sudo chmod +x /usr/local/bin/docker-compose ## Reboot to load driver -- continue install +echo "Rebooting in 5 seconds. Will continue with install." 
cd $DIR -./runonce.sh init.sh +./runonce.sh ./init.sh +sleep 5s sudo reboot From c2d62f99bd0239fac5167a23f36c909d37a042e6 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Fri, 24 Jan 2020 20:48:31 +0100 Subject: [PATCH 026/428] Optimizing size of docker images (#9) * Tuning the docker image size for log-analysis and rl_coach * Updating robomaker dockerfile --- bin/init.sh | 2 +- .../dockerfiles/deepracer_robomaker/Dockerfile | 8 ++++---- docker/dockerfiles/log-analysis/Dockerfile | 14 +++++++------- docker/dockerfiles/rl_coach/Dockerfile | 16 ++++++++-------- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/bin/init.sh b/bin/init.sh index 7f1881b1..65bf6be3 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -24,7 +24,7 @@ ln -s $(eval echo "~${USER}")/.aws $INSTALL_DIR/docker/volumes/ # Now as submodules! # git clone --recurse-submodules https://github.com/crr0004/deepracer.git # git clone https://github.com/breadcentric/aws-deepracer-workshops.git && cd aws-deepracer-workshops && git checkout enhance-log-analysis && cd .. 
-git submodule init && git submodule update +git submodule update --init --recursive ln -sf $INSTALL_DIR/aws-deepracer-workshops/log-analysis $INSTALL_DIR/docker/volumes/log-analysis cp $INSTALL_DIR/deepracer/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/deepracer_simulation/routes/* docker/volumes/log-analysis/tracks/ diff --git a/docker/dockerfiles/deepracer_robomaker/Dockerfile b/docker/dockerfiles/deepracer_robomaker/Dockerfile index a1822701..e4f58b9a 100644 --- a/docker/dockerfiles/deepracer_robomaker/Dockerfile +++ b/docker/dockerfiles/deepracer_robomaker/Dockerfile @@ -45,15 +45,15 @@ RUN apt-get install -y --no-install-recommends \ cuda-cublas-9-0=9.0.176.4-1 \ cuda-cusolver-$CUDA_PKG_VERSION \ libnccl2=$NCCL_VERSION-1+cuda9.0 && \ - apt-mark hold libnccl2 - -RUN apt-get install -y --no-install-recommends \ + apt-mark hold libnccl2 && \ + apt-get install -y --no-install-recommends \ libcudnn7=$CUDNN_VERSION-1+cuda9.0 && \ apt-mark hold libcudnn7 && \ + apt-get clean && \ rm -rf /var/lib/apt/lists/* ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda-9.0/targets/x86_64-linux/lib -RUN pip install tensorflow-gpu==1.11.0 +RUN pip install --no-cache-dir tensorflow-gpu==1.11.0 diff --git a/docker/dockerfiles/log-analysis/Dockerfile b/docker/dockerfiles/log-analysis/Dockerfile index 23f204ff..519e57ee 100644 --- a/docker/dockerfiles/log-analysis/Dockerfile +++ b/docker/dockerfiles/log-analysis/Dockerfile @@ -1,4 +1,4 @@ -FROM tensorflow/tensorflow:latest-gpu +FROM nvidia/cuda:10.0-cudnn7-runtime-ubuntu18.04 LABEL maintainer="lars@ludvig.no" \ description="Log Analysis for DeepRacer Training Run" \ @@ -6,19 +6,19 @@ LABEL maintainer="lars@ludvig.no" \ # Container Dependency Setup RUN apt-get update && apt-get upgrade -y && \ - apt-get install software-properties-common libsm6 libxext6 libxrender-dev git wget python3-pip -y && \ - rm -rf 
/var/lib/apt/lists/* + apt-get install --no-install-recommends software-properties-common libsm6 libxext6 libxrender-dev git wget python3-pip -y && \ + apt-get clean && rm -rf /var/lib/apt/lists/* -RUN pip3 install virtualenv -RUN virtualenv /workspace/venv +ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda-10.0/targets/x86_64-linux/lib + +RUN pip3 install virtualenv && virtualenv /workspace/venv WORKDIR /workspace/venv RUN mkdir -p /workspace/venv/data /workspace/venv/logs /workspace/venv/workbook # Install common pip packages WORKDIR /workspace/venv COPY requirements.txt ./ -RUN ls -lrt -RUN . /workspace/venv/bin/activate && pip install -r requirements.txt +RUN . /workspace/venv/bin/activate && pip install --no-cache-dir -r requirements.txt EXPOSE 8888 VOLUME ["/workspace/venv/data", "/workspace/venv/logs", "/root/.aws", "/workspace/venv/workbook"] diff --git a/docker/dockerfiles/rl_coach/Dockerfile b/docker/dockerfiles/rl_coach/Dockerfile index 7f175d86..d3dbf8b6 100644 --- a/docker/dockerfiles/rl_coach/Dockerfile +++ b/docker/dockerfiles/rl_coach/Dockerfile @@ -1,21 +1,21 @@ -FROM python:3.7.3-stretch +FROM python:3.7.6-slim LABEL maintainer "Lars Ludvigsen " # install docker -RUN apt-get update && apt-get install -y \ +RUN apt-get update && apt-get install --no-install-recommends -y \ apt-transport-https \ ca-certificates \ curl \ + gnupg \ gnupg-agent \ software-properties-common -RUN curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add - -RUN add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable" -RUN apt-get update && apt-get install -y docker-ce-cli && rm -rf /var/lib/apt/lists/* +RUN curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add - && \ + add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable" +RUN apt-get update && apt-get install --no-install-recommends -y docker-ce-cli && apt-get 
clean && rm -rf /var/lib/apt/lists/* # create sagemaker configuration -RUN mkdir /root/.sagemaker +RUN mkdir -p /root/.sagemaker /robo/container COPY deepracer/config.yaml /root/.sagemaker/config.yaml -RUN mkdir /robo && mkdir /robo/container # add required deepracer directories to the container # RUN mkdir -p /deepracer && mkdir -p /deepracer/rl_coach && mkdir -p /deepracer/sagemaker-python-sdk @@ -24,7 +24,7 @@ ADD deepracer/rl_coach /deepracer/rl_coach ADD deepracer/sagemaker-python-sdk /deepracer/sagemaker-python-sdk # install dependencies -RUN pip install -U /deepracer/sagemaker-python-sdk/ awscli ipython pandas "urllib3==1.22" "pyyaml==3.13" "python-dateutil==2.8.0" +RUN pip install --no-cache-dir -U /deepracer/sagemaker-python-sdk/ awscli ipython pandas "urllib3==1.22" "pyyaml==3.13" "python-dateutil==2.8.0" # set command CMD (cd rl_coach; ipython rl_deepracer_coach_robomaker.py) From 34a6b4db8222c49fc23421a3215c6ebe48fbc0c0 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 25 Jan 2020 10:52:15 +0100 Subject: [PATCH 027/428] Enabling support for AWS IAM key-less login to S3 (#10) * Altering configuration to allow for AWS IAM credentials * Enabling automatic activation * Say yes when installing via APT * Fixing reference to rl_coach * Fixing file reference --- .gitmodules | 3 ++- activate.sh | 28 ++++++++++++++++++++---- bin/init.sh | 11 +++++++++- bin/prepare.sh | 3 +++ defaults/rl_coach_env.sh.patch | 14 ++++++++++++ defaults/rl_deepracer_coach_robomaker.py | 5 +---- defaults/robomaker.env.patch | 12 ++++++++++ defaults/template-run.env | 2 ++ docker/.env | 4 ---- docker/docker-compose-keys.yml | 13 +++++++++++ docker/docker-compose-log.yml | 19 ++++++++++++++++ docker/docker-compose.yml | 16 -------------- 12 files changed, 100 insertions(+), 30 deletions(-) create mode 100644 defaults/rl_coach_env.sh.patch create mode 100644 defaults/robomaker.env.patch create mode 100644 
docker/docker-compose-keys.yml create mode 100644 docker/docker-compose-log.yml diff --git a/.gitmodules b/.gitmodules index 47b24b2d..c65cf66a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,8 +1,9 @@ [submodule "deepracer"] path = deepracer url = https://github.com/crr0004/deepracer.git + ignore = dirty [submodule "aws-deepracer-workshops"] path = aws-deepracer-workshops url = https://github.com/breadcentric/aws-deepracer-workshops.git branch = enhance-log-analysis - + ignore = dirty diff --git a/activate.sh b/activate.sh index 81ef0b20..05c886d1 100644 --- a/activate.sh +++ b/activate.sh @@ -16,18 +16,26 @@ else exit 1 fi -export LOCAL_ACCESS_KEY_ID=$(aws --profile $LOCAL_S3_PROFILE configure get aws_access_key_id | xargs) -export LOCAL_SECRET_ACCESS_KEY=$(aws --profile $LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) if [[ "${CLOUD,,}" == "azure" ]]; then - ENDPOINT="--endpoint-url http://localhost:9000" + LOCAL_PROFILE_ENDPOINT_URL="--profile $LOCAL_S3_PROFILE --endpoint-url http://localhost:9000" COMPOSE_FILE="$DIR/docker/docker-compose.yml:$DIR/docker/docker-compose-azure.yml" else + LOCAL_PROFILE_ENDPOINT_URL="" COMPOSE_FILE="$DIR/docker/docker-compose.yml" fi + +## Check if we have an AWS IAM assumed role, or if we need to set specific credentials. 
+if [ $(aws sts get-caller-identity | jq '.Arn' | awk /assumed-role/ | wc -l) -eq 0 ]; +then + export LOCAL_ACCESS_KEY_ID=$(aws --profile $LOCAL_S3_PROFILE configure get aws_access_key_id | xargs) + export LOCAL_SECRET_ACCESS_KEY=$(aws --profile $LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) + COMPOSE_FILE="$COMPOSE_FILE:$DIR/docker/docker-compose-keys.yml" +fi + export COMPOSE_FILE -export LOCAL_PROFILE_ENDPOINT_URL="--profile $LOCAL_S3_PROFILE $ENDPOINT" +export LOCAL_PROFILE_ENDPOINT_URL function dr-upload-local-custom-files { if [[ "${CLOUD,,}" == "azure" ]]; @@ -96,3 +104,15 @@ function dr-logs-loganalysis { fi } + +function dr-logs-proxy-start { + docker-compose -f $DIR/docker/docker-compose-log.yml up -d +} + +function dr-logs-proxy-stop { + docker-compose -f $DIR/docker/docker-compose-log.yml down +} + +function dr-update { + source $DIR/activate.sh +} diff --git a/bin/init.sh b/bin/init.sh index 65bf6be3..7902a253 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -36,7 +36,13 @@ cp $INSTALL_DIR/defaults/hyperparameters.json $INSTALL_DIR/custom_files/ # setup symlink to rl-coach config file ln -f $INSTALL_DIR/defaults/rl_deepracer_coach_robomaker.py $INSTALL_DIR/deepracer/rl_coach/rl_deepracer_coach_robomaker.py -cd $INSTALL_DIR/deepracer/ && patch simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py < ../defaults/deepracer_racetrack_env.py.patch && cd .. + +# patching files in submodules that don't entirely fit our needs +cd $INSTALL_DIR/deepracer/ +patch simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py < ../defaults/deepracer_racetrack_env.py.patch +patch robomaker.env < ../defaults/robomaker.env.patch +patch rl_coach/env.sh < ../defaults/rl_coach_env.sh.patch +cd .. 
# replace the contents of the rl_deepracer_coach_robomaker.py file with the gpu specific version (this is also where you can edit the hyperparameters) # TODO this file should be genrated from a gui before running training @@ -68,3 +74,6 @@ if [ $? -ne 0 ] then docker network create $SAGEMAKER_NW fi + +# ensure our variables are set on startup +echo "source $INSTALL_DIR/activate.sh" >> $HOME/.profile diff --git a/bin/prepare.sh b/bin/prepare.sh index aca41f89..87e65493 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -2,6 +2,9 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +## Patch system +sudo apt-get update && sudo apt-get upgrade -y + ## Do I have a GPU GPUS=$(lspci | awk '/NVIDIA/ && /3D controller/' | wc -l) if [ $? -ne 0 ] || [ $GPUS -eq 0 ]; diff --git a/defaults/rl_coach_env.sh.patch b/defaults/rl_coach_env.sh.patch new file mode 100644 index 00000000..1495fdf2 --- /dev/null +++ b/defaults/rl_coach_env.sh.patch @@ -0,0 +1,14 @@ +diff --git a/rl_coach/env.sh b/rl_coach/env.sh +index b77cf42..4c204c7 100644 +--- a/rl_coach/env.sh ++++ b/rl_coach/env.sh +@@ -1,7 +1,7 @@ + export MINIO_ACCESS_KEY=minio + export MINIO_SECRET_KEY=miniokey +-export AWS_ACCESS_KEY_ID=minio +-export AWS_SECRET_ACCESS_KEY=miniokey ++#export AWS_ACCESS_KEY_ID=minio ++#export AWS_SECRET_ACCESS_KEY=miniokey + export WORLD_NAME=New_York_Track + export ROS_AWS_REGION=us-east-1 + export AWS_REGION=us-east-1 diff --git a/defaults/rl_deepracer_coach_robomaker.py b/defaults/rl_deepracer_coach_robomaker.py index c619644e..9e997d54 100644 --- a/defaults/rl_deepracer_coach_robomaker.py +++ b/defaults/rl_deepracer_coach_robomaker.py @@ -22,10 +22,7 @@ def str2bool(v): return v.lower() in ("yes", "true", "t", "1") # S3 bucket -boto_session = boto3.session.Session( - aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", "minio"), - aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", "miniokey"), - region_name=os.environ.get("AWS_REGION", "us-east-1")) 
+boto_session = boto3.session.Session(region_name=os.environ.get("AWS_REGION", "us-east-1")) endpoint_url = os.environ.get("S3_ENDPOINT_URL", "") diff --git a/defaults/robomaker.env.patch b/defaults/robomaker.env.patch new file mode 100644 index 00000000..b9fb1c65 --- /dev/null +++ b/defaults/robomaker.env.patch @@ -0,0 +1,12 @@ +diff --git a/robomaker.env b/robomaker.env +index fde2aaf..fea2347 100644 +--- a/robomaker.env ++++ b/robomaker.env +@@ -1,5 +1,5 @@ +-AWS_ACCESS_KEY_ID=minio +-AWS_SECRET_ACCESS_KEY=miniokey ++#AWS_ACCESS_KEY_ID=minio ++#AWS_SECRET_ACCESS_KEY=miniokey + WORLD_NAME=Mexico_track + ROS_AWS_REGION=us-east-1 + AWS_REGION=us-east-1 diff --git a/defaults/template-run.env b/defaults/template-run.env index dd93028e..3eeb1198 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -12,3 +12,5 @@ LOCAL_S3_PROFILE=default LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker LOCAL_S3_BUCKET=bucket LOCAL_S3_CUSTOM_FILES_PREFIX=custom_files +LOGS_ACCESS_KEY=mylogs +LOGS_ACCESS_SECRET=mylogpass diff --git a/docker/.env b/docker/.env index 62c654ae..1d7489d1 100644 --- a/docker/.env +++ b/docker/.env @@ -1,9 +1,5 @@ WORLD_NAME=Vegas_track LOCAL_ENV_VAR_JSON_PATH=env_vars.json -MINIO_ACCESS_KEY=minio -MINIO_SECRET_KEY=miniokey -AWS_ACCESS_KEY_ID=minio -AWS_SECRET_ACCESS_KEY=miniokey AWS_DEFAULT_REGION=us-east-1 #S3_ENDPOINT_URL=http://minio:9000 ROS_AWS_REGION=us-east-1 diff --git a/docker/docker-compose-keys.yml b/docker/docker-compose-keys.yml new file mode 100644 index 00000000..33d99f6f --- /dev/null +++ b/docker/docker-compose-keys.yml @@ -0,0 +1,13 @@ +version: '3.7' + +services: + rl_coach: + environment: + - AWS_ACCESS_KEY_ID=${LOCAL_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${LOCAL_SECRET_ACCESS_KEY} + depends_on: + - minio + robomaker: + environment: + - AWS_ACCESS_KEY_ID=${LOCAL_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${LOCAL_SECRET_ACCESS_KEY} diff --git a/docker/docker-compose-log.yml b/docker/docker-compose-log.yml new file mode 
100644 index 00000000..e1632f22 --- /dev/null +++ b/docker/docker-compose-log.yml @@ -0,0 +1,19 @@ +version: '3.7' +networks: + default: + external: + name: sagemaker-local + +services: + minio-log: + image: minio/minio + ports: + - "9001:9001" + container_name: minio-log + command: server --address :9001 /data + volumes: + - /mnt/deepracer/robo/checkpoint:/data + restart: unless-stopped + environment: + - MINIO_ACCESS_KEY=${LOGS_ACCESS_KEY} + - MINIO_SECRET_KEY=${LOGS_ACCESS_SECRET} diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 02315596..ab9e7246 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -6,18 +6,6 @@ networks: name: sagemaker-local services: - minio-log: - image: minio/minio - ports: - - "9001:9001" - container_name: minio-log - command: server --address :9001 /data - volumes: - - /mnt/deepracer/robo/checkpoint:/data - restart: unless-stopped - environment: - - MINIO_ACCESS_KEY=${LOCAL_ACCESS_KEY_ID} - - MINIO_SECRET_KEY=${LOCAL_SECRET_ACCESS_KEY} rl_coach: image: larsll/deepracer-rlcoach environment: @@ -33,8 +21,6 @@ services: - HYPERPARAMETER_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/hyperparameters.json - SAGEMAKER_SHARED_S3_PREFIX=${LOCAL_S3_MODEL_PREFIX} - SAGEMAKER_SHARED_S3_BUCKET=${LOCAL_S3_BUCKET} - - AWS_ACCESS_KEY_ID=${LOCAL_ACCESS_KEY_ID} - - AWS_SECRET_ACCESS_KEY=${LOCAL_SECRET_ACCESS_KEY} env_file: .env container_name: rl_coach volumes: @@ -64,8 +50,6 @@ services: - HYPERPARAMETER_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/hyperparameters.json - SAGEMAKER_SHARED_S3_PREFIX=${LOCAL_S3_MODEL_PREFIX} - SAGEMAKER_SHARED_S3_BUCKET=${LOCAL_S3_BUCKET} - - AWS_ACCESS_KEY_ID=${LOCAL_ACCESS_KEY_ID} - - AWS_SECRET_ACCESS_KEY=${LOCAL_SECRET_ACCESS_KEY} env_file: .env depends_on: - rl_coach From be82fac1d64c6d8abad866a34a431008433c42bf Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 25 Jan 2020 23:04:39 +0100 Subject: [PATCH 028/428] Race 
tuning - (#11) * Altering configuration to allow for AWS IAM credentials * Enabling automatic activation * Say yes when installing via APT * Fixing reference to rl_coach * Fixing file reference * Adding config download feature * Altering script names * Removing hard coded prefix (rl-deepracer-sagemaker) --- activate.sh | 34 +++++++++++++++++++----- defaults/rl_deepracer_coach_robomaker.py | 3 ++- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/activate.sh b/activate.sh index 05c886d1..47a3db58 100644 --- a/activate.sh +++ b/activate.sh @@ -37,29 +37,49 @@ fi export COMPOSE_FILE export LOCAL_PROFILE_ENDPOINT_URL -function dr-upload-local-custom-files { +function dr-upload-custom-files { if [[ "${CLOUD,,}" == "azure" ]]; then ROBOMAKER_COMMAND="" docker-compose $COMPOSE_FILES up -d minio fi eval CUSTOM_TARGET=$(echo s3://$LOCAL_S3_BUCKET/$LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Uploading files to $CUSTOM_TARGET" - aws $LOCAL_PROFILE_ENDPOINT_URL s3 sync custom_files/ $CUSTOM_TARGET + aws $LOCAL_PROFILE_ENDPOINT_URL s3 sync $DIR/custom_files/ $CUSTOM_TARGET } -function dr-start-local-training { +function dr-upload-logs { + if [[ "${CLOUD,,}" == "azure" ]]; + then + ROBOMAKER_COMMAND="" docker-compose $COMPOSE_FILES up -d minio + fi + eval CUSTOM_TARGET=$(echo s3://$LOCAL_S3_BUCKET/$LOCAL_S3_LOGS_PREFIX/) + echo "Uploading files to $CUSTOM_TARGET" + aws $LOCAL_PROFILE_ENDPOINT_URL s3 sync /mnt/deepracer/robo/checkpoint/log $CUSTOM_TARGET --exclude "*" --include "rl_coach*.log*" --no-follow-symlinks +} + +function dr-download-custom-files { + if [[ "${CLOUD,,}" == "azure" ]]; + then + ROBOMAKER_COMMAND="" docker-compose $COMPOSE_FILES up -d minio + fi + eval CUSTOM_TARGET=$(echo s3://$LOCAL_S3_BUCKET/$LOCAL_S3_CUSTOM_FILES_PREFIX/) + echo "Downloading files from $CUSTOM_TARGET" + aws $LOCAL_PROFILE_ENDPOINT_URL s3 sync $CUSTOM_TARGET $DIR/custom_files/ +} + +function dr-start-training { bash -c "cd $DIR/scripts/training && ./start.sh" } -function 
dr-stop-local-training { +function dr-stop-training { ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/training && ./stop.sh" } -function dr-start-local-evaluation { +function dr-start-evaluation { bash -c "cd $DIR/scripts/evaluation && ./start.sh" } -function dr-stop-local-evaluation { +function dr-stop-evaluation { ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/evaluation && ./stop.sh" } @@ -74,7 +94,7 @@ function dr-stop-loganalysis { else echo "Log-analysis is not running." fi - + } function dr-logs-sagemaker { diff --git a/defaults/rl_deepracer_coach_robomaker.py b/defaults/rl_deepracer_coach_robomaker.py index 9e997d54..fb42d1e1 100644 --- a/defaults/rl_deepracer_coach_robomaker.py +++ b/defaults/rl_deepracer_coach_robomaker.py @@ -39,6 +39,7 @@ def str2bool(v): boto_session=boto_session, s3_client=s3Client) # sage_session.default_bucket() s3_bucket = os.environ.get("MODEL_S3_BUCKET", "bucket") +s3_prefix = os.environ.get("MODEL_S3_PREFIX", "rl-deepracer-sagemaker") pretrained = str2bool(os.environ.get("PRETRAINED", False)) s3_pretrained_bucket = os.environ.get("PRETRAINED_S3_BUCKET", "bucket") s3_pretrained_prefix = os.environ.get( @@ -61,7 +62,7 @@ def str2bool(v): # create unique job name tm = gmtime() # -" + strftime("%y%m%d-%H%M%S", tm) #Ensure S3 prefix contains SageMaker -job_name = s3_prefix = job_name_prefix + "-sagemaker" +job_name = job_name_prefix + "-sagemaker" # -" + strftime("%y%m%d-%H%M%S", tm) #Ensure that the S3 prefix contains the keyword 'robomaker' s3_prefix_robomaker = job_name_prefix + "-robomaker" From 3cf1213abd6a086d3b0d45d7a6a352bd5426ab6e Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 25 Jan 2020 23:07:35 +0100 Subject: [PATCH 029/428] Updated Installation and Usage descriptions. 
--- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cb56f635..0cefddc8 100644 --- a/README.md +++ b/README.md @@ -52,10 +52,12 @@ The package comes with preparation and setup scripts that would allow a turn-key This will prepare the VM by partitioning additional drives as well as installing all prerequisites. After a reboot it will continuee to run `./bin/init.sh` setting up the full repository and downloading the core Docker images. +The installation script will adapt `.profile` to ensure that all settings are applied on login. + TODO: Setup of environment. ## Usage -Before every session run `source activate.sh` to ensure that the environment variables are set correctly. This also creates a set of aliases/commands that makes it easier to operate the setup. +Before every session run `dr-update` to ensure that the environment variables are set correctly. This also creates a set of aliases/commands that makes it easier to operate the setup. (If `dr-update` is not found, try `source activate.sh` to get aliases defined. -Ensure that the configuration files are uploaded into the bucket `dr-upload-local-custom-files`. Start a training with `dr-start-local-training`. +Ensure that the configuration files are uploaded into the bucket `dr-upload-custom-files`. Start a training with `dr-start-training`. 
From bbc42c227ab967acd6b1833dc060fccfa2cab4ce Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 26 Jan 2020 10:57:21 +0100 Subject: [PATCH 030/428] Bugfix - avoiding grub-prompt in AWS --- bin/prepare.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/prepare.sh b/bin/prepare.sh index 87e65493..82e48f7f 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -3,7 +3,8 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ## Patch system -sudo apt-get update && sudo apt-get upgrade -y +sudo apt-get update && sudo DEBIAN_FRONTEND=noninteractive apt-get -y -o \ + DPkg::options::="--force-confdef" -o DPkg::options::="--force-confold" -qq --force-yes upgrade ## Do I have a GPU GPUS=$(lspci | awk '/NVIDIA/ && /3D controller/' | wc -l) From 441b4ec9ff7a3d4238bd502b00e41230db2b190b Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 26 Jan 2020 11:05:37 +0100 Subject: [PATCH 031/428] More blocks to avoid changes to grub in AWS --- bin/prepare.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/prepare.sh b/bin/prepare.sh index 82e48f7f..3e9c0fc8 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -3,7 +3,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ## Patch system -sudo apt-get update && sudo DEBIAN_FRONTEND=noninteractive apt-get -y -o \ +sudo apt-get update && sudo apt-mark hold grub-pc && sudo DEBIAN_FRONTEND=noninteractive apt-get -y -o \ DPkg::options::="--force-confdef" -o DPkg::options::="--force-confold" -qq --force-yes upgrade ## Do I have a GPU From 3630a6963fdb8277b0cd18332d3cb64d0ba99e5b Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 26 Jan 2020 21:41:20 +0100 Subject: [PATCH 032/428] Expose Trials in env-file. Improving code. 
(#12) --- activate.sh | 37 +++++++++++++++++++----- defaults/template-run.env | 2 ++ docker/docker-compose.yml | 4 +-- scripts/evaluation/start.sh | 5 ++-- scripts/evaluation/stop.sh | 8 +++-- scripts/training/back-up-training-run.sh | 6 ---- scripts/training/delete-last-run.sh | 7 ----- scripts/training/stop.sh | 8 +++-- 8 files changed, 48 insertions(+), 29 deletions(-) delete mode 100755 scripts/training/back-up-training-run.sh delete mode 100755 scripts/training/delete-last-run.sh diff --git a/activate.sh b/activate.sh index 47a3db58..1f988045 100644 --- a/activate.sh +++ b/activate.sh @@ -10,13 +10,12 @@ sudo chown $(id -u):$(id -g) /mnt/deepracer if [[ -f "$DIR/current-run.env" ]] then - export $(grep -v '^#' $DIR/current-run.env | xargs) + export $(grep -v '^#' $DIR/current-run.env | xargs) else - echo "File current-run.env does not exist." - exit 1 + echo "File current-run.env does not exist." + exit 1 fi - if [[ "${CLOUD,,}" == "azure" ]]; then LOCAL_PROFILE_ENDPOINT_URL="--profile $LOCAL_S3_PROFILE --endpoint-url http://localhost:9000" @@ -52,9 +51,14 @@ function dr-upload-logs { then ROBOMAKER_COMMAND="" docker-compose $COMPOSE_FILES up -d minio fi - eval CUSTOM_TARGET=$(echo s3://$LOCAL_S3_BUCKET/$LOCAL_S3_LOGS_PREFIX/) - echo "Uploading files to $CUSTOM_TARGET" - aws $LOCAL_PROFILE_ENDPOINT_URL s3 sync /mnt/deepracer/robo/checkpoint/log $CUSTOM_TARGET --exclude "*" --include "rl_coach*.log*" --no-follow-symlinks + if [ -d /mnt/deepracer/robo/checkpoint/log/ ]; + then + eval CUSTOM_TARGET=$(echo s3://$LOCAL_S3_BUCKET/$LOCAL_S3_LOGS_PREFIX/) + echo "Uploading files to $CUSTOM_TARGET" + aws $LOCAL_PROFILE_ENDPOINT_URL s3 sync /mnt/deepracer/robo/checkpoint/log $CUSTOM_TARGET --exclude "*" --include "rl_coach*.log*" --no-follow-symlinks + else + echo "No logfiles to upload" + fi } function dr-download-custom-files { @@ -68,18 +72,22 @@ function dr-download-custom-files { } function dr-start-training { + dr-update-env bash -c "cd $DIR/scripts/training && 
./start.sh" } function dr-stop-training { + dr-upload-logs ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/training && ./stop.sh" } function dr-start-evaluation { + dr-update-env bash -c "cd $DIR/scripts/evaluation && ./start.sh" } function dr-stop-evaluation { + dr-upload-logs ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/evaluation && ./stop.sh" } @@ -125,6 +133,11 @@ function dr-logs-loganalysis { } +function dr-clean-local { + dr-stop-training + sudo rm -rf /robo/* && sudo rm -rf /mnt/deepracer/robo/checkpoint/* +} + function dr-logs-proxy-start { docker-compose -f $DIR/docker/docker-compose-log.yml up -d } @@ -136,3 +149,13 @@ function dr-logs-proxy-stop { function dr-update { source $DIR/activate.sh } + +function dr-update-env { + if [[ -f "$DIR/current-run.env" ]] + then + export $(grep -v '^#' $DIR/current-run.env | xargs) + else + echo "File current-run.env does not exist." + exit 1 + fi +} \ No newline at end of file diff --git a/defaults/template-run.env b/defaults/template-run.env index 3eeb1198..9d7b27f5 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -1,5 +1,6 @@ CLOUD=Azure WORLD_NAME=Vegas_track +NUMBER_OF_TRIALS=5 CHANGE_START_POSITION=True PRETRAINED=False PRETRAINED_S3_PREFIX=rl-sagemaker-pretrained @@ -12,5 +13,6 @@ LOCAL_S3_PROFILE=default LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker LOCAL_S3_BUCKET=bucket LOCAL_S3_CUSTOM_FILES_PREFIX=custom_files +LOCAL_S3_LOGS_PREFIX=logs LOGS_ACCESS_KEY=mylogs LOGS_ACCESS_SECRET=mylogpass diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index ab9e7246..055000ab 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -10,6 +10,7 @@ services: image: larsll/deepracer-rlcoach environment: - WORLD_NAME + - NUMBER_OF_TRIALS=${NUMBER_OF_TRIALS} - PRETRAINED - PRETRAINED_S3_PREFIX - PRETRAINED_S3_BUCKET @@ -19,8 +20,6 @@ services: - REWARD_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/reward.py - METRICS_S3_BUCKET=${LOCAL_S3_BUCKET} - 
HYPERPARAMETER_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/hyperparameters.json - - SAGEMAKER_SHARED_S3_PREFIX=${LOCAL_S3_MODEL_PREFIX} - - SAGEMAKER_SHARED_S3_BUCKET=${LOCAL_S3_BUCKET} env_file: .env container_name: rl_coach volumes: @@ -42,6 +41,7 @@ services: environment: - CHANGE_START_POSITION - WORLD_NAME + - NUMBER_OF_TRIALS=${NUMBER_OF_TRIALS} - MODEL_S3_PREFIX=${LOCAL_S3_MODEL_PREFIX} - MODEL_S3_BUCKET=${LOCAL_S3_BUCKET} - MODEL_METADATA_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/model_metadata.json diff --git a/scripts/evaluation/start.sh b/scripts/evaluation/start.sh index 3aeb6ca4..03deaaaf 100755 --- a/scripts/evaluation/start.sh +++ b/scripts/evaluation/start.sh @@ -1,7 +1,6 @@ # set evaluation specific environment variables export ROBOMAKER_COMMAND="./run.sh build evaluation.launch" export METRICS_S3_OBJECT_KEY=metrics/eval_metrics.json -export NUMBER_OF_TRIALS=5 docker-compose up -d @@ -16,11 +15,11 @@ then echo "Display exists, using gnome-terminal for logs and starting vncviewer." echo 'attempting to pull up sagemaker logs...' - gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }')" + gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /robomaker/ { print $1 }')" echo 'attempting to open vnc viewer...' gnome-terminal -x sh -c "!!; vncviewer localhost:8080" else echo "No display. Falling back to CLI mode." 
- docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }') + docker logs -f $(docker ps | awk ' /robomaker/ { print $1 }') fi diff --git a/scripts/evaluation/stop.sh b/scripts/evaluation/stop.sh index 3cc4e387..413f711e 100755 --- a/scripts/evaluation/stop.sh +++ b/scripts/evaluation/stop.sh @@ -2,5 +2,9 @@ docker-compose down -docker stop $(docker ps | awk ' /sagemaker/ { print $1 }') -docker rm $(docker ps -a | awk ' /sagemaker/ { print $1 }') +SAGEMAKER=$(docker ps | awk ' /sagemaker/ { print $1 }') +if [[ -n $SAGEMAKER ]]; +then + docker stop $(docker ps | awk ' /sagemaker/ { print $1 }') + docker rm $(docker ps -a | awk ' /sagemaker/ { print $1 }') +fi \ No newline at end of file diff --git a/scripts/training/back-up-training-run.sh b/scripts/training/back-up-training-run.sh deleted file mode 100755 index f3e7c7dc..00000000 --- a/scripts/training/back-up-training-run.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -BACKUP_LOC=/media/aschu/storage/deepracer-training/backup -FILENAME=$(date +%Y-%m-%d_%H-%M-%S) -tar -czvf ${FILENAME}.tar.gz ../../docker/volumes/minio/bucket/rl-deepracer-sagemaker/* -mv ${FILENAME}.tar.gz $BACKUP_LOC \ No newline at end of file diff --git a/scripts/training/delete-last-run.sh b/scripts/training/delete-last-run.sh deleted file mode 100755 index b2f5883f..00000000 --- a/scripts/training/delete-last-run.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash - -rm -rf ../../docker/volumes/minio/bucket/rl-deepracer-sagemaker -rm -rf ../../docker/volumes/robo/checkpoint/checkpoint -mkdir ../../docker/volumes/robo/checkpoint/checkpoint -rm -rf /robo/container/* -rm -rf ../../docker/volumes/robo/checkpoint/log/* diff --git a/scripts/training/stop.sh b/scripts/training/stop.sh index 3cc4e387..413f711e 100755 --- a/scripts/training/stop.sh +++ b/scripts/training/stop.sh @@ -2,5 +2,9 @@ docker-compose down -docker stop $(docker ps | awk ' /sagemaker/ { print $1 }') -docker rm $(docker ps -a | awk ' /sagemaker/ { print $1 }') 
+SAGEMAKER=$(docker ps | awk ' /sagemaker/ { print $1 }') +if [[ -n $SAGEMAKER ]]; +then + docker stop $(docker ps | awk ' /sagemaker/ { print $1 }') + docker rm $(docker ps -a | awk ' /sagemaker/ { print $1 }') +fi \ No newline at end of file From 666666cbfbd2b71ed6f203b80c0b60559e7074b6 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Mon, 27 Jan 2020 13:43:44 +0100 Subject: [PATCH 033/428] Updated user documentation (#13) * Updated user documentation * Further updates * EOF line-feeds --- .gitignore | 1 + README.md | 110 +++++++++++++++++++++++++++++++++++++++++++++++++--- activate.sh | 4 +- bin/init.sh | 5 ++- 4 files changed, 111 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index db124a68..7e79a5de 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ docker/volumes/ recording/ recording current-run.env +DONE diff --git a/README.md b/README.md index 0cefddc8..07a881a6 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Main differences to the work done by Alex is: Depending on your needs as well as specific needs of the cloud platform you can configure your VM to your liking. -AWS: +**AWS**: * EC2 instance of type G3, G4, P2 or P3 - recommendation is g4dn.2xlarge * Ubuntu 18.04 * Minimum 30 GB, preferred 40 GB of OS disk. @@ -32,7 +32,7 @@ AWS: * Recommended at least 6 VCPUs * S3 bucket. Preferrably in same region as EC2 instance. -Azure: +**Azure**: * N-Series VM that comes with NVIDIA Graphics Adapter - recommendation is NC6_Standard * Ubuntu 18.04 * Standard 30 GB OS drive is sufficient to get started. @@ -48,16 +48,114 @@ A step by step [installation guide](https://github.com/larsll/deepracer-for-clou The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine. 
git clone https://github.com/larsll/deepracer-for-cloud.git - cd deepracer-for-azure && ./bin/prepare.sh + cd deepracer-for-cloud && ./bin/prepare.sh -This will prepare the VM by partitioning additional drives as well as installing all prerequisites. After a reboot it will continuee to run `./bin/init.sh` setting up the full repository and downloading the core Docker images. +This will prepare the VM by partitioning additional drives as well as installing all prerequisites. After a reboot it will continuee to run `./bin/init.sh` setting up the full repository and downloading the core Docker images. Depending on your environment this may take up to 30 minutes. The scripts will create a file `DONE` once completed. The installation script will adapt `.profile` to ensure that all settings are applied on login. -TODO: Setup of environment. +*TODO: Document how to configure via cloud-init.* + +## Environment Setup + +### AWS + +In AWS it is possible to set up authentication to S3 in two ways: Integrated sign-on using [IAM Roles](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html) or using access keys. + +#### IAM Roles + +To use IAM Roles: +* An empty S3 bucket in the same region as the EC2 instance. +* An IAM Role that has permissions to access both the *new* S3 bucket as well as the DeepRacer bucket. +* An EC2 instance with the IAM Role assigned. +* Configure `current-run.env` as follows: + * `LOCAL_S3_PROFILE=default` + * `LOCAL_S3_BUCKET=` + * `UPLOAD_S3_PROFILE=default` + * `UPLOAD_S3_BUCKET=` +* Run `dr-update` for configuration to take effect. + + +#### Manual setup +For access with IAM user: +* An empty S3 bucket in the same region as the EC2 instance. +* A real AWS IAM user set up with access keys: + * User should have permissions to access the *new* bucket as well as the dedicated DeepRacer S3 bucket. + * Use `aws configure` to configure this into the default profile. 
+* Configure `current-run.env` as follows: + * `LOCAL_S3_PROFILE=default` + * `LOCAL_S3_BUCKET=` + * `UPLOAD_S3_PROFILE=default` + * `UPLOAD_S3_BUCKET=` +* Run `dr-update` for configuration to take effect. + +### Azure + +In Azure mode the script-set requires the following: +* A storage account with a blob container set up with access keys: + * Use `aws configure --profile ` to configure this into a specific profile. + * Access Key ID is the Storage Account name. + * Secret Access Key is the Access Key for the Storage Account. + * The blob container is equivalent to the S3 bucket. +* A real AWS IAM user configured with `aws configure` to enable upload of models into AWS DeepRacer. +* Configure `current-run.env` as follows: + * `LOCAL_S3_PROFILE=` + * `LOCAL_S3_BUCKET=` + * `UPLOAD_S3_PROFILE=default` + * `UPLOAD_S3_BUCKET=` +* Run `dr-update` for configuration to take effect. + +As Azure does not natively support S3 a [minio](https://min.io/product/overview) proxy is set up on port 9000 to allow the containers to communicate and store models. + +If you want to use awscli (`aws`) to manually move files then use `aws $LOCAL_PROFILE_ENDPOINT_URL s3 ...`, as this will set both `--profile` and `--endpoint-url` parameters to match your configuration. + +### Local + +*TODO*. The current script-set does not provide a direct way to host all files locally within the VM. It is possible to work around this by changing `docker\docker-compose-azure.yml` to put minio in a server and not an azure gateway mode. + +### Environment Variables +The scripts assume that a file `current-run.env` is populated with the required values. 
+ +| Variable | Description | +|----------|-------------| +| `CLOUD` | Can be `Azure` or `AWS`; determines how the storage will be configured.| +| `WORLD_NAME` | Defines the track to be used.| +| `NUMBER_OF_TRIALS` | Defines the number of trials in an evaluation session.| +| `CHANGE_START_POSITION` | Determines if the racer shall round-robin the starting position during training sessions. (Recommended to be `True` for initial training.)| +| `PRETRAINED` | Determines if training or evaluation shall be based on the model created in a previous session, held in `s3://{PRETRAINED_S3_BUCKET}/{PRETRAINED_S3_PREFIX}`, accessible by credentials held in profile `{LOCAL_S3_PROFILE}`.| +| `PRETRAINED_S3_BUCKET` | Name of S3 bucket which holds the pretrained model.| +| `PRETRAINED_S3_PREFIX` | Prefix of pretrained model within S3 bucket.| +| `LOCAL_S3_PROFILE` | Name of AWS profile with credentials to be used. Stored in `~/.aws/credentials` unless AWS IAM Roles are used.| +| `LOCAL_S3_BUCKET` | Name of S3 bucket which will be used during the session.| +| `LOCAL_S3_MODEL_PREFIX` | Prefix of model within S3 bucket.| +| `LOCAL_S3_CUSTOM_FILES_PREFIX` | Prefix of configuration files within S3 bucket.| +| `LOCAL_S3_LOGS_PREFIX` | Prefix of log files within S3 bucket. | +| `LOGS_ACCESS_KEY` | Username for local S3 log proxy (minio container).| +| `LOGS_ACCESS_SECRET` | Password for local S3 log proxy (minio container).| + ## Usage -Before every session run `dr-update` to ensure that the environment variables are set correctly. This also creates a set of aliases/commands that makes it easier to operate the setup. (If `dr-update` is not found, try `source activate.sh` to get aliases defined. +Before every session run `dr-update` to ensure that the environment variables are set correctly. This also creates a set of aliases/commands that makes it easier to operate the setup. If `dr-update` is not found, try `source activate.sh` to get aliases defined. 
Ensure that the configuration files are uploaded into the bucket `dr-upload-custom-files`. Start a training with `dr-start-training`. + +### Commands + +| Command | Description | +|---------|-------------| +| `dr-update` | Loads in all scripts and environment variables again.| +| `dr-update-env` | Loads in all environment variables from `current-run.env`.| +| `dr-upload-custom-files` | Uploads changed configuration files from `custom_files/` into `s3://{LOCAL_S3_BUCKET}/custom_files`.| +| `dr-download-custom-files` | Downloads changed configuration files from `s3://{LOCAL_S3_BUCKET}/custom_files` into `custom_files/`.| +| `dr-upload-logs` | Uploads changed Robomaker log files from `/mnt/deepracer/robo/checkpoint/log` into `s3://{LOCAL_S3_BUCKET}/${LOCAL_S3_LOGS_PREFIX}`.| +| `dr-start-training` | Starts a training session in the local VM based on current configuration.| +| `dr-stop-training` | Stops the current local training session. Uploads log files.| +| `dr-start-evaluation` | Starts a evaluation session in the local VM based on current configuration.| +| `dr-stop-evaluation` | Stops the current local evaluation session. Uploads log files.| +| `dr-start-loganalysis` | Starts a Jupyter log-analysis container, available on port 8888.| +| `dr-start-loganalysis` | Stops the Jupyter log-analysis container.| +| `dr-logs-sagemaker` | Displays the logs from the running Sagemaker container.| +| `dr-logs-robomaker` | Displays the logs from the running Robomaker container.| +| `dr-logs-start-proxy` | Starts a local Minio S3 instance on port 9001 to expose files in `/mnt/deepracer/robo/checkpoint/log`. Useful if doing log analysis outside of VM. +| `dr-logs-stop-proxy` | Stops the local Minio S3 instance on port 9001. 
\ No newline at end of file diff --git a/activate.sh b/activate.sh index 1f988045..e31e9490 100644 --- a/activate.sh +++ b/activate.sh @@ -138,11 +138,11 @@ function dr-clean-local { sudo rm -rf /robo/* && sudo rm -rf /mnt/deepracer/robo/checkpoint/* } -function dr-logs-proxy-start { +function dr-logs-start-proxy { docker-compose -f $DIR/docker/docker-compose-log.yml up -d } -function dr-logs-proxy-stop { +function dr-logs-stop-proxy { docker-compose -f $DIR/docker/docker-compose-log.yml down } diff --git a/bin/init.sh b/bin/init.sh index 7902a253..0b53ce20 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -GPUS=$(docker run --gpus all nvidia/cuda:10.2-base nvidia-smi "-L" | awk '/GPU .:/' | wc -l) +GPUS=$(docker run --rm --gpus all nvidia/cuda:10.2-base nvidia-smi "-L" | awk '/GPU .:/' | wc -l) if [ $? -ne 0 ] || [ $GPUS -eq 0 ] then echo "No GPU detected in docker. Please check setup". @@ -77,3 +77,6 @@ fi # ensure our variables are set on startup echo "source $INSTALL_DIR/activate.sh" >> $HOME/.profile + +# mark as done +date | tee $INSTALL_DIR/DONE From e0353512c4f10e8a1c627191af0b458ffc2e3193 Mon Sep 17 00:00:00 2001 From: larsll Date: Tue, 28 Jan 2020 16:56:47 +0000 Subject: [PATCH 034/428] Ensuring that .aws has been created --- bin/init.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/init.sh b/bin/init.sh index 0b53ce20..6f70c9ca 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -18,6 +18,7 @@ mkdir -p $INSTALL_DIR/docker/volumes # create symlink to current user's home .aws directory # NOTE: AWS cli must be installed for this to work # https://docs.aws.amazon.com/cli/latest/userguide/install-linux-al2017.html +mkdir -p $(eval echo "~${USER}")/.aws ln -s $(eval echo "~${USER}")/.aws $INSTALL_DIR/docker/volumes/ # grab local training deepracer repo from crr0004 and log analysis repo from vreadcentric From 39efab69831cb2054f2819a7db617bbb3a5e0c3f Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen 
<59617571+larsll@users.noreply.github.com> Date: Tue, 28 Jan 2020 18:01:41 +0100 Subject: [PATCH 035/428] New upload functionality (#15) * Re-worked upload script. * Adding more descriptions. * Adding Hyperparameters file to upload script. --- scripts/training/upload-snapshot.sh | 105 ---------------------------- scripts/upload/upload-model.sh | 89 +++++++++++++++++++++++ 2 files changed, 89 insertions(+), 105 deletions(-) delete mode 100755 scripts/training/upload-snapshot.sh create mode 100755 scripts/upload/upload-model.sh diff --git a/scripts/training/upload-snapshot.sh b/scripts/training/upload-snapshot.sh deleted file mode 100755 index 51c25c09..00000000 --- a/scripts/training/upload-snapshot.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash - -S3_BUCKET=${UPLOAD_S3_BUCKET} -S3_PREFIX=${UPLOAD_S3_PREFIX} - -WORK_DIR=/mnt/deepracer -MODEL_DIR=${WORK_DIR}/rl-deepracer-sagemaker/model/ -MODEL_REWARD=$(pwd)/../../custom_files/reward.py -MODEL_HYPER=$(pwd)/../../custom_files/hyperparameters.json -MODEL_NAME=$UPLOAD_MODEL_NAME - -display_usage() { - echo -e "\nUsage:\n./upload-snapshot.sh -c checkpoint \n" -} - -# check whether user had supplied -h or --help . If yes display usage -if [[ ( $# == "--help") || $# == "-h" ]]; then -display_usage -exit 0 -fi - -while getopts ":c:" opt; do -case $opt in -c) CHECKPOINT="$OPTARG" -;; -\?) echo "Invalid option -$OPTARG" >&2 -;; -esac -done - -# echo 'checkpoint recieved: ' ${CHECKPOINT} - -if [ -z "$CHECKPOINT" ]; then - echo "Checkpoint not supplied, checking for latest checkpoint" - CHECKPOINT_FILE=$MODEL_DIR"checkpoint" - - if [ ! -f ${CHECKPOINT_FILE} ]; then - echo "Checkpoint file not found!" 
- return 1 - else - echo "found checkpoint index file "$CHECKPOINT_FILE - fi; - - FIRST_LINE=$(head -n 1 $CHECKPOINT_FILE) - CHECKPOINT=`echo $FIRST_LINE | sed "s/[model_checkpoint_path: [^ ]*//"` - CHECKPOINT=`echo $CHECKPOINT | sed 's/[_][^ ]*//'` - CHECKPOINT=`echo $CHECKPOINT | sed 's/"//g'` - echo "latest checkpoint = "$CHECKPOINT -else - echo "Checkpoint supplied: ["${CHECKPOINT}"]" -fi - -mkdir -p $WORK_DIR/tmp_upload && rm -rf $WORK_DIR/tmp_upload/* - -MODEL_FILE=$MODEL_DIR"model_"$CHECKPOINT".pb" -METADATA_FILE=$MODEL_DIR"model_metadata.json" - - -if test ! -f "$MODEL_FILE"; then - echo "$MODEL_FILE doesn't exist" - return 1 -else - cp $MODEL_FILE $WORK_DIR/tmp_upload/ -fi - -if test ! -f "$METADATA_FILE"; then - echo "$METADATA_FILE doesn't exist" - return 1 -else - cp $METADATA_FILE $WORK_DIR/tmp_upload/ -fi - - -for i in $( find $MODEL_DIR -type f -name $CHECKPOINT"*" ); do - cp $i $WORK_DIR/tmp_upload/ -done - -ls ${MODEL_DIR}${CHECKPOINT}_Step-*.ckpt.index | xargs -n 1 basename | sed 's/[.][^ ]*//' - -CONTENT=$(ls ${MODEL_DIR}${CHECKPOINT}_Step-*.ckpt.index | xargs -n 1 basename | sed 's/[.][^ ]*//') -echo ${CONTENT} - -echo 'model_checkpoint_path: "'${CONTENT}'.ckpt"' > $WORK_DIR/tmp_upload/checkpoint - -# # upload files to s3 -for filename in $WORK_DIR/tmp_upload/*; do - aws s3 cp $filename s3://$S3_BUCKET/$S3_PREFIX/model/ -done -aws s3 cp $MODEL_HYPER s3://$S3_BUCKET/$S3_PREFIX/ip/ -# tar -czvf $WORK_DIR/$MODEL_NAME-${CHECKPOINT}-checkpoint.tar.gz $WORK_DIR/checkpoint/* - -# # upload meta-data -aws s3 cp $METADATA_FILE s3://$S3_BUCKET/model-metadata/$MODEL_NAME/ -aws s3 cp $MODEL_REWARD s3://$S3_BUCKET/reward-functions/$MODEL_NAME/reward_function.py - -echo 'done uploading model!' 
- - - - - - - - - diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh new file mode 100755 index 00000000..d6aa6789 --- /dev/null +++ b/scripts/upload/upload-model.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +while getopts ":c:" opt; do +case $opt in +c) OPT_CHECKPOINT="$OPTARG" +;; +\?) echo "Invalid option -$OPTARG" >&2 +;; +esac +done + +TARGET_S3_BUCKET=${UPLOAD_S3_BUCKET} +TARGET_S3_PREFIX=${UPLOAD_S3_PREFIX} + +SOURCE_S3_BUCKET=${LOCAL_S3_BUCKET} +SOURCE_S3_MODEL_PREFIX=${LOCAL_S3_MODEL_PREFIX} +SOURCE_S3_CONFIG=${LOCAL_S3_CUSTOM_FILES_PREFIX} + +WORK_DIR=/mnt/deepracer/tmp/ +mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}model + +# Download information on model. +PARAM_FILE=$(aws s3 sync s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX} ${WORK_DIR} --exclude "*" --include "training_params*" --no-progress | awk '{print $4}' | xargs readlink -f) +if [ -n "$PARAM_FILE" ]; +then + TARGET_METADATA_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/"$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' $PARAM_FILE | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") + TARGET_REWARD_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/"$(awk '/REWARD_FILE_S3_KEY/ {print $2}' $PARAM_FILE | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") + TARGET_METRICS_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/"$(awk '/METRICS_S3_OBJECT_KEY/ {print $2}' $PARAM_FILE | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") + TARGET_HYPERPARAM_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ip/hyperparameters.json" + MODEL_NAME=$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' $PARAM_FILE | awk '{split($0,a,"/"); print a[2] }') + echo "Preparing upload for model ${MODEL_NAME}." +else + echo "No DeepRacer information found in s3://${UPLOAD_S3_BUCKET}/${UPLOAD_S3_PREFIX}. 
Exiting" + exit 1 +fi + + +# Check if metadata-files are available +REWARD_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/reward.py ${WORK_DIR} --no-progress | awk '/reward.py$/ {print $4}'| xargs readlink -f) +METADATA_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/model_metadata.json ${WORK_DIR} --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f) +HYPERPARAM_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/hyperparameters.json ${WORK_DIR} --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink -f) +METRICS_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/metrics/metric.json ${WORK_DIR} --no-progress | awk '/metric.json$/ {print $4}'| xargs readlink -f) + +if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$METRICS_FILE" ] && [ -n "$HYPERPARAM_FILE" ]; +then + echo "All meta-data files found. Looking for checkpoint." + # SOURCE_METADATA_FILE_S3_KEY="s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/reward.py" + # SOURCE_REWARD_FILE="s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/model_metadata.json" + # SOURCE_METRICS_FILE="s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/metrics/metric.json" +fi + +# Download checkpoint file +CHECKPOINT_FILE=$(aws ${LOCAL_PROFILE_ENDPOINT_URL} s3 sync s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ ${WORK_DIR}model --exclude "*" --include "checkpoint" --no-progress | awk '{print $4}' | xargs readlink -f) + +if [ -z "$CHECKPOINT_FILE" ]; then + echo "No checkpoint file available at s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model. Exiting." 
+ exit 1 +fi + +if [ -z "$OPT_CHECKPOINT" ]; then + echo "Checkpoint not supplied, checking for latest checkpoint" + + FIRST_LINE=$(head -n 1 $CHECKPOINT_FILE) + CHECKPOINT_PREFIX=$(echo $FIRST_LINE | sed "s/[model_checkpoint_path: [^ ]*//" | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") + CHECKPOINT=`echo $CHECKPOINT_PREFIX | sed 's/[_][^ ]*//'` + echo "Latest checkpoint = "$CHECKPOINT +else + CHECKPOINT="${OPT_CHECKPOINT}" + CHECKPOINT_PREFIX=$(cat $CHECKPOINT_FILE | grep "all_model_checkpoint_paths: \"$CHECKPOINT" | sed "s/[all_model_checkpoint_paths: [^ ]*//" | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") + echo "Checkpoint supplied: ["${CHECKPOINT}"]" +fi + +# Find checkpoint & model files - download +if [ -n "$CHECKPOINT_PREFIX" ]; then + CHECKPOINT_MODEL_FILES=$(aws ${LOCAL_PROFILE_ENDPOINT_URL} s3 sync s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ ${WORK_DIR}model/ --exclude "*" --include "${CHECKPOINT_PREFIX}*" --include "model_${CHECKPOINT}.pb" --no-progress | awk '{print $4}' | xargs readlink -f) + cp ${METADATA_FILE} ${WORK_DIR}model/ + echo "model_checkpoint_path: \"${CHECKPOINT_PREFIX}\"" | tee ${CHECKPOINT_FILE} +else + echo "Checkpoint not found. Exiting." + exit 1 +fi + +# Upload files +echo "Uploading files for model ${MODEL_NAME} to s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/" +aws s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ +aws s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} +aws s3 cp ${METADATA_FILE} ${TARGET_METADATA_FILE_S3_KEY} +aws s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} +aws s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} From 3cdd90258d2be3597f222fa878555fd6bc042181 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 30 Jan 2020 17:03:23 +0100 Subject: [PATCH 036/428] Updating Upload Functionality (#16) * Re-worked upload script. * Adding more descriptions. * Adding Hyperparameters file to upload script. 
* Further enhancements to upload * Wipe option added * Initial version of list * Tweaks * Finished the list-set-models. Clean-up & activation. * Updating name in config as well. * Updating documentation. --- .gitignore | 1 + README.md | 9 ++- activate.sh | 14 ++++ scripts/upload/list-set-models.sh | 110 ++++++++++++++++++++++++++++++ scripts/upload/upload-model.sh | 82 ++++++++++++++++++---- 5 files changed, 201 insertions(+), 15 deletions(-) create mode 100755 scripts/upload/list-set-models.sh diff --git a/.gitignore b/.gitignore index 7e79a5de..5dab6f30 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ docker/volumes/ recording/ recording current-run.env +current-run.env.bak DONE diff --git a/README.md b/README.md index 07a881a6..377a18f8 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,10 @@ The scripts assume that a file `current-run.env` is populated with the required | `LOCAL_S3_LOGS_PREFIX` | Prefix of log files within S3 bucket. | | `LOGS_ACCESS_KEY` | Username for local S3 log proxy (minio container).| | `LOGS_ACCESS_SECRET` | Password for local S3 log proxy (minio container).| +| `UPLOAD_S3_PROFILE` | AWS Cli profile to be used that holds the 'real' S3 credentials needed to upload a model into AWS DeepRacer.| +| `UPLOAD_S3_BUCKET` | Name of the AWS DeepRacer bucket where models will be uploaded. (Typically starts with `aws-deepracer-`.)| +| `UPLOAD_S3_PREFIX` | Prefix of the target location. 
(Typically starts with `DeepRacer-SageMaker-RoboMaker-comm-`| +| `UPLOAD_MODEL_NAME` | Display name of model, not currently used; `dr-set-upload-model` sets it for readability purposes.| ## Usage @@ -158,4 +162,7 @@ Ensure that the configuration files are uploaded into the bucket `dr-upload-cust | `dr-logs-sagemaker` | Displays the logs from the running Sagemaker container.| | `dr-logs-robomaker` | Displays the logs from the running Robomaker container.| | `dr-logs-start-proxy` | Starts a local Minio S3 instance on port 9001 to expose files in `/mnt/deepracer/robo/checkpoint/log`. Useful if doing log analysis outside of VM. -| `dr-logs-stop-proxy` | Stops the local Minio S3 instance on port 9001. \ No newline at end of file +| `dr-logs-stop-proxy` | Stops the local Minio S3 instance on port 9001. | +| `dr-list-aws-models` | Lists the models that are currently stored in your AWS DeepRacer S3 bucket. | +| `dr-set-upload-model` | Updates the `current-run.env` with the prefix and name of your selected model. | +| `dr-upload-model` | Uploads the model defined in `LOCAL_S3_MODEL_PREFIX` to the AWS DeepRacer S3 prefix defined in `UPLOAD_S3_PREFIX` | \ No newline at end of file diff --git a/activate.sh b/activate.sh index e31e9490..d35354da 100644 --- a/activate.sh +++ b/activate.sh @@ -1,5 +1,6 @@ #!/bin/bash DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +export DR_DIR=$DIR # create directory structure for docker volumes if ! 
(mount | grep /mnt > /dev/null); then @@ -46,6 +47,19 @@ function dr-upload-custom-files { aws $LOCAL_PROFILE_ENDPOINT_URL s3 sync $DIR/custom_files/ $CUSTOM_TARGET } +function dr-upload-model { + dr-update-env && ${DIR}/scripts/upload/upload-model.sh "$@" +} + +function dr-list-aws-models { + dr-update-env && ${DIR}/scripts/upload/list-set-models.sh "$@" +} + +function dr-set-upload-model { + dr-update-env && ${DIR}/scripts/upload/list-set-models.sh "$@" +} + + function dr-upload-logs { if [[ "${CLOUD,,}" == "azure" ]]; then diff --git a/scripts/upload/list-set-models.sh b/scripts/upload/list-set-models.sh new file mode 100755 index 00000000..213bc904 --- /dev/null +++ b/scripts/upload/list-set-models.sh @@ -0,0 +1,110 @@ +#!/bin/bash +#set -x + +usage(){ + echo "Usage: $0 [-h] [-s ] [-c]" + echo " -s model Configures environment to upload into selected model." + echo " -c Use local cache of models." + exit 1 +} + +trap ctrl_c INT + +function ctrl_c() { + echo "Requested to stop." + exit 1 +} + +while getopts ":chs:" opt; do +case $opt in +s) OPT_SET="$OPTARG" +;; +c) OPT_CACHE="cache" +;; +h) usage +;; +\?) echo "Invalid option -$OPTARG" >&2 +usage +;; +esac +done + +TARGET_S3_BUCKET=${UPLOAD_S3_BUCKET} +WORK_DIR=/mnt/deepracer/tmp-list +mkdir -p ${WORK_DIR} + +if [[ -n "${OPT_CACHE}" ]]; +then + PARAM_FILES=$(ls -t "${WORK_DIR}" ) + echo -e "Using local cache..." +else + PARAM_FILES=$(aws s3 ls s3://${TARGET_S3_BUCKET} --recursive | awk '/training_params*/ {print $4}' ) + echo -e "\nLooking for DeepRacer models in s3://${TARGET_S3_BUCKET}...\n" +fi + + +if [[ -z "${PARAM_FILES}" ]]; +then + echo "No models found in s3://{TARGET_S3_BUCKET}. Exiting." 
+ exit 1 +fi + +if [[ -z "${OPT_SET}" ]]; +then + echo "+---------------------------------------------------------------------------+" + printf "| %-40s | %-30s |\n" "Model Name" "Creation Time" + echo "+---------------------------------------------------------------------------+" + + for PARAM_FILE in $PARAM_FILES; do + if [[ -z "${OPT_CACHE}" ]]; then + aws s3 cp s3://${TARGET_S3_BUCKET}/${PARAM_FILE} ${WORK_DIR}/ --quiet + PARAM_FILE_L=$(echo "$PARAM_FILE" | awk '{split($0,a,"/"); print a[2]}') + else + PARAM_FILE_L=$PARAM_FILE + fi + MODIFICATION_TIME=$(stat -c %Y ${WORK_DIR}/${PARAM_FILE_L}) + MODIFICATION_TIME_STR=$(echo "@${MODIFICATION_TIME}" | xargs date -d ) + MODEL_NAME=$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' ${WORK_DIR}/${PARAM_FILE_L} | awk '{split($0,a,"/"); print a[2] }') + printf "| %-40s | %-30s |\n" "$MODEL_NAME" "$MODIFICATION_TIME_STR" + done + + echo "+---------------------------------------------------------------------------+" + echo -e "\nSet the model with dr-set-upload-model -s .\n" +else + echo -e "Looking for DeepRacer model ${OPT_SET} in s3://${TARGET_S3_BUCKET}..." + + for PARAM_FILE in $PARAM_FILES; do + if [[ -z "${OPT_CACHE}" ]]; then + aws s3 cp s3://${TARGET_S3_BUCKET}/${PARAM_FILE} ${WORK_DIR}/ --quiet + PARAM_FILE_L=$(echo "$PARAM_FILE" | awk '{split($0,a,"/"); print a[2]}') + MODEL_NAME=$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' ${WORK_DIR}/${PARAM_FILE_L} | awk '{split($0,a,"/"); print a[2] }') + if [ "${MODEL_NAME}" = "${OPT_SET}" ]; then + MATCHED_PREFIX=$(echo "$PARAM_FILE" | awk '{split($0,a,"/"); print a[1]}') + echo "Found in ${MODEL_NAME} in ${MATCHED_PREFIX}". 
+ break + fi + else + PARAM_FILE_L=$PARAM_FILE + MODEL_NAME=$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' ${WORK_DIR}/${PARAM_FILE_L} | awk '{split($0,a,"/"); print a[2] }') + if [ "${MODEL_NAME}" = "${OPT_SET}" ]; then + MATCHED_PREFIX=$(awk '/SAGEMAKER_SHARED_S3_PREFIX/ {print $2}' ${WORK_DIR}/${PARAM_FILE_L} | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") + echo "Found in ${MODEL_NAME} in ${MATCHED_PREFIX}". + break + fi + fi + done + + CONFIG_FILE=$(echo $DR_DIR/current-run.env) + echo "Configuration file $CONFIG_FILE will be updated." + if [[ -n "${MODEL_NAME}" ]]; + then + read -r -p "Are you sure? [y/N] " response + if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] + then + echo "Aborting." + exit 1 + else + sed -i.bak -re "s/(UPLOAD_S3_PREFIX=).*$/\1$MATCHED_PREFIX/g; s/(UPLOAD_MODEL_NAME=).*$/\1$MODEL_NAME/g" "$CONFIG_FILE" && echo "Done." + fi + fi +fi \ No newline at end of file diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index d6aa6789..c60a38f2 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -1,19 +1,57 @@ #!/bin/bash -while getopts ":c:" opt; do +usage(){ + echo "Usage: $0 [-f] [-w] [-d] [-c ] [-p ]" + echo " -f Force upload. No confirmation question." + echo " -w Wipes the target AWS DeepRacer model structure before upload." + echo " -d Dry-Run mode. Does not perform any write or delete operatios on target." + echo " -c num Uploads specified checkpoint. Default is last checkpoint." + echo " -p model Uploads model in specified S3 prefix." + exit 1 +} + +trap ctrl_c INT + +function ctrl_c() { + echo "Requested to stop." + exit 1 +} + +while getopts ":fwdhc:p:" opt; do case $opt in c) OPT_CHECKPOINT="$OPTARG" +;; +f) OPT_FORCE="True" +;; +d) OPT_DRYRUN="--dryrun" +;; +p) OPT_PREFIX="$OPTARG" +;; +w) OPT_WIPE="--delete" +;; +h) usage ;; \?) 
echo "Invalid option -$OPTARG" >&2 +usage ;; esac done +if [[ -n "${OPT_DRYRUN}" ]]; +then + echo "*** DRYRUN MODE ***" +fi + TARGET_S3_BUCKET=${UPLOAD_S3_BUCKET} TARGET_S3_PREFIX=${UPLOAD_S3_PREFIX} SOURCE_S3_BUCKET=${LOCAL_S3_BUCKET} -SOURCE_S3_MODEL_PREFIX=${LOCAL_S3_MODEL_PREFIX} +if [[ -n "${OPT_PREFIX}" ]]; +then + SOURCE_S3_MODEL_PREFIX=${OPT_PREFIX} +else + SOURCE_S3_MODEL_PREFIX=${LOCAL_S3_MODEL_PREFIX} +fi SOURCE_S3_CONFIG=${LOCAL_S3_CUSTOM_FILES_PREFIX} WORK_DIR=/mnt/deepracer/tmp/ @@ -28,7 +66,7 @@ then TARGET_METRICS_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/"$(awk '/METRICS_S3_OBJECT_KEY/ {print $2}' $PARAM_FILE | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") TARGET_HYPERPARAM_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ip/hyperparameters.json" MODEL_NAME=$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' $PARAM_FILE | awk '{split($0,a,"/"); print a[2] }') - echo "Preparing upload for model ${MODEL_NAME}." + echo "Detected DeepRacer Model ${MODEL_NAME} at s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/." else echo "No DeepRacer information found in s3://${UPLOAD_S3_BUCKET}/${UPLOAD_S3_PREFIX}. 
Exiting" exit 1 @@ -36,10 +74,10 @@ fi # Check if metadata-files are available -REWARD_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/reward.py ${WORK_DIR} --no-progress | awk '/reward.py$/ {print $4}'| xargs readlink -f) -METADATA_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/model_metadata.json ${WORK_DIR} --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f) -HYPERPARAM_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/hyperparameters.json ${WORK_DIR} --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink -f) -METRICS_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/metrics/metric.json ${WORK_DIR} --no-progress | awk '/metric.json$/ {print $4}'| xargs readlink -f) +REWARD_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/reward.py ${WORK_DIR} --no-progress | awk '/reward.py$/ {print $4}'| xargs readlink -f 2> /dev/null) +METADATA_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/model_metadata.json ${WORK_DIR} --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f 2> /dev/null) +HYPERPARAM_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/hyperparameters.json ${WORK_DIR} --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink -f 2> /dev/null) +METRICS_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/metrics/metric.json ${WORK_DIR} --no-progress | awk '/metric.json$/ {print $4}'| xargs readlink -f 2> /dev/null) if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$METRICS_FILE" ] && [ -n "$HYPERPARAM_FILE" ]; then @@ -47,10 +85,14 @@ then # SOURCE_METADATA_FILE_S3_KEY="s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/reward.py" # 
SOURCE_REWARD_FILE="s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/model_metadata.json" # SOURCE_METRICS_FILE="s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/metrics/metric.json" +else + echo "Meta-data files are not found. Exiting." + exit 1 fi # Download checkpoint file -CHECKPOINT_FILE=$(aws ${LOCAL_PROFILE_ENDPOINT_URL} s3 sync s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ ${WORK_DIR}model --exclude "*" --include "checkpoint" --no-progress | awk '{print $4}' | xargs readlink -f) +echo "Looking for model to upload from s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/" +CHECKPOINT_FILE=$(aws ${LOCAL_PROFILE_ENDPOINT_URL} s3 sync s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ ${WORK_DIR}model --exclude "*" --include "checkpoint" --no-progress | awk '{print $4}' | xargs readlink -f 2> /dev/null) if [ -z "$CHECKPOINT_FILE" ]; then echo "No checkpoint file available at s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model. Exiting." @@ -81,9 +123,21 @@ else fi # Upload files -echo "Uploading files for model ${MODEL_NAME} to s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/" -aws s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ -aws s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} -aws s3 cp ${METADATA_FILE} ${TARGET_METADATA_FILE_S3_KEY} -aws s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} -aws s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} +if [[ -z "${OPT_FORCE}" ]]; +then + echo "Ready to upload model ${SOURCE_S3_MODEL_PREFIX} to ${MODEL_NAME} in s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/" + read -r -p "Are you sure? [y/N] " response + if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] + then + echo "Aborting." 
+ exit 1 + fi +fi + +touch ${WORK_DIR}model/.ready +cd ${WORK_DIR} +aws s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ ${OPT_DRYRUN} ${OPT_WIPE} +aws s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} ${OPT_DRYRUN} +aws s3 cp ${METADATA_FILE} ${TARGET_METADATA_FILE_S3_KEY} ${OPT_DRYRUN} +aws s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} +aws s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} ${OPT_DRYRUN} From bbfea14742ddadf531b33280ec5f55528dff0c37 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 1 Feb 2020 16:21:06 +0100 Subject: [PATCH 037/428] Increment training script (#17) * Feature allowing increment of configuration file. * Reducing parameters * Documentation and minor bugs * Typo in variable --- README.md | 6 +- activate.sh | 5 + defaults/template-run.env | 5 +- docker/docker-compose.yml | 6 +- scripts/training/increment.sh | 92 +++++++++++++++++++ .../training/set-last-run-to-pretrained.sh | 12 --- scripts/upload/list-set-models.sh | 6 +- scripts/upload/upload-model.sh | 12 +-- 8 files changed, 114 insertions(+), 30 deletions(-) create mode 100755 scripts/training/increment.sh delete mode 100755 scripts/training/set-last-run-to-pretrained.sh diff --git a/README.md b/README.md index 377a18f8..25eebdd6 100644 --- a/README.md +++ b/README.md @@ -122,14 +122,13 @@ The scripts assume that a file `current-run.env` is populated with the required | `WORLD_NAME` | Defines the track to be used.| | `NUMBER_OF_TRIALS` | Defines the number of trials in an evaluation session.| | `CHANGE_START_POSITION` | Determines if the racer shall round-robin the starting position during training sessions. 
(Recommended to be `True` for initial training.)| -| `PRETRAINED` | Determines if training or evaluation shall be based on the model created in a previous session, held in `s3://{PRETRAINED_S3_BUCKET}/{PRETRAINED_S3_PREFIX}`, accessible by credentials held in profile `{LOCAL_S3_PROFILE}`.| -| `PRETRAINED_S3_BUCKET` | Name of S3 bucket which holds the pretrained model.| -| `PRETRAINED_S3_PREFIX` | Prefix of pretrained model within S3 bucket.| | `LOCAL_S3_PROFILE` | Name of AWS profile with credentials to be used. Stored in `~/.aws/credentials` unless AWS IAM Roles are used.| | `LOCAL_S3_BUCKET` | Name of S3 bucket which will be used during the session.| | `LOCAL_S3_MODEL_PREFIX` | Prefix of model within S3 bucket.| | `LOCAL_S3_CUSTOM_FILES_PREFIX` | Prefix of configuration files within S3 bucket.| | `LOCAL_S3_LOGS_PREFIX` | Prefix of log files within S3 bucket. | +| `LOCAL_S3_PRETRAINED` | Determines if training or evaluation shall be based on the model created in a previous session, held in `s3://{LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`, accessible by credentials held in profile `{LOCAL_S3_PROFILE}`.| +| `LOCAL_S3_PRETRAINED_PREFIX` | Prefix of pretrained model within S3 bucket.| | `LOGS_ACCESS_KEY` | Username for local S3 log proxy (minio container).| | `LOGS_ACCESS_SECRET` | Password for local S3 log proxy (minio container).| | `UPLOAD_S3_PROFILE` | AWS Cli profile to be used that holds the 'real' S3 credentials needed to upload a model into AWS DeepRacer.| @@ -154,6 +153,7 @@ Ensure that the configuration files are uploaded into the bucket `dr-upload-cust | `dr-download-custom-files` | Downloads changed configuration files from `s3://{LOCAL_S3_BUCKET}/custom_files` into `custom_files/`.| | `dr-upload-logs` | Uploads changed Robomaker log files from `/mnt/deepracer/robo/checkpoint/log` into `s3://{LOCAL_S3_BUCKET}/${LOCAL_S3_LOGS_PREFIX}`.| | `dr-start-training` | Starts a training session in the local VM based on current configuration.| +| 
`dr-increment-training` | Updates configuration, setting the current model prefix to pretrained, and incrementing a serial.| | `dr-stop-training` | Stops the current local training session. Uploads log files.| | `dr-start-evaluation` | Starts a evaluation session in the local VM based on current configuration.| | `dr-stop-evaluation` | Stops the current local evaluation session. Uploads log files.| diff --git a/activate.sh b/activate.sh index d35354da..42723a22 100644 --- a/activate.sh +++ b/activate.sh @@ -32,6 +32,7 @@ then export LOCAL_ACCESS_KEY_ID=$(aws --profile $LOCAL_S3_PROFILE configure get aws_access_key_id | xargs) export LOCAL_SECRET_ACCESS_KEY=$(aws --profile $LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) COMPOSE_FILE="$COMPOSE_FILE:$DIR/docker/docker-compose-keys.yml" + export UPLOAD_PROFILE="--profile $UPLOAD_S3_PROFILE" fi export COMPOSE_FILE @@ -90,6 +91,10 @@ function dr-start-training { bash -c "cd $DIR/scripts/training && ./start.sh" } +function dr-increment-training { + dr-update-env && ${DIR}/scripts/training/increment.sh "$@" && dr-update-env +} + function dr-stop-training { dr-upload-logs ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/training && ./stop.sh" diff --git a/defaults/template-run.env b/defaults/template-run.env index 9d7b27f5..21638780 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -2,13 +2,12 @@ CLOUD=Azure WORLD_NAME=Vegas_track NUMBER_OF_TRIALS=5 CHANGE_START_POSITION=True -PRETRAINED=False -PRETRAINED_S3_PREFIX=rl-sagemaker-pretrained -PRETRAINED_S3_BUCKET=bucket UPLOAD_S3_PROFILE=default UPLOAD_S3_BUCKET=aws-deepracer-mybucketidinreal UPLOAD_S3_PREFIX=DeepRacer-SageMaker-RoboMaker-comm-prefix UPLOAD_MODEL_NAME=mymodelname +LOCAL_S3_PRETRAINED=False +LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained LOCAL_S3_PROFILE=default LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker LOCAL_S3_BUCKET=bucket diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 055000ab..b61ebc77 
100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -11,9 +11,9 @@ services: environment: - WORLD_NAME - NUMBER_OF_TRIALS=${NUMBER_OF_TRIALS} - - PRETRAINED - - PRETRAINED_S3_PREFIX - - PRETRAINED_S3_BUCKET + - PRETRAINED=${LOCAL_S3_PRETRAINED} + - PRETRAINED_S3_PREFIX=${LOCAL_S3_PRETRAINED_PREFIX} + - PRETRAINED_S3_BUCKET=${LOCAL_S3_BUCKET} - MODEL_S3_PREFIX=${LOCAL_S3_MODEL_PREFIX} - MODEL_S3_BUCKET=${LOCAL_S3_BUCKET} - MODEL_METADATA_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/model_metadata.json diff --git a/scripts/training/increment.sh b/scripts/training/increment.sh new file mode 100755 index 00000000..1dd8d634 --- /dev/null +++ b/scripts/training/increment.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +usage(){ + echo "Usage: $0 [-f] [-w] [-p ] [-d ]" + echo "" + echo "Command will set the current model to be the pre-trained model and increment a numerical suffix." + echo "-p model Sets the to-be name to be rather than auto-incremeneting the previous model." + echo "-d delim Delimiter in model-name (e.g. '-' in 'test-model-1')" + echo "-f Force. Ask for no confirmations." + echo "-w Wipe the S3 prefix to ensure that two models are not mixed." + exit 1 +} + +trap ctrl_c INT + +function ctrl_c() { + echo "Requested to stop." + exit 1 +} + +OPT_DELIM='-' + +while getopts ":fwp:d:" opt; do +case $opt in + +f) OPT_FORCE="True" +;; +p) OPT_PREFIX="$OPTARG" +;; +w) OPT_WIPE="--delete" +;; +d) OPT_DELIM="$OPTARG" +;; +h) usage +;; +\?) echo "Invalid option -$OPTARG" >&2 +usage +;; +esac +done + +CONFIG_FILE=$(echo $DR_DIR/current-run.env) +echo "Configuration file $CONFIG_FILE will be updated." 
+ +## Read in data +CURRENT_RUN_MODEL=$(awk '/LOCAL_S3_MODEL_PREFIX/ {print $1}' ${CONFIG_FILE} | awk '{split($0,a,"="); print a[2] }') +CURRENT_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL}" | \ + awk -v DELIM="${OPT_DELIM}" '{ n=split($0,a,DELIM); if (a[n] ~ /[0-9]*/) print a[n]; else print ""; }') +if [[ -z ${CURRENT_RUN_MODEL_NUM} ]]; +then + NEW_RUN_MODEL="${CURRENT_RUN_MODEL}${OPT_DELIM}1" +else + NEW_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL_NUM} + 1" | bc ) + NEW_RUN_MODEL=$(echo $CURRENT_RUN_MODEL | sed "s/${CURRENT_RUN_MODEL_NUM}\$/${NEW_RUN_MODEL_NUM}/") +fi + +if [[ -n "${NEW_RUN_MODEL}" ]]; +then + echo "Incrementing model from ${CURRENT_RUN_MODEL} to ${NEW_RUN_MODEL}" + if [[ -z "${OPT_FORCE}" ]]; + then + read -r -p "Are you sure? [y/N] " response + if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] + then + echo "Aborting." + exit 1 + fi + fi + sed -i.bak -re "s/(LOCAL_S3_PRETRAINED_PREFIX=).*$/\1$CURRENT_RUN_MODEL/g; s/(LOCAL_S3_PRETRAINED=).*$/\1True/g; ; s/(LOCAL_S3_MODEL_PREFIX=).*$/\1$NEW_RUN_MODEL/g" "$CONFIG_FILE" && echo "Done." +else + echo "Error in determining new model. Aborting." + exit 1 +fi + +if [[ -n "${OPT_WIPE}" ]]; +then + MODEL_DIR_S3=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 ls s3://${LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} ) + if [[ -n "${MODEL_DIR_S3}" ]]; + then + echo "The new model's S3 prefix s3://${LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} exists. Will wipe." + fi + if [[ -z "${OPT_FORCE}" ]]; + then + read -r -p "Are you sure? [y/N] " response + if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] + then + echo "Aborting." 
+ exit 1 + fi + fi + aws $LOCAL_PROFILE_ENDPOINT_URL s3 rm s3://${LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} --recursive +fi diff --git a/scripts/training/set-last-run-to-pretrained.sh b/scripts/training/set-last-run-to-pretrained.sh deleted file mode 100755 index 27070ac1..00000000 --- a/scripts/training/set-last-run-to-pretrained.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash -Folder=rl-deepracer-sagemaker -if [ -d ../../docker/volumes/minio/bucket/rl-deepracer-sagemaker ]; -then - echo "Folder $Folder exist." - rm -rf ../../docker/volumes/minio/bucket/rl-deepracer-pretrained - mv ../../docker/volumes/minio/bucket/rl-deepracer-sagemaker ../../docker/volumes/minio/bucket/rl-deepracer-pretrained - echo "Done." - -else - echo "Folder $Folder does not exist" -fi diff --git a/scripts/upload/list-set-models.sh b/scripts/upload/list-set-models.sh index 213bc904..a11cba4d 100755 --- a/scripts/upload/list-set-models.sh +++ b/scripts/upload/list-set-models.sh @@ -38,7 +38,7 @@ then PARAM_FILES=$(ls -t "${WORK_DIR}" ) echo -e "Using local cache..." 
else - PARAM_FILES=$(aws s3 ls s3://${TARGET_S3_BUCKET} --recursive | awk '/training_params*/ {print $4}' ) + PARAM_FILES=$(aws ${UPLOAD_PROFILE} s3 ls s3://${TARGET_S3_BUCKET} --recursive | awk '/training_params*/ {print $4}' ) echo -e "\nLooking for DeepRacer models in s3://${TARGET_S3_BUCKET}...\n" fi @@ -57,7 +57,7 @@ then for PARAM_FILE in $PARAM_FILES; do if [[ -z "${OPT_CACHE}" ]]; then - aws s3 cp s3://${TARGET_S3_BUCKET}/${PARAM_FILE} ${WORK_DIR}/ --quiet + aws ${UPLOAD_PROFILE} s3 cp s3://${TARGET_S3_BUCKET}/${PARAM_FILE} ${WORK_DIR}/ --quiet PARAM_FILE_L=$(echo "$PARAM_FILE" | awk '{split($0,a,"/"); print a[2]}') else PARAM_FILE_L=$PARAM_FILE @@ -75,7 +75,7 @@ else for PARAM_FILE in $PARAM_FILES; do if [[ -z "${OPT_CACHE}" ]]; then - aws s3 cp s3://${TARGET_S3_BUCKET}/${PARAM_FILE} ${WORK_DIR}/ --quiet + aws ${UPLOAD_PROFILE} s3 cp s3://${TARGET_S3_BUCKET}/${PARAM_FILE} ${WORK_DIR}/ --quiet PARAM_FILE_L=$(echo "$PARAM_FILE" | awk '{split($0,a,"/"); print a[2]}') MODEL_NAME=$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' ${WORK_DIR}/${PARAM_FILE_L} | awk '{split($0,a,"/"); print a[2] }') if [ "${MODEL_NAME}" = "${OPT_SET}" ]; then diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index c60a38f2..f5ca2fab 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -58,7 +58,7 @@ WORK_DIR=/mnt/deepracer/tmp/ mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}model # Download information on model. 
-PARAM_FILE=$(aws s3 sync s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX} ${WORK_DIR} --exclude "*" --include "training_params*" --no-progress | awk '{print $4}' | xargs readlink -f) +PARAM_FILE=$(aws ${UPLOAD_PROFILE} s3 sync s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX} ${WORK_DIR} --exclude "*" --include "training_params*" --no-progress | awk '{print $4}' | xargs readlink -f) if [ -n "$PARAM_FILE" ]; then TARGET_METADATA_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/"$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' $PARAM_FILE | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") @@ -136,8 +136,8 @@ fi touch ${WORK_DIR}model/.ready cd ${WORK_DIR} -aws s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ ${OPT_DRYRUN} ${OPT_WIPE} -aws s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} ${OPT_DRYRUN} -aws s3 cp ${METADATA_FILE} ${TARGET_METADATA_FILE_S3_KEY} ${OPT_DRYRUN} -aws s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} -aws s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} ${OPT_DRYRUN} +aws ${UPLOAD_PROFILE} s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ ${OPT_DRYRUN} ${OPT_WIPE} +aws ${UPLOAD_PROFILE} s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} ${OPT_DRYRUN} +aws ${UPLOAD_PROFILE} s3 cp ${METADATA_FILE} ${TARGET_METADATA_FILE_S3_KEY} ${OPT_DRYRUN} +aws ${UPLOAD_PROFILE} s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} +aws ${UPLOAD_PROFILE} s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} ${OPT_DRYRUN} From 9f78fe299406a05518f836471bf1e7dffec03bb0 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 2 Feb 2020 11:53:08 +0100 Subject: [PATCH 038/428] Addition of local training feature (#18) * Initial version for local setup * Updates to documentation * Initial version for local setup * Updates to documentation * Added cloud detection script * Azure tweak * Updates * Fix bug * Debugging * Fix sudo for installing package * 
Further bug-fixes * Fixing wong param in init. * Additional fix * Ensuring minio runs --- README.md | 9 ++- activate.sh | 10 ++- bin/detect.sh | 21 ++++++ bin/init.sh | 34 ++++++++- bin/prepare.sh | 127 +++++++++++++++++++------------- defaults/template-run.env | 2 +- docker/docker-compose-local.yml | 23 ++++++ 7 files changed, 166 insertions(+), 60 deletions(-) create mode 100755 bin/detect.sh create mode 100644 docker/docker-compose-local.yml diff --git a/README.md b/README.md index 25eebdd6..78ebae45 100644 --- a/README.md +++ b/README.md @@ -54,10 +54,17 @@ This will prepare the VM by partitioning additional drives as well as installing The installation script will adapt `.profile` to ensure that all settings are applied on login. +For local install it is recommended *not* to run the `bin/prepare.sh` script; it might do more changes than what you want. Rather ensure that all prerequisites are set up and run `bin/init.sh` directly. + *TODO: Document how to configure via cloud-init.* +*TODO: Create a local setup prepare script* ## Environment Setup +The environment is set via the `CLOUD` parameter in `current-run.env`; it can be `Azure`, `AWS` or `Local`. It is case-insensitive. Depending on the value the virtual or native S3 instance will be configured accordingly. + +Note: If in the `bin/prepare.sh` script then the working directory `/mnt/deepracer` will be provided based on the temporary storage partitions made available. If you want to provision the working directory in a different fashion then just ensure that a volume is mounted on `/mnt` or `/mnt/deepracer` with sufficient storage. + ### AWS In AWS it is possible to set up authentication to S3 in two ways: Integrated sign-on using [IAM Roles](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html) or using access keys. @@ -111,7 +118,7 @@ If you want to use awscli (`aws`) to manually move files then use `aws $LOCAL_PR ### Local -*TODO*. 
The current script-set does not provide a direct way to host all files locally within the VM. It is possible to work around this by changing `docker\docker-compose-azure.yml` to put minio in a server and not an azure gateway mode. +Local mode runs a minio server that hosts the data in the `/mnt/deepracer` partition. It is otherwise command-compatible with the Azure setup; as the data is accessible via Minio and not via native S3. ### Environment Variables The scripts assume that a file `current-run.env` is populated with the required values. diff --git a/activate.sh b/activate.sh index 42723a22..f0543c1e 100644 --- a/activate.sh +++ b/activate.sh @@ -21,6 +21,10 @@ if [[ "${CLOUD,,}" == "azure" ]]; then LOCAL_PROFILE_ENDPOINT_URL="--profile $LOCAL_S3_PROFILE --endpoint-url http://localhost:9000" COMPOSE_FILE="$DIR/docker/docker-compose.yml:$DIR/docker/docker-compose-azure.yml" +elif [[ "${CLOUD,,}" == "local" ]]; +then + LOCAL_PROFILE_ENDPOINT_URL="--profile $LOCAL_S3_PROFILE --endpoint-url http://localhost:9000" + COMPOSE_FILE="$DIR/docker/docker-compose.yml:$DIR/docker/docker-compose-local.yml" else LOCAL_PROFILE_ENDPOINT_URL="" COMPOSE_FILE="$DIR/docker/docker-compose.yml" @@ -39,7 +43,7 @@ export COMPOSE_FILE export LOCAL_PROFILE_ENDPOINT_URL function dr-upload-custom-files { - if [[ "${CLOUD,,}" == "azure" ]]; + if [[ "${CLOUD,,}" == "azure" || "${CLOUD,,}" == "local" ]]; then ROBOMAKER_COMMAND="" docker-compose $COMPOSE_FILES up -d minio fi @@ -62,7 +66,7 @@ function dr-set-upload-model { function dr-upload-logs { - if [[ "${CLOUD,,}" == "azure" ]]; + if [[ "${CLOUD,,}" == "azure" || "${CLOUD,,}" == "local" ]]; then ROBOMAKER_COMMAND="" docker-compose $COMPOSE_FILES up -d minio fi @@ -77,7 +81,7 @@ function dr-upload-logs { } function dr-download-custom-files { - if [[ "${CLOUD,,}" == "azure" ]]; + if [[ "${CLOUD,,}" == "azure" || "${CLOUD,,}" == "local" ]]; then ROBOMAKER_COMMAND="" docker-compose $COMPOSE_FILES up -d minio fi diff --git a/bin/detect.sh 
b/bin/detect.sh new file mode 100755 index 00000000..9433aa91 --- /dev/null +++ b/bin/detect.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +## What am I? +if [[ -f /var/run/cloud-init/instance-data.json ]]; +then + # We have a cloud-init environment (Azure or AWS). + CLOUD_NAME=$(jq -r '.v1."cloud-name"' /var/run/cloud-init/instance-data.json) + if [[ "${CLOUD_NAME}" == "azure" ]]; + then + export CLOUD_NAME + export CLOUD_INSTANCETYPE=$(jq -r '.ds."meta_data".imds.compute."vmSize"' /var/run/cloud-init/instance-data.json) + elif [[ "${CLOUD_NAME}" == "aws" ]]; + then + export CLOUD_NAME + export CLOUD_INSTANCETYPE=$(jq -r '.ds."meta-data"."instance-type"' /var/run/cloud-init/instance-data.json) + else + export CLOUD_NAME=local + fi +else + export CLOUD_NAME=local +fi \ No newline at end of file diff --git a/bin/init.sh b/bin/init.sh index 6f70c9ca..b7f3fbd3 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -1,4 +1,24 @@ #!/usr/bin/env bash + +trap ctrl_c INT + +function ctrl_c() { + echo "Requested to stop." + exit 1 +} + +while getopts ":m:c:" opt; do +case $opt in +m) OPT_MOUNT="$OPTARG" +;; +c) OPT_CLOUD="$OPTARG" +;; +\?) echo "Invalid option -$OPTARG" >&2 +exit 1 +;; +esac +done + GPUS=$(docker run --rm --gpus all nvidia/cuda:10.2-base nvidia-smi "-L" | awk '/GPU .:/' | wc -l) if [ $? -ne 0 ] || [ $GPUS -eq 0 ] then @@ -10,8 +30,12 @@ INSTALL_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." 
>/dev/null 2>&1 && pwd cd $INSTALL_DIR # create directory structure for docker volumes -mount /mnt -sudo mkdir -p /mnt/deepracer /mnt/deepracer/recording /mnt/deepracer/robo/checkpoint + +if [[ -n "$OPT_MOUNT" ]]; +then + mount "${OPT_MOUNT}" +fi +sudo mkdir -p /mnt/deepracer /mnt/deepracer/recording /mnt/deepracer/robo/checkpoint /mnt/deepracer/minio/bucket sudo chown -R $(id -u):$(id -g) /mnt/deepracer mkdir -p $INSTALL_DIR/docker/volumes @@ -19,7 +43,7 @@ mkdir -p $INSTALL_DIR/docker/volumes # NOTE: AWS cli must be installed for this to work # https://docs.aws.amazon.com/cli/latest/userguide/install-linux-al2017.html mkdir -p $(eval echo "~${USER}")/.aws -ln -s $(eval echo "~${USER}")/.aws $INSTALL_DIR/docker/volumes/ +ln -sf $(eval echo "~${USER}")/.aws $INSTALL_DIR/docker/volumes/ # grab local training deepracer repo from crr0004 and log analysis repo from vreadcentric # Now as submodules! @@ -48,6 +72,10 @@ cd .. # replace the contents of the rl_deepracer_coach_robomaker.py file with the gpu specific version (this is also where you can edit the hyperparameters) # TODO this file should be genrated from a gui before running training cp $INSTALL_DIR/defaults/template-run.env $INSTALL_DIR/current-run.env +if [[ -n "$OPT_CLOUD" ]]; +then + sed -i "s//$OPT_CLOUD/g" $INSTALL_DIR/current-run.env +fi #set proxys if required for arg in "$@"; diff --git a/bin/prepare.sh b/bin/prepare.sh index 3e9c0fc8..94105204 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -1,10 +1,21 @@ #!/bin/bash +trap ctrl_c INT + +function ctrl_c() { + echo "Requested to stop." 
+ exit 1 +} + DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ## Patch system sudo apt-get update && sudo apt-mark hold grub-pc && sudo DEBIAN_FRONTEND=noninteractive apt-get -y -o \ - DPkg::options::="--force-confdef" -o DPkg::options::="--force-confold" -qq --force-yes upgrade + DPkg::options::="--force-confdef" -o DPkg::options::="--force-confold" -qq --force-yes upgrade && \ + sudo apt-get -y install jq + +source $DIR/detect.sh +echo "Detected cloud type ${CLOUD_NAME}" ## Do I have a GPU GPUS=$(lspci | awk '/NVIDIA/ && /3D controller/' | wc -l) @@ -16,59 +27,64 @@ fi ## Do I have an additional disk for Docker images - looking for /dev/sdc (Azure) -ADDL_DISK=$(lsblk | awk '/^sdc/ {print $1}') -ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}') - -if [ -n $ADDL_DISK ] && [ -z $ADDL_PART]; +if [[ "${CLOUD_NAME}" == "azure" ]]; then - echo "Found $ADDL_DISK, preparing it for use" - echo -e "g\nn\np\n1\n\n\nw\n" | sudo fdisk /dev/$ADDL_DISK - sleep 1s - ADDL_DEVICE=$(echo "/dev/"$ADDL_DISK"1") - sudo mkfs.ext4 $ADDL_DEVICE - sudo mkdir -p /var/lib/docker - echo "$ADDL_DEVICE /var/lib/docker ext4 rw,user,auto 0 0" | sudo tee -a /etc/fstab - mount /var/lib/docker - if [ $? -ne 0 ] + ADDL_DISK=$(lsblk | awk '/^sdc/ {print $1}') + ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}') + + if [ -n $ADDL_DISK ] && [ -z $ADDL_PART]; then - echo "Error during preparing of additional disk. Exiting." - exit 1 - fi -elif [ -n $ADDL_DISK ] && [ -n $ADDL_PART]; -then - echo "Found $ADDL_DISK - $ADDL_PART already mounted. Installing into present drive/directory structure." 
+ echo "Found $ADDL_DISK, preparing it for use" + echo -e "g\nn\np\n1\n\n\nw\n" | sudo fdisk /dev/$ADDL_DISK + sleep 1s + ADDL_DEVICE=$(echo "/dev/"$ADDL_DISK"1") + sudo mkfs.ext4 $ADDL_DEVICE + sudo mkdir -p /var/lib/docker + echo "$ADDL_DEVICE /var/lib/docker ext4 rw,user,auto 0 0" | sudo tee -a /etc/fstab + mount /var/lib/docker + if [ $? -ne 0 ] + then + echo "Error during preparing of additional disk. Exiting." + exit 1 + fi + elif [ -n $ADDL_DISK ] && [ -n $ADDL_PART]; + then + echo "Found $ADDL_DISK - $ADDL_PART already mounted. Installing into present drive/directory structure." -else - echo "Did not find $ADDL_DISK. Installing into present drive/directory structure." + else + echo "Did not find $ADDL_DISK. Installing into present drive/directory structure." + fi fi - ## Do I have an ephemeral disk / temporary storage for runtime output - looking for /dev/nvme0n1 (AWS)? +if [[ "${CLOUD_NAME}" == "aws" ]]; +then -ADDL_DISK=$(lsblk | awk '/^nvme0n1/ {print $1}') -ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}') + ADDL_DISK=$(lsblk | awk '/^nvme0n1/ {print $1}') + ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}') -if [ -n $ADDL_DISK ] && [ -z $ADDL_PART]; -then - echo "Found $ADDL_DISK, preparing it for use" - echo -e "g\nn\np\n1\n\n\nw\n" | sudo fdisk /dev/$ADDL_DISK - sleep 1s - ADDL_DEVICE=$(echo "/dev/"$ADDL_DISK"p1") - sudo mkfs.ext4 $ADDL_DEVICE - sudo mkdir -p /mnt - echo "$ADDL_DEVICE /mnt ext4 rw,user,noauto 0 0" | sudo tee -a /etc/fstab - mount /mnt - if [ $? -ne 0 ] + if [ -n $ADDL_DISK ] && [ -z $ADDL_PART]; then - echo "Error during preparing of temporary disk. Exiting." - exit 1 - fi -elif [ -n $ADDL_DISK ] && [ -n $ADDL_PART]; -then - echo "Found $ADDL_DISK - $ADDL_PART already mounted, taking no action." 
+ echo "Found $ADDL_DISK, preparing it for use" + echo -e "g\nn\np\n1\n\n\nw\n" | sudo fdisk /dev/$ADDL_DISK + sleep 1s + ADDL_DEVICE=$(echo "/dev/"$ADDL_DISK"p1") + sudo mkfs.ext4 $ADDL_DEVICE + sudo mkdir -p /mnt + echo "$ADDL_DEVICE /mnt ext4 rw,user,noauto 0 0" | sudo tee -a /etc/fstab + mount /mnt + if [ $? -ne 0 ] + then + echo "Error during preparing of temporary disk. Exiting." + exit 1 + fi + elif [ -n $ADDL_DISK ] && [ -n $ADDL_PART]; + then + echo "Found $ADDL_DISK - $ADDL_PART already mounted, taking no action." -else - echo "Did not find $ADDL_DISK, taking no action." + else + echo "Did not find $ADDL_DISK, taking no action." + fi fi ## Adding Nvidia Drivers @@ -77,8 +93,8 @@ sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda_learn.list' sudo bash -c 'apt update && apt install -y nvidia-driver-440 cuda-minimal-build-10-2 -o Dpkg::Options::="--force-overwrite"' -## Adding AWSCli and JQ -sudo apt-get install -y awscli jq +## Adding AWSCli +sudo apt-get install -y awscli ## Installing Docker curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - @@ -101,9 +117,16 @@ sudo usermod -a -G docker $(id -un) sudo curl -L https://github.com/docker/compose/releases/download/1.25.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose sudo chmod +x /usr/local/bin/docker-compose -## Reboot to load driver -- continue install -echo "Rebooting in 5 seconds. Will continue with install." -cd $DIR -./runonce.sh ./init.sh -sleep 5s -sudo reboot +## Reboot to load driver -- continue install if in cloud-init +CLOUD_INIT=$(pstree -s $BASHPID | awk /cloud-init/ | wc -l) + +if [[ "$CLOUD_INIT" -ne 0 ]]; +then + echo "Rebooting in 5 seconds. Will continue with install." 
+ cd $DIR + ./runonce.sh "./init.sh -m /mnt -c ${CLOUD_NAME}" + sleep 5s + sudo reboot +else + echo "First stage done. Please reboot and run init.sh" +fi \ No newline at end of file diff --git a/defaults/template-run.env b/defaults/template-run.env index 21638780..70a7471f 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -1,4 +1,4 @@ -CLOUD=Azure +CLOUD= WORLD_NAME=Vegas_track NUMBER_OF_TRIALS=5 CHANGE_START_POSITION=True diff --git a/docker/docker-compose-local.yml b/docker/docker-compose-local.yml new file mode 100644 index 00000000..7bf50bc1 --- /dev/null +++ b/docker/docker-compose-local.yml @@ -0,0 +1,23 @@ +version: '3.7' + +services: + minio: + image: minio/minio + ports: + - "9000:9000" + container_name: minio + command: server /data + restart: unless-stopped + environment: + - MINIO_ACCESS_KEY=${LOCAL_ACCESS_KEY_ID} + - MINIO_SECRET_KEY=${LOCAL_SECRET_ACCESS_KEY} + volumes: + - /mnt/deepracer/minio:/data + rl_coach: + environment: + - S3_ENDPOINT_URL=http://minio:9000 + depends_on: + - minio + robomaker: + environment: + - S3_ENDPOINT_URL=http://minio:9000 From 6fb3a8815f7cc166f4d1e8a3b4fe8f7406661fca Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 2 Feb 2020 15:24:14 +0100 Subject: [PATCH 039/428] Adding nogpu log-analysis (#19) --- activate.sh | 4 ++-- .../{Dockerfile => Dockerfile.gpu} | 0 .../dockerfiles/log-analysis/Dockerfile.nogpu | 24 +++++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) rename docker/dockerfiles/log-analysis/{Dockerfile => Dockerfile.gpu} (100%) create mode 100644 docker/dockerfiles/log-analysis/Dockerfile.nogpu diff --git a/activate.sh b/activate.sh index f0543c1e..c4b40fa3 100644 --- a/activate.sh +++ b/activate.sh @@ -119,7 +119,7 @@ function dr-start-loganalysis { } function dr-stop-loganalysis { - eval LOG_ANALYSIS_ID=$(docker ps | awk ' /log-analysis/ { print $1 }') + eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { 
print $1 }') if [ -n "$LOG_ANALYSIS_ID" ]; then ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/log-analysis && ./stop.sh" else @@ -181,4 +181,4 @@ function dr-update-env { echo "File current-run.env does not exist." exit 1 fi -} \ No newline at end of file +} diff --git a/docker/dockerfiles/log-analysis/Dockerfile b/docker/dockerfiles/log-analysis/Dockerfile.gpu similarity index 100% rename from docker/dockerfiles/log-analysis/Dockerfile rename to docker/dockerfiles/log-analysis/Dockerfile.gpu diff --git a/docker/dockerfiles/log-analysis/Dockerfile.nogpu b/docker/dockerfiles/log-analysis/Dockerfile.nogpu new file mode 100644 index 00000000..7e10d4f3 --- /dev/null +++ b/docker/dockerfiles/log-analysis/Dockerfile.nogpu @@ -0,0 +1,24 @@ +FROM python:3.7.6-slim + +LABEL maintainer="lars@ludvig.no" \ + description="Log Analysis for DeepRacer Training Run" \ + version=1.0 + +# Container Dependency Setup +RUN apt-get update && apt-get upgrade -y && \ + apt-get install --no-install-recommends software-properties-common libsm6 libxext6 libxrender-dev git wget python3-pip -y && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN pip3 install virtualenv && virtualenv /workspace/venv +WORKDIR /workspace/venv +RUN mkdir -p /workspace/venv/data /workspace/venv/logs /workspace/venv/workbook + +# Install common pip packages +WORKDIR /workspace/venv +COPY requirements.txt ./ +RUN . /workspace/venv/bin/activate && pip install --no-cache-dir -r requirements.txt + +EXPOSE 8888 +VOLUME ["/workspace/venv/data", "/workspace/venv/logs", "/root/.aws", "/workspace/venv/workbook"] +CMD . 
/workspace/venv/bin/activate && jupyter lab --ip=0.0.0.0 --port=8888 --allow-root + From 7bb48e48c8c2695a812d5ba58f62971fcdb4b738 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Wed, 26 Feb 2020 11:24:57 +0100 Subject: [PATCH 040/428] Improved error handling (#20) --- scripts/upload/upload-model.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index f5ca2fab..29ea9c8f 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -45,6 +45,18 @@ fi TARGET_S3_BUCKET=${UPLOAD_S3_BUCKET} TARGET_S3_PREFIX=${UPLOAD_S3_PREFIX} +if [[ -z "${UPLOAD_S3_BUCKET}" ]]; +then + echo "No upload bucket defined. Exiting." + exit 1 +fi + +if [[ -z "${UPLOAD_S3_PREFIX}" ]]; +then + echo "No upload prefix defined. Exiting." + exit 1 +fi + SOURCE_S3_BUCKET=${LOCAL_S3_BUCKET} if [[ -n "${OPT_PREFIX}" ]]; then @@ -58,7 +70,7 @@ WORK_DIR=/mnt/deepracer/tmp/ mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}model # Download information on model. 
-PARAM_FILE=$(aws ${UPLOAD_PROFILE} s3 sync s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX} ${WORK_DIR} --exclude "*" --include "training_params*" --no-progress | awk '{print $4}' | xargs readlink -f) +PARAM_FILE=$(aws ${UPLOAD_PROFILE} s3 sync s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX} ${WORK_DIR} --exclude "*" --include "training_params*" --no-progress | awk '{print $4}' | xargs readlink -f 2> /dev/null) if [ -n "$PARAM_FILE" ]; then TARGET_METADATA_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/"$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' $PARAM_FILE | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") From 69a43bb8d3bab1bcccb00987b76f9c6880392a1b Mon Sep 17 00:00:00 2001 From: larsll Date: Tue, 25 Feb 2020 21:23:57 +0000 Subject: [PATCH 041/428] Fixing timing to use simulated time --- defaults/deepracer_racetrack_env.py.patch | 52 ++++++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/defaults/deepracer_racetrack_env.py.patch b/defaults/deepracer_racetrack_env.py.patch index 57e78d8a..b37e1d95 100644 --- a/defaults/deepracer_racetrack_env.py.patch +++ b/defaults/deepracer_racetrack_env.py.patch @@ -1,27 +1,67 @@ diff --git a/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py b/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py -index 569b33c..9f43d62 100644 +index 569b33c..81727af 100644 --- a/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py +++ b/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py -@@ -467,7 +467,7 @@ class DeepRacerRacetrackEnv(gym.Env): +@@ -220,7 +220,7 @@ class DeepRacerRacetrackEnv(gym.Env): + + self.racecar_reset() + self.steps = 0 +- self.simulation_start_time = time.time() ++ 
self.simulation_start_time = rospy.get_time() + self.infer_reward_state(0, 0) + + return self.next_state +@@ -430,7 +430,7 @@ class DeepRacerRacetrackEnv(gym.Env): + current_progress, + closest_waypoint_index, + self.track_length, +- time.time())) ++ rospy.get_time())) + + # Terminate this episode when ready + if done and node_type == SIMULATION_WORKER: +@@ -456,6 +456,7 @@ class DeepRacerRacetrackEnv(gym.Env): + + def finish_episode(self, progress): + # Increment episode count, update start position and direction ++ simulation_end_time = rospy.get_time() + self.episodes += 1 + if self.change_start: + self.start_ndist = (self.start_ndist + ROUND_ROBIN_ADVANCE_DIST) % 1.0 +@@ -467,31 +468,36 @@ class DeepRacerRacetrackEnv(gym.Env): # Update metrics based on job type if self.job_type == TRAINING_JOB: self.send_reward_to_cloudwatch(self.reward_in_episode) - self.update_training_metrics() -+ self.update_training_metrics(progress) ++ self.update_training_metrics(progress, simulation_end_time) self.write_metrics_to_s3() if self.is_training_done(): self.cancel_simulation_job() -@@ -485,13 +485,18 @@ class DeepRacerRacetrackEnv(gym.Env): + elif self.job_type == EVALUATION_JOB: + self.number_of_trials += 1 +- self.update_eval_metrics(progress) ++ self.update_eval_metrics(progress, simulation_end_time) + self.write_metrics_to_s3() + +- def update_eval_metrics(self, progress): ++ def update_eval_metrics(self, progress, simulation_end_time): + eval_metric = {} + eval_metric['completion_percentage'] = int(progress) + eval_metric['metric_time'] = int(round(time.time() * 1000)) + eval_metric['start_time'] = int(round(self.simulation_start_time * 1000)) +- eval_metric['elapsed_time_in_milliseconds'] = int(round((time.time() - self.simulation_start_time) * 1000)) ++ eval_metric['elapsed_time_in_milliseconds'] = int(round((simulation_end_time - self.simulation_start_time) * 1000)) eval_metric['trial'] = int(self.number_of_trials) self.metrics.append(eval_metric) - def 
update_training_metrics(self): -+ def update_training_metrics(self, progress = 1): ++ def update_training_metrics(self, progress, simulation_end_time): training_metric = {} training_metric['reward_score'] = int(round(self.reward_in_episode)) training_metric['metric_time'] = int(round(time.time() * 1000)) training_metric['start_time'] = int(round(self.simulation_start_time * 1000)) - training_metric['elapsed_time_in_milliseconds'] = int(round((time.time() - self.simulation_start_time) * 1000)) +- training_metric['elapsed_time_in_milliseconds'] = int(round((time.time() - self.simulation_start_time) * 1000)) ++ training_metric['elapsed_time_in_milliseconds'] = int(round((simulation_end_time - self.simulation_start_time) * 1000)) training_metric['episode'] = int(self.episodes) + training_metric['completion_percentage'] = int(progress) + if int(progress) == 100: From b2d517ce18982e174d371c6c7dfcfbb70bbdbf67 Mon Sep 17 00:00:00 2001 From: larsll Date: Thu, 27 Feb 2020 15:50:02 +0000 Subject: [PATCH 042/428] Improving scripts --- activate.sh | 3 ++- scripts/training/start.sh | 53 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/activate.sh b/activate.sh index c4b40fa3..9f404e74 100644 --- a/activate.sh +++ b/activate.sh @@ -158,7 +158,8 @@ function dr-logs-loganalysis { function dr-clean-local { dr-stop-training - sudo rm -rf /robo/* && sudo rm -rf /mnt/deepracer/robo/checkpoint/* + dr-stop-loganalysis + sudo rm -rf /robo/* && sudo rm -rf /mnt/deepracer/robo/* } function dr-logs-start-proxy { diff --git a/scripts/training/start.sh b/scripts/training/start.sh index d94b431e..7f315e8d 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -1,6 +1,59 @@ #!/usr/bin/env bash +usage(){ + echo "Usage: $0 [-f] [-k]" + echo "" + echo "Command will start training." + echo "-f Force deletion of model path. Ask for no confirmations." 
+ echo "-k Keep model path" + exit 1 +} + +trap ctrl_c INT + +function ctrl_c() { + echo "Requested to stop." + exit 1 +} + +OPT_DELIM='-' + +while getopts ":fkh" opt; do +case $opt in + +f) OPT_FORCE="True" +;; +k) OPT_KEEP="Keep" +;; +h) usage +;; +\?) echo "Invalid option -$OPTARG" >&2 +usage +;; +esac +done + export ROBOMAKER_COMMAND="./run.sh build distributed_training.launch" + +if [[ -z "${OPT_KEEP}" ]]; +then + MODEL_DIR_S3=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 ls s3://${LOCAL_S3_BUCKET}/${LOCAL_S3_MODEL_PREFIX} ) + if [[ -n "${MODEL_DIR_S3}" ]]; + then + echo "The new model's S3 prefix s3://${LOCAL_S3_BUCKET}/${LOCAL_S3_MODEL_PREFIX} exists. Will wipe." + if [[ -z "${OPT_FORCE}" ]]; + then + read -r -p "Are you sure? [y/N] " response + if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] + then + echo "Aborting." + exit 1 + fi + fi + aws $LOCAL_PROFILE_ENDPOINT_URL s3 rm s3://${LOCAL_S3_BUCKET}/${LOCAL_S3_MODEL_PREFIX} --recursive + fi +fi + docker-compose up -d echo 'waiting for containers to start up...' 
From 8ea6fa89bc722fa74f0ee2aeea40618b46fe2af4 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 2 Apr 2020 19:31:50 +0200 Subject: [PATCH 043/428] Upgrading to 2020 Setup (#23) * Initial changes for v2 setup * Temporary upload version * Enabling GPU Robomaker * Updating structure and docker * Adding default files * Removing link to Deepracer Workshops * Removed last external module * Removing build steps and local dockerfiles * Remove 'logs proxy' * Remove automatic upload of logs * Log-analysis changes * Updated scripts for v2 * Bugfixing in clean VM * Bugfixes * File name changes * Moving activate.sh * Bugfix * Update README.md * Fixing activation script * Fix names * Bugfixing * Setting CPU as standard for Robomaker --- .gitignore | 1 + .gitmodules | 9 - README.md | 93 +++++----- aws-deepracer-workshops | 1 - activate.sh => bin/activate.sh | 111 +++++------- bin/init.sh | 56 +++--- bin/prepare.sh | 2 +- deepracer | 1 - defaults/deepracer_racetrack_env.py.patch | 73 -------- defaults/model_metadata.json | 26 +++ defaults/reward_function.py | 33 ++++ defaults/rl_coach_env.sh.patch | 14 -- defaults/rl_deepracer_coach_robomaker.py | 167 ------------------ defaults/robomaker.env.patch | 12 -- defaults/template-run.env | 43 +++-- docker/.env | 36 ---- docker/docker-compose-azure.yml | 8 +- docker/docker-compose-keys.yml | 10 +- docker/docker-compose-local.yml | 4 +- docker/docker-compose-log.yml | 19 -- docker/docker-compose.yml | 53 +++--- .../deepracer_robomaker/Dockerfile | 59 ------- .../dockerfiles/log-analysis/Dockerfile.gpu | 26 --- .../dockerfiles/log-analysis/Dockerfile.nogpu | 24 --- .../dockerfiles/log-analysis/requirements.txt | 14 -- docker/dockerfiles/rl_coach/Dockerfile | 30 ---- scripts/log-analysis/start.sh | 10 +- scripts/log-analysis/stop.sh | 3 +- scripts/training/increment.sh | 10 +- scripts/training/prepare-config.py | 52 ++++++ scripts/training/start.sh | 55 +++--- 
scripts/training/stop.sh | 2 +- scripts/training/temp.yml | 24 +++ scripts/upload/list-set-models.sh | 10 +- scripts/upload/upload-model.sh | 73 ++++---- 35 files changed, 385 insertions(+), 779 deletions(-) delete mode 100644 .gitmodules delete mode 160000 aws-deepracer-workshops rename activate.sh => bin/activate.sh (50%) delete mode 160000 deepracer delete mode 100644 defaults/deepracer_racetrack_env.py.patch create mode 100644 defaults/model_metadata.json create mode 100644 defaults/reward_function.py delete mode 100644 defaults/rl_coach_env.sh.patch delete mode 100644 defaults/rl_deepracer_coach_robomaker.py delete mode 100644 defaults/robomaker.env.patch delete mode 100644 docker/.env delete mode 100644 docker/docker-compose-log.yml delete mode 100644 docker/dockerfiles/deepracer_robomaker/Dockerfile delete mode 100644 docker/dockerfiles/log-analysis/Dockerfile.gpu delete mode 100644 docker/dockerfiles/log-analysis/Dockerfile.nogpu delete mode 100644 docker/dockerfiles/log-analysis/requirements.txt delete mode 100644 docker/dockerfiles/rl_coach/Dockerfile create mode 100755 scripts/training/prepare-config.py create mode 100644 scripts/training/temp.yml diff --git a/.gitignore b/.gitignore index 5dab6f30..d1861aab 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .vscode/ custom_files/ analysis/ +logs/ docker/volumes/ recording/ recording diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index c65cf66a..00000000 --- a/.gitmodules +++ /dev/null @@ -1,9 +0,0 @@ -[submodule "deepracer"] - path = deepracer - url = https://github.com/crr0004/deepracer.git - ignore = dirty -[submodule "aws-deepracer-workshops"] - path = aws-deepracer-workshops - url = https://github.com/breadcentric/aws-deepracer-workshops.git - branch = enhance-log-analysis - ignore = dirty diff --git a/README.md b/README.md index 78ebae45..acac78a3 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,7 @@ # DeepRacer-For-Cloud Provides a quick and easy way to get up and running 
with a DeepRacer training environment in Azure or AWS, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing). -This repo is an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer) - -Please refer to Chris' repo to understand more about what's going on under the covers. +This repo started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://gitbub.com/larsll/deepracer-build) repository. Main differences to the work done by Alex is: * Runtime S3 storage is setup to fit the connected cloud platform: @@ -14,10 +12,9 @@ Main differences to the work done by Alex is: * `custom_files/hyperparameters.json` stores the runtime hyperparameters, which logically belongs together with the model_metadata.json and rewards.py files. * `current-run.env` contains user session configuration (pretraining, track etc.) as well as information about where to upload your model (S3 bucket and prefix). * `docker/.env` remains the home for more static configuration. This is not expected to change between sessions. 
-* Runtime storage: Uses `/mnt` to store robomaker files (checkpoints, logs); depending on setup these will normally be deleted between runs, but Azure and AWS provides 200+ GB free storage which is very suitable for this purpuse. Archiving of logs and additional checkpoint files required if desired. +* Runtime storage: Uses `/mnt` to store robomaker files (checkpoints, logs); depending on setup these will normally be deleted between runs, but Azure and AWS provides 200+ GB free storage which is very suitable for this purpuse. Archiving of logs and additional checkpoint files required if desired. (Update: as of V2 this is less important as the robomaker is now cleaning up on itsown) * Azure: Uses the normal temporary drive which is mounted on /mnt by default. * AWS: Preparation scripts mounts the ephemeral drive on /mnt -* Robomaker, RL Coach and Log Analysis Docker images are now available as downloads in [Docker Hub](https://hub.docker.com/search?q=larsll%2Fdeepracer&type=image), which reduces the time to build a new VM. Log analysis is not downloaded by default to reduce required disk space. ## Requirements @@ -52,7 +49,7 @@ The package comes with preparation and setup scripts that would allow a turn-key This will prepare the VM by partitioning additional drives as well as installing all prerequisites. After a reboot it will continuee to run `./bin/init.sh` setting up the full repository and downloading the core Docker images. Depending on your environment this may take up to 30 minutes. The scripts will create a file `DONE` once completed. -The installation script will adapt `.profile` to ensure that all settings are applied on login. +The installation script will adapt `.profile` to ensure that all settings are applied on login. Otherwise run the activation with `source bin/activate.sh`. For local install it is recommended *not* to run the `bin/prepare.sh` script; it might do more changes than what you want. 
Rather ensure that all prerequisites are set up and run `bin/init.sh` directly. @@ -73,15 +70,17 @@ In AWS it is possible to set up authentication to S3 in two ways: Integrated sig To use IAM Roles: * An empty S3 bucket in the same region as the EC2 instance. -* An IAM Role that has permissions to access both the *new* S3 bucket as well as the DeepRacer bucket. +* An IAM Role that has permissions to: + * Access both the *new* S3 bucket as well as the DeepRacer bucket. + * AmazonVPCReadOnlyAccess + * AmazonKinesisVideoStreamsFullAccess if you want to stream to Kinesis * An EC2 instance with the IAM Role assigned. * Configure `current-run.env` as follows: - * `LOCAL_S3_PROFILE=default` - * `LOCAL_S3_BUCKET=` - * `UPLOAD_S3_PROFILE=default` - * `UPLOAD_S3_BUCKET=` -* Run `dr-update` for configuration to take effect. - + * `DR_LOCAL_S3_PROFILE=default` + * `DR_LOCAL_S3_BUCKET=` + * `DR_UPLOAD_S3_PROFILE=default` + * `DR_UPLOAD_S3_BUCKET=` +* Run `dr-update-env` for configuration to take effect. #### Manual setup For access with IAM user: @@ -90,10 +89,10 @@ For access with IAM user: * User should have permissions to access the *new* bucket as well as the dedicated DeepRacer S3 bucket. * Use `aws configure` to configure this into the default profile. * Configure `current-run.env` as follows: - * `LOCAL_S3_PROFILE=default` - * `LOCAL_S3_BUCKET=` - * `UPLOAD_S3_PROFILE=default` - * `UPLOAD_S3_BUCKET=` + * `DR_LOCAL_S3_PROFILE=default` + * `DR_LOCAL_S3_BUCKET=` + * `DR_UPLOAD_S3_PROFILE=default` + * `DR_UPLOAD_S3_BUCKET=` * Run `dr-update` for configuration to take effect. ### Azure @@ -106,15 +105,15 @@ In Azure mode the script-set requires the following: * The blob container is equivalent to the S3 bucket. * A real AWS IAM user configured with `aws configure` to enable upload of models into AWS DeepRacer. 
* Configure `current-run.env` as follows: - * `LOCAL_S3_PROFILE=` - * `LOCAL_S3_BUCKET=` - * `UPLOAD_S3_PROFILE=default` - * `UPLOAD_S3_BUCKET=` + * `DR_LOCAL_S3_PROFILE=` + * `DR_LOCAL_S3_BUCKET=` + * `DR_UPLOAD_S3_PROFILE=default` + * `DR_UPLOAD_S3_BUCKET=` * Run `dr-update` for configuration to take effect. As Azure does not natively support S3 a [minio](https://min.io/product/overview) proxy is set up on port 9000 to allow the containers to communicate and store models. -If you want to use awscli (`aws`) to manually move files then use `aws $LOCAL_PROFILE_ENDPOINT_URL s3 ...`, as this will set both `--profile` and `--endpoint-url` parameters to match your configuration. +If you want to use awscli (`aws`) to manually move files then use `aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 ...`, as this will set both `--profile` and `--endpoint-url` parameters to match your configuration. ### Local @@ -125,24 +124,29 @@ The scripts assume that a file `current-run.env` is populated with the required | Variable | Description | |----------|-------------| -| `CLOUD` | Can be `Azure` or `AWS`; determines how the storage will be configured.| -| `WORLD_NAME` | Defines the track to be used.| -| `NUMBER_OF_TRIALS` | Defines the number of trials in an evaluation session.| -| `CHANGE_START_POSITION` | Determines if the racer shall round-robin the starting position during training sessions. (Recommended to be `True` for initial training.)| -| `LOCAL_S3_PROFILE` | Name of AWS profile with credentials to be used. Stored in `~/.aws/credentials` unless AWS IAM Roles are used.| -| `LOCAL_S3_BUCKET` | Name of S3 bucket which will be used during the session.| -| `LOCAL_S3_MODEL_PREFIX` | Prefix of model within S3 bucket.| -| `LOCAL_S3_CUSTOM_FILES_PREFIX` | Prefix of configuration files within S3 bucket.| -| `LOCAL_S3_LOGS_PREFIX` | Prefix of log files within S3 bucket. 
| -| `LOCAL_S3_PRETRAINED` | Determines if training or evaluation shall be based on the model created in a previous session, held in `s3://{LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`, accessible by credentials held in profile `{LOCAL_S3_PROFILE}`.| -| `LOCAL_S3_PRETRAINED_PREFIX` | Prefix of pretrained model within S3 bucket.| -| `LOGS_ACCESS_KEY` | Username for local S3 log proxy (minio container).| -| `LOGS_ACCESS_SECRET` | Password for local S3 log proxy (minio container).| -| `UPLOAD_S3_PROFILE` | AWS Cli profile to be used that holds the 'real' S3 credentials needed to upload a model into AWS DeepRacer.| -| `UPLOAD_S3_BUCKET` | Name of the AWS DeepRacer bucket where models will be uploaded. (Typically starts with `aws-deepracer-`.)| -| `UPLOAD_S3_PREFIX` | Prefix of the target location. (Typically starts with `DeepRacer-SageMaker-RoboMaker-comm-`| -| `UPLOAD_MODEL_NAME` | Display name of model, not currently used; `dr-set-upload-model` sets it for readability purposes.| - +| `DR_CLOUD` | Can be `Azure` or `AWS`; determines how the storage will be configured.| +| `DR_WORLD_NAME` | Defines the track to be used.| +| `DR_NUMBER_OF_TRIALS` | Defines the number of trials in an evaluation session.| +| `DR_CHANGE_START_POSITION` | Determines if the racer shall round-robin the starting position during training sessions. (Recommended to be `True` for initial training.)| +| `DR_LOCAL_S3_PROFILE` | Name of AWS profile with credentials to be used. 
Stored in `~/.aws/credentials` unless AWS IAM Roles are used.| +| `DR_LOCAL_S3_BUCKET` | Name of S3 bucket which will be used during the session.| +| `DR_LOCAL_S3_MODEL_PREFIX` | Prefix of model within S3 bucket.| +| `DR_LOCAL_S3_CUSTOM_FILES_PREFIX` | Prefix of configuration files within S3 bucket.| +| `DR_LOCAL_S3_PRETRAINED` | Determines if training or evaluation shall be based on the model created in a previous session, held in `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`, accessible by credentials held in profile `{DR_LOCAL_S3_PROFILE}`.| +| `DR_LOCAL_S3_PRETRAINED_PREFIX` | Prefix of pretrained model within S3 bucket.| +| `DR_LOCAL_S3_PARAMS_FILE` | YAML file path used to configure Robomaker relative to `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`.| +| `DR_UPLOAD_S3_PROFILE` | AWS Cli profile to be used that holds the 'real' S3 credentials needed to upload a model into AWS DeepRacer.| +| `DR_UPLOAD_S3_BUCKET` | Name of the AWS DeepRacer bucket where models will be uploaded. (Typically starts with `aws-deepracer-`.)| +| `DR_UPLOAD_S3_PREFIX` | Prefix of the target location. (Typically starts with `DeepRacer-SageMaker-RoboMaker-comm-`| +| `DR_UPLOAD_MODEL_NAME` | Display name of model, not currently used; `dr-set-upload-model` sets it for readability purposes.| +| `DR_CAR_COLOR` | Color of car | +| `DR_CAR_NAME` | Display name of car; shows in Deepracer Console when uploading. | +| `DR_AWS_APP_REGION` | (AWS only) Region for other AWS resources (e.g. Kinesis) | +| `DR_KINESIS_STREAM_NAME` | Kinesis stream name | +| `DR_KINESIS_STREAM_ENABLE` | Enable or disable Kinesis Stream | +| `DR_GUI_ENABLE` | Enable or disable the Gazebo GUI in Robomaker | +| `DR_GPU_AVAILABLE` | Is GPU enabled? 
| +| `DR_DOCKER_IMAGE_TYPE` | `cpu` or `gpu`; docker images will be used based on this | ## Usage @@ -156,9 +160,8 @@ Ensure that the configuration files are uploaded into the bucket `dr-upload-cust |---------|-------------| | `dr-update` | Loads in all scripts and environment variables again.| | `dr-update-env` | Loads in all environment variables from `current-run.env`.| -| `dr-upload-custom-files` | Uploads changed configuration files from `custom_files/` into `s3://{LOCAL_S3_BUCKET}/custom_files`.| -| `dr-download-custom-files` | Downloads changed configuration files from `s3://{LOCAL_S3_BUCKET}/custom_files` into `custom_files/`.| -| `dr-upload-logs` | Uploads changed Robomaker log files from `/mnt/deepracer/robo/checkpoint/log` into `s3://{LOCAL_S3_BUCKET}/${LOCAL_S3_LOGS_PREFIX}`.| +| `dr-upload-custom-files` | Uploads changed configuration files from `custom_files/` into `s3://{DR_LOCAL_S3_BUCKET}/custom_files`.| +| `dr-download-custom-files` | Downloads changed configuration files from `s3://{DR_LOCAL_S3_BUCKET}/custom_files` into `custom_files/`.| | `dr-start-training` | Starts a training session in the local VM based on current configuration.| | `dr-increment-training` | Updates configuration, setting the current model prefix to pretrained, and incrementing a serial.| | `dr-stop-training` | Stops the current local training session. Uploads log files.| @@ -168,8 +171,6 @@ Ensure that the configuration files are uploaded into the bucket `dr-upload-cust | `dr-start-loganalysis` | Stops the Jupyter log-analysis container.| | `dr-logs-sagemaker` | Displays the logs from the running Sagemaker container.| | `dr-logs-robomaker` | Displays the logs from the running Robomaker container.| -| `dr-logs-start-proxy` | Starts a local Minio S3 instance on port 9001 to expose files in `/mnt/deepracer/robo/checkpoint/log`. Useful if doing log analysis outside of VM. -| `dr-logs-stop-proxy` | Stops the local Minio S3 instance on port 9001. 
| | `dr-list-aws-models` | Lists the models that are currently stored in your AWS DeepRacer S3 bucket. | | `dr-set-upload-model` | Updates the `current-run.env` with the prefix and name of your selected model. | -| `dr-upload-model` | Uploads the model defined in `LOCAL_S3_MODEL_PREFIX` to the AWS DeepRacer S3 prefix defined in `UPLOAD_S3_PREFIX` | \ No newline at end of file +| `dr-upload-model` | Uploads the model defined in `DR_LOCAL_S3_MODEL_PREFIX` to the AWS DeepRacer S3 prefix defined in `DR_UPLOAD_S3_PREFIX` | diff --git a/aws-deepracer-workshops b/aws-deepracer-workshops deleted file mode 160000 index 757fa87f..00000000 --- a/aws-deepracer-workshops +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 757fa87f2b5246a5ce451158dfdaedc99a3927b0 diff --git a/activate.sh b/bin/activate.sh similarity index 50% rename from activate.sh rename to bin/activate.sh index 9f404e74..33d7f64e 100644 --- a/activate.sh +++ b/bin/activate.sh @@ -1,9 +1,25 @@ #!/bin/bash -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +function dr-update-env { + if [[ -f "$DIR/current-run.env" ]] + then + LINES=$(grep -v '^#' $DIR/current-run.env) + for l in $LINES; do + env_var=$(echo $l | cut -f1 -d\=) + env_val=$(echo $l | cut -f2 -d\=) + eval "export $env_var=$env_val" + done + else + echo "File current-run.env does not exist." + exit 1 + fi +} + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +DIR="$( dirname $SCRIPT_DIR )" export DR_DIR=$DIR # create directory structure for docker volumes -if ! (mount | grep /mnt > /dev/null); then +if [[ $(mount | grep /mnt | wc -l) -ne 0 ]]; then mount /mnt fi sudo mkdir -p /mnt/deepracer /mnt/deepracer/recording @@ -11,45 +27,47 @@ sudo chown $(id -u):$(id -g) /mnt/deepracer if [[ -f "$DIR/current-run.env" ]] then - export $(grep -v '^#' $DIR/current-run.env | xargs) + dr-update-env else echo "File current-run.env does not exist." 
exit 1 fi -if [[ "${CLOUD,,}" == "azure" ]]; +if [[ "${DR_CLOUD,,}" == "azure" ]]; then - LOCAL_PROFILE_ENDPOINT_URL="--profile $LOCAL_S3_PROFILE --endpoint-url http://localhost:9000" - COMPOSE_FILE="$DIR/docker/docker-compose.yml:$DIR/docker/docker-compose-azure.yml" -elif [[ "${CLOUD,,}" == "local" ]]; + export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" + DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_ENDPOINT_URL" + DR_COMPOSE_FILE="$DIR/docker/docker-compose.yml:$DIR/docker/docker-compose-azure.yml" +elif [[ "${DR_CLOUD,,}" == "local" ]]; then - LOCAL_PROFILE_ENDPOINT_URL="--profile $LOCAL_S3_PROFILE --endpoint-url http://localhost:9000" - COMPOSE_FILE="$DIR/docker/docker-compose.yml:$DIR/docker/docker-compose-local.yml" + export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" + DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_ENDPOINT_URL" + DR_COMPOSE_FILE="$DIR/docker/docker-compose.yml:$DIR/docker/docker-compose-local.yml" else - LOCAL_PROFILE_ENDPOINT_URL="" - COMPOSE_FILE="$DIR/docker/docker-compose.yml" + DR_LOCAL_PROFILE_ENDPOINT_URL="" + DR_COMPOSE_FILE="$DIR/docker/docker-compose.yml" fi ## Check if we have an AWS IAM assumed role, or if we need to set specific credentials. 
if [ $(aws sts get-caller-identity | jq '.Arn' | awk /assumed-role/ | wc -l) -eq 0 ]; then - export LOCAL_ACCESS_KEY_ID=$(aws --profile $LOCAL_S3_PROFILE configure get aws_access_key_id | xargs) - export LOCAL_SECRET_ACCESS_KEY=$(aws --profile $LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) - COMPOSE_FILE="$COMPOSE_FILE:$DIR/docker/docker-compose-keys.yml" - export UPLOAD_PROFILE="--profile $UPLOAD_S3_PROFILE" + export DR_LOCAL_ACCESS_KEY_ID=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_access_key_id | xargs) + export DR_LOCAL_SECRET_ACCESS_KEY=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) + DR_COMPOSE_FILE="$DR_COMPOSE_FILE:$DIR/docker/docker-compose-keys.yml" + export DR_UPLOAD_PROFILE="--profile $DR_UPLOAD_S3_PROFILE" fi -export COMPOSE_FILE -export LOCAL_PROFILE_ENDPOINT_URL +export DR_COMPOSE_FILE +export DR_LOCAL_PROFILE_ENDPOINT_URL function dr-upload-custom-files { - if [[ "${CLOUD,,}" == "azure" || "${CLOUD,,}" == "local" ]]; + if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; then - ROBOMAKER_COMMAND="" docker-compose $COMPOSE_FILES up -d minio + ROBOMAKER_COMMAND="" docker-compose $DR_COMPOSE_FILES up -d minio fi - eval CUSTOM_TARGET=$(echo s3://$LOCAL_S3_BUCKET/$LOCAL_S3_CUSTOM_FILES_PREFIX/) + eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Uploading files to $CUSTOM_TARGET" - aws $LOCAL_PROFILE_ENDPOINT_URL s3 sync $DIR/custom_files/ $CUSTOM_TARGET + aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DIR/custom_files/ $CUSTOM_TARGET } function dr-upload-model { @@ -64,35 +82,19 @@ function dr-set-upload-model { dr-update-env && ${DIR}/scripts/upload/list-set-models.sh "$@" } - -function dr-upload-logs { - if [[ "${CLOUD,,}" == "azure" || "${CLOUD,,}" == "local" ]]; - then - ROBOMAKER_COMMAND="" docker-compose $COMPOSE_FILES up -d minio - fi - if [ -d /mnt/deepracer/robo/checkpoint/log/ ]; - then - eval CUSTOM_TARGET=$(echo 
s3://$LOCAL_S3_BUCKET/$LOCAL_S3_LOGS_PREFIX/) - echo "Uploading files to $CUSTOM_TARGET" - aws $LOCAL_PROFILE_ENDPOINT_URL s3 sync /mnt/deepracer/robo/checkpoint/log $CUSTOM_TARGET --exclude "*" --include "rl_coach*.log*" --no-follow-symlinks - else - echo "No logfiles to upload" - fi -} - function dr-download-custom-files { - if [[ "${CLOUD,,}" == "azure" || "${CLOUD,,}" == "local" ]]; + if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; then - ROBOMAKER_COMMAND="" docker-compose $COMPOSE_FILES up -d minio + ROBOMAKER_COMMAND="" docker-compose $DR_COMPOSE_FILES up -d minio fi - eval CUSTOM_TARGET=$(echo s3://$LOCAL_S3_BUCKET/$LOCAL_S3_CUSTOM_FILES_PREFIX/) + eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Downloading files from $CUSTOM_TARGET" - aws $LOCAL_PROFILE_ENDPOINT_URL s3 sync $CUSTOM_TARGET $DIR/custom_files/ + aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $CUSTOM_TARGET $DIR/custom_files/ } function dr-start-training { dr-update-env - bash -c "cd $DIR/scripts/training && ./start.sh" + bash -c "cd $DIR/scripts/training && ./start.sh $@" } function dr-increment-training { @@ -100,7 +102,6 @@ function dr-increment-training { } function dr-stop-training { - dr-upload-logs ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/training && ./stop.sh" } @@ -110,7 +111,6 @@ function dr-start-evaluation { } function dr-stop-evaluation { - dr-upload-logs ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/evaluation && ./stop.sh" } @@ -147,7 +147,7 @@ function dr-logs-robomaker { } function dr-logs-loganalysis { - eval LOG_ANALYSIS_ID=$(docker ps | awk ' /log-analysis/ { print $1 }') + eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') if [ -n "$LOG_ANALYSIS_ID" ]; then docker logs -f $LOG_ANALYSIS_ID else @@ -158,28 +158,11 @@ function dr-logs-loganalysis { function dr-clean-local { dr-stop-training - dr-stop-loganalysis - sudo rm -rf /robo/* && sudo rm -rf /mnt/deepracer/robo/* -} - -function 
dr-logs-start-proxy { - docker-compose -f $DIR/docker/docker-compose-log.yml up -d -} - -function dr-logs-stop-proxy { - docker-compose -f $DIR/docker/docker-compose-log.yml down + sudo rm -rf /robo/* } function dr-update { source $DIR/activate.sh } -function dr-update-env { - if [[ -f "$DIR/current-run.env" ]] - then - export $(grep -v '^#' $DIR/current-run.env | xargs) - else - echo "File current-run.env does not exist." - exit 1 - fi -} + diff --git a/bin/init.sh b/bin/init.sh index b7f3fbd3..0885e0b5 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -7,8 +7,12 @@ function ctrl_c() { exit 1 } -while getopts ":m:c:" opt; do +OPT_ARCH="gpu" + +while getopts ":m:c:a:" opt; do case $opt in +a) OPT_ARCH="$OPTARG" +;; m) OPT_MOUNT="$OPTARG" ;; c) OPT_CLOUD="$OPTARG" @@ -19,11 +23,14 @@ exit 1 esac done -GPUS=$(docker run --rm --gpus all nvidia/cuda:10.2-base nvidia-smi "-L" | awk '/GPU .:/' | wc -l) -if [ $? -ne 0 ] || [ $GPUS -eq 0 ] +if [[ "${OPT_ARCH}" == "gpu" ]] then - echo "No GPU detected in docker. Please check setup". - exit 1 + GPUS=$(docker run --rm --gpus all nvidia/cuda:10.2-base nvidia-smi "-L" | awk '/GPU .:/' | wc -l) + if [ $? -ne 0 ] || [ $GPUS -eq 0 ] + then + echo "No GPU detected in docker. Please check setup". + exit 1 + fi fi INSTALL_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." >/dev/null 2>&1 && pwd )" @@ -45,32 +52,12 @@ mkdir -p $INSTALL_DIR/docker/volumes mkdir -p $(eval echo "~${USER}")/.aws ln -sf $(eval echo "~${USER}")/.aws $INSTALL_DIR/docker/volumes/ -# grab local training deepracer repo from crr0004 and log analysis repo from vreadcentric -# Now as submodules! -# git clone --recurse-submodules https://github.com/crr0004/deepracer.git -# git clone https://github.com/breadcentric/aws-deepracer-workshops.git && cd aws-deepracer-workshops && git checkout enhance-log-analysis && cd .. 
-git submodule update --init --recursive - -ln -sf $INSTALL_DIR/aws-deepracer-workshops/log-analysis $INSTALL_DIR/docker/volumes/log-analysis -cp $INSTALL_DIR/deepracer/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/deepracer_simulation/routes/* docker/volumes/log-analysis/tracks/ - # copy rewardfunctions -mkdir -p $INSTALL_DIR/custom_files $INSTALL_DIR/analysis -cp $INSTALL_DIR/deepracer/custom_files/* $INSTALL_DIR/custom_files/ +mkdir -p $INSTALL_DIR/custom_files $INSTALL_DIR/logs $INSTALL_DIR/analysis cp $INSTALL_DIR/defaults/hyperparameters.json $INSTALL_DIR/custom_files/ +cp $INSTALL_DIR/defaults/model_metadata.json $INSTALL_DIR/custom_files/ +cp $INSTALL_DIR/defaults/reward_function.py $INSTALL_DIR/custom_files/ -# setup symlink to rl-coach config file -ln -f $INSTALL_DIR/defaults/rl_deepracer_coach_robomaker.py $INSTALL_DIR/deepracer/rl_coach/rl_deepracer_coach_robomaker.py - -# patching files in submodules that don't entirely fit our needs -cd $INSTALL_DIR/deepracer/ -patch simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py < ../defaults/deepracer_racetrack_env.py.patch -patch robomaker.env < ../defaults/robomaker.env.patch -patch rl_coach/env.sh < ../defaults/rl_coach_env.sh.patch -cd .. - -# replace the contents of the rl_deepracer_coach_robomaker.py file with the gpu specific version (this is also where you can edit the hyperparameters) -# TODO this file should be genrated from a gui before running training cp $INSTALL_DIR/defaults/template-run.env $INSTALL_DIR/current-run.env if [[ -n "$OPT_CLOUD" ]]; then @@ -88,13 +75,10 @@ do done # Download docker images. Change to build statements if locally built images are desired. 
-# docker build ${args} -f ./docker/dockerfiles/rl_coach/Dockerfile -t larsll/deepracer-rlcoach ./ -# docker build ./docker/dockerfiles/deepracer_robomaker/ -t larsll/deepracer-robomaker -# docker build ./docker/dockerfiles/log-analysis/ -t larsll/deepracer-loganalysis -docker pull larsll/deepracer-rlcoach -docker pull larsll/deepracer-robomaker -# docker pull larsll/deepracer-loganalysis -docker pull crr0004/sagemaker-rl-tensorflow:nvidia +docker pull larsll/deepracer-rlcoach:v2 +docker pull awsdeepracercommunity/deepracer-robomaker:cpu +docker pull awsdeepracercommunity/deepracer-sagemaker:$OPT_ARCH +docker pull larsll/deepracer-loganalysis:v2-cpu # create the network sagemaker-local if it doesn't exit SAGEMAKER_NW='sagemaker-local' @@ -105,7 +89,7 @@ then fi # ensure our variables are set on startup -echo "source $INSTALL_DIR/activate.sh" >> $HOME/.profile +echo "source $INSTALL_DIR/bin/activate.sh" >> $HOME/.profile # mark as done date | tee $INSTALL_DIR/DONE diff --git a/bin/prepare.sh b/bin/prepare.sh index 94105204..a95e171c 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -94,7 +94,7 @@ sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/machine-lea sudo bash -c 'apt update && apt install -y nvidia-driver-440 cuda-minimal-build-10-2 -o Dpkg::Options::="--force-overwrite"' ## Adding AWSCli -sudo apt-get install -y awscli +sudo apt-get install -y awscli python3-boto3 ## Installing Docker curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - diff --git a/deepracer b/deepracer deleted file mode 160000 index 86605d39..00000000 --- a/deepracer +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 86605d39c19923fe85fc96b54b720122ecb75d03 diff --git a/defaults/deepracer_racetrack_env.py.patch b/defaults/deepracer_racetrack_env.py.patch deleted file mode 100644 index b37e1d95..00000000 --- a/defaults/deepracer_racetrack_env.py.patch +++ /dev/null @@ -1,73 +0,0 @@ -diff --git 
a/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py b/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py -index 569b33c..81727af 100644 ---- a/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py -+++ b/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src/sagemaker_rl_agent/markov/environments/deepracer_racetrack_env.py -@@ -220,7 +220,7 @@ class DeepRacerRacetrackEnv(gym.Env): - - self.racecar_reset() - self.steps = 0 -- self.simulation_start_time = time.time() -+ self.simulation_start_time = rospy.get_time() - self.infer_reward_state(0, 0) - - return self.next_state -@@ -430,7 +430,7 @@ class DeepRacerRacetrackEnv(gym.Env): - current_progress, - closest_waypoint_index, - self.track_length, -- time.time())) -+ rospy.get_time())) - - # Terminate this episode when ready - if done and node_type == SIMULATION_WORKER: -@@ -456,6 +456,7 @@ class DeepRacerRacetrackEnv(gym.Env): - - def finish_episode(self, progress): - # Increment episode count, update start position and direction -+ simulation_end_time = rospy.get_time() - self.episodes += 1 - if self.change_start: - self.start_ndist = (self.start_ndist + ROUND_ROBIN_ADVANCE_DIST) % 1.0 -@@ -467,31 +468,36 @@ class DeepRacerRacetrackEnv(gym.Env): - # Update metrics based on job type - if self.job_type == TRAINING_JOB: - self.send_reward_to_cloudwatch(self.reward_in_episode) -- self.update_training_metrics() -+ self.update_training_metrics(progress, simulation_end_time) - self.write_metrics_to_s3() - if self.is_training_done(): - self.cancel_simulation_job() - elif self.job_type == EVALUATION_JOB: - self.number_of_trials += 1 -- self.update_eval_metrics(progress) -+ self.update_eval_metrics(progress, simulation_end_time) - 
self.write_metrics_to_s3() - -- def update_eval_metrics(self, progress): -+ def update_eval_metrics(self, progress, simulation_end_time): - eval_metric = {} - eval_metric['completion_percentage'] = int(progress) - eval_metric['metric_time'] = int(round(time.time() * 1000)) - eval_metric['start_time'] = int(round(self.simulation_start_time * 1000)) -- eval_metric['elapsed_time_in_milliseconds'] = int(round((time.time() - self.simulation_start_time) * 1000)) -+ eval_metric['elapsed_time_in_milliseconds'] = int(round((simulation_end_time - self.simulation_start_time) * 1000)) - eval_metric['trial'] = int(self.number_of_trials) - self.metrics.append(eval_metric) - -- def update_training_metrics(self): -+ def update_training_metrics(self, progress, simulation_end_time): - training_metric = {} - training_metric['reward_score'] = int(round(self.reward_in_episode)) - training_metric['metric_time'] = int(round(time.time() * 1000)) - training_metric['start_time'] = int(round(self.simulation_start_time * 1000)) -- training_metric['elapsed_time_in_milliseconds'] = int(round((time.time() - self.simulation_start_time) * 1000)) -+ training_metric['elapsed_time_in_milliseconds'] = int(round((simulation_end_time - self.simulation_start_time) * 1000)) - training_metric['episode'] = int(self.episodes) -+ training_metric['completion_percentage'] = int(progress) -+ if int(progress) == 100: -+ training_metric['episode_status'] = "Lap complete" -+ else: -+ training_metric['episode_status'] = "Off track" - self.metrics.append(training_metric) - - def write_metrics_to_s3(self): diff --git a/defaults/model_metadata.json b/defaults/model_metadata.json new file mode 100644 index 00000000..a3ecc63f --- /dev/null +++ b/defaults/model_metadata.json @@ -0,0 +1,26 @@ +{ + "action_space": [ + { + "steering_angle": -30, + "speed": 0.6 + }, + { + "steering_angle": -15, + "speed": 0.6 + }, + { + "steering_angle": 0, + "speed": 0.6 + }, + { + "steering_angle": 15, + "speed": 0.6 + }, + { + 
"steering_angle": 30, + "speed": 0.6 + } + ], + "sensor": ["FRONT_FACING_CAMERA"], + "neural_network": "DEEP_CONVOLUTIONAL_NETWORK_SHALLOW" +} diff --git a/defaults/reward_function.py b/defaults/reward_function.py new file mode 100644 index 00000000..3a022ade --- /dev/null +++ b/defaults/reward_function.py @@ -0,0 +1,33 @@ +def reward_function(params): + ''' + Example of penalize steering, which helps mitigate zig-zag behaviors + ''' + + # Read input parameters + distance_from_center = params['distance_from_center'] + track_width = params['track_width'] + steering = abs(params['steering_angle']) # Only need the absolute steering angle + + # Calculate 3 marks that are farther and father away from the center line + marker_1 = 0.1 * track_width + marker_2 = 0.25 * track_width + marker_3 = 0.5 * track_width + + # Give higher reward if the car is closer to center line and vice versa + if distance_from_center <= marker_1: + reward = 1 + elif distance_from_center <= marker_2: + reward = 0.5 + elif distance_from_center <= marker_3: + reward = 0.1 + else: + reward = 1e-3 # likely crashed/ close to off track + + # Steering penality threshold, change the number based on your action space setting + ABS_STEERING_THRESHOLD = 15 + + # Penalize reward if the car is steering too much + if steering > ABS_STEERING_THRESHOLD: + reward *= 0.8 + + return float(reward) diff --git a/defaults/rl_coach_env.sh.patch b/defaults/rl_coach_env.sh.patch deleted file mode 100644 index 1495fdf2..00000000 --- a/defaults/rl_coach_env.sh.patch +++ /dev/null @@ -1,14 +0,0 @@ -diff --git a/rl_coach/env.sh b/rl_coach/env.sh -index b77cf42..4c204c7 100644 ---- a/rl_coach/env.sh -+++ b/rl_coach/env.sh -@@ -1,7 +1,7 @@ - export MINIO_ACCESS_KEY=minio - export MINIO_SECRET_KEY=miniokey --export AWS_ACCESS_KEY_ID=minio --export AWS_SECRET_ACCESS_KEY=miniokey -+#export AWS_ACCESS_KEY_ID=minio -+#export AWS_SECRET_ACCESS_KEY=miniokey - export WORLD_NAME=New_York_Track - export ROS_AWS_REGION=us-east-1 - export 
AWS_REGION=us-east-1 diff --git a/defaults/rl_deepracer_coach_robomaker.py b/defaults/rl_deepracer_coach_robomaker.py deleted file mode 100644 index fb42d1e1..00000000 --- a/defaults/rl_deepracer_coach_robomaker.py +++ /dev/null @@ -1,167 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - - -import sagemaker -import boto3 -import sys -import os -import glob -import re -import subprocess -import json -import io -from IPython.display import Markdown -from time import gmtime, strftime -sys.path.append("common") -from misc import get_execution_role, wait_for_s3_object -from sagemaker.rl import RLEstimator, RLToolkit, RLFramework -from markdown_helper import * - -def str2bool(v): - return v.lower() in ("yes", "true", "t", "1") - -# S3 bucket -boto_session = boto3.session.Session(region_name=os.environ.get("AWS_REGION", "us-east-1")) - -endpoint_url = os.environ.get("S3_ENDPOINT_URL", "") - -if endpoint_url == "": - s3Client = boto_session.resource("s3") - s3Client_c = boto_session.client("s3") -else: - s3Client = boto_session.resource("s3", use_ssl=False, - endpoint_url=os.environ.get("S3_ENDPOINT_URL", "http://127.0.0.1:9000")) - s3Client_c = boto_session.client("s3", use_ssl=False, - endpoint_url=os.environ.get("S3_ENDPOINT_URL", "http://127.0.0.1:9000")) - -sage_session = sagemaker.local.LocalSession( - boto_session=boto_session, s3_client=s3Client) -# sage_session.default_bucket() -s3_bucket = os.environ.get("MODEL_S3_BUCKET", "bucket") -s3_prefix = os.environ.get("MODEL_S3_PREFIX", "rl-deepracer-sagemaker") -pretrained = str2bool(os.environ.get("PRETRAINED", False)) -s3_pretrained_bucket = os.environ.get("PRETRAINED_S3_BUCKET", "bucket") -s3_pretrained_prefix = os.environ.get( - "PRETRAINED_S3_PREFIX", "rl-deepracer-pretrained") -# SDK appends the job name and output folder -s3_output_path = 's3://{}/'.format(s3_bucket) - -# Hyperparameters -hyperparameter_file = os.environ.get( - "HYPERPARAMETER_FILE_S3_KEY", "custom_files/hyperparameters.json") - -# ### Define 
Variables - -# We define variables such as the job prefix for the training jobs and s3_prefix for storing metadata required for synchronization between the training and simulation jobs - - -# this should be MODEL_S3_PREFIX, but that already ends with "-sagemaker" -job_name_prefix = 'rl-deepracer' - -# create unique job name -tm = gmtime() -# -" + strftime("%y%m%d-%H%M%S", tm) #Ensure S3 prefix contains SageMaker -job_name = job_name_prefix + "-sagemaker" -# -" + strftime("%y%m%d-%H%M%S", tm) #Ensure that the S3 prefix contains the keyword 'robomaker' -s3_prefix_robomaker = job_name_prefix + "-robomaker" - - -# Duration of job in seconds (5 hours) -job_duration_in_seconds = 24 * 60 * 60 - -aws_region = sage_session.boto_region_name - -if aws_region not in ["us-west-2", "us-east-1", "eu-west-1"]: - raise Exception( - "This notebook uses RoboMaker which is available only in US East (N. Virginia), US West (Oregon) and EU (Ireland). Please switch to one of these regions.") -print("Model checkpoints and other metadata will be stored at: {}{}".format( - s3_output_path, job_name)) - - -s3_location = "s3://%s/%s" % (s3_bucket, s3_prefix) -print("Uploading to " + s3_location) - - -metric_definitions = [ - # Training> Name=main_level/agent, Worker=0, Episode=19, Total reward=-102.88, Steps=19019, Training iteration=1 - {'Name': 'reward-training', - 'Regex': '^Training>.*Total reward=(.*?),'}, - - # Policy training> Surrogate loss=-0.32664725184440613, KL divergence=7.255815035023261e-06, Entropy=2.83156156539917, training epoch=0, learning_rate=0.00025 - {'Name': 'ppo-surrogate-loss', - 'Regex': '^Policy training>.*Surrogate loss=(.*?),'}, - {'Name': 'ppo-entropy', - 'Regex': '^Policy training>.*Entropy=(.*?),'}, - - # Testing> Name=main_level/agent, Worker=0, Episode=19, Total reward=1359.12, Steps=20015, Training iteration=2 - {'Name': 'reward-testing', - 'Regex': '^Testing>.*Total reward=(.*?),'}, -] - - -# We use the RLEstimator for training RL jobs. -# -# 1. 
Specify the source directory which has the environment file, preset and training code. -# 2. Specify the entry point as the training code -# 3. Specify the choice of RL toolkit and framework. This automatically resolves to the ECR path for the RL Container. -# 4. Define the training parameters such as the instance count, instance type, job name, s3_bucket and s3_prefix for storing model checkpoints and metadata. **Only 1 training instance is supported for now.** -# 4. Set the RLCOACH_PRESET as "deepracer" for this example. -# 5. Define the metrics definitions that you are interested in capturing in your logs. These can also be visualized in CloudWatch and SageMaker Notebooks. - -# In[ ]: - - -RLCOACH_PRESET = "deepracer" - -gpu_available = os.environ.get("GPU_AVAILABLE", False) -# 'local' for cpu, 'local_gpu' for nvidia gpu (and then you don't have to set default runtime to nvidia) -instance_type = "local_gpu" if gpu_available else "local" -image_name = "crr0004/sagemaker-rl-tensorflow:{}".format( - "nvidia" if gpu_available else "console") - -# Prepare hyperparameters -hyperparameters_core = { - "s3_bucket": s3_bucket, - "s3_prefix": s3_prefix, - "aws_region": aws_region, - "model_metadata_s3_key": "s3://{}/custom_files/model_metadata.json".format(s3_bucket), - "RLCOACH_PRESET": RLCOACH_PRESET -} - -if pretrained == True: - hyperparameters_core['pretrained_s3_bucket'] = "{}".format( - s3_pretrained_bucket) - hyperparameters_core['pretrained_s3_prefix'] = s3_pretrained_prefix - -# Downloading the hyperparameter file from our local bucket. 
-hyperparameter_data = io.BytesIO() -s3Client_c.download_fileobj( - s3_bucket, hyperparameter_file, hyperparameter_data) -hyperparameters_nn = json.loads(hyperparameter_data.getvalue().decode("utf-8")) -hyperparameters = {**hyperparameters_core, **hyperparameters_nn} -print("Configured following hyperparameters") -print(hyperparameters) -estimator = RLEstimator(entry_point="training_worker.py", - source_dir='src', - dependencies=["common/sagemaker_rl"], - toolkit=RLToolkit.COACH, - toolkit_version='0.11', - framework=RLFramework.TENSORFLOW, - sagemaker_session=sage_session, - # bypass sagemaker SDK validation of the role - role="aaa/", - train_instance_type=instance_type, - train_instance_count=1, - output_path=s3_output_path, - base_job_name=job_name_prefix, - image_name=image_name, - train_max_run=job_duration_in_seconds, # Maximum runtime in seconds - hyperparameters=hyperparameters, - metric_definitions=metric_definitions, - s3_client=s3Client - # subnets=default_subnets, # Required for VPC mode - # security_group_ids=default_security_groups, # Required for VPC mode - ) - -estimator.fit(job_name=job_name, wait=False) diff --git a/defaults/robomaker.env.patch b/defaults/robomaker.env.patch deleted file mode 100644 index b9fb1c65..00000000 --- a/defaults/robomaker.env.patch +++ /dev/null @@ -1,12 +0,0 @@ -diff --git a/robomaker.env b/robomaker.env -index fde2aaf..fea2347 100644 ---- a/robomaker.env -+++ b/robomaker.env -@@ -1,5 +1,5 @@ --AWS_ACCESS_KEY_ID=minio --AWS_SECRET_ACCESS_KEY=miniokey -+#AWS_ACCESS_KEY_ID=minio -+#AWS_SECRET_ACCESS_KEY=miniokey - WORLD_NAME=Mexico_track - ROS_AWS_REGION=us-east-1 - AWS_REGION=us-east-1 diff --git a/defaults/template-run.env b/defaults/template-run.env index 70a7471f..d1905477 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -1,17 +1,26 @@ -CLOUD= -WORLD_NAME=Vegas_track -NUMBER_OF_TRIALS=5 -CHANGE_START_POSITION=True -UPLOAD_S3_PROFILE=default -UPLOAD_S3_BUCKET=aws-deepracer-mybucketidinreal 
-UPLOAD_S3_PREFIX=DeepRacer-SageMaker-RoboMaker-comm-prefix -UPLOAD_MODEL_NAME=mymodelname -LOCAL_S3_PRETRAINED=False -LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained -LOCAL_S3_PROFILE=default -LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker -LOCAL_S3_BUCKET=bucket -LOCAL_S3_CUSTOM_FILES_PREFIX=custom_files -LOCAL_S3_LOGS_PREFIX=logs -LOGS_ACCESS_KEY=mylogs -LOGS_ACCESS_SECRET=mylogpass +DR_CLOUD= +DR_AWS_APP_REGION=us-east-1 +DR_WORLD_NAME=LGSWide +DR_CHANGE_START_POSITION=True +DR_UPLOAD_S3_PROFILE=default +DR_UPLOAD_S3_BUCKET=aws-deepracer-mybucketidinreal +DR_UPLOAD_S3_PREFIX=DeepRacer-SageMaker-RoboMaker-comm-prefix +DR_UPLOAD_MODEL_NAME=mymodelname +DR_CAR_COLOR=Red +DR_CAR_NAME=FastCar +DR_KINESIS_STREAM_NAME=my-test-stream +DR_KINESIS_STREAM_ENABLE=False +DR_GUI_ENABLE=False +DR_GPU_AVAILABLE=True +DR_ROBOMAKER_IMAGE_TYPE=cpu +DR_LOCAL_S3_PRETRAINED=False +DR_LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained +DR_LOCAL_S3_PROFILE=default +DR_LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker +DR_LOCAL_S3_BUCKET=bucket +DR_LOCAL_S3_CUSTOM_FILES_PREFIX=custom_files +DR_LOCAL_S3_PARAMS_FILE=training-params.yaml +DR_TARGET_REWARD_SCORE=None +DR_NUMBER_OF_EPISODES=500 +DR_LOCAL_S3_REWARD_KEY=custom_files/reward_function.py +DR_LOCAL_S3_METRICS_KEY=$DR_LOCAL_S3_MODEL_PREFIX/metrics/training_metrics.json \ No newline at end of file diff --git a/docker/.env b/docker/.env deleted file mode 100644 index 1d7489d1..00000000 --- a/docker/.env +++ /dev/null @@ -1,36 +0,0 @@ -WORLD_NAME=Vegas_track -LOCAL_ENV_VAR_JSON_PATH=env_vars.json -AWS_DEFAULT_REGION=us-east-1 -#S3_ENDPOINT_URL=http://minio:9000 -ROS_AWS_REGION=us-east-1 -AWS_REGION=us-east-1 -MODEL_S3_PREFIX=rl-deepracer-sagemaker -MODEL_S3_BUCKET=bucket -PRETRAINED=False -PRETRAINED_S3_PREFIX=rl-deepracer-pretrained -PRETRAINED_S3_BUCKET=bucket -LOCAL=True -MARKOV_PRESET_FILE=deepracer.py -XAUTHORITY=/root/.Xauthority -DISPLAY_N=:0 -METRIC_NAME=reward -METRIC_NAMESPACE=deepracer -APP_REGION=us-east-1 
-SAGEMAKER_SHARED_S3_PREFIX=rl-deepracer-sagemaker -SAGEMAKER_SHARED_S3_BUCKET=bucket -TRAINING_JOB_ARN=aaa -METRICS_S3_BUCKET=bucket -METRICS_S3_OBJECT_KEY=metrics/metric.json -ROBOMAKER_RUN_TYPE=distributed_training -CHANGE_START_POSITION=True -TARGET_REWARD_SCORE=100000 -NUMBER_OF_EPISODES=20000 -ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID=aaa -AWS_ROBOMAKER_SIMULATION_JOB_ID=aaa -MODEL_METADATA_FILE_S3_KEY=custom_files/model_metadata.json -HYPERPARAMETER_FILE_S3_KEY=custom_files/hyperparameters.json -REWARD_FILE_S3_KEY=custom_files/reward.py -BUNDLE_CURRENT_PREFIX=/app/robomaker-deepracer/simulation_ws/ -GPU_AVAILABLE=True -NUMBER_OF_TRIALS=6 -SM_NUM_GPU=1 diff --git a/docker/docker-compose-azure.yml b/docker/docker-compose-azure.yml index 6534a89d..dd05646e 100644 --- a/docker/docker-compose-azure.yml +++ b/docker/docker-compose-azure.yml @@ -9,10 +9,10 @@ services: command: gateway azure restart: unless-stopped environment: - - MINIO_ACCESS_KEY=${LOCAL_ACCESS_KEY_ID} - - MINIO_SECRET_KEY=${LOCAL_SECRET_ACCESS_KEY} - - AWS_ACCESS_KEY_ID=${LOCAL_ACCESS_KEY_ID} - - AWS_SECRET_ACCESS_KEY=${LOCAL_SECRET_ACCESS_KEY} + - MINIO_ACCESS_KEY=${DR_LOCAL_ACCESS_KEY_ID} + - MINIO_SECRET_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} + - AWS_ACCESS_KEY_ID=${DR_LOCAL_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} rl_coach: environment: - S3_ENDPOINT_URL=http://minio:9000 diff --git a/docker/docker-compose-keys.yml b/docker/docker-compose-keys.yml index 33d99f6f..2fb3aebe 100644 --- a/docker/docker-compose-keys.yml +++ b/docker/docker-compose-keys.yml @@ -3,11 +3,9 @@ version: '3.7' services: rl_coach: environment: - - AWS_ACCESS_KEY_ID=${LOCAL_ACCESS_KEY_ID} - - AWS_SECRET_ACCESS_KEY=${LOCAL_SECRET_ACCESS_KEY} - depends_on: - - minio + - AWS_ACCESS_KEY_ID=${DR_LOCAL_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} robomaker: environment: - - AWS_ACCESS_KEY_ID=${LOCAL_ACCESS_KEY_ID} - - AWS_SECRET_ACCESS_KEY=${LOCAL_SECRET_ACCESS_KEY} + - 
AWS_ACCESS_KEY_ID=${DR_LOCAL_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} diff --git a/docker/docker-compose-local.yml b/docker/docker-compose-local.yml index 7bf50bc1..ea42a7d8 100644 --- a/docker/docker-compose-local.yml +++ b/docker/docker-compose-local.yml @@ -9,8 +9,8 @@ services: command: server /data restart: unless-stopped environment: - - MINIO_ACCESS_KEY=${LOCAL_ACCESS_KEY_ID} - - MINIO_SECRET_KEY=${LOCAL_SECRET_ACCESS_KEY} + - MINIO_ACCESS_KEY=${DR_LOCAL_ACCESS_KEY_ID} + - MINIO_SECRET_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} volumes: - /mnt/deepracer/minio:/data rl_coach: diff --git a/docker/docker-compose-log.yml b/docker/docker-compose-log.yml deleted file mode 100644 index e1632f22..00000000 --- a/docker/docker-compose-log.yml +++ /dev/null @@ -1,19 +0,0 @@ -version: '3.7' -networks: - default: - external: - name: sagemaker-local - -services: - minio-log: - image: minio/minio - ports: - - "9001:9001" - container_name: minio-log - command: server --address :9001 /data - volumes: - - /mnt/deepracer/robo/checkpoint:/data - restart: unless-stopped - environment: - - MINIO_ACCESS_KEY=${LOGS_ACCESS_KEY} - - MINIO_SECRET_KEY=${LOGS_ACCESS_SECRET} diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index b61ebc77..a84301a6 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -7,49 +7,38 @@ networks: services: rl_coach: - image: larsll/deepracer-rlcoach + image: larsll/deepracer-rlcoach:v2 environment: - - WORLD_NAME - - NUMBER_OF_TRIALS=${NUMBER_OF_TRIALS} - - PRETRAINED=${LOCAL_S3_PRETRAINED} - - PRETRAINED_S3_PREFIX=${LOCAL_S3_PRETRAINED_PREFIX} - - PRETRAINED_S3_BUCKET=${LOCAL_S3_BUCKET} - - MODEL_S3_PREFIX=${LOCAL_S3_MODEL_PREFIX} - - MODEL_S3_BUCKET=${LOCAL_S3_BUCKET} - - MODEL_METADATA_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/model_metadata.json - - REWARD_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/reward.py - - METRICS_S3_BUCKET=${LOCAL_S3_BUCKET} - - 
HYPERPARAMETER_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/hyperparameters.json - env_file: .env - container_name: rl_coach + - GPU_AVAILABLE=${DR_GPU_AVAILABLE} + - PRETRAINED=${DR_LOCAL_S3_PRETRAINED} + - PRETRAINED_S3_PREFIX=${DR_LOCAL_S3_PRETRAINED_PREFIX} + - PRETRAINED_S3_BUCKET=${DR_LOCAL_S3_BUCKET} + - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} + - MODEL_S3_BUCKET=${DR_LOCAL_S3_BUCKET} + container_name: coach volumes: - '//var/run/docker.sock:/var/run/docker.sock' - - '../deepracer/sagemaker-python-sdk:/deepracer/sagemaker-python-sdk' - - '../deepracer/rl_coach:/deepracer/rl_coach' - '/robo/container:/robo/container' robomaker: - image: larsll/deepracer-robomaker:latest + image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE_TYPE} command: ["${ROBOMAKER_COMMAND}"] volumes: - - ../deepracer/simulation/aws-robomaker-sample-application-deepracer/simulation_ws/src:/app/robomaker-deepracer/simulation_ws/src - - /mnt/deepracer/robo/checkpoint:/root/.ros/ - /mnt/deepracer/recording:/mnt/recording ports: - - "8080:5900" + - "8080:8080" container_name: robomaker restart: unless-stopped environment: - - CHANGE_START_POSITION - - WORLD_NAME - - NUMBER_OF_TRIALS=${NUMBER_OF_TRIALS} - - MODEL_S3_PREFIX=${LOCAL_S3_MODEL_PREFIX} - - MODEL_S3_BUCKET=${LOCAL_S3_BUCKET} - - MODEL_METADATA_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/model_metadata.json - - REWARD_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/reward.py - - METRICS_S3_BUCKET=${LOCAL_S3_BUCKET} - - HYPERPARAMETER_FILE_S3_KEY=${LOCAL_S3_CUSTOM_FILES_PREFIX}/hyperparameters.json - - SAGEMAKER_SHARED_S3_PREFIX=${LOCAL_S3_MODEL_PREFIX} - - SAGEMAKER_SHARED_S3_BUCKET=${LOCAL_S3_BUCKET} - env_file: .env + - XAUTHORITY=/root/.Xauthority + - DISPLAY_N=:0 + - WORLD_NAME=${DR_WORLD_NAME} + - NUMBER_OF_TRIALS=${DR_NUMBER_OF_EPISODES} + - SAGEMAKER_SHARED_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} + - SAGEMAKER_SHARED_S3_BUCKET=${DR_LOCAL_S3_BUCKET} + - APP_REGION=${DR_AWS_APP_REGION} + - 
S3_YAML_NAME=${DR_LOCAL_S3_PARAMS_FILE} + - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} + - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} + - ENABLE_GUI=${DR_GUI_ENABLE} depends_on: - rl_coach diff --git a/docker/dockerfiles/deepracer_robomaker/Dockerfile b/docker/dockerfiles/deepracer_robomaker/Dockerfile deleted file mode 100644 index e4f58b9a..00000000 --- a/docker/dockerfiles/deepracer_robomaker/Dockerfile +++ /dev/null @@ -1,59 +0,0 @@ - -FROM crr0004/deepracer_robomaker:console -LABEL maintainer "Lars Ludvigsen " - -RUN apt-get update && apt-get install -y --no-install-recommends \ -ca-certificates apt-transport-https gnupg-curl && \ - NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ - NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ - apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ - echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get purge --auto-remove -y gnupg-curl - -ENV CUDA_VERSION 9.0.176 -ENV CUDNN_VERSION 7.6.4.38 -ENV CUDA_PKG_VERSION 9-0=$CUDA_VERSION-1 - -# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION \ - && ln -s cuda-9.0 /usr/local/cuda - -# Required for nvidia-docker v1 -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN echo 
"/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=9.0 " - -ENV NCCL_VERSION 2.4.8 - -RUN apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-cublas-9-0=9.0.176.4-1 \ - cuda-cusolver-$CUDA_PKG_VERSION \ - libnccl2=$NCCL_VERSION-1+cuda9.0 && \ - apt-mark hold libnccl2 && \ - apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda9.0 && \ - apt-mark hold libcudnn7 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda-9.0/targets/x86_64-linux/lib - -RUN pip install --no-cache-dir tensorflow-gpu==1.11.0 diff --git a/docker/dockerfiles/log-analysis/Dockerfile.gpu b/docker/dockerfiles/log-analysis/Dockerfile.gpu deleted file mode 100644 index 519e57ee..00000000 --- a/docker/dockerfiles/log-analysis/Dockerfile.gpu +++ /dev/null @@ -1,26 +0,0 @@ -FROM nvidia/cuda:10.0-cudnn7-runtime-ubuntu18.04 - -LABEL maintainer="lars@ludvig.no" \ - description="Log Analysis for DeepRacer Training Run" \ - version=1.0 - -# Container Dependency Setup -RUN apt-get update && apt-get upgrade -y && \ - apt-get install --no-install-recommends software-properties-common libsm6 libxext6 libxrender-dev git wget python3-pip -y && \ - apt-get clean && rm -rf /var/lib/apt/lists/* - -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda-10.0/targets/x86_64-linux/lib - -RUN pip3 install virtualenv && virtualenv /workspace/venv -WORKDIR /workspace/venv -RUN mkdir -p /workspace/venv/data /workspace/venv/logs 
/workspace/venv/workbook - -# Install common pip packages -WORKDIR /workspace/venv -COPY requirements.txt ./ -RUN . /workspace/venv/bin/activate && pip install --no-cache-dir -r requirements.txt - -EXPOSE 8888 -VOLUME ["/workspace/venv/data", "/workspace/venv/logs", "/root/.aws", "/workspace/venv/workbook"] -CMD . /workspace/venv/bin/activate && jupyter lab --ip=0.0.0.0 --port=8888 --allow-root - diff --git a/docker/dockerfiles/log-analysis/Dockerfile.nogpu b/docker/dockerfiles/log-analysis/Dockerfile.nogpu deleted file mode 100644 index 7e10d4f3..00000000 --- a/docker/dockerfiles/log-analysis/Dockerfile.nogpu +++ /dev/null @@ -1,24 +0,0 @@ -FROM python:3.7.6-slim - -LABEL maintainer="lars@ludvig.no" \ - description="Log Analysis for DeepRacer Training Run" \ - version=1.0 - -# Container Dependency Setup -RUN apt-get update && apt-get upgrade -y && \ - apt-get install --no-install-recommends software-properties-common libsm6 libxext6 libxrender-dev git wget python3-pip -y && \ - apt-get clean && rm -rf /var/lib/apt/lists/* - -RUN pip3 install virtualenv && virtualenv /workspace/venv -WORKDIR /workspace/venv -RUN mkdir -p /workspace/venv/data /workspace/venv/logs /workspace/venv/workbook - -# Install common pip packages -WORKDIR /workspace/venv -COPY requirements.txt ./ -RUN . /workspace/venv/bin/activate && pip install --no-cache-dir -r requirements.txt - -EXPOSE 8888 -VOLUME ["/workspace/venv/data", "/workspace/venv/logs", "/root/.aws", "/workspace/venv/workbook"] -CMD . 
/workspace/venv/bin/activate && jupyter lab --ip=0.0.0.0 --port=8888 --allow-root - diff --git a/docker/dockerfiles/log-analysis/requirements.txt b/docker/dockerfiles/log-analysis/requirements.txt deleted file mode 100644 index d1eb298f..00000000 --- a/docker/dockerfiles/log-analysis/requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ -jupyterlab -matplotlib -numpy -opencv-python -scipy -pandas -sklearn -shapely -boto3 -awscli -plotly -tensorflow==1.15.0 -Pillow -python-resize-image \ No newline at end of file diff --git a/docker/dockerfiles/rl_coach/Dockerfile b/docker/dockerfiles/rl_coach/Dockerfile deleted file mode 100644 index d3dbf8b6..00000000 --- a/docker/dockerfiles/rl_coach/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -FROM python:3.7.6-slim -LABEL maintainer "Lars Ludvigsen " - -# install docker -RUN apt-get update && apt-get install --no-install-recommends -y \ - apt-transport-https \ - ca-certificates \ - curl \ - gnupg \ - gnupg-agent \ - software-properties-common -RUN curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add - && \ - add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable" -RUN apt-get update && apt-get install --no-install-recommends -y docker-ce-cli && apt-get clean && rm -rf /var/lib/apt/lists/* - -# create sagemaker configuration -RUN mkdir -p /root/.sagemaker /robo/container -COPY deepracer/config.yaml /root/.sagemaker/config.yaml - -# add required deepracer directories to the container -# RUN mkdir -p /deepracer && mkdir -p /deepracer/rl_coach && mkdir -p /deepracer/sagemaker-python-sdk -WORKDIR /deepracer -ADD deepracer/rl_coach /deepracer/rl_coach -ADD deepracer/sagemaker-python-sdk /deepracer/sagemaker-python-sdk - -# install dependencies -RUN pip install --no-cache-dir -U /deepracer/sagemaker-python-sdk/ awscli ipython pandas "urllib3==1.22" "pyyaml==3.13" "python-dateutil==2.8.0" - -# set command -CMD (cd rl_coach; ipython rl_deepracer_coach_robomaker.py) diff --git 
a/scripts/log-analysis/start.sh b/scripts/log-analysis/start.sh index 2fb17e03..cb7fe6ba 100755 --- a/scripts/log-analysis/start.sh +++ b/scripts/log-analysis/start.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash -nvidia-docker run --rm -it -p "8888:8888" \ --v `pwd`/../../docker/volumes/log-analysis:/workspace/venv/data \ +docker run --rm -it -p "8888:8888" \ +-v `pwd`/../../logs:/workspace/logs \ -v `pwd`/../../docker/volumes/.aws:/root/.aws \ --v /mnt/deepracer/robo/checkpoint/log:/workspace/venv/logs \ --v `pwd`/../../analysis:/workspace/venv/workbook \ - larsll/deepracer-loganalysis +-v `pwd`/../../analysis:/workspace/analysis \ +--name loganalysis \ + larsll/deepracer-loganalysis:v2-cpu diff --git a/scripts/log-analysis/stop.sh b/scripts/log-analysis/stop.sh index f69a56c2..ed270080 100755 --- a/scripts/log-analysis/stop.sh +++ b/scripts/log-analysis/stop.sh @@ -1,4 +1,3 @@ #!/usr/bin/env bash -docker stop $(docker ps | awk ' /analysis/ { print $1 }') -#docker rm $(docker ps -a | awk ' /analysis/ { print $1 }') +docker stop loganalysis diff --git a/scripts/training/increment.sh b/scripts/training/increment.sh index 1dd8d634..db399de4 100755 --- a/scripts/training/increment.sh +++ b/scripts/training/increment.sh @@ -43,7 +43,7 @@ CONFIG_FILE=$(echo $DR_DIR/current-run.env) echo "Configuration file $CONFIG_FILE will be updated." 
## Read in data -CURRENT_RUN_MODEL=$(awk '/LOCAL_S3_MODEL_PREFIX/ {print $1}' ${CONFIG_FILE} | awk '{split($0,a,"="); print a[2] }') +CURRENT_RUN_MODEL=$(grep -e "^DR_LOCAL_S3_MODEL_PREFIX" ${CONFIG_FILE} | awk '{split($0,a,"="); print a[2] }') CURRENT_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL}" | \ awk -v DELIM="${OPT_DELIM}" '{ n=split($0,a,DELIM); if (a[n] ~ /[0-9]*/) print a[n]; else print ""; }') if [[ -z ${CURRENT_RUN_MODEL_NUM} ]]; @@ -66,7 +66,7 @@ then exit 1 fi fi - sed -i.bak -re "s/(LOCAL_S3_PRETRAINED_PREFIX=).*$/\1$CURRENT_RUN_MODEL/g; s/(LOCAL_S3_PRETRAINED=).*$/\1True/g; ; s/(LOCAL_S3_MODEL_PREFIX=).*$/\1$NEW_RUN_MODEL/g" "$CONFIG_FILE" && echo "Done." + sed -i.bak -re "s/(DR_LOCAL_S3_PRETRAINED_PREFIX=).*$/\1$CURRENT_RUN_MODEL/g; s/(DR_LOCAL_S3_PRETRAINED=).*$/\1True/g; ; s/(DR_LOCAL_S3_MODEL_PREFIX=).*$/\1$NEW_RUN_MODEL/g" "$CONFIG_FILE" && echo "Done." else echo "Error in determining new model. Aborting." exit 1 @@ -74,10 +74,10 @@ fi if [[ -n "${OPT_WIPE}" ]]; then - MODEL_DIR_S3=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 ls s3://${LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} ) + MODEL_DIR_S3=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${DR_LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} ) if [[ -n "${MODEL_DIR_S3}" ]]; then - echo "The new model's S3 prefix s3://${LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} exists. Will wipe." + echo "The new model's S3 prefix s3://${DR_LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} exists. Will wipe." 
fi if [[ -z "${OPT_FORCE}" ]]; then @@ -88,5 +88,5 @@ then exit 1 fi fi - aws $LOCAL_PROFILE_ENDPOINT_URL s3 rm s3://${LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} --recursive + aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 rm s3://${DR_LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} --recursive fi diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py new file mode 100755 index 00000000..92f87bcb --- /dev/null +++ b/scripts/training/prepare-config.py @@ -0,0 +1,52 @@ +#!/usr/bin/python3 + +import boto3 +import sys +import os +import time +import json +import io +import yaml + +config = {} +config['ALTERNATE_DRIVING_DIRECTION'] = os.environ.get('DR_ALTERNATE_DRIVING_DIRECTION', 'false') +config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1') +config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') +config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar') +config['CHANGE_START_POSITION'] = os.environ.get('DR_CHANGE_START_POSITION', 'true') +config['JOB_TYPE'] = 'TRAINING' +config['KINESIS_VIDEO_STREAM_NAME'] = os.environ.get('DR_KINESIS_STREAM_NAME', 'my-kinesis-stream') +config['METRIC_NAME'] = 'TrainingRewardScore' +config['METRIC_NAMESPACE'] = 'AWSDeepRacer' +config['METRICS_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') +config['METRICS_S3_OBJECT_KEY'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') + '/metrics/training_metrics.json' +config['MODEL_METADATA_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_CUSTOM_FILES_PREFIX', 'custom_files') + '/model_metadata.json' +config['NUMBER_OF_EPISODES'] = os.environ.get('DR_NUMBER_OF_EPISODES', '0') +config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') +config['REWARD_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_CUSTOM_FILES_PREFIX', 'custom_files') + '/reward_function.py' +config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy') +config['SAGEMAKER_SHARED_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') 
+config['SAGEMAKER_SHARED_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') +config['SIMTRACE_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') +config['SIMTRACE_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') +config['TARGET_REWARD_SCORE'] = os.environ.get('DR_TARGET_REWARD_SCORE', 'None') +config['TRAINING_JOB_ARN'] = 'arn:Dummy' +config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') + +s3_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None) +s3_region = config['AWS_REGION'] +s3_bucket = config['SAGEMAKER_SHARED_S3_BUCKET'] +s3_prefix = config['SAGEMAKER_SHARED_S3_PREFIX'] +s3_yaml_name = os.environ.get('DR_LOCAL_S3_PARAMS_FILE', 'training_params.yaml') +yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) + +session = boto3.session.Session() +s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_endpoint_url) + +yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) +local_yaml_path = os.path.abspath(os.path.join('/tmp', 'training-params-' + str(round(time.time())) + '.yaml')) + +with open(local_yaml_path, 'w') as yaml_file: + yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) + +s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 7f315e8d..fd8f030a 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -1,11 +1,8 @@ #!/usr/bin/env bash usage(){ - echo "Usage: $0 [-f] [-k]" - echo "" - echo "Command will start training." - echo "-f Force deletion of model path. Ask for no confirmations." - echo "-k Keep model path" + echo "Usage: $0 [-w]" + echo " -w Wipes the target AWS DeepRacer model structure before upload." 
exit 1 } @@ -16,14 +13,9 @@ function ctrl_c() { exit 1 } -OPT_DELIM='-' - -while getopts ":fkh" opt; do +while getopts ":wh" opt; do case $opt in - -f) OPT_FORCE="True" -;; -k) OPT_KEEP="Keep" +w) OPT_WIPE="WIPE" ;; h) usage ;; @@ -33,32 +25,31 @@ usage esac done -export ROBOMAKER_COMMAND="./run.sh build distributed_training.launch" +S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX" -if [[ -z "${OPT_KEEP}" ]]; -then - MODEL_DIR_S3=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 ls s3://${LOCAL_S3_BUCKET}/${LOCAL_S3_MODEL_PREFIX} ) - if [[ -n "${MODEL_DIR_S3}" ]]; - then - echo "The new model's S3 prefix s3://${LOCAL_S3_BUCKET}/${LOCAL_S3_MODEL_PREFIX} exists. Will wipe." - if [[ -z "${OPT_FORCE}" ]]; - then - read -r -p "Are you sure? [y/N] " response - if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] - then - echo "Aborting." - exit 1 - fi - fi - aws $LOCAL_PROFILE_ENDPOINT_URL s3 rm s3://${LOCAL_S3_BUCKET}/${LOCAL_S3_MODEL_PREFIX} --recursive - fi +S3_FILES=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls ${S3_PATH} | wc -l) +if [[ $S3_FILES > 0 ]]; +then + if [[ -z $OPT_WIPE ]]; + then + echo "Selected path $S3_PATH exists. Delete it, or use -w option. Exiting." + exit 1 + else + echo "Wiping path $S3_PATH." + aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 rm --recursive ${S3_PATH} + fi fi +echo "Creating Robomaker configuration in $S3_PATH/training_params.yaml" +python3 prepare-config.py + +export ROBOMAKER_COMMAND="./run.sh build distributed_training.launch" +export COMPOSE_FILE=$DR_COMPOSE_FILE docker-compose up -d -echo 'waiting for containers to start up...' +echo 'Waiting for containers to start up...' 
#sleep for 20 seconds to allow the containers to start -sleep 20 +sleep 5 if xhost >& /dev/null; then diff --git a/scripts/training/stop.sh b/scripts/training/stop.sh index 413f711e..052c315b 100755 --- a/scripts/training/stop.sh +++ b/scripts/training/stop.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash - +export COMPOSE_FILE=$DR_COMPOSE_FILE docker-compose down SAGEMAKER=$(docker ps | awk ' /sagemaker/ { print $1 }') diff --git a/scripts/training/temp.yml b/scripts/training/temp.yml new file mode 100644 index 00000000..522088b6 --- /dev/null +++ b/scripts/training/temp.yml @@ -0,0 +1,24 @@ +--- +'ALTERNATE_DRIVING_DIRECTION': 'false' +'AWS_REGION': 'us-east-1' +'CAR_COLOR': 'Red' +'CAR_NAME': 'MyCar' +'CHANGE_START_POSITION': 'true' +'JOB_TYPE': 'TRAINING' +'KINESIS_VIDEO_STREAM_NAME': 'my-kinesis-stream' +'METRICS_S3_BUCKET': 'bucket' +'METRICS_S3_OBJECT_KEY': 'rl-deepracer-sagemaker/metrics/training_metrics.json' +'METRIC_NAME': 'TrainingRewardScore' +'METRIC_NAMESPACE': 'AWSDeepRacer' +'MODEL_METADATA_FILE_S3_KEY': 'custom_files/model_metadata.json' +'NUMBER_OF_EPISODES': '0' +'RACE_TYPE': 'TIME_TRIAL' +'REWARD_FILE_S3_KEY': 'custom_files/rewards.py' +'ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID': 'Dummy' +'SAGEMAKER_SHARED_S3_BUCKET': 'bucket' +'SAGEMAKER_SHARED_S3_PREFIX': 'rl-deepracer-sagemaker' +'SIMTRACE_S3_BUCKET': 'bucket' +'SIMTRACE_S3_PREFIX': 'rl-deepracer-sagemaker' +'TARGET_REWARD_SCORE': 'None' +'TRAINING_JOB_ARN': 'arn:Dummy' +'WORLD_NAME': 'LGSWide' diff --git a/scripts/upload/list-set-models.sh b/scripts/upload/list-set-models.sh index a11cba4d..c31e26a8 100755 --- a/scripts/upload/list-set-models.sh +++ b/scripts/upload/list-set-models.sh @@ -29,7 +29,7 @@ usage esac done -TARGET_S3_BUCKET=${UPLOAD_S3_BUCKET} +TARGET_S3_BUCKET=${DR_UPLOAD_S3_BUCKET} WORK_DIR=/mnt/deepracer/tmp-list mkdir -p ${WORK_DIR} @@ -38,7 +38,7 @@ then PARAM_FILES=$(ls -t "${WORK_DIR}" ) echo -e "Using local cache..." 
else - PARAM_FILES=$(aws ${UPLOAD_PROFILE} s3 ls s3://${TARGET_S3_BUCKET} --recursive | awk '/training_params*/ {print $4}' ) + PARAM_FILES=$(aws ${DR_UPLOAD_PROFILE} s3 ls s3://${TARGET_S3_BUCKET} --recursive | awk '/training_params*/ {print $4}' ) echo -e "\nLooking for DeepRacer models in s3://${TARGET_S3_BUCKET}...\n" fi @@ -57,7 +57,7 @@ then for PARAM_FILE in $PARAM_FILES; do if [[ -z "${OPT_CACHE}" ]]; then - aws ${UPLOAD_PROFILE} s3 cp s3://${TARGET_S3_BUCKET}/${PARAM_FILE} ${WORK_DIR}/ --quiet + aws ${DR_UPLOAD_PROFILE} s3 cp s3://${TARGET_S3_BUCKET}/${PARAM_FILE} ${WORK_DIR}/ --quiet PARAM_FILE_L=$(echo "$PARAM_FILE" | awk '{split($0,a,"/"); print a[2]}') else PARAM_FILE_L=$PARAM_FILE @@ -75,7 +75,7 @@ else for PARAM_FILE in $PARAM_FILES; do if [[ -z "${OPT_CACHE}" ]]; then - aws ${UPLOAD_PROFILE} s3 cp s3://${TARGET_S3_BUCKET}/${PARAM_FILE} ${WORK_DIR}/ --quiet + aws ${DR_UPLOAD_PROFILE} s3 cp s3://${TARGET_S3_BUCKET}/${PARAM_FILE} ${WORK_DIR}/ --quiet PARAM_FILE_L=$(echo "$PARAM_FILE" | awk '{split($0,a,"/"); print a[2]}') MODEL_NAME=$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' ${WORK_DIR}/${PARAM_FILE_L} | awk '{split($0,a,"/"); print a[2] }') if [ "${MODEL_NAME}" = "${OPT_SET}" ]; then @@ -104,7 +104,7 @@ else echo "Aborting." exit 1 else - sed -i.bak -re "s/(UPLOAD_S3_PREFIX=).*$/\1$MATCHED_PREFIX/g; s/(UPLOAD_MODEL_NAME=).*$/\1$MODEL_NAME/g" "$CONFIG_FILE" && echo "Done." + sed -i.bak -re "s/(DR_UPLOAD_S3_PREFIX=).*$/\1$MATCHED_PREFIX/g; s/(DR_UPLOAD_MODEL_NAME=).*$/\1$MODEL_NAME/g" "$CONFIG_FILE" && echo "Done." fi fi fi \ No newline at end of file diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index 29ea9c8f..00f6a57c 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -5,7 +5,7 @@ usage(){ echo " -f Force upload. No confirmation question." echo " -w Wipes the target AWS DeepRacer model structure before upload." echo " -d Dry-Run mode. 
Does not perform any write or delete operatios on target." - echo " -c num Uploads specified checkpoint. Default is last checkpoint." + echo " -b Uploads best checkpoint. Default is last checkpoint." echo " -p model Uploads model in specified S3 prefix." exit 1 } @@ -17,9 +17,9 @@ function ctrl_c() { exit 1 } -while getopts ":fwdhc:p:" opt; do +while getopts ":fwdhbp:" opt; do case $opt in -c) OPT_CHECKPOINT="$OPTARG" +b) OPT_CHECKPOINT="Best" ;; f) OPT_FORCE="True" ;; @@ -42,35 +42,38 @@ then echo "*** DRYRUN MODE ***" fi -TARGET_S3_BUCKET=${UPLOAD_S3_BUCKET} -TARGET_S3_PREFIX=${UPLOAD_S3_PREFIX} +TARGET_S3_BUCKET=${DR_UPLOAD_S3_BUCKET} +TARGET_S3_PREFIX=${DR_UPLOAD_S3_PREFIX} -if [[ -z "${UPLOAD_S3_BUCKET}" ]]; +if [[ -z "${DR_UPLOAD_S3_BUCKET}" ]]; then echo "No upload bucket defined. Exiting." exit 1 fi -if [[ -z "${UPLOAD_S3_PREFIX}" ]]; +if [[ -z "${DR_UPLOAD_S3_PREFIX}" ]]; then echo "No upload prefix defined. Exiting." exit 1 fi -SOURCE_S3_BUCKET=${LOCAL_S3_BUCKET} +SOURCE_S3_BUCKET=${DR_LOCAL_S3_BUCKET} if [[ -n "${OPT_PREFIX}" ]]; then SOURCE_S3_MODEL_PREFIX=${OPT_PREFIX} else - SOURCE_S3_MODEL_PREFIX=${LOCAL_S3_MODEL_PREFIX} + SOURCE_S3_MODEL_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} fi -SOURCE_S3_CONFIG=${LOCAL_S3_CUSTOM_FILES_PREFIX} +SOURCE_S3_CONFIG=${DR_LOCAL_S3_CUSTOM_FILES_PREFIX} +SOURCE_S3_REWARD=${DR_LOCAL_S3_REWARD_KEY} +SOURCE_S3_METRICS=${DR_LOCAL_S3_METRICS_KEY} + WORK_DIR=/mnt/deepracer/tmp/ mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}model # Download information on model. 
-PARAM_FILE=$(aws ${UPLOAD_PROFILE} s3 sync s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX} ${WORK_DIR} --exclude "*" --include "training_params*" --no-progress | awk '{print $4}' | xargs readlink -f 2> /dev/null) +PARAM_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 sync s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX} ${WORK_DIR} --exclude "*" --include "training_params*" --no-progress | awk '{print $4}' | xargs readlink -f 2> /dev/null) if [ -n "$PARAM_FILE" ]; then TARGET_METADATA_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/"$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' $PARAM_FILE | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") @@ -80,16 +83,15 @@ then MODEL_NAME=$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' $PARAM_FILE | awk '{split($0,a,"/"); print a[2] }') echo "Detected DeepRacer Model ${MODEL_NAME} at s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/." else - echo "No DeepRacer information found in s3://${UPLOAD_S3_BUCKET}/${UPLOAD_S3_PREFIX}. Exiting" + echo "No DeepRacer information found in s3://${DR_UPLOAD_S3_BUCKET}/${DR_UPLOAD_S3_PREFIX}. 
Exiting" exit 1 fi - # Check if metadata-files are available -REWARD_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/reward.py ${WORK_DIR} --no-progress | awk '/reward.py$/ {print $4}'| xargs readlink -f 2> /dev/null) -METADATA_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/model_metadata.json ${WORK_DIR} --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f 2> /dev/null) -HYPERPARAM_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/hyperparameters.json ${WORK_DIR} --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink -f 2> /dev/null) -METRICS_FILE=$(aws $LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/metrics/metric.json ${WORK_DIR} --no-progress | awk '/metric.json$/ {print $4}'| xargs readlink -f 2> /dev/null) +REWARD_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_REWARD} ${WORK_DIR} --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) +METADATA_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/model_metadata.json ${WORK_DIR} --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f 2> /dev/null) +HYPERPARAM_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/ip/hyperparameters.json ${WORK_DIR} --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink -f 2> /dev/null) +METRICS_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_METRICS} ${WORK_DIR} --no-progress | awk '/metric/ {print $4}'| xargs readlink -f 2> /dev/null) if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$METRICS_FILE" ] && [ -n "$HYPERPARAM_FILE" ]; then @@ -104,31 +106,30 @@ fi # Download checkpoint file echo "Looking for model to upload from 
s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/" -CHECKPOINT_FILE=$(aws ${LOCAL_PROFILE_ENDPOINT_URL} s3 sync s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ ${WORK_DIR}model --exclude "*" --include "checkpoint" --no-progress | awk '{print $4}' | xargs readlink -f 2> /dev/null) +CHECKPOINT_INDEX=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/deepracer_checkpoints.json ${WORK_DIR}model/ --no-progress | awk '{print $4}' | xargs readlink -f 2> /dev/null) -if [ -z "$CHECKPOINT_FILE" ]; then +if [ -z "$CHECKPOINT_INDEX" ]; then echo "No checkpoint file available at s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model. Exiting." exit 1 fi if [ -z "$OPT_CHECKPOINT" ]; then - echo "Checkpoint not supplied, checking for latest checkpoint" - - FIRST_LINE=$(head -n 1 $CHECKPOINT_FILE) - CHECKPOINT_PREFIX=$(echo $FIRST_LINE | sed "s/[model_checkpoint_path: [^ ]*//" | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") - CHECKPOINT=`echo $CHECKPOINT_PREFIX | sed 's/[_][^ ]*//'` - echo "Latest checkpoint = "$CHECKPOINT + echo "Checking for latest checkpoint" + CHECKPOINT_FILE=`jq -r .last_checkpoint.name < $CHECKPOINT_INDEX` + CHECKPOINT=`echo $CHECKPOINT_FILE | cut -f1 -d_` + echo "Latest checkpoint = $CHECKPOINT" else - CHECKPOINT="${OPT_CHECKPOINT}" - CHECKPOINT_PREFIX=$(cat $CHECKPOINT_FILE | grep "all_model_checkpoint_paths: \"$CHECKPOINT" | sed "s/[all_model_checkpoint_paths: [^ ]*//" | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") - echo "Checkpoint supplied: ["${CHECKPOINT}"]" + echo "Checking for best checkpoint" + CHECKPOINT_FILE=`jq -r .best_checkpoint.name < $CHECKPOINT_INDEX` + CHECKPOINT=`echo $CHECKPOINT_FILE | cut -f1 -d_` + echo "Best checkpoint: $CHECKPOINT" fi # Find checkpoint & model files - download -if [ -n "$CHECKPOINT_PREFIX" ]; then - CHECKPOINT_MODEL_FILES=$(aws ${LOCAL_PROFILE_ENDPOINT_URL} s3 sync s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ ${WORK_DIR}model/ --exclude "*" --include 
"${CHECKPOINT_PREFIX}*" --include "model_${CHECKPOINT}.pb" --no-progress | awk '{print $4}' | xargs readlink -f) +if [ -n "$CHECKPOINT" ]; then + CHECKPOINT_MODEL_FILES=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 sync s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ ${WORK_DIR}model/ --exclude "*" --include "${CHECKPOINT}*" --include "model_${CHECKPOINT}.pb" --include "deepracer_checkpoints.json" --no-progress | awk '{print $4}' | xargs readlink -f) cp ${METADATA_FILE} ${WORK_DIR}model/ - echo "model_checkpoint_path: \"${CHECKPOINT_PREFIX}\"" | tee ${CHECKPOINT_FILE} + echo "model_checkpoint_path: \"${CHECKPOINT_FILE}\"" | tee ${WORK_DIR}model/checkpoint else echo "Checkpoint not found. Exiting." exit 1 @@ -148,8 +149,8 @@ fi touch ${WORK_DIR}model/.ready cd ${WORK_DIR} -aws ${UPLOAD_PROFILE} s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ ${OPT_DRYRUN} ${OPT_WIPE} -aws ${UPLOAD_PROFILE} s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} ${OPT_DRYRUN} -aws ${UPLOAD_PROFILE} s3 cp ${METADATA_FILE} ${TARGET_METADATA_FILE_S3_KEY} ${OPT_DRYRUN} -aws ${UPLOAD_PROFILE} s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} -aws ${UPLOAD_PROFILE} s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} ${OPT_DRYRUN} +aws ${DR_UPLOAD_PROFILE} s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ ${OPT_DRYRUN} ${OPT_WIPE} +aws ${DR_UPLOAD_PROFILE} s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} ${OPT_DRYRUN} +aws ${DR_UPLOAD_PROFILE} s3 cp ${METADATA_FILE} ${TARGET_METADATA_FILE_S3_KEY} ${OPT_DRYRUN} +aws ${DR_UPLOAD_PROFILE} s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} +aws ${DR_UPLOAD_PROFILE} s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} ${OPT_DRYRUN} From 325a799eeaf8993fb7c45acdfd918cd9b0ba5328 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 10 Apr 2020 09:36:28 +0000 Subject: [PATCH 044/428] Fixing start script --- scripts/training/start.sh | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/start.sh b/scripts/training/start.sh index fd8f030a..f16a7ae7 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -59,7 +59,7 @@ then echo 'Error: skip showing sagemaker logs because gnome-terminal is not installed. This is normal if you are on a different OS to Ubuntu.' else echo 'attempting to pull up sagemaker logs...' - gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }')" + gnome-terminal -x sh -c "!!; docker logs -f $(docker ps -a | awk ' /sagemaker/ { print $1 }')" fi if ! [ -x "$(command -v gnome-terminal)" ]; From 42b967fc51c935edc79ac6e1f2366d3cfdfdf522 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Wed, 15 Apr 2020 11:00:20 +0000 Subject: [PATCH 045/428] Fixing log-analysis folder permissions --- scripts/log-analysis/start.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/log-analysis/start.sh b/scripts/log-analysis/start.sh index cb7fe6ba..8996dd93 100755 --- a/scripts/log-analysis/start.sh +++ b/scripts/log-analysis/start.sh @@ -4,5 +4,6 @@ docker run --rm -it -p "8888:8888" \ -v `pwd`/../../logs:/workspace/logs \ -v `pwd`/../../docker/volumes/.aws:/root/.aws \ -v `pwd`/../../analysis:/workspace/analysis \ +-e HOST_PERMS="$(id -u):$(id -g)" \ --name loganalysis \ larsll/deepracer-loganalysis:v2-cpu From 4e071b4e6e08bbfbfd081998b06e42e9b7f51ec9 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Wed, 15 Apr 2020 14:32:47 +0200 Subject: [PATCH 046/428] Enabling automatic setup for non-GPU instances (#25) * Initial fix to allow for CPU based setup * Fixing issues --- bin/init.sh | 9 +++++++++ bin/prepare.sh | 34 +++++++++++++++++++++------------- defaults/template-run.env | 4 ++-- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/bin/init.sh b/bin/init.sh index 0885e0b5..d00d1f79 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -62,6 
+62,15 @@ cp $INSTALL_DIR/defaults/template-run.env $INSTALL_DIR/current-run.env if [[ -n "$OPT_CLOUD" ]]; then sed -i "s//$OPT_CLOUD/g" $INSTALL_DIR/current-run.env +else + sed -i "s//local/g" $INSTALL_DIR/current-run.env +fi + +if [[ "${OPT_ARCH}" == "gpu" ]]; +then + sed -i "s//True/g" $INSTALL_DIR/current-run.env +else + sed -i "s//False/g" $INSTALL_DIR/current-run.env fi #set proxys if required diff --git a/bin/prepare.sh b/bin/prepare.sh index a95e171c..847827c4 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -21,8 +21,10 @@ echo "Detected cloud type ${CLOUD_NAME}" GPUS=$(lspci | awk '/NVIDIA/ && /3D controller/' | wc -l) if [ $? -ne 0 ] || [ $GPUS -eq 0 ]; then - echo "No NVIDIA GPU detected. Exiting". - exit 1 + ARCH="cpu" + echo "No NVIDIA GPU detected. Will not install drivers." +else + ARCH="gpu" fi ## Do I have an additional disk for Docker images - looking for /dev/sdc (Azure) @@ -88,10 +90,13 @@ then fi ## Adding Nvidia Drivers -sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub -sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list' -sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda_learn.list' -sudo bash -c 'apt update && apt install -y nvidia-driver-440 cuda-minimal-build-10-2 -o Dpkg::Options::="--force-overwrite"' +if [[ "${ARCH}" == "gpu" ]]; +then + sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub + sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list' + sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda_learn.list' + sudo bash -c 'apt update && apt install -y 
nvidia-driver-440 cuda-minimal-build-10-2 -o Dpkg::Options::="--force-overwrite"' +fi ## Adding AWSCli sudo apt-get install -y awscli python3-boto3 @@ -101,12 +106,15 @@ curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" sudo apt-get update && apt-get install -y docker-ce docker-ce-cli containerd.io -distribution=$(. /etc/os-release;echo $ID$VERSION_ID) -curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - -curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list +if [[ "${ARCH}" == "gpu" ]]; +then + distribution=$(. /etc/os-release;echo $ID$VERSION_ID) + curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - + curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list -sudo apt-get update && sudo apt-get install -y nvidia-docker2 nvidia-container-toolkit nvidia-container-runtime -jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' /etc/docker/daemon.json | sudo tee /etc/docker/daemon.json + sudo apt-get update && sudo apt-get install -y nvidia-docker2 nvidia-container-toolkit nvidia-container-runtime + jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' /etc/docker/daemon.json | sudo tee /etc/docker/daemon.json +fi sudo systemctl enable docker sudo systemctl restart docker @@ -124,9 +132,9 @@ if [[ "$CLOUD_INIT" -ne 0 ]]; then echo "Rebooting in 5 seconds. Will continue with install." cd $DIR - ./runonce.sh "./init.sh -m /mnt -c ${CLOUD_NAME}" + ./runonce.sh "./init.sh -m /mnt -c ${CLOUD_NAME} -a ${ARCH}" sleep 5s sudo reboot else echo "First stage done. 
Please reboot and run init.sh" -fi \ No newline at end of file +fi diff --git a/defaults/template-run.env b/defaults/template-run.env index d1905477..bd918bed 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -11,7 +11,7 @@ DR_CAR_NAME=FastCar DR_KINESIS_STREAM_NAME=my-test-stream DR_KINESIS_STREAM_ENABLE=False DR_GUI_ENABLE=False -DR_GPU_AVAILABLE=True +DR_GPU_AVAILABLE= DR_ROBOMAKER_IMAGE_TYPE=cpu DR_LOCAL_S3_PRETRAINED=False DR_LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained @@ -23,4 +23,4 @@ DR_LOCAL_S3_PARAMS_FILE=training-params.yaml DR_TARGET_REWARD_SCORE=None DR_NUMBER_OF_EPISODES=500 DR_LOCAL_S3_REWARD_KEY=custom_files/reward_function.py -DR_LOCAL_S3_METRICS_KEY=$DR_LOCAL_S3_MODEL_PREFIX/metrics/training_metrics.json \ No newline at end of file +DR_LOCAL_S3_METRICS_KEY=$DR_LOCAL_S3_MODEL_PREFIX/metrics/training_metrics.json From ada1de3507aefb167b978c6283bde5f18f539f53 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Wed, 15 Apr 2020 18:12:42 +0000 Subject: [PATCH 047/428] Fix missing sudo in script --- bin/prepare.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/prepare.sh b/bin/prepare.sh index 847827c4..e7f40f54 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -104,7 +104,7 @@ sudo apt-get install -y awscli python3-boto3 ## Installing Docker curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" -sudo apt-get update && apt-get install -y docker-ce docker-ce-cli containerd.io +sudo apt-get update && sudo apt-get install -y docker-ce docker-ce-cli containerd.io if [[ "${ARCH}" == "gpu" ]]; then From 2357076fa1900f428dbb3d16fd16cca07d7c4f41 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Wed, 22 Apr 2020 21:17:46 +0000 Subject: [PATCH 048/428] Fixing typo in environment variable --- bin/activate.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 33d7f64e..6cd54b91 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -63,7 +63,7 @@ export DR_LOCAL_PROFILE_ENDPOINT_URL function dr-upload-custom-files { if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; then - ROBOMAKER_COMMAND="" docker-compose $DR_COMPOSE_FILES up -d minio + ROBOMAKER_COMMAND="" docker-compose $DR_COMPOSE_FILE up -d minio fi eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Uploading files to $CUSTOM_TARGET" @@ -85,7 +85,7 @@ function dr-set-upload-model { function dr-download-custom-files { if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; then - ROBOMAKER_COMMAND="" docker-compose $DR_COMPOSE_FILES up -d minio + ROBOMAKER_COMMAND="" docker-compose $DR_COMPOSE_FILE up -d minio fi eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Downloading files from $CUSTOM_TARGET" From 018ae4dc2595dfa52b267f879b6e87c21220d343 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 24 Apr 2020 12:15:35 +0000 Subject: [PATCH 049/428] Fixing missing parameter --- bin/activate.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 6cd54b91..381c40b1 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -63,7 +63,7 @@ export DR_LOCAL_PROFILE_ENDPOINT_URL function dr-upload-custom-files { if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; then - ROBOMAKER_COMMAND="" docker-compose $DR_COMPOSE_FILE up -d minio + ROBOMAKER_COMMAND="" docker-compose -f $DR_COMPOSE_FILE up -d minio fi eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Uploading files to $CUSTOM_TARGET" @@ -85,7 +85,7 @@ function dr-set-upload-model { function dr-download-custom-files { if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; then - ROBOMAKER_COMMAND="" docker-compose 
$DR_COMPOSE_FILE up -d minio + ROBOMAKER_COMMAND="" docker-compose -f $DR_COMPOSE_FILE up -d minio fi eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Downloading files from $CUSTOM_TARGET" From d50d5311f0b69cf86c8a0991c9246ca25066a5a3 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 24 Apr 2020 12:34:35 +0000 Subject: [PATCH 050/428] Fixing new format required by docker-compose --- bin/activate.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 381c40b1..1f17f947 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -37,15 +37,15 @@ if [[ "${DR_CLOUD,,}" == "azure" ]]; then export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_ENDPOINT_URL" - DR_COMPOSE_FILE="$DIR/docker/docker-compose.yml:$DIR/docker/docker-compose-azure.yml" + DR_COMPOSE_FILE="-f $DIR/docker/docker-compose.yml -f $DIR/docker/docker-compose-azure.yml" elif [[ "${DR_CLOUD,,}" == "local" ]]; then export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_ENDPOINT_URL" - DR_COMPOSE_FILE="$DIR/docker/docker-compose.yml:$DIR/docker/docker-compose-local.yml" + DR_COMPOSE_FILE="-f $DIR/docker/docker-compose.yml -f $DIR/docker/docker-compose-local.yml" else DR_LOCAL_PROFILE_ENDPOINT_URL="" - DR_COMPOSE_FILE="$DIR/docker/docker-compose.yml" + DR_COMPOSE_FILE="-f $DIR/docker/docker-compose.yml" fi ## Check if we have an AWS IAM assumed role, or if we need to set specific credentials. 
@@ -53,7 +53,7 @@ if [ $(aws sts get-caller-identity | jq '.Arn' | awk /assumed-role/ | wc -l) -eq then export DR_LOCAL_ACCESS_KEY_ID=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_access_key_id | xargs) export DR_LOCAL_SECRET_ACCESS_KEY=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) - DR_COMPOSE_FILE="$DR_COMPOSE_FILE:$DIR/docker/docker-compose-keys.yml" + DR_COMPOSE_FILE="$DR_COMPOSE_FILE -f $DIR/docker/docker-compose-keys.yml" export DR_UPLOAD_PROFILE="--profile $DR_UPLOAD_S3_PROFILE" fi @@ -63,7 +63,7 @@ export DR_LOCAL_PROFILE_ENDPOINT_URL function dr-upload-custom-files { if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; then - ROBOMAKER_COMMAND="" docker-compose -f $DR_COMPOSE_FILE up -d minio + ROBOMAKER_COMMAND="" docker-compose $DR_COMPOSE_FILE up -d minio fi eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Uploading files to $CUSTOM_TARGET" @@ -85,7 +85,7 @@ function dr-set-upload-model { function dr-download-custom-files { if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; then - ROBOMAKER_COMMAND="" docker-compose -f $DR_COMPOSE_FILE up -d minio + ROBOMAKER_COMMAND="" docker-compose $DR_COMPOSE_FILE up -d minio fi eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Downloading files from $CUSTOM_TARGET" From 33c6a6b342174a102fa8865431406cf2b1d4a1f4 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 24 Apr 2020 12:41:14 +0000 Subject: [PATCH 051/428] Fixing variable --- bin/activate.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 1f17f947..0e22807c 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -36,12 +36,12 @@ fi if [[ "${DR_CLOUD,,}" == "azure" ]]; then export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" - DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_ENDPOINT_URL" + 
DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" DR_COMPOSE_FILE="-f $DIR/docker/docker-compose.yml -f $DIR/docker/docker-compose-azure.yml" elif [[ "${DR_CLOUD,,}" == "local" ]]; then export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" - DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_ENDPOINT_URL" + DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" DR_COMPOSE_FILE="-f $DIR/docker/docker-compose.yml -f $DIR/docker/docker-compose-local.yml" else DR_LOCAL_PROFILE_ENDPOINT_URL="" From 4b6598f3db6ae886fbefe01a7ce1e10193256754 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 24 Apr 2020 12:46:37 +0000 Subject: [PATCH 052/428] Changed way to use compose files --- scripts/evaluation/start.sh | 2 +- scripts/evaluation/stop.sh | 2 +- scripts/training/start.sh | 4 ++-- scripts/training/stop.sh | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/evaluation/start.sh b/scripts/evaluation/start.sh index 03deaaaf..39adedc0 100755 --- a/scripts/evaluation/start.sh +++ b/scripts/evaluation/start.sh @@ -2,7 +2,7 @@ export ROBOMAKER_COMMAND="./run.sh build evaluation.launch" export METRICS_S3_OBJECT_KEY=metrics/eval_metrics.json -docker-compose up -d +docker-compose $DR_COMPOSE_FILE up -d echo 'waiting for containers to start up...' 
diff --git a/scripts/evaluation/stop.sh b/scripts/evaluation/stop.sh index 413f711e..ba454672 100755 --- a/scripts/evaluation/stop.sh +++ b/scripts/evaluation/stop.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -docker-compose down +docker-compose $DR_COMPOSE_FILE down SAGEMAKER=$(docker ps | awk ' /sagemaker/ { print $1 }') if [[ -n $SAGEMAKER ]]; diff --git a/scripts/training/start.sh b/scripts/training/start.sh index f16a7ae7..60ebb93b 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -44,8 +44,8 @@ echo "Creating Robomaker configuration in $S3_PATH/training_params.yaml" python3 prepare-config.py export ROBOMAKER_COMMAND="./run.sh build distributed_training.launch" -export COMPOSE_FILE=$DR_COMPOSE_FILE -docker-compose up -d +#export COMPOSE_FILE=$DR_COMPOSE_FILE +docker-compose $DR_COMPOSE_FILE up -d echo 'Waiting for containers to start up...' #sleep for 20 seconds to allow the containers to start diff --git a/scripts/training/stop.sh b/scripts/training/stop.sh index 052c315b..047b2b00 100755 --- a/scripts/training/stop.sh +++ b/scripts/training/stop.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -export COMPOSE_FILE=$DR_COMPOSE_FILE -docker-compose down +#export COMPOSE_FILE=$DR_COMPOSE_FILE +docker-compose $DR_COMPOSE_FILE down SAGEMAKER=$(docker ps | awk ' /sagemaker/ { print $1 }') if [[ -n $SAGEMAKER ]]; From 46bd93bbb4e62c4c5c47635e9dc60032f855c062 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 24 Apr 2020 13:24:07 +0000 Subject: [PATCH 053/428] Ensure minio runs on local --- scripts/training/start.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 60ebb93b..2ba8c3c8 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -41,9 +41,14 @@ then fi echo "Creating Robomaker configuration in $S3_PATH/training_params.yaml" +export ROBOMAKER_COMMAND="./run.sh build distributed_training.launch" +if [[ "${DR_CLOUD,,}" == "azure" || 
"${DR_CLOUD,,}" == "local" ]]; +then + docker-compose $DR_COMPOSE_FILE up -d minio +fi python3 prepare-config.py -export ROBOMAKER_COMMAND="./run.sh build distributed_training.launch" + #export COMPOSE_FILE=$DR_COMPOSE_FILE docker-compose $DR_COMPOSE_FILE up -d echo 'Waiting for containers to start up...' From 8a48b667ef60e25d08cedbb72889a4b25ba4a6d8 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 24 Apr 2020 13:30:22 +0000 Subject: [PATCH 054/428] Ignore all .env files in root --- .gitignore | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index d1861aab..cc7b54fb 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,6 @@ logs/ docker/volumes/ recording/ recording -current-run.env -current-run.env.bak +/*.env +/*.bak DONE From 0fd223a7fbf9376f28ce14f2ff9960eaf3bccb92 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Wed, 13 May 2020 13:44:11 +0200 Subject: [PATCH 055/428] Feature: Multiple Parallel Trainings (#27) Significant changes to allow for several parallel training sessions. Uses docker swarm instead of docker-compose for most activities. 
--- .gitignore | 3 +- README.md | 39 ++++--- bin/activate.sh | 169 +++++++++------------------ bin/init.sh | 90 +++++++++----- bin/prepare.sh | 39 +------ bin/scripts_wrapper.sh | 132 +++++++++++++++++++++ defaults/template-run.env | 30 +++-- defaults/template-system.env | 14 +++ docker/docker-compose-azure.yml | 16 +-- docker/docker-compose-endpoint.yml | 9 ++ docker/docker-compose-local.yml | 22 ++-- docker/docker-compose.yml | 29 ++--- scripts/evaluation/prepare-config.py | 54 +++++++++ scripts/evaluation/start.sh | 22 +++- scripts/evaluation/stop.sh | 24 +++- scripts/log-analysis/start.sh | 4 +- scripts/training/increment.sh | 2 +- scripts/training/prepare-config.py | 16 ++- scripts/training/start.sh | 21 ++-- scripts/training/stop.sh | 26 +++-- scripts/training/temp.yml | 24 ---- scripts/upload/list-set-models.sh | 4 +- scripts/upload/upload-model.sh | 2 +- 23 files changed, 482 insertions(+), 309 deletions(-) create mode 100644 bin/scripts_wrapper.sh create mode 100644 defaults/template-system.env create mode 100644 docker/docker-compose-endpoint.yml create mode 100755 scripts/evaluation/prepare-config.py delete mode 100644 scripts/training/temp.yml diff --git a/.gitignore b/.gitignore index cc7b54fb..0b814f8b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ .vscode/ custom_files/ -analysis/ logs/ docker/volumes/ recording/ @@ -8,3 +7,5 @@ recording /*.env /*.bak DONE +data/ +tmp/ \ No newline at end of file diff --git a/README.md b/README.md index acac78a3..0643f71f 100644 --- a/README.md +++ b/README.md @@ -10,11 +10,9 @@ Main differences to the work done by Alex is: * Robomaker and Log Analysis containers are extended with required drivers to enable Tensorflow to use the GPU. Containers are all pre-compiled and available from Docker Hub. * Configuration has been reorganized : * `custom_files/hyperparameters.json` stores the runtime hyperparameters, which logically belongs together with the model_metadata.json and rewards.py files. 
- * `current-run.env` contains user session configuration (pretraining, track etc.) as well as information about where to upload your model (S3 bucket and prefix). + * `system.env` contains system-wide constants (expected to be configured only at setup) + * `run.env` contains user session configuration (pretraining, track etc.) as well as information about where to upload your model (S3 bucket and prefix). * `docker/.env` remains the home for more static configuration. This is not expected to change between sessions. -* Runtime storage: Uses `/mnt` to store robomaker files (checkpoints, logs); depending on setup these will normally be deleted between runs, but Azure and AWS provides 200+ GB free storage which is very suitable for this purpuse. Archiving of logs and additional checkpoint files required if desired. (Update: as of V2 this is less important as the robomaker is now cleaning up on itsown) - * Azure: Uses the normal temporary drive which is mounted on /mnt by default. - * AWS: Preparation scripts mounts the ephemeral drive on /mnt ## Requirements @@ -40,8 +38,6 @@ Depending on your needs as well as specific needs of the cloud platform you can ## Installation -A step by step [installation guide](https://github.com/larsll/deepracer-for-cloud/wiki/Install-DeepRacer-in-Azure) for manual installation in Azure is available. - The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine. git clone https://github.com/larsll/deepracer-for-cloud.git @@ -53,14 +49,17 @@ The installation script will adapt `.profile` to ensure that all settings are ap For local install it is recommended *not* to run the `bin/prepare.sh` script; it might do more changes than what you want. Rather ensure that all prerequisites are set up and run `bin/init.sh` directly. 
+The Init Script takes a few parameters: +| Variable | Description | +|----------|-------------| +| `-c ` | Sets the cloud version to be configured, automatically updates the `DR_CLOUD` parameter in `system.env`. Options are `azure`, `aws` or `local`. Default is `local` | +| `-a ` | Sets the architecture to be configured. Either `cpu` or `gpu`. Default is `gpu`. | + *TODO: Document how to configure via cloud-init.* -*TODO: Create a local setup prepare script* ## Environment Setup -The environment is set via the `CLOUD` parameter in `current-run.env`; it can be `Azure`, `AWS` or `Local`. It is case-insensitive. Depending on the value the virtual or native S3 instance will be configured accordingly. - -Note: If in the `bin/prepare.sh` script then the working directory `/mnt/deepracer` will be provided based on the temporary storage partitions made available. If you want to provision the working directory in a different fashion then just ensure that a volume is mounted on `/mnt` or `/mnt/deepracer` with sufficient storage. +The environment is set via the `CLOUD` parameter in `system.env`; it can be `Azure`, `AWS` or `Local`. It is case-insensitive. Depending on the value the virtual or native S3 instance will be configured accordingly. ### AWS @@ -75,7 +74,7 @@ To use IAM Roles: * AmazonVPCReadOnlyAccess * AmazonKinesisVideoStreamsFullAccess if you want to stream to Kinesis * An EC2 instance with the IAM Role assigned. -* Configure `current-run.env` as follows: +* Configure `run.env` as follows: * `DR_LOCAL_S3_PROFILE=default` * `DR_LOCAL_S3_BUCKET=` * `DR_UPLOAD_S3_PROFILE=default` @@ -88,7 +87,7 @@ For access with IAM user: * A real AWS IAM user set up with access keys: * User should have permissions to access the *new* bucket as well as the dedicated DeepRacer S3 bucket. * Use `aws configure` to configure this into the default profile. 
-* Configure `current-run.env` as follows: +* Configure `run.env` as follows: * `DR_LOCAL_S3_PROFILE=default` * `DR_LOCAL_S3_BUCKET=` * `DR_UPLOAD_S3_PROFILE=default` @@ -104,7 +103,7 @@ In Azure mode the script-set requires the following: * Secret Access Key is the Access Key for the Storage Account. * The blob container is equivalent to the S3 bucket. * A real AWS IAM user configured with `aws configure` to enable upload of models into AWS DeepRacer. -* Configure `current-run.env` as follows: +* Configure `run.env` as follows: * `DR_LOCAL_S3_PROFILE=` * `DR_LOCAL_S3_BUCKET=` * `DR_UPLOAD_S3_PROFILE=default` @@ -117,14 +116,18 @@ If you want to use awscli (`aws`) to manually move files then use `aws $DR_LOCAL ### Local -Local mode runs a minio server that hosts the data in the `/mnt/deepracer` partition. It is otherwise command-compatible with the Azure setup; as the data is accessible via Minio and not via native S3. +Local mode runs a minio server that hosts the data in the `docker/volumes` directory. It is otherwise command-compatible with the Azure setup; as the data is accessible via Minio and not via native S3. + +After having run init.sh do the following: +* Configure the Minio credentials with `aws configure --profile minio`. The default configuration will use the `minio` profile to configure MINIO. +* Configure your normal AWS credentials with `aws configure` if this is not already in place on your system. This is required to use the model upload functionality. ### Environment Variables -The scripts assume that a file `current-run.env` is populated with the required values. +The scripts assume that two files `systen.env` containing constant configuration values and `run.env` with run specific values is populated with the required values. Which values go into which file is not really important. 
| Variable | Description | |----------|-------------| -| `DR_CLOUD` | Can be `Azure` or `AWS`; determines how the storage will be configured.| +| `DR_CLOUD` | Can be `azure`, `aws` or `local`; determines how the storage will be configured.| | `DR_WORLD_NAME` | Defines the track to be used.| | `DR_NUMBER_OF_TRIALS` | Defines the number of trials in an evaluation session.| | `DR_CHANGE_START_POSITION` | Determines if the racer shall round-robin the starting position during training sessions. (Recommended to be `True` for initial training.)| @@ -159,7 +162,7 @@ Ensure that the configuration files are uploaded into the bucket `dr-upload-cust | Command | Description | |---------|-------------| | `dr-update` | Loads in all scripts and environment variables again.| -| `dr-update-env` | Loads in all environment variables from `current-run.env`.| +| `dr-update-env` | Loads in all environment variables from `system.env` and `run.env`.| | `dr-upload-custom-files` | Uploads changed configuration files from `custom_files/` into `s3://{DR_LOCAL_S3_BUCKET}/custom_files`.| | `dr-download-custom-files` | Downloads changed configuration files from `s3://{DR_LOCAL_S3_BUCKET}/custom_files` into `custom_files/`.| | `dr-start-training` | Starts a training session in the local VM based on current configuration.| @@ -172,5 +175,5 @@ Ensure that the configuration files are uploaded into the bucket `dr-upload-cust | `dr-logs-sagemaker` | Displays the logs from the running Sagemaker container.| | `dr-logs-robomaker` | Displays the logs from the running Robomaker container.| | `dr-list-aws-models` | Lists the models that are currently stored in your AWS DeepRacer S3 bucket. | -| `dr-set-upload-model` | Updates the `current-run.env` with the prefix and name of your selected model. | +| `dr-set-upload-model` | Updates the `run.env` with the prefix and name of your selected model. 
| | `dr-upload-model` | Uploads the model defined in `DR_LOCAL_S3_MODEL_PREFIX` to the AWS DeepRacer S3 prefix defined in `DR_UPLOAD_S3_PREFIX` | diff --git a/bin/activate.sh b/bin/activate.sh index 0e22807c..25439088 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -1,35 +1,54 @@ #!/bin/bash function dr-update-env { - if [[ -f "$DIR/current-run.env" ]] + + if [[ -f "$DIR/system.env" ]] + then + LINES=$(grep -v '^#' $DIR/system.env) + for l in $LINES; do + env_var=$(echo $l | cut -f1 -d\=) + env_val=$(echo $l | cut -f2 -d\=) + eval "export $env_var=$env_val" + done + else + echo "File system.env does not exist." + exit 1 + fi + + if [[ -f "$DR_CONFIG" ]] then - LINES=$(grep -v '^#' $DIR/current-run.env) + LINES=$(grep -v '^#' $DR_CONFIG) for l in $LINES; do env_var=$(echo $l | cut -f1 -d\=) env_val=$(echo $l | cut -f2 -d\=) eval "export $env_var=$env_val" done else - echo "File current-run.env does not exist." + echo "File run.env does not exist." exit 1 fi + + if [[ -z "${DR_RUN_ID}" ]]; then + export DR_RUN_ID=0 + fi + export DR_ROBOMAKER_PORT=$(echo "8080 + $DR_RUN_ID" | bc) + export DR_ROBOMAKER_GUI_PORT=$(echo "5900 + $DR_RUN_ID" | bc) + } SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" DIR="$( dirname $SCRIPT_DIR )" export DR_DIR=$DIR -# create directory structure for docker volumes -if [[ $(mount | grep /mnt | wc -l) -ne 0 ]]; then - mount /mnt -fi -sudo mkdir -p /mnt/deepracer /mnt/deepracer/recording -sudo chown $(id -u):$(id -g) /mnt/deepracer - -if [[ -f "$DIR/current-run.env" ]] +if [[ -f "$1" ]]; then + export DR_CONFIG=$(readlink -f $1) + dr-update-env +elif [[ -f "$DIR/run.env" ]]; +then + export DR_CONFIG="$DIR/run.env" dr-update-env else - echo "File current-run.env does not exist." + echo "No configuration file." 
exit 1 fi @@ -37,15 +56,17 @@ if [[ "${DR_CLOUD,,}" == "azure" ]]; then export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" - DR_COMPOSE_FILE="-f $DIR/docker/docker-compose.yml -f $DIR/docker/docker-compose-azure.yml" + DR_COMPOSE_FILE="-c $DIR/docker/docker-compose.yml -c $DIR/docker/docker-compose-endpoint.yml" + DR_MINIO_COMPOSE_FILE="-c $DIR/docker/docker-compose-azure.yml" elif [[ "${DR_CLOUD,,}" == "local" ]]; then export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" - DR_COMPOSE_FILE="-f $DIR/docker/docker-compose.yml -f $DIR/docker/docker-compose-local.yml" + DR_COMPOSE_FILE="-c $DIR/docker/docker-compose.yml -c $DIR/docker/docker-compose-endpoint.yml" + DR_MINIO_COMPOSE_FILE="-c $DIR/docker/docker-compose-local.yml" else DR_LOCAL_PROFILE_ENDPOINT_URL="" - DR_COMPOSE_FILE="-f $DIR/docker/docker-compose.yml" + DR_COMPOSE_FILE="-c $DIR/docker/docker-compose.yml" fi ## Check if we have an AWS IAM assumed role, or if we need to set specific credentials. 
@@ -53,116 +74,30 @@ if [ $(aws sts get-caller-identity | jq '.Arn' | awk /assumed-role/ | wc -l) -eq then export DR_LOCAL_ACCESS_KEY_ID=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_access_key_id | xargs) export DR_LOCAL_SECRET_ACCESS_KEY=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) - DR_COMPOSE_FILE="$DR_COMPOSE_FILE -f $DIR/docker/docker-compose-keys.yml" + DR_COMPOSE_FILE="$DR_COMPOSE_FILE -c $DIR/docker/docker-compose-keys.yml" export DR_UPLOAD_PROFILE="--profile $DR_UPLOAD_S3_PROFILE" + export DR_LOCAL_S3_AUTH_MODE="profile" +else + export DR_LOCAL_S3_AUTH_MODE="role" fi export DR_COMPOSE_FILE export DR_LOCAL_PROFILE_ENDPOINT_URL -function dr-upload-custom-files { - if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; - then - ROBOMAKER_COMMAND="" docker-compose $DR_COMPOSE_FILE up -d minio - fi - eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) - echo "Uploading files to $CUSTOM_TARGET" - aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DIR/custom_files/ $CUSTOM_TARGET -} - -function dr-upload-model { - dr-update-env && ${DIR}/scripts/upload/upload-model.sh "$@" -} - -function dr-list-aws-models { - dr-update-env && ${DIR}/scripts/upload/list-set-models.sh "$@" -} - -function dr-set-upload-model { - dr-update-env && ${DIR}/scripts/upload/list-set-models.sh "$@" -} - -function dr-download-custom-files { - if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; - then - ROBOMAKER_COMMAND="" docker-compose $DR_COMPOSE_FILE up -d minio - fi - eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) - echo "Downloading files from $CUSTOM_TARGET" - aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $CUSTOM_TARGET $DIR/custom_files/ -} - -function dr-start-training { - dr-update-env - bash -c "cd $DIR/scripts/training && ./start.sh $@" -} - -function dr-increment-training { - dr-update-env && ${DIR}/scripts/training/increment.sh "$@" && 
dr-update-env -} - -function dr-stop-training { - ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/training && ./stop.sh" -} - -function dr-start-evaluation { - dr-update-env - bash -c "cd $DIR/scripts/evaluation && ./start.sh" -} - -function dr-stop-evaluation { - ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/evaluation && ./stop.sh" -} - -function dr-start-loganalysis { - ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/log-analysis && ./start.sh" -} - -function dr-stop-loganalysis { - eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') - if [ -n "$LOG_ANALYSIS_ID" ]; then - ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/log-analysis && ./stop.sh" - else - echo "Log-analysis is not running." - fi - -} - -function dr-logs-sagemaker { - eval SAGEMAKER_ID=$(docker ps | awk ' /sagemaker/ { print $1 }') - if [ -n "$SAGEMAKER_ID" ]; then - docker logs -f $SAGEMAKER_ID - else - echo "Sagemaker is not running." - fi -} - -function dr-logs-robomaker { - eval ROBOMAKER_ID=$(docker ps | awk ' /robomaker/ { print $1 }') - if [ -n "$ROBOMAKER_ID" ]; then - docker logs -f $ROBOMAKER_ID - else - echo "Robomaker is not running." - fi -} - -function dr-logs-loganalysis { - eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') - if [ -n "$LOG_ANALYSIS_ID" ]; then - docker logs -f $LOG_ANALYSIS_ID - else - echo "Log-analysis is not running." 
- fi - -} +if [[ -n "${DR_MINIO_COMPOSE_FILE}" ]]; then + export MINIO_UID=$(id -u) + export MINIO_USERNAME=$(id -u -n) + export MINIO_GID=$(id -g) + export MINIO_GROUPNAME=$(id -g -n) + docker stack deploy $DR_MINIO_COMPOSE_FILE s3 +fi -function dr-clean-local { - dr-stop-training - sudo rm -rf /robo/* -} +source $SCRIPT_DIR/scripts_wrapper.sh function dr-update { - source $DIR/activate.sh + dr-update-env } - +function dr-reload { + source $DIR/bin/activate.sh $DR_CONFIG +} diff --git a/bin/init.sh b/bin/init.sh index d00d1f79..e114a177 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -8,6 +8,7 @@ function ctrl_c() { } OPT_ARCH="gpu" +OPT_CLOUD="local" while getopts ":m:c:a:" opt; do case $opt in @@ -23,55 +24,91 @@ exit 1 esac done +# Find CPU Level +CPU_LEVEL="cpu" +if [[ "$(dmesg | grep AVX | wc -l)" > 0 ]]; then + CPU_LEVEL="cpu-avx" +fi + +if [[ "$(dmesg | grep AVX2 | wc -l)" > 0 ]]; then + CPU_LEVEL="cpu-avx2" +fi + +if [[ "$(dmesg | grep AVX-512 | wc -l)" > 0 ]]; then + CPU_LEVEL="cpu-avx512" +fi + +# Check if Intel (to ensure MKN) +if [[ "$(dmesg | grep GenuineIntel | wc -l)" > 0 ]]; then + CPU_INTEL="true" +fi + +# Check GPU if [[ "${OPT_ARCH}" == "gpu" ]] then - GPUS=$(docker run --rm --gpus all nvidia/cuda:10.2-base nvidia-smi "-L" | awk '/GPU .:/' | wc -l) + GPUS=$(docker run --rm --gpus all nvidia/cuda:10.2-base nvidia-smi "-L" 2> /dev/null | awk '/GPU .:/' | wc -l ) if [ $? -ne 0 ] || [ $GPUS -eq 0 ] then - echo "No GPU detected in docker. Please check setup". - exit 1 + echo "No GPU detected in docker. Using CPU". + OPT_ARCH="cpu" fi fi + INSTALL_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." 
>/dev/null 2>&1 && pwd )" cd $INSTALL_DIR # create directory structure for docker volumes - -if [[ -n "$OPT_MOUNT" ]]; -then - mount "${OPT_MOUNT}" -fi -sudo mkdir -p /mnt/deepracer /mnt/deepracer/recording /mnt/deepracer/robo/checkpoint /mnt/deepracer/minio/bucket -sudo chown -R $(id -u):$(id -g) /mnt/deepracer -mkdir -p $INSTALL_DIR/docker/volumes +mkdir -p $INSTALL_DIR/data $INSTALL_DIR/data/recording +mkdir -p $INSTALL_DIR/data/minio $INSTALL_DIR/data/minio/bucket +mkdir -p $INSTALL_DIR/data/logs $INSTALL_DIR/data/analysis +sudo mkdir -p /tmp/sagemaker # create symlink to current user's home .aws directory # NOTE: AWS cli must be installed for this to work # https://docs.aws.amazon.com/cli/latest/userguide/install-linux-al2017.html -mkdir -p $(eval echo "~${USER}")/.aws +mkdir -p $(eval echo "~${USER}")/.aws $INSTALL_DIR/docker/volumes/ ln -sf $(eval echo "~${USER}")/.aws $INSTALL_DIR/docker/volumes/ # copy rewardfunctions -mkdir -p $INSTALL_DIR/custom_files $INSTALL_DIR/logs $INSTALL_DIR/analysis +mkdir -p $INSTALL_DIR/custom_files cp $INSTALL_DIR/defaults/hyperparameters.json $INSTALL_DIR/custom_files/ cp $INSTALL_DIR/defaults/model_metadata.json $INSTALL_DIR/custom_files/ cp $INSTALL_DIR/defaults/reward_function.py $INSTALL_DIR/custom_files/ -cp $INSTALL_DIR/defaults/template-run.env $INSTALL_DIR/current-run.env -if [[ -n "$OPT_CLOUD" ]]; -then - sed -i "s//$OPT_CLOUD/g" $INSTALL_DIR/current-run.env +cp $INSTALL_DIR/defaults/template-system.env $INSTALL_DIR/system.env +cp $INSTALL_DIR/defaults/template-run.env $INSTALL_DIR/run.env + +if [[ "${OPT_CLOUD}" == "aws" ]]; then + AWS_DR_BUCKET=$(aws s3api list-buckets | jq '.Buckets[] | select(.Name | startswith("aws-deepracer")) | .Name' -r) + AWS_EC2_AVAIL_ZONE=`curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone` + AWS_REGION="`echo \"$AWS_EC2_AVAIL_ZONE\" | sed 's/[a-z]$//'`" + if [[ ! 
-z "${AWS_DR_BUCKET}" ]]; then + sed -i "s//$AWS_DR_BUCKET/g" $INSTALL_DIR/system.env + else + sed -i "s//not-defined/g" $INSTALL_DIR/system.env + fi + sed -i "s//default/g" $INSTALL_DIR/system.env else - sed -i "s//local/g" $INSTALL_DIR/current-run.env + AWS_REGION="us-east-1" + sed -i "s//minio/g" $INSTALL_DIR/system.env + sed -i "s//not-defined/g" $INSTALL_DIR/system.env + echo "Please run 'aws configure --profile minio' to set the credentials" fi -if [[ "${OPT_ARCH}" == "gpu" ]]; -then - sed -i "s//True/g" $INSTALL_DIR/current-run.env +sed -i "s//$OPT_CLOUD/g" $INSTALL_DIR/system.env +sed -i "s//$AWS_REGION/g" $INSTALL_DIR/system.env + + +if [[ "${OPT_ARCH}" == "gpu" ]]; then + SAGEMAKER_TAG="gpu" +elif [[ -n "${CPU_INTEL}" ]]; then + SAGEMAKER_TAG="cpu-avx-mkl" else - sed -i "s//False/g" $INSTALL_DIR/current-run.env + SAGEMAKER_TAG="cpu" fi +sed -i "s//$SAGEMAKER_TAG/g" $INSTALL_DIR/system.env +sed -i "s//$CPU_LEVEL/g" $INSTALL_DIR/system.env #set proxys if required for arg in "$@"; @@ -84,17 +121,18 @@ do done # Download docker images. Change to build statements if locally built images are desired. -docker pull larsll/deepracer-rlcoach:v2 -docker pull awsdeepracercommunity/deepracer-robomaker:cpu -docker pull awsdeepracercommunity/deepracer-sagemaker:$OPT_ARCH +docker pull larsll/deepracer-rlcoach:v2.2 +docker pull awsdeepracercommunity/deepracer-robomaker:$CPU_LEVEL +docker pull awsdeepracercommunity/deepracer-sagemaker:$SAGEMAKER_TAG docker pull larsll/deepracer-loganalysis:v2-cpu # create the network sagemaker-local if it doesn't exit SAGEMAKER_NW='sagemaker-local' +docker swarm init docker network ls | grep -q $SAGEMAKER_NW if [ $? 
-ne 0 ] then - docker network create $SAGEMAKER_NW + docker network create $SAGEMAKER_NW -d overlay --attachable --scope swarm fi # ensure our variables are set on startup diff --git a/bin/prepare.sh b/bin/prepare.sh index e7f40f54..ad29e888 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -34,7 +34,7 @@ then ADDL_DISK=$(lsblk | awk '/^sdc/ {print $1}') ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}') - if [ -n $ADDL_DISK ] && [ -z $ADDL_PART]; + if [ -n "$ADDL_DISK" ] && [ -z "$ADDL_PART" ]; then echo "Found $ADDL_DISK, preparing it for use" echo -e "g\nn\np\n1\n\n\nw\n" | sudo fdisk /dev/$ADDL_DISK @@ -49,7 +49,7 @@ then echo "Error during preparing of additional disk. Exiting." exit 1 fi - elif [ -n $ADDL_DISK ] && [ -n $ADDL_PART]; + elif [ -n "$ADDL_DISK" ] && [ -n "$ADDL_PART" ]; then echo "Found $ADDL_DISK - $ADDL_PART already mounted. Installing into present drive/directory structure." @@ -58,37 +58,6 @@ then fi fi -## Do I have an ephemeral disk / temporary storage for runtime output - looking for /dev/nvme0n1 (AWS)? -if [[ "${CLOUD_NAME}" == "aws" ]]; -then - - ADDL_DISK=$(lsblk | awk '/^nvme0n1/ {print $1}') - ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}') - - if [ -n $ADDL_DISK ] && [ -z $ADDL_PART]; - then - echo "Found $ADDL_DISK, preparing it for use" - echo -e "g\nn\np\n1\n\n\nw\n" | sudo fdisk /dev/$ADDL_DISK - sleep 1s - ADDL_DEVICE=$(echo "/dev/"$ADDL_DISK"p1") - sudo mkfs.ext4 $ADDL_DEVICE - sudo mkdir -p /mnt - echo "$ADDL_DEVICE /mnt ext4 rw,user,noauto 0 0" | sudo tee -a /etc/fstab - mount /mnt - if [ $? -ne 0 ] - then - echo "Error during preparing of temporary disk. Exiting." - exit 1 - fi - elif [ -n $ADDL_DISK ] && [ -n $ADDL_PART]; - then - echo "Found $ADDL_DISK - $ADDL_PART already mounted, taking no action." - - else - echo "Did not find $ADDL_DISK, taking no action." 
- fi -fi - ## Adding Nvidia Drivers if [[ "${ARCH}" == "gpu" ]]; then @@ -132,9 +101,9 @@ if [[ "$CLOUD_INIT" -ne 0 ]]; then echo "Rebooting in 5 seconds. Will continue with install." cd $DIR - ./runonce.sh "./init.sh -m /mnt -c ${CLOUD_NAME} -a ${ARCH}" + ./runonce.sh "./init.sh -c ${CLOUD_NAME} -a ${ARCH}" sleep 5s sudo reboot else - echo "First stage done. Please reboot and run init.sh" + echo "First stage done. Please reboot and run init.sh -c ${CLOUD_NAME} -a ${ARCH}" fi diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh new file mode 100644 index 00000000..19df9b1a --- /dev/null +++ b/bin/scripts_wrapper.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +function dr-upload-custom-files { + if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; + then + docker stack deploy $DR_MINIO_COMPOSE_FILE s3 + fi + eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) + echo "Uploading files to $CUSTOM_TARGET" + aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DIR/custom_files/ $CUSTOM_TARGET +} + +function dr-upload-model { + dr-update-env && ${DIR}/scripts/upload/upload-model.sh "$@" +} + +function dr-list-aws-models { + dr-update-env && ${DIR}/scripts/upload/list-set-models.sh "$@" +} + +function dr-set-upload-model { + dr-update-env && ${DIR}/scripts/upload/list-set-models.sh "$@" +} + +function dr-download-custom-files { + if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; + then + docker stack deploy $DR_MINIO_COMPOSE_FILE s3 + fi + eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) + echo "Downloading files from $CUSTOM_TARGET" + aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $CUSTOM_TARGET $DIR/custom_files/ +} + +function dr-start-training { + dr-update-env + bash -c "cd $DIR/scripts/training && ./start.sh $@" +} + +function dr-increment-training { + dr-update-env && ${DIR}/scripts/training/increment.sh "$@" && dr-update-env +} + +function dr-stop-training { + ROBOMAKER_COMMAND="" 
bash -c "cd $DIR/scripts/training && ./stop.sh" +} + +function dr-start-evaluation { + dr-update-env + bash -c "cd $DIR/scripts/evaluation && ./start.sh $@" +} + +function dr-stop-evaluation { + ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/evaluation && ./stop.sh" +} + +function dr-start-loganalysis { + ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/log-analysis && ./start.sh" +} + +function dr-stop-loganalysis { + eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') + if [ -n "$LOG_ANALYSIS_ID" ]; then + ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/log-analysis && ./stop.sh" + else + echo "Log-analysis is not running." + fi + +} + +function dr-logs-sagemaker { + + STACK_NAME="deepracer-$DR_RUN_ID" + RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX} + + SAGEMAKER_CONTAINERS=$(docker ps | awk ' /sagemaker/ { print $1 } '| xargs ) + + if [[ -n $SAGEMAKER_CONTAINERS ]]; + then + for CONTAINER in $SAGEMAKER_CONTAINERS; do + CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER) + CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $1') + COMPOSE_SERVICE_NAME=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $2') + COMPOSE_FILE=$(sudo find /tmp/sagemaker -name docker-compose.yaml -exec grep -l "$RUN_NAME" {} + | grep $CONTAINER_PREFIX) + if [[ -n $COMPOSE_FILE ]]; then + docker logs -f $CONTAINER + fi + done + else + echo "Sagemaker is not running." + fi + +} + +function dr-logs-robomaker { + eval ROBOMAKER_ID=$(docker ps | grep "deepracer-${DR_RUN_ID}_robomaker" | cut -f1 -d\ | head -1) + if [ -n "$ROBOMAKER_ID" ]; then + docker logs -f $ROBOMAKER_ID + else + echo "Robomaker is not running." + fi +} + +function dr-logs-robomaker-debug { + eval ROBOMAKER_ID=$(docker ps | grep "deepracer-${DR_RUN_ID}_robomaker" | cut -f1 -d\ | head -1) + if [ -n "$ROBOMAKER_ID" ]; then + docker logs -f $ROBOMAKER_ID 2>&1 | grep DEBUG + else + echo "Robomaker is not running." 
+ fi +} + +function dr-logs-loganalysis { + eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') + if [ -n "$LOG_ANALYSIS_ID" ]; then + docker logs -f $LOG_ANALYSIS_ID + else + echo "Log-analysis is not running." + fi + +} + +function dr-url-loganalysis { + eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') + if [ -n "$LOG_ANALYSIS_ID" ]; then + eval URL=$(docker logs $LOG_ANALYSIS_ID | perl -n -e'/(http:\/\/127\.0\.0\.1\:8888\/\?.*)/; print $1') + echo "Log-analysis URL:" + echo $URL + else + echo "Log-analysis is not running." + fi +} diff --git a/defaults/template-run.env b/defaults/template-run.env index bd918bed..b26364b7 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -1,26 +1,24 @@ -DR_CLOUD= -DR_AWS_APP_REGION=us-east-1 +DR_RUN_ID=0 DR_WORLD_NAME=LGSWide DR_CHANGE_START_POSITION=True -DR_UPLOAD_S3_PROFILE=default -DR_UPLOAD_S3_BUCKET=aws-deepracer-mybucketidinreal -DR_UPLOAD_S3_PREFIX=DeepRacer-SageMaker-RoboMaker-comm-prefix -DR_UPLOAD_MODEL_NAME=mymodelname +DR_RACE_TYPE=TIME_TRIAL +DR_TARGET_REWARD_SCORE=None DR_CAR_COLOR=Red DR_CAR_NAME=FastCar -DR_KINESIS_STREAM_NAME=my-test-stream -DR_KINESIS_STREAM_ENABLE=False -DR_GUI_ENABLE=False -DR_GPU_AVAILABLE= -DR_ROBOMAKER_IMAGE_TYPE=cpu +DR_UPLOAD_S3_PREFIX=DeepRacer-SageMaker-RoboMaker-comm-prefix +DR_UPLOAD_MODEL_NAME=mymodelname +DR_EVAL_NUMBER_OF_TRIALS=5 +DR_EVAL_NUMBER_OF_RESETS=0 +DR_EVAL_IS_CONTINUOUS=False +DR_EVAL_OFF_TRACK_PENALTY=5.0 DR_LOCAL_S3_PRETRAINED=False DR_LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained -DR_LOCAL_S3_PROFILE=default DR_LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker DR_LOCAL_S3_BUCKET=bucket DR_LOCAL_S3_CUSTOM_FILES_PREFIX=custom_files -DR_LOCAL_S3_PARAMS_FILE=training-params.yaml -DR_TARGET_REWARD_SCORE=None -DR_NUMBER_OF_EPISODES=500 -DR_LOCAL_S3_REWARD_KEY=custom_files/reward_function.py +DR_LOCAL_S3_TRAINING_PARAMS_FILE=training-params.yaml +DR_LOCAL_S3_EVAL_PARAMS_FILE=eval-params.yaml 
+DR_LOCAL_S3_MODEL_METADATA_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/model_metadata.json +DR_LOCAL_S3_HYPERPARAMETERS_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/hyperparameters.json +DR_LOCAL_S3_REWARD_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/reward_function.py DR_LOCAL_S3_METRICS_KEY=$DR_LOCAL_S3_MODEL_PREFIX/metrics/training_metrics.json diff --git a/defaults/template-system.env b/defaults/template-system.env new file mode 100644 index 00000000..20c079aa --- /dev/null +++ b/defaults/template-system.env @@ -0,0 +1,14 @@ +DR_CLOUD= +DR_AWS_APP_REGION= +DR_CHANGE_START_POSITION=True +DR_UPLOAD_S3_PROFILE=default +DR_UPLOAD_S3_BUCKET= +DR_LOCAL_S3_PROFILE= +DR_GUI_ENABLE=False +DR_KINESIS_STREAM_NAME=None +DR_KINESIS_STREAM_ENABLE=False +DR_SAGEMAKER_IMAGE= +DR_ROBOMAKER_IMAGE= +DR_COACH_IMAGE=v2.2 +DR_WORKERS=1 + diff --git a/docker/docker-compose-azure.yml b/docker/docker-compose-azure.yml index dd05646e..92e0c668 100644 --- a/docker/docker-compose-azure.yml +++ b/docker/docker-compose-azure.yml @@ -1,23 +1,19 @@ version: '3.7' +networks: + default: + external: true + name: sagemaker-local + services: minio: image: minio/minio ports: - "9000:9000" - container_name: minio command: gateway azure - restart: unless-stopped environment: - MINIO_ACCESS_KEY=${DR_LOCAL_ACCESS_KEY_ID} - MINIO_SECRET_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} - AWS_ACCESS_KEY_ID=${DR_LOCAL_ACCESS_KEY_ID} - AWS_SECRET_ACCESS_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} - rl_coach: - environment: - - S3_ENDPOINT_URL=http://minio:9000 - depends_on: - - minio - robomaker: - environment: - - S3_ENDPOINT_URL=http://minio:9000 + diff --git a/docker/docker-compose-endpoint.yml b/docker/docker-compose-endpoint.yml new file mode 100644 index 00000000..a14b009f --- /dev/null +++ b/docker/docker-compose-endpoint.yml @@ -0,0 +1,9 @@ +version: '3.7' + +services: + rl_coach: + environment: + - S3_ENDPOINT_URL=http://minio:9000 + robomaker: + environment: + - S3_ENDPOINT_URL=http://minio:9000 diff --git a/docker/docker-compose-local.yml 
b/docker/docker-compose-local.yml index ea42a7d8..7254ea35 100644 --- a/docker/docker-compose-local.yml +++ b/docker/docker-compose-local.yml @@ -1,23 +1,23 @@ version: '3.7' +networks: + default: + external: true + name: sagemaker-local + services: minio: image: minio/minio ports: - "9000:9000" - container_name: minio command: server /data - restart: unless-stopped environment: - MINIO_ACCESS_KEY=${DR_LOCAL_ACCESS_KEY_ID} - MINIO_SECRET_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} + - MINIO_UID + - MINIO_GID + - MINIO_USERNAME + - MINIO_GROUPNAME volumes: - - /mnt/deepracer/minio:/data - rl_coach: - environment: - - S3_ENDPOINT_URL=http://minio:9000 - depends_on: - - minio - robomaker: - environment: - - S3_ENDPOINT_URL=http://minio:9000 + - ${DR_DIR}/data/minio:/data + diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index a84301a6..a76c248c 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -2,32 +2,32 @@ version: '3.7' networks: default: - external: - name: sagemaker-local + external: true + name: sagemaker-local services: rl_coach: - image: larsll/deepracer-rlcoach:v2 + image: larsll/deepracer-rlcoach:${DR_COACH_IMAGE} environment: - - GPU_AVAILABLE=${DR_GPU_AVAILABLE} + - SAGEMAKER_IMAGE=${DR_SAGEMAKER_IMAGE} - PRETRAINED=${DR_LOCAL_S3_PRETRAINED} - PRETRAINED_S3_PREFIX=${DR_LOCAL_S3_PRETRAINED_PREFIX} - PRETRAINED_S3_BUCKET=${DR_LOCAL_S3_BUCKET} - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} - MODEL_S3_BUCKET=${DR_LOCAL_S3_BUCKET} - container_name: coach + - HYPERPARAMETER_FILE_S3_KEY=${DR_LOCAL_S3_HYPERPARAMETERS_KEY} + - MODELMETADATA_FILE_S3_KEY=${DR_LOCAL_S3_MODEL_METADATA_KEY} volumes: - '//var/run/docker.sock:/var/run/docker.sock' - - '/robo/container:/robo/container' + - '/tmp/sagemaker:/tmp/sagemaker' robomaker: - image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE_TYPE} + image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE} command: ["${ROBOMAKER_COMMAND}"] volumes: - - 
/mnt/deepracer/recording:/mnt/recording + - "${DR_DIR}/data/recording:/mnt/recording" ports: - - "8080:8080" - container_name: robomaker - restart: unless-stopped + - "${DR_ROBOMAKER_PORT}:8080" + - "${DR_ROBOMAKER_GUI_PORT}:5900" environment: - XAUTHORITY=/root/.Xauthority - DISPLAY_N=:0 @@ -35,10 +35,11 @@ services: - NUMBER_OF_TRIALS=${DR_NUMBER_OF_EPISODES} - SAGEMAKER_SHARED_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} - SAGEMAKER_SHARED_S3_BUCKET=${DR_LOCAL_S3_BUCKET} + - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} + - MODEL_S3_BUCKET=${DR_LOCAL_S3_BUCKET} - APP_REGION=${DR_AWS_APP_REGION} - - S3_YAML_NAME=${DR_LOCAL_S3_PARAMS_FILE} + - S3_YAML_NAME=${DR_CURRENT_PARAMS_FILE} - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} - ENABLE_GUI=${DR_GUI_ENABLE} - depends_on: - - rl_coach + - ROLLOUT_IDX=0 diff --git a/scripts/evaluation/prepare-config.py b/scripts/evaluation/prepare-config.py new file mode 100755 index 00000000..9666e22f --- /dev/null +++ b/scripts/evaluation/prepare-config.py @@ -0,0 +1,54 @@ +#!/usr/bin/python3 + +import boto3 +import sys +import os +import time +import json +import io +import yaml + +config = {} +config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1') +config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') +config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar') +config['JOB_TYPE'] = 'EVALUATION' +config['KINESIS_VIDEO_STREAM_NAME'] = os.environ.get('DR_KINESIS_STREAM_NAME', 'my-kinesis-stream') +config['METRIC_NAME'] = 'TrainingRewardScore' +config['METRIC_NAMESPACE'] = 'AWSDeepRacer' +config['METRICS_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') +config['METRICS_S3_OBJECT_KEY'] = os.environ.get('DR_LOCAL_S3_METRICS_KEY', 'DeepRacer-Metrics/EvalMetrics-{}.json'.format(str(round(time.time())))) +config['MODEL_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') +config['MODEL_S3_BUCKET'] = 
os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') +config['NUMBER_OF_TRIALS'] = os.environ.get('DR_EVAL_NUMBER_OF_TRIALS', '5') +config['NUMBER_OF_RESETS'] = os.environ.get('DR_EVAL_NUMBER_OF_RESETS', '0') +config['IS_CONTINUOUS'] = os.environ.get('DR_EVAL_IS_CONTINUOUS', '0') +config['OFF_TRACK_PENALTY'] = os.environ.get('DR_EVAL_OFF_TRACK_PENALTY', '5.0') +config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') +config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy') +config['SIMTRACE_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') +config['SIMTRACE_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') +config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') + +s3_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None) +s3_region = config['AWS_REGION'] +s3_bucket = config['MODEL_S3_BUCKET'] +s3_prefix = config['MODEL_S3_PREFIX'] +s3_mode = os.environ.get('DR_LOCAL_S3_AUTH_MODE','profile') +if s3_mode == 'profile': + s3_profile = os.environ.get('DR_LOCAL_S3_PROFILE', 'default') +else: # mode is 'role' + s3_profile = None +s3_yaml_name = os.environ.get('DR_LOCAL_S3_EVAL_PARAMS_FILE', 'eval-params.yaml') +yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) + +session = boto3.session.Session(profile_name=s3_profile) +s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_endpoint_url) + +yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) +local_yaml_path = os.path.abspath(os.path.join('/tmp', 'training-params-' + str(round(time.time())) + '.yaml')) + +with open(local_yaml_path, 'w') as yaml_file: + yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) + +s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) diff --git a/scripts/evaluation/start.sh b/scripts/evaluation/start.sh index 39adedc0..f81204e9 100755 --- a/scripts/evaluation/start.sh +++ 
b/scripts/evaluation/start.sh @@ -1,9 +1,27 @@ +#!/usr/bin/env bash + +source $DR_DIR/bin/scripts_wrapper.sh + +usage(){ + echo "Usage: $0 [-w]" + echo " -w Wipes the target AWS DeepRacer model structure before upload." + exit 1 +} + +trap ctrl_c INT + # set evaluation specific environment variables export ROBOMAKER_COMMAND="./run.sh build evaluation.launch" -export METRICS_S3_OBJECT_KEY=metrics/eval_metrics.json +export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_EVAL_PARAMS_FILE} +S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX" + +echo "Creating Robomaker configuration in $S3_PATH/$DR_CURRENT_PARAMS_FILE" +python3 prepare-config.py -docker-compose $DR_COMPOSE_FILE up -d +COMPOSE_FILES=$DR_COMPOSE_FILE +STACK_NAME="deepracer-$DR_RUN_ID" +docker stack deploy $COMPOSE_FILES $STACK_NAME echo 'waiting for containers to start up...' diff --git a/scripts/evaluation/stop.sh b/scripts/evaluation/stop.sh index ba454672..c9a62f7d 100755 --- a/scripts/evaluation/stop.sh +++ b/scripts/evaluation/stop.sh @@ -1,10 +1,22 @@ #!/usr/bin/env bash -docker-compose $DR_COMPOSE_FILE down +STACK_NAME="deepracer-$DR_RUN_ID" +RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX} -SAGEMAKER=$(docker ps | awk ' /sagemaker/ { print $1 }') -if [[ -n $SAGEMAKER ]]; +SAGEMAKER_CONTAINERS=$(docker ps | awk ' /sagemaker/ { print $1 } '| xargs ) + +if [[ -n $SAGEMAKER_CONTAINERS ]]; then - docker stop $(docker ps | awk ' /sagemaker/ { print $1 }') - docker rm $(docker ps -a | awk ' /sagemaker/ { print $1 }') -fi \ No newline at end of file + for CONTAINER in $SAGEMAKER_CONTAINERS; do + CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER) + CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $1') + COMPOSE_SERVICE_NAME=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $2') + COMPOSE_FILE=$(sudo find /tmp/sagemaker -name docker-compose.yaml -exec grep -l "$RUN_NAME" {} + | grep $CONTAINER_PREFIX) + if [[ -n $COMPOSE_FILE ]]; then + sudo 
docker-compose -f $COMPOSE_FILE stop $COMPOSE_SERVICE_NAME + docker container rm $CONTAINER + fi + done +fi + +docker stack rm $STACK_NAME \ No newline at end of file diff --git a/scripts/log-analysis/start.sh b/scripts/log-analysis/start.sh index 8996dd93..478128fb 100755 --- a/scripts/log-analysis/start.sh +++ b/scripts/log-analysis/start.sh @@ -1,9 +1,9 @@ #!/usr/bin/env bash docker run --rm -it -p "8888:8888" \ --v `pwd`/../../logs:/workspace/logs \ +-v `pwd`/../../data/logs:/workspace/logs \ -v `pwd`/../../docker/volumes/.aws:/root/.aws \ --v `pwd`/../../analysis:/workspace/analysis \ +-v `pwd`/../../data/analysis:/workspace/analysis \ -e HOST_PERMS="$(id -u):$(id -g)" \ --name loganalysis \ larsll/deepracer-loganalysis:v2-cpu diff --git a/scripts/training/increment.sh b/scripts/training/increment.sh index db399de4..dcbd6d1f 100755 --- a/scripts/training/increment.sh +++ b/scripts/training/increment.sh @@ -39,7 +39,7 @@ usage esac done -CONFIG_FILE=$(echo $DR_DIR/current-run.env) +CONFIG_FILE=$DR_CONFIG echo "Configuration file $CONFIG_FILE will be updated." 
## Read in data diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index 92f87bcb..1702c8e0 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -19,12 +19,13 @@ config['METRIC_NAME'] = 'TrainingRewardScore' config['METRIC_NAMESPACE'] = 'AWSDeepRacer' config['METRICS_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') -config['METRICS_S3_OBJECT_KEY'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') + '/metrics/training_metrics.json' -config['MODEL_METADATA_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_CUSTOM_FILES_PREFIX', 'custom_files') + '/model_metadata.json' +config['METRICS_S3_OBJECT_KEY'] = os.environ.get('DR_LOCAL_S3_METRICS_KEY', 'DeepRacer-Metrics/TrainingMetrics-{}.json'.format(str(round(time.time())))) +config['MODEL_METADATA_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_MODEL_METADATA_KEY', 'custom_files/model_metadata.json') config['NUMBER_OF_EPISODES'] = os.environ.get('DR_NUMBER_OF_EPISODES', '0') config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') -config['REWARD_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_CUSTOM_FILES_PREFIX', 'custom_files') + '/reward_function.py' +config['REWARD_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_REWARD_KEY', 'custom_files/reward_function.py') config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy') +config['NUM_WORKERS'] = os.environ.get('DR_WORKERS', 1) config['SAGEMAKER_SHARED_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') config['SAGEMAKER_SHARED_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') config['SIMTRACE_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') @@ -37,10 +38,15 @@ s3_region = config['AWS_REGION'] s3_bucket = config['SAGEMAKER_SHARED_S3_BUCKET'] s3_prefix = config['SAGEMAKER_SHARED_S3_PREFIX'] -s3_yaml_name = os.environ.get('DR_LOCAL_S3_PARAMS_FILE', 'training_params.yaml') +s3_mode = 
os.environ.get('DR_LOCAL_S3_AUTH_MODE','profile') +if s3_mode == 'profile': + s3_profile = os.environ.get('DR_LOCAL_S3_PROFILE', 'default') +else: # mode is 'role' + s3_profile = None +s3_yaml_name = os.environ.get('DR_LOCAL_S3_TRAINING_PARAMS_FILE', 'training_params.yaml') yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) -session = boto3.session.Session() +session = boto3.session.Session(profile_name=s3_profile) s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_endpoint_url) yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 2ba8c3c8..e3e8763d 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +source $DR_DIR/bin/scripts_wrapper.sh + usage(){ echo "Usage: $0 [-w]" echo " -w Wipes the target AWS DeepRacer model structure before upload." @@ -40,21 +42,18 @@ then fi fi -echo "Creating Robomaker configuration in $S3_PATH/training_params.yaml" -export ROBOMAKER_COMMAND="./run.sh build distributed_training.launch" -if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; -then - docker-compose $DR_COMPOSE_FILE up -d minio -fi +echo "Creating Robomaker configuration in $S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE" python3 prepare-config.py - -#export COMPOSE_FILE=$DR_COMPOSE_FILE -docker-compose $DR_COMPOSE_FILE up -d +export ROBOMAKER_COMMAND="./run.sh build distributed_training.launch" +export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_TRAINING_PARAMS_FILE} +COMPOSE_FILES=$DR_COMPOSE_FILE +STACK_NAME="deepracer-$DR_RUN_ID" +docker stack deploy $COMPOSE_FILES $STACK_NAME echo 'Waiting for containers to start up...' #sleep for 20 seconds to allow the containers to start -sleep 5 +sleep 15 if xhost >& /dev/null; then @@ -82,5 +81,5 @@ then fi else echo "No display. Falling back to CLI mode." 
- docker logs -f $(docker ps | awk ' /sagemaker/ { print $1 }') + dr-logs-sagemaker fi diff --git a/scripts/training/stop.sh b/scripts/training/stop.sh index 047b2b00..c9a62f7d 100755 --- a/scripts/training/stop.sh +++ b/scripts/training/stop.sh @@ -1,10 +1,22 @@ #!/usr/bin/env bash -#export COMPOSE_FILE=$DR_COMPOSE_FILE -docker-compose $DR_COMPOSE_FILE down -SAGEMAKER=$(docker ps | awk ' /sagemaker/ { print $1 }') -if [[ -n $SAGEMAKER ]]; +STACK_NAME="deepracer-$DR_RUN_ID" +RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX} + +SAGEMAKER_CONTAINERS=$(docker ps | awk ' /sagemaker/ { print $1 } '| xargs ) + +if [[ -n $SAGEMAKER_CONTAINERS ]]; then - docker stop $(docker ps | awk ' /sagemaker/ { print $1 }') - docker rm $(docker ps -a | awk ' /sagemaker/ { print $1 }') -fi \ No newline at end of file + for CONTAINER in $SAGEMAKER_CONTAINERS; do + CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER) + CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $1') + COMPOSE_SERVICE_NAME=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $2') + COMPOSE_FILE=$(sudo find /tmp/sagemaker -name docker-compose.yaml -exec grep -l "$RUN_NAME" {} + | grep $CONTAINER_PREFIX) + if [[ -n $COMPOSE_FILE ]]; then + sudo docker-compose -f $COMPOSE_FILE stop $COMPOSE_SERVICE_NAME + docker container rm $CONTAINER + fi + done +fi + +docker stack rm $STACK_NAME \ No newline at end of file diff --git a/scripts/training/temp.yml b/scripts/training/temp.yml deleted file mode 100644 index 522088b6..00000000 --- a/scripts/training/temp.yml +++ /dev/null @@ -1,24 +0,0 @@ ---- -'ALTERNATE_DRIVING_DIRECTION': 'false' -'AWS_REGION': 'us-east-1' -'CAR_COLOR': 'Red' -'CAR_NAME': 'MyCar' -'CHANGE_START_POSITION': 'true' -'JOB_TYPE': 'TRAINING' -'KINESIS_VIDEO_STREAM_NAME': 'my-kinesis-stream' -'METRICS_S3_BUCKET': 'bucket' -'METRICS_S3_OBJECT_KEY': 'rl-deepracer-sagemaker/metrics/training_metrics.json' -'METRIC_NAME': 'TrainingRewardScore' -'METRIC_NAMESPACE': 
'AWSDeepRacer' -'MODEL_METADATA_FILE_S3_KEY': 'custom_files/model_metadata.json' -'NUMBER_OF_EPISODES': '0' -'RACE_TYPE': 'TIME_TRIAL' -'REWARD_FILE_S3_KEY': 'custom_files/rewards.py' -'ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID': 'Dummy' -'SAGEMAKER_SHARED_S3_BUCKET': 'bucket' -'SAGEMAKER_SHARED_S3_PREFIX': 'rl-deepracer-sagemaker' -'SIMTRACE_S3_BUCKET': 'bucket' -'SIMTRACE_S3_PREFIX': 'rl-deepracer-sagemaker' -'TARGET_REWARD_SCORE': 'None' -'TRAINING_JOB_ARN': 'arn:Dummy' -'WORLD_NAME': 'LGSWide' diff --git a/scripts/upload/list-set-models.sh b/scripts/upload/list-set-models.sh index c31e26a8..aad25909 100755 --- a/scripts/upload/list-set-models.sh +++ b/scripts/upload/list-set-models.sh @@ -30,7 +30,7 @@ esac done TARGET_S3_BUCKET=${DR_UPLOAD_S3_BUCKET} -WORK_DIR=/mnt/deepracer/tmp-list +WORK_DIR=${DR_DIR}/tmp/aws-models mkdir -p ${WORK_DIR} if [[ -n "${OPT_CACHE}" ]]; @@ -94,7 +94,7 @@ else fi done - CONFIG_FILE=$(echo $DR_DIR/current-run.env) + CONFIG_FILE=$DR_CONFIG echo "Configuration file $CONFIG_FILE will be updated." if [[ -n "${MODEL_NAME}" ]]; then diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index 00f6a57c..cc037b3c 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -69,7 +69,7 @@ SOURCE_S3_REWARD=${DR_LOCAL_S3_REWARD_KEY} SOURCE_S3_METRICS=${DR_LOCAL_S3_METRICS_KEY} -WORK_DIR=/mnt/deepracer/tmp/ +WORK_DIR=${DR_DIR}/tmp/upload/ mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}model # Download information on model. 
From d58eeaf3782cfb3b84f6b0f9f872cf26b4724b60 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 14 May 2020 06:48:06 +0000 Subject: [PATCH 056/428] Removing unnecessary line --- defaults/template-system.env | 1 - 1 file changed, 1 deletion(-) diff --git a/defaults/template-system.env b/defaults/template-system.env index 20c079aa..5914c9e2 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -1,6 +1,5 @@ DR_CLOUD= DR_AWS_APP_REGION= -DR_CHANGE_START_POSITION=True DR_UPLOAD_S3_PROFILE=default DR_UPLOAD_S3_BUCKET= DR_LOCAL_S3_PROFILE= From 12a7515d6044d29604b2715b48676886f276491b Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 14 May 2020 21:16:29 +0200 Subject: [PATCH 057/428] Simplified Setup for Evaluation + Expose Robomaker Logs (#28) * Independent evaluation * Updates * Updates * Metrics fix * Path update --- bin/activate.sh | 15 ++++++--- bin/init.sh | 3 +- defaults/template-run.env | 3 +- defaults/template-system.env | 2 +- docker/docker-compose-eval.yml | 31 +++++++++++++++++++ docker/docker-compose-mount.yml | 6 ++++ ...ompose.yml => docker-compose-training.yml} | 7 +---- scripts/evaluation/prepare-config.py | 19 ++++++++++-- scripts/evaluation/start.sh | 23 +++++++++++--- scripts/evaluation/stop.sh | 18 +---------- scripts/log-analysis/start.sh | 1 - scripts/training/prepare-config.py | 10 ++++-- scripts/training/start.sh | 19 +++++++++--- 13 files changed, 111 insertions(+), 46 deletions(-) create mode 100644 docker/docker-compose-eval.yml create mode 100644 docker/docker-compose-mount.yml rename docker/{docker-compose.yml => docker-compose-training.yml} (83%) diff --git a/bin/activate.sh b/bin/activate.sh index 25439088..3bf7a232 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -56,17 +56,20 @@ if [[ "${DR_CLOUD,,}" == "azure" ]]; then export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE 
--endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" - DR_COMPOSE_FILE="-c $DIR/docker/docker-compose.yml -c $DIR/docker/docker-compose-endpoint.yml" + DR_TRAIN_COMPOSE_FILE="-c $DIR/docker/docker-compose-training.yml -c $DIR/docker/docker-compose-endpoint.yml" + DR_EVAL_COMPOSE_FILE="-c $DIR/docker/docker-compose-eval.yml -c $DIR/docker/docker-compose-endpoint.yml" DR_MINIO_COMPOSE_FILE="-c $DIR/docker/docker-compose-azure.yml" elif [[ "${DR_CLOUD,,}" == "local" ]]; then export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" - DR_COMPOSE_FILE="-c $DIR/docker/docker-compose.yml -c $DIR/docker/docker-compose-endpoint.yml" + DR_TRAIN_COMPOSE_FILE="-c $DIR/docker/docker-compose-training.yml -c $DIR/docker/docker-compose-endpoint.yml" + DR_EVAL_COMPOSE_FILE="-c $DIR/docker/docker-compose-eval.yml -c $DIR/docker/docker-compose-endpoint.yml" DR_MINIO_COMPOSE_FILE="-c $DIR/docker/docker-compose-local.yml" else DR_LOCAL_PROFILE_ENDPOINT_URL="" - DR_COMPOSE_FILE="-c $DIR/docker/docker-compose.yml" + DR_TRAIN_COMPOSE_FILE="-c $DIR/docker/docker-compose-training.yml" + DR_EVAL_COMPOSE_FILE="-c $DIR/docker/docker-compose-eval.yml" fi ## Check if we have an AWS IAM assumed role, or if we need to set specific credentials. 
@@ -74,14 +77,16 @@ if [ $(aws sts get-caller-identity | jq '.Arn' | awk /assumed-role/ | wc -l) -eq then export DR_LOCAL_ACCESS_KEY_ID=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_access_key_id | xargs) export DR_LOCAL_SECRET_ACCESS_KEY=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) - DR_COMPOSE_FILE="$DR_COMPOSE_FILE -c $DIR/docker/docker-compose-keys.yml" + DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE -c $DIR/docker/docker-compose-keys.yml" + DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE -c $DIR/docker/docker-compose-keys.yml" export DR_UPLOAD_PROFILE="--profile $DR_UPLOAD_S3_PROFILE" export DR_LOCAL_S3_AUTH_MODE="profile" else export DR_LOCAL_S3_AUTH_MODE="role" fi -export DR_COMPOSE_FILE +export DR_TRAIN_COMPOSE_FILE +export DR_EVAL_COMPOSE_FILE export DR_LOCAL_PROFILE_ENDPOINT_URL if [[ -n "${DR_MINIO_COMPOSE_FILE}" ]]; then diff --git a/bin/init.sh b/bin/init.sh index e114a177..fe203396 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -59,8 +59,7 @@ INSTALL_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." 
>/dev/null 2>&1 && pwd cd $INSTALL_DIR # create directory structure for docker volumes -mkdir -p $INSTALL_DIR/data $INSTALL_DIR/data/recording -mkdir -p $INSTALL_DIR/data/minio $INSTALL_DIR/data/minio/bucket +mkdir -p $INSTALL_DIR/data $INSTALL_DIR/data/minio $INSTALL_DIR/data/minio/bucket mkdir -p $INSTALL_DIR/data/logs $INSTALL_DIR/data/analysis sudo mkdir -p /tmp/sagemaker diff --git a/defaults/template-run.env b/defaults/template-run.env index b26364b7..060c6748 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -11,6 +11,7 @@ DR_EVAL_NUMBER_OF_TRIALS=5 DR_EVAL_NUMBER_OF_RESETS=0 DR_EVAL_IS_CONTINUOUS=False DR_EVAL_OFF_TRACK_PENALTY=5.0 +DR_EVAL_SAVE_MP4=False DR_LOCAL_S3_PRETRAINED=False DR_LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained DR_LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker @@ -21,4 +22,4 @@ DR_LOCAL_S3_EVAL_PARAMS_FILE=eval-params.yaml DR_LOCAL_S3_MODEL_METADATA_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/model_metadata.json DR_LOCAL_S3_HYPERPARAMETERS_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/hyperparameters.json DR_LOCAL_S3_REWARD_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/reward_function.py -DR_LOCAL_S3_METRICS_KEY=$DR_LOCAL_S3_MODEL_PREFIX/metrics/training_metrics.json +DR_LOCAL_S3_METRICS_PREFIX=$DR_LOCAL_S3_MODEL_PREFIX/metrics diff --git a/defaults/template-system.env b/defaults/template-system.env index 5914c9e2..a6d2e89f 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -10,4 +10,4 @@ DR_SAGEMAKER_IMAGE= DR_ROBOMAKER_IMAGE= DR_COACH_IMAGE=v2.2 DR_WORKERS=1 - +DR_ROBOMAKER_MOUNT_LOGS=False diff --git a/docker/docker-compose-eval.yml b/docker/docker-compose-eval.yml new file mode 100644 index 00000000..685b1e03 --- /dev/null +++ b/docker/docker-compose-eval.yml @@ -0,0 +1,31 @@ +version: '3.7' + +networks: + default: + external: true + name: sagemaker-local + +services: + rl_coach: + image: larsll/deepracer-rlcoach:${DR_COACH_IMAGE} + deploy: + replicas: 0 + robomaker: + image: 
awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE} + command: ["${ROBOMAKER_COMMAND}"] + ports: + - "${DR_ROBOMAKER_PORT}:8080" + - "${DR_ROBOMAKER_GUI_PORT}:5900" + environment: + - XAUTHORITY=/root/.Xauthority + - DISPLAY_N=:0 + - WORLD_NAME=${DR_WORLD_NAME} + - NUMBER_OF_TRIALS=${DR_NUMBER_OF_EPISODES} + - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} + - MODEL_S3_BUCKET=${DR_LOCAL_S3_BUCKET} + - APP_REGION=${DR_AWS_APP_REGION} + - S3_YAML_NAME=${DR_CURRENT_PARAMS_FILE} + - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} + - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} + - ENABLE_GUI=${DR_GUI_ENABLE} + - ROLLOUT_IDX=0 diff --git a/docker/docker-compose-mount.yml b/docker/docker-compose-mount.yml new file mode 100644 index 00000000..20ccc9f0 --- /dev/null +++ b/docker/docker-compose-mount.yml @@ -0,0 +1,6 @@ +version: '3.7' + +services: + robomaker: + volumes: + - "${DR_MOUNT_DIR}:/root/.ros/log" diff --git a/docker/docker-compose.yml b/docker/docker-compose-training.yml similarity index 83% rename from docker/docker-compose.yml rename to docker/docker-compose-training.yml index a76c248c..adbf70ab 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose-training.yml @@ -23,8 +23,6 @@ services: robomaker: image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE} command: ["${ROBOMAKER_COMMAND}"] - volumes: - - "${DR_DIR}/data/recording:/mnt/recording" ports: - "${DR_ROBOMAKER_PORT}:8080" - "${DR_ROBOMAKER_GUI_PORT}:5900" @@ -32,11 +30,8 @@ services: - XAUTHORITY=/root/.Xauthority - DISPLAY_N=:0 - WORLD_NAME=${DR_WORLD_NAME} - - NUMBER_OF_TRIALS=${DR_NUMBER_OF_EPISODES} - SAGEMAKER_SHARED_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} - - SAGEMAKER_SHARED_S3_BUCKET=${DR_LOCAL_S3_BUCKET} - - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} - - MODEL_S3_BUCKET=${DR_LOCAL_S3_BUCKET} + - SAGEMAKER_SHARED_S3_BUCKET=${DR_LOCAL_S3_BUCKET} - APP_REGION=${DR_AWS_APP_REGION} - S3_YAML_NAME=${DR_CURRENT_PARAMS_FILE} - 
KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} diff --git a/scripts/evaluation/prepare-config.py b/scripts/evaluation/prepare-config.py index 9666e22f..d9b04bcc 100755 --- a/scripts/evaluation/prepare-config.py +++ b/scripts/evaluation/prepare-config.py @@ -8,6 +8,9 @@ import io import yaml +def str2bool(v): + return v.lower() in ("yes", "true", "t", "1") + config = {} config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1') config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') @@ -17,7 +20,13 @@ config['METRIC_NAME'] = 'TrainingRewardScore' config['METRIC_NAMESPACE'] = 'AWSDeepRacer' config['METRICS_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') -config['METRICS_S3_OBJECT_KEY'] = os.environ.get('DR_LOCAL_S3_METRICS_KEY', 'DeepRacer-Metrics/EvalMetrics-{}.json'.format(str(round(time.time())))) + +metrics_prefix = os.environ.get('DR_LOCAL_S3_METRICS_PREFIX', None) +if metrics_prefix is not None: + config['METRICS_S3_OBJECT_KEY'] = '{}/EvaluationMetrics-{}.json'.format(metrics_prefix, str(round(time.time()))) +else: + config['METRICS_S3_OBJECT_KEY'] = 'DeepRacer-Metrics/EvaluationMetrics-{}.json'.format(str(round(time.time()))) + config['MODEL_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') config['MODEL_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') config['NUMBER_OF_TRIALS'] = os.environ.get('DR_EVAL_NUMBER_OF_TRIALS', '5') @@ -30,6 +39,12 @@ config['SIMTRACE_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') +save_mp4 = str2bool(os.environ.get("DR_EVAL_SAVE_MP4", "False")) + +if save_mp4: + config['MP4_S3_BUCKET'] = config['MODEL_S3_BUCKET'] + config['MP4_S3_OBJECT_PREFIX'] = '{}/{}'.format(config['MODEL_S3_PREFIX'],'mp4') + s3_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None) s3_region = config['AWS_REGION'] s3_bucket = config['MODEL_S3_BUCKET'] @@ -46,7 +61,7 @@ 
s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_endpoint_url) yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) -local_yaml_path = os.path.abspath(os.path.join('/tmp', 'training-params-' + str(round(time.time())) + '.yaml')) +local_yaml_path = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'tmp', 'eval-params-' + str(round(time.time())) + '.yaml')) with open(local_yaml_path, 'w') as yaml_file: yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) diff --git a/scripts/evaluation/start.sh b/scripts/evaluation/start.sh index f81204e9..0a9bc71c 100755 --- a/scripts/evaluation/start.sh +++ b/scripts/evaluation/start.sh @@ -10,17 +10,30 @@ usage(){ trap ctrl_c INT +function ctrl_c() { + echo "Requested to stop." + exit 1 +} + # set evaluation specific environment variables -export ROBOMAKER_COMMAND="./run.sh build evaluation.launch" -export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_EVAL_PARAMS_FILE} S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX" +STACK_NAME="deepracer-eval-$DR_RUN_ID" + +export ROBOMAKER_COMMAND="./run.sh run evaluation.launch" +export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_EVAL_PARAMS_FILE} + +if [ ${DR_ROBOMAKER_MOUNT_LOGS,,} = "true" ]; +then + COMPOSE_FILES="$DR_EVAL_COMPOSE_FILE -c $DR_DIR/docker/docker-compose-mount.yml" + export DR_MOUNT_DIR="$DR_DIR/data/logs/robomaker/$DR_LOCAL_S3_MODEL_PREFIX" + mkdir -p $DR_MOUNT_DIR +else + COMPOSE_FILES="$DR_EVAL_COMPOSE_FILE" +fi echo "Creating Robomaker configuration in $S3_PATH/$DR_CURRENT_PARAMS_FILE" python3 prepare-config.py -COMPOSE_FILES=$DR_COMPOSE_FILE -STACK_NAME="deepracer-$DR_RUN_ID" - docker stack deploy $COMPOSE_FILES $STACK_NAME echo 'waiting for containers to start up...' 
diff --git a/scripts/evaluation/stop.sh b/scripts/evaluation/stop.sh index c9a62f7d..416eacfd 100755 --- a/scripts/evaluation/stop.sh +++ b/scripts/evaluation/stop.sh @@ -1,22 +1,6 @@ #!/usr/bin/env bash -STACK_NAME="deepracer-$DR_RUN_ID" +STACK_NAME="deepracer-eval-$DR_RUN_ID" RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX} -SAGEMAKER_CONTAINERS=$(docker ps | awk ' /sagemaker/ { print $1 } '| xargs ) - -if [[ -n $SAGEMAKER_CONTAINERS ]]; -then - for CONTAINER in $SAGEMAKER_CONTAINERS; do - CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER) - CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $1') - COMPOSE_SERVICE_NAME=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $2') - COMPOSE_FILE=$(sudo find /tmp/sagemaker -name docker-compose.yaml -exec grep -l "$RUN_NAME" {} + | grep $CONTAINER_PREFIX) - if [[ -n $COMPOSE_FILE ]]; then - sudo docker-compose -f $COMPOSE_FILE stop $COMPOSE_SERVICE_NAME - docker container rm $CONTAINER - fi - done -fi - docker stack rm $STACK_NAME \ No newline at end of file diff --git a/scripts/log-analysis/start.sh b/scripts/log-analysis/start.sh index 478128fb..b7b2f1b7 100755 --- a/scripts/log-analysis/start.sh +++ b/scripts/log-analysis/start.sh @@ -4,6 +4,5 @@ docker run --rm -it -p "8888:8888" \ -v `pwd`/../../data/logs:/workspace/logs \ -v `pwd`/../../docker/volumes/.aws:/root/.aws \ -v `pwd`/../../data/analysis:/workspace/analysis \ --e HOST_PERMS="$(id -u):$(id -g)" \ --name loganalysis \ larsll/deepracer-loganalysis:v2-cpu diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index 1702c8e0..f17d4fa5 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -19,7 +19,13 @@ config['METRIC_NAME'] = 'TrainingRewardScore' config['METRIC_NAMESPACE'] = 'AWSDeepRacer' config['METRICS_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') -config['METRICS_S3_OBJECT_KEY'] = os.environ.get('DR_LOCAL_S3_METRICS_KEY', 
'DeepRacer-Metrics/TrainingMetrics-{}.json'.format(str(round(time.time())))) + +metrics_prefix = os.environ.get('DR_LOCAL_S3_METRICS_PREFIX', None) +if metrics_prefix is not None: + config['METRICS_S3_OBJECT_KEY'] = '{}/TrainingMetrics.json'.format(metrics_prefix) +else: + config['METRICS_S3_OBJECT_KEY'] = 'DeepRacer-Metrics/TrainingMetrics-{}.json'.format(str(round(time.time()))) + config['MODEL_METADATA_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_MODEL_METADATA_KEY', 'custom_files/model_metadata.json') config['NUMBER_OF_EPISODES'] = os.environ.get('DR_NUMBER_OF_EPISODES', '0') config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') @@ -50,7 +56,7 @@ s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_endpoint_url) yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) -local_yaml_path = os.path.abspath(os.path.join('/tmp', 'training-params-' + str(round(time.time())) + '.yaml')) +local_yaml_path = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'tmp', 'training-params-' + str(round(time.time())) + '.yaml')) with open(local_yaml_path, 'w') as yaml_file: yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) diff --git a/scripts/training/start.sh b/scripts/training/start.sh index e3e8763d..c62b013b 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -42,13 +42,24 @@ then fi fi +# set evaluation specific environment variables +STACK_NAME="deepracer-$DR_RUN_ID" + +export ROBOMAKER_COMMAND="./run.sh run distributed_training.launch" +export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_TRAINING_PARAMS_FILE} + +if [ ${DR_ROBOMAKER_MOUNT_LOGS,,} = "true" ]; +then + COMPOSE_FILES="$DR_TRAIN_COMPOSE_FILE -c $DR_DIR/docker/docker-compose-mount.yml" + export DR_MOUNT_DIR="$DR_DIR/data/logs/robomaker/$DR_LOCAL_S3_MODEL_PREFIX" + mkdir -p $DR_MOUNT_DIR +else + COMPOSE_FILES="$DR_TRAIN_COMPOSE_FILE" +fi + echo "Creating Robomaker configuration in 
$S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE" python3 prepare-config.py -export ROBOMAKER_COMMAND="./run.sh build distributed_training.launch" -export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_TRAINING_PARAMS_FILE} -COMPOSE_FILES=$DR_COMPOSE_FILE -STACK_NAME="deepracer-$DR_RUN_ID" docker stack deploy $COMPOSE_FILES $STACK_NAME echo 'Waiting for containers to start up...' From 87685b1f1e32025afdae97a9b951d13f316ba678 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 15 May 2020 10:39:28 +0000 Subject: [PATCH 058/428] Path fix --- bin/init.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/init.sh b/bin/init.sh index fe203396..f95fcdcc 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -60,7 +60,7 @@ cd $INSTALL_DIR # create directory structure for docker volumes mkdir -p $INSTALL_DIR/data $INSTALL_DIR/data/minio $INSTALL_DIR/data/minio/bucket -mkdir -p $INSTALL_DIR/data/logs $INSTALL_DIR/data/analysis +mkdir -p $INSTALL_DIR/data/logs $INSTALL_DIR/data/analysis $INSTALL_DIR/tmp sudo mkdir -p /tmp/sagemaker # create symlink to current user's home .aws directory From 2b0a45b2ac8b30ce97b6519b2f292bca6e006378 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sat, 16 May 2020 17:43:16 +0000 Subject: [PATCH 059/428] Use return not exit --- bin/activate.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 3bf7a232..160ecbf1 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -11,7 +11,7 @@ function dr-update-env { done else echo "File system.env does not exist." - exit 1 + return 1 fi if [[ -f "$DR_CONFIG" ]] @@ -24,7 +24,7 @@ function dr-update-env { done else echo "File run.env does not exist." - exit 1 + return 1 fi if [[ -z "${DR_RUN_ID}" ]]; then @@ -49,7 +49,7 @@ then dr-update-env else echo "No configuration file." 
- exit 1 + return 1 fi if [[ "${DR_CLOUD,,}" == "azure" ]]; From 5bd83326c28cf51cf427f49a76c8be7ea684d1ac Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sat, 16 May 2020 18:13:22 +0000 Subject: [PATCH 060/428] Updated documentation for local mode. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0643f71f..fff9a88d 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ If you want to use awscli (`aws`) to manually move files then use `aws $DR_LOCAL Local mode runs a minio server that hosts the data in the `docker/volumes` directory. It is otherwise command-compatible with the Azure setup; as the data is accessible via Minio and not via native S3. After having run init.sh do the following: -* Configure the Minio credentials with `aws configure --profile minio`. The default configuration will use the `minio` profile to configure MINIO. +* Configure the Minio credentials with `aws configure --profile minio`. The default configuration will use the `minio` profile to configure MINIO. You can choose any username or password, but username needs to be at least length 3, and password at least length 8. * Configure your normal AWS credentials with `aws configure` if this is not already in place on your system. This is required to use the model upload functionality. 
### Environment Variables From 57f8ce8c1c77e9fc3304f62854e2be277b0d1c18 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Mon, 18 May 2020 07:54:13 +0000 Subject: [PATCH 061/428] Fixing Metrics key --- scripts/upload/upload-model.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index cc037b3c..fc006ae1 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -66,7 +66,7 @@ else fi SOURCE_S3_CONFIG=${DR_LOCAL_S3_CUSTOM_FILES_PREFIX} SOURCE_S3_REWARD=${DR_LOCAL_S3_REWARD_KEY} -SOURCE_S3_METRICS=${DR_LOCAL_S3_METRICS_KEY} +SOURCE_S3_METRICS="${DR_LOCAL_S3_METRICS_PREFIX}/TrainingMetrics.json" WORK_DIR=${DR_DIR}/tmp/upload/ From 464a56b59a9c345554d79465032841a9dc66df99 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Tue, 19 May 2020 22:18:47 +0200 Subject: [PATCH 062/428] Fixes to install related issues (#36) * Fix for bug #29 * Fix for bug #31 * Fix for bug #30 --- bin/init.sh | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/bin/init.sh b/bin/init.sh index f95fcdcc..5826201e 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -7,8 +7,11 @@ function ctrl_c() { exit 1 } +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +INSTALL_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." >/dev/null 2>&1 && pwd )" + OPT_ARCH="gpu" -OPT_CLOUD="local" +OPT_CLOUD="" while getopts ":m:c:a:" opt; do case $opt in @@ -24,6 +27,12 @@ exit 1 esac done +if [[ -z "$OPT_CLOUD" ]]; then + source $SCRIPT_DIR/detect.sh + OPT_CLOUD=$CLOUD_NAME + echo "Detected cloud type to be $CLOUD_NAME" +fi + # Find CPU Level CPU_LEVEL="cpu" if [[ "$(dmesg | grep AVX | wc -l)" > 0 ]]; then @@ -54,8 +63,6 @@ then fi fi - -INSTALL_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." 
>/dev/null 2>&1 && pwd )" cd $INSTALL_DIR # create directory structure for docker volumes @@ -80,10 +87,13 @@ cp $INSTALL_DIR/defaults/template-run.env $INSTALL_DIR/run.env if [[ "${OPT_CLOUD}" == "aws" ]]; then AWS_DR_BUCKET=$(aws s3api list-buckets | jq '.Buckets[] | select(.Name | startswith("aws-deepracer")) | .Name' -r) + AWS_DR_BUCKET_COUNT=$(echo $AWS_DR_BUCKET | wc -w) AWS_EC2_AVAIL_ZONE=`curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone` AWS_REGION="`echo \"$AWS_EC2_AVAIL_ZONE\" | sed 's/[a-z]$//'`" - if [[ ! -z "${AWS_DR_BUCKET}" ]]; then + if [ "$AWS_DR_BUCKET_COUNT" -eq 1 ]; then sed -i "s//$AWS_DR_BUCKET/g" $INSTALL_DIR/system.env + elif [ "$AWS_DR_BUCKET_COUNT" -gt 1 ]; then + sed -i "s//found-$AWS_DR_BUCKET_COUNT-buckets/g" $INSTALL_DIR/system.env else sed -i "s//not-defined/g" $INSTALL_DIR/system.env fi @@ -135,7 +145,10 @@ then fi # ensure our variables are set on startup -echo "source $INSTALL_DIR/bin/activate.sh" >> $HOME/.profile +NUM_IN_PROFILE=$(cat $HOME/.profile | grep "$INSTALL_DIR/bin/activate.sh" | wc -l) +if [ "$NUM_IN_PROFILE" -eq 0 ]; then + echo "source $INSTALL_DIR/bin/activate.sh" >> $HOME/.profile +fi # mark as done date | tee $INSTALL_DIR/DONE From 38ed1680dd6f6fc12fb11f3a25d8117a38a4cc9e Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 22 May 2020 06:51:52 +0000 Subject: [PATCH 063/428] Disable AVX-512 in Robomaker --- bin/init.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/init.sh b/bin/init.sh index 5826201e..28cfaaa2 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -43,9 +43,10 @@ if [[ "$(dmesg | grep AVX2 | wc -l)" > 0 ]]; then CPU_LEVEL="cpu-avx2" fi -if [[ "$(dmesg | grep AVX-512 | wc -l)" > 0 ]]; then - CPU_LEVEL="cpu-avx512" -fi +# Disabled due to performance issues with AVX-512 image +# if [[ "$(dmesg | grep AVX-512 | wc -l)" > 0 ]]; then +# CPU_LEVEL="cpu-avx512" +# fi # Check if Intel (to ensure MKN) if [[ "$(dmesg | grep GenuineIntel | wc -l)" > 
0 ]]; then From ee09138ca6828028c30e6a01f47582d2e7c7e01c Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Fri, 22 May 2020 21:20:05 +0200 Subject: [PATCH 064/428] Update H2B and OA Parameters (#37) * Add Alternate Driving Direction * Adding H2B and OA to config files * Adding domain randomization --- defaults/hyperparameters.json | 2 +- defaults/template-run.env | 22 +++++++++- scripts/evaluation/prepare-config.py | 65 +++++++++++++++++++++------- scripts/training/prepare-config.py | 37 +++++++++++++--- 4 files changed, 102 insertions(+), 24 deletions(-) diff --git a/defaults/hyperparameters.json b/defaults/hyperparameters.json index 25ad617f..3ec50a39 100644 --- a/defaults/hyperparameters.json +++ b/defaults/hyperparameters.json @@ -2,7 +2,7 @@ "batch_size": 64, "beta_entropy": 0.01, "discount_factor": 0.995, - "e_greedy_value": 1.0, + "e_greedy_value": 0.05, "epsilon_steps": 10000, "exploration_type": "categorical", "loss_type": "huber", diff --git a/defaults/template-run.env b/defaults/template-run.env index 060c6748..4e9e2cba 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -1,16 +1,20 @@ DR_RUN_ID=0 -DR_WORLD_NAME=LGSWide +DR_WORLD_NAME=reInvent2019_track DR_CHANGE_START_POSITION=True +DR_ALTERNATE_DRIVING_DIRECTION=False DR_RACE_TYPE=TIME_TRIAL DR_TARGET_REWARD_SCORE=None DR_CAR_COLOR=Red DR_CAR_NAME=FastCar +DR_DISPLAY_NAME=$DR_CAR_NAME +DR_RACER_NAME=racer1 +DR_ENABLE_DOMAIN_RANDOMIZATION=False DR_UPLOAD_S3_PREFIX=DeepRacer-SageMaker-RoboMaker-comm-prefix DR_UPLOAD_MODEL_NAME=mymodelname DR_EVAL_NUMBER_OF_TRIALS=5 -DR_EVAL_NUMBER_OF_RESETS=0 DR_EVAL_IS_CONTINUOUS=False DR_EVAL_OFF_TRACK_PENALTY=5.0 +DR_EVAL_COLLISION_PENALTY=5.0 DR_EVAL_SAVE_MP4=False DR_LOCAL_S3_PRETRAINED=False DR_LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained @@ -23,3 +27,17 @@ DR_LOCAL_S3_MODEL_METADATA_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/model_metadata.j 
DR_LOCAL_S3_HYPERPARAMETERS_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/hyperparameters.json DR_LOCAL_S3_REWARD_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/reward_function.py DR_LOCAL_S3_METRICS_PREFIX=$DR_LOCAL_S3_MODEL_PREFIX/metrics +DR_OA_NUMBER_OF_OBSTACLES=6 +DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES=2.0 +DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS=False +DR_OA_PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS=False +DR_OA_NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS=2 +DR_OA_IS_OBSTACLE_BOT_CAR=False +DR_H2B_IS_LANE_CHANGE=False +DR_H2B_LOWER_LANE_CHANGE_TIME=3.0 +DR_H2B_UPPER_LANE_CHANGE_TIME=5.0 +DR_H2B_LANE_CHANGE_DISTANCE=1.0 +DR_H2B_NUMBER_OF_BOT_CARS=3 +DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS=2.0 +DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS=False +DR_H2B_BOT_CAR_SPEED=0.2 \ No newline at end of file diff --git a/scripts/evaluation/prepare-config.py b/scripts/evaluation/prepare-config.py index d9b04bcc..f976f9d9 100755 --- a/scripts/evaluation/prepare-config.py +++ b/scripts/evaluation/prepare-config.py @@ -12,39 +12,72 @@ def str2bool(v): return v.lower() in ("yes", "true", "t", "1") config = {} + +# Basic configuration; including all buckets etc. 
config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1') -config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') -config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar') config['JOB_TYPE'] = 'EVALUATION' config['KINESIS_VIDEO_STREAM_NAME'] = os.environ.get('DR_KINESIS_STREAM_NAME', 'my-kinesis-stream') +config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy') +config['MODEL_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') +config['MODEL_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') +config['SIMTRACE_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') +config['SIMTRACE_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') + +# Metrics config['METRIC_NAME'] = 'TrainingRewardScore' config['METRIC_NAMESPACE'] = 'AWSDeepRacer' config['METRICS_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') - metrics_prefix = os.environ.get('DR_LOCAL_S3_METRICS_PREFIX', None) if metrics_prefix is not None: config['METRICS_S3_OBJECT_KEY'] = '{}/EvaluationMetrics-{}.json'.format(metrics_prefix, str(round(time.time()))) else: config['METRICS_S3_OBJECT_KEY'] = 'DeepRacer-Metrics/EvaluationMetrics-{}.json'.format(str(round(time.time()))) -config['MODEL_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') -config['MODEL_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') -config['NUMBER_OF_TRIALS'] = os.environ.get('DR_EVAL_NUMBER_OF_TRIALS', '5') -config['NUMBER_OF_RESETS'] = os.environ.get('DR_EVAL_NUMBER_OF_RESETS', '0') -config['IS_CONTINUOUS'] = os.environ.get('DR_EVAL_IS_CONTINUOUS', '0') -config['OFF_TRACK_PENALTY'] = os.environ.get('DR_EVAL_OFF_TRACK_PENALTY', '5.0') -config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') -config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy') -config['SIMTRACE_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') 
-config['SIMTRACE_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') -config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') - +# MP4 configuration / sav save_mp4 = str2bool(os.environ.get("DR_EVAL_SAVE_MP4", "False")) - if save_mp4: config['MP4_S3_BUCKET'] = config['MODEL_S3_BUCKET'] config['MP4_S3_OBJECT_PREFIX'] = '{}/{}'.format(config['MODEL_S3_PREFIX'],'mp4') +# Car and training +config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') +config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar') +config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') +config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') +config['NUMBER_OF_TRIALS'] = os.environ.get('DR_EVAL_NUMBER_OF_TRIALS', '5') +config['DISPLAY_NAME'] = os.environ.get('DR_DISPLAY_NAME', 'racer1') +config['RACER_NAME'] = os.environ.get('DR_RACER_NAME', 'racer1') +config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') + +is_continous = str2bool(os.environ.get('DR_EVAL_IS_CONTINUOUS', 'False')) +if is_continous: + config['NUMBER_OF_RESETS'] = '10000' + config['IS_CONTINUOUS'] = os.environ.get('DR_EVAL_IS_CONTINUOUS', 'True') + +config['OFF_TRACK_PENALTY'] = os.environ.get('DR_EVAL_OFF_TRACK_PENALTY', '5.0') +config['COLLISION_PENALTY'] = os.environ.get('DR_COLLISION_PENALTY', '5.0') + +# Object Avoidance +if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE': + config['NUMBER_OF_OBSTACLES'] = os.environ.get('DR_OA_NUMBER_OF_OBSTACLES', '6') + config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0') + config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True') + config['PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS', 'False') + config['NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS'] = os.environ.get('DR_OA_NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS', '2') + config['IS_OBSTACLE_BOT_CAR'] = 
os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false') + +# Head to Bot +if config['RACE_TYPE'] == 'HEAD_TO_BOT': + config['IS_LANE_CHANGE'] = os.environ.get('DR_H2B_IS_LANE_CHANGE', 'False') + config['LOWER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME', '3.0') + config['UPPER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME', '5.0') + config['LANE_CHANGE_DISTANCE'] = os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE', '1.0') + config['NUMBER_OF_BOT_CARS'] = os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS', '0') + config['MIN_DISTANCE_BETWEEN_BOT_CARS'] = os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS', '2.0') + config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False') + config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2') + +# S3 Setup / write and upload file s3_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None) s3_region = config['AWS_REGION'] s3_bucket = config['MODEL_S3_BUCKET'] diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index f17d4fa5..888d53a7 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -9,11 +9,7 @@ import yaml config = {} -config['ALTERNATE_DRIVING_DIRECTION'] = os.environ.get('DR_ALTERNATE_DRIVING_DIRECTION', 'false') config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1') -config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') -config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar') -config['CHANGE_START_POSITION'] = os.environ.get('DR_CHANGE_START_POSITION', 'true') config['JOB_TYPE'] = 'TRAINING' config['KINESIS_VIDEO_STREAM_NAME'] = os.environ.get('DR_KINESIS_STREAM_NAME', 'my-kinesis-stream') config['METRIC_NAME'] = 'TrainingRewardScore' @@ -28,7 +24,6 @@ config['MODEL_METADATA_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_MODEL_METADATA_KEY', 'custom_files/model_metadata.json') config['NUMBER_OF_EPISODES'] = 
os.environ.get('DR_NUMBER_OF_EPISODES', '0') -config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') config['REWARD_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_REWARD_KEY', 'custom_files/reward_function.py') config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy') config['NUM_WORKERS'] = os.environ.get('DR_WORKERS', 1) @@ -38,7 +33,39 @@ config['SIMTRACE_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') config['TARGET_REWARD_SCORE'] = os.environ.get('DR_TARGET_REWARD_SCORE', 'None') config['TRAINING_JOB_ARN'] = 'arn:Dummy' + +# Car and training +config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') +config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar') +config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') +config['NUMBER_OF_TRIALS'] = os.environ.get('DR_EVAL_NUMBER_OF_TRIALS', '5') +config['DISPLAY_NAME'] = os.environ.get('DR_DISPLAY_NAME', 'racer1') +config['RACER_NAME'] = os.environ.get('DR_RACER_NAME', 'racer1') + +config['ALTERNATE_DRIVING_DIRECTION'] = os.environ.get('DR_ALTERNATE_DRIVING_DIRECTION', 'false') +config['CHANGE_START_POSITION'] = os.environ.get('DR_CHANGE_START_POSITION', 'true') +config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') + +# Object Avoidance +if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE': + config['NUMBER_OF_OBSTACLES'] = os.environ.get('DR_OA_NUMBER_OF_OBSTACLES', '6') + config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0') + config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True') + config['PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS', 'False') + config['NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS'] = os.environ.get('DR_OA_NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS', '2') + config['IS_OBSTACLE_BOT_CAR'] = 
os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false') + +# Head to Bot +if config['RACE_TYPE'] == 'HEAD_TO_BOT': + config['IS_LANE_CHANGE'] = os.environ.get('DR_H2B_IS_LANE_CHANGE', 'False') + config['LOWER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME', '3.0') + config['UPPER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME', '5.0') + config['LANE_CHANGE_DISTANCE'] = os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE', '1.0') + config['NUMBER_OF_BOT_CARS'] = os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS', '0') + config['MIN_DISTANCE_BETWEEN_BOT_CARS'] = os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS', '2.0') + config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False') + config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2') s3_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None) s3_region = config['AWS_REGION'] From 31fad1114f9b605d00b8fea09f0c7bb0aca4de26 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 24 May 2020 15:43:59 +0000 Subject: [PATCH 065/428] Bump RL Coach Image --- bin/init.sh | 2 +- defaults/template-system.env | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/init.sh b/bin/init.sh index 28cfaaa2..b45fd94f 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -131,7 +131,7 @@ do done # Download docker images. Change to build statements if locally built images are desired. 
-docker pull larsll/deepracer-rlcoach:v2.2 +docker pull larsll/deepracer-rlcoach:v2.3 docker pull awsdeepracercommunity/deepracer-robomaker:$CPU_LEVEL docker pull awsdeepracercommunity/deepracer-sagemaker:$SAGEMAKER_TAG docker pull larsll/deepracer-loganalysis:v2-cpu diff --git a/defaults/template-system.env b/defaults/template-system.env index a6d2e89f..bbc5c117 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -8,6 +8,6 @@ DR_KINESIS_STREAM_NAME=None DR_KINESIS_STREAM_ENABLE=False DR_SAGEMAKER_IMAGE= DR_ROBOMAKER_IMAGE= -DR_COACH_IMAGE=v2.2 +DR_COACH_IMAGE=v2.3 DR_WORKERS=1 DR_ROBOMAKER_MOUNT_LOGS=False From cc290e28a197ff608616bfdc8152ff1f959da0c0 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Tue, 26 May 2020 21:40:55 +0200 Subject: [PATCH 066/428] Multiple Robomaker Workers (#38) Allows user to start several Robomaker workers. --- bin/init.sh | 2 +- docker/docker-compose-robomaker-multi.yml | 15 +++++++++++++ docker/docker-compose-training.yml | 1 - scripts/training/start.sh | 26 +++++++++++++++++------ 4 files changed, 36 insertions(+), 8 deletions(-) create mode 100644 docker/docker-compose-robomaker-multi.yml diff --git a/bin/init.sh b/bin/init.sh index b45fd94f..2460a6f9 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -47,7 +47,7 @@ fi # if [[ "$(dmesg | grep AVX-512 | wc -l)" > 0 ]]; then # CPU_LEVEL="cpu-avx512" # fi - + # Check if Intel (to ensure MKN) if [[ "$(dmesg | grep GenuineIntel | wc -l)" > 0 ]]; then CPU_INTEL="true" diff --git a/docker/docker-compose-robomaker-multi.yml b/docker/docker-compose-robomaker-multi.yml new file mode 100644 index 00000000..be2e3ede --- /dev/null +++ b/docker/docker-compose-robomaker-multi.yml @@ -0,0 +1,15 @@ +version: '3.7' + +networks: + bridge: + external: true + +services: + robomaker: + networks: + - default + - bridge + deploy: + replicas: ${DR_WORKERS} + volumes: + - "${DR_DIR}/tmp/comms.${DR_RUN_ID}:/mnt/comms" diff --git 
a/docker/docker-compose-training.yml b/docker/docker-compose-training.yml index adbf70ab..2da18f16 100644 --- a/docker/docker-compose-training.yml +++ b/docker/docker-compose-training.yml @@ -37,4 +37,3 @@ services: - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} - ENABLE_GUI=${DR_GUI_ENABLE} - - ROLLOUT_IDX=0 diff --git a/scripts/training/start.sh b/scripts/training/start.sh index c62b013b..64561e5f 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -27,6 +27,10 @@ usage esac done +# Ensure Sagemaker's folder is there +sudo mkdir -p /tmp/sagemaker + +#Check if files are available S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX" S3_FILES=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls ${S3_PATH} | wc -l) @@ -42,12 +46,7 @@ then fi fi -# set evaluation specific environment variables -STACK_NAME="deepracer-$DR_RUN_ID" - -export ROBOMAKER_COMMAND="./run.sh run distributed_training.launch" -export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_TRAINING_PARAMS_FILE} - +# Base compose file if [ ${DR_ROBOMAKER_MOUNT_LOGS,,} = "true" ]; then COMPOSE_FILES="$DR_TRAIN_COMPOSE_FILE -c $DR_DIR/docker/docker-compose-mount.yml" @@ -57,6 +56,21 @@ else COMPOSE_FILES="$DR_TRAIN_COMPOSE_FILE" fi +# set evaluation specific environment variables +STACK_NAME="deepracer-$DR_RUN_ID" + +if [ "$DR_WORKERS" -gt 1 ]; then + echo "Starting $DR_WORKERS workers" + mkdir -p $DR_DIR/tmp/comms.$DR_RUN_ID + rm -rf $DR_DIR/tmp/comms.$DR_RUN_ID/* + COMPOSE_FILES="$COMPOSE_FILES -c $DR_DIR/docker/docker-compose-robomaker-multi.yml" + export ROBOMAKER_COMMAND="./run.sh multi distributed_training.launch" +else + export ROBOMAKER_COMMAND="./run.sh run distributed_training.launch" +fi + +export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_TRAINING_PARAMS_FILE} + echo "Creating Robomaker configuration in $S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE" python3 prepare-config.py From 23514de4616f724a1cf8c12a30e5a1693088283f Mon Sep 17 00:00:00 
2001 From: Lars Lorentz Ludvigsen Date: Wed, 27 May 2020 07:06:43 +0000 Subject: [PATCH 067/428] Adding option to upload specifc checkpoint --- scripts/upload/upload-model.sh | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index fc006ae1..32110e09 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -17,10 +17,12 @@ function ctrl_c() { exit 1 } -while getopts ":fwdhbp:" opt; do +while getopts ":fwdhbp:c:" opt; do case $opt in b) OPT_CHECKPOINT="Best" ;; +c) OPT_CHECKPOINT_NUM="$OPTARG" +;; f) OPT_FORCE="True" ;; d) OPT_DRYRUN="--dryrun" @@ -113,8 +115,13 @@ if [ -z "$CHECKPOINT_INDEX" ]; then exit 1 fi -if [ -z "$OPT_CHECKPOINT" ]; then - echo "Checking for latest checkpoint" +if [ -n "$OPT_CHECKPOINT_NUM" ]; then + echo "Checking for checkpoint $OPT_CHECKPOINT_NUM" + export OPT_CHECKPOINT_NUM + CHECKPOINT_FILE=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ | perl -ne'print "$1\n" if /.*\s($ENV{OPT_CHECKPOINT_NUM}_Step-[0-9]{1,7}\.ckpt)\.index/') + CHECKPOINT=`echo $CHECKPOINT_FILE | cut -f1 -d_` +elif [ -z "$OPT_CHECKPOINT" ]; then + echo "Checking for latest tested checkpoint" CHECKPOINT_FILE=`jq -r .last_checkpoint.name < $CHECKPOINT_INDEX` CHECKPOINT=`echo $CHECKPOINT_FILE | cut -f1 -d_` echo "Latest checkpoint = $CHECKPOINT" From b09aaa416a86777b43f990d15de1e6e4cf9fb619 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 28 May 2020 18:25:35 +0000 Subject: [PATCH 068/428] Preventing infinite restarts of containers --- docker/docker-compose-training.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docker/docker-compose-training.yml b/docker/docker-compose-training.yml index 2da18f16..c9a07763 100644 --- a/docker/docker-compose-training.yml +++ b/docker/docker-compose-training.yml @@ -20,6 +20,9 @@ services: volumes: - 
'//var/run/docker.sock:/var/run/docker.sock' - '/tmp/sagemaker:/tmp/sagemaker' + deploy: + restart_policy: + condition: none robomaker: image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE} command: ["${ROBOMAKER_COMMAND}"] @@ -37,3 +40,6 @@ services: - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} - ENABLE_GUI=${DR_GUI_ENABLE} + deploy: + restart_policy: + condition: none \ No newline at end of file From ae0a0b10014757e5b616dd5874b76f535250ccee Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 31 May 2020 21:25:31 +0200 Subject: [PATCH 069/428] CloudWatch Logging (#41) Writing docker logs to AWS CloudWatch Logs --- bin/activate.sh | 6 ++++++ defaults/template-system.env | 1 + docker/docker-compose-cwlog.yml | 19 +++++++++++++++++++ 3 files changed, 26 insertions(+) create mode 100644 docker/docker-compose-cwlog.yml diff --git a/bin/activate.sh b/bin/activate.sh index 160ecbf1..47e79325 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -72,6 +72,12 @@ else DR_EVAL_COMPOSE_FILE="-c $DIR/docker/docker-compose-eval.yml" fi +# Enable logs in CloudWatch +if [[ "${DR_CLOUD_WATCH_ENABLE,,}" == "true" ]]; then + DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE -c $DIR/docker/docker-compose-cwlog.yml" + DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE -c $DIR/docker/docker-compose-cwlog.yml" +fi + ## Check if we have an AWS IAM assumed role, or if we need to set specific credentials. 
if [ $(aws sts get-caller-identity | jq '.Arn' | awk /assumed-role/ | wc -l) -eq 0 ]; then diff --git a/defaults/template-system.env b/defaults/template-system.env index bbc5c117..59e75801 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -11,3 +11,4 @@ DR_ROBOMAKER_IMAGE= DR_COACH_IMAGE=v2.3 DR_WORKERS=1 DR_ROBOMAKER_MOUNT_LOGS=False +DR_CLOUD_WATCH_ENABLE=False \ No newline at end of file diff --git a/docker/docker-compose-cwlog.yml b/docker/docker-compose-cwlog.yml new file mode 100644 index 00000000..e9b78a9e --- /dev/null +++ b/docker/docker-compose-cwlog.yml @@ -0,0 +1,19 @@ +version: '3.7' + +services: + rl_coach: + logging: + driver: awslogs + options: + awslogs-group: '/deepracer-for-cloud' + awslogs-create-group: 'true' + awslogs-region: ${DR_AWS_APP_REGION} + tag: "{{.Name}}" + robomaker: + logging: + driver: awslogs + options: + awslogs-group: '/deepracer-for-cloud' + awslogs-create-group: 'true' + awslogs-region: ${DR_AWS_APP_REGION} + tag: "{{.Name}}" \ No newline at end of file From cc60d6750c8399cf759c0ea4d4a10de285d721d6 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Wed, 3 Jun 2020 22:43:33 +0200 Subject: [PATCH 070/428] Removing unused parameters (#39) * Removing unused parameters --- defaults/template-run.env | 1 - scripts/evaluation/prepare-config.py | 2 -- scripts/training/prepare-config.py | 4 ---- 3 files changed, 7 deletions(-) diff --git a/defaults/template-run.env b/defaults/template-run.env index 4e9e2cba..8e9032d1 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -3,7 +3,6 @@ DR_WORLD_NAME=reInvent2019_track DR_CHANGE_START_POSITION=True DR_ALTERNATE_DRIVING_DIRECTION=False DR_RACE_TYPE=TIME_TRIAL -DR_TARGET_REWARD_SCORE=None DR_CAR_COLOR=Red DR_CAR_NAME=FastCar DR_DISPLAY_NAME=$DR_CAR_NAME diff --git a/scripts/evaluation/prepare-config.py b/scripts/evaluation/prepare-config.py index f976f9d9..4cb328c2 100755 --- 
a/scripts/evaluation/prepare-config.py +++ b/scripts/evaluation/prepare-config.py @@ -24,8 +24,6 @@ def str2bool(v): config['SIMTRACE_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') # Metrics -config['METRIC_NAME'] = 'TrainingRewardScore' -config['METRIC_NAMESPACE'] = 'AWSDeepRacer' config['METRICS_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') metrics_prefix = os.environ.get('DR_LOCAL_S3_METRICS_PREFIX', None) if metrics_prefix is not None: diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index 888d53a7..07996272 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -12,8 +12,6 @@ config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1') config['JOB_TYPE'] = 'TRAINING' config['KINESIS_VIDEO_STREAM_NAME'] = os.environ.get('DR_KINESIS_STREAM_NAME', 'my-kinesis-stream') -config['METRIC_NAME'] = 'TrainingRewardScore' -config['METRIC_NAMESPACE'] = 'AWSDeepRacer' config['METRICS_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') metrics_prefix = os.environ.get('DR_LOCAL_S3_METRICS_PREFIX', None) @@ -23,7 +21,6 @@ config['METRICS_S3_OBJECT_KEY'] = 'DeepRacer-Metrics/TrainingMetrics-{}.json'.format(str(round(time.time()))) config['MODEL_METADATA_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_MODEL_METADATA_KEY', 'custom_files/model_metadata.json') -config['NUMBER_OF_EPISODES'] = os.environ.get('DR_NUMBER_OF_EPISODES', '0') config['REWARD_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_REWARD_KEY', 'custom_files/reward_function.py') config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy') config['NUM_WORKERS'] = os.environ.get('DR_WORKERS', 1) @@ -31,7 +28,6 @@ config['SAGEMAKER_SHARED_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') config['SIMTRACE_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') config['SIMTRACE_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 
'rl-deepracer-sagemaker') -config['TARGET_REWARD_SCORE'] = os.environ.get('DR_TARGET_REWARD_SCORE', 'None') config['TRAINING_JOB_ARN'] = 'arn:Dummy' # Car and training From 6e256cd912b427b3605efcbe548864f637c7b5c5 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 7 Jun 2020 17:58:11 +0000 Subject: [PATCH 071/428] Adding docs path --- docs/INSTALL.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/INSTALL.md diff --git a/docs/INSTALL.md b/docs/INSTALL.md new file mode 100644 index 00000000..17af5c8d --- /dev/null +++ b/docs/INSTALL.md @@ -0,0 +1 @@ +# Installation \ No newline at end of file From f738548f704eb4def6e5315bed590badf7ecc231 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 7 Jun 2020 19:59:13 +0200 Subject: [PATCH 072/428] Set theme jekyll-theme-slate --- docs/_config.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/_config.yml diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 00000000..c7418817 --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1 @@ +theme: jekyll-theme-slate \ No newline at end of file From 14ad1795dcb4ddfa004a00a37d4485596b598c8e Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 7 Jun 2020 18:06:06 +0000 Subject: [PATCH 073/428] Trigger build --- docs/INSTALL.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/INSTALL.md b/docs/INSTALL.md index 17af5c8d..bc467f86 100644 --- a/docs/INSTALL.md +++ b/docs/INSTALL.md @@ -1 +1,3 @@ -# Installation \ No newline at end of file +# Installation + +Adding a line \ No newline at end of file From 59087fca546c36e18b2b56f9c67d9513de38da47 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 7 Jun 2020 18:10:32 +0000 Subject: [PATCH 074/428] Configuring Markdown processor --- docs/_config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/_config.yml b/docs/_config.yml index c7418817..5d91f2fd 100644 
--- a/docs/_config.yml +++ b/docs/_config.yml @@ -1 +1,2 @@ -theme: jekyll-theme-slate \ No newline at end of file +theme: jekyll-theme-slate +markdown: GFM \ No newline at end of file From 341b9d593be6f98a1f0d49d87da4f328d48c8575 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 7 Jun 2020 18:13:43 +0000 Subject: [PATCH 075/428] Renaming root file --- docs/INSTALL.md | 3 --- docs/index.md | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) delete mode 100644 docs/INSTALL.md create mode 100644 docs/index.md diff --git a/docs/INSTALL.md b/docs/INSTALL.md deleted file mode 100644 index bc467f86..00000000 --- a/docs/INSTALL.md +++ /dev/null @@ -1,3 +0,0 @@ -# Installation - -Adding a line \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..f1cec9b6 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,3 @@ +# Deepracer for Cloud + +Adding a line \ No newline at end of file From 866d140d82a478cf9d8d8b41380c576dffac288a Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 7 Jun 2020 18:21:43 +0000 Subject: [PATCH 076/428] Moving around documentation --- docs/_config.yml | 3 +- docs/index.md | 2 +- docs/installation.md | 116 +++++++++++++++++++++++++++++++++++++++++++ docs/reference.md | 51 +++++++++++++++++++ 4 files changed, 170 insertions(+), 2 deletions(-) create mode 100644 docs/installation.md create mode 100644 docs/reference.md diff --git a/docs/_config.yml b/docs/_config.yml index 5d91f2fd..9ab6acae 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -1,2 +1,3 @@ theme: jekyll-theme-slate -markdown: GFM \ No newline at end of file +markdown: GFM +name: Deepracer-for-Cloud \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index f1cec9b6..8a5caae1 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,3 +1,3 @@ -# Deepracer for Cloud +# Deepracer-for-Cloud Documentation Adding a line \ No newline at end of file diff --git a/docs/installation.md 
b/docs/installation.md new file mode 100644 index 00000000..406f2823 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,116 @@ +# Installing Deepracer-for-Cloud + +## Requirements + +Depending on your needs as well as specific needs of the cloud platform you can configure your VM to your liking. + +**AWS**: +* EC2 instance of type G3, G4, P2 or P3 - recommendation is g4dn.2xlarge + * Ubuntu 18.04 + * Minimum 30 GB, preferred 40 GB of OS disk. + * Ephemeral Drive connected + * Minimum 8 GB GPU-RAM + * Recommended at least 6 VCPUs +* S3 bucket. Preferrably in same region as EC2 instance. + +**Azure**: +* N-Series VM that comes with NVIDIA Graphics Adapter - recommendation is NC6_Standard + * Ubuntu 18.04 + * Standard 30 GB OS drive is sufficient to get started. + * Recommended to add an additional 32 GB data disk if you want to use the Log Analysis container. + * Minimum 8 GB GPU-RAM + * Recommended at least 6 VCPUs +* Storage Account with one Blob container configured for Access Key authentication. + +## Installation + +The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine. + + git clone https://github.com/larsll/deepracer-for-cloud.git + cd deepracer-for-cloud && ./bin/prepare.sh + +This will prepare the VM by partitioning additional drives as well as installing all prerequisites. After a reboot it will continuee to run `./bin/init.sh` setting up the full repository and downloading the core Docker images. Depending on your environment this may take up to 30 minutes. The scripts will create a file `DONE` once completed. + +The installation script will adapt `.profile` to ensure that all settings are applied on login. Otherwise run the activation with `source bin/activate.sh`. + +For local install it is recommended *not* to run the `bin/prepare.sh` script; it might do more changes than what you want. Rather ensure that all prerequisites are set up and run `bin/init.sh` directly. 
+ +The Init Script takes a few parameters: +| Variable | Description | +|----------|-------------| +| `-c ` | Sets the cloud version to be configured, automatically updates the `DR_CLOUD` parameter in `system.env`. Options are `azure`, `aws` or `local`. Default is `local` | +| `-a ` | Sets the architecture to be configured. Either `cpu` or `gpu`. Default is `gpu`. | + +*TODO: Document how to configure via cloud-init.* + +## Environment Setup + +The environment is set via the `CLOUD` parameter in `system.env`; it can be `Azure`, `AWS` or `Local`. It is case-insensitive. Depending on the value the virtual or native S3 instance will be configured accordingly. + +### AWS + +In AWS it is possible to set up authentication to S3 in two ways: Integrated sign-on using [IAM Roles](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html) or using access keys. + +#### IAM Roles + +To use IAM Roles: +* An empty S3 bucket in the same region as the EC2 instance. +* An IAM Role that has permissions to: + * Access both the *new* S3 bucket as well as the DeepRacer bucket. + * AmazonVPCReadOnlyAccess + * AmazonKinesisVideoStreamsFullAccess if you want to stream to Kinesis +* An EC2 instance with the IAM Role assigned. +* Configure `run.env` as follows: + * `DR_LOCAL_S3_PROFILE=default` + * `DR_LOCAL_S3_BUCKET=` + * `DR_UPLOAD_S3_PROFILE=default` + * `DR_UPLOAD_S3_BUCKET=` +* Run `dr-update-env` for configuration to take effect. + +#### Manual setup +For access with IAM user: +* An empty S3 bucket in the same region as the EC2 instance. +* A real AWS IAM user set up with access keys: + * User should have permissions to access the *new* bucket as well as the dedicated DeepRacer S3 bucket. + * Use `aws configure` to configure this into the default profile. 
+* Configure `run.env` as follows: + * `DR_LOCAL_S3_PROFILE=default` + * `DR_LOCAL_S3_BUCKET=` + * `DR_UPLOAD_S3_PROFILE=default` + * `DR_UPLOAD_S3_BUCKET=` +* Run `dr-update` for configuration to take effect. + +### Azure + +In Azure mode the script-set requires the following: +* A storage account with a blob container set up with access keys: + * Use `aws configure --profile ` to configure this into a specific profile. + * Access Key ID is the Storage Account name. + * Secret Access Key is the Access Key for the Storage Account. + * The blob container is equivalent to the S3 bucket. +* A real AWS IAM user configured with `aws configure` to enable upload of models into AWS DeepRacer. +* Configure `run.env` as follows: + * `DR_LOCAL_S3_PROFILE=` + * `DR_LOCAL_S3_BUCKET=` + * `DR_UPLOAD_S3_PROFILE=default` + * `DR_UPLOAD_S3_BUCKET=` +* Run `dr-update` for configuration to take effect. + +As Azure does not natively support S3 a [minio](https://min.io/product/overview) proxy is set up on port 9000 to allow the containers to communicate and store models. + +If you want to use awscli (`aws`) to manually move files then use `aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 ...`, as this will set both `--profile` and `--endpoint-url` parameters to match your configuration. + +### Local + +Local mode runs a minio server that hosts the data in the `docker/volumes` directory. It is otherwise command-compatible with the Azure setup; as the data is accessible via Minio and not via native S3. + +After having run init.sh do the following: +* Configure the Minio credentials with `aws configure --profile minio`. The default configuration will use the `minio` profile to configure MINIO. You can choose any username or password, but username needs to be at least length 3, and password at least length 8. +* Configure your normal AWS credentials with `aws configure` if this is not already in place on your system. This is required to use the model upload functionality. 
+ +## Basic Usage + +Before every session run `dr-update` to ensure that the environment variables are set correctly. This also creates a set of aliases/commands that makes it easier to operate the setup. If `dr-update` is not found, try `source activate.sh` to get aliases defined. + +Ensure that the configuration files are uploaded into the bucket `dr-upload-custom-files`. Start a training with `dr-start-training`. + diff --git a/docs/reference.md b/docs/reference.md new file mode 100644 index 00000000..1763945f --- /dev/null +++ b/docs/reference.md @@ -0,0 +1,51 @@ +# Deepracer-for-Cloud Reference + +## Environment Variables +The scripts assume that two files `systen.env` containing constant configuration values and `run.env` with run specific values is populated with the required values. Which values go into which file is not really important. + +| Variable | Description | +|----------|-------------| +| `DR_CLOUD` | Can be `azure`, `aws` or `local`; determines how the storage will be configured.| +| `DR_WORLD_NAME` | Defines the track to be used.| +| `DR_NUMBER_OF_TRIALS` | Defines the number of trials in an evaluation session.| +| `DR_CHANGE_START_POSITION` | Determines if the racer shall round-robin the starting position during training sessions. (Recommended to be `True` for initial training.)| +| `DR_LOCAL_S3_PROFILE` | Name of AWS profile with credentials to be used. 
Stored in `~/.aws/credentials` unless AWS IAM Roles are used.| +| `DR_LOCAL_S3_BUCKET` | Name of S3 bucket which will be used during the session.| +| `DR_LOCAL_S3_MODEL_PREFIX` | Prefix of model within S3 bucket.| +| `DR_LOCAL_S3_CUSTOM_FILES_PREFIX` | Prefix of configuration files within S3 bucket.| +| `DR_LOCAL_S3_PRETRAINED` | Determines if training or evaluation shall be based on the model created in a previous session, held in `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`, accessible by credentials held in profile `{DR_LOCAL_S3_PROFILE}`.| +| `DR_LOCAL_S3_PRETRAINED_PREFIX` | Prefix of pretrained model within S3 bucket.| +| `DR_LOCAL_S3_PARAMS_FILE` | YAML file path used to configure Robomaker relative to `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`.| +| `DR_UPLOAD_S3_PROFILE` | AWS Cli profile to be used that holds the 'real' S3 credentials needed to upload a model into AWS DeepRacer.| +| `DR_UPLOAD_S3_BUCKET` | Name of the AWS DeepRacer bucket where models will be uploaded. (Typically starts with `aws-deepracer-`.)| +| `DR_UPLOAD_S3_PREFIX` | Prefix of the target location. (Typically starts with `DeepRacer-SageMaker-RoboMaker-comm-`| +| `DR_UPLOAD_MODEL_NAME` | Display name of model, not currently used; `dr-set-upload-model` sets it for readability purposes.| +| `DR_CAR_COLOR` | Color of car | +| `DR_CAR_NAME` | Display name of car; shows in Deepracer Console when uploading. | +| `DR_AWS_APP_REGION` | (AWS only) Region for other AWS resources (e.g. Kinesis) | +| `DR_KINESIS_STREAM_NAME` | Kinesis stream name | +| `DR_KINESIS_STREAM_ENABLE` | Enable or disable Kinesis Stream | +| `DR_GUI_ENABLE` | Enable or disable the Gazebo GUI in Robomaker | +| `DR_GPU_AVAILABLE` | Is GPU enabled? 
| +| `DR_DOCKER_IMAGE_TYPE` | `cpu` or `gpu`; docker images will be used based on this | + +## Commands + +| Command | Description | +|---------|-------------| +| `dr-update` | Loads in all scripts and environment variables again.| +| `dr-update-env` | Loads in all environment variables from `system.env` and `run.env`.| +| `dr-upload-custom-files` | Uploads changed configuration files from `custom_files/` into `s3://{DR_LOCAL_S3_BUCKET}/custom_files`.| +| `dr-download-custom-files` | Downloads changed configuration files from `s3://{DR_LOCAL_S3_BUCKET}/custom_files` into `custom_files/`.| +| `dr-start-training` | Starts a training session in the local VM based on current configuration.| +| `dr-increment-training` | Updates configuration, setting the current model prefix to pretrained, and incrementing a serial.| +| `dr-stop-training` | Stops the current local training session. Uploads log files.| +| `dr-start-evaluation` | Starts a evaluation session in the local VM based on current configuration.| +| `dr-stop-evaluation` | Stops the current local evaluation session. Uploads log files.| +| `dr-start-loganalysis` | Starts a Jupyter log-analysis container, available on port 8888.| +| `dr-start-loganalysis` | Stops the Jupyter log-analysis container.| +| `dr-logs-sagemaker` | Displays the logs from the running Sagemaker container.| +| `dr-logs-robomaker` | Displays the logs from the running Robomaker container.| +| `dr-list-aws-models` | Lists the models that are currently stored in your AWS DeepRacer S3 bucket. | +| `dr-set-upload-model` | Updates the `run.env` with the prefix and name of your selected model. 
| +| `dr-upload-model` | Uploads the model defined in `DR_LOCAL_S3_MODEL_PREFIX` to the AWS DeepRacer S3 prefix defined in `DR_UPLOAD_S3_PREFIX` | From 847f446ff57b88426a63d8b03988da4ad5673e36 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 7 Jun 2020 18:22:40 +0000 Subject: [PATCH 077/428] Further documentation changes --- README.md | 164 +------------------------------------------------- docs/index.md | 2 +- 2 files changed, 3 insertions(+), 163 deletions(-) diff --git a/README.md b/README.md index fff9a88d..d7810267 100644 --- a/README.md +++ b/README.md @@ -14,166 +14,6 @@ Main differences to the work done by Alex is: * `run.env` contains user session configuration (pretraining, track etc.) as well as information about where to upload your model (S3 bucket and prefix). * `docker/.env` remains the home for more static configuration. This is not expected to change between sessions. -## Requirements +# Documentation -Depending on your needs as well as specific needs of the cloud platform you can configure your VM to your liking. - -**AWS**: -* EC2 instance of type G3, G4, P2 or P3 - recommendation is g4dn.2xlarge - * Ubuntu 18.04 - * Minimum 30 GB, preferred 40 GB of OS disk. - * Ephemeral Drive connected - * Minimum 8 GB GPU-RAM - * Recommended at least 6 VCPUs -* S3 bucket. Preferrably in same region as EC2 instance. - -**Azure**: -* N-Series VM that comes with NVIDIA Graphics Adapter - recommendation is NC6_Standard - * Ubuntu 18.04 - * Standard 30 GB OS drive is sufficient to get started. - * Recommended to add an additional 32 GB data disk if you want to use the Log Analysis container. - * Minimum 8 GB GPU-RAM - * Recommended at least 6 VCPUs -* Storage Account with one Blob container configured for Access Key authentication. - -## Installation - -The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine. 
- - git clone https://github.com/larsll/deepracer-for-cloud.git - cd deepracer-for-cloud && ./bin/prepare.sh - -This will prepare the VM by partitioning additional drives as well as installing all prerequisites. After a reboot it will continuee to run `./bin/init.sh` setting up the full repository and downloading the core Docker images. Depending on your environment this may take up to 30 minutes. The scripts will create a file `DONE` once completed. - -The installation script will adapt `.profile` to ensure that all settings are applied on login. Otherwise run the activation with `source bin/activate.sh`. - -For local install it is recommended *not* to run the `bin/prepare.sh` script; it might do more changes than what you want. Rather ensure that all prerequisites are set up and run `bin/init.sh` directly. - -The Init Script takes a few parameters: -| Variable | Description | -|----------|-------------| -| `-c ` | Sets the cloud version to be configured, automatically updates the `DR_CLOUD` parameter in `system.env`. Options are `azure`, `aws` or `local`. Default is `local` | -| `-a ` | Sets the architecture to be configured. Either `cpu` or `gpu`. Default is `gpu`. | - -*TODO: Document how to configure via cloud-init.* - -## Environment Setup - -The environment is set via the `CLOUD` parameter in `system.env`; it can be `Azure`, `AWS` or `Local`. It is case-insensitive. Depending on the value the virtual or native S3 instance will be configured accordingly. - -### AWS - -In AWS it is possible to set up authentication to S3 in two ways: Integrated sign-on using [IAM Roles](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html) or using access keys. - -#### IAM Roles - -To use IAM Roles: -* An empty S3 bucket in the same region as the EC2 instance. -* An IAM Role that has permissions to: - * Access both the *new* S3 bucket as well as the DeepRacer bucket. 
- * AmazonVPCReadOnlyAccess - * AmazonKinesisVideoStreamsFullAccess if you want to stream to Kinesis -* An EC2 instance with the IAM Role assigned. -* Configure `run.env` as follows: - * `DR_LOCAL_S3_PROFILE=default` - * `DR_LOCAL_S3_BUCKET=` - * `DR_UPLOAD_S3_PROFILE=default` - * `DR_UPLOAD_S3_BUCKET=` -* Run `dr-update-env` for configuration to take effect. - -#### Manual setup -For access with IAM user: -* An empty S3 bucket in the same region as the EC2 instance. -* A real AWS IAM user set up with access keys: - * User should have permissions to access the *new* bucket as well as the dedicated DeepRacer S3 bucket. - * Use `aws configure` to configure this into the default profile. -* Configure `run.env` as follows: - * `DR_LOCAL_S3_PROFILE=default` - * `DR_LOCAL_S3_BUCKET=` - * `DR_UPLOAD_S3_PROFILE=default` - * `DR_UPLOAD_S3_BUCKET=` -* Run `dr-update` for configuration to take effect. - -### Azure - -In Azure mode the script-set requires the following: -* A storage account with a blob container set up with access keys: - * Use `aws configure --profile ` to configure this into a specific profile. - * Access Key ID is the Storage Account name. - * Secret Access Key is the Access Key for the Storage Account. - * The blob container is equivalent to the S3 bucket. -* A real AWS IAM user configured with `aws configure` to enable upload of models into AWS DeepRacer. -* Configure `run.env` as follows: - * `DR_LOCAL_S3_PROFILE=` - * `DR_LOCAL_S3_BUCKET=` - * `DR_UPLOAD_S3_PROFILE=default` - * `DR_UPLOAD_S3_BUCKET=` -* Run `dr-update` for configuration to take effect. - -As Azure does not natively support S3 a [minio](https://min.io/product/overview) proxy is set up on port 9000 to allow the containers to communicate and store models. - -If you want to use awscli (`aws`) to manually move files then use `aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 ...`, as this will set both `--profile` and `--endpoint-url` parameters to match your configuration. 
- -### Local - -Local mode runs a minio server that hosts the data in the `docker/volumes` directory. It is otherwise command-compatible with the Azure setup; as the data is accessible via Minio and not via native S3. - -After having run init.sh do the following: -* Configure the Minio credentials with `aws configure --profile minio`. The default configuration will use the `minio` profile to configure MINIO. You can choose any username or password, but username needs to be at least length 3, and password at least length 8. -* Configure your normal AWS credentials with `aws configure` if this is not already in place on your system. This is required to use the model upload functionality. - -### Environment Variables -The scripts assume that two files `systen.env` containing constant configuration values and `run.env` with run specific values is populated with the required values. Which values go into which file is not really important. - -| Variable | Description | -|----------|-------------| -| `DR_CLOUD` | Can be `azure`, `aws` or `local`; determines how the storage will be configured.| -| `DR_WORLD_NAME` | Defines the track to be used.| -| `DR_NUMBER_OF_TRIALS` | Defines the number of trials in an evaluation session.| -| `DR_CHANGE_START_POSITION` | Determines if the racer shall round-robin the starting position during training sessions. (Recommended to be `True` for initial training.)| -| `DR_LOCAL_S3_PROFILE` | Name of AWS profile with credentials to be used. 
Stored in `~/.aws/credentials` unless AWS IAM Roles are used.| -| `DR_LOCAL_S3_BUCKET` | Name of S3 bucket which will be used during the session.| -| `DR_LOCAL_S3_MODEL_PREFIX` | Prefix of model within S3 bucket.| -| `DR_LOCAL_S3_CUSTOM_FILES_PREFIX` | Prefix of configuration files within S3 bucket.| -| `DR_LOCAL_S3_PRETRAINED` | Determines if training or evaluation shall be based on the model created in a previous session, held in `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`, accessible by credentials held in profile `{DR_LOCAL_S3_PROFILE}`.| -| `DR_LOCAL_S3_PRETRAINED_PREFIX` | Prefix of pretrained model within S3 bucket.| -| `DR_LOCAL_S3_PARAMS_FILE` | YAML file path used to configure Robomaker relative to `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`.| -| `DR_UPLOAD_S3_PROFILE` | AWS Cli profile to be used that holds the 'real' S3 credentials needed to upload a model into AWS DeepRacer.| -| `DR_UPLOAD_S3_BUCKET` | Name of the AWS DeepRacer bucket where models will be uploaded. (Typically starts with `aws-deepracer-`.)| -| `DR_UPLOAD_S3_PREFIX` | Prefix of the target location. (Typically starts with `DeepRacer-SageMaker-RoboMaker-comm-`| -| `DR_UPLOAD_MODEL_NAME` | Display name of model, not currently used; `dr-set-upload-model` sets it for readability purposes.| -| `DR_CAR_COLOR` | Color of car | -| `DR_CAR_NAME` | Display name of car; shows in Deepracer Console when uploading. | -| `DR_AWS_APP_REGION` | (AWS only) Region for other AWS resources (e.g. Kinesis) | -| `DR_KINESIS_STREAM_NAME` | Kinesis stream name | -| `DR_KINESIS_STREAM_ENABLE` | Enable or disable Kinesis Stream | -| `DR_GUI_ENABLE` | Enable or disable the Gazebo GUI in Robomaker | -| `DR_GPU_AVAILABLE` | Is GPU enabled? | -| `DR_DOCKER_IMAGE_TYPE` | `cpu` or `gpu`; docker images will be used based on this | - -## Usage - -Before every session run `dr-update` to ensure that the environment variables are set correctly. 
This also creates a set of aliases/commands that makes it easier to operate the setup. If `dr-update` is not found, try `source activate.sh` to get aliases defined. - -Ensure that the configuration files are uploaded into the bucket `dr-upload-custom-files`. Start a training with `dr-start-training`. - -### Commands - -| Command | Description | -|---------|-------------| -| `dr-update` | Loads in all scripts and environment variables again.| -| `dr-update-env` | Loads in all environment variables from `system.env` and `run.env`.| -| `dr-upload-custom-files` | Uploads changed configuration files from `custom_files/` into `s3://{DR_LOCAL_S3_BUCKET}/custom_files`.| -| `dr-download-custom-files` | Downloads changed configuration files from `s3://{DR_LOCAL_S3_BUCKET}/custom_files` into `custom_files/`.| -| `dr-start-training` | Starts a training session in the local VM based on current configuration.| -| `dr-increment-training` | Updates configuration, setting the current model prefix to pretrained, and incrementing a serial.| -| `dr-stop-training` | Stops the current local training session. Uploads log files.| -| `dr-start-evaluation` | Starts a evaluation session in the local VM based on current configuration.| -| `dr-stop-evaluation` | Stops the current local evaluation session. Uploads log files.| -| `dr-start-loganalysis` | Starts a Jupyter log-analysis container, available on port 8888.| -| `dr-start-loganalysis` | Stops the Jupyter log-analysis container.| -| `dr-logs-sagemaker` | Displays the logs from the running Sagemaker container.| -| `dr-logs-robomaker` | Displays the logs from the running Robomaker container.| -| `dr-list-aws-models` | Lists the models that are currently stored in your AWS DeepRacer S3 bucket. | -| `dr-set-upload-model` | Updates the `run.env` with the prefix and name of your selected model. 
| -| `dr-upload-model` | Uploads the model defined in `DR_LOCAL_S3_MODEL_PREFIX` to the AWS DeepRacer S3 prefix defined in `DR_UPLOAD_S3_PREFIX` | +Full documentation can be found on the [Deepracer-for-Cloud GitHub Pages](https://larsll.github.io/deepracer-for-cloud) \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 8a5caae1..f129ba2f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,3 +1,3 @@ # Deepracer-for-Cloud Documentation -Adding a line \ No newline at end of file +See the following \ No newline at end of file From cffd5f410407dbf8b8cf8fe7d0b0b47ab1720b5e Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 7 Jun 2020 18:31:09 +0000 Subject: [PATCH 078/428] Create an index --- README.md | 2 +- docs/_config.yml | 3 ++- docs/index.md | 7 +++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d7810267..cb59bb1b 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,6 @@ Main differences to the work done by Alex is: * `run.env` contains user session configuration (pretraining, track etc.) as well as information about where to upload your model (S3 bucket and prefix). * `docker/.env` remains the home for more static configuration. This is not expected to change between sessions. 
-# Documentation +## Documentation Full documentation can be found on the [Deepracer-for-Cloud GitHub Pages](https://larsll.github.io/deepracer-for-cloud) \ No newline at end of file diff --git a/docs/_config.yml b/docs/_config.yml index 9ab6acae..c0e30f22 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -1,3 +1,4 @@ +--- theme: jekyll-theme-slate markdown: GFM -name: Deepracer-for-Cloud \ No newline at end of file +name: Deepracer-for-Cloud diff --git a/docs/index.md b/docs/index.md index f129ba2f..f28a4e7d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,3 +1,6 @@ -# Deepracer-for-Cloud Documentation +# Documentation -See the following \ No newline at end of file +### Setup + +* [Initial Installation](installation) +* [Reference](reference) \ No newline at end of file From 8c2bb6df270315c1a6572773e6eb258b8ae3e5d9 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 7 Jun 2020 18:33:50 +0000 Subject: [PATCH 079/428] Updates --- docs/index.md | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index f28a4e7d..30c1729a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,6 +1,21 @@ -# Documentation +# Introduction + +Provides a quick and easy way to get up and running with a DeepRacer training environment in Azure or AWS, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing). + +This repo started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. 
This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://github.com/larsll/deepracer-build) repository.
+
+Main differences to the work done by Alex is:
+* Runtime S3 storage is setup to fit the connected cloud platform:
+  * Azure: Local 'virtual' S3 instance (minio) is now using an Azure Storage Account / Blob Storage as a back-end. This allows for access between sessions using e.g. Storage Explorer (https://azure.microsoft.com/en-us/features/storage-explorer/).
+  * AWS: Directly connects to a real S3 bucket.
+* Robomaker and Log Analysis containers are extended with required drivers to enable Tensorflow to use the GPU. Containers are all pre-compiled and available from Docker Hub.
+* Configuration has been reorganized :
+  * `custom_files/hyperparameters.json` stores the runtime hyperparameters, which logically belongs together with the model_metadata.json and rewards.py files.
+  * `system.env` contains system-wide constants (expected to be configured only at setup)
+  * `run.env` contains user session configuration (pretraining, track etc.) as well as information about where to upload your model (S3 bucket and prefix).
+  * `docker/.env` remains the home for more static configuration. This is not expected to change between sessions.
+ +# Documentation * [Initial Installation](installation) * [Reference](reference) \ No newline at end of file From afa51a9dfbb36a0af5b7fbf3aeaa1c418bfdb751 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 7 Jun 2020 18:37:26 +0000 Subject: [PATCH 080/428] Fixing --- docs/installation.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/installation.md b/docs/installation.md index 406f2823..0d407954 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -36,6 +36,7 @@ The installation script will adapt `.profile` to ensure that all settings are ap For local install it is recommended *not* to run the `bin/prepare.sh` script; it might do more changes than what you want. Rather ensure that all prerequisites are set up and run `bin/init.sh` directly. The Init Script takes a few parameters: + | Variable | Description | |----------|-------------| | `-c ` | Sets the cloud version to be configured, automatically updates the `DR_CLOUD` parameter in `system.env`. Options are `azure`, `aws` or `local`. Default is `local` | From eb88d567c94b4748778b3e12a30e07de0c61d9b9 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Fri, 12 Jun 2020 20:59:33 +0200 Subject: [PATCH 081/428] Create Dev Branch --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cb59bb1b..ae96cf47 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,8 @@ Main differences to the work done by Alex is: * `run.env` contains user session configuration (pretraining, track etc.) as well as information about where to upload your model (S3 bucket and prefix). * `docker/.env` remains the home for more static configuration. This is not expected to change between sessions. 
+## Features + ## Documentation -Full documentation can be found on the [Deepracer-for-Cloud GitHub Pages](https://larsll.github.io/deepracer-for-cloud) \ No newline at end of file +Full documentation can be found on the [Deepracer-for-Cloud GitHub Pages](https://larsll.github.io/deepracer-for-cloud) From d08b81f1880d04e591b5948525bdfeb7b4873424 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Fri, 12 Jun 2020 21:00:10 +0200 Subject: [PATCH 082/428] Ability to select between docker swarm and docker-compose (#44) * Initial version for test * Fixing evaluation and log-level * Bug-fixes --- bin/activate.sh | 50 ++++++++++++++++------- defaults/template-system.env | 3 +- docker/docker-compose-eval.yml | 4 +- docker/docker-compose-robomaker-multi.yml | 9 ---- docker/docker-compose-training-swarm.yml | 12 ++++++ docker/docker-compose-training.yml | 8 +--- scripts/evaluation/start.sh | 10 ++++- scripts/evaluation/stop.sh | 11 ++++- scripts/training/start.sh | 13 ++++-- scripts/training/stop.sh | 11 ++++- 10 files changed, 90 insertions(+), 41 deletions(-) create mode 100644 docker/docker-compose-training-swarm.yml diff --git a/bin/activate.sh b/bin/activate.sh index 47e79325..2e2a676d 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -30,8 +30,15 @@ function dr-update-env { if [[ -z "${DR_RUN_ID}" ]]; then export DR_RUN_ID=0 fi - export DR_ROBOMAKER_PORT=$(echo "8080 + $DR_RUN_ID" | bc) - export DR_ROBOMAKER_GUI_PORT=$(echo "5900 + $DR_RUN_ID" | bc) + + if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; + then + export DR_ROBOMAKER_PORT=$(expr 8080 + $DR_RUN_ID) + export DR_ROBOMAKER_GUI_PORT=$(expr 5900 + $DR_RUN_ID) + else + export DR_ROBOMAKER_PORT="8080-8100" + export DR_ROBOMAKER_GUI_PORT="5900-5920" + fi } @@ -52,30 +59,45 @@ else return 1 fi +# Check if we will use Docker Swarm or Docker Compose +if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; +then + export DR_DOCKER_FILE_SEP="-c" +else + export 
DR_DOCKER_FILE_SEP="-f" +fi + +# Prepare the docker compose files depending on parameters if [[ "${DR_CLOUD,,}" == "azure" ]]; then export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" - DR_TRAIN_COMPOSE_FILE="-c $DIR/docker/docker-compose-training.yml -c $DIR/docker/docker-compose-endpoint.yml" - DR_EVAL_COMPOSE_FILE="-c $DIR/docker/docker-compose-eval.yml -c $DIR/docker/docker-compose-endpoint.yml" - DR_MINIO_COMPOSE_FILE="-c $DIR/docker/docker-compose-azure.yml" + DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" + DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" + DR_MINIO_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-azure.yml" elif [[ "${DR_CLOUD,,}" == "local" ]]; then export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" - DR_TRAIN_COMPOSE_FILE="-c $DIR/docker/docker-compose-training.yml -c $DIR/docker/docker-compose-endpoint.yml" - DR_EVAL_COMPOSE_FILE="-c $DIR/docker/docker-compose-eval.yml -c $DIR/docker/docker-compose-endpoint.yml" - DR_MINIO_COMPOSE_FILE="-c $DIR/docker/docker-compose-local.yml" + DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" + DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" + DR_MINIO_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local.yml" else DR_LOCAL_PROFILE_ENDPOINT_URL="" - DR_TRAIN_COMPOSE_FILE="-c $DIR/docker/docker-compose-training.yml" - DR_EVAL_COMPOSE_FILE="-c $DIR/docker/docker-compose-eval.yml" + 
DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml" + DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml" +fi + +# Prevent docker swarms to restart +if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; +then + DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training-swarm.yml" fi # Enable logs in CloudWatch if [[ "${DR_CLOUD_WATCH_ENABLE,,}" == "true" ]]; then - DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE -c $DIR/docker/docker-compose-cwlog.yml" - DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE -c $DIR/docker/docker-compose-cwlog.yml" + DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-cwlog.yml" + DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-cwlog.yml" fi ## Check if we have an AWS IAM assumed role, or if we need to set specific credentials. @@ -83,8 +105,8 @@ if [ $(aws sts get-caller-identity | jq '.Arn' | awk /assumed-role/ | wc -l) -eq then export DR_LOCAL_ACCESS_KEY_ID=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_access_key_id | xargs) export DR_LOCAL_SECRET_ACCESS_KEY=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) - DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE -c $DIR/docker/docker-compose-keys.yml" - DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE -c $DIR/docker/docker-compose-keys.yml" + DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-keys.yml" + DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-keys.yml" export DR_UPLOAD_PROFILE="--profile $DR_UPLOAD_S3_PROFILE" export DR_LOCAL_S3_AUTH_MODE="profile" else diff --git a/defaults/template-system.env b/defaults/template-system.env index 59e75801..9706f699 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -11,4 +11,5 @@ DR_ROBOMAKER_IMAGE= DR_COACH_IMAGE=v2.3 DR_WORKERS=1 
DR_ROBOMAKER_MOUNT_LOGS=False -DR_CLOUD_WATCH_ENABLE=False \ No newline at end of file +DR_CLOUD_WATCH_ENABLE=False +DR_DOCKER_STYLE=swarm \ No newline at end of file diff --git a/docker/docker-compose-eval.yml b/docker/docker-compose-eval.yml index 685b1e03..6894199b 100644 --- a/docker/docker-compose-eval.yml +++ b/docker/docker-compose-eval.yml @@ -8,14 +8,12 @@ networks: services: rl_coach: image: larsll/deepracer-rlcoach:${DR_COACH_IMAGE} - deploy: - replicas: 0 + command: ["/bin/bash", "-c", "echo No work for coach in Evaluation Mode"] robomaker: image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE} command: ["${ROBOMAKER_COMMAND}"] ports: - "${DR_ROBOMAKER_PORT}:8080" - - "${DR_ROBOMAKER_GUI_PORT}:5900" environment: - XAUTHORITY=/root/.Xauthority - DISPLAY_N=:0 diff --git a/docker/docker-compose-robomaker-multi.yml b/docker/docker-compose-robomaker-multi.yml index be2e3ede..62718412 100644 --- a/docker/docker-compose-robomaker-multi.yml +++ b/docker/docker-compose-robomaker-multi.yml @@ -1,15 +1,6 @@ version: '3.7' -networks: - bridge: - external: true - services: robomaker: - networks: - - default - - bridge - deploy: - replicas: ${DR_WORKERS} volumes: - "${DR_DIR}/tmp/comms.${DR_RUN_ID}:/mnt/comms" diff --git a/docker/docker-compose-training-swarm.yml b/docker/docker-compose-training-swarm.yml new file mode 100644 index 00000000..35667971 --- /dev/null +++ b/docker/docker-compose-training-swarm.yml @@ -0,0 +1,12 @@ +version: '3.7' + +services: + rl_coach: + deploy: + restart_policy: + condition: none + robomaker: + deploy: + restart_policy: + condition: none + replicas: ${DR_WORKERS} \ No newline at end of file diff --git a/docker/docker-compose-training.yml b/docker/docker-compose-training.yml index c9a07763..521c8903 100644 --- a/docker/docker-compose-training.yml +++ b/docker/docker-compose-training.yml @@ -20,9 +20,6 @@ services: volumes: - '//var/run/docker.sock:/var/run/docker.sock' - '/tmp/sagemaker:/tmp/sagemaker' - deploy: - 
restart_policy: - condition: none robomaker: image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE} command: ["${ROBOMAKER_COMMAND}"] @@ -39,7 +36,4 @@ services: - S3_YAML_NAME=${DR_CURRENT_PARAMS_FILE} - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} - - ENABLE_GUI=${DR_GUI_ENABLE} - deploy: - restart_policy: - condition: none \ No newline at end of file + - ENABLE_GUI=${DR_GUI_ENABLE} \ No newline at end of file diff --git a/scripts/evaluation/start.sh b/scripts/evaluation/start.sh index 0a9bc71c..401f6621 100755 --- a/scripts/evaluation/start.sh +++ b/scripts/evaluation/start.sh @@ -24,7 +24,7 @@ export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_EVAL_PARAMS_FILE} if [ ${DR_ROBOMAKER_MOUNT_LOGS,,} = "true" ]; then - COMPOSE_FILES="$DR_EVAL_COMPOSE_FILE -c $DR_DIR/docker/docker-compose-mount.yml" + COMPOSE_FILES="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-mount.yml" export DR_MOUNT_DIR="$DR_DIR/data/logs/robomaker/$DR_LOCAL_S3_MODEL_PREFIX" mkdir -p $DR_MOUNT_DIR else @@ -34,7 +34,13 @@ fi echo "Creating Robomaker configuration in $S3_PATH/$DR_CURRENT_PARAMS_FILE" python3 prepare-config.py -docker stack deploy $COMPOSE_FILES $STACK_NAME +# Check if we will use Docker Swarm or Docker Compose +if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; +then + docker stack deploy $COMPOSE_FILES $STACK_NAME +else + docker-compose $COMPOSE_FILES --log-level ERROR -p $STACK_NAME up -d +fi echo 'waiting for containers to start up...' 
diff --git a/scripts/evaluation/stop.sh b/scripts/evaluation/stop.sh index 416eacfd..4743dd4a 100755 --- a/scripts/evaluation/stop.sh +++ b/scripts/evaluation/stop.sh @@ -3,4 +3,13 @@ STACK_NAME="deepracer-eval-$DR_RUN_ID" RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX} -docker stack rm $STACK_NAME \ No newline at end of file +# Check if we will use Docker Swarm or Docker Compose +if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; +then + docker stack rm $STACK_NAME +else + COMPOSE_FILES=$(echo ${DR_EVAL_COMPOSE_FILE} | cut -f1-2 -d\ ) + export DR_CURRENT_PARAMS_FILE="" + export ROBOMAKER_COMMAND="" + docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR down +fi \ No newline at end of file diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 64561e5f..9f974910 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -49,7 +49,7 @@ fi # Base compose file if [ ${DR_ROBOMAKER_MOUNT_LOGS,,} = "true" ]; then - COMPOSE_FILES="$DR_TRAIN_COMPOSE_FILE -c $DR_DIR/docker/docker-compose-mount.yml" + COMPOSE_FILES="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-mount.yml" export DR_MOUNT_DIR="$DR_DIR/data/logs/robomaker/$DR_LOCAL_S3_MODEL_PREFIX" mkdir -p $DR_MOUNT_DIR else @@ -63,7 +63,7 @@ if [ "$DR_WORKERS" -gt 1 ]; then echo "Starting $DR_WORKERS workers" mkdir -p $DR_DIR/tmp/comms.$DR_RUN_ID rm -rf $DR_DIR/tmp/comms.$DR_RUN_ID/* - COMPOSE_FILES="$COMPOSE_FILES -c $DR_DIR/docker/docker-compose-robomaker-multi.yml" + COMPOSE_FILES="$COMPOSE_FILES $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-robomaker-multi.yml" export ROBOMAKER_COMMAND="./run.sh multi distributed_training.launch" else export ROBOMAKER_COMMAND="./run.sh run distributed_training.launch" @@ -74,7 +74,14 @@ export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_TRAINING_PARAMS_FILE} echo "Creating Robomaker configuration in $S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE" python3 prepare-config.py -docker stack deploy $COMPOSE_FILES $STACK_NAME +# Check if we will use 
Docker Swarm or Docker Compose +if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; +then + docker stack deploy $COMPOSE_FILES $STACK_NAME +else + docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR up -d --scale robomaker=$DR_WORKERS +fi + echo 'Waiting for containers to start up...' #sleep for 20 seconds to allow the containers to start diff --git a/scripts/training/stop.sh b/scripts/training/stop.sh index c9a62f7d..66efbfd2 100755 --- a/scripts/training/stop.sh +++ b/scripts/training/stop.sh @@ -19,4 +19,13 @@ then done fi -docker stack rm $STACK_NAME \ No newline at end of file +# Check if we will use Docker Swarm or Docker Compose +if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; +then + docker stack rm $STACK_NAME +else + COMPOSE_FILES=$(echo ${DR_TRAIN_COMPOSE_FILE} | cut -f1-2 -d\ ) + export DR_CURRENT_PARAMS_FILE="" + export ROBOMAKER_COMMAND="" + docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR down +fi \ No newline at end of file From 86b4965ec94e6c532723d4e66baca1546f4bb806 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Fri, 12 Jun 2020 21:25:30 +0200 Subject: [PATCH 083/428] Replica Number from Swarm (#47) * Adding a variable with the replica slot number * Loading replica number from Swarm --- docker/docker-compose-training-swarm.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/docker-compose-training-swarm.yml b/docker/docker-compose-training-swarm.yml index 35667971..06e250bb 100644 --- a/docker/docker-compose-training-swarm.yml +++ b/docker/docker-compose-training-swarm.yml @@ -9,4 +9,6 @@ services: deploy: restart_policy: condition: none - replicas: ${DR_WORKERS} \ No newline at end of file + replicas: ${DR_WORKERS} + environment: + - DOCKER_REPLICA_SLOT={{.Task.Slot}} \ No newline at end of file From 0cb01915dc8c7ac87979a830c2f6000ef86eaa54 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Fri, 12 
Jun 2020 21:27:50 +0200 Subject: [PATCH 084/428] Support use of local X-Server (#46) * Fixing evaluation and log-level * Initial Commit * Moving files --- bin/activate.sh | 9 ++++++++- defaults/template-system.env | 3 ++- docker/docker-compose-local-xorg.yml | 10 ++++++++++ utils/setup-xorg.sh | 19 +++++++++++++++++++ utils/start-xorg.sh | 7 +++++++ 5 files changed, 46 insertions(+), 2 deletions(-) create mode 100644 docker/docker-compose-local-xorg.yml create mode 100755 utils/setup-xorg.sh create mode 100755 utils/start-xorg.sh diff --git a/bin/activate.sh b/bin/activate.sh index 2e2a676d..20f6c763 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -37,7 +37,7 @@ function dr-update-env { export DR_ROBOMAKER_GUI_PORT=$(expr 5900 + $DR_RUN_ID) else export DR_ROBOMAKER_PORT="8080-8100" - export DR_ROBOMAKER_GUI_PORT="5900-5920" + export DR_ROBOMAKER_GUI_PORT="5901-5920" fi } @@ -88,6 +88,13 @@ else DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml" fi +# Prevent docker swarms to restart +if [[ "${DR_HOST_X,,}" == "true" ]]; +then + DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local-xorg.yml" + DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local-xorg.yml" +fi + # Prevent docker swarms to restart if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then diff --git a/defaults/template-system.env b/defaults/template-system.env index 9706f699..fd66cea6 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -12,4 +12,5 @@ DR_COACH_IMAGE=v2.3 DR_WORKERS=1 DR_ROBOMAKER_MOUNT_LOGS=False DR_CLOUD_WATCH_ENABLE=False -DR_DOCKER_STYLE=swarm \ No newline at end of file +DR_DOCKER_STYLE=swarm +DR_HOST_X=False \ No newline at end of file diff --git a/docker/docker-compose-local-xorg.yml b/docker/docker-compose-local-xorg.yml new file mode 100644 index 00000000..5565359d --- /dev/null +++ b/docker/docker-compose-local-xorg.yml @@ -0,0 +1,10 @@ 
+version: '3.7' + +services: + robomaker: + ipc: host + environment: + - DISPLAY + - USE_EXTERNAL_X=${DR_HOST_X} + volumes: + - '/tmp/.X11-unix/:/tmp/.X11-unix' \ No newline at end of file diff --git a/utils/setup-xorg.sh b/utils/setup-xorg.sh new file mode 100755 index 00000000..bdbd3111 --- /dev/null +++ b/utils/setup-xorg.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Script to install basic X-Windows on a headless instance (e.g. in EC2) + +# Install additional packages +sudo apt-get install xinit xserver-xorg-legacy x11-xserver-utils \ + menu mesa-utils xterm jwm x11vnc -y --no-install-recommends + +# Configure +sudo sed -i -e "s/console/anybody/" /etc/X11/Xwrapper.config +BUS_ID=$(nvidia-xconfig --query-gpu-info | grep "PCI BusID" | cut -f2- -d: | sed -e 's/^[[:space:]]*//') +sudo nvidia-xconfig --busid=$BUS_ID --enable-all-gpus -o /etc/X11/xorg.conf + +sudo tee -a /etc/X11/xorg.conf << EOF + +Section "DRI" + Mode 0666 +EndSection +EOF diff --git a/utils/start-xorg.sh b/utils/start-xorg.sh new file mode 100755 index 00000000..a5abba04 --- /dev/null +++ b/utils/start-xorg.sh @@ -0,0 +1,7 @@ +#!/bin/bash +export DISPLAY=:0 +xinit /usr/bin/jwm & +sleep 1 +xrandr -s 1400x900 +x11vnc -bg -forever -nopw -rfbport 5900 -display WAIT$DISPLAY & +sleep 1 From 39da8829eea79b90d1b9fe4cc9cf8346d9df3656 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Fri, 12 Jun 2020 22:43:45 +0200 Subject: [PATCH 085/428] Complete multi-node Swarm feature (#49) * Mount election directory only if in compose * Force placement of RL_Coach * Sagemaker on central node only --- bin/init.sh | 2 ++ docker/docker-compose-training-swarm.yml | 2 ++ scripts/training/start.sh | 11 ++++++++--- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/bin/init.sh b/bin/init.sh index 2460a6f9..cf22d6d0 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -139,6 +139,8 @@ docker pull larsll/deepracer-loganalysis:v2-cpu # create the network sagemaker-local if it 
doesn't exit SAGEMAKER_NW='sagemaker-local' docker swarm init +SWARM_NODE=$(docker node inspect self | jq .[0].ID -r) +docker node update --label-add Sagemaker=true $SWARM_NODE docker network ls | grep -q $SAGEMAKER_NW if [ $? -ne 0 ] then diff --git a/docker/docker-compose-training-swarm.yml b/docker/docker-compose-training-swarm.yml index 06e250bb..abff3912 100644 --- a/docker/docker-compose-training-swarm.yml +++ b/docker/docker-compose-training-swarm.yml @@ -5,6 +5,8 @@ services: deploy: restart_policy: condition: none + placement: + constraints: [node.labels.Sagemaker == true ] robomaker: deploy: restart_policy: diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 9f974910..3b3dd844 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -61,9 +61,14 @@ STACK_NAME="deepracer-$DR_RUN_ID" if [ "$DR_WORKERS" -gt 1 ]; then echo "Starting $DR_WORKERS workers" - mkdir -p $DR_DIR/tmp/comms.$DR_RUN_ID - rm -rf $DR_DIR/tmp/comms.$DR_RUN_ID/* - COMPOSE_FILES="$COMPOSE_FILES $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-robomaker-multi.yml" + + if [[ "${DR_DOCKER_STYLE,,}" != "swarm" ]]; + then + mkdir -p $DR_DIR/tmp/comms.$DR_RUN_ID + rm -rf $DR_DIR/tmp/comms.$DR_RUN_ID/* + COMPOSE_FILES="$COMPOSE_FILES $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-robomaker-multi.yml" + fi + export ROBOMAKER_COMMAND="./run.sh multi distributed_training.launch" else export ROBOMAKER_COMMAND="./run.sh run distributed_training.launch" From e606fe3a53a0a88109d7f19ca121b25d131c98ad Mon Sep 17 00:00:00 2001 From: dartjason <51768630+dartjason@users.noreply.github.com> Date: Sun, 14 Jun 2020 04:37:25 -0400 Subject: [PATCH 086/428] Add Autorun functionality (#45) --- bin/autorun.sh | 33 +++++++++ bin/init.sh | 33 +++++++++ utils/sample-createspot.sh | 133 +++++++++++++++++++++++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 bin/autorun.sh create mode 100644 utils/sample-createspot.sh diff --git a/bin/autorun.sh 
b/bin/autorun.sh new file mode 100644 index 00000000..0a8da5df --- /dev/null +++ b/bin/autorun.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +## this is the default autorun script +## file should run automatically after init.sh completes. +## this script downloads your configured run.env, system.env and any custom container requests + +INSTALL_DIR_TEMP="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." >/dev/null 2>&1 && pwd )" + +## retrieve the s3_location name you sent the instance in user data launch +## assumed to first line of file +S3_LOCATION=$(awk 'NR==1 {print; exit}' $INSTALL_DIR_TEMP/bin/s3_training_location.txt) + +source $INSTALL_DIR_TEMP/bin/activate.sh + +## get the updatated run.env and system.env files and any others you stashed in s3 +aws s3 sync s3://$S3_LOCATION $INSTALL_DIR_TEMP + +## get the right docker containers, if needed +SYSENV="$INSTALL_DIR_TEMP/system.env" +SAGEMAKER_IMAGE=$(cat $SYSENV | grep DR_SAGEMAKER_IMAGE | sed 's/.*=//') +ROBOMAKER_IMAGE=$(cat $SYSENV | grep DR_ROBOMAKER_IMAGE | sed 's/.*=//') + +docker pull awsdeepracercommunity/deepracer-sagemaker:$SAGEMAKER_IMAGE +docker pull awsdeepracercommunity/deepracer-robomaker:$ROBOMAKER_IMAGE + +dr-reload + +date | tee $INSTALL_DIR_TEMP/DONE-AUTORUN + +## start training +cd $INSTALL_DIR_TEMP/scripts/training +./start.sh + diff --git a/bin/init.sh b/bin/init.sh index cf22d6d0..8dae4a8a 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -155,3 +155,36 @@ fi # mark as done date | tee $INSTALL_DIR/DONE + + + +## Optional auturun feature +# if using automation scripts to auto configure and run +# you must pass s3_training_location.txt to this instance in order for this to work +if [[ -f "$INSTALL_DIR/bin/s3_training_location.txt" ]] +then + ## read in first line. first line always assumed to be training location regardless what else is in file + TRAINING_LOC=$(awk 'NR==1 {print; exit}' $INSTALL_DIR/bin/s3_training_location.txt) + + #get bucket name + TRAINING_BUCKET=${TRAINING_LOC%%/*} + #get prefix. 
minor exception handling in case there is no prefix and a root bucket is passed + if [[ "$TRAININGLOC" == *"/"* ]] + then + TRAINING_PREFIX=${TRAININGLOC#*/} + else + TRAINING_PREFIX="" + fi + + ##check if custom autorun script exists in s3 training bucket. If not, use default in this repo + aws s3api head-object --bucket $TRAINING_BUCKET --key $TRAINING_PREFIX/autorun.sh || not_exist=true + if [ $not_exist ]; then + echo "custom file does not exist, using local copy" + else + echo "custom script does exist, use it" + aws s3 cp s3://$TRAINING_LOC/autorun.sh $INSTALL_DIR/bin/autorun.sh + fi + chmod +x $INSTALL_DIR/bin/autorun.sh + bash -c "source $INSTALL_DIR/bin/autorun.sh" +fi + diff --git a/utils/sample-createspot.sh b/utils/sample-createspot.sh new file mode 100644 index 00000000..d17d0fe6 --- /dev/null +++ b/utils/sample-createspot.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash + +## This is sample code that will generally show you how to launch a spot instance on aws and leverage the +## automation built into deepracer-for-cloud to automatically start training +## Changes required to work: +## Input location where your training will take place -- S3_LOCATION +## Input security group, iam role, and key-name + +## First you need to tell the script where in s3 your training will take place +## can be either a bucket at the root level, or a bucket/prefix. don't include the s3:// + +S3_LOCATION=<#########> + +## extract bucket location +BUCKET=${S3_LOCATION%%/*} + +## extract prefix location +if [[ "$S3_LOCATION" == *"/"* ]] +then + PREFIX=${S3_LOCATION#*/} +else + PREFIX="" +fi + +## Fill these out with your custom information if you want to upload and submit to leaderboard. not required to run +DR_UPLOAD_S3_PREFIX=######## + +## set the instance type you want to launch +INSTANCE_TYPE=c5.2xlarge + +## if you want to modify additional variables from the default, add them here, then add them to section further below called replace static paramamters. 
I've only done World name for now +WORLD_NAME=FS_June2020 + +## modify this if you want additional robomaker workers +DR_WORKERS=1 + +## select which images you want to use. these will be used later for a docker pull +DR_SAGEMAKER_IMAGE=cpu-avx-mkl +DR_ROBOMAKER_IMAGE=cpu-avx2 + +## check the s3 location for existing training folders +## automatically determine the latest training run (highest number), and set model parameters accordingly +## this script assumes the format rl-deepracer-1, rl-deepracer-2, etc. you will need to modify if your schema differs + +LAST_TRAINING=$(aws s3 ls $S3_LOCATION/rl-deepracer | sort -t - -k 3 -g | tail -n 1 | awk '{print $2}') +## drop trailing slash +LAST_TRAINING=$(echo $LAST_TRAINING | sed 's:/*$::') + +CONFIG_FILE="./run.env" +OLD_SYSTEMENV="./system.env" + +## incorporate logic from increment.sh, slightly modified to use last training +OPT_DELIM='-' +## Read in data +CURRENT_RUN_MODEL=$(aws s3 ls $S3_LOCATION/rl-deepracer | sort -t - -k 3 -g | tail -n 1 | awk '{print $2}') +## drop trailing slash +CURRENT_RUN_MODEL=$(echo $LAST_TRAINING | sed 's:/*$::') +## get number at the end +CURRENT_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL}" | \ + awk -v DELIM="${OPT_DELIM}" '{ n=split($0,a,DELIM); if (a[n] ~ /[0-9]*/) print a[n]; else print ""; }') + +if [ -z $LAST_TRAINING ] +then + echo No prior training found + if [[ $PREFIX == "" ]] + then + NEW_RUN_MODEL=rl-deepracer-1 + else + NEW_RUN_MODEL="$PREFIX/rl-deepracer-1" + fi + PRETRAINED=False + CURRENT_RUN_MODEL=$NEW_RUN_MODEL +else + + NEW_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL_NUM} + 1" | bc ) + PRETRAINED=True + + if [[ $PREFIX == "" ]] + then + NEW_RUN_MODEL=$(echo $CURRENT_RUN_MODEL | sed "s/${CURRENT_RUN_MODEL_NUM}\$/${NEW_RUN_MODEL_NUM}/") + else + NEW_RUN_MODEL=$(echo $CURRENT_RUN_MODEL | sed "s/${CURRENT_RUN_MODEL_NUM}\$/${NEW_RUN_MODEL_NUM}/") + NEW_RUN_MODEL="$PREFIX/$NEW_RUN_MODEL" + CURRENT_RUN_MODEL="$PREFIX/$CURRENT_RUN_MODEL" + fi + echo Last training was 
$CURRENT_RUN_MODEL so next training is $NEW_RUN_MODEL +fi + +## Replace dynamic parameters in run.env (still local to your directory) +sed -i.bak -re "s:(DR_LOCAL_S3_PRETRAINED_PREFIX=).*$:\1$CURRENT_RUN_MODEL:g; s:(DR_LOCAL_S3_PRETRAINED=).*$:\1$PRETRAINED:g; ; s:(DR_LOCAL_S3_MODEL_PREFIX=).*$:\1$NEW_RUN_MODEL:g" "$CONFIG_FILE" && echo "Done." + +## Replace static parameters in run.env (still local to your directory) +sed -i.bak -re "s/(DR_UPLOAD_S3_PREFIX=).*$/\1$DR_UPLOAD_S3_PREFIX/g" "$CONFIG_FILE" +sed -i.bak -re "s/(DR_WORLD_NAME=).*$/\1$WORLD_NAME/g" "$CONFIG_FILE" + +## Replace static paramaters in system.env file, including sagemaker and robomaker images (still local to your directory) and the number of DR_workers +sed -i.bak -re "s/(DR_UPLOAD_S3_BUCKET=).*$/\1$DR_UPLOAD_S3_BUCKET/g; s/(DR_SAGEMAKER_IMAGE=).*$/\1$DR_SAGEMAKER_IMAGE/g; s/(DR_ROBOMAKER_IMAGE=).*$/\1$DR_ROBOMAKER_IMAGE/g; s/(DR_WORKERS=).*$/\1$DR_WORKERS/g" "$OLD_SYSTEMENV" + +## upload the new run.env and system.env files into your S3 bucket (same s3 location identified earlier) +## files are loaded into the node-config folder/prefix. You can also upload other files to node config, and they +## will sync to the EC2 instance as part of the autorun script later. If you add other files, make sure they are +## in node-config in the same directory structure as DRfc; example: s3location/node-config/scripts/training/.start.sh +RUNENV_LOCATION=$S3_LOCATION/node-config/run.env +SYSENV_LOCATION=$S3_LOCATION/node-config/system.env + +aws s3 cp ./run.env s3://$RUNENV_LOCATION +aws s3 cp ./system.env s3://$SYSENV_LOCATION + +## upload a custom autorun script to S3. 
there is a default autorun script in the repo that will be used unless a custom one is specified here instead +#aws s3 cp ./autorun.sh s3://$S3_LOCATION/autorun.sh + +## upload custom files -- if you dont want this, comment these lines out +aws s3 cp ./model_metadata.json s3://$S3_LOCATION/custom_files/model_metadata.json +aws s3 cp ./reward_function.py s3://$S3_LOCATION/custom_files/reward_function.py +aws s3 cp ./hyperparameters.json s3://$S3_LOCATION/custom_files/hyperparameters.json + +## launch an ec2 +## update with your own settings, including key-name, security-group, and iam-instance-profile at a minimum +## user data includes a command to create a .txt file which simply contains the name of the s3 location +## this filename will be used as fundamental input to autorun.sh script run later on that instance +## you need to ensure you have proper IAM permissions to launch this instance + +aws ec2 run-instances \ + --image-id ami-085925f297f89fce1 \ + --count 1 \ + --instance-type $INSTANCE_TYPE \ + --key-name <####keyname####> \ + --security-group-ids sg-<####sgid####> \ + --block-device-mappings 'DeviceName=/dev/sda1,Ebs={DeleteOnTermination=true,VolumeSize=40}' \ + --iam-instance-profile Arn=arn:aws:iam::<####acct_num####>:instance-profile/<####role_name####> \ + --instance-market-options MarketType=spot \ + --user-data "#!/bin/bash + su -c 'git clone https://github.com/larsll/deepracer-for-cloud.git && echo "$S3_LOCATION/node-config" > /home/ubuntu/deepracer-for-cloud/bin/s3_training_location.txt && /home/ubuntu/deepracer-for-cloud/bin/prepare.sh' - ubuntu" From cf66db0840621bdeeb5a52321b89001635ff526a Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 14 Jun 2020 08:43:17 +0000 Subject: [PATCH 087/428] Fixing filepath for autorun url --- .gitignore | 3 ++- bin/autorun.sh | 2 +- bin/init.sh | 6 ++---- utils/sample-createspot.sh | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 
0b814f8b..923271bc 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ recording /*.bak DONE data/ -tmp/ \ No newline at end of file +tmp/ +autorun.s3url \ No newline at end of file diff --git a/bin/autorun.sh b/bin/autorun.sh index 0a8da5df..b8c1d399 100644 --- a/bin/autorun.sh +++ b/bin/autorun.sh @@ -8,7 +8,7 @@ INSTALL_DIR_TEMP="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." >/dev/null 2>&1 && ## retrieve the s3_location name you sent the instance in user data launch ## assumed to first line of file -S3_LOCATION=$(awk 'NR==1 {print; exit}' $INSTALL_DIR_TEMP/bin/s3_training_location.txt) +S3_LOCATION=$(awk 'NR==1 {print; exit}' $INSTALL_DIR_TEMP/autorun.s3url) source $INSTALL_DIR_TEMP/bin/activate.sh diff --git a/bin/init.sh b/bin/init.sh index 8dae4a8a..82afaedb 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -156,15 +156,13 @@ fi # mark as done date | tee $INSTALL_DIR/DONE - - ## Optional auturun feature # if using automation scripts to auto configure and run # you must pass s3_training_location.txt to this instance in order for this to work -if [[ -f "$INSTALL_DIR/bin/s3_training_location.txt" ]] +if [[ -f "$INSTALL_DIR/autorun.s3url" ]] then ## read in first line. 
first line always assumed to be training location regardless what else is in file - TRAINING_LOC=$(awk 'NR==1 {print; exit}' $INSTALL_DIR/bin/s3_training_location.txt) + TRAINING_LOC=$(awk 'NR==1 {print; exit}' $INSTALL_DIR/autorun.s3url) #get bucket name TRAINING_BUCKET=${TRAINING_LOC%%/*} diff --git a/utils/sample-createspot.sh b/utils/sample-createspot.sh index d17d0fe6..19cf8e9f 100644 --- a/utils/sample-createspot.sh +++ b/utils/sample-createspot.sh @@ -130,4 +130,4 @@ aws ec2 run-instances \ --iam-instance-profile Arn=arn:aws:iam::<####acct_num####>:instance-profile/<####role_name####> \ --instance-market-options MarketType=spot \ --user-data "#!/bin/bash - su -c 'git clone https://github.com/larsll/deepracer-for-cloud.git && echo "$S3_LOCATION/node-config" > /home/ubuntu/deepracer-for-cloud/bin/s3_training_location.txt && /home/ubuntu/deepracer-for-cloud/bin/prepare.sh' - ubuntu" + su -c 'git clone https://github.com/larsll/deepracer-for-cloud.git && echo "$S3_LOCATION/node-config" > /home/ubuntu/deepracer-for-cloud/autorun.s3url && /home/ubuntu/deepracer-for-cloud/bin/prepare.sh' - ubuntu" From 8053fbb0d4b60175218e2620d88a9e49e683c109 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Mon, 15 Jun 2020 20:16:10 +0200 Subject: [PATCH 088/428] Fix minio and docker-compose (#51) * Bugfix to ensure minio also works with compose-mode --- bin/activate.sh | 10 ++++++++-- bin/scripts_wrapper.sh | 14 ++++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 20f6c763..0e030b4e 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -108,7 +108,7 @@ if [[ "${DR_CLOUD_WATCH_ENABLE,,}" == "true" ]]; then fi ## Check if we have an AWS IAM assumed role, or if we need to set specific credentials. 
-if [ $(aws sts get-caller-identity | jq '.Arn' | awk /assumed-role/ | wc -l) -eq 0 ]; +if [ $(aws sts get-caller-identity 2> /dev/null | jq '.Arn' | awk /assumed-role/ | wc -l ) -eq 0 ]; then export DR_LOCAL_ACCESS_KEY_ID=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_access_key_id | xargs) export DR_LOCAL_SECRET_ACCESS_KEY=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) @@ -129,7 +129,13 @@ if [[ -n "${DR_MINIO_COMPOSE_FILE}" ]]; then export MINIO_USERNAME=$(id -u -n) export MINIO_GID=$(id -g) export MINIO_GROUPNAME=$(id -g -n) - docker stack deploy $DR_MINIO_COMPOSE_FILE s3 + if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; + then + docker stack deploy $DR_MINIO_COMPOSE_FILE s3 + else + docker-compose $DR_MINIO_COMPOSE_FILE -p s3 --log-level ERROR up -d + fi + fi source $SCRIPT_DIR/scripts_wrapper.sh diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index 19df9b1a..cf0205ff 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -3,7 +3,12 @@ function dr-upload-custom-files { if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; then - docker stack deploy $DR_MINIO_COMPOSE_FILE s3 + if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; + then + docker stack deploy $DR_MINIO_COMPOSE_FILE s3 + else + docker-compose $DR_MINIO_COMPOSE_FILE -p s3 --log-level ERROR up -d + fi fi eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Uploading files to $CUSTOM_TARGET" @@ -25,7 +30,12 @@ function dr-set-upload-model { function dr-download-custom-files { if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; then - docker stack deploy $DR_MINIO_COMPOSE_FILE s3 + if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; + then + docker stack deploy $DR_MINIO_COMPOSE_FILE s3 + else + docker-compose $DR_MINIO_COMPOSE_FILE -p s3 --log-level ERROR up -d + fi fi eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Downloading files from 
$CUSTOM_TARGET" From 5e3da0fa26fee3fbbfb2a7700b22341c63725d76 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Tue, 16 Jun 2020 22:37:24 +0200 Subject: [PATCH 089/428] Enabling adjustment of round robin start pos (#52) --- defaults/template-run.env | 5 +++-- scripts/training/prepare-config.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/defaults/template-run.env b/defaults/template-run.env index 8e9032d1..62c862b0 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -1,7 +1,5 @@ DR_RUN_ID=0 DR_WORLD_NAME=reInvent2019_track -DR_CHANGE_START_POSITION=True -DR_ALTERNATE_DRIVING_DIRECTION=False DR_RACE_TYPE=TIME_TRIAL DR_CAR_COLOR=Red DR_CAR_NAME=FastCar @@ -15,6 +13,9 @@ DR_EVAL_IS_CONTINUOUS=False DR_EVAL_OFF_TRACK_PENALTY=5.0 DR_EVAL_COLLISION_PENALTY=5.0 DR_EVAL_SAVE_MP4=False +DR_TRAIN_CHANGE_START_POSITION=True +DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False +DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05 DR_LOCAL_S3_PRETRAINED=False DR_LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained DR_LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index 07996272..9f7d2219 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -39,8 +39,9 @@ config['DISPLAY_NAME'] = os.environ.get('DR_DISPLAY_NAME', 'racer1') config['RACER_NAME'] = os.environ.get('DR_RACER_NAME', 'racer1') -config['ALTERNATE_DRIVING_DIRECTION'] = os.environ.get('DR_ALTERNATE_DRIVING_DIRECTION', 'false') -config['CHANGE_START_POSITION'] = os.environ.get('DR_CHANGE_START_POSITION', 'true') +config['ALTERNATE_DRIVING_DIRECTION'] = os.environ.get('DR_TRAIN_ALTERNATE_DRIVING_DIRECTION', os.environ.get('DR_ALTERNATE_DRIVING_DIRECTION', 'false')) +config['CHANGE_START_POSITION'] = os.environ.get('DR_TRAIN_CHANGE_START_POSITION', os.environ.get('DR_CHANGE_START_POSITION', 'true')) 
+config['ROUND_ROBIN_ADVANCE_DIST'] = os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST', '0.05') config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') # Object Avoidance From 61078e3e5dab0dbbd3bbeca9e95b2e9b14fe9852 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Wed, 17 Jun 2020 22:57:54 +0200 Subject: [PATCH 090/428] Completing XAUTH setup (#53) --- docker/docker-compose-eval.yml | 1 - docker/docker-compose-local-xorg.yml | 4 +++- docker/docker-compose-training.yml | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/docker-compose-eval.yml b/docker/docker-compose-eval.yml index 6894199b..680dc21b 100644 --- a/docker/docker-compose-eval.yml +++ b/docker/docker-compose-eval.yml @@ -15,7 +15,6 @@ services: ports: - "${DR_ROBOMAKER_PORT}:8080" environment: - - XAUTHORITY=/root/.Xauthority - DISPLAY_N=:0 - WORLD_NAME=${DR_WORLD_NAME} - NUMBER_OF_TRIALS=${DR_NUMBER_OF_EPISODES} diff --git a/docker/docker-compose-local-xorg.yml b/docker/docker-compose-local-xorg.yml index 5565359d..b4ba5663 100644 --- a/docker/docker-compose-local-xorg.yml +++ b/docker/docker-compose-local-xorg.yml @@ -6,5 +6,7 @@ services: environment: - DISPLAY - USE_EXTERNAL_X=${DR_HOST_X} + - XAUTHORITY=/root/.Xauthority volumes: - - '/tmp/.X11-unix/:/tmp/.X11-unix' \ No newline at end of file + - '/tmp/.X11-unix/:/tmp/.X11-unix' + - '${XAUTHORITY}:/root/.Xauthority' \ No newline at end of file diff --git a/docker/docker-compose-training.yml b/docker/docker-compose-training.yml index 521c8903..2c0ff433 100644 --- a/docker/docker-compose-training.yml +++ b/docker/docker-compose-training.yml @@ -27,7 +27,6 @@ services: - "${DR_ROBOMAKER_PORT}:8080" - "${DR_ROBOMAKER_GUI_PORT}:5900" environment: - - XAUTHORITY=/root/.Xauthority - DISPLAY_N=:0 - WORLD_NAME=${DR_WORLD_NAME} - SAGEMAKER_SHARED_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} From a7062d14e4547d9e1e8be5e11bad8347e59754f8 Mon Sep 
17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Wed, 17 Jun 2020 23:28:11 +0200 Subject: [PATCH 091/428] Update to Documentation (#54) --- README.md | 36 +++++++++++++++++++++++++++--- docs/index.md | 39 ++++++++++++++++++++++++++++---- docs/installation.md | 53 +++++++++++++++++++++++++++++++++----------- docs/opengl.md | 31 ++++++++++++++++++++++++++ 4 files changed, 139 insertions(+), 20 deletions(-) create mode 100644 docs/opengl.md diff --git a/README.md b/README.md index ae96cf47..dfff9ec5 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,9 @@ # DeepRacer-For-Cloud Provides a quick and easy way to get up and running with a DeepRacer training environment in Azure or AWS, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing). -This repo started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://gitbub.com/larsll/deepracer-build) repository. +## Introduction + +DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. 
This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://gitbub.com/larsll/deepracer-build) repository. Main differences to the work done by Alex is: * Runtime S3 storage is setup to fit the connected cloud platform: @@ -14,8 +16,36 @@ Main differences to the work done by Alex is: * `run.env` contains user session configuration (pretraining, track etc.) as well as information about where to upload your model (S3 bucket and prefix). * `docker/.env` remains the home for more static configuration. This is not expected to change between sessions. -## Features +## Main Features + +DRfC supports a wide set of features to ensure that you can focus on creating the best model: +* User-friendly + * Based on the continously updated community [Robomaker](https://github.com/aws-deepracer-community/deepracer-simapp) and [Sagemaker](https://github.com/aws-deepracer-community/deepracer-sagemaker-container) containers, supporting a wide range of CPU and GPU setups. + * Wide set of scripts (`dr-*`) enables effortless training. + * Detection of your AWS DeepRacer Console models; allows upload of a locally trained model to any of them. +* Modes + * Time Trial + * Object Avoidance + * Head-to-Bot +* Training + * Multiple Robomaker instances per Sagemaker (N:1) to improve training progress. + * Multiple training sessions in parallel - each being (N:1) if hardware supports it - to test out things in parallel. + * Connect multiple nodes together (Swarm-mode only) to combine the powers of multiple computers/instances. +* Evaluation + * Evaluate independently from training. + * Save evaluation run to MP4 file in S3. +* Logging + * Training metrics and trace files are stored to S3. + * Optional integration with AWS CloudWatch. + * Optional exposure of Robomaker internal log-files. 
+* Technology + * Supports both Docker Swarm (used for connecting multiple nodes together) and Docker Compose (used to support OpenGL) ## Documentation -Full documentation can be found on the [Deepracer-for-Cloud GitHub Pages](https://larsll.github.io/deepracer-for-cloud) +Full documentation can be found on the [Deepracer-for-Cloud GitHub Pages](https://larsll.github.io/deepracer-for-cloud). + +## Support + +* For general support it is suggested to join the [AWS DeepRacing Community](https://deepracing.io/). The Community Slack has a channel #dr-drfc-setup where the community provides active support. +* Create a GitHub issue if you find an actual code issue, or where updates to documentation would be required. diff --git a/docs/index.md b/docs/index.md index 30c1729a..536fe0b0 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,21 +1,52 @@ # Introduction -Provides a quick and easy way to get up and running with a DeepRacer training environment in Azure or AWS, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing). +Provides a quick and easy way to get up and running with a DeepRacer training environment in AWS or Azure, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing), or locally on your own desktop or server. -This repo started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. 
This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://gitbub.com/larsll/deepracer-build) repository. +DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://gitbub.com/larsll/deepracer-build) repository. Main differences to the work done by Alex is: * Runtime S3 storage is setup to fit the connected cloud platform: * Azure: Local 'virtual' S3 instance (minio) is now using an Azure Storage Account / Blob Storage as a back-end. This allows for access between sesssions using e.g. Storage Explorer (https://azure.microsoft.com/en-us/features/storage-explorer/). * AWS: Directly connects to a real S3 bucket. + * Local: Local 'virtual' S3 instance (minio) storing files locally on the server. * Robomaker and Log Analysis containers are extended with required drivers to enable Tensorflow to use the GPU. Containers are all pre-compiled and available from Docker Hub. * Configuration has been reorganized : * `custom_files/hyperparameters.json` stores the runtime hyperparameters, which logically belongs together with the model_metadata.json and rewards.py files. * `system.env` contains system-wide constants (expected to be configured only at setup) * `run.env` contains user session configuration (pretraining, track etc.) as well as information about where to upload your model (S3 bucket and prefix). 
- * `docker/.env` remains the home for more static configuration. This is not expected to change between sessions. + +# Main Features + +DRfC supports a wide set of features to ensure that you can focus on creating the best model: +* User-friendly + * Based on the continously updated community [Robomaker](https://github.com/aws-deepracer-community/deepracer-simapp) and [Sagemaker](https://github.com/aws-deepracer-community/deepracer-sagemaker-container) containers, supporting a wide range of CPU and GPU setups. + * Wide set of scripts (`dr-*`) enables effortless training. + * Detection of your AWS DeepRacer Console models; allows upload of a locally trained model to any of them. +* Modes + * Time Trial + * Object Avoidance + * Head-to-Bot +* Training + * Multiple Robomaker instances per Sagemaker (N:1) to improve training progress. + * Multiple training sessions in parallel - each being (N:1) if hardware supports it - to test out things in parallel. + * Connect multiple nodes together (Swarm-mode only) to combine the powers of multiple computers/instances. +* Evaluation + * Evaluate independently from training. + * Save evaluation run to MP4 file in S3. +* Logging + * Training metrics and trace files are stored to S3. + * Optional integration with AWS CloudWatch. + * Optional exposure of Robomaker internal log-files. +* Technology + * Supports both Docker Swarm (used for connecting multiple nodes together) and Docker Compose (used to support OpenGL) # Documentation * [Initial Installation](installation) -* [Reference](reference) \ No newline at end of file +* [Reference](reference) +* [GPU Accelerated OpenGL for Robomaker](opengl) + +# Support + +* For general support it is suggested to join the [AWS DeepRacing Community](https://deepracing.io/). The Community Slack has a channel #dr-drfc-setup where the community provides active support. +* Create a GitHub issue if you find an actual code issue, or where updates to documentation would be required. 
diff --git a/docs/installation.md b/docs/installation.md index 0d407954..ff60fc34 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -2,14 +2,14 @@ ## Requirements -Depending on your needs as well as specific needs of the cloud platform you can configure your VM to your liking. +Depending on your needs as well as specific needs of the cloud platform you can configure your VM to your liking. Both CPU-only as well as GPU systems are supported. **AWS**: -* EC2 instance of type G3, G4, P2 or P3 - recommendation is g4dn.2xlarge +* EC2 instance of type G3, G4, P2 or P3 - recommendation is g4dn.2xlarge - for GPU enabled training. C5 or M6 types - recommendation is c5.2xlarge - for CPU training. * Ubuntu 18.04 * Minimum 30 GB, preferred 40 GB of OS disk. * Ephemeral Drive connected - * Minimum 8 GB GPU-RAM + * Minimum of 8 GB GPU-RAM if running with GPU. * Recommended at least 6 VCPUs * S3 bucket. Preferrably in same region as EC2 instance. @@ -21,7 +21,14 @@ Depending on your needs as well as specific needs of the cloud platform you can * Minimum 8 GB GPU-RAM * Recommended at least 6 VCPUs * Storage Account with one Blob container configured for Access Key authentication. - + +**Local**: +* A modern, comparatively powerful, Intel based system. + * Ubuntu 18.04 or 20.04 - Windows not supported, other Linux-dristros likely to work. + * 4 core-CPU, equivalent to 8 vCPUs; the more the better. + * NVIDIA Graphics adapter with minimum 8 GB RAM for Sagemaker to run GPU. Robomaker enabled GPU instances need ~1 GB each. + * System RAM + GPU RAM should be at least 32 GB. + ## Installation The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine. @@ -46,13 +53,15 @@ The Init Script takes a few parameters: ## Environment Setup -The environment is set via the `CLOUD` parameter in `system.env`; it can be `Azure`, `AWS` or `Local`. It is case-insensitive. 
Depending on the value the virtual or native S3 instance will be configured accordingly. +The initialization script will attempt to auto-detect your environment (`Azure`, `AWS` or `Local`), and store the outcome in the `DR_CLOUD` parameter in `system.env`. You can also pass in a `-c ` parameter to override it, e.g. if you want to run the minio-based `local` mode in the cloud. + +The main difference between the mode is based on authentication mechanisms and type of storage being configured. The next chapters will review each type of environment on its own. ### AWS In AWS it is possible to set up authentication to S3 in two ways: Integrated sign-on using [IAM Roles](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html) or using access keys. -#### IAM Roles +#### IAM Role To use IAM Roles: * An empty S3 bucket in the same region as the EC2 instance. @@ -60,13 +69,14 @@ To use IAM Roles: * Access both the *new* S3 bucket as well as the DeepRacer bucket. * AmazonVPCReadOnlyAccess * AmazonKinesisVideoStreamsFullAccess if you want to stream to Kinesis -* An EC2 instance with the IAM Role assigned. + * CloudWatch +* An EC2 instance with the defined IAM Role assigned. * Configure `run.env` as follows: * `DR_LOCAL_S3_PROFILE=default` * `DR_LOCAL_S3_BUCKET=` * `DR_UPLOAD_S3_PROFILE=default` * `DR_UPLOAD_S3_BUCKET=` -* Run `dr-update-env` for configuration to take effect. +* Run `dr-update` for configuration to take effect. #### Manual setup For access with IAM user: @@ -86,6 +96,7 @@ For access with IAM user: In Azure mode the script-set requires the following: * A storage account with a blob container set up with access keys: * Use `aws configure --profile ` to configure this into a specific profile. + * `` can be defined by the user, but do not use `default`. * Access Key ID is the Storage Account name. * Secret Access Key is the Access Key for the Storage Account. * The blob container is equivalent to the S3 bucket. 
@@ -105,13 +116,29 @@ If you want to use awscli (`aws`) to manually move files then use `aws $DR_LOCAL Local mode runs a minio server that hosts the data in the `docker/volumes` directory. It is otherwise command-compatible with the Azure setup; as the data is accessible via Minio and not via native S3. -After having run init.sh do the following: +In Local mode the script-set requires the following: * Configure the Minio credentials with `aws configure --profile minio`. The default configuration will use the `minio` profile to configure MINIO. You can choose any username or password, but username needs to be at least length 3, and password at least length 8. -* Configure your normal AWS credentials with `aws configure` if this is not already in place on your system. This is required to use the model upload functionality. +* A real AWS IAM user configured with `aws configure` to enable upload of models into AWS DeepRacer. +* Configure `run.env` as follows: + * `DR_LOCAL_S3_PROFILE=minio` + * `DR_LOCAL_S3_BUCKET=bucket` + * `DR_UPLOAD_S3_PROFILE=default` + * `DR_UPLOAD_S3_BUCKET=` +* Run `dr-update` for configuration to take effect. + +## First Run + +For the first run the following final steps are needed. This creates a training run with all default values in -## Basic Usage +* Define your custom files in `custom_files/` - samples can be found in `defaults` which you must copy over: + * `hyperparameters.json` - definining the training hyperparameters + * `model_metadata.json` - defining the action space and sensors + * `reward_function.py` - defining the reward function +* Upload the files into the bucket with `dr-upload-custom-files`. This will also start minio if required. +* Start training with `dr-start-training` -Before every session run `dr-update` to ensure that the environment variables are set correctly. This also creates a set of aliases/commands that makes it easier to operate the setup. 
If `dr-update` is not found, try `source activate.sh` to get aliases defined. +After a while you will see the sagemaker logs on the screen. -Ensure that the configuration files are uploaded into the bucket `dr-upload-custom-files`. Start a training with `dr-start-training`. +## Troubleshooting +If things do not start as expected - e.g. you get a message "Sagemaker is not running" then run `docker ps -a` to see if the containers are running or if they stopped due to errors. You can use `docker logs -f ` to check the errors. diff --git a/docs/opengl.md b/docs/opengl.md new file mode 100644 index 00000000..78f8b3d6 --- /dev/null +++ b/docs/opengl.md @@ -0,0 +1,31 @@ +# GPU Accelerated OpenGL for Robomaker + +One way to improve performance, especially of Robomaker, is to enable GPU-accelerated OpenGL. OpenGL can significantly improve Gazebo performance, even where the GPU does not have enough GPU RAM, or is too old, to support Tensorflow. + +## Desktop + +On a Ubuntu desktop running Unity there are hardly any additional steps required. + +* Ensure that a recent Nvidia driver is installed and is running. +* Ensure that nvidia-docker is installed; review `bin/prepare.sh` for steps if you do not want to directly run the script. +* Configure DRfC using the following settings in `system.env`: + * `DR_DOCKER_STYLE=compose`; Docker Swarm does not support the `ipc=host` option required for X-Windows to work properly. + * `DR_HOST_X=True`; uses the local X server rather than starting one within the docker container. + * `DR_ROBOMAKER_IMAGE`; choose the tag for an OpenGL enabled image - e.g. `cpu-gl-avx` for an image where Tensorflow will use CPU or `gpu-gl` for an image where also Tensorflow will use the GPU. + +With recent Nvidia drivers you can comfirm that the setup is working by running `nvidia-smi` on the host and see that `gzserver` is listed as running on the GPU. Older drivers (e.g. 390 for NVS 315) may not support showing which processes are running on the GPU. 
+ +## Headless Server + +Also a headless server with a GPU, e.g. an EC2 instance, or a local computer with a displayless GPU (e.g. Tesla K40, K80, M40). + +* Ensure that a Nvidia driver and nvidia-docker is installed; review `bin/prepare.sh` for steps if you do not want to directly run the script. +* Setup an X-server on the host. `utils\setup-xorg.sh` is a basic installation script. +* Configure DRfC using the following settings in `system.env`: + * `DR_DOCKER_STYLE=compose`; Docker Swarm does not support the `ipc=host` option required for X-Windows to work properly. + * `DR_HOST_X=True`; uses the local X server rather than starting one within the docker container. + * `DR_ROBOMAKER_IMAGE`; choose the tag for an OpenGL enabled image - e.g. `cpu-gl-avx` for an image where Tensorflow will use CPU or `gpu-gl` for an image where also Tensorflow will use the GPU. + +Before training ensure that the server is running, including VNC if you want to connect. `utils\start-xorg.sh` is provided as sample. + +With recent Nvidia drivers you can confirm that the setup is working by running `nvidia-smi` on the host and see that `gzserver` is listed as running on the GPU. From 623be4de386da916c2f8a0929f9952954878dabc Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 19 Jun 2020 18:00:07 +0000 Subject: [PATCH 092/428] Fix issue if aws is configured with table output. --- bin/activate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/activate.sh b/bin/activate.sh index 0e030b4e..0a69247e 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -108,7 +108,7 @@ if [[ "${DR_CLOUD_WATCH_ENABLE,,}" == "true" ]]; then fi ## Check if we have an AWS IAM assumed role, or if we need to set specific credentials. 
-if [ $(aws sts get-caller-identity 2> /dev/null | jq '.Arn' | awk /assumed-role/ | wc -l ) -eq 0 ]; +if [ $(aws --output json sts get-caller-identity 2> /dev/null | jq '.Arn' | awk /assumed-role/ | wc -l ) -eq 0 ]; then export DR_LOCAL_ACCESS_KEY_ID=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_access_key_id | xargs) export DR_LOCAL_SECRET_ACCESS_KEY=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) From 0b1396b0a815226016003bce5c3f8e0f10e328b4 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 19 Jun 2020 18:12:13 +0000 Subject: [PATCH 093/428] Fix init.sh for azure --- bin/init.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bin/init.sh b/bin/init.sh index 82afaedb..31b3739b 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -99,6 +99,11 @@ if [[ "${OPT_CLOUD}" == "aws" ]]; then sed -i "s//not-defined/g" $INSTALL_DIR/system.env fi sed -i "s//default/g" $INSTALL_DIR/system.env +elif [[ "${OPT_CLOUD}" == "azure" ]]; then + AWS_REGION="us-east-1" + sed -i "s//azure/g" $INSTALL_DIR/system.env + sed -i "s//not-defined/g" $INSTALL_DIR/system.env + echo "Please run 'aws configure --profile azure' to set the credentials" else AWS_REGION="us-east-1" sed -i "s//minio/g" $INSTALL_DIR/system.env From 3acc72820ee9b1d3994940c0b225b71aa50bd76d Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 19 Jun 2020 20:37:38 +0000 Subject: [PATCH 094/428] Documentation update --- docs/_config.yml | 5 +++++ docs/docker.md | 49 ++++++++++++++++++++++++++++++++++++++++++++ docs/index.md | 8 +++++--- docs/multi_run.md | 17 +++++++++++++++ docs/multi_worker.md | 17 +++++++++++++++ 5 files changed, 93 insertions(+), 3 deletions(-) create mode 100644 docs/docker.md create mode 100644 docs/multi_run.md create mode 100644 docs/multi_worker.md diff --git a/docs/_config.yml b/docs/_config.yml index c0e30f22..5c24e7b9 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -2,3 +2,8 @@ theme: jekyll-theme-slate markdown: GFM 
name: Deepracer-for-Cloud +plugins: + - jekyll-relative-links +relative_links: + enabled: true + collections: false \ No newline at end of file diff --git a/docs/docker.md b/docs/docker.md new file mode 100644 index 00000000..2986d20a --- /dev/null +++ b/docs/docker.md @@ -0,0 +1,49 @@ +# About the Docker setup + +DRfC supports running Docker in to modes `swarm` and `compose` - this behaviour is configured in `system.env` through `DR_DOCKER_STYLE`. + +## Swarm Mode + +Docker Swarm mode is the default. Docker Swarm makes it possible to connect multiple hosts together to spread the load -- esp. useful if one wants to run multiple Robomaker workers, but can also be useful locally if one has two computers that each are not powerful enough to run DeepRacer. + +In Swarm mode DRfC creates Stacks, using `docker stack`. During operations one can check running stacks through `docker stack ls`, and running services through `docker stack ls`. + +DRfC is installed only on the manager. (The first installed host.) Swarm workers are 'dumb' and do not need to have DRfC installed. + +### Key features + +* Allows user to connect multiple computers on the same network. (In AWS the instances must be connected on same VPC, and instances must be allowed to communicate.) +* Supports [multiple Robomaker workers](multi_worker.md) +* Supports [running multiple parallel experiments](multi_run.md) + +### Limitations + +* The Sagemaker container can only be run on the manager. +* OpenGL is not supported as Swarm does not support `ipc=host`. + +### Connecting Workers + +* On the manager run `docker swarm join-token manager`. +* On the worker run the command that was displayed on the manager `docker swarm join --token :`. + +### Ports + +Docker Swarm will automatically put a load-balancer in front of all replicas in a service. This means that the ROS Web View, which provides a video stream of the DeepRacer during training, will be load balanced - sharing one port (`8080`). 
If you have multiple workers (even across multiple hosts) then press F5 to cycle through them. + +## Compose Mode + +In Compose mode DRfC creates Services, using `docker-compose`. During operations one can check running stacks through `docker service ls`, and running services through `docker service ps`. + +### Key features + +* Supports [multiple Robomaker workers](multi_worker.md) +* Supports [running multiple parallel experiments](multi_run.md) +* Supports [GPU Accelerated OpenGL for Robomaker](opengl.md) + +### Limitations + +* Workload cannot be spread across multiple hosts. + +### Ports + +In the case of using Docker Compose the different Robomaker worker will require unique ports for ROS Web Vew and VNC. Docker will assign these dynamically. Use `docker ps` to see which container has been assigned which ports. \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 536fe0b0..4826ac80 100644 --- a/docs/index.md +++ b/docs/index.md @@ -42,9 +42,11 @@ DRfC supports a wide set of features to ensure that you can focus on creating th # Documentation -* [Initial Installation](installation) -* [Reference](reference) -* [GPU Accelerated OpenGL for Robomaker](opengl) +* [Initial Installation](installation.md) +* [Reference](reference.md) +* [Using multiple Robomaker workers](multi_worker.md) +* [Running multiple parallel experiments](multi_run.md) +* [GPU Accelerated OpenGL for Robomaker](opengl.md) # Support diff --git a/docs/multi_run.md b/docs/multi_run.md new file mode 100644 index 00000000..d0e6de38 --- /dev/null +++ b/docs/multi_run.md @@ -0,0 +1,17 @@ +# Running Multiple Experiments + +It is possible to run multiple experiments on one computer in parallel. This is possible both in `swarm` and `compose` mode, and is controlled by `DR_RUN_ID` in `run.env`. 
+ +The feature works by creating unique prefixes to the container names: +* In Swarm mode this is done through defining a stack name (default: deepracer-0) +* In Compose mode this is done through adding a project name. + +## Suggested way to use the feature + +By default `run.env` is loaded when DRfC is activated - but it is possible to load a separate configuration through `source bin/activate.sh `. + +The best way to use this feature is to have a bash-shell per experiment, and to load a separate configuration per shell. + +After activating one can control each experiment independently through using the `dr-*` commands. + +If using local or Azure the S3 / Minio instance will be shared, and is running only once. \ No newline at end of file diff --git a/docs/multi_worker.md b/docs/multi_worker.md new file mode 100644 index 00000000..8205c20f --- /dev/null +++ b/docs/multi_worker.md @@ -0,0 +1,17 @@ +# Using multiple Robomaker workers + +One way to accelerate training is to launch multiple Robomaker workers that feed into one Sagemaker instance. + +The number of workers is configured through setting `system.env` `DR_WORKERS` to the desired number of workers. The result is that the number of episodes (hyperparameter `num_episodes_between_training`) will be divivided over the number of workers. The theoretical maximum number of workers equals `num_episodes_between_training`. + +The training can be started as normal. + +## How many workers do I need? + +One Robomaker worker requires 2-4 vCPUs. Tests show that a `c5.4xlarge` instance can run 3 workers and the Sagemaker without a drop in performance. Using OpenGL images reduces the number of vCPUs required per worker. + +To avoid issues with the position from which evaluations are run ensure that `( num_episodes_between_training / DR_WORKERS) * DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST = 1.0`. + +Example: With 3 workers set `num_episodes_between_training: 30` and `DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.1`. 
+ +Note; Sagemaker will stop collecting experiences once you have reached 10.000 steps (3-layer CNN) in an iteration. For longer tracks with 600-1000 steps per completed episodes this will define the upper bound for the number of workers and episodes per iteration. \ No newline at end of file From 899110f84e82abcd948c45041b6ed32edfa111c4 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 20 Jun 2020 22:46:16 +0200 Subject: [PATCH 095/428] Fix Swarm upgrade issues (#59) --- bin/activate.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 0a69247e..fa5d6ddb 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -60,11 +60,18 @@ else fi # Check if we will use Docker Swarm or Docker Compose +# If not defined then use Swarm +if [[ -z "${DR_DOCKER_STYLE}" ]]; then + export DR_DOCKER_STYLE="swarm" +fi + if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then - export DR_DOCKER_FILE_SEP="-c" + export DR_DOCKER_FILE_SEP="-c" + SWARM_NODE=$(docker node inspect self | jq .[0].ID -r) + SWARM_NODE_UPDATE=$(docker node update --label-add Sagemaker=true $SWARM_NODE) else - export DR_DOCKER_FILE_SEP="-f" + export DR_DOCKER_FILE_SEP="-f" fi # Prepare the docker compose files depending on parameters From fe85fc16e8a8a657f6a996a50faf0e2f5b20c07e Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 25 Jun 2020 23:10:03 +0200 Subject: [PATCH 096/428] Adding loganalysis to sagemaker-local network --- scripts/log-analysis/start.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/log-analysis/start.sh b/scripts/log-analysis/start.sh index b7b2f1b7..368f73ae 100755 --- a/scripts/log-analysis/start.sh +++ b/scripts/log-analysis/start.sh @@ -5,4 +5,5 @@ docker run --rm -it -p "8888:8888" \ -v `pwd`/../../docker/volumes/.aws:/root/.aws \ -v `pwd`/../../data/analysis:/workspace/analysis \ --name loganalysis \ +--network sagemaker-local \ 
larsll/deepracer-loganalysis:v2-cpu From 2dbcb13c959caef7b66c3e35af0ef13f7c0e767b Mon Sep 17 00:00:00 2001 From: dartjason <51768630+dartjason@users.noreply.github.com> Date: Thu, 25 Jun 2020 17:18:21 -0400 Subject: [PATCH 097/428] Update sample-createspot.sh (#61) Bug fix -- add missing line to update DR_LOCAL_S3_BUCKET --- utils/sample-createspot.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/sample-createspot.sh b/utils/sample-createspot.sh index 19cf8e9f..c4331848 100644 --- a/utils/sample-createspot.sh +++ b/utils/sample-createspot.sh @@ -88,6 +88,7 @@ fi ## Replace dynamic parameters in run.env (still local to your directory) sed -i.bak -re "s:(DR_LOCAL_S3_PRETRAINED_PREFIX=).*$:\1$CURRENT_RUN_MODEL:g; s:(DR_LOCAL_S3_PRETRAINED=).*$:\1$PRETRAINED:g; ; s:(DR_LOCAL_S3_MODEL_PREFIX=).*$:\1$NEW_RUN_MODEL:g" "$CONFIG_FILE" && echo "Done." +sed -i.bak -re "s/(DR_LOCAL_S3_BUCKET=).*$/\1$BUCKET/g" "$CONFIG_FILE" ## Replace static parameters in run.env (still local to your directory) sed -i.bak -re "s/(DR_UPLOAD_S3_PREFIX=).*$/\1$DR_UPLOAD_S3_PREFIX/g" "$CONFIG_FILE" From d35932c14eb9be56260e005e8308bab78af846e1 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 28 Jun 2020 19:39:54 +0200 Subject: [PATCH 098/428] Fixing the Training start.sh output (#62) Refactoring of how training/start.sh and evaluation/start.sh interact with the different logging services. Ensures that gnome-terminal is only called if DR_HOST_X and DISPLAY has been set. If yes this will lead to separate terminal windows being opened with the log. Otherwise run log inline. Also adds quiet option to not output any logging at all. 
--- bin/scripts_wrapper.sh | 180 ++++++++++++++++++++++++++++++---- scripts/evaluation/start.sh | 41 ++++---- scripts/log-analysis/start.sh | 4 +- scripts/training/start.sh | 80 ++++++++------- 4 files changed, 233 insertions(+), 72 deletions(-) diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index cf0205ff..f9830d13 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -44,7 +44,7 @@ function dr-download-custom-files { function dr-start-training { dr-update-env - bash -c "cd $DIR/scripts/training && ./start.sh $@" + $DIR/scripts/training/start.sh "$@" } function dr-increment-training { @@ -57,7 +57,7 @@ function dr-stop-training { function dr-start-evaluation { dr-update-env - bash -c "cd $DIR/scripts/evaluation && ./start.sh $@" + $DIR/scripts/evaluation/start.sh "$@" } function dr-stop-evaluation { @@ -80,6 +80,62 @@ function dr-stop-loganalysis { function dr-logs-sagemaker { + local OPTIND + + while getopts ":w:" opt; do + case $opt in + w) OPT_WAIT=$OPTARG + ;; + \?) echo "Invalid option -$OPTARG" >&2 + ;; + esac + done + + SAGEMAKER_CONTAINER=$(dr-find-sagemaker) + + if [[ -z "$SAGEMAKER_CONTAINER" ]]; + then + if [[ -n "$OPT_WAIT" ]]; then + WAIT_TIME=$OPT_WAIT + echo "Waiting up to $WAIT_TIME seconds for Sagemaker to start up..." + until [ -n "$SAGEMAKER_CONTAINER" ] + do + sleep 1 + ((WAIT_TIME--)) + if [ "$WAIT_TIME" -lt 1 ]; then + echo "Sagemaker is not running." + exit 1 + fi + SAGEMAKER_CONTAINER=$(dr-find-sagemaker) + done + else + echo "Sagemaker is not running." + exit 1 + fi + fi + + if [[ "${DR_HOST_X,,}" == "true" && -n "$DISPLAY" ]]; + then + if [ -x "$(command -v gnome-terminal)" ]; + then + gnome-terminal --tab --title "DR-${DR_RUN_ID}: Sagemaker - ${SAGEMAKER_CONTAINER}" -- /usr/bin/bash -c "!!; docker logs -f ${SAGEMAKER_CONTAINER}" 2> /dev/null + echo "Sagemaker container $SAGEMAKER_CONTAINER logs opened in separate gnome-terminal. 
" + elif [ -x "$(command -v x-terminal-emulator)" ]; + then + x-terminal-emulator -e /bin/sh -c "!!; docker logs -f ${SAGEMAKER_CONTAINER}" 2> /dev/null + echo "Sagemaker container $SAGEMAKER_CONTAINER logs opened in separate terminal. " + else + echo 'Could not find a defined x-terminal-emulator. Displaying inline.' + docker logs -f $SAGEMAKER_CONTAINER + fi + else + docker logs -f $SAGEMAKER_CONTAINER + fi + +} + +function dr-find-sagemaker { + STACK_NAME="deepracer-$DR_RUN_ID" RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX} @@ -93,31 +149,121 @@ function dr-logs-sagemaker { COMPOSE_SERVICE_NAME=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $2') COMPOSE_FILE=$(sudo find /tmp/sagemaker -name docker-compose.yaml -exec grep -l "$RUN_NAME" {} + | grep $CONTAINER_PREFIX) if [[ -n $COMPOSE_FILE ]]; then - docker logs -f $CONTAINER + echo $CONTAINER + return fi done - else - echo "Sagemaker is not running." fi } function dr-logs-robomaker { - eval ROBOMAKER_ID=$(docker ps | grep "deepracer-${DR_RUN_ID}_robomaker" | cut -f1 -d\ | head -1) - if [ -n "$ROBOMAKER_ID" ]; then - docker logs -f $ROBOMAKER_ID + + OPT_REPLICA=1 + local OPTIND + + while getopts ":w:n:e" opt; do + case $opt in + w) OPT_WAIT=$OPTARG + ;; + n) OPT_REPLICA=$OPTARG + ;; + e) OPT_EVAL="-e" + ;; + \?) echo "Invalid option -$OPTARG" >&2 + ;; + esac + done + + ROBOMAKER_CONTAINER=$(dr-find-robomaker -n ${OPT_REPLICA} ${OPT_EVAL}) + + if [[ -z "$ROBOMAKER_CONTAINER" ]]; + then + if [[ -n "$OPT_WAIT" ]]; then + WAIT_TIME=$OPT_WAIT + echo "Waiting up to $WAIT_TIME seconds for Robomaker #${OPT_REPLICA} to start up..." + until [ -n "$ROBOMAKER_CONTAINER" ] + do + sleep 1 + ((WAIT_TIME--)) + if [ "$WAIT_TIME" -lt 1 ]; then + echo "Robomaker #${OPT_REPLICA} is not running." + exit 1 + fi + ROBOMAKER_CONTAINER=$(dr-find-robomaker -n ${OPT_REPLICA}) + done else - echo "Robomaker is not running." + echo "Robomaker #${OPT_REPLICA} is not running." 
+ exit 1 fi -} + fi -function dr-logs-robomaker-debug { - eval ROBOMAKER_ID=$(docker ps | grep "deepracer-${DR_RUN_ID}_robomaker" | cut -f1 -d\ | head -1) - if [ -n "$ROBOMAKER_ID" ]; then - docker logs -f $ROBOMAKER_ID 2>&1 | grep DEBUG + if [[ "${DR_HOST_X,,}" == "true" && -n "$DISPLAY" ]]; + then + if [ -x "$(command -v gnome-terminal)" ]; + then + gnome-terminal --tab --title "DR-${DR_RUN_ID}: Robomaker #${OPT_REPLICA} - ${ROBOMAKER_CONTAINER}" -- /usr/bin/bash -c "!!; docker logs -f ${ROBOMAKER_CONTAINER}" 2> /dev/null + echo "Robomaker #${OPT_REPLICA} ($ROBOMAKER_CONTAINER) logs opened in separate gnome-terminal. " + elif [ -x "$(command -v x-terminal-emulator)" ]; + then + x-terminal-emulator -e /bin/sh -c "!!; docker logs -f ${ROBOMAKER_CONTAINER}" 2> /dev/null + echo "Robomaker #${OPT_REPLICA} ($ROBOMAKER_CONTAINER) logs opened in separate terminal. " else - echo "Robomaker is not running." + echo 'Could not find a defined x-terminal-emulator. Displaying inline.' + docker logs -f $ROBOMAKER_CONTAINER fi + else + docker logs -f $ROBOMAKER_CONTAINER + fi + +} + +function dr-find-robomaker { + + local OPTIND + + OPT_PREFIX="deepracer" + + while getopts ":n:e" opt; do + case $opt in + n) OPT_REPLICA=$OPTARG + ;; + e) OPT_PREFIX="-eval" + ;; + \?) echo "Invalid option -$OPTARG" >&2 + ;; + esac + done + + eval ROBOMAKER_ID=$(docker ps | grep "${OPT_PREFIX}-${DR_RUN_ID}_robomaker.${OPT_REPLICA}" | cut -f1 -d\ | head -1) + if [ -n "$ROBOMAKER_ID" ]; then + echo $ROBOMAKER_ID + else + echo "Robomaker is not running." + fi +} + +function dr-get-robomaker-stats { + + local OPTIND + OPT_REPLICA=1 + + while getopts ":n:" opt; do + case $opt in + n) OPT_REPLICA=$OPTARG + ;; + \?) 
echo "Invalid option -$OPTARG" >&2 + ;; + esac + done + + eval ROBOMAKER_ID=$(dr-find-robomaker -n $OPT_REPLICA ) + if [ -n "$ROBOMAKER_ID" ]; then + echo "Showing statistics for Robomaker #$OPT_REPLICA - container $ROBOMAKER_ID" + docker exec -ti $ROBOMAKER_ID bash -c "gz stats" + else + echo "Robomaker #$OPT_REPLICA is not running." + fi } function dr-logs-loganalysis { @@ -133,9 +279,7 @@ function dr-logs-loganalysis { function dr-url-loganalysis { eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') if [ -n "$LOG_ANALYSIS_ID" ]; then - eval URL=$(docker logs $LOG_ANALYSIS_ID | perl -n -e'/(http:\/\/127\.0\.0\.1\:8888\/\?.*)/; print $1') - echo "Log-analysis URL:" - echo $URL + docker exec "$LOG_ANALYSIS_ID" bash -c "source .venv/bin/activate && jupyter notebook list" else echo "Log-analysis is not running." fi diff --git a/scripts/evaluation/start.sh b/scripts/evaluation/start.sh index 401f6621..fc600a58 100755 --- a/scripts/evaluation/start.sh +++ b/scripts/evaluation/start.sh @@ -3,8 +3,8 @@ source $DR_DIR/bin/scripts_wrapper.sh usage(){ - echo "Usage: $0 [-w]" - echo " -w Wipes the target AWS DeepRacer model structure before upload." + echo "Usage: $0 [-q]" + echo " -q Quiet - does not start log tracing." exit 1 } @@ -15,6 +15,19 @@ function ctrl_c() { exit 1 } +while getopts ":q" opt; do +case $opt in +q) OPT_QUIET="QUIET" +;; +h) usage +;; +\?) 
echo "Invalid option -$OPTARG" >&2 +usage +;; +esac +done + + # set evaluation specific environment variables S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX" STACK_NAME="deepracer-eval-$DR_RUN_ID" @@ -32,7 +45,7 @@ else fi echo "Creating Robomaker configuration in $S3_PATH/$DR_CURRENT_PARAMS_FILE" -python3 prepare-config.py +python3 $DR_DIR/scripts/evaluation/prepare-config.py # Check if we will use Docker Swarm or Docker Compose if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; @@ -42,21 +55,11 @@ else docker-compose $COMPOSE_FILES --log-level ERROR -p $STACK_NAME up -d fi -echo 'waiting for containers to start up...' - -#sleep for 20 seconds to allow the containers to start -sleep 15 - -if xhost >& /dev/null; -then - echo "Display exists, using gnome-terminal for logs and starting vncviewer." +# Request to be quiet. Quitting here. +if [ -n "$OPT_QUIET" ]; then + exit 0 +fi - echo 'attempting to pull up sagemaker logs...' - gnome-terminal -x sh -c "!!; docker logs -f $(docker ps | awk ' /robomaker/ { print $1 }')" +# Trigger requested log-file +dr-logs-robomaker -w 15 -e - echo 'attempting to open vnc viewer...' - gnome-terminal -x sh -c "!!; vncviewer localhost:8080" -else - echo "No display. Falling back to CLI mode." 
- docker logs -f $(docker ps | awk ' /robomaker/ { print $1 }') -fi diff --git a/scripts/log-analysis/start.sh b/scripts/log-analysis/start.sh index 368f73ae..269fa424 100755 --- a/scripts/log-analysis/start.sh +++ b/scripts/log-analysis/start.sh @@ -1,9 +1,11 @@ #!/usr/bin/env bash -docker run --rm -it -p "8888:8888" \ +docker run --rm -d -p "8888:8888" \ -v `pwd`/../../data/logs:/workspace/logs \ -v `pwd`/../../docker/volumes/.aws:/root/.aws \ -v `pwd`/../../data/analysis:/workspace/analysis \ --name loganalysis \ --network sagemaker-local \ larsll/deepracer-loganalysis:v2-cpu + +docker logs -f loganalysis \ No newline at end of file diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 3b3dd844..870d385a 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -3,8 +3,12 @@ source $DR_DIR/bin/scripts_wrapper.sh usage(){ - echo "Usage: $0 [-w]" + echo "Usage: $0 [-w] [-q | -s | -r [n] | -a ]" echo " -w Wipes the target AWS DeepRacer model structure before upload." + echo " -q Do not output / follow a log when starting." + echo " -a Follow all Sagemaker and Robomaker logs." + echo " -s Follow Sagemaker logs (default)." + echo " -r [n] Follow Robomaker logs for worker n (default worker 0 / replica 1)." exit 1 } @@ -15,10 +19,27 @@ function ctrl_c() { exit 1 } -while getopts ":wh" opt; do +OPT_DISPLAY="SAGEMAKER" + +while getopts ":whqsar:" opt; do case $opt in w) OPT_WIPE="WIPE" ;; +q) OPT_QUIET="QUIET" +;; +s) OPT_DISPLAY="SAGEMAKER" +;; +a) OPT_DISPLAY="ALL" +;; +r) # Check if value is in numeric format. + OPT_DISPLAY="ROBOMAKER" + if [[ $OPTARG =~ ^[0-9]+$ ]]; then + OPT_ROBOMAKER=$OPTARG + else + OPT_ROBOMAKER=0 + ((OPTIND--)) + fi +;; h) usage ;; \?) echo "Invalid option -$OPTARG" >&2 @@ -28,7 +49,9 @@ esac done # Ensure Sagemaker's folder is there -sudo mkdir -p /tmp/sagemaker +if [ ! 
-d /tmp/sagemaker ]; then + sudo mkdir -p /tmp/sagemaker +fi #Check if files are available S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX" @@ -77,7 +100,7 @@ fi export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_TRAINING_PARAMS_FILE} echo "Creating Robomaker configuration in $S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE" -python3 prepare-config.py +python3 $DR_DIR/scripts/training/prepare-config.py # Check if we will use Docker Swarm or Docker Compose if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; @@ -87,36 +110,25 @@ else docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR up -d --scale robomaker=$DR_WORKERS fi -echo 'Waiting for containers to start up...' - -#sleep for 20 seconds to allow the containers to start -sleep 15 - -if xhost >& /dev/null; -then - echo "Display exists, using gnome-terminal for logs and starting vncviewer." - if ! [ -x "$(command -v gnome-terminal)" ]; - then - echo 'Error: skip showing sagemaker logs because gnome-terminal is not installed. This is normal if you are on a different OS to Ubuntu.' - else - echo 'attempting to pull up sagemaker logs...' - gnome-terminal -x sh -c "!!; docker logs -f $(docker ps -a | awk ' /sagemaker/ { print $1 }')" - fi +# Request to be quiet. Quitting here. +if [ -n "$OPT_QUIET" ]; then + exit 0 +fi - if ! [ -x "$(command -v gnome-terminal)" ]; - then - if ! [ -x "$(command -v vncviewer)" ]; - then - echo 'Error: vncviewer is not present on the PATH. Make sure you install it and add it to the PATH.' - else - echo 'attempting to open vnc viewer...' - vncviewer localhost:8080 - fi - else - echo 'attempting to open vnc viewer...' 
- gnome-terminal -x sh -c "!!; vncviewer localhost:8080" +# Trigger requested log-file +if [[ "${OPT_DISPLAY,,}" == "all" && -n "${DISPLAY}" && "${DR_HOST_X,,}" == "true" ]]; then + dr-logs-sagemaker -w 15 + if [ "${DR_WORKERS}" -gt 1 ]; then + for i in $(seq 1 ${DR_WORKERS}) + do + dr-logs-robomaker -w 15 -n $i + done + else + dr-logs-robomaker -w 15 fi -else - echo "No display. Falling back to CLI mode." - dr-logs-sagemaker +elif [[ "${OPT_DISPLAY,,}" == "robomaker" ]]; then + dr-logs-robomaker -w 15 -n $OPT_ROBOMAKER +elif [[ "${OPT_DISPLAY,,}" == "sagemaker" ]]; then + dr-logs-sagemaker -w 15 fi + From bae7b2f8d780e691cf4684c45e6a3c97359d5c15 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 4 Jul 2020 14:36:19 +0200 Subject: [PATCH 099/428] Documentation on multiple GPUs (#63) --- docs/index.md | 1 + docs/multi_gpu.md | 60 ++++++++++++++++++++++++++++++++++ utils/Dockerfile.sagemaker-gpu | 2 ++ utils/cuda-check-tf.py | 10 ++++++ utils/cuda-check.sh | 5 +++ 5 files changed, 78 insertions(+) create mode 100644 docs/multi_gpu.md create mode 100644 utils/Dockerfile.sagemaker-gpu create mode 100644 utils/cuda-check-tf.py create mode 100755 utils/cuda-check.sh diff --git a/docs/index.md b/docs/index.md index 4826ac80..8f52b89d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -47,6 +47,7 @@ DRfC supports a wide set of features to ensure that you can focus on creating th * [Using multiple Robomaker workers](multi_worker.md) * [Running multiple parallel experiments](multi_run.md) * [GPU Accelerated OpenGL for Robomaker](opengl.md) +* [Having multiple GPUs in one Computer](multi_gpu.md) # Support diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md new file mode 100644 index 00000000..7c84a0c8 --- /dev/null +++ b/docs/multi_gpu.md @@ -0,0 +1,60 @@ +# Training on a Computer with more than one GPU + +In some cases you might end up with having a computer with more than one GPU. 
This may be common on a workstation +which may have one GPU for general graphics (e.g. GTX 10-series, RTX 20-series), as well as a data center GPU +like a Tesla K40, K80 or M40. + +In this setting it can get a bit chaotic as DeepRacer will 'greedily' put any workload on any GPU - which will +lead to Out-of-Memory somewhere down the road. + +## Checking available GPUs + +You can use Tensorflow to give you an overview of available devices running `utils/cuda-check.sh`. + +It will say something like: +``` +2020-07-04 12:25:55.179580: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA +2020-07-04 12:25:55.547206: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1411] Found device 0 with properties: +name: GeForce GTX 1650 major: 7 minor: 5 memoryClockRate(GHz): 1.68 +pciBusID: 0000:04:00.0 +totalMemory: 3.82GiB freeMemory: 3.30GiB +2020-07-04 12:25:55.732066: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1411] Found device 1 with properties: +name: Tesla M40 24GB major: 5 minor: 2 memoryClockRate(GHz): 1.112 +pciBusID: 0000:81:00.0 +totalMemory: 22.41GiB freeMemory: 22.30GiB +2020-07-04 12:25:55.732141: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1490] Adding visible gpu devices: 0, 1 +2020-07-04 12:25:56.745647: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] Device interconnect StreamExecutor with strength 1 edge matrix: +2020-07-04 12:25:56.745719: I tensorflow/core/common_runtime/gpu/gpu_device.cc:977] 0 1 +2020-07-04 12:25:56.745732: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 0: N N +2020-07-04 12:25:56.745743: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 1: N N +2020-07-04 12:25:56.745973: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 195 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1650, pci bus id: 0000:04:00.0, compute 
capability: 7.5) +2020-07-04 12:25:56.750352: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 1147 MB memory) -> physical GPU (device: 1, name: Tesla M40 24GB, pci bus id: 0000:81:00.0, compute capability: 5.2) +2020-07-04 12:25:56.774305: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1490] Adding visible gpu devices: 0, 1 +2020-07-04 12:25:56.774408: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] Device interconnect StreamExecutor with strength 1 edge matrix: +2020-07-04 12:25:56.774425: I tensorflow/core/common_runtime/gpu/gpu_device.cc:977] 0 1 +2020-07-04 12:25:56.774436: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 0: N N +2020-07-04 12:25:56.774446: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 1: N N +2020-07-04 12:25:56.774551: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/device:GPU:0 with 195 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1650, pci bus id: 0000:04:00.0, compute capability: 7.5) +2020-07-04 12:25:56.774829: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/device:GPU:1 with 1147 MB memory) -> physical GPU (device: 1, name: Tesla M40 24GB, pci bus id: 0000:81:00.0, compute capability: 5.2) +['/device:GPU:0', '/device:GPU:1'] +``` +In this case the CUDA device #0 is the GTX 1650 and the CUDA device #1 is the Tesla M40. + +### Selecting Device + +#### Robomaker +To control the Robomaker then add the following to `docker-compose-training.yml` in the `docker/` directory: +``` +CUDA_VISIBLE_DEVICES=1 +``` +The number is the CUDA number of the GPU you want the Robomakers to use. + +#### Sagemaker + +Sagemaker is more critical to place, but also more complicated, as you will have to build a new Docker image for it to work. + +A template is in `utils/Dockerfile.sagemaker-gpu`. Again the number is the applicable CUDA number. 
+ +Build the image with `docker build -t awsdeepracercommunity/deepracer-sagemaker:gpu-x -f utils/Dockerfile.sagemaker-gpu` with x being anything you like. + +Update `system.env` to use the new image. \ No newline at end of file diff --git a/utils/Dockerfile.sagemaker-gpu b/utils/Dockerfile.sagemaker-gpu new file mode 100644 index 00000000..fc864923 --- /dev/null +++ b/utils/Dockerfile.sagemaker-gpu @@ -0,0 +1,2 @@ +FROM awsdeepracercommunity/deepracer-sagemaker:gpu +ENV CUDA_VISIBLE_DEVICES=1 \ No newline at end of file diff --git a/utils/cuda-check-tf.py b/utils/cuda-check-tf.py new file mode 100644 index 00000000..b3360ca8 --- /dev/null +++ b/utils/cuda-check-tf.py @@ -0,0 +1,10 @@ +from tensorflow.python.client import device_lib +import tensorflow as tf + +def get_available_gpus(): + local_device_protos = device_lib.list_local_devices() + return [x.name for x in local_device_protos if x.device_type == 'GPU'] + +gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.05) +sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) +print(get_available_gpus()) diff --git a/utils/cuda-check.sh b/utils/cuda-check.sh new file mode 100755 index 00000000..db94e35b --- /dev/null +++ b/utils/cuda-check.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +CONTAINER_ID=$(docker create --rm -ti --name cuda-check awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE "python3 cuda-check-tf.py") +docker cp $DR_DIR/utils/cuda-check-tf.py $CONTAINER_ID:/opt/install/ +docker start -a $CONTAINER_ID \ No newline at end of file From 7c4d02cd182d4296790d32e8a99e02e029d43c16 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 5 Jul 2020 10:04:19 +0200 Subject: [PATCH 100/428] Multi-GPU fixes (#64) * Documentation on multiple GPUs * Updates to make it more generic --- defaults/template-system.env | 3 ++- docker/docker-compose-training.yml | 3 ++- docs/multi_gpu.md | 3 ++- utils/cuda-check.sh | 2 +- 4 files changed, 7 
insertions(+), 4 deletions(-) diff --git a/defaults/template-system.env b/defaults/template-system.env index fd66cea6..88b5b672 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -13,4 +13,5 @@ DR_WORKERS=1 DR_ROBOMAKER_MOUNT_LOGS=False DR_CLOUD_WATCH_ENABLE=False DR_DOCKER_STYLE=swarm -DR_HOST_X=False \ No newline at end of file +DR_HOST_X=False +# CUDA_VISIBLE_DEVICES=0 \ No newline at end of file diff --git a/docker/docker-compose-training.yml b/docker/docker-compose-training.yml index 2c0ff433..91fdca0b 100644 --- a/docker/docker-compose-training.yml +++ b/docker/docker-compose-training.yml @@ -35,4 +35,5 @@ services: - S3_YAML_NAME=${DR_CURRENT_PARAMS_FILE} - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} - - ENABLE_GUI=${DR_GUI_ENABLE} \ No newline at end of file + - ENABLE_GUI=${DR_GUI_ENABLE} + - CUDA_VISIBLE_DEVICES \ No newline at end of file diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index 7c84a0c8..70f0518b 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -43,7 +43,8 @@ In this case the CUDA device #0 is the GTX 1650 and the CUDA device #1 is the Te ### Selecting Device #### Robomaker -To control the Robomaker then add the following to `docker-compose-training.yml` in the `docker/` directory: +To control the Robomaker then add the following to `system.env`: + ``` CUDA_VISIBLE_DEVICES=1 ``` diff --git a/utils/cuda-check.sh b/utils/cuda-check.sh index db94e35b..0ecce85f 100755 --- a/utils/cuda-check.sh +++ b/utils/cuda-check.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -CONTAINER_ID=$(docker create --rm -ti --name cuda-check awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE "python3 cuda-check-tf.py") +CONTAINER_ID=$(docker create --rm -ti -e CUDA_VISIBLE_DEVICES --name cuda-check awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE "python3 cuda-check-tf.py") docker cp $DR_DIR/utils/cuda-check-tf.py $CONTAINER_ID:/opt/install/ docker start -a 
$CONTAINER_ID \ No newline at end of file From 8308f01621079444b994df4a90fda27166392f65 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 5 Jul 2020 10:05:43 +0200 Subject: [PATCH 101/428] Show all streams in a browser (#65) * Generate multi-worker streaming page * Updating docs --- docs/multi_worker.md | 6 +++- utils/start-local-browser.sh | 67 ++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) create mode 100755 utils/start-local-browser.sh diff --git a/docs/multi_worker.md b/docs/multi_worker.md index 8205c20f..cad05eb3 100644 --- a/docs/multi_worker.md +++ b/docs/multi_worker.md @@ -14,4 +14,8 @@ To avoid issues with the position from which evaluations are run ensure that `( Example: With 3 workers set `num_episodes_between_training: 30` and `DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.1`. -Note; Sagemaker will stop collecting experiences once you have reached 10.000 steps (3-layer CNN) in an iteration. For longer tracks with 600-1000 steps per completed episodes this will define the upper bound for the number of workers and episodes per iteration. \ No newline at end of file +Note; Sagemaker will stop collecting experiences once you have reached 10.000 steps (3-layer CNN) in an iteration. For longer tracks with 600-1000 steps per completed episodes this will define the upper bound for the number of workers and episodes per iteration. + +## Watching the streams + +If you want to watch the streams -- and are in `compose` mode you can use the script `utils/start-local-browser.sh` to dynamically create a HTML that streams the KVS stream from ALL workers at a time. 
\ No newline at end of file diff --git a/utils/start-local-browser.sh b/utils/start-local-browser.sh new file mode 100755 index 00000000..b21478ff --- /dev/null +++ b/utils/start-local-browser.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash + +usage(){ + echo "Usage: $0 [-t topic] [-w width] [-h height] [-q quality]" + echo " -w Width of individual stream." + echo " -h Heigth of individual stream." + echo " -q Quality of the stream image." + echo " -t Topic to follow - default /racecar/deepracer/kvs_stream" + exit 1 +} + +trap ctrl_c INT + +function ctrl_c() { + echo "Requested to stop." + exit 1 +} + +# Stream definition +TOPIC="/racecar/deepracer/kvs_stream" +WIDTH=480 +HEIGHT=360 +QUALITY=75 + +while getopts ":whqs:" opt; do +case $opt in +w) WIDTH="$OPTARG" +;; +h) HEIGHT="$OPTARG" +;; +q) QUALITY="$OPTARG" +;; +t) TOPIC="$OPTARG" +;; +\?) echo "Invalid option -$OPTARG" >&2 +usage +;; +esac +done + +FILE=$DR_DIR/tmp/streams-$DR_RUN_ID.html + +# Check if we will use Docker Swarm or Docker Compose +if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; +then + echo "This script does not support swarm mode." + exit +fi + +echo "DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC

DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC

" > $FILE + +ROBOMAKER_CONTAINERS=$(docker ps --format "{{.ID}}" --filter name=deepracer-$DR_RUN_ID --filter "ancestor=awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE") +if [ -z "$ROBOMAKER_CONTAINERS" ]; then + echo "No running robomakers. Exiting." + exit +fi + +for c in $ROBOMAKER_CONTAINERS; do + C_PORT=$(docker inspect $c | jq -r '.[0].NetworkSettings.Ports["8080/tcp"][0].HostPort') + C_URL="http://localhost:${C_PORT}/stream?topic=${TOPIC}&quality=${QUALITY}&width=${WIDTH}&height=${HEIGHT}" + C_IMG="" + echo $C_IMG >> $FILE +done + +echo "" >> $FILE + +firefox --new-tab `readlink -f $FILE ` & \ No newline at end of file From 968345280d6c69a50a512bc92c9f94d112ebae81 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sat, 11 Jul 2020 11:08:54 +0200 Subject: [PATCH 102/428] New command 'dr-view-stream' --- bin/scripts_wrapper.sh | 5 +++++ utils/start-local-browser.sh | 12 ++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index f9830d13..79de4f2b 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -284,3 +284,8 @@ function dr-url-loganalysis { echo "Log-analysis is not running." fi } + +function dr-view-stream { + ${DIR}/utils/start-local-browser.sh "$@" +} + diff --git a/utils/start-local-browser.sh b/utils/start-local-browser.sh index b21478ff..7eaf8f33 100755 --- a/utils/start-local-browser.sh +++ b/utils/start-local-browser.sh @@ -1,11 +1,12 @@ #!/usr/bin/env bash usage(){ - echo "Usage: $0 [-t topic] [-w width] [-h height] [-q quality]" + echo "Usage: $0 [-t topic] [-w width] [-h height] [-q quality] -b [browser-command]" echo " -w Width of individual stream." echo " -h Heigth of individual stream." echo " -q Quality of the stream image." 
echo " -t Topic to follow - default /racecar/deepracer/kvs_stream" + echo " -b Browser command (default: firefox --new-tab)" exit 1 } @@ -21,8 +22,9 @@ TOPIC="/racecar/deepracer/kvs_stream" WIDTH=480 HEIGHT=360 QUALITY=75 +BROWSER="firefox --new-tab" -while getopts ":whqs:" opt; do +while getopts ":w:h:q:t:b:" opt; do case $opt in w) WIDTH="$OPTARG" ;; @@ -32,6 +34,8 @@ q) QUALITY="$OPTARG" ;; t) TOPIC="$OPTARG" ;; +b) BROWSER="$OPTARG" +;; \?) echo "Invalid option -$OPTARG" >&2 usage ;; @@ -63,5 +67,5 @@ for c in $ROBOMAKER_CONTAINERS; do done echo "" >> $FILE - -firefox --new-tab `readlink -f $FILE ` & \ No newline at end of file +echo "Starting browser '$BROWSER'." +$BROWSER `readlink -f $FILE ` & \ No newline at end of file From 57040217501ed4638ac33a8ef6be4ba2771a800f Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sat, 11 Jul 2020 16:19:30 +0200 Subject: [PATCH 103/428] Tweaking GPU detection --- bin/prepare.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/prepare.sh b/bin/prepare.sh index ad29e888..e677d1f4 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -18,7 +18,7 @@ source $DIR/detect.sh echo "Detected cloud type ${CLOUD_NAME}" ## Do I have a GPU -GPUS=$(lspci | awk '/NVIDIA/ && /3D controller/' | wc -l) +GPUS=$(lspci | awk '/NVIDIA/ && /VGA/' | wc -l) if [ $? 
-ne 0 ] || [ $GPUS -eq 0 ]; then ARCH="cpu" @@ -82,7 +82,7 @@ then curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list sudo apt-get update && sudo apt-get install -y nvidia-docker2 nvidia-container-toolkit nvidia-container-runtime - jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' /etc/docker/daemon.json | sudo tee /etc/docker/daemon.json + cat /etc/docker/daemon.json | jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' | sudo tee /etc/docker/daemon.json fi sudo systemctl enable docker sudo systemctl restart docker From c247264425fd1c10fd33273c77e8c6af12845ea0 Mon Sep 17 00:00:00 2001 From: dartjason <51768630+dartjason@users.noreply.github.com> Date: Fri, 17 Jul 2020 13:44:13 -0400 Subject: [PATCH 104/428] Separate Configuration for Multiple Workers. (#57) Allowing for separate configuration per worker Co-authored-by: Lars Lorentz Ludvigsen Co-authored-by: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Co-authored-by: dartjason --- defaults/template-run.env | 3 +- defaults/template-worker.env | 21 +++++++ docker/docker-compose-training.yml | 3 +- docs/multi_worker.md | 6 +- scripts/training/prepare-config.py | 91 +++++++++++++++++++++++++++++- scripts/training/start.sh | 17 ++++-- 6 files changed, 132 insertions(+), 9 deletions(-) create mode 100644 defaults/template-worker.env diff --git a/defaults/template-run.env b/defaults/template-run.env index 62c862b0..fed1f463 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -16,6 +16,7 @@ DR_EVAL_SAVE_MP4=False DR_TRAIN_CHANGE_START_POSITION=True DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05 +DR_TRAIN_MULTI_CONFIG=False DR_LOCAL_S3_PRETRAINED=False DR_LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained DR_LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker @@ -40,4 +41,4 @@ DR_H2B_LANE_CHANGE_DISTANCE=1.0 DR_H2B_NUMBER_OF_BOT_CARS=3 
DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS=2.0 DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS=False -DR_H2B_BOT_CAR_SPEED=0.2 \ No newline at end of file +DR_H2B_BOT_CAR_SPEED=0.2 diff --git a/defaults/template-worker.env b/defaults/template-worker.env new file mode 100644 index 00000000..6efbdfbd --- /dev/null +++ b/defaults/template-worker.env @@ -0,0 +1,21 @@ +DR_WORLD_NAME=reInvent2019_track +DR_RACE_TYPE=TIME_TRIAL +DR_RACER_NAME=racer1 +DR_ENABLE_DOMAIN_RANDOMIZATION=False +DR_TRAIN_CHANGE_START_POSITION=True +DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False +DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05 +DR_OA_NUMBER_OF_OBSTACLES=6 +DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES=2.0 +DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS=False +DR_OA_PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS=False +DR_OA_NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS=2 +DR_OA_IS_OBSTACLE_BOT_CAR=False +DR_H2B_IS_LANE_CHANGE=False +DR_H2B_LOWER_LANE_CHANGE_TIME=3.0 +DR_H2B_UPPER_LANE_CHANGE_TIME=5.0 +DR_H2B_LANE_CHANGE_DISTANCE=1.0 +DR_H2B_NUMBER_OF_BOT_CARS=3 +DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS=2.0 +DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS=False +DR_H2B_BOT_CAR_SPEED=0.2 diff --git a/docker/docker-compose-training.yml b/docker/docker-compose-training.yml index 91fdca0b..107a12a8 100644 --- a/docker/docker-compose-training.yml +++ b/docker/docker-compose-training.yml @@ -36,4 +36,5 @@ services: - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} - ENABLE_GUI=${DR_GUI_ENABLE} - - CUDA_VISIBLE_DEVICES \ No newline at end of file + - CUDA_VISIBLE_DEVICES + - MULTI_CONFIG \ No newline at end of file diff --git a/docs/multi_worker.md b/docs/multi_worker.md index cad05eb3..6b285124 100644 --- a/docs/multi_worker.md +++ b/docs/multi_worker.md @@ -16,6 +16,10 @@ Example: With 3 workers set `num_episodes_between_training: 30` and `DR_TRAIN_RO Note; Sagemaker will stop collecting experiences once you have reached 10.000 steps (3-layer CNN) in an iteration. 
For longer tracks with 600-1000 steps per completed episodes this will define the upper bound for the number of workers and episodes per iteration. +## Training with different parameters for each worker + +It is also possible to use different configurations between workers, such as different tracks (WORLD_NAME). To enable, set DR_MULTI_CONFIG=True inside run.env, then make copies of defaults/template-worker.env in the main deepracer-for-cloud directory with format worker-2.env, worker-3.env, etc. (So alongside run.env, you should have woker-2.env, worker-3.env, etc. run.env is still used for worker 1) Modify the worker env files with your desired changes, which can be more than just the world_name. These additional worker env files are only used if you are training with multiple workers. + ## Watching the streams -If you want to watch the streams -- and are in `compose` mode you can use the script `utils/start-local-browser.sh` to dynamically create a HTML that streams the KVS stream from ALL workers at a time. \ No newline at end of file +If you want to watch the streams -- and are in `compose` mode you can use the script `utils/start-local-browser.sh` to dynamically create a HTML that streams the KVS stream from ALL workers at a time. 
diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index 9f7d2219..ee10924d 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -85,4 +85,93 @@ with open(local_yaml_path, 'w') as yaml_file: yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) -s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) + + +# Training with different configurations on each worker (aka Multi Config training) +config['MULTI_CONFIG'] = os.environ.get('DR_TRAIN_MULTI_CONFIG', 'False') + +if config['MULTI_CONFIG'] == "True": + num_workers = int(os.environ.get('DR_WORKERS',1)) + multi_config = {} + multi_config['multi_config'] = [None] * num_workers + + for i in range(1,num_workers+1,1): + if i == 1: + # copy training_params to training_params_1 + s3_yaml_name_list = s3_yaml_name.split('.') + s3_yaml_name_temp = s3_yaml_name_list[0] + "_%d.yaml" % i + + #upload additional training params files + yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name_temp)) + s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) + + # Store in multi_config array + multi_config['multi_config'][i - 1] = {'config_file': s3_yaml_name_temp, + 'world_name': config['WORLD_NAME']} + + else: # i >= 2 + #read in additional configuration file. 
format of file must be worker#-run.env + location = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'worker-{}.env'.format(i))) + with open(location, 'r') as fh: + vars_dict = dict( + tuple(line.split('=')) + for line in fh.read().splitlines() if not line.startswith('#') + ) + + # Reset parameters for the configuration of this worker number + os.environ.update(vars_dict) + + # Update car and training parameters + config.update({'WORLD_NAME': os.environ.get('DR_WORLD_NAME')}) + config.update({'RACE_TYPE': os.environ.get('DR_RACE_TYPE')}) + config.update({'ALTERNATE_DRIVING_DIRECTION': os.environ.get('DR_TRAIN_ALTERNATE_DRIVING_DIRECTION')}) + config.update({'CHANGE_START_POSITION': os.environ.get('DR_TRAIN_CHANGE_START_POSITION')}) + config.update({'ROUND_ROBIN_ADVANCE_DIST': os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST')}) + config.update({'ENABLE_DOMAIN_RANDOMIZATION': os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION')}) + + # Update Object Avoidance parameters + if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE': + config.update({'NUMBER_OF_OBSTACLES': os.environ.get('DR_OA_NUMBER_OF_OBSTACLES')}) + config.update({'MIN_DISTANCE_BETWEEN_OBSTACLES': os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES')}) + config.update({'RANDOMIZE_OBSTACLE_LOCATIONS': os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS')}) + config.update({'PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS': os.environ.get('DR_OA_PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS')}) + config.update({'NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS': os.environ.get('DR_OA_NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS')}) + config.update({'IS_OBSTACLE_BOT_CAR': os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR')}) + config.update({'NUMBER_OF_BOT_CARS': '0'}) + + # Update Head to Bot parameters + if config['RACE_TYPE'] == 'HEAD_TO_BOT': + config.update({'IS_LANE_CHANGE': os.environ.get('DR_H2B_IS_LANE_CHANGE')}) + config.update({'LOWER_LANE_CHANGE_TIME': os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME')}) + config.update({'UPPER_LANE_CHANGE_TIME': 
os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME')}) + config.update({'LANE_CHANGE_DISTANCE': os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE')}) + config.update({'NUMBER_OF_BOT_CARS': os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS')}) + config.update({'MIN_DISTANCE_BETWEEN_BOT_CARS': os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS')}) + config.update({'RANDOMIZE_BOT_CAR_LOCATIONS': os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS')}) + config.update({'BOT_CAR_SPEED': os.environ.get('DR_H2B_BOT_CAR_SPEED')}) + config.update({'NUMBER_OF_OBSTACLES': '0'}) + + # Clear bot cars and obstacles in case present from earlier worker + if config['RACE_TYPE'] == 'TIME_TRIAL': + config.update({'NUMBER_OF_BOT_CARS': '0'}) + config.update({'NUMBER_OF_OBSTACLES': '0'}) + + #split string s3_yaml_name, insert the worker number, and add back on the .yaml extension + s3_yaml_name_list = s3_yaml_name.split('.') + s3_yaml_name_temp = s3_yaml_name_list[0] + "_%d.yaml" % i + + #upload additional training params files + yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name_temp)) + local_yaml_path = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'tmp', 'training-params-' + str(round(time.time())) + '.yaml')) + with open(local_yaml_path, 'w') as yaml_file: + yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) + s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) + + # Store in multi_config array + multi_config['multi_config'][i - 1] = {'config_file': s3_yaml_name_temp, + 'world_name': config['WORLD_NAME']} + + print(json.dumps(multi_config)) + +else: + s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 870d385a..582b5e34 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -82,6 +82,10 @@ fi # set evaluation specific environment variables STACK_NAME="deepracer-$DR_RUN_ID" +export 
DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_TRAINING_PARAMS_FILE} + +WORKER_CONFIG=$(python3 $DR_DIR/scripts/training/prepare-config.py) + if [ "$DR_WORKERS" -gt 1 ]; then echo "Starting $DR_WORKERS workers" @@ -92,16 +96,19 @@ if [ "$DR_WORKERS" -gt 1 ]; then COMPOSE_FILES="$COMPOSE_FILES $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-robomaker-multi.yml" fi + if [ "$DR_TRAIN_MULTI_CONFIG" == "True" ]; then + export MULTI_CONFIG=$WORKER_CONFIG + echo "Multi-config training, creating multiple Robomaker configurations in $S3_PATH" + else + echo "Creating Robomaker configuration in $S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE" + fi export ROBOMAKER_COMMAND="./run.sh multi distributed_training.launch" + else export ROBOMAKER_COMMAND="./run.sh run distributed_training.launch" + echo "Creating Robomaker configuration in $S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE" fi -export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_TRAINING_PARAMS_FILE} - -echo "Creating Robomaker configuration in $S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE" -python3 $DR_DIR/scripts/training/prepare-config.py - # Check if we will use Docker Swarm or Docker Compose if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then From bc0fd9af693e263638ce3a4213d35dc6e77c0f66 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Tue, 21 Jul 2020 08:11:36 +0000 Subject: [PATCH 105/428] Fixing GPU Detection --- bin/prepare.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/prepare.sh b/bin/prepare.sh index e677d1f4..7128cf71 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -18,7 +18,7 @@ source $DIR/detect.sh echo "Detected cloud type ${CLOUD_NAME}" ## Do I have a GPU -GPUS=$(lspci | awk '/NVIDIA/ && /VGA/' | wc -l) +GPUS=$(lspci | awk '/NVIDIA/ && ( /VGA/ || /3D controller/ ) ' | wc -l ) if [ $? 
-ne 0 ] || [ $GPUS -eq 0 ]; then ARCH="cpu" From 2fd386696037e9e4eb28fd9ae29172f2efd1863d Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Tue, 21 Jul 2020 22:57:20 +0200 Subject: [PATCH 106/428] Updates for Object Avoidance (#67) * Remove PSEUDO_RANDOM_* * Add car color to multi-config * Remove parameters not required * Adding object positions * Adding support for MIN_EVAL_TRIALS --- defaults/template-run.env | 4 ++-- defaults/template-worker.env | 5 ++--- scripts/evaluation/prepare-config.py | 10 ++++++++-- scripts/training/prepare-config.py | 26 +++++++++++++++++++------- 4 files changed, 31 insertions(+), 14 deletions(-) diff --git a/defaults/template-run.env b/defaults/template-run.env index fed1f463..a373248a 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -17,6 +17,7 @@ DR_TRAIN_CHANGE_START_POSITION=True DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05 DR_TRAIN_MULTI_CONFIG=False +DR_TRAIN_MIN_EVAL_TRIALS=5 DR_LOCAL_S3_PRETRAINED=False DR_LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained DR_LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker @@ -31,9 +32,8 @@ DR_LOCAL_S3_METRICS_PREFIX=$DR_LOCAL_S3_MODEL_PREFIX/metrics DR_OA_NUMBER_OF_OBSTACLES=6 DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES=2.0 DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS=False -DR_OA_PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS=False -DR_OA_NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS=2 DR_OA_IS_OBSTACLE_BOT_CAR=False +DR_OA_OBJECT_POSITIONS= DR_H2B_IS_LANE_CHANGE=False DR_H2B_LOWER_LANE_CHANGE_TIME=3.0 DR_H2B_UPPER_LANE_CHANGE_TIME=5.0 diff --git a/defaults/template-worker.env b/defaults/template-worker.env index 6efbdfbd..81a2d91c 100644 --- a/defaults/template-worker.env +++ b/defaults/template-worker.env @@ -1,6 +1,6 @@ DR_WORLD_NAME=reInvent2019_track DR_RACE_TYPE=TIME_TRIAL -DR_RACER_NAME=racer1 +DR_CAR_COLOR=Blue DR_ENABLE_DOMAIN_RANDOMIZATION=False DR_TRAIN_CHANGE_START_POSITION=True 
DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False @@ -8,9 +8,8 @@ DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05 DR_OA_NUMBER_OF_OBSTACLES=6 DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES=2.0 DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS=False -DR_OA_PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS=False -DR_OA_NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS=2 DR_OA_IS_OBSTACLE_BOT_CAR=False +DR_OA_OBJECT_POSITIONS= DR_H2B_IS_LANE_CHANGE=False DR_H2B_LOWER_LANE_CHANGE_TIME=3.0 DR_H2B_UPPER_LANE_CHANGE_TIME=5.0 diff --git a/scripts/evaluation/prepare-config.py b/scripts/evaluation/prepare-config.py index 4cb328c2..e3b0fdd8 100755 --- a/scripts/evaluation/prepare-config.py +++ b/scripts/evaluation/prepare-config.py @@ -60,10 +60,16 @@ def str2bool(v): config['NUMBER_OF_OBSTACLES'] = os.environ.get('DR_OA_NUMBER_OF_OBSTACLES', '6') config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0') config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True') - config['PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS', 'False') - config['NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS'] = os.environ.get('DR_OA_NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS', '2') config['IS_OBSTACLE_BOT_CAR'] = os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false') + object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") + if object_position_str != "": + object_positions = [] + for o in object_position_str.split(";"): + object_positions.append(o) + config['OBJECT_POSITIONS'] = object_positions + config['NUMBER_OF_OBSTACLES'] = str(len(object_positions)) + # Head to Bot if config['RACE_TYPE'] == 'HEAD_TO_BOT': config['IS_LANE_CHANGE'] = os.environ.get('DR_H2B_IS_LANE_CHANGE', 'False') diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index ee10924d..90ee2d9f 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -35,7 +35,6 @@ config['CAR_NAME'] = 
os.environ.get('DR_CAR_NAME', 'MyCar') config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') -config['NUMBER_OF_TRIALS'] = os.environ.get('DR_EVAL_NUMBER_OF_TRIALS', '5') config['DISPLAY_NAME'] = os.environ.get('DR_DISPLAY_NAME', 'racer1') config['RACER_NAME'] = os.environ.get('DR_RACER_NAME', 'racer1') @@ -43,16 +42,23 @@ config['CHANGE_START_POSITION'] = os.environ.get('DR_TRAIN_CHANGE_START_POSITION', os.environ.get('DR_CHANGE_START_POSITION', 'true')) config['ROUND_ROBIN_ADVANCE_DIST'] = os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST', '0.05') config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') +config['MIN_EVAL_TRIALS'] = os.environ.get('DR_TRAIN_MIN_EVAL_TRIALS', '5') # Object Avoidance if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE': config['NUMBER_OF_OBSTACLES'] = os.environ.get('DR_OA_NUMBER_OF_OBSTACLES', '6') config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0') config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True') - config['PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS', 'False') - config['NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS'] = os.environ.get('DR_OA_NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS', '2') config['IS_OBSTACLE_BOT_CAR'] = os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false') + object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") + if object_position_str != "": + object_positions = [] + for o in object_position_str.split(";"): + object_positions.append(o) + config['OBJECT_POSITIONS'] = object_positions + config['NUMBER_OF_OBSTACLES'] = str(len(object_positions)) + # Head to Bot if config['RACE_TYPE'] == 'HEAD_TO_BOT': config['IS_LANE_CHANGE'] = os.environ.get('DR_H2B_IS_LANE_CHANGE', 'False') @@ -124,6 +130,7 @@ # Update car and training parameters 
config.update({'WORLD_NAME': os.environ.get('DR_WORLD_NAME')}) config.update({'RACE_TYPE': os.environ.get('DR_RACE_TYPE')}) + config.update({'CAR_COLOR': os.environ.get('DR_CAR_COLOR')}) config.update({'ALTERNATE_DRIVING_DIRECTION': os.environ.get('DR_TRAIN_ALTERNATE_DRIVING_DIRECTION')}) config.update({'CHANGE_START_POSITION': os.environ.get('DR_TRAIN_CHANGE_START_POSITION')}) config.update({'ROUND_ROBIN_ADVANCE_DIST': os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST')}) @@ -134,10 +141,16 @@ config.update({'NUMBER_OF_OBSTACLES': os.environ.get('DR_OA_NUMBER_OF_OBSTACLES')}) config.update({'MIN_DISTANCE_BETWEEN_OBSTACLES': os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES')}) config.update({'RANDOMIZE_OBSTACLE_LOCATIONS': os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS')}) - config.update({'PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS': os.environ.get('DR_OA_PSEUDO_RANDOMIZE_OBSTACLE_LOCATIONS')}) - config.update({'NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS': os.environ.get('DR_OA_NUMBER_OF_PSEUDO_RANDOM_PLACEMENTS')}) config.update({'IS_OBSTACLE_BOT_CAR': os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR')}) - config.update({'NUMBER_OF_BOT_CARS': '0'}) + object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") + if object_position_str != "": + object_positions = [] + for o in object_position_str.replace('"','').split(";"): + object_positions.append(o) + config.update({'OBJECT_POSITIONS': object_positions}) + config.update({'NUMBER_OF_OBSTACLES': str(len(object_positions))}) + else: + config.pop('OBJECT_POSITIONS') # Update Head to Bot parameters if config['RACE_TYPE'] == 'HEAD_TO_BOT': @@ -149,7 +162,6 @@ config.update({'MIN_DISTANCE_BETWEEN_BOT_CARS': os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS')}) config.update({'RANDOMIZE_BOT_CAR_LOCATIONS': os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS')}) config.update({'BOT_CAR_SPEED': os.environ.get('DR_H2B_BOT_CAR_SPEED')}) - config.update({'NUMBER_OF_OBSTACLES': '0'}) # Clear bot cars and obstacles in case present 
from earlier worker if config['RACE_TYPE'] == 'TIME_TRIAL': From ae70f25591ea5031b1c1d1f659447ee0384d6e70 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 5 Jul 2020 11:01:11 +0200 Subject: [PATCH 107/428] Ability to change start position (#66) * Enable change of start position --- defaults/template-run.env | 1 + scripts/training/prepare-config.py | 1 + 2 files changed, 2 insertions(+) diff --git a/defaults/template-run.env b/defaults/template-run.env index a373248a..4ebfc8ae 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -15,6 +15,7 @@ DR_EVAL_COLLISION_PENALTY=5.0 DR_EVAL_SAVE_MP4=False DR_TRAIN_CHANGE_START_POSITION=True DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False +DR_TRAIN_START_POSITION_OFFSET=0.0 DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05 DR_TRAIN_MULTI_CONFIG=False DR_TRAIN_MIN_EVAL_TRIALS=5 diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index 90ee2d9f..05484447 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -41,6 +41,7 @@ config['ALTERNATE_DRIVING_DIRECTION'] = os.environ.get('DR_TRAIN_ALTERNATE_DRIVING_DIRECTION', os.environ.get('DR_ALTERNATE_DRIVING_DIRECTION', 'false')) config['CHANGE_START_POSITION'] = os.environ.get('DR_TRAIN_CHANGE_START_POSITION', os.environ.get('DR_CHANGE_START_POSITION', 'true')) config['ROUND_ROBIN_ADVANCE_DIST'] = os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST', '0.05') +config['START_POSITION_OFFSET'] = os.environ.get('DR_TRAIN_START_POSITION_OFFSET', '0.00') config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') config['MIN_EVAL_TRIALS'] = os.environ.get('DR_TRAIN_MIN_EVAL_TRIALS', '5') From 8cbf1ea9e46edee24556f1ecf02f189d53dd5419 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Wed, 22 Jul 2020 19:44:30 +0200 Subject: [PATCH 108/428] Fix error if no OBJECT POSITION in OA --- scripts/training/prepare-config.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index 05484447..e606d8ba 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -151,7 +151,7 @@ config.update({'OBJECT_POSITIONS': object_positions}) config.update({'NUMBER_OF_OBSTACLES': str(len(object_positions))}) else: - config.pop('OBJECT_POSITIONS') + config.pop('OBJECT_POSITIONS',[]) # Update Head to Bot parameters if config['RACE_TYPE'] == 'HEAD_TO_BOT': From 6ccdbb421493b55814efbb4844d947f112cf3a1f Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Mon, 27 Jul 2020 22:39:03 +0200 Subject: [PATCH 109/428] Fixing upload for new version of Console (#69) * Fixing upload for new version of Console * Remove aws-deepracer bucket detection * Updated documentation --- bin/init.sh | 11 +-- bin/scripts_wrapper.sh | 8 ++- defaults/template-run.env | 1 - docs/index.md | 1 + docs/upload.md | 22 ++++++ scripts/upload/increment.sh | 92 +++++++++++++++++++++++++ scripts/upload/list-set-models.sh | 110 ------------------------------ scripts/upload/upload-model.sh | 36 +++------- 8 files changed, 133 insertions(+), 148 deletions(-) create mode 100644 docs/upload.md create mode 100755 scripts/upload/increment.sh delete mode 100755 scripts/upload/list-set-models.sh diff --git a/bin/init.sh b/bin/init.sh index 31b3739b..cfc1c4e2 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -85,19 +85,10 @@ cp $INSTALL_DIR/defaults/reward_function.py $INSTALL_DIR/custom_files/ cp $INSTALL_DIR/defaults/template-system.env $INSTALL_DIR/system.env cp $INSTALL_DIR/defaults/template-run.env $INSTALL_DIR/run.env - if [[ "${OPT_CLOUD}" == "aws" ]]; then - AWS_DR_BUCKET=$(aws s3api list-buckets | jq '.Buckets[] | select(.Name | startswith("aws-deepracer")) | .Name' -r) - AWS_DR_BUCKET_COUNT=$(echo $AWS_DR_BUCKET | wc -w) AWS_EC2_AVAIL_ZONE=`curl -s 
http://169.254.169.254/latest/meta-data/placement/availability-zone` AWS_REGION="`echo \"$AWS_EC2_AVAIL_ZONE\" | sed 's/[a-z]$//'`" - if [ "$AWS_DR_BUCKET_COUNT" -eq 1 ]; then - sed -i "s//$AWS_DR_BUCKET/g" $INSTALL_DIR/system.env - elif [ "$AWS_DR_BUCKET_COUNT" -gt 1 ]; then - sed -i "s//found-$AWS_DR_BUCKET_COUNT-buckets/g" $INSTALL_DIR/system.env - else - sed -i "s//not-defined/g" $INSTALL_DIR/system.env - fi + sed -i "s//not-defined/g" $INSTALL_DIR/system.env sed -i "s//default/g" $INSTALL_DIR/system.env elif [[ "${OPT_CLOUD}" == "azure" ]]; then AWS_REGION="us-east-1" diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index 79de4f2b..bf43bfbd 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -20,11 +20,15 @@ function dr-upload-model { } function dr-list-aws-models { - dr-update-env && ${DIR}/scripts/upload/list-set-models.sh "$@" + echo "Due to changes in AWS DeepRacer Console this command is no longer available." } function dr-set-upload-model { - dr-update-env && ${DIR}/scripts/upload/list-set-models.sh "$@" + echo "Due to changes in AWS DeepRacer Console this command is no longer available." 
+} + +function dr-increment-upload-model { + dr-update-env && ${DIR}/scripts/upload/increment.sh "$@" && dr-update-env } function dr-download-custom-files { diff --git a/defaults/template-run.env b/defaults/template-run.env index 4ebfc8ae..62488b89 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -7,7 +7,6 @@ DR_DISPLAY_NAME=$DR_CAR_NAME DR_RACER_NAME=racer1 DR_ENABLE_DOMAIN_RANDOMIZATION=False DR_UPLOAD_S3_PREFIX=DeepRacer-SageMaker-RoboMaker-comm-prefix -DR_UPLOAD_MODEL_NAME=mymodelname DR_EVAL_NUMBER_OF_TRIALS=5 DR_EVAL_IS_CONTINUOUS=False DR_EVAL_OFF_TRACK_PENALTY=5.0 diff --git a/docs/index.md b/docs/index.md index 8f52b89d..1795ebfc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -43,6 +43,7 @@ DRfC supports a wide set of features to ensure that you can focus on creating th # Documentation * [Initial Installation](installation.md) +* [Upload Model to Console](upload.md) * [Reference](reference.md) * [Using multiple Robomaker workers](multi_worker.md) * [Running multiple parallel experiments](multi_run.md) diff --git a/docs/upload.md b/docs/upload.md new file mode 100644 index 00000000..f56ea9b4 --- /dev/null +++ b/docs/upload.md @@ -0,0 +1,22 @@ +# Upload Model to AWS Console + +Starting end July 2020 the AWS DeepRacer Console was re-designed which is now changing the way +that models need to be uploaded to enable them to be evaluated or submitted to the AWS hosted Summit or Virtual League events. + +## Create Upload Bucket + +The recommendation is to create a unique bucket in `us-east-1` which is used as 'transit' between your training bucket, local or in an AWS region close to your EC2 instances. + +The bucket needs to be defined so that 'Objects can be public'; AWS will create a specific IAM policy to access the data in your bucket as part of the import. + +## Configure Upload Bucket + +In `system.env` set `DR_UPLOAD_S3_BUCKET` to the name of your created bucket. 
+ +In `run.env` set the `DR_UPLOAD_S3_PREFIX` to any prefix of your choice. + +## Upload Model + +After configuring the system you can run `dr-upload-model`; it will copy out the required parts of `s3://DR_LOCAL_S3_BUCKET/DR_LOCAL_S3_PREFIX` into `s3://DR_UPLOAD_S3_BUCKET/DR_UPLOAD_S3_PREFIX`. + +Once uploaded you can use the [Import model](https://console.aws.amazon.com/deepracer/home?region=us-east-1#models/importModel) feature of the AWS DeepRacer console to load the model into the model store. \ No newline at end of file diff --git a/scripts/upload/increment.sh b/scripts/upload/increment.sh new file mode 100755 index 00000000..e22ac05c --- /dev/null +++ b/scripts/upload/increment.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +usage(){ + echo "Usage: $0 [-f] [-w] [-p ] [-d ]" + echo "" + echo "Command will increment a numerical suffix on the current upload model." + echo "-p model Sets the to-be name to be rather than auto-incremeneting the previous model." + echo "-d delim Delimiter in model-name (e.g. '-' in 'test-model-1')" + echo "-f Force. Ask for no confirmations." + echo "-w Wipe the S3 prefix to ensure that two models are not mixed." + exit 1 +} + +trap ctrl_c INT + +function ctrl_c() { + echo "Requested to stop." + exit 1 +} + +OPT_DELIM='-' + +while getopts ":fwp:d:" opt; do +case $opt in + +f) OPT_FORCE="True" +;; +p) OPT_PREFIX="$OPTARG" +;; +w) OPT_WIPE="--delete" +;; +d) OPT_DELIM="$OPTARG" +;; +h) usage +;; +\?) echo "Invalid option -$OPTARG" >&2 +usage +;; +esac +done + +CONFIG_FILE=$DR_CONFIG +echo "Configuration file $CONFIG_FILE will be updated." 
+ +## Read in data +CURRENT_UPLOAD_MODEL=$(grep -e "^DR_UPLOAD_S3_PREFIX" ${CONFIG_FILE} | awk '{split($0,a,"="); print a[2] }') +CURRENT_UPLOAD_MODEL_NUM=$(echo "${CURRENT_UPLOAD_MODEL}" | \ + awk -v DELIM="${OPT_DELIM}" '{ n=split($0,a,DELIM); if (a[n] ~ /[0-9]*/) print a[n]; else print ""; }') +if [[ -z ${CURRENT_UPLOAD_MODEL_NUM} ]]; +then + NEW_UPLOAD_MODEL="${CURRENT_UPLOAD_MODEL}${OPT_DELIM}1" +else + NEW_UPLOAD_MODEL_NUM=$(echo "${CURRENT_UPLOAD_MODEL_NUM} + 1" | bc ) + NEW_UPLOAD_MODEL=$(echo $CURRENT_UPLOAD_MODEL | sed "s/${CURRENT_UPLOAD_MODEL_NUM}\$/${NEW_UPLOAD_MODEL_NUM}/") +fi + +if [[ -n "${NEW_UPLOAD_MODEL}" ]]; +then + echo "Incrementing model from ${CURRENT_UPLOAD_MODEL} to ${NEW_UPLOAD_MODEL}" + if [[ -z "${OPT_FORCE}" ]]; + then + read -r -p "Are you sure? [y/N] " response + if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] + then + echo "Aborting." + exit 1 + fi + fi + sed -i.bak -re "s/(DR_UPLOAD_S3_PREFIX=).*$/\1$NEW_UPLOAD_MODEL/g" "$CONFIG_FILE" && echo "Done." +else + echo "Error in determining new model. Aborting." + exit 1 +fi + +if [[ -n "${OPT_WIPE}" ]]; +then + MODEL_DIR_S3=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${DR_LOCAL_S3_BUCKET}/${NEW_UPLOAD_MODEL} ) + if [[ -n "${MODEL_DIR_S3}" ]]; + then + echo "The new model's S3 prefix s3://${DR_LOCAL_S3_BUCKET}/${NEW_UPLOAD_MODEL} exists. Will wipe." + fi + if [[ -z "${OPT_FORCE}" ]]; + then + read -r -p "Are you sure? [y/N] " response + if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] + then + echo "Aborting." + exit 1 + fi + fi + aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 rm s3://${DR_LOCAL_S3_BUCKET}/${NEW_UPLOAD_MODEL} --recursive +fi diff --git a/scripts/upload/list-set-models.sh b/scripts/upload/list-set-models.sh deleted file mode 100755 index aad25909..00000000 --- a/scripts/upload/list-set-models.sh +++ /dev/null @@ -1,110 +0,0 @@ -#!/bin/bash -#set -x - -usage(){ - echo "Usage: $0 [-h] [-s ] [-c]" - echo " -s model Configures environment to upload into selected model." 
- echo " -c Use local cache of models." - exit 1 -} - -trap ctrl_c INT - -function ctrl_c() { - echo "Requested to stop." - exit 1 -} - -while getopts ":chs:" opt; do -case $opt in -s) OPT_SET="$OPTARG" -;; -c) OPT_CACHE="cache" -;; -h) usage -;; -\?) echo "Invalid option -$OPTARG" >&2 -usage -;; -esac -done - -TARGET_S3_BUCKET=${DR_UPLOAD_S3_BUCKET} -WORK_DIR=${DR_DIR}/tmp/aws-models -mkdir -p ${WORK_DIR} - -if [[ -n "${OPT_CACHE}" ]]; -then - PARAM_FILES=$(ls -t "${WORK_DIR}" ) - echo -e "Using local cache..." -else - PARAM_FILES=$(aws ${DR_UPLOAD_PROFILE} s3 ls s3://${TARGET_S3_BUCKET} --recursive | awk '/training_params*/ {print $4}' ) - echo -e "\nLooking for DeepRacer models in s3://${TARGET_S3_BUCKET}...\n" -fi - - -if [[ -z "${PARAM_FILES}" ]]; -then - echo "No models found in s3://{TARGET_S3_BUCKET}. Exiting." - exit 1 -fi - -if [[ -z "${OPT_SET}" ]]; -then - echo "+---------------------------------------------------------------------------+" - printf "| %-40s | %-30s |\n" "Model Name" "Creation Time" - echo "+---------------------------------------------------------------------------+" - - for PARAM_FILE in $PARAM_FILES; do - if [[ -z "${OPT_CACHE}" ]]; then - aws ${DR_UPLOAD_PROFILE} s3 cp s3://${TARGET_S3_BUCKET}/${PARAM_FILE} ${WORK_DIR}/ --quiet - PARAM_FILE_L=$(echo "$PARAM_FILE" | awk '{split($0,a,"/"); print a[2]}') - else - PARAM_FILE_L=$PARAM_FILE - fi - MODIFICATION_TIME=$(stat -c %Y ${WORK_DIR}/${PARAM_FILE_L}) - MODIFICATION_TIME_STR=$(echo "@${MODIFICATION_TIME}" | xargs date -d ) - MODEL_NAME=$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' ${WORK_DIR}/${PARAM_FILE_L} | awk '{split($0,a,"/"); print a[2] }') - printf "| %-40s | %-30s |\n" "$MODEL_NAME" "$MODIFICATION_TIME_STR" - done - - echo "+---------------------------------------------------------------------------+" - echo -e "\nSet the model with dr-set-upload-model -s .\n" -else - echo -e "Looking for DeepRacer model ${OPT_SET} in s3://${TARGET_S3_BUCKET}..." 
- - for PARAM_FILE in $PARAM_FILES; do - if [[ -z "${OPT_CACHE}" ]]; then - aws ${DR_UPLOAD_PROFILE} s3 cp s3://${TARGET_S3_BUCKET}/${PARAM_FILE} ${WORK_DIR}/ --quiet - PARAM_FILE_L=$(echo "$PARAM_FILE" | awk '{split($0,a,"/"); print a[2]}') - MODEL_NAME=$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' ${WORK_DIR}/${PARAM_FILE_L} | awk '{split($0,a,"/"); print a[2] }') - if [ "${MODEL_NAME}" = "${OPT_SET}" ]; then - MATCHED_PREFIX=$(echo "$PARAM_FILE" | awk '{split($0,a,"/"); print a[1]}') - echo "Found in ${MODEL_NAME} in ${MATCHED_PREFIX}". - break - fi - else - PARAM_FILE_L=$PARAM_FILE - MODEL_NAME=$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' ${WORK_DIR}/${PARAM_FILE_L} | awk '{split($0,a,"/"); print a[2] }') - if [ "${MODEL_NAME}" = "${OPT_SET}" ]; then - MATCHED_PREFIX=$(awk '/SAGEMAKER_SHARED_S3_PREFIX/ {print $2}' ${WORK_DIR}/${PARAM_FILE_L} | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") - echo "Found in ${MODEL_NAME} in ${MATCHED_PREFIX}". - break - fi - fi - done - - CONFIG_FILE=$DR_CONFIG - echo "Configuration file $CONFIG_FILE will be updated." - if [[ -n "${MODEL_NAME}" ]]; - then - read -r -p "Are you sure? [y/N] " response - if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] - then - echo "Aborting." - exit 1 - else - sed -i.bak -re "s/(DR_UPLOAD_S3_PREFIX=).*$/\1$MATCHED_PREFIX/g; s/(DR_UPLOAD_MODEL_NAME=).*$/\1$MODEL_NAME/g" "$CONFIG_FILE" && echo "Done." 
- fi - fi -fi \ No newline at end of file diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index 32110e09..93be02c0 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -70,37 +70,22 @@ SOURCE_S3_CONFIG=${DR_LOCAL_S3_CUSTOM_FILES_PREFIX} SOURCE_S3_REWARD=${DR_LOCAL_S3_REWARD_KEY} SOURCE_S3_METRICS="${DR_LOCAL_S3_METRICS_PREFIX}/TrainingMetrics.json" - WORK_DIR=${DR_DIR}/tmp/upload/ -mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}model +mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}model ${WORK_DIR}ip # Download information on model. -PARAM_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 sync s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX} ${WORK_DIR} --exclude "*" --include "training_params*" --no-progress | awk '{print $4}' | xargs readlink -f 2> /dev/null) -if [ -n "$PARAM_FILE" ]; -then - TARGET_METADATA_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/"$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' $PARAM_FILE | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") - TARGET_REWARD_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/"$(awk '/REWARD_FILE_S3_KEY/ {print $2}' $PARAM_FILE | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") - TARGET_METRICS_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/"$(awk '/METRICS_S3_OBJECT_KEY/ {print $2}' $PARAM_FILE | sed "s/^\([\"']\)\(.*\)\1\$/\2/g") - TARGET_HYPERPARAM_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ip/hyperparameters.json" - MODEL_NAME=$(awk '/MODEL_METADATA_FILE_S3_KEY/ {print $2}' $PARAM_FILE | awk '{split($0,a,"/"); print a[2] }') - echo "Detected DeepRacer Model ${MODEL_NAME} at s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/." -else - echo "No DeepRacer information found in s3://${DR_UPLOAD_S3_BUCKET}/${DR_UPLOAD_S3_PREFIX}. 
Exiting" - exit 1 -fi +TARGET_REWARD_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/reward_function.py" +TARGET_HYPERPARAM_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ip/hyperparameters.json" # Check if metadata-files are available REWARD_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_REWARD} ${WORK_DIR} --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) METADATA_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/model_metadata.json ${WORK_DIR} --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f 2> /dev/null) HYPERPARAM_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/ip/hyperparameters.json ${WORK_DIR} --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink -f 2> /dev/null) -METRICS_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_METRICS} ${WORK_DIR} --no-progress | awk '/metric/ {print $4}'| xargs readlink -f 2> /dev/null) +# METRICS_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_METRICS} ${WORK_DIR} --no-progress | awk '/metric/ {print $4}'| xargs readlink -f 2> /dev/null) -if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$METRICS_FILE" ] && [ -n "$HYPERPARAM_FILE" ]; +if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$HYPERPARAM_FILE" ]; then echo "All meta-data files found. Looking for checkpoint." - # SOURCE_METADATA_FILE_S3_KEY="s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/reward.py" - # SOURCE_REWARD_FILE="s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/model_metadata.json" - # SOURCE_METRICS_FILE="s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_CONFIG}/metrics/metric.json" else echo "Meta-data files are not found. Exiting." 
exit 1 @@ -136,7 +121,8 @@ fi if [ -n "$CHECKPOINT" ]; then CHECKPOINT_MODEL_FILES=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 sync s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ ${WORK_DIR}model/ --exclude "*" --include "${CHECKPOINT}*" --include "model_${CHECKPOINT}.pb" --include "deepracer_checkpoints.json" --no-progress | awk '{print $4}' | xargs readlink -f) cp ${METADATA_FILE} ${WORK_DIR}model/ - echo "model_checkpoint_path: \"${CHECKPOINT_FILE}\"" | tee ${WORK_DIR}model/checkpoint +# echo "model_checkpoint_path: \"${CHECKPOINT_FILE}\"" | tee ${WORK_DIR}model/checkpoint + echo ${CHECKPOINT_FILE} | tee ${WORK_DIR}model/.coach_checkpoint > /dev/null else echo "Checkpoint not found. Exiting." exit 1 @@ -145,7 +131,7 @@ fi # Upload files if [[ -z "${OPT_FORCE}" ]]; then - echo "Ready to upload model ${SOURCE_S3_MODEL_PREFIX} to ${MODEL_NAME} in s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/" + echo "Ready to upload model ${SOURCE_S3_MODEL_PREFIX} to s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/" read -r -p "Are you sure? [y/N] " response if [[ ! 
"$response" =~ ^([yY][eE][sS]|[yY])$ ]] then @@ -154,10 +140,10 @@ then fi fi -touch ${WORK_DIR}model/.ready +# echo "" > ${WORK_DIR}model/.ready +rm ${WORK_DIR}model/deepracer_checkpoints.json cd ${WORK_DIR} aws ${DR_UPLOAD_PROFILE} s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ ${OPT_DRYRUN} ${OPT_WIPE} aws ${DR_UPLOAD_PROFILE} s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} ${OPT_DRYRUN} -aws ${DR_UPLOAD_PROFILE} s3 cp ${METADATA_FILE} ${TARGET_METADATA_FILE_S3_KEY} ${OPT_DRYRUN} -aws ${DR_UPLOAD_PROFILE} s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} +# aws ${DR_UPLOAD_PROFILE} s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} aws ${DR_UPLOAD_PROFILE} s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} ${OPT_DRYRUN} From 789cb6a87342516bea55544d5d906e7ad37c603a Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Wed, 29 Jul 2020 08:52:56 +0200 Subject: [PATCH 110/428] Adding start pos offset to multi-worker --- defaults/template-worker.env | 1 + scripts/training/prepare-config.py | 1 + 2 files changed, 2 insertions(+) diff --git a/defaults/template-worker.env b/defaults/template-worker.env index 81a2d91c..11aabd5b 100644 --- a/defaults/template-worker.env +++ b/defaults/template-worker.env @@ -5,6 +5,7 @@ DR_ENABLE_DOMAIN_RANDOMIZATION=False DR_TRAIN_CHANGE_START_POSITION=True DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05 +DR_TRAIN_START_POSITION_OFFSET=0.0 DR_OA_NUMBER_OF_OBSTACLES=6 DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES=2.0 DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS=False diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index e606d8ba..0194bd34 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -136,6 +136,7 @@ config.update({'CHANGE_START_POSITION': os.environ.get('DR_TRAIN_CHANGE_START_POSITION')}) config.update({'ROUND_ROBIN_ADVANCE_DIST': 
os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST')}) config.update({'ENABLE_DOMAIN_RANDOMIZATION': os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION')}) + config.update({'START_POSITION_OFFSET': os.environ.get('DR_TRAIN_START_POSITION_OFFSET', '0.00')}) # Update Object Avoidance parameters if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE': From 98c80120289a94358edc73a9b16dfbbe0e759432 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 31 Jul 2020 15:18:46 +0000 Subject: [PATCH 111/428] Use return instead of exit when returning scripts --- bin/scripts_wrapper.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index bf43bfbd..633e8a64 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -108,13 +108,13 @@ function dr-logs-sagemaker { ((WAIT_TIME--)) if [ "$WAIT_TIME" -lt 1 ]; then echo "Sagemaker is not running." - exit 1 + return 1 fi SAGEMAKER_CONTAINER=$(dr-find-sagemaker) done else echo "Sagemaker is not running." - exit 1 + return 1 fi fi @@ -192,13 +192,13 @@ function dr-logs-robomaker { ((WAIT_TIME--)) if [ "$WAIT_TIME" -lt 1 ]; then echo "Robomaker #${OPT_REPLICA} is not running." - exit 1 + return 1 fi ROBOMAKER_CONTAINER=$(dr-find-robomaker -n ${OPT_REPLICA}) done else echo "Robomaker #${OPT_REPLICA} is not running." 
- exit 1 + return 1 fi fi From 17be07140ee48639b49284774038cf561e4820fa Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Tue, 21 Jul 2020 18:02:57 +0000 Subject: [PATCH 112/428] Create container dependency file --- bin/init.sh | 28 ++++++++++++++++++++++------ defaults/dependencies.json | 5 +++++ defaults/template-system.env | 2 +- 3 files changed, 28 insertions(+), 7 deletions(-) create mode 100644 defaults/dependencies.json diff --git a/bin/init.sh b/bin/init.sh index cfc1c4e2..79863c7d 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -113,8 +113,6 @@ elif [[ -n "${CPU_INTEL}" ]]; then else SAGEMAKER_TAG="cpu" fi -sed -i "s//$SAGEMAKER_TAG/g" $INSTALL_DIR/system.env -sed -i "s//$CPU_LEVEL/g" $INSTALL_DIR/system.env #set proxys if required for arg in "$@"; @@ -127,10 +125,28 @@ do done # Download docker images. Change to build statements if locally built images are desired. -docker pull larsll/deepracer-rlcoach:v2.3 -docker pull awsdeepracercommunity/deepracer-robomaker:$CPU_LEVEL -docker pull awsdeepracercommunity/deepracer-sagemaker:$SAGEMAKER_TAG -docker pull larsll/deepracer-loganalysis:v2-cpu +COACH_VERSION=$(jq -r '.rl_coach | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json) +sed -i "s//$COACH_VERSION/g" $INSTALL_DIR/system.env + +ROBOMAKER_VERSION=$(jq -r '.robomaker | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json) +if [ -n $ROBOMAKER_VERSION ]; then + ROBOMAKER_VERSION=$ROBOMAKER_VERSION-$CPU_LEVEL +else + ROBOMAKER_VERSION=$CPU_LEVEL +fi +sed -i "s//$ROBOMAKER_VERSION/g" $INSTALL_DIR/system.env + +SAGEMAKER_VERSION=$(jq -r '.sagemaker | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json) +if [ -n $SAGEMAKER_VERSION ]; then + SAGEMAKER_VERSION=$SAGEMAKER_VERSION-$SAGEMAKER_TAG +else + SAGEMAKER_VERSION=$SAGEMAKER_TAG +fi +sed -i "s//$SAGEMAKER_VERSION/g" $INSTALL_DIR/system.env + +docker pull larsll/deepracer-rlcoach:$COACH_VERSION +docker pull awsdeepracercommunity/deepracer-robomaker:$ROBOMAKER_VERSION +docker pull 
awsdeepracercommunity/deepracer-sagemaker:$SAGEMAKER_VERSION # create the network sagemaker-local if it doesn't exit SAGEMAKER_NW='sagemaker-local' diff --git a/defaults/dependencies.json b/defaults/dependencies.json new file mode 100644 index 00000000..edda7941 --- /dev/null +++ b/defaults/dependencies.json @@ -0,0 +1,5 @@ +{ + "rl_coach":"v3.0-dev", + "robomaker":"3.0.1-dev", + "sagemaker":"3.0.1-dev" +} \ No newline at end of file diff --git a/defaults/template-system.env b/defaults/template-system.env index 88b5b672..27a92a05 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -8,7 +8,7 @@ DR_KINESIS_STREAM_NAME=None DR_KINESIS_STREAM_ENABLE=False DR_SAGEMAKER_IMAGE= DR_ROBOMAKER_IMAGE= -DR_COACH_IMAGE=v2.3 +DR_COACH_IMAGE= DR_WORKERS=1 DR_ROBOMAKER_MOUNT_LOGS=False DR_CLOUD_WATCH_ENABLE=False From 06523d86c8ddb7abe37617b7c02820b8260dde7b Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 23 Jul 2020 21:25:07 +0200 Subject: [PATCH 113/428] Fixing multi-config --- scripts/training/prepare-config.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index 0194bd34..9d502784 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -96,9 +96,10 @@ # Training with different configurations on each worker (aka Multi Config training) config['MULTI_CONFIG'] = os.environ.get('DR_TRAIN_MULTI_CONFIG', 'False') +num_workers = int(config['NUM_WORKERS']) -if config['MULTI_CONFIG'] == "True": - num_workers = int(os.environ.get('DR_WORKERS',1)) +if config['MULTI_CONFIG'] == "True" and num_workers > 0: + multi_config = {} multi_config['multi_config'] = [None] * num_workers @@ -153,6 +154,12 @@ config.update({'NUMBER_OF_OBSTACLES': str(len(object_positions))}) else: config.pop('OBJECT_POSITIONS',[]) + else: + config.pop('NUMBER_OF_OBSTACLES', None) + 
config.pop('MIN_DISTANCE_BETWEEN_OBSTACLES', None) + config.pop('RANDOMIZE_OBSTACLE_LOCATIONS', None) + config.pop('IS_OBSTACLE_BOT_CAR', None) + config.pop('OBJECT_POSITIONS',[]) # Update Head to Bot parameters if config['RACE_TYPE'] == 'HEAD_TO_BOT': @@ -164,11 +171,15 @@ config.update({'MIN_DISTANCE_BETWEEN_BOT_CARS': os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS')}) config.update({'RANDOMIZE_BOT_CAR_LOCATIONS': os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS')}) config.update({'BOT_CAR_SPEED': os.environ.get('DR_H2B_BOT_CAR_SPEED')}) - - # Clear bot cars and obstacles in case present from earlier worker - if config['RACE_TYPE'] == 'TIME_TRIAL': - config.update({'NUMBER_OF_BOT_CARS': '0'}) - config.update({'NUMBER_OF_OBSTACLES': '0'}) + else: + config.pop('IS_LANE_CHANGE', None) + config.pop('LOWER_LANE_CHANGE_TIME', None) + config.pop('UPPER_LANE_CHANGE_TIME', None) + config.pop('LANE_CHANGE_DISTANCE', None) + config.pop('NUMBER_OF_BOT_CARS', None) + config.pop('MIN_DISTANCE_BETWEEN_BOT_CARS', None) + config.pop('RANDOMIZE_BOT_CAR_LOCATIONS', None) + config.pop('BOT_CAR_SPEED', None) #split string s3_yaml_name, insert the worker number, and add back on the .yaml extension s3_yaml_name_list = s3_yaml_name.split('.') From 6d8c035344ec0c22517da9f39d868f51b7da8cf4 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 23 Jul 2020 21:29:46 +0200 Subject: [PATCH 114/428] Tune start-x script --- utils/start-xorg.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/start-xorg.sh b/utils/start-xorg.sh index a5abba04..f0f775be 100755 --- a/utils/start-xorg.sh +++ b/utils/start-xorg.sh @@ -1,5 +1,8 @@ #!/bin/bash export DISPLAY=:0 +touch ~/.Xauthority +export XAUTHORITY=~/.Xauthority + xinit /usr/bin/jwm & sleep 1 xrandr -s 1400x900 From fa6f2fe0166686fe2d331e0b3331ef7dad184341 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 23 Jul 2020 21:59:15 +0200 Subject: [PATCH 115/428] Fix setup-xorg for multi-GPU machine --- 
utils/setup-xorg.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/setup-xorg.sh b/utils/setup-xorg.sh index bdbd3111..7937e99b 100755 --- a/utils/setup-xorg.sh +++ b/utils/setup-xorg.sh @@ -8,7 +8,7 @@ sudo apt-get install xinit xserver-xorg-legacy x11-xserver-utils \ # Configure sudo sed -i -e "s/console/anybody/" /etc/X11/Xwrapper.config -BUS_ID=$(nvidia-xconfig --query-gpu-info | grep "PCI BusID" | cut -f2- -d: | sed -e 's/^[[:space:]]*//') +BUS_ID=$(nvidia-xconfig --query-gpu-info | grep "PCI BusID" | cut -f2- -d: | sed -e 's/^[[:space:]]*//' | head -1) sudo nvidia-xconfig --busid=$BUS_ID --enable-all-gpus -o /etc/X11/xorg.conf sudo tee -a /etc/X11/xorg.conf << EOF From 34643d89268ed77ad435a7c3790fad9e28930e25 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 26 Jul 2020 21:05:58 +0200 Subject: [PATCH 116/428] Remove need for shared memory --- docker/docker-compose-local-xorg.yml | 2 +- utils/Dockerfile.sagemaker-gpu | 4 ++-- utils/start-xorg.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/docker-compose-local-xorg.yml b/docker/docker-compose-local-xorg.yml index b4ba5663..af5ae9cf 100644 --- a/docker/docker-compose-local-xorg.yml +++ b/docker/docker-compose-local-xorg.yml @@ -2,11 +2,11 @@ version: '3.7' services: robomaker: - ipc: host environment: - DISPLAY - USE_EXTERNAL_X=${DR_HOST_X} - XAUTHORITY=/root/.Xauthority + - QT_X11_NO_MITSHM=1 volumes: - '/tmp/.X11-unix/:/tmp/.X11-unix' - '${XAUTHORITY}:/root/.Xauthority' \ No newline at end of file diff --git a/utils/Dockerfile.sagemaker-gpu b/utils/Dockerfile.sagemaker-gpu index fc864923..78df5b95 100644 --- a/utils/Dockerfile.sagemaker-gpu +++ b/utils/Dockerfile.sagemaker-gpu @@ -1,2 +1,2 @@ -FROM awsdeepracercommunity/deepracer-sagemaker:gpu -ENV CUDA_VISIBLE_DEVICES=1 \ No newline at end of file +FROM awsdeepracercommunity/deepracer-sagemaker:3.0.1-dev-gpu +ENV CUDA_VISIBLE_DEVICES=1 diff --git a/utils/start-xorg.sh 
b/utils/start-xorg.sh index f0f775be..6ea40766 100755 --- a/utils/start-xorg.sh +++ b/utils/start-xorg.sh @@ -3,7 +3,7 @@ export DISPLAY=:0 touch ~/.Xauthority export XAUTHORITY=~/.Xauthority -xinit /usr/bin/jwm & +nohup xinit /usr/bin/jwm & sleep 1 xrandr -s 1400x900 x11vnc -bg -forever -nopw -rfbport 5900 -display WAIT$DISPLAY & From 112cbcfd0a725aace421ddbe14d3bbb20e115914 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 26 Jul 2020 20:50:45 +0000 Subject: [PATCH 117/428] Tuning X --- utils/setup-xorg.sh | 2 +- utils/start-xorg.sh | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/utils/setup-xorg.sh b/utils/setup-xorg.sh index 7937e99b..91c146b0 100755 --- a/utils/setup-xorg.sh +++ b/utils/setup-xorg.sh @@ -3,7 +3,7 @@ # Script to install basic X-Windows on a headless instance (e.g. in EC2) # Install additional packages -sudo apt-get install xinit xserver-xorg-legacy x11-xserver-utils \ +sudo apt-get install xinit xserver-xorg-legacy x11-xserver-utils x11-utils \ menu mesa-utils xterm jwm x11vnc -y --no-install-recommends # Configure diff --git a/utils/start-xorg.sh b/utils/start-xorg.sh index 6ea40766..ab8deca9 100755 --- a/utils/start-xorg.sh +++ b/utils/start-xorg.sh @@ -1,10 +1,11 @@ #!/bin/bash export DISPLAY=:0 -touch ~/.Xauthority -export XAUTHORITY=~/.Xauthority nohup xinit /usr/bin/jwm & sleep 1 xrandr -s 1400x900 -x11vnc -bg -forever -nopw -rfbport 5900 -display WAIT$DISPLAY & +x11vnc -bg -forever -no6 -nopw -rfbport 5901 -rfbportv6 -1 -loop -display WAIT$DISPLAY & sleep 1 + +xauth generate $DISPLAY +export XAUTHORITY=~/.Xauthority From 864c88d53203f0058e3da24025119c036ccff8c7 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 26 Jul 2020 21:35:50 +0000 Subject: [PATCH 118/428] Do not track nohup.out --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 923271bc..9f40e1b9 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ recording 
DONE data/ tmp/ -autorun.s3url \ No newline at end of file +autorun.s3url +nohup.out From 87c5ed2079cdf2bc061706165f21e0d9355840ea Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Mon, 27 Jul 2020 06:26:12 +0000 Subject: [PATCH 119/428] Use label for Robomaker placement --- bin/init.sh | 1 + docker/docker-compose-training-swarm.yml | 2 ++ scripts/training/start.sh | 16 ++++++++++++++++ 3 files changed, 19 insertions(+) diff --git a/bin/init.sh b/bin/init.sh index 79863c7d..b5c8191b 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -153,6 +153,7 @@ SAGEMAKER_NW='sagemaker-local' docker swarm init SWARM_NODE=$(docker node inspect self | jq .[0].ID -r) docker node update --label-add Sagemaker=true $SWARM_NODE +docker node update --label-add Robomaker=true $SWARM_NODE docker network ls | grep -q $SAGEMAKER_NW if [ $? -ne 0 ] then diff --git a/docker/docker-compose-training-swarm.yml b/docker/docker-compose-training-swarm.yml index abff3912..57650970 100644 --- a/docker/docker-compose-training-swarm.yml +++ b/docker/docker-compose-training-swarm.yml @@ -12,5 +12,7 @@ services: restart_policy: condition: none replicas: ${DR_WORKERS} + placement: + constraints: [node.labels.Robomaker == true ] environment: - DOCKER_REPLICA_SLOT={{.Task.Slot}} \ No newline at end of file diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 582b5e34..5d648466 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -112,6 +112,22 @@ fi # Check if we will use Docker Swarm or Docker Compose if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then + ROBOMAKER_NODES=$(docker node ls --format '{{.ID}}' | xargs docker inspect | jq '.[] | select (.Spec.Labels.Robomaker == "true") | .ID' | wc -l) + if [[ "$ROBOMAKER_NODES" -eq 0 ]]; + then + echo "ERROR: No Swarm Nodes labelled for placement of Robomaker. Please add Robomaker node." 
+ echo " Example: docker node update --label-add Robomaker=true $(docker node inspect self | jq .[0].ID -r)" + exit 0 + fi + + SAGEMAKER_NODES=$(docker node ls --format '{{.ID}}' | xargs docker inspect | jq '.[] | select (.Spec.Labels.Sagemaker == "true") | .ID' | wc -l) + if [[ "$SAGEMAKER_NODES" -eq 0 ]]; + then + echo "ERROR: No Swarm Nodes labelled for placement of Sagemaker. Please add Sagemaker node." + echo " Example: docker node update --label-add Sagemaker=true $(docker node inspect self | jq .[0].ID -r)" + exit 0 + fi + docker stack deploy $COMPOSE_FILES $STACK_NAME else docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR up -d --scale robomaker=$DR_WORKERS From b3e863d2af7d465b7a6399eaa7500941ba01562f Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 30 Jul 2020 09:22:34 +0000 Subject: [PATCH 120/428] Populate variables for Host X --- scripts/training/start.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 5d648466..4d4db907 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -109,6 +109,14 @@ else echo "Creating Robomaker configuration in $S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE" fi +# Check if we are using Host X -- ensure variables are populated +if [[ "${DR_HOST_X,,}" == "true" ]]; +then + if [[ -z "$XAUTHORITY" ]]; then + export XAUTHORITY=~/.Xauthority + fi +fi + # Check if we will use Docker Swarm or Docker Compose if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then From c3f227d2fc91fd1202ad00505b0f48cbffc19a22 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Mon, 3 Aug 2020 19:54:46 +0200 Subject: [PATCH 121/428] Expose CUDA_VISIBLE_DEVICES to Evaluation --- docker/docker-compose-eval.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/docker-compose-eval.yml b/docker/docker-compose-eval.yml index 680dc21b..9bbe4fbe 100644 --- a/docker/docker-compose-eval.yml +++ b/docker/docker-compose-eval.yml @@ -16,6 +16,7 
@@ services: - "${DR_ROBOMAKER_PORT}:8080" environment: - DISPLAY_N=:0 + - CUDA_VISIBLE_DEVICES - WORLD_NAME=${DR_WORLD_NAME} - NUMBER_OF_TRIALS=${DR_NUMBER_OF_EPISODES} - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} From 54108acced7372d3ea474d88fc9c46cef071e065 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Mon, 3 Aug 2020 19:55:18 +0200 Subject: [PATCH 122/428] Add a debugging reward function. --- defaults/debug-reward_function.py | 60 +++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 defaults/debug-reward_function.py diff --git a/defaults/debug-reward_function.py b/defaults/debug-reward_function.py new file mode 100644 index 00000000..0b964c40 --- /dev/null +++ b/defaults/debug-reward_function.py @@ -0,0 +1,60 @@ +import math +import numpy +import rospy +import time + +class Reward: + + ''' + Debugging reward function to be used to track performance of local training. + Will print out the Real-Time-Factor (RTF), as well as how many + steps-per-second (sim-time) that the system is able to deliver. 
+ ''' + + def __init__(self, verbose=False, track_time=False): + self.verbose = verbose + self.track_time = track_time + + if track_time: + TIME_WINDOW=10 + self.time = numpy.zeros([TIME_WINDOW, 2]) + + if verbose: + print("Initializing Reward Class") + + def get_time(self): + + wall_time_incr = numpy.max(self.time[:,0]) - numpy.min(self.time[:,0]) + sim_time_incr = numpy.max(self.time[:,1]) - numpy.min(self.time[:,1]) + + rtf = sim_time_incr / wall_time_incr + fps = (self.time.shape[0] - 1) / sim_time_incr + + return rtf, fps + + def record_time(self, steps): + + index = int(steps) % self.time.shape[0] + self.time[index,0] = time.time() + self.time[index,1] = rospy.get_time() + + def reward_function(self, params): + + # Read input parameters + steps = params["steps"] + + if self.track_time: + self.record_time(steps) + + if self.track_time: + if steps >= self.time.shape[0]: + rtf, fps = self.get_time() + print("TIME: s: {}, rtf: {}, fps:{}".format(int(steps), round(rtf, 2), round(fps, 2) )) + + return 1.0 + + +reward_object = Reward(verbose=False, track_time=True) + +def reward_function(params): + return reward_object.reward_function(params) From d1c96086fe38b07b06530fd3bd7ce6fdceedf2b8 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 1 Aug 2020 22:01:02 +0200 Subject: [PATCH 123/428] Fix start-xorg.sh to allow multiple X servers --- utils/start-xorg.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/start-xorg.sh b/utils/start-xorg.sh index ab8deca9..50058088 100755 --- a/utils/start-xorg.sh +++ b/utils/start-xorg.sh @@ -1,7 +1,7 @@ #!/bin/bash export DISPLAY=:0 -nohup xinit /usr/bin/jwm & +nohup xinit /usr/bin/jwm -- $DISPLAY & sleep 1 xrandr -s 1400x900 x11vnc -bg -forever -no6 -nopw -rfbport 5901 -rfbportv6 -1 -loop -display WAIT$DISPLAY & From 73b920729fc2dc6c8a4873b101d196ff91113ad3 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Mon, 3 Aug 2020 20:15:00 +0200 
Subject: [PATCH 124/428] Added version checking --- bin/activate.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/bin/activate.sh b/bin/activate.sh index fa5d6ddb..66e884bb 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -1,4 +1,11 @@ #!/bin/bash + +DEPENDENCY_VERSION="3.0" + +verlte() { + [ "$1" = "`echo -e "$1\n$2" | sort -V | head -n1`" ] +} + function dr-update-env { if [[ -f "$DIR/system.env" ]] @@ -145,6 +152,22 @@ if [[ -n "${DR_MINIO_COMPOSE_FILE}" ]]; then fi +## Version check +SAGEMAKER_VER=$(docker inspect awsdeepracercommunity/deepracer-sagemaker:$DR_SAGEMAKER_IMAGE | jq -r .[].Config.Labels.version) +if ! verlte $DEPENDENCY_VERSION $SAGEMAKER_VER; then + echo "WARNING: Incompatible version of Deepracer Sagemaker. Expected >$DEPENDENCY_VERSION. Got $SAGEMAKER_VER" +fi + +ROBOMAKER_VER=$(docker inspect awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE | jq -r .[].Config.Labels.version) +if ! verlte $DEPENDENCY_VERSION $ROBOMAKER_VER; then + echo "WARNING: Incompatible version of Deepracer Robomaker. Expected >$DEPENDENCY_VERSION. Got $ROBOMAKER_VER" +fi + +COACH_VER=$(docker inspect larsll/deepracer-rlcoach:$DR_COACH_IMAGE | jq -r .[].Config.Labels.version) +if ! verlte $DEPENDENCY_VERSION $COACH_VER; then + echo "WARNING: Incompatible version of Deepracer-for-Cloud Coach. Expected >$DEPENDENCY_VERSION. 
Got $COACH_VER" +fi + source $SCRIPT_DIR/scripts_wrapper.sh function dr-update { From 48986a2b9bfcc647f3ffe6d1d606a2550ee27a26 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Tue, 4 Aug 2020 20:32:35 +0200 Subject: [PATCH 125/428] Bumping dependency for images to RC1 --- defaults/dependencies.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index edda7941..dffb4044 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,5 +1,5 @@ { - "rl_coach":"v3.0-dev", - "robomaker":"3.0.1-dev", - "sagemaker":"3.0.1-dev" -} \ No newline at end of file + "rl_coach":"v3.0-rc1", + "robomaker":"3.0.1-rc1", + "sagemaker":"3.0.1-rc1" +} From 6a8d56cb02d0afbb50398da3cce36497f69839ff Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Tue, 4 Aug 2020 20:37:00 +0200 Subject: [PATCH 126/428] Updated GL documentation --- docs/opengl.md | 10 ++++++---- utils/Dockerfile.sagemaker-gpu | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/opengl.md b/docs/opengl.md index 78f8b3d6..c2c79966 100644 --- a/docs/opengl.md +++ b/docs/opengl.md @@ -9,11 +9,12 @@ On a Ubuntu desktop running Unity there are hardly any additional steps required * Ensure that a recent Nvidia driver is installed and is running. * Ensure that nvidia-docker is installed; review `bin/prepare.sh` for steps if you do not want to directly run the script. * Configure DRfC using the following settings in `system.env`: - * `DR_DOCKER_STYLE=compose`; Docker Swarm does not support the `ipc=host` option required for X-Windows to work properly. * `DR_HOST_X=True`; uses the local X server rather than starting one within the docker container. * `DR_ROBOMAKER_IMAGE`; choose the tag for an OpenGL enabled image - e.g. `cpu-gl-avx` for an image where Tensorflow will use CPU or `gpu-gl` for an image where also Tensorflow will use the GPU. 
-With recent Nvidia drivers you can comfirm that the setup is working by running `nvidia-smi` on the host and see that `gzserver` is listed as running on the GPU. Older drivers (e.g. 390 for NVS 315) may not support showing which processes are running on the GPU. +Before running `dr-start-training` ensure that environment variables `DISPLAY` and `XAUTHORITY` are defined. + +With recent Nvidia drivers you can confirm that the setup is working by running `nvidia-smi` on the host and see that `gzserver` is listed as running on the GPU. Older drivers (e.g. 390 for NVS 315) may not support showing which processes are running on the GPU. ## Headless Server @@ -22,10 +23,11 @@ Also a headless server with a GPU, e.g. an EC2 instance, or a local computer wit * Ensure that a Nvidia driver and nvidia-docker is installed; review `bin/prepare.sh` for steps if you do not want to directly run the script. * Setup an X-server on the host. `utils\setup-xorg.sh` is a basic installation script. * Configure DRfC using the following settings in `system.env`: - * `DR_DOCKER_STYLE=compose`; Docker Swarm does not support the `ipc=host` option required for X-Windows to work properly. * `DR_HOST_X=True`; uses the local X server rather than starting one within the docker container. * `DR_ROBOMAKER_IMAGE`; choose the tag for an OpenGL enabled image - e.g. `cpu-gl-avx` for an image where Tensorflow will use CPU or `gpu-gl` for an image where also Tensorflow will use the GPU. -Before training ensure that the server is running, including VNC if you want to connect. `utils\start-xorg.sh` is provided as sample. +Before training ensure that the server is running, including VNC if you want to connect. Ensure that environment variables `DISPLAY` and `XAUTHORITY` are defined. + +Basic start-up including creation of variables can be achieved with `source utils\start-xorg.sh`. 
With recent Nvidia drivers you can confirm that the setup is working by running `nvidia-smi` on the host and see that `gzserver` is listed as running on the GPU. diff --git a/utils/Dockerfile.sagemaker-gpu b/utils/Dockerfile.sagemaker-gpu index 78df5b95..e72da699 100644 --- a/utils/Dockerfile.sagemaker-gpu +++ b/utils/Dockerfile.sagemaker-gpu @@ -1,2 +1,2 @@ -FROM awsdeepracercommunity/deepracer-sagemaker:3.0.1-dev-gpu +FROM awsdeepracercommunity/deepracer-sagemaker:3.0.1-rc1-gpu ENV CUDA_VISIBLE_DEVICES=1 From d7a8cc391496df4ce7eeba3b510858f0a0e9ef5b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 5 Aug 2020 01:30:50 +0000 Subject: [PATCH 127/428] parameter documentation updates --- docs/reference.md | 77 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 20 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index 1763945f..db586847 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1,33 +1,70 @@ # Deepracer-for-Cloud Reference ## Environment Variables -The scripts assume that two files `systen.env` containing constant configuration values and `run.env` with run specific values is populated with the required values. Which values go into which file is not really important. +The scripts assume that two files `system.env` containing constant configuration values and `run.env` with run specific values is populated with the required values. Which values go into which file is not really important. | Variable | Description | |----------|-------------| -| `DR_CLOUD` | Can be `azure`, `aws` or `local`; determines how the storage will be configured.| -| `DR_WORLD_NAME` | Defines the track to be used.| -| `DR_NUMBER_OF_TRIALS` | Defines the number of trials in an evaluation session.| -| `DR_CHANGE_START_POSITION` | Determines if the racer shall round-robin the starting position during training sessions. 
(Recommended to be `True` for initial training.)| -| `DR_LOCAL_S3_PROFILE` | Name of AWS profile with credentials to be used. Stored in `~/.aws/credentials` unless AWS IAM Roles are used.| -| `DR_LOCAL_S3_BUCKET` | Name of S3 bucket which will be used during the session.| +| `DR_RUN_ID` | Used if you have multiple independent training jobs only a single DRfC instance. This is an advanced configuration and generally you should just leave this as the default `0`.| +| `DR_WORLD_NAME` | Defines the track to be used.| +| `DR_RACE_TYPE` | Valid options are `TIME_TRIAL`, `OBJECT_AVOIDANCE`, and `HEAD_TO_BOT`.| +| `DR_CAR_COLOR` | Valid options are `Black`, `Grey`, `Blue`, `Red`, `Orange`, `White`, and `Purple`.| +| `DR_CAR_NAME` | Display name of car; shows in Deepracer Console when uploading.| +| `DR_ENABLE_DOMAIN_RANDOMIZATION` | If `True`, this cycles through different environment colors and lighting each episode. This is typically used to make your model more robust and generalized instead of tightly aligned with the simulator| +| `DR_UPLOAD_S3_PREFIX` | Prefix of the target location. (Typically starts with `DeepRacer-SageMaker-RoboMaker-comm-`| +| `DR_EVAL_NUMBER_OF_TRIALS` | How many laps to complete for evaluation simulations.| +| `DR_EVAL_IS_CONTINUOUS` | If False, your evaluation trial will end if you car goes off track or is in a collision. If True, your car will take the penalty times as configured in those parameters, but continue evaluating the trial.| +| `DR_EVAL_OFF_TRACK_PENALTY` | Number of seconds penalty time added for an off track during evaluation. Only takes effect if `DR_EVAL_IS_CONTINUOUS` is set to True.| +| `DR_EVAL_COLLISION_PENALTY` | Number of seconds penalty time added for a collision during evaluation. Only takes effect if `DR_EVAL_IS_CONTINUOUS` is set to True.| +| `DR_EVAL_SAVE_MP4` | TODO | +| `DR_TRAIN_CHANGE_START_POSITION` | Determines if the racer shall round-robin the starting position during training sessions. 
(Recommended to be `True` for initial training.)| +| `DR_TRAIN_ALTERNATE_DRIVING_DIRECTION` | `True` or `False`. If `True`, the car will alternate driving between clockwise and counter-clockwise each episode.| +| `DR_TRAIN_START_POSITION_OFFSET` | Used to control where to start the training from on first episode.| +| `DR_TRAIN_ROUND_ROBIN_ADVANCE_DISTANCE` | How far to progress each episode in round robin. 0.05 is 5% of the track. Generally best to try and keep this to even numbers that match with your total number of episodes to allow for even distribution around the track. For example, if 20 episodes per iternation, .05 or .10 or .20 would be good.| +| `DR_TRAIN_MULTI_CONFIG` | `True` or `False`. This is used if you want to use different run.env configurations for each worker in a multi worker training run. See multi config documentation for more details on how to set this up.| +| `DR_TRAIN_MIN_EVAL_TRIALS` | The minimum number of evaluation trials run between each training iteration. Evaluations will continue as long as policy training is occuring and may be more than this number. 
This establishes the minimum, and is generally useful if you want to speed up training especially when using gpu sagemaker containers.| +| `DR_LOCAL_S3_PRETRAINED` | Determines if training or evaluation shall be based on the model created in a previous session, held in `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`, accessible by credentials held in profile `{DR_LOCAL_S3_PROFILE}`.| +| `DR_LOCAL_S3_PRETRAINED_PREFIX` | Prefix of pretrained model within S3 bucket.| | `DR_LOCAL_S3_MODEL_PREFIX` | Prefix of model within S3 bucket.| +| `DR_LOCAL_S3_BUCKET` | Name of S3 bucket which will be used during the session.| | `DR_LOCAL_S3_CUSTOM_FILES_PREFIX` | Prefix of configuration files within S3 bucket.| -| `DR_LOCAL_S3_PRETRAINED` | Determines if training or evaluation shall be based on the model created in a previous session, held in `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`, accessible by credentials held in profile `{DR_LOCAL_S3_PROFILE}`.| -| `DR_LOCAL_S3_PRETRAINED_PREFIX` | Prefix of pretrained model within S3 bucket.| -| `DR_LOCAL_S3_PARAMS_FILE` | YAML file path used to configure Robomaker relative to `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`.| +| `DR_LOCAL_S3_TRAINING_PARAMS_FILE` | Name of YAML file that holds parameters sent to robomaker container for configuration during training. Filename is relative to `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`.| +| `DR_LOCAL_S3_EVAL_PARAMS_FILE` | Name of YAML file that holds parameters sent to robomaker container for configuration during evaluations. 
Filename is relative to `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`.| +| `DR_LOCAL_S3_MODEL_METADATA_KEY` | Location where the `model_metadata.json` file is stored.| +| `DR_LOCAL_S3_HYPERPARAMETERS_KEY` | Location where the `hyperparameters.json` file is stored.| +| `DR_LOCAL_S3_REWARD_KEY` | Location where the `reward_function.py` file is stored.| +| `DR_LOCAL_S3_METRICS_PREFIX` | Location where the metrics will be stored.| +| `DR_OA_NUMBER_OF_OBSTACLES` | For Object Avoidance, the number of obstacles on the track.| +| `DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES` | Minimum distance in meters between obstacles.| +| `DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS` | If True, obstacle locations will randomly change after each episode.| +| `DR_OA_IS_OBSTACLE_BOT_CAR` | If True, obstacles will appear as a stationary car instead of a box.| +| `DR_OA_OBJECT_POSITIONS` | TODO.| +| `DR_H2B_IS_LANE_CHANGE` | If True, bot cars will change lanes based on configuration.| +| `DR_H2B_LOWER_LANE_CHANGE_TIME` | Minimum time in seconds before car will change lanes.| +| `DR_H2B_UPPER_LANE_CHANGE_TIME` | Maximum time in seconds before car will change langes.| +| `DR_H2B_LANE_CHANGE_DISTANCE` | Distance in meters how long it will take the car to change lanes.| +| `DR_H2B_NUMBER_OF_BOT_CARS` | Number of bot cars on the track.| +| `DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS` | Minimum distance between bot cars.| +| `DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS` | If True, bot car locations will randomly change after each episode.| +| `DR_H2B_BOT_CAR_SPEED` | How fast the bot cars go in meters per second.| +| `DR_CLOUD` | Can be `azure`, `aws` or `local`; determines how the storage will be configured.| +| `DR_AWS_APP_REGION` | (AWS only) Region for other AWS resources (e.g. Kinesis) | | `DR_UPLOAD_S3_PROFILE` | AWS Cli profile to be used that holds the 'real' S3 credentials needed to upload a model into AWS DeepRacer.| | `DR_UPLOAD_S3_BUCKET` | Name of the AWS DeepRacer bucket where models will be uploaded. 
(Typically starts with `aws-deepracer-`.)| -| `DR_UPLOAD_S3_PREFIX` | Prefix of the target location. (Typically starts with `DeepRacer-SageMaker-RoboMaker-comm-`| -| `DR_UPLOAD_MODEL_NAME` | Display name of model, not currently used; `dr-set-upload-model` sets it for readability purposes.| -| `DR_CAR_COLOR` | Color of car | -| `DR_CAR_NAME` | Display name of car; shows in Deepracer Console when uploading. | -| `DR_AWS_APP_REGION` | (AWS only) Region for other AWS resources (e.g. Kinesis) | -| `DR_KINESIS_STREAM_NAME` | Kinesis stream name | -| `DR_KINESIS_STREAM_ENABLE` | Enable or disable Kinesis Stream | -| `DR_GUI_ENABLE` | Enable or disable the Gazebo GUI in Robomaker | -| `DR_GPU_AVAILABLE` | Is GPU enabled? | -| `DR_DOCKER_IMAGE_TYPE` | `cpu` or `gpu`; docker images will be used based on this | +| `DR_LOCAL_S3_PROFILE` | Name of AWS profile with credentials to be used. Stored in `~/.aws/credentials` unless AWS IAM Roles are used.| +| `DR_GUI_ENABLE` | Enable or disable the Gazebo GUI in Robomaker | +| `DR_KINESIS_STREAM_NAME` | Kinesis stream name | +| `DR_KINESIS_STREAM_ENABLE` | Enable or disable Kinesis Stream | +| `DR_SAGEMAKER_IMAGE` | Determines which sagemaker image will be used for training.| +| `DR_ROBOMAKER_IMAGE` | Determines which robomaker image will be used for training or evaluation.| +| `DR_COACH_IMAGE` | Determines which coach image will be used for training.| +| `DR_WORKERS` | Number of Robomaker workers to be used for training. See additional documentation for more information about this feature.| +| `DR_ROBOMAKER_MOUNT_LOGS` | TODO.| +| `DR_CLOUD_WATCH_ENABLE` | Send log files to AWS CloudWatch.| +| `DR_DOCKER_STYLE` | Valid Options are `Swarm` and `Compose`. Use Compose for openGL optimized containers.| +| `DR_HOST_X` | TODO.| +| `CUDA_VISIBLE_DEVICES` | Used in multi-GPU configurations. 
See additional documentation for more information about this feature.| + ## Commands From 0badabcddea6bec9fc862097792020a522b47fbe Mon Sep 17 00:00:00 2001 From: Wiktor Gancarz Date: Sat, 8 Aug 2020 11:38:03 +0100 Subject: [PATCH 128/428] Additional information for upload command Added description on switches and a note for managing models --- docs/upload.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/upload.md b/docs/upload.md index f56ea9b4..598bf0b2 100644 --- a/docs/upload.md +++ b/docs/upload.md @@ -19,4 +19,17 @@ In `run.env` set the `DR_UPLOAD_S3_PREFIX` to any prefix of your choice. After configuring the system you can run `dr-upload-model`; it will copy out the required parts of `s3://DR_LOCAL_S3_BUCKET/DR_LOCAL_S3_PREFIX` into `s3://DR_UPLOAD_S3_BUCKET/DR_UPLOAD_S3_PREFIX`. -Once uploaded you can use the [Import model](https://console.aws.amazon.com/deepracer/home?region=us-east-1#models/importModel) feature of the AWS DeepRacer console to load the model into the model store. \ No newline at end of file +Once uploaded you can use the [Import model](https://console.aws.amazon.com/deepracer/home?region=us-east-1#models/importModel) feature of the AWS DeepRacer console to load the model into the model store. + +## Things to know + +### Upload switches +There are several useful switches to the upload command: + * f - this will force upload, no confirmation question if you want to proceed with upload + * w - wipes the target AWS DeepRacer model structure before upload in the designated bucket/prefix + * d - dry-Run mode, does not perform any write or delete operatios on target + * b - uploads best checkpoint instead of default which is last checkpoint + * p prefix - uploads model into specified S3 prefix + +### Managing your models +You should decide how you're going to manage your models. 
Upload to AWS does not preserve all the files created locally so if you delete your local files you will find it hard to go back to a previous model and resume training. From 0d3e9d75ea1da9f53f7f1d3b2d5081b9700520c6 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sat, 20 Jun 2020 20:34:36 +0000 Subject: [PATCH 129/428] Further docker changes --- docs/docker.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/docker.md b/docs/docker.md index 2986d20a..3ace2737 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -20,6 +20,7 @@ DRfC is installed only on the manager. (The first installed host.) Swarm workers * The Sagemaker container can only be run on the manager. * OpenGL is not supported as Swarm does not support `ipc=host`. +* Docker images are downloaded from Docker Hub. Locally built images are allowed only if they have a unique name. If you have multiple Docker nodes ensure that they all have the image available. ### Connecting Workers From c7c92038421987fa7bf22672734591ac2e011e5b Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 8 Aug 2020 17:40:09 +0200 Subject: [PATCH 130/428] Update docker.md --- docs/docker.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/docker.md b/docs/docker.md index 3ace2737..78a42218 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -19,8 +19,7 @@ DRfC is installed only on the manager. (The first installed host.) Swarm workers ### Limitations * The Sagemaker container can only be run on the manager. -* OpenGL is not supported as Swarm does not support `ipc=host`. -* Docker images are downloaded from Docker Hub. Locally built images are allowed only if they have a unique name. If you have multiple Docker nodes ensure that they all have the image available. +* Docker images are downloaded from Docker Hub. Locally built images are allowed only if they have a unique tag, not in Docker Hub. 
If you have multiple Docker nodes ensure that they all have the image available. ### Connecting Workers @@ -47,4 +46,4 @@ In Compose mode DRfC creates Services, using `docker-compose`. During operations ### Ports -In the case of using Docker Compose the different Robomaker worker will require unique ports for ROS Web Vew and VNC. Docker will assign these dynamically. Use `docker ps` to see which container has been assigned which ports. \ No newline at end of file +In the case of using Docker Compose the different Robomaker worker will require unique ports for ROS Web Vew and VNC. Docker will assign these dynamically. Use `docker ps` to see which container has been assigned which ports. From 163658b8e140136fdf1299dc43f1503993c1a391 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sat, 8 Aug 2020 18:07:22 +0200 Subject: [PATCH 131/428] Changed version logic. --- bin/activate.sh | 4 ++-- bin/init.sh | 6 +++--- defaults/dependencies.json | 11 +++++++---- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 66e884bb..61ca3352 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -1,7 +1,5 @@ #!/bin/bash -DEPENDENCY_VERSION="3.0" - verlte() { [ "$1" = "`echo -e "$1\n$2" | sort -V | head -n1`" ] } @@ -153,6 +151,8 @@ if [[ -n "${DR_MINIO_COMPOSE_FILE}" ]]; then fi ## Version check +DEPENDENCY_VERSION=$(jq -r '.master_version | select (.!=null)' $DIR/defaults/dependencies.json) + SAGEMAKER_VER=$(docker inspect awsdeepracercommunity/deepracer-sagemaker:$DR_SAGEMAKER_IMAGE | jq -r .[].Config.Labels.version) if ! verlte $DEPENDENCY_VERSION $SAGEMAKER_VER; then echo "WARNING: Incompatible version of Deepracer Sagemaker. Expected >$DEPENDENCY_VERSION. Got $SAGEMAKER_VER" diff --git a/bin/init.sh b/bin/init.sh index b5c8191b..38c2c682 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -125,10 +125,10 @@ do done # Download docker images. Change to build statements if locally built images are desired. 
-COACH_VERSION=$(jq -r '.rl_coach | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json) +COACH_VERSION=$(jq -r '.containers.rl_coach | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json) sed -i "s//$COACH_VERSION/g" $INSTALL_DIR/system.env -ROBOMAKER_VERSION=$(jq -r '.robomaker | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json) +ROBOMAKER_VERSION=$(jq -r '.containers.robomaker | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json) if [ -n $ROBOMAKER_VERSION ]; then ROBOMAKER_VERSION=$ROBOMAKER_VERSION-$CPU_LEVEL else @@ -136,7 +136,7 @@ else fi sed -i "s//$ROBOMAKER_VERSION/g" $INSTALL_DIR/system.env -SAGEMAKER_VERSION=$(jq -r '.sagemaker | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json) +SAGEMAKER_VERSION=$(jq -r '.containers.sagemaker | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json) if [ -n $SAGEMAKER_VERSION ]; then SAGEMAKER_VERSION=$SAGEMAKER_VERSION-$SAGEMAKER_TAG else diff --git a/defaults/dependencies.json b/defaults/dependencies.json index dffb4044..a5b61d0f 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,5 +1,8 @@ { - "rl_coach":"v3.0-rc1", - "robomaker":"3.0.1-rc1", - "sagemaker":"3.0.1-rc1" -} + "master_version": "3.0", + "containers": { + "rl_coach": "v3.0-rc1", + "robomaker": "3.0.1-rc1", + "sagemaker": "3.0.1-rc1" + } +} \ No newline at end of file From c096eff3ed563ab7ee516c5de089d8adb3f4c688 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 9 Aug 2020 20:09:43 +0000 Subject: [PATCH 132/428] Preparing release of 3.0 --- defaults/dependencies.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index a5b61d0f..2053e839 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,8 +1,8 @@ { "master_version": "3.0", "containers": { - "rl_coach": "v3.0-rc1", - "robomaker": "3.0.1-rc1", - "sagemaker": "3.0.1-rc1" + "rl_coach": "v3.0", + "robomaker": "3.0.1", 
+ "sagemaker": "3.0.1" } -} \ No newline at end of file +} From fcaac853717aa0fd31f4d0b5b9e353e4d553a648 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 23 Aug 2020 16:50:16 +0000 Subject: [PATCH 133/428] Fix issue #72 --- docs/installation.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/installation.md b/docs/installation.md index ff60fc34..cebeb4cd 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -74,6 +74,7 @@ To use IAM Roles: * Configure `run.env` as follows: * `DR_LOCAL_S3_PROFILE=default` * `DR_LOCAL_S3_BUCKET=` +* Configure `system.env` as follows: * `DR_UPLOAD_S3_PROFILE=default` * `DR_UPLOAD_S3_BUCKET=` * Run `dr-update` for configuration to take effect. @@ -87,6 +88,7 @@ For access with IAM user: * Configure `run.env` as follows: * `DR_LOCAL_S3_PROFILE=default` * `DR_LOCAL_S3_BUCKET=` +* Configure `system.env` as follows: * `DR_UPLOAD_S3_PROFILE=default` * `DR_UPLOAD_S3_BUCKET=` * Run `dr-update` for configuration to take effect. @@ -104,6 +106,7 @@ In Azure mode the script-set requires the following: * Configure `run.env` as follows: * `DR_LOCAL_S3_PROFILE=` * `DR_LOCAL_S3_BUCKET=` +* Configure `system.env` as follows: * `DR_UPLOAD_S3_PROFILE=default` * `DR_UPLOAD_S3_BUCKET=` * Run `dr-update` for configuration to take effect. @@ -122,6 +125,7 @@ In Local mode the script-set requires the following: * Configure `run.env` as follows: * `DR_LOCAL_S3_PROFILE=minio` * `DR_LOCAL_S3_BUCKET=bucket` +* Configure `system.env` as follows: * `DR_UPLOAD_S3_PROFILE=default` * `DR_UPLOAD_S3_BUCKET=` * Run `dr-update` for configuration to take effect. 
From 43089a1d024c3d6d1b55dd54fa28517b1a0e7506 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 31 Aug 2020 02:49:03 +0000 Subject: [PATCH 134/428] create tar.gz file directly Co-authored-by: dartjason --- bin/scripts_wrapper.sh | 4 ++++ docs/upload.md | 5 +++++ scripts/upload/upload-car.sh | 30 ++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+) create mode 100755 scripts/upload/upload-car.sh diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index 633e8a64..8982ea37 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -19,6 +19,10 @@ function dr-upload-model { dr-update-env && ${DIR}/scripts/upload/upload-model.sh "$@" } +function dr-upload-car-zip { + dr-update-env && ${DIR}/scripts/upload/upload-car.sh "$@" +} + function dr-list-aws-models { echo "Due to changes in AWS DeepRacer Console this command is no longer available." } diff --git a/docs/upload.md b/docs/upload.md index 598bf0b2..62730710 100644 --- a/docs/upload.md +++ b/docs/upload.md @@ -33,3 +33,8 @@ There are several useful switches to the upload command: ### Managing your models You should decide how you're going to manage your models. Upload to AWS does not preserve all the files created locally so if you delete your local files you will find it hard to go back to a previous model and resume training. + + +### Create file formatted for physical car, and upload to S3 +You can also create the file in the format necessary to run on the physical car directly from DRfC, without going through the AWS console. +This is executed by running 'dr-upload-car-zip'; it will copy files out of the running sagemaker container, format them into the proper .tar.gz file, and upload that file to `s3://DR_LOCAL_S3_BUCKET/DR_LOCAL_S3_PREFIX`. One of the limitations of this approach is that it only uses the latest checkpoint, and does not have the option to use the "best" checkpoint, or an earlier checkpoint. 
Another limitation is that the sagemaker container must be running at the time this command is executed. diff --git a/scripts/upload/upload-car.sh b/scripts/upload/upload-car.sh new file mode 100755 index 00000000..2e575f73 --- /dev/null +++ b/scripts/upload/upload-car.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# This script creates the tar.gz file necessary to operate inside a deepracer physical car +# The file is created directly from within the sagemaker container, using the most recent checkpoint + +# Find name of sagemaker container +SAGEMAKER_CONTAINERS=$(docker ps | awk ' /sagemaker/ { print $1 } '| xargs ) +if [[ -n $SAGEMAKER_CONTAINERS ]]; +then + for CONTAINER in $SAGEMAKER_CONTAINERS; do + CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER) + CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $1') + echo $CONTAINER_NAME + done +fi + +#create tmp directory if it doesnt already exit +mkdir -p $DR_DIR/tmp/car_upload +cd $DR_DIR/tmp/car_upload +#ensure directory is empty +rm -r $DR_DIR/tmp/car_upload/* +#The files we want are located inside the sagemaker container at /opt/ml/model. 
Copy them to the tmp directory +docker cp $CONTAINER_NAME:/opt/ml/model $DR_DIR/tmp/car_upload +cd $DR_DIR/tmp/car_upload/model +#create a tar.gz file containing all of these files +tar -czvf carfile.tar.gz * + +#upload to s3 +aws ${DR_UPLOAD_PROFILE} s3 cp carfile.tar.gz s3://${DR_UPLOAD_S3_BUCKET}/${DR_UPLOAD_S3_PREFIX}/carfile.tar.gz + From 28b259d277207aea49fd1721447658bef821ebe0 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Tue, 1 Sep 2020 21:08:22 +0000 Subject: [PATCH 135/428] Initial fix for #77 --- scripts/upload/upload-model.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index 93be02c0..1782003a 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -105,15 +105,19 @@ if [ -n "$OPT_CHECKPOINT_NUM" ]; then export OPT_CHECKPOINT_NUM CHECKPOINT_FILE=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ | perl -ne'print "$1\n" if /.*\s($ENV{OPT_CHECKPOINT_NUM}_Step-[0-9]{1,7}\.ckpt)\.index/') CHECKPOINT=`echo $CHECKPOINT_FILE | cut -f1 -d_` + CHECKPOINT_JSON_PART=$(jq -n '{ checkpoint: { name: $name, time_stamp: $time | tonumber, avg_comp_pct: 50.0 } }' --arg name $CHECKPOINT_NAME --arg time `date +%s`) + CHECKPOINT_JSON=$(echo $CHECKPOINT_JSON_PART | jq '. | {last_checkpoint: .checkpoint, best_checkpoint: .checkpoint}') elif [ -z "$OPT_CHECKPOINT" ]; then echo "Checking for latest tested checkpoint" CHECKPOINT_FILE=`jq -r .last_checkpoint.name < $CHECKPOINT_INDEX` CHECKPOINT=`echo $CHECKPOINT_FILE | cut -f1 -d_` + CHECKPOINT_JSON=$(jq '. | {last_checkpoint: .last_checkpoint, best_checkpoint: .last_checkpoint}' < $CHECKPOINT_INDEX ) echo "Latest checkpoint = $CHECKPOINT" else echo "Checking for best checkpoint" CHECKPOINT_FILE=`jq -r .best_checkpoint.name < $CHECKPOINT_INDEX` CHECKPOINT=`echo $CHECKPOINT_FILE | cut -f1 -d_` + CHECKPOINT_JSON=$(jq '. 
| {last_checkpoint: .best_checkpoint, best_checkpoint: .best_checkpoint}' < $CHECKPOINT_INDEX ) echo "Best checkpoint: $CHECKPOINT" fi @@ -141,8 +145,8 @@ then fi # echo "" > ${WORK_DIR}model/.ready -rm ${WORK_DIR}model/deepracer_checkpoints.json cd ${WORK_DIR} +echo ${CHECKPOINT_JSON} > ${WORK_DIR}model/deepracer_checkpoints.json aws ${DR_UPLOAD_PROFILE} s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ ${OPT_DRYRUN} ${OPT_WIPE} aws ${DR_UPLOAD_PROFILE} s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} ${OPT_DRYRUN} # aws ${DR_UPLOAD_PROFILE} s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} From 65b4b2996ecc194b4ac1e5f341ac1122077c7f54 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Wed, 2 Sep 2020 07:33:26 +0000 Subject: [PATCH 136/428] Bugfix --- scripts/upload/upload-model.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index 1782003a..3958d5c0 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -105,7 +105,8 @@ if [ -n "$OPT_CHECKPOINT_NUM" ]; then export OPT_CHECKPOINT_NUM CHECKPOINT_FILE=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ | perl -ne'print "$1\n" if /.*\s($ENV{OPT_CHECKPOINT_NUM}_Step-[0-9]{1,7}\.ckpt)\.index/') CHECKPOINT=`echo $CHECKPOINT_FILE | cut -f1 -d_` - CHECKPOINT_JSON_PART=$(jq -n '{ checkpoint: { name: $name, time_stamp: $time | tonumber, avg_comp_pct: 50.0 } }' --arg name $CHECKPOINT_NAME --arg time `date +%s`) + TIMESTAMP=`date +%s` + CHECKPOINT_JSON_PART=$(jq -n '{ checkpoint: { name: $name, time_stamp: $timestamp | tonumber, avg_comp_pct: 50.0 } }' --arg name $CHECKPOINT_FILE --arg timestamp $TIMESTAMP) CHECKPOINT_JSON=$(echo $CHECKPOINT_JSON_PART | jq '. 
| {last_checkpoint: .checkpoint, best_checkpoint: .checkpoint}') elif [ -z "$OPT_CHECKPOINT" ]; then echo "Checking for latest tested checkpoint" From 9ec24fd93ba551b82d76f197e7ebd3659140ff11 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Tue, 15 Sep 2020 09:04:04 +0200 Subject: [PATCH 137/428] Bump Robomaker to 3.0.2 --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 2053e839..b9bf90d2 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "3.0", "containers": { "rl_coach": "v3.0", - "robomaker": "3.0.1", + "robomaker": "3.0.2", "sagemaker": "3.0.1" } } From f9c0d273ce88317d6d496453a1d84ef54f64a197 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 24 Sep 2020 19:55:49 +0000 Subject: [PATCH 138/428] Windows documentation --- docs/index.md | 1 + docs/installation.md | 3 ++- docs/windows.md | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 docs/windows.md diff --git a/docs/index.md b/docs/index.md index 1795ebfc..9da85e8c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -49,6 +49,7 @@ DRfC supports a wide set of features to ensure that you can focus on creating th * [Running multiple parallel experiments](multi_run.md) * [GPU Accelerated OpenGL for Robomaker](opengl.md) * [Having multiple GPUs in one Computer](multi_gpu.md) +* [Installing on Windows](windows.md) # Support diff --git a/docs/installation.md b/docs/installation.md index cebeb4cd..7171126b 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -24,10 +24,11 @@ Depending on your needs as well as specific needs of the cloud platform you can **Local**: * A modern, comparatively powerful, Intel based system. - * Ubuntu 18.04 or 20.04 - Windows not supported, other Linux-dristros likely to work. 
+ * Ubuntu 18.04 or 20.04, other Linux-dristros likely to work. * 4 core-CPU, equivalent to 8 vCPUs; the more the better. * NVIDIA Graphics adapter with minimum 8 GB RAM for Sagemaker to run GPU. Robomaker enabled GPU instances need ~1 GB each. * System RAM + GPU RAM should be at least 32 GB. +* Running DRfC Ubuntu 20.04 on Windows using Windows Subsystem for Linux 2 is possible. See [Installing on Windows](windows.md) ## Installation diff --git a/docs/windows.md b/docs/windows.md new file mode 100644 index 00000000..bbca8289 --- /dev/null +++ b/docs/windows.md @@ -0,0 +1,36 @@ +# Installing on Windows + +## Prerequisites + +The basic installation steps to get a NVIDIA GPU / CUDA enabled Ubuntu subsystem on Windows can be found in the [Cuda on WSL User Guide](https://docs.nvidia.com/cuda/wsl-user-guide/index.html). + +The further instructions assume that you have a working Nvidia enabled Docker. + +## Additional steps + +The `bin/prepare.sh` will not work for a Ubuntu WSL installation, hence additional steps will be required. + +### Adding required packages + +Install the additional packages with the following command: + +``` +sudo apt-get install jq awscli python3-boto3 docker-compose +``` + +### Configure Docker + +To ensure we always have a GPU enabled Docker container, run: +``` +cat /etc/docker/daemon.json | jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' | sudo tee /etc/docker/daemon.json +sudo usermod -a -G docker $(id -un) +``` + +### Install DRfC + +You can now run `bin/init.sh -a gpu -c local` to setup DRfC. + +## Known Issues + +* `init.sh` is not able to detect the GPU given differences in the Nvidia drivers, and the WSL2 Linux Kernel. You need to manually set the GPU image in `system.env`. +* Docker does not start automatically when you launch Ubuntu. Start it with `sudo service docker start`. 
\ No newline at end of file From 6bb9a330542d42cf754c4ba0f7c155a6125081e7 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 24 Sep 2020 20:27:23 +0000 Subject: [PATCH 139/428] Fixing bug #79 --- scripts/upload/upload-model.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index 3958d5c0..f3ab1958 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -124,7 +124,12 @@ fi # Find checkpoint & model files - download if [ -n "$CHECKPOINT" ]; then - CHECKPOINT_MODEL_FILES=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 sync s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ ${WORK_DIR}model/ --exclude "*" --include "${CHECKPOINT}*" --include "model_${CHECKPOINT}.pb" --include "deepracer_checkpoints.json" --no-progress | awk '{print $4}' | xargs readlink -f) + CHECKPOINT_MODEL_FILES=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 sync s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ ${WORK_DIR}model/ --exclude "*" --include "${CHECKPOINT}*" --include "model_${CHECKPOINT}.pb" --include "deepracer_checkpoints.json" --no-progress | awk '{print $4}' | xargs readlink -f 2> /dev/null) + CHECKPOINT_MODEL_FILE_COUNT=$(echo $CHECKPOINT_MODEL_FILES | wc -l) + if [ "$CHECKPOINT_MODEL_FILE_COUNT" -eq 0 ]; then + echo "No model files found. Files possibly deleted. Try again." 
+ exit 1 + fi cp ${METADATA_FILE} ${WORK_DIR}model/ # echo "model_checkpoint_path: \"${CHECKPOINT_FILE}\"" | tee ${WORK_DIR}model/checkpoint echo ${CHECKPOINT_FILE} | tee ${WORK_DIR}model/.coach_checkpoint > /dev/null From e6d035588a1d7119c4807383bcc9c0ec9bc7d658 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 24 Sep 2020 20:45:22 +0000 Subject: [PATCH 140/428] Delete sagemaker-local network if it exists --- bin/init.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/init.sh b/bin/init.sh index 38c2c682..7c4f11b1 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -158,6 +158,9 @@ docker network ls | grep -q $SAGEMAKER_NW if [ $? -ne 0 ] then docker network create $SAGEMAKER_NW -d overlay --attachable --scope swarm +else + docker network rm $SAGEMAKER_NW + docker network create $SAGEMAKER_NW -d overlay --attachable --scope swarm fi # ensure our variables are set on startup From 1edb3480e5506c6011fcad71e6b0e31bf95e6646 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sat, 26 Sep 2020 12:57:29 +0200 Subject: [PATCH 141/428] Reduce time-window for logs to 5m without -a flag --- bin/scripts_wrapper.sh | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index 8982ea37..30bcc2cd 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -89,11 +89,14 @@ function dr-stop-loganalysis { function dr-logs-sagemaker { local OPTIND + OPT_TIME="--since 5m" while getopts ":w:" opt; do case $opt in w) OPT_WAIT=$OPTARG ;; + a) OPT_TIME="" + ;; \?) 
echo "Invalid option -$OPTARG" >&2 ;; esac @@ -126,18 +129,18 @@ function dr-logs-sagemaker { then if [ -x "$(command -v gnome-terminal)" ]; then - gnome-terminal --tab --title "DR-${DR_RUN_ID}: Sagemaker - ${SAGEMAKER_CONTAINER}" -- /usr/bin/bash -c "!!; docker logs -f ${SAGEMAKER_CONTAINER}" 2> /dev/null + gnome-terminal --tab --title "DR-${DR_RUN_ID}: Sagemaker - ${SAGEMAKER_CONTAINER}" -- /usr/bin/bash -c "docker logs $OPT_TIME -f ${SAGEMAKER_CONTAINER}" 2> /dev/null echo "Sagemaker container $SAGEMAKER_CONTAINER logs opened in separate gnome-terminal. " elif [ -x "$(command -v x-terminal-emulator)" ]; then - x-terminal-emulator -e /bin/sh -c "!!; docker logs -f ${SAGEMAKER_CONTAINER}" 2> /dev/null + x-terminal-emulator -e /bin/sh -c "docker logs $OPT_TIME -f ${SAGEMAKER_CONTAINER}" 2> /dev/null echo "Sagemaker container $SAGEMAKER_CONTAINER logs opened in separate terminal. " else echo 'Could not find a defined x-terminal-emulator. Displaying inline.' - docker logs -f $SAGEMAKER_CONTAINER + docker logs $OPT_TIME -f $SAGEMAKER_CONTAINER fi else - docker logs -f $SAGEMAKER_CONTAINER + docker logs $OPT_TIME -f $SAGEMAKER_CONTAINER fi } @@ -169,8 +172,9 @@ function dr-logs-robomaker { OPT_REPLICA=1 local OPTIND + OPT_TIME="--since 5m" - while getopts ":w:n:e" opt; do + while getopts ":w:n:ea" opt; do case $opt in w) OPT_WAIT=$OPTARG ;; @@ -178,6 +182,8 @@ function dr-logs-robomaker { ;; e) OPT_EVAL="-e" ;; + a) OPT_TIME="" + ;; \?) 
echo "Invalid option -$OPTARG" >&2 ;; esac @@ -210,18 +216,18 @@ function dr-logs-robomaker { then if [ -x "$(command -v gnome-terminal)" ]; then - gnome-terminal --tab --title "DR-${DR_RUN_ID}: Robomaker #${OPT_REPLICA} - ${ROBOMAKER_CONTAINER}" -- /usr/bin/bash -c "!!; docker logs -f ${ROBOMAKER_CONTAINER}" 2> /dev/null + gnome-terminal --tab --title "DR-${DR_RUN_ID}: Robomaker #${OPT_REPLICA} - ${ROBOMAKER_CONTAINER}" -- /usr/bin/bash -c "docker logs $OPT_TIME -f ${ROBOMAKER_CONTAINER}" 2> /dev/null echo "Robomaker #${OPT_REPLICA} ($ROBOMAKER_CONTAINER) logs opened in separate gnome-terminal. " elif [ -x "$(command -v x-terminal-emulator)" ]; then - x-terminal-emulator -e /bin/sh -c "!!; docker logs -f ${ROBOMAKER_CONTAINER}" 2> /dev/null + x-terminal-emulator -e /bin/sh -c "docker logs $OPT_TIME -f ${ROBOMAKER_CONTAINER}" 2> /dev/null echo "Robomaker #${OPT_REPLICA} ($ROBOMAKER_CONTAINER) logs opened in separate terminal. " else echo 'Could not find a defined x-terminal-emulator. Displaying inline.' - docker logs -f $ROBOMAKER_CONTAINER + docker logs $OPT_TIME -f $ROBOMAKER_CONTAINER fi else - docker logs -f $ROBOMAKER_CONTAINER + docker logs $OPT_TIME -f $ROBOMAKER_CONTAINER fi } From 628ca4d49207d12c192a07567375b4907c8eee52 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sat, 26 Sep 2020 13:01:13 +0200 Subject: [PATCH 142/428] Change default arch to cpu-avx --- bin/init.sh | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/bin/init.sh b/bin/init.sh index 38c2c682..fe28c8b8 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -34,10 +34,7 @@ if [[ -z "$OPT_CLOUD" ]]; then fi # Find CPU Level -CPU_LEVEL="cpu" -if [[ "$(dmesg | grep AVX | wc -l)" > 0 ]]; then - CPU_LEVEL="cpu-avx" -fi +CPU_LEVEL="cpu-avx" if [[ "$(dmesg | grep AVX2 | wc -l)" > 0 ]]; then CPU_LEVEL="cpu-avx2" @@ -60,7 +57,7 @@ then if [ $? -ne 0 ] || [ $GPUS -eq 0 ] then echo "No GPU detected in docker. Using CPU". 
- OPT_ARCH="cpu" + OPT_ARCH="cpu-avx fi fi From 080265da7e96b3a1cabd92a0e18b7d8e6e95d9dc Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 26 Sep 2020 18:28:00 +0200 Subject: [PATCH 143/428] Fix typo --- bin/init.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/init.sh b/bin/init.sh index fe28c8b8..54edc4e4 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -57,7 +57,7 @@ then if [ $? -ne 0 ] || [ $GPUS -eq 0 ] then echo "No GPU detected in docker. Using CPU". - OPT_ARCH="cpu-avx + OPT_ARCH="cpu-avx" fi fi From 93eadb790edada11b7b92554509d94c45e4de077 Mon Sep 17 00:00:00 2001 From: breadcentric <31169082+breadcentric@users.noreply.github.com> Date: Mon, 28 Sep 2020 23:28:23 +0100 Subject: [PATCH 144/428] Add clarity for local setup --- docs/installation.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/installation.md b/docs/installation.md index 7171126b..32617230 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -35,13 +35,16 @@ Depending on your needs as well as specific needs of the cloud platform you can The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine. git clone https://github.com/larsll/deepracer-for-cloud.git + +**For cloud setup** execute: + cd deepracer-for-cloud && ./bin/prepare.sh This will prepare the VM by partitioning additional drives as well as installing all prerequisites. After a reboot it will continuee to run `./bin/init.sh` setting up the full repository and downloading the core Docker images. Depending on your environment this may take up to 30 minutes. The scripts will create a file `DONE` once completed. The installation script will adapt `.profile` to ensure that all settings are applied on login. Otherwise run the activation with `source bin/activate.sh`. 
-For local install it is recommended *not* to run the `bin/prepare.sh` script; it might do more changes than what you want. Rather ensure that all prerequisites are set up and run `bin/init.sh` directly. +**For local install** it is recommended *not* to run the `bin/prepare.sh` script; it might do more changes than what you want. Rather ensure that all prerequisites are set up and run `bin/init.sh` directly. The Init Script takes a few parameters: From d46935c31f802c1e1a94169d0bb0c54a44c5ed95 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Tue, 29 Sep 2020 15:52:56 +0200 Subject: [PATCH 145/428] Update prepare.sh --- bin/prepare.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bin/prepare.sh b/bin/prepare.sh index 7128cf71..9585a33d 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -12,8 +12,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ## Patch system sudo apt-get update && sudo apt-mark hold grub-pc && sudo DEBIAN_FRONTEND=noninteractive apt-get -y -o \ DPkg::options::="--force-confdef" -o DPkg::options::="--force-confold" -qq --force-yes upgrade && \ - sudo apt-get -y install jq - + sudo apt-get install --no-install-recommends -y jq source $DIR/detect.sh echo "Detected cloud type ${CLOUD_NAME}" @@ -64,7 +63,7 @@ then sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list' sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda_learn.list' - sudo bash -c 'apt update && apt install -y nvidia-driver-440 cuda-minimal-build-10-2 -o Dpkg::Options::="--force-overwrite"' + sudo bash -c 'apt update && apt install -y nvidia-driver-440-server cuda-minimal-build-10-2 
--no-install-recommends -o Dpkg::Options::="--force-overwrite"' fi ## Adding AWSCli @@ -73,7 +72,7 @@ sudo apt-get install -y awscli python3-boto3 ## Installing Docker curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" -sudo apt-get update && sudo apt-get install -y docker-ce docker-ce-cli containerd.io +sudo apt-get update && sudo apt-get install -y --no-install-recommends docker-ce docker-ce-cli containerd.io if [[ "${ARCH}" == "gpu" ]]; then @@ -81,7 +80,7 @@ then curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list - sudo apt-get update && sudo apt-get install -y nvidia-docker2 nvidia-container-toolkit nvidia-container-runtime + sudo apt-get update && sudo apt-get install -y --no-install-recommends nvidia-docker2 nvidia-container-toolkit nvidia-container-runtime cat /etc/docker/daemon.json | jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' | sudo tee /etc/docker/daemon.json fi sudo systemctl enable docker From fe33b1e9b12303ab8348308b776c1f63595a742f Mon Sep 17 00:00:00 2001 From: jochem725 Date: Tue, 29 Sep 2020 17:37:46 +0200 Subject: [PATCH 146/428] Fix autorun TRAINING_LOC variable name --- bin/init.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/init.sh b/bin/init.sh index 139bfd6a..c2004cc3 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -180,9 +180,9 @@ then #get bucket name TRAINING_BUCKET=${TRAINING_LOC%%/*} #get prefix. 
minor exception handling in case there is no prefix and a root bucket is passed - if [[ "$TRAININGLOC" == *"/"* ]] + if [[ "$TRAINING_LOC" == *"/"* ]] then - TRAINING_PREFIX=${TRAININGLOC#*/} + TRAINING_PREFIX=${TRAINING_LOC#*/} else TRAINING_PREFIX="" fi From 3232aa7a75da81633f59f933b0c416ed04e4f06f Mon Sep 17 00:00:00 2001 From: jochem725 Date: Tue, 29 Sep 2020 17:42:06 +0200 Subject: [PATCH 147/428] Fix custom files path in spot example --- utils/sample-createspot.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/utils/sample-createspot.sh b/utils/sample-createspot.sh index c4331848..f4867702 100644 --- a/utils/sample-createspot.sh +++ b/utils/sample-createspot.sh @@ -86,8 +86,15 @@ else echo Last training was $CURRENT_RUN_MODEL so next training is $NEW_RUN_MODEL fi +if [[ $PREFIX == "" ]] +then + CUSTOM_FILES_PREFIX="custom_files" +else + CUSTOM_FILES_PREFIX="$PREFIX/custom_files" +fi + ## Replace dynamic parameters in run.env (still local to your directory) -sed -i.bak -re "s:(DR_LOCAL_S3_PRETRAINED_PREFIX=).*$:\1$CURRENT_RUN_MODEL:g; s:(DR_LOCAL_S3_PRETRAINED=).*$:\1$PRETRAINED:g; ; s:(DR_LOCAL_S3_MODEL_PREFIX=).*$:\1$NEW_RUN_MODEL:g" "$CONFIG_FILE" && echo "Done." 
+sed -i.bak -re "s:(DR_LOCAL_S3_PRETRAINED_PREFIX=).*$:\1$CURRENT_RUN_MODEL:g; s:(DR_LOCAL_S3_PRETRAINED=).*$:\1$PRETRAINED:g; s:(DR_LOCAL_S3_MODEL_PREFIX=).*$:\1$NEW_RUN_MODEL:g; s:(DR_LOCAL_S3_CUSTOM_FILES_PREFIX=).*$:\1$CUSTOM_FILES_PREFIX:g" "$CONFIG_FILE" sed -i.bak -re "s/(DR_LOCAL_S3_BUCKET=).*$/\1$BUCKET/g" "$CONFIG_FILE" ## Replace static parameters in run.env (still local to your directory) From 1d44122971af6ce37a82fed540f862a167d96051 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Wed, 30 Sep 2020 08:51:54 +0200 Subject: [PATCH 148/428] Remote Video Streaming (#80) * Allow viewing multiple streams in swarm mode * Move proxy server to separate script * Proper intergration into DRfC * Undo changes to training stop.sh * Altered documentation Co-authored-by: dungviettran89 --- bin/scripts_wrapper.sh | 9 +++ defaults/template-system.env | 1 + docker/docker-compose-webviewer.yml | 16 +++++ docs/reference.md | 5 +- scripts/viewer/start.sh | 90 +++++++++++++++++++++++++++++ scripts/viewer/stop.sh | 12 ++++ 6 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 docker/docker-compose-webviewer.yml create mode 100755 scripts/viewer/start.sh create mode 100755 scripts/viewer/stop.sh diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index 30bcc2cd..d5b1289e 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -303,3 +303,12 @@ function dr-view-stream { ${DIR}/utils/start-local-browser.sh "$@" } +function dr-start-viewer { + dr-update-env + $DIR/scripts/viewer/start.sh "$@" +} + +function dr-stop-viewer { + dr-update-env + $DIR/scripts/viewer/stop.sh "$@" +} \ No newline at end of file diff --git a/defaults/template-system.env b/defaults/template-system.env index 27a92a05..829a0ad3 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -14,4 +14,5 @@ DR_ROBOMAKER_MOUNT_LOGS=False DR_CLOUD_WATCH_ENABLE=False DR_DOCKER_STYLE=swarm DR_HOST_X=False 
+DR_WEBVIEWER_PORT=8100 # CUDA_VISIBLE_DEVICES=0 \ No newline at end of file diff --git a/docker/docker-compose-webviewer.yml b/docker/docker-compose-webviewer.yml new file mode 100644 index 00000000..3aad10b8 --- /dev/null +++ b/docker/docker-compose-webviewer.yml @@ -0,0 +1,16 @@ +version: '3.7' + +networks: + default: + external: true + name: sagemaker-local + +services: + proxy: + image: nginx + ports: + - "${DR_WEBVIEWER_PORT}:80" + volumes: + - ${DR_VIEWER_HTML}/:/usr/share/nginx/html/index.html + - ${DR_NGINX_CONF}:/etc/nginx/conf.d/default.conf + diff --git a/docs/reference.md b/docs/reference.md index db586847..9fd358f4 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -62,7 +62,8 @@ The scripts assume that two files `system.env` containing constant configuration | `DR_ROBOMAKER_MOUNT_LOGS` | TODO.| | `DR_CLOUD_WATCH_ENABLE` | Send log files to AWS CloudWatch.| | `DR_DOCKER_STYLE` | Valid Options are `Swarm` and `Compose`. Use Compose for openGL optimized containers.| -| `DR_HOST_X` | TODO.| +| `DR_HOST_X` | Uses the host X-windows server, rather than starting one inside of Robomaker. Required for OpenGL images.| +| `DR_WEBVIEWER_PORT` | Port for the web-viewer proxy which enables the streaming of all robomaker workers at once.| | `CUDA_VISIBLE_DEVICES` | Used in multi-GPU configurations. See additional documentation for more information about this feature.| @@ -81,6 +82,8 @@ The scripts assume that two files `system.env` containing constant configuration | `dr-stop-evaluation` | Stops the current local evaluation session. 
Uploads log files.| | `dr-start-loganalysis` | Starts a Jupyter log-analysis container, available on port 8888.| | `dr-start-loganalysis` | Stops the Jupyter log-analysis container.| +| `dr-start-viewer` | Starts an NGINX proxy to stream all the robomaker streams; accessible from remote.| +| `dr-stop-viewer` | Stops the NGINX proxy.| | `dr-logs-sagemaker` | Displays the logs from the running Sagemaker container.| | `dr-logs-robomaker` | Displays the logs from the running Robomaker container.| | `dr-list-aws-models` | Lists the models that are currently stored in your AWS DeepRacer S3 bucket. | diff --git a/scripts/viewer/start.sh b/scripts/viewer/start.sh new file mode 100755 index 00000000..20047531 --- /dev/null +++ b/scripts/viewer/start.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash + +usage(){ + echo "Usage: $0 [-t topic] [-w width] [-h height] [-q quality] -b [browser-command]" + echo " -w Width of individual stream." + echo " -h Heigth of individual stream." + echo " -q Quality of the stream image." + echo " -t Topic to follow - default /racecar/deepracer/kvs_stream" + echo " -b Browser command (default: firefox --new-tab)" + exit 1 +} + +trap ctrl_c INT + +function ctrl_c() { + echo "Requested to stop." + exit 1 +} + +# Stream definition +TOPIC="/racecar/deepracer/kvs_stream" +WIDTH=480 +HEIGHT=360 +QUALITY=75 +BROWSER="firefox --new-tab" + +while getopts ":w:h:q:t:b:" opt; do +case $opt in +w) WIDTH="$OPTARG" +;; +h) HEIGHT="$OPTARG" +;; +q) QUALITY="$OPTARG" +;; +t) TOPIC="$OPTARG" +;; +b) BROWSER="$OPTARG" +;; +\?) echo "Invalid option -$OPTARG" >&2 +usage +;; +esac +done + +export DR_VIEWER_HTML=$DR_DIR/tmp/streams-$DR_RUN_ID.html +export DR_NGINX_CONF=$DR_DIR/tmp/streams-$DR_RUN_ID.conf + +cat << EOF > $DR_NGINX_CONF +server { + listen 80; + location / { + root /usr/share/nginx/html; + index index.html index.htm; + } +EOF +echo "DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC

</title></head><body><h1>DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC</h1>

" > $DR_VIEWER_HTML + +ROBOMAKER_CONTAINERS=$(docker ps --format "{{.ID}}" --filter name=deepracer-$DR_RUN_ID --filter "ancestor=awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE") +if [ -z "$ROBOMAKER_CONTAINERS" ]; then + echo "No running robomakers. Exiting." + exit +fi + +for c in $ROBOMAKER_CONTAINERS; do + C_URL="/$c/stream?topic=${TOPIC}&quality=${QUALITY}&width=${WIDTH}&height=${HEIGHT}" + C_IMG="" + echo $C_IMG >> $DR_VIEWER_HTML + echo " location /$c { proxy_pass http://$c:8080; rewrite /$c/(.*) /\$1 break; }" >> $DR_NGINX_CONF +done + +echo "" >> $DR_VIEWER_HTML +echo "}" >> $DR_NGINX_CONF + +# Check if we will use Docker Swarm or Docker Compose +STACK_NAME="deepracer-$DR_RUN_ID-viewer" +COMPOSE_FILES=$DR_DIR/docker/docker-compose-webviewer.yml + +if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; +then + docker stack deploy $COMPOSE_FILES $STACK_NAME +else + docker-compose -f $COMPOSE_FILES -p $STACK_NAME --log-level ERROR up -d +fi + +# Starting browser if using local X and having display defined. +if [[ -n "${DISPLAY}" && "${DR_HOST_X,,}" == "true" ]]; then + echo "Starting browser '$BROWSER'." 
+ $BROWSER "http://127.0.01:8100" & +fi + diff --git a/scripts/viewer/stop.sh b/scripts/viewer/stop.sh new file mode 100755 index 00000000..97e5e218 --- /dev/null +++ b/scripts/viewer/stop.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +STACK_NAME="deepracer-$DR_RUN_ID-viewer" +COMPOSE_FILES=$DR_DIR/docker/docker-compose-webviewer.yml + +# Check if we will use Docker Swarm or Docker Compose +if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; +then + docker stack rm $STACK_NAME +else + docker-compose -f $COMPOSE_FILES -p $STACK_NAME --log-level ERROR down +fi \ No newline at end of file From 5114e6807efcf93654cecb91cacd9cdcd38300c1 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 1 Oct 2020 23:03:53 +0200 Subject: [PATCH 149/428] Upgrading to Robomaker 3.0.3 --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index b9bf90d2..d28d6a78 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "3.0", "containers": { "rl_coach": "v3.0", - "robomaker": "3.0.2", + "robomaker": "3.0.3", "sagemaker": "3.0.1" } } From 4aa9ea789a585c16599004f05852ac920815aebe Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 1 Oct 2020 23:26:21 +0200 Subject: [PATCH 150/428] Fix typo in viewer start.sh --- scripts/viewer/start.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/viewer/start.sh b/scripts/viewer/start.sh index 20047531..7fc7800e 100755 --- a/scripts/viewer/start.sh +++ b/scripts/viewer/start.sh @@ -77,7 +77,7 @@ COMPOSE_FILES=$DR_DIR/docker/docker-compose-webviewer.yml if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then - docker stack deploy $COMPOSE_FILES $STACK_NAME + docker stack deploy -c $COMPOSE_FILES $STACK_NAME else docker-compose -f $COMPOSE_FILES -p $STACK_NAME --log-level ERROR up -d fi From d5a26460b1ec37fc4e0ca4cd19ef6725197c12be Mon Sep 17 00:00:00 2001 From: Lars Lorentz 
Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 3 Oct 2020 11:28:55 +0200 Subject: [PATCH 151/428] Tournament Mode (#73) * Removing unused parameters * Unused parameter * Adding other start script * Further docker changes * Create container dependency file * Fixing multi-config * Tune start-x script * Fix setup-xorg for multi-GPU machine * Remove need for shared memory * Tuning X * Do not track nohup.out * Use label for Robomaker placement * Populate variables for Host X * Fix start-xorg.sh to allow multiple X servers * Expose CUDA_VISIBLE_DEVICES to Evaluation * Add a debugging reward function. * Added version checking * Bumping dependency for images to RC1 * Updated GL documentation * parameter documentation updates * Additional information for upload command Added description on switches and a note for managing models * Update docker.md * Changed version logic. * Preparing release of 3.0 * Adding tmp config files * Initial files * Adding other start script * Adding tmp config files * Functioning version Co-authored-by: Ubuntu Co-authored-by: Wiktor Gancarz --- .gitignore | 1 + bin/scripts_wrapper.sh | 11 ++++ defaults/sample-tournament.json | 24 ++++++++ defaults/template-run.env | 2 + scripts/tournament/prepare-config.py | 90 ++++++++++++++++++++++++++++ scripts/tournament/start.sh | 85 ++++++++++++++++++++++++++ scripts/tournament/stop.sh | 15 +++++ 7 files changed, 228 insertions(+) create mode 100644 defaults/sample-tournament.json create mode 100755 scripts/tournament/prepare-config.py create mode 100755 scripts/tournament/start.sh create mode 100755 scripts/tournament/stop.sh diff --git a/.gitignore b/.gitignore index 9f40e1b9..8b122011 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ recording/ recording /*.env /*.bak +/*.json DONE data/ tmp/ diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index d5b1289e..bf571b59 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -72,6 +72,17 @@ function 
dr-stop-evaluation { ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/evaluation && ./stop.sh" } + +function dr-start-tournament { + dr-update-env + $DIR/scripts/tournament/start.sh "$@" +} + +function dr-stop-tournament { + ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/tournament && ./stop.sh" +} + + function dr-start-loganalysis { ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/log-analysis && ./start.sh" } diff --git a/defaults/sample-tournament.json b/defaults/sample-tournament.json new file mode 100644 index 00000000..d22eeb0c --- /dev/null +++ b/defaults/sample-tournament.json @@ -0,0 +1,24 @@ +{ + "racers": [ + { + "racer_name": "Racer1", + "s3_bucket": "bucket", + "s3_prefix": "Model1" + }, + { + "racer_name": "Racer2", + "s3_bucket": "bucket", + "s3_prefix": "Model2" + }, + { + "racer_name": "Racer3", + "s3_bucket": "bucket", + "s3_prefix": "Model3" + }, + { + "racer_name": "Racer4", + "s3_bucket": "bucket", + "s3_prefix": "Model4" + } + ] +} diff --git a/defaults/template-run.env b/defaults/template-run.env index 62488b89..9655df9b 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -25,6 +25,8 @@ DR_LOCAL_S3_BUCKET=bucket DR_LOCAL_S3_CUSTOM_FILES_PREFIX=custom_files DR_LOCAL_S3_TRAINING_PARAMS_FILE=training-params.yaml DR_LOCAL_S3_EVAL_PARAMS_FILE=eval-params.yaml +DR_LOCAL_S3_TOURNAMENT_PARAMS_FILE=tournament-params.yaml +DR_LOCAL_S3_TOURNAMENT_JSON_FILE=tournament.json DR_LOCAL_S3_MODEL_METADATA_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/model_metadata.json DR_LOCAL_S3_HYPERPARAMETERS_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/hyperparameters.json DR_LOCAL_S3_REWARD_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/reward_function.py diff --git a/scripts/tournament/prepare-config.py b/scripts/tournament/prepare-config.py new file mode 100755 index 00000000..362374a0 --- /dev/null +++ b/scripts/tournament/prepare-config.py @@ -0,0 +1,90 @@ +#!/usr/bin/python3 + +import boto3 +import sys +import os +import time +import json +import io +import yaml + +def 
str2bool(v): + return v.lower() in ("yes", "true", "t", "1") + +config = {} + +# Basic configuration; common for all racers + +tournament_s3_prefix = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'tournament') +tournament_s3_bucket = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') + +config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1') +config['JOB_TYPE'] = 'EVALUATION' +config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy') +config['RACE_TYPE'] = 'HEAD_TO_MODEL' +config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') +config['NUMBER_OF_TRIALS'] = os.environ.get('DR_EVAL_NUMBER_OF_TRIALS', '5') + +is_continous = str2bool(os.environ.get('DR_EVAL_IS_CONTINUOUS', 'False')) +if is_continous: + config['NUMBER_OF_RESETS'] = '10000' + config['IS_CONTINUOUS'] = os.environ.get('DR_EVAL_IS_CONTINUOUS', 'True') + +config['OFF_TRACK_PENALTY'] = os.environ.get('DR_EVAL_OFF_TRACK_PENALTY', '5.0') + +# Tournament bucket for logs, and overall storage +tournament_config = os.environ.get('DR_LOCAL_S3_TOURNAMENT_JSON_FILE', 'tournament.json') +print("Reading in tournament file {}".format(tournament_config)) + +config['RACER_NAME'] = [] +config['DISPLAY_NAME'] = [] +config['MODEL_S3_PREFIX'] = [] +config['MODEL_S3_BUCKET'] = [] +config['SIMTRACE_S3_PREFIX'] = [] +config['SIMTRACE_S3_BUCKET'] = [] +config['KINESIS_VIDEO_STREAM_NAME'] = [] +config['METRICS_S3_BUCKET'] = [] +config['METRICS_S3_PREFIX'] = [] +config['MP4_S3_BUCKET'] = [] +config['MP4_S3_OBJECT_PREFIX'] = [] +config['MODEL_METADATA_FILE_S3_KEY'] = [] + +with open(tournament_config) as tournament_config_json: + tournament_config_data = json.load(tournament_config_json) + for r in tournament_config_data['racers']: + config['RACER_NAME'].append(r['racer_name']) + config['DISPLAY_NAME'].append(r['racer_name']) + config['MODEL_S3_PREFIX'].append(r['s3_prefix']) + config['MODEL_S3_BUCKET'].append(r['s3_bucket']) + 
config['MODEL_METADATA_FILE_S3_KEY'].append("{}/model/model_metadata.json".format(r['s3_prefix'])) + config['KINESIS_VIDEO_STREAM_NAME'].append("None") + config['SIMTRACE_S3_BUCKET'].append(tournament_s3_bucket) + config['SIMTRACE_S3_PREFIX'].append("{}/{}/simtrace/".format(tournament_s3_prefix, r['racer_name'])) + config['MP4_S3_BUCKET'].append(tournament_s3_bucket) + config['MP4_S3_OBJECT_PREFIX'].append("{}/{}/mp4/".format(tournament_s3_prefix, r['racer_name'])) + config['METRICS_S3_BUCKET'].append(tournament_s3_bucket) + config['METRICS_S3_PREFIX'].append("{}/{}/metrics/".format(tournament_s3_prefix, r['racer_name'])) + +# S3 Setup / write and upload file +s3_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None) +s3_region = config['AWS_REGION'] +s3_bucket = tournament_s3_bucket +s3_prefix = tournament_s3_prefix +s3_mode = os.environ.get('DR_LOCAL_S3_AUTH_MODE','profile') +if s3_mode == 'profile': + s3_profile = os.environ.get('DR_LOCAL_S3_PROFILE', 'default') +else: # mode is 'role' + s3_profile = None +s3_yaml_name = os.environ.get('DR_LOCAL_S3_TOURNAMENT_PARAMS_FILE', 'tournament-params.yaml') +yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) + +session = boto3.session.Session(profile_name=s3_profile) +s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_endpoint_url) + +yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) +local_yaml_path = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'tmp', 'tournament-params-' + str(round(time.time())) + '.yaml')) + +with open(local_yaml_path, 'w') as yaml_file: + yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) + +s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) diff --git a/scripts/tournament/start.sh b/scripts/tournament/start.sh new file mode 100755 index 00000000..5e0f8f45 --- /dev/null +++ b/scripts/tournament/start.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash + +source 
$DR_DIR/bin/scripts_wrapper.sh + +usage(){ + echo "Usage: $0 [-q] [-f yaml-file]" + echo " -q Quiet - does not start log tracing." + echo " -f filename Tournament Yaml configuration." + echo " -w Wipe tournament / restart." + exit 1 +} + +trap ctrl_c INT + +function ctrl_c() { + echo "Requested to stop." + exit 1 +} + +while getopts ":wqf:" opt; do +case $opt in +q) OPT_QUIET="QUIET" +;; +f) OPT_YAML_FILE="$OPTARG" +;; +h) usage +;; +w) OPT_WIPE="WIPE" +;; +\?) echo "Invalid option -$OPTARG" >&2 +usage +;; +esac +done + +# set evaluation specific environment variables +S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX" +STACK_NAME="deepracer-eval-$DR_RUN_ID" + +export ROBOMAKER_COMMAND="./run.sh run tournament.launch" +export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_TOURNAMENT_PARAMS_FILE} + +#Check if files are available +S3_FILES=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls ${S3_PATH} | wc -l) +if [[ $S3_FILES > 0 ]]; +then + if [[ -z $OPT_WIPE ]]; + then + echo "Selected path $S3_PATH exists. Continuing execution of tournament." + else + echo "Wiping path $S3_PATH." 
+ aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 rm --recursive ${S3_PATH} + echo "Creating Robomaker configuration in $S3_PATH/$DR_CURRENT_PARAMS_FILE" + python3 $DR_DIR/scripts/tournament/prepare-config.py + fi +else + echo "Creating Robomaker configuration in $S3_PATH/$DR_CURRENT_PARAMS_FILE" + python3 $DR_DIR/scripts/tournament/prepare-config.py +fi + +if [ ${DR_ROBOMAKER_MOUNT_LOGS,,} = "true" ]; +then + COMPOSE_FILES="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-mount.yml" + export DR_MOUNT_DIR="$DR_DIR/data/logs/robomaker/$DR_LOCAL_S3_MODEL_PREFIX" + mkdir -p $DR_MOUNT_DIR +else + COMPOSE_FILES="$DR_EVAL_COMPOSE_FILE" +fi + +# Check if we will use Docker Swarm or Docker Compose +if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; +then + docker stack deploy $COMPOSE_FILES $STACK_NAME +else + docker-compose $COMPOSE_FILES --log-level ERROR -p $STACK_NAME up -d +fi + +# Request to be quiet. Quitting here. +if [ -n "$OPT_QUIET" ]; then + exit 0 +fi + +# Trigger requested log-file +dr-logs-robomaker -w 15 -e + diff --git a/scripts/tournament/stop.sh b/scripts/tournament/stop.sh new file mode 100755 index 00000000..4743dd4a --- /dev/null +++ b/scripts/tournament/stop.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +STACK_NAME="deepracer-eval-$DR_RUN_ID" +RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX} + +# Check if we will use Docker Swarm or Docker Compose +if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; +then + docker stack rm $STACK_NAME +else + COMPOSE_FILES=$(echo ${DR_EVAL_COMPOSE_FILE} | cut -f1-2 -d\ ) + export DR_CURRENT_PARAMS_FILE="" + export ROBOMAKER_COMMAND="" + docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR down +fi \ No newline at end of file From 1907f1dc07ed38443e8c886a00227bd699632d69 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sat, 3 Oct 2020 11:37:43 +0200 Subject: [PATCH 152/428] Expose PRETRAINED_CHECKPOINT to/from Sagemaker --- defaults/template-run.env | 1 + docker/docker-compose-training.yml | 1 + 2 files changed, 
2 insertions(+) diff --git a/defaults/template-run.env b/defaults/template-run.env index 9655df9b..29a59fa5 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -20,6 +20,7 @@ DR_TRAIN_MULTI_CONFIG=False DR_TRAIN_MIN_EVAL_TRIALS=5 DR_LOCAL_S3_PRETRAINED=False DR_LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained +DR_LOCAL_S3_PRETRAINED_CHECKPOINT=last DR_LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker DR_LOCAL_S3_BUCKET=bucket DR_LOCAL_S3_CUSTOM_FILES_PREFIX=custom_files diff --git a/docker/docker-compose-training.yml b/docker/docker-compose-training.yml index 107a12a8..48b174e3 100644 --- a/docker/docker-compose-training.yml +++ b/docker/docker-compose-training.yml @@ -13,6 +13,7 @@ services: - PRETRAINED=${DR_LOCAL_S3_PRETRAINED} - PRETRAINED_S3_PREFIX=${DR_LOCAL_S3_PRETRAINED_PREFIX} - PRETRAINED_S3_BUCKET=${DR_LOCAL_S3_BUCKET} + - PRETRAINED_CHECKPOINT=${DR_LOCAL_S3_PRETRAINED_CHECKPOINT} - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} - MODEL_S3_BUCKET=${DR_LOCAL_S3_BUCKET} - HYPERPARAMETER_FILE_S3_KEY=${DR_LOCAL_S3_HYPERPARAMETERS_KEY} From fea47c47472b0ab7de7c1cfc91e2ea8f0f1142aa Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sat, 3 Oct 2020 11:38:19 +0200 Subject: [PATCH 153/428] Increase version to 3.0.4 --- defaults/dependencies.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index d28d6a78..86354f5b 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,8 +1,8 @@ { "master_version": "3.0", "containers": { - "rl_coach": "v3.0", - "robomaker": "3.0.3", + "rl_coach": "v3.0.4", + "robomaker": "3.0.4", "sagemaker": "3.0.1" } } From 423babe62cbd93424da268a2f3a37a1dac5975c5 Mon Sep 17 00:00:00 2001 From: Jochem Lugtenburg Date: Sun, 4 Oct 2020 10:24:09 +0200 Subject: [PATCH 154/428] Upload reward function for console compatibility (#86) * Upload reward function for console compatibility * Copy reward_function from prefix path * 
Fix reward file path * Fix target reward S3 path * Fix uploader backward compatibility * Fix if-statement Co-authored-by: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> --- scripts/training/prepare-config.py | 8 +++++++- scripts/upload/upload-model.sh | 12 ++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index 9d502784..131f81b3 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -92,7 +92,13 @@ with open(local_yaml_path, 'w') as yaml_file: yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) - +# Copy the reward function to the s3 prefix bucket for compatability with DeepRacer console. +reward_function_key = os.path.normpath(os.path.join(s3_prefix, "reward_function.py")) +copy_source = { + 'Bucket': s3_bucket, + 'Key': config['REWARD_FILE_S3_KEY'] +} +s3_client.copy(copy_source, Bucket=s3_bucket, Key=reward_function_key) # Training with different configurations on each worker (aka Multi Config training) config['MULTI_CONFIG'] = os.environ.get('DR_TRAIN_MULTI_CONFIG', 'False') diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index f3ab1958..46a02bb6 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -74,11 +74,19 @@ WORK_DIR=${DR_DIR}/tmp/upload/ mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}model ${WORK_DIR}ip # Download information on model. 
-TARGET_REWARD_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/reward_function.py" +TARGET_REWARD_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/reward_function.py" TARGET_HYPERPARAM_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ip/hyperparameters.json" # Check if metadata-files are available -REWARD_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_REWARD} ${WORK_DIR} --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) +REWARD_IN_ROOT=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 ls s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/reward_function.py) + +if [ -z "$REWARD_IN_ROOT" ]; +then + REWARD_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/reward_function.py ${WORK_DIR} --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) +else + REWARD_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_REWARD} ${WORK_DIR} --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) +fi + METADATA_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/model_metadata.json ${WORK_DIR} --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f 2> /dev/null) HYPERPARAM_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/ip/hyperparameters.json ${WORK_DIR} --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink -f 2> /dev/null) # METRICS_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_METRICS} ${WORK_DIR} --no-progress | awk '/metric/ {print $4}'| xargs readlink -f 2> /dev/null) From 56684f0de848206dbd6e72dde797a776654be159 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 4 Oct 2020 10:31:49 +0200 Subject: [PATCH 155/428] Changing reward detection --- scripts/upload/upload-model.sh | 6 +++--- 
1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index 46a02bb6..90f09745 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -78,12 +78,12 @@ TARGET_REWARD_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/reward_f TARGET_HYPERPARAM_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ip/hyperparameters.json" # Check if metadata-files are available -REWARD_IN_ROOT=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 ls s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/reward_function.py) - -if [ -z "$REWARD_IN_ROOT" ]; +REWARD_IN_ROOT=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 ls s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/reward_function.py 2> /dev/null | wc -l) +if [ "$REWARD_IN_ROOT" -ne 0 ]; then REWARD_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/reward_function.py ${WORK_DIR} --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) else + echo "Looking for Reward Function in s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_REWARD}" REWARD_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_REWARD} ${WORK_DIR} --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) fi From 435ecb481c3bba696840bfb376793a258590babf Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 4 Oct 2020 11:28:17 +0200 Subject: [PATCH 156/428] Change loganalysis image reference --- scripts/log-analysis/start.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/log-analysis/start.sh b/scripts/log-analysis/start.sh index 269fa424..5e7024c7 100755 --- a/scripts/log-analysis/start.sh +++ b/scripts/log-analysis/start.sh @@ -6,6 +6,6 @@ docker run --rm -d -p "8888:8888" \ -v `pwd`/../../data/analysis:/workspace/analysis \ --name loganalysis \ --network sagemaker-local \ - larsll/deepracer-loganalysis:v2-cpu + 
larsll/deepracer-loganalysis:latest docker logs -f loganalysis \ No newline at end of file From 26b7e7058190ef6412de1019dd277e79fca3b810 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Mon, 5 Oct 2020 11:06:26 +0200 Subject: [PATCH 157/428] Defining Version --- defaults/model_metadata.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/defaults/model_metadata.json b/defaults/model_metadata.json index a3ecc63f..0a74e607 100644 --- a/defaults/model_metadata.json +++ b/defaults/model_metadata.json @@ -22,5 +22,6 @@ } ], "sensor": ["FRONT_FACING_CAMERA"], - "neural_network": "DEEP_CONVOLUTIONAL_NETWORK_SHALLOW" + "neural_network": "DEEP_CONVOLUTIONAL_NETWORK_SHALLOW", + "version": "3" } From 60b70bb2d99befb32a55081c898352fce8686eed Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Mon, 5 Oct 2020 15:37:06 +0000 Subject: [PATCH 158/428] Fixing image version detection --- bin/activate.sh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 61ca3352..5a867dc6 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -153,19 +153,22 @@ fi ## Version check DEPENDENCY_VERSION=$(jq -r '.master_version | select (.!=null)' $DIR/defaults/dependencies.json) -SAGEMAKER_VER=$(docker inspect awsdeepracercommunity/deepracer-sagemaker:$DR_SAGEMAKER_IMAGE | jq -r .[].Config.Labels.version) +SAGEMAKER_VER=$(docker inspect awsdeepracercommunity/deepracer-sagemaker:$DR_SAGEMAKER_IMAGE 2> /dev/null | jq -r .[].Config.Labels.version) +if [ -z "$SAGEMAKER_VER" ]; then SAGEMAKER_VER=$DR_SAGEMAKER_IMAGE; fi if ! verlte $DEPENDENCY_VERSION $SAGEMAKER_VER; then - echo "WARNING: Incompatible version of Deepracer Sagemaker. Expected >$DEPENDENCY_VERSION. Got $SAGEMAKER_VER" + echo "WARNING: Incompatible version of Deepracer Sagemaker. Expected >$DEPENDENCY_VERSION. Got $SAGEMAKER_VER." 
fi -ROBOMAKER_VER=$(docker inspect awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE | jq -r .[].Config.Labels.version) +ROBOMAKER_VER=$(docker inspect awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE 2> /dev/null | jq -r .[].Config.Labels.version ) +if [ -z "$ROBOMAKER_VER" ]; then ROBOMAKER_VER=$DR_ROBOMAKER_IMAGE; fi if ! verlte $DEPENDENCY_VERSION $ROBOMAKER_VER; then - echo "WARNING: Incompatible version of Deepracer Robomaker. Expected >$DEPENDENCY_VERSION. Got $ROBOMAKER_VER" + echo "WARNING: Incompatible version of Deepracer Robomaker. Expected >$DEPENDENCY_VERSION. Got $ROBOMAKER_VER." fi -COACH_VER=$(docker inspect larsll/deepracer-rlcoach:$DR_COACH_IMAGE | jq -r .[].Config.Labels.version) +COACH_VER=$(docker inspect larsll/deepracer-rlcoach:$DR_COACH_IMAGE 2> /dev/null | jq -r .[].Config.Labels.version) +if [ -z "$COACH_VER" ]; then COACH_VER=$DR_COACH_IMAGE; fi if ! verlte $DEPENDENCY_VERSION $COACH_VER; then - echo "WARNING: Incompatible version of Deepracer-for-Cloud Coach. Expected >$DEPENDENCY_VERSION. Got $COACH_VER" + echo "WARNING: Incompatible version of Deepracer-for-Cloud Coach. Expected >$DEPENDENCY_VERSION. Got $COACH_VER." 
fi source $SCRIPT_DIR/scripts_wrapper.sh From edd4613345fed3f75e95395ba7cb94bb6544b463 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Tue, 6 Oct 2020 09:42:53 +0200 Subject: [PATCH 159/428] Fix path typo for Viewer --- docker/docker-compose-webviewer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/docker-compose-webviewer.yml b/docker/docker-compose-webviewer.yml index 3aad10b8..8b148ae8 100644 --- a/docker/docker-compose-webviewer.yml +++ b/docker/docker-compose-webviewer.yml @@ -11,6 +11,6 @@ services: ports: - "${DR_WEBVIEWER_PORT}:80" volumes: - - ${DR_VIEWER_HTML}/:/usr/share/nginx/html/index.html + - ${DR_VIEWER_HTML}:/usr/share/nginx/html/index.html - ${DR_NGINX_CONF}:/etc/nginx/conf.d/default.conf From 1feb5b0890ad7998484985afcb99f68fc9634a0c Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Tue, 13 Oct 2020 19:44:10 +0200 Subject: [PATCH 160/428] Updating viewer --- bin/scripts_wrapper.sh | 7 +++++++ scripts/viewer/start.sh | 7 +++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index bf571b59..faeb1ad0 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -322,4 +322,11 @@ function dr-start-viewer { function dr-stop-viewer { dr-update-env $DIR/scripts/viewer/stop.sh "$@" +} + +function dr-update-viewer { + dr-update-env + $DIR/scripts/viewer/stop.sh "$@" + $DIR/scripts/viewer/start.sh "$@" + } \ No newline at end of file diff --git a/scripts/viewer/start.sh b/scripts/viewer/start.sh index 7fc7800e..754d53ee 100755 --- a/scripts/viewer/start.sh +++ b/scripts/viewer/start.sh @@ -53,8 +53,7 @@ server { index index.html index.htm; } EOF -echo "DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC

DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC

" > $DR_VIEWER_HTML - +echo "DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC
DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC
" > $DR_VIEWER_HTML ROBOMAKER_CONTAINERS=$(docker ps --format "{{.ID}}" --filter name=deepracer-$DR_RUN_ID --filter "ancestor=awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE") if [ -z "$ROBOMAKER_CONTAINERS" ]; then echo "No running robomakers. Exiting." @@ -63,12 +62,12 @@ fi for c in $ROBOMAKER_CONTAINERS; do C_URL="/$c/stream?topic=${TOPIC}&quality=${QUALITY}&width=${WIDTH}&height=${HEIGHT}" - C_IMG="" + C_IMG="
" echo $C_IMG >> $DR_VIEWER_HTML echo " location /$c { proxy_pass http://$c:8080; rewrite /$c/(.*) /\$1 break; }" >> $DR_NGINX_CONF done -echo "" >> $DR_VIEWER_HTML +echo "
" >> $DR_VIEWER_HTML echo "}" >> $DR_NGINX_CONF # Check if we will use Docker Swarm or Docker Compose From 8b55b75803f39eb077f9bf518bcdfa53fb545e89 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Wed, 14 Oct 2020 18:08:08 +0000 Subject: [PATCH 161/428] Change processor detection --- bin/init.sh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/bin/init.sh b/bin/init.sh index c2004cc3..423783fd 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -36,17 +36,12 @@ fi # Find CPU Level CPU_LEVEL="cpu-avx" -if [[ "$(dmesg | grep AVX2 | wc -l)" > 0 ]]; then +if [[ "$(cat /proc/cpuinfo | grep avx2 | wc -l)" > 0 ]]; then CPU_LEVEL="cpu-avx2" fi - -# Disabled due to performance issues with AVX-512 image -# if [[ "$(dmesg | grep AVX-512 | wc -l)" > 0 ]]; then -# CPU_LEVEL="cpu-avx512" -# fi # Check if Intel (to ensure MKN) -if [[ "$(dmesg | grep GenuineIntel | wc -l)" > 0 ]]; then +if [[ "$(cat /proc/cpuinfo | grep GenuineIntel | wc -l)" > 0 ]]; then CPU_INTEL="true" fi From 0ba24d35f733be532192c28d8a95d36a0a5b7618 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Wed, 14 Oct 2020 18:09:47 +0000 Subject: [PATCH 162/428] Start docker if required --- bin/activate.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/activate.sh b/bin/activate.sh index 5a867dc6..c535d4df 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -64,6 +64,9 @@ else return 1 fi +# Check if Docker runs -- if not, then start it. 
+service docker status > /dev/null || sudo service docker start + # Check if we will use Docker Swarm or Docker Compose # If not defined then use Swarm if [[ -z "${DR_DOCKER_STYLE}" ]]; then From 9886764da7e7e0187974bcc798318fb879d823cf Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 17 Oct 2020 15:24:15 +0200 Subject: [PATCH 163/428] Improved detection of CUDA in Docker --- bin/init.sh | 3 ++- utils/Dockerfile.gpu-detect | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 utils/Dockerfile.gpu-detect diff --git a/bin/init.sh b/bin/init.sh index 423783fd..e910c8bd 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -48,7 +48,8 @@ fi # Check GPU if [[ "${OPT_ARCH}" == "gpu" ]] then - GPUS=$(docker run --rm --gpus all nvidia/cuda:10.2-base nvidia-smi "-L" 2> /dev/null | awk '/GPU .:/' | wc -l ) + docker build -t local/gputest - < $INSTALL_DIR/utils/Dockerfile.gpu-detect + GPUS=$(docker run --rm --gpus all local/gputest 2> /dev/null | awk '/Device: ./' | wc -l ) if [ $? -ne 0 ] || [ $GPUS -eq 0 ] then echo "No GPU detected in docker. Using CPU". 
diff --git a/utils/Dockerfile.gpu-detect b/utils/Dockerfile.gpu-detect new file mode 100644 index 00000000..81d78e96 --- /dev/null +++ b/utils/Dockerfile.gpu-detect @@ -0,0 +1,4 @@ +FROM nvidia/cuda:10.2-base +RUN apt-get update && apt-get install -y --no-install-recommends wget python3 +RUN wget https://gist.githubusercontent.com/f0k/63a664160d016a491b2cbea15913d549/raw/f25b6b38932cfa489150966ee899e5cc899bf4a6/cuda_check.py +CMD ["python3","cuda_check.py"] \ No newline at end of file From bf2db32507c6e6f7fcc26439fe8464de7088fba0 Mon Sep 17 00:00:00 2001 From: Ben Tillman Date: Fri, 23 Oct 2020 15:36:26 +1300 Subject: [PATCH 164/428] Remove extra leading slash from docker socket path --- docker/docker-compose-training.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/docker-compose-training.yml b/docker/docker-compose-training.yml index 48b174e3..3b8eb7ed 100644 --- a/docker/docker-compose-training.yml +++ b/docker/docker-compose-training.yml @@ -1,4 +1,4 @@ -version: '3.7' +version: "3.7" networks: default: @@ -19,8 +19,8 @@ services: - HYPERPARAMETER_FILE_S3_KEY=${DR_LOCAL_S3_HYPERPARAMETERS_KEY} - MODELMETADATA_FILE_S3_KEY=${DR_LOCAL_S3_MODEL_METADATA_KEY} volumes: - - '//var/run/docker.sock:/var/run/docker.sock' - - '/tmp/sagemaker:/tmp/sagemaker' + - "/var/run/docker.sock:/var/run/docker.sock" + - "/tmp/sagemaker:/tmp/sagemaker" robomaker: image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE} command: ["${ROBOMAKER_COMMAND}"] @@ -31,11 +31,11 @@ services: - DISPLAY_N=:0 - WORLD_NAME=${DR_WORLD_NAME} - SAGEMAKER_SHARED_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} - - SAGEMAKER_SHARED_S3_BUCKET=${DR_LOCAL_S3_BUCKET} + - SAGEMAKER_SHARED_S3_BUCKET=${DR_LOCAL_S3_BUCKET} - APP_REGION=${DR_AWS_APP_REGION} - S3_YAML_NAME=${DR_CURRENT_PARAMS_FILE} - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} - ENABLE_GUI=${DR_GUI_ENABLE} - CUDA_VISIBLE_DEVICES - - MULTI_CONFIG \ No 
newline at end of file + - MULTI_CONFIG From c6f3ba2ece5ff2a87805251ef66c176edd88dc62 Mon Sep 17 00:00:00 2001 From: Ben Tillman Date: Fri, 23 Oct 2020 15:59:38 +1300 Subject: [PATCH 165/428] Use sysctl to determine CPU capabilities on macOS --- bin/init.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bin/init.sh b/bin/init.sh index e910c8bd..8e1b8414 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -36,12 +36,16 @@ fi # Find CPU Level CPU_LEVEL="cpu-avx" -if [[ "$(cat /proc/cpuinfo | grep avx2 | wc -l)" > 0 ]]; then +if [[ -f /proc/cpuinfo ]] && [[ "$(cat /proc/cpuinfo | grep avx2 | wc -l)" > 0 ]]; then + CPU_LEVEL="cpu-avx2" +elif [[ "$(type sysctl 2> /dev/null)" ]] && [[ "$(sysctl -n hw.optional.avx2_0)" == 1 ]]; then CPU_LEVEL="cpu-avx2" fi - + # Check if Intel (to ensure MKN) -if [[ "$(cat /proc/cpuinfo | grep GenuineIntel | wc -l)" > 0 ]]; then +if [[ -f /proc/cpuinfo ]] && [[ "$(cat /proc/cpuinfo | grep GenuineIntel | wc -l)" > 0 ]]; then + CPU_INTEL="true" +elif [[ "$(type sysctl 2> /dev/null)" ]] && [[ "$(sysctl -n machdep.cpu.vendor)" == "GenuineIntel" ]]; then CPU_INTEL="true" fi From 65d4c5445d3896dd0a5e8360d521a18a003adc6f Mon Sep 17 00:00:00 2001 From: Ben Tillman Date: Fri, 23 Oct 2020 16:05:26 +1300 Subject: [PATCH 166/428] Skip docker server startup if service is not available --- bin/activate.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/activate.sh b/bin/activate.sh index c535d4df..417ecfca 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -65,7 +65,9 @@ else fi # Check if Docker runs -- if not, then start it. 
-service docker status > /dev/null || sudo service docker start +if [[ "$(type service 2> /dev/null)" ]]; then + service docker status > /dev/null || sudo service docker start +fi # Check if we will use Docker Swarm or Docker Compose # If not defined then use Swarm From fcc410a6c04518472343ffa3d9a4120d0a9c1989 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Fri, 23 Oct 2020 21:57:01 +0200 Subject: [PATCH 167/428] Enable all-time for dr-logs-sagemaker --- bin/scripts_wrapper.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index faeb1ad0..4c135a6e 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -102,7 +102,7 @@ function dr-logs-sagemaker { local OPTIND OPT_TIME="--since 5m" - while getopts ":w:" opt; do + while getopts ":w:a" opt; do case $opt in w) OPT_WAIT=$OPTARG ;; @@ -329,4 +329,4 @@ function dr-update-viewer { $DIR/scripts/viewer/stop.sh "$@" $DIR/scripts/viewer/start.sh "$@" -} \ No newline at end of file +} From 23af7ccbd1a179c3e2927d298ae48c2b50470920 Mon Sep 17 00:00:00 2001 From: Ben Tillman Date: Tue, 3 Nov 2020 15:34:41 +1300 Subject: [PATCH 168/428] Use square bracket syntax for integer comparison --- scripts/training/start.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 4d4db907..303e546c 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -57,8 +57,8 @@ fi S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX" S3_FILES=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls ${S3_PATH} | wc -l) -if [[ $S3_FILES > 0 ]]; -then +if [[ "$S3_FILES" -gt 0 ]]; +then if [[ -z $OPT_WIPE ]]; then echo "Selected path $S3_PATH exists. Delete it, or use -w option. Exiting." 
From ed7c06c762226fad6a9c4d67379d170b5ed8ed3e Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sun, 8 Nov 2020 12:41:13 +0100 Subject: [PATCH 169/428] Fixing startup of browser with swarm --- scripts/viewer/start.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/viewer/start.sh b/scripts/viewer/start.sh index 754d53ee..e45a50d5 100755 --- a/scripts/viewer/start.sh +++ b/scripts/viewer/start.sh @@ -84,6 +84,10 @@ fi # Starting browser if using local X and having display defined. if [[ -n "${DISPLAY}" && "${DR_HOST_X,,}" == "true" ]]; then echo "Starting browser '$BROWSER'." + if [ "${DR_DOCKER_STYLE,,}" == "swarm" ]; + then + sleep 5 + fi $BROWSER "http://127.0.01:8100" & fi From ee9dd095e31505a7cc9c334430c82853c6c8b970 Mon Sep 17 00:00:00 2001 From: Ben Tillman Date: Tue, 3 Nov 2020 15:40:54 +1300 Subject: [PATCH 170/428] Set group writable on sagemaker temp folder --- bin/init.sh | 1 + scripts/training/start.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/bin/init.sh b/bin/init.sh index 8e1b8414..bc5e58ca 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -67,6 +67,7 @@ cd $INSTALL_DIR mkdir -p $INSTALL_DIR/data $INSTALL_DIR/data/minio $INSTALL_DIR/data/minio/bucket mkdir -p $INSTALL_DIR/data/logs $INSTALL_DIR/data/analysis $INSTALL_DIR/tmp sudo mkdir -p /tmp/sagemaker +sudo chmod -R g+w /tmp/sagemaker # create symlink to current user's home .aws directory # NOTE: AWS cli must be installed for this to work diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 303e546c..a81b691e 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -51,6 +51,7 @@ done # Ensure Sagemaker's folder is there if [ ! 
-d /tmp/sagemaker ]; then sudo mkdir -p /tmp/sagemaker + sudo chmod -R g+w /tmp/sagemaker fi #Check if files are available From 3de1724ba9b15af44171f5219ab366f8c1af8a48 Mon Sep 17 00:00:00 2001 From: TheRayG <58921678+TheRayG@users.noreply.github.com> Date: Wed, 18 Nov 2020 12:53:01 +0800 Subject: [PATCH 171/428] Update README.md Corrected the URL to the DeepRacer Build repo in the README.md file --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dfff9ec5..b7cf57f2 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Provides a quick and easy way to get up and running with a DeepRacer training en ## Introduction -DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://gitbub.com/larsll/deepracer-build) repository. +DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://github.com/larsll/deepracer-build) repository. 
Main differences to the work done by Alex is: * Runtime S3 storage is setup to fit the connected cloud platform: From b17816857e42a25dae5fddda5e10dc98292fd5c8 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 19 Nov 2020 19:18:59 +0000 Subject: [PATCH 172/428] Make 3.0.5 Robomaker default --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 86354f5b..f277c019 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "3.0", "containers": { "rl_coach": "v3.0.4", - "robomaker": "3.0.4", + "robomaker": "3.0.5", "sagemaker": "3.0.1" } } From e21fafad2e49effc43527bbf7299fdcd2484afb2 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 21 Nov 2020 23:28:28 +0100 Subject: [PATCH 173/428] Fix robomaker detection --- scripts/viewer/start.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/viewer/start.sh b/scripts/viewer/start.sh index e45a50d5..f55dd71d 100755 --- a/scripts/viewer/start.sh +++ b/scripts/viewer/start.sh @@ -54,7 +54,8 @@ server { } EOF echo "DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC
DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC
" > $DR_VIEWER_HTML -ROBOMAKER_CONTAINERS=$(docker ps --format "{{.ID}}" --filter name=deepracer-$DR_RUN_ID --filter "ancestor=awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE") +ROBOMAKER_CONTAINERS=$(docker ps --format "{{.ID}} {{.Names}}" --filter name="deepracer-${DR_RUN_ID}" | grep robomaker | cut -f1 -d\ ) + if [ -z "$ROBOMAKER_CONTAINERS" ]; then echo "No running robomakers. Exiting." exit From 5efd87f73c738eb5157d154201c9e45a6fba117d Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 22 Nov 2020 09:35:23 +0000 Subject: [PATCH 174/428] Initial version to allow minio outside docker --- bin/activate.sh | 6 ++++++ bin/init.sh | 6 ++++++ defaults/template-system.env | 1 + 3 files changed, 13 insertions(+) diff --git a/bin/activate.sh b/bin/activate.sh index 417ecfca..30bb1c93 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -99,6 +99,12 @@ then DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" DR_MINIO_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local.yml" +elif [[ "${DR_CLOUD,,}" == "remote" ]]; +then + export DR_LOCAL_S3_ENDPOINT_URL="$DR_REMOTE_MINIO_URL" + DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" + DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml" + DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml" else DR_LOCAL_PROFILE_ENDPOINT_URL="" DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml" diff --git a/bin/init.sh b/bin/init.sh index bc5e58ca..45c3797e 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -93,6 +93,12 @@ elif [[ "${OPT_CLOUD}" == "azure" ]]; then sed -i "s//azure/g" $INSTALL_DIR/system.env sed -i "s//not-defined/g" 
$INSTALL_DIR/system.env echo "Please run 'aws configure --profile azure' to set the credentials" +elif [[ "${OPT_CLOUD}" == "remote" ]]; then + AWS_REGION="us-east-1" + sed -i "s//minio/g" $INSTALL_DIR/system.env + sed -i "s//not-defined/g" $INSTALL_DIR/system.env + echo "Please run 'aws configure --profile minio' to set the credentials" + echo "Please define DR_REMOTE_MINIO_URL in system.env to point to remote minio instance." else AWS_REGION="us-east-1" sed -i "s//minio/g" $INSTALL_DIR/system.env diff --git a/defaults/template-system.env b/defaults/template-system.env index 829a0ad3..f8c6a97d 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -15,4 +15,5 @@ DR_CLOUD_WATCH_ENABLE=False DR_DOCKER_STYLE=swarm DR_HOST_X=False DR_WEBVIEWER_PORT=8100 +# DR_REMOTE_MINIO_URL=http://mynas:9000 # CUDA_VISIBLE_DEVICES=0 \ No newline at end of file From a51112a7c5ebdd4ca5de7bc63f5b903a1da3e459 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sun, 22 Nov 2020 11:04:37 +0100 Subject: [PATCH 175/428] Basic tournament instructions --- docs/index.md | 1 + docs/tournament.md | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 docs/tournament.md diff --git a/docs/index.md b/docs/index.md index 9da85e8c..01ac6fb4 100644 --- a/docs/index.md +++ b/docs/index.md @@ -50,6 +50,7 @@ DRfC supports a wide set of features to ensure that you can focus on creating th * [GPU Accelerated OpenGL for Robomaker](opengl.md) * [Having multiple GPUs in one Computer](multi_gpu.md) * [Installing on Windows](windows.md) +* [Run a Head-to-Head Tournament](tournament.md) # Support diff --git a/docs/tournament.md b/docs/tournament.md new file mode 100644 index 00000000..762d9892 --- /dev/null +++ b/docs/tournament.md @@ -0,0 +1,36 @@ +# Head-to-Head Tournament (Beta) + +It is possible to run a head-to-head tournament, similar to the elimination brackets +run by AWS in the Virtual Circuits to determine the winner of the head-to-bot 
races.
+
+## Introduction
+
+The concept for tournament is that you have a set of models, each in their own path
+(S3 bucket + prefix). Additionally you define one prefix where all the outcomes will be stored.
+
+Each race in the tournament will require you to start and stop the tournament execution; the code will update the outcome prefix with the current status.
+
+## Configuration
+
+### run.env
+
+Configure `run.env` with the following parameters:
+* `DR_LOCAL_S3_MODEL_PREFIX` will be the path where all the outcomes are stored.
+* `DR_LOCAL_S3_TOURNAMENT_JSON_FILE` is the local filesystem path to your tournament configuration
+* `DR_LOCAL_S3_TOURNAMENT_PARAMS_FILE` is the path where the generated tournament parameters are uploaded
+  in S3. Can be left unchanged in most cases.
+* `DR_EVAL_NUMBER_OF_TRIALS`, `DR_EVAL_IS_CONTINUOUS`, `DR_EVAL_OFF_TRACK_PENALTY`,
+  `DR_EVAL_COLLISION_PENALTY` and `DR_EVAL_SAVE_MP4` to be configured as a normal evaluation run.
+
+
+### tournament.json
+
+Create a `tournament.json` based on `defaults/sample-tournament.json`. You will have one entry per model.
+Required configuration per racer is:
+* `racer_name`: The display name of the racer
+* `s3_bucket`: The S3 bucket where the model for this racer is stored
+* `s3_prefix`: The S3 prefix where the model for this racer is stored.
+
+## Run
+
+Run the tournament with `dr-start-tournament`; one race will be run. Once completed you need to do `dr-stop-tournament` and `dr-start-tournament` to make it run the next race. Iterate until done. 
\ No newline at end of file From d5577c7d17bca795e215b00b8832366796ec1479 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 22 Nov 2020 13:56:18 +0000 Subject: [PATCH 176/428] Fix evaluation in swarm --- bin/activate.sh | 1 + docker/docker-compose-eval-swarm.yml | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 docker/docker-compose-eval-swarm.yml diff --git a/bin/activate.sh b/bin/activate.sh index 417ecfca..52d046fa 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -116,6 +116,7 @@ fi if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training-swarm.yml" + DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval-swarm.yml" fi # Enable logs in CloudWatch diff --git a/docker/docker-compose-eval-swarm.yml b/docker/docker-compose-eval-swarm.yml new file mode 100644 index 00000000..753dd99b --- /dev/null +++ b/docker/docker-compose-eval-swarm.yml @@ -0,0 +1,18 @@ +version: '3.7' + +services: + rl_coach: + deploy: + restart_policy: + condition: none + placement: + constraints: [node.labels.Sagemaker == true ] + robomaker: + deploy: + restart_policy: + condition: none + replicas: 1 + placement: + constraints: [node.labels.Robomaker == true ] + environment: + - DOCKER_REPLICA_SLOT={{.Task.Slot}} \ No newline at end of file From 4ffe4f0dad9d0c7000aa7474f70201916ef25e85 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sun, 22 Nov 2020 20:37:46 +0100 Subject: [PATCH 177/428] Fixing viewer for stack --- scripts/viewer/start.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/viewer/start.sh b/scripts/viewer/start.sh index f55dd71d..2e054950 100755 --- a/scripts/viewer/start.sh +++ b/scripts/viewer/start.sh @@ -54,13 +54,23 @@ server { } EOF echo "DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC
DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC
" > $DR_VIEWER_HTML -ROBOMAKER_CONTAINERS=$(docker ps --format "{{.ID}} {{.Names}}" --filter name="deepracer-${DR_RUN_ID}" | grep robomaker | cut -f1 -d\ ) + +if [[ "${DR_DOCKER_STYLE,,}" != "swarm" ]]; then + ROBOMAKER_CONTAINERS=$(docker ps --format "{{.ID}} {{.Names}}" --filter name="deepracer-${DR_RUN_ID}" | grep robomaker | cut -f1 -d\ ) +else + ROBOMAKER_SERVICE_REPLICAS=$(docker service ps deepracer-${DR_RUN_ID}_robomaker | awk '/robomaker/ { print $1 }') + for c in $ROBOMAKER_SERVICE_REPLICAS; do + ROBOMAKER_CONTAINER_IP=$(docker inspect $c | jq -r '.[].NetworksAttachments[] | select (.Network.Spec.Name == "sagemaker-local") | .Addresses[0] ' | cut -f1 -d/) + ROBOMAKER_CONTAINERS="${ROBOMAKER_CONTAINERS} ${ROBOMAKER_CONTAINER_IP}" + done +fi if [ -z "$ROBOMAKER_CONTAINERS" ]; then echo "No running robomakers. Exiting." exit fi + for c in $ROBOMAKER_CONTAINERS; do C_URL="/$c/stream?topic=${TOPIC}&quality=${QUALITY}&width=${WIDTH}&height=${HEIGHT}" C_IMG="
" From 454cd758b890a4768d1d3e422c00126e6b4f0d85 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sun, 22 Nov 2020 20:59:16 +0100 Subject: [PATCH 178/428] Ensure placement of viewer being central node --- docker/docker-compose-webviewer-swarm.yml | 15 +++++++++++++++ scripts/viewer/start.sh | 1 + 2 files changed, 16 insertions(+) create mode 100644 docker/docker-compose-webviewer-swarm.yml diff --git a/docker/docker-compose-webviewer-swarm.yml b/docker/docker-compose-webviewer-swarm.yml new file mode 100644 index 00000000..bec31188 --- /dev/null +++ b/docker/docker-compose-webviewer-swarm.yml @@ -0,0 +1,15 @@ +version: '3.7' + +networks: + default: + external: true + name: sagemaker-local + +services: + proxy: + deploy: + restart_policy: + condition: none + replicas: 1 + placement: + constraints: [node.labels.Sagemaker == true ] diff --git a/scripts/viewer/start.sh b/scripts/viewer/start.sh index 2e054950..11872988 100755 --- a/scripts/viewer/start.sh +++ b/scripts/viewer/start.sh @@ -87,6 +87,7 @@ COMPOSE_FILES=$DR_DIR/docker/docker-compose-webviewer.yml if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then + COMPOSE_FILES="$COMPOSE_FILES -c $DR_DIR/docker/docker-compose-webviewer-swarm.yml" docker stack deploy -c $COMPOSE_FILES $STACK_NAME else docker-compose -f $COMPOSE_FILES -p $STACK_NAME --log-level ERROR up -d From 531ce56df43ab7760f1b0a2c77e0cc9ca03342cc Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Mon, 23 Nov 2020 08:54:11 +0100 Subject: [PATCH 179/428] Download a model from S3 into local bucket (#92) * Initial download code * Typo in script --- bin/scripts_wrapper.sh | 4 ++ docs/reference.md | 1 + scripts/upload/download-model.sh | 106 +++++++++++++++++++++++++++++++ scripts/upload/upload-model.sh | 2 +- 4 files changed, 112 insertions(+), 1 deletion(-) create mode 100755 scripts/upload/download-model.sh diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index 4c135a6e..505c318e 100644 --- 
a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -19,6 +19,10 @@ function dr-upload-model { dr-update-env && ${DIR}/scripts/upload/upload-model.sh "$@" } +function dr-download-model { + dr-update-env && ${DIR}/scripts/upload/download-model.sh "$@" +} + function dr-upload-car-zip { dr-update-env && ${DIR}/scripts/upload/upload-car.sh "$@" } diff --git a/docs/reference.md b/docs/reference.md index 9fd358f4..ce72689c 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -89,3 +89,4 @@ The scripts assume that two files `system.env` containing constant configuration | `dr-list-aws-models` | Lists the models that are currently stored in your AWS DeepRacer S3 bucket. | | `dr-set-upload-model` | Updates the `run.env` with the prefix and name of your selected model. | | `dr-upload-model` | Uploads the model defined in `DR_LOCAL_S3_MODEL_PREFIX` to the AWS DeepRacer S3 prefix defined in `DR_UPLOAD_S3_PREFIX` | +| `dr-download-model` | Downloads a file from a 'real' S3 location into a local prefix of choice. | diff --git a/scripts/upload/download-model.sh b/scripts/upload/download-model.sh new file mode 100755 index 00000000..2677ec9d --- /dev/null +++ b/scripts/upload/download-model.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +usage(){ + echo "Usage: $0 [-f] [-w] [-d] -s -t &2 +usage +;; +esac +done + +if [[ -n "${OPT_DRYRUN}" ]]; +then + echo "*** DRYRUN MODE ***" +fi + +SOURCE_S3_URL=${OPT_SOURCE} + +if [[ -z "${SOURCE_S3_URL}" ]]; +then + echo "No source URL to download model from." + exit 1 +fi + +TARGET_S3_BUCKET=${DR_LOCAL_S3_BUCKET} +TARGET_S3_PREFIX=${OPT_TARGET} +if [[ -z "${TARGET_S3_PREFIX}" ]]; +then + echo "No target prefix defined. Exiting." 
+ exit 1 +fi + +SOURCE_REWARD_FILE_S3_KEY="${SOURCE_S3_URL}/reward_function.py" +SOURCE_HYPERPARAM_FILE_S3_KEY="${SOURCE_S3_URL}/ip/hyperparameters.json" +SOURCE_METADATA_S3_KEY="${SOURCE_S3_URL}/model/model_metadata.json" + +WORK_DIR=${DR_DIR}/tmp/download +mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}/config ${WORK_DIR}/full + +# Check if metadata-files are available +REWARD_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp ${SOURCE_REWARD_FILE_S3_KEY} ${WORK_DIR}/config/ --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) +METADATA_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp ${SOURCE_METADATA_S3_KEY} ${WORK_DIR}/config/ --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f 2> /dev/null) +HYPERPARAM_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp ${SOURCE_HYPERPARAM_FILE_S3_KEY} ${WORK_DIR}/config/ --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink -f 2> /dev/null) + +if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$HYPERPARAM_FILE" ]; +then + echo "All meta-data files found. Source model ${SOURCE_S3_URL} valid." +else + echo "Meta-data files are not found. Source model ${SOURCE_S3_URL} not valid. Exiting." + exit 1 +fi + +# Upload files +if [[ -z "${OPT_FORCE}" ]]; +then + echo "Ready to download model ${SOURCE_S3_URL} to local ${TARGET_S3_PREFIX}" + read -r -p "Are you sure? [y/N] " response + if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] + then + echo "Aborting." + exit 1 + fi +fi + +cd ${WORK_DIR} +aws ${DR_UPLOAD_PROFILE} s3 sync ${SOURCE_S3_URL} ${WORK_DIR}/full/ ${OPT_DRYRUN} +aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 sync ${WORK_DIR}/full/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ ${OPT_DRYRUN} ${OPT_WIPE} + +if [[ -n "${OPT_CONFIG}" ]]; +then + echo "Copy configuration to custom_files" + cp ${WORK_DIR}/config/* ${DR_DIR}/custom_files/ +fi + +echo "Done." 
diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index 90f09745..baf6a480 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -1,7 +1,7 @@ #!/bin/bash usage(){ - echo "Usage: $0 [-f] [-w] [-d] [-c ] [-p ]" + echo "Usage: $0 [-f] [-w] [-d] [-b] [-c ] [-p ]" echo " -f Force upload. No confirmation question." echo " -w Wipes the target AWS DeepRacer model structure before upload." echo " -d Dry-Run mode. Does not perform any write or delete operatios on target." From 472239e6f6598586b000975bdf2cd691982199df Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 19 Dec 2020 11:50:52 +0100 Subject: [PATCH 180/428] Do not auto-init on login if local --- bin/init.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/bin/init.sh b/bin/init.sh index bc5e58ca..ebf8e597 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -161,10 +161,12 @@ else docker network create $SAGEMAKER_NW -d overlay --attachable --scope swarm fi -# ensure our variables are set on startup -NUM_IN_PROFILE=$(cat $HOME/.profile | grep "$INSTALL_DIR/bin/activate.sh" | wc -l) -if [ "$NUM_IN_PROFILE" -eq 0 ]; then - echo "source $INSTALL_DIR/bin/activate.sh" >> $HOME/.profile +# ensure our variables are set on startup - not for local setup. 
+if [[ "${OPT_CLOUD}" != "local" ]]; then + NUM_IN_PROFILE=$(cat $HOME/.profile | grep "$INSTALL_DIR/bin/activate.sh" | wc -l) + if [ "$NUM_IN_PROFILE" -eq 0 ]; then + echo "source $INSTALL_DIR/bin/activate.sh" >> $HOME/.profile + fi fi # mark as done From 74ddc2d8d899573d8904a379efea4933e599da80 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Mon, 28 Dec 2020 21:53:10 +0100 Subject: [PATCH 181/428] Separate port for Evaluation Robomakers --- bin/activate.sh | 6 ++++-- docker/docker-compose-eval.yml | 2 +- docker/docker-compose-training.yml | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 52d046fa..194dc7d1 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -38,10 +38,12 @@ function dr-update-env { if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then - export DR_ROBOMAKER_PORT=$(expr 8080 + $DR_RUN_ID) + export DR_ROBOMAKER_TRAIN_PORT=$(expr 8080 + $DR_RUN_ID) + export DR_ROBOMAKER_EVAL_PORT=$(expr 8180 + $DR_RUN_ID) export DR_ROBOMAKER_GUI_PORT=$(expr 5900 + $DR_RUN_ID) else - export DR_ROBOMAKER_PORT="8080-8100" + export DR_ROBOMAKER_TRAIN_PORT="8080-8100" + export DR_ROBOMAKER_EVAL_PORT="8080-8100" export DR_ROBOMAKER_GUI_PORT="5901-5920" fi diff --git a/docker/docker-compose-eval.yml b/docker/docker-compose-eval.yml index 9bbe4fbe..1ef8fa8d 100644 --- a/docker/docker-compose-eval.yml +++ b/docker/docker-compose-eval.yml @@ -13,7 +13,7 @@ services: image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE} command: ["${ROBOMAKER_COMMAND}"] ports: - - "${DR_ROBOMAKER_PORT}:8080" + - "${DR_ROBOMAKER_EVAL_PORT}:8080" environment: - DISPLAY_N=:0 - CUDA_VISIBLE_DEVICES diff --git a/docker/docker-compose-training.yml b/docker/docker-compose-training.yml index 3b8eb7ed..91f5b9b7 100644 --- a/docker/docker-compose-training.yml +++ b/docker/docker-compose-training.yml @@ -25,7 +25,7 @@ services: image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE} command: 
["${ROBOMAKER_COMMAND}"] ports: - - "${DR_ROBOMAKER_PORT}:8080" + - "${DR_ROBOMAKER_TRAIN_PORT}:8080" - "${DR_ROBOMAKER_GUI_PORT}:5900" environment: - DISPLAY_N=:0 From 38fdf58ea6adf77f661f77e7c198a818b52500c2 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Mon, 28 Dec 2020 22:06:03 +0100 Subject: [PATCH 182/428] Version bump for dev branch --- defaults/dependencies.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index f277c019..b1b696e5 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,8 +1,8 @@ { - "master_version": "3.0", + "master_version": "3.1", "containers": { - "rl_coach": "v3.0.4", - "robomaker": "3.0.5", - "sagemaker": "3.0.1" + "rl_coach": "v3.1.0-dev", + "robomaker": "3.1.0-dev", + "sagemaker": "3.0.2-dev" } } From d66182017bec6b45552ad034693f4cd9e5ae6246 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Mon, 28 Dec 2020 22:21:57 +0100 Subject: [PATCH 183/428] New Metadata and Hyper Parameters --- defaults/hyperparameters.json | 3 ++- defaults/model_metadata.json | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/defaults/hyperparameters.json b/defaults/hyperparameters.json index 3ec50a39..4c317fb9 100644 --- a/defaults/hyperparameters.json +++ b/defaults/hyperparameters.json @@ -11,5 +11,6 @@ "num_epochs": 10, "stack_size": 1, "term_cond_avg_score": 350.0, - "term_cond_max_episodes": 1000 + "term_cond_max_episodes": 1000, + "sac_alpha": 0.2 } \ No newline at end of file diff --git a/defaults/model_metadata.json b/defaults/model_metadata.json index 0a74e607..023866f9 100644 --- a/defaults/model_metadata.json +++ b/defaults/model_metadata.json @@ -23,5 +23,7 @@ ], "sensor": ["FRONT_FACING_CAMERA"], "neural_network": "DEEP_CONVOLUTIONAL_NETWORK_SHALLOW", + "training_algorithm": "clipped_ppo", + "action_space_type": "discrete", "version": "3" } From 13e93d860d2affcf4beede921eb4008615b97967 Mon Sep 17 00:00:00 2001 From: 
Lars Ludvigsen Date: Tue, 29 Dec 2020 16:41:42 +0100 Subject: [PATCH 184/428] Updated dependencies --- defaults/dependencies.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index f277c019..95ca5fb1 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,8 +1,8 @@ { "master_version": "3.0", "containers": { - "rl_coach": "v3.0.4", - "robomaker": "3.0.5", + "rl_coach": "v3.0.5", + "robomaker": "3.0.6", "sagemaker": "3.0.1" } } From 4c123c05a1fecb903cfda8c9af603d0193e10c91 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Tue, 29 Dec 2020 18:49:55 +0100 Subject: [PATCH 185/428] Tuning default configuration files --- defaults/template-run.env | 13 ++++++------- defaults/template-system.env | 1 + 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/defaults/template-run.env b/defaults/template-run.env index 29a59fa5..ecc81c7e 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -6,9 +6,8 @@ DR_CAR_NAME=FastCar DR_DISPLAY_NAME=$DR_CAR_NAME DR_RACER_NAME=racer1 DR_ENABLE_DOMAIN_RANDOMIZATION=False -DR_UPLOAD_S3_PREFIX=DeepRacer-SageMaker-RoboMaker-comm-prefix -DR_EVAL_NUMBER_OF_TRIALS=5 -DR_EVAL_IS_CONTINUOUS=False +DR_EVAL_NUMBER_OF_TRIALS=3 +DR_EVAL_IS_CONTINUOUS=True DR_EVAL_OFF_TRACK_PENALTY=5.0 DR_EVAL_COLLISION_PENALTY=5.0 DR_EVAL_SAVE_MP4=False @@ -18,20 +17,20 @@ DR_TRAIN_START_POSITION_OFFSET=0.0 DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05 DR_TRAIN_MULTI_CONFIG=False DR_TRAIN_MIN_EVAL_TRIALS=5 +DR_LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker DR_LOCAL_S3_PRETRAINED=False DR_LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained DR_LOCAL_S3_PRETRAINED_CHECKPOINT=last -DR_LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker -DR_LOCAL_S3_BUCKET=bucket DR_LOCAL_S3_CUSTOM_FILES_PREFIX=custom_files -DR_LOCAL_S3_TRAINING_PARAMS_FILE=training-params.yaml -DR_LOCAL_S3_EVAL_PARAMS_FILE=eval-params.yaml +DR_LOCAL_S3_TRAINING_PARAMS_FILE=training_params.yaml 
+DR_LOCAL_S3_EVAL_PARAMS_FILE=evaluation_params.yaml DR_LOCAL_S3_TOURNAMENT_PARAMS_FILE=tournament-params.yaml DR_LOCAL_S3_TOURNAMENT_JSON_FILE=tournament.json DR_LOCAL_S3_MODEL_METADATA_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/model_metadata.json DR_LOCAL_S3_HYPERPARAMETERS_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/hyperparameters.json DR_LOCAL_S3_REWARD_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/reward_function.py DR_LOCAL_S3_METRICS_PREFIX=$DR_LOCAL_S3_MODEL_PREFIX/metrics +DR_UPLOAD_S3_PREFIX=$DR_LOCAL_S3_MODEL_PREFIX DR_OA_NUMBER_OF_OBSTACLES=6 DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES=2.0 DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS=False diff --git a/defaults/template-system.env b/defaults/template-system.env index f8c6a97d..2935c781 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -2,6 +2,7 @@ DR_CLOUD= DR_AWS_APP_REGION= DR_UPLOAD_S3_PROFILE=default DR_UPLOAD_S3_BUCKET= +DR_LOCAL_S3_BUCKET=bucket DR_LOCAL_S3_PROFILE= DR_GUI_ENABLE=False DR_KINESIS_STREAM_NAME=None From b3e946229c34961df64d2c30f5074cae7b6853c3 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Tue, 29 Dec 2020 20:09:52 +0100 Subject: [PATCH 186/428] Improved upload - metrics and more --- scripts/upload/prepare-config.py | 65 ++++++++++++++++++++++++++++++++ scripts/upload/upload-model.sh | 20 ++++++---- 2 files changed, 78 insertions(+), 7 deletions(-) create mode 100755 scripts/upload/prepare-config.py diff --git a/scripts/upload/prepare-config.py b/scripts/upload/prepare-config.py new file mode 100755 index 00000000..80f672a6 --- /dev/null +++ b/scripts/upload/prepare-config.py @@ -0,0 +1,65 @@ +#!/usr/bin/python3 + +import boto3 +import sys +import os +import time +import json +import io +import yaml + +config = {} +config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1') +config['JOB_TYPE'] = 'TRAINING' +config['METRICS_S3_BUCKET'] = os.environ.get('TARGET_S3_BUCKET', 'bucket') +config['METRICS_S3_OBJECT_KEY'] = 
"{}/TrainingMetrics.json".format(os.environ.get('TARGET_S3_PREFIX', 'bucket')) +config['MODEL_METADATA_FILE_S3_KEY'] = "{}/model/model_metadata.json".format(os.environ.get('TARGET_S3_PREFIX', 'bucket')) +config['REWARD_FILE_S3_KEY'] = "{}/reward_function.py".format(os.environ.get('TARGET_S3_PREFIX', 'bucket')) +config['SAGEMAKER_SHARED_S3_BUCKET'] = os.environ.get('TARGET_S3_BUCKET', 'bucket') +config['SAGEMAKER_SHARED_S3_PREFIX'] = os.environ.get('TARGET_S3_PREFIX', 'rl-deepracer-sagemaker') + +# Car and training +config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') +config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar') +config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') +config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') +config['DISPLAY_NAME'] = os.environ.get('DR_DISPLAY_NAME', 'racer1') +config['RACER_NAME'] = os.environ.get('DR_RACER_NAME', 'racer1') + +config['ALTERNATE_DRIVING_DIRECTION'] = os.environ.get('DR_TRAIN_ALTERNATE_DRIVING_DIRECTION', os.environ.get('DR_ALTERNATE_DRIVING_DIRECTION', 'false')) +config['CHANGE_START_POSITION'] = os.environ.get('DR_TRAIN_CHANGE_START_POSITION', os.environ.get('DR_CHANGE_START_POSITION', 'true')) +config['ROUND_ROBIN_ADVANCE_DIST'] = os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST', '0.05') +config['START_POSITION_OFFSET'] = os.environ.get('DR_TRAIN_START_POSITION_OFFSET', '0.00') +config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') +config['MIN_EVAL_TRIALS'] = os.environ.get('DR_TRAIN_MIN_EVAL_TRIALS', '5') + +# Object Avoidance +if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE': + config['NUMBER_OF_OBSTACLES'] = os.environ.get('DR_OA_NUMBER_OF_OBSTACLES', '6') + config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0') + config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True') + config['IS_OBSTACLE_BOT_CAR'] = 
os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false') + + object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") + if object_position_str != "": + object_positions = [] + for o in object_position_str.split(";"): + object_positions.append(o) + config['OBJECT_POSITIONS'] = object_positions + config['NUMBER_OF_OBSTACLES'] = str(len(object_positions)) + +# Head to Bot +if config['RACE_TYPE'] == 'HEAD_TO_BOT': + config['IS_LANE_CHANGE'] = os.environ.get('DR_H2B_IS_LANE_CHANGE', 'False') + config['LOWER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME', '3.0') + config['UPPER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME', '5.0') + config['LANE_CHANGE_DISTANCE'] = os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE', '1.0') + config['NUMBER_OF_BOT_CARS'] = os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS', '0') + config['MIN_DISTANCE_BETWEEN_BOT_CARS'] = os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS', '2.0') + config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False') + config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2') + +local_yaml_path = os.path.abspath(os.path.join(os.environ.get('WORK_DIR'),'training_params.yaml')) +print(local_yaml_path) +with open(local_yaml_path, 'w') as yaml_file: + yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) \ No newline at end of file diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index baf6a480..dbd36d72 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -44,8 +44,8 @@ then echo "*** DRYRUN MODE ***" fi -TARGET_S3_BUCKET=${DR_UPLOAD_S3_BUCKET} -TARGET_S3_PREFIX=${DR_UPLOAD_S3_PREFIX} +export TARGET_S3_BUCKET=${DR_UPLOAD_S3_BUCKET} +export TARGET_S3_PREFIX=${DR_UPLOAD_S3_PREFIX} if [[ -z "${DR_UPLOAD_S3_BUCKET}" ]]; then @@ -70,12 +70,14 @@ SOURCE_S3_CONFIG=${DR_LOCAL_S3_CUSTOM_FILES_PREFIX} 
SOURCE_S3_REWARD=${DR_LOCAL_S3_REWARD_KEY} SOURCE_S3_METRICS="${DR_LOCAL_S3_METRICS_PREFIX}/TrainingMetrics.json" -WORK_DIR=${DR_DIR}/tmp/upload/ +export WORK_DIR=${DR_DIR}/tmp/upload/ mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}model ${WORK_DIR}ip -# Download information on model. +# Upload information on model. +TARGET_PARAMS_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/training_params.yaml" TARGET_REWARD_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/reward_function.py" TARGET_HYPERPARAM_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ip/hyperparameters.json" +TARGET_METRICS_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/TrainingMetrics.json" # Check if metadata-files are available REWARD_IN_ROOT=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 ls s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/reward_function.py 2> /dev/null | wc -l) @@ -89,9 +91,9 @@ fi METADATA_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/model_metadata.json ${WORK_DIR} --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f 2> /dev/null) HYPERPARAM_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/ip/hyperparameters.json ${WORK_DIR} --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink -f 2> /dev/null) -# METRICS_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_METRICS} ${WORK_DIR} --no-progress | awk '/metric/ {print $4}'| xargs readlink -f 2> /dev/null) +METRICS_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_METRICS} ${WORK_DIR} --no-progress | awk '/metric/ {print $4}'| xargs readlink -f 2> /dev/null) -if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$HYPERPARAM_FILE" ]; +if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$HYPERPARAM_FILE" ] && [ -n "$METRICS_FILE" ]; then echo "All 
meta-data files found. Looking for checkpoint." else @@ -146,6 +148,9 @@ else exit 1 fi +# Create Training Params Yaml. +PARAMS_FILE=$(python3 $DR_DIR/scripts/upload/prepare-config.py) + # Upload files if [[ -z "${OPT_FORCE}" ]]; then @@ -163,5 +168,6 @@ cd ${WORK_DIR} echo ${CHECKPOINT_JSON} > ${WORK_DIR}model/deepracer_checkpoints.json aws ${DR_UPLOAD_PROFILE} s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ ${OPT_DRYRUN} ${OPT_WIPE} aws ${DR_UPLOAD_PROFILE} s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} ${OPT_DRYRUN} -# aws ${DR_UPLOAD_PROFILE} s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} +aws ${DR_UPLOAD_PROFILE} s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} +aws ${DR_UPLOAD_PROFILE} s3 cp ${PARAMS_FILE} ${TARGET_PARAMS_FILE_S3_KEY} ${OPT_DRYRUN} aws ${DR_UPLOAD_PROFILE} s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} ${OPT_DRYRUN} From bfc91d498fc337e2690b899dbde7aae85ac24fc1 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Tue, 29 Dec 2020 20:43:12 +0100 Subject: [PATCH 187/428] Upload include BODY_SHELL_TYPE --- defaults/template-run.env | 5 +++-- scripts/training/prepare-config.py | 3 +++ scripts/upload/prepare-config.py | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/defaults/template-run.env b/defaults/template-run.env index ecc81c7e..cdd613a3 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -1,10 +1,11 @@ DR_RUN_ID=0 DR_WORLD_NAME=reInvent2019_track DR_RACE_TYPE=TIME_TRIAL -DR_CAR_COLOR=Red DR_CAR_NAME=FastCar +DR_CAR_BODY_SHELL_TYPE=deepracer +DR_CAR_COLOR=Red DR_DISPLAY_NAME=$DR_CAR_NAME -DR_RACER_NAME=racer1 +DR_RACER_NAME=$DR_CAR_NAME DR_ENABLE_DOMAIN_RANDOMIZATION=False DR_EVAL_NUMBER_OF_TRIALS=3 DR_EVAL_IS_CONTINUOUS=True diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index 131f81b3..554c39b7 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py 
@@ -31,6 +31,9 @@ config['TRAINING_JOB_ARN'] = 'arn:Dummy' # Car and training +config['BODY_SHELL_TYPE'] = os.environ.get('DR_CAR_BODY_SHELL_TYPE', 'deepracer') +if config['BODY_SHELL_TYPE'] == 'deepracer': + config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar') config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') diff --git a/scripts/upload/prepare-config.py b/scripts/upload/prepare-config.py index 80f672a6..a7520a39 100755 --- a/scripts/upload/prepare-config.py +++ b/scripts/upload/prepare-config.py @@ -19,7 +19,9 @@ config['SAGEMAKER_SHARED_S3_PREFIX'] = os.environ.get('TARGET_S3_PREFIX', 'rl-deepracer-sagemaker') # Car and training -config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') +config['BODY_SHELL_TYPE'] = os.environ.get('DR_CAR_BODY_SHELL_TYPE', 'deepracer') +if config['BODY_SHELL_TYPE'] == 'deepracer': + config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar') config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') From 8946a979da76f754f303fb272b441b41cb8d5739 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Wed, 30 Dec 2020 13:41:27 +0000 Subject: [PATCH 188/428] Fixing issue with detection of eval robomaker --- bin/scripts_wrapper.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index 505c318e..56949dd8 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -219,7 +219,7 @@ function dr-logs-robomaker { echo "Robomaker #${OPT_REPLICA} is not running." return 1 fi - ROBOMAKER_CONTAINER=$(dr-find-robomaker -n ${OPT_REPLICA}) + ROBOMAKER_CONTAINER=$(dr-find-robomaker -n ${OPT_REPLICA} ${OPT_EVAL}) done else echo "Robomaker #${OPT_REPLICA} is not running." 
From 2304f3f206118c92ec494842d792c152531a2a32 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Wed, 30 Dec 2020 13:45:35 +0000 Subject: [PATCH 189/428] Remove output causing issues when waiting --- bin/scripts_wrapper.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index 56949dd8..be75a422 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -267,8 +267,6 @@ function dr-find-robomaker { eval ROBOMAKER_ID=$(docker ps | grep "${OPT_PREFIX}-${DR_RUN_ID}_robomaker.${OPT_REPLICA}" | cut -f1 -d\ | head -1) if [ -n "$ROBOMAKER_ID" ]; then echo $ROBOMAKER_ID - else - echo "Robomaker is not running." fi } From 29888367c98cf773474e4f8bba6b9eadb2ac6f0b Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Wed, 30 Dec 2020 15:08:19 +0000 Subject: [PATCH 190/428] Clear variable for Robomaker logs --- bin/scripts_wrapper.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index be75a422..aa8fbb8b 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -186,6 +186,7 @@ function dr-find-sagemaker { function dr-logs-robomaker { OPT_REPLICA=1 + OPT_EVAL="" local OPTIND OPT_TIME="--since 5m" From 6407812edc68ab8dcd57eac274101adfb196be25 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Wed, 30 Dec 2020 16:14:11 +0100 Subject: [PATCH 191/428] Don't restart minio --- bin/scripts_wrapper.sh | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index aa8fbb8b..99184754 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -1,15 +1,6 @@ #!/bin/bash function dr-upload-custom-files { - if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; - then - if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; - then - docker stack deploy $DR_MINIO_COMPOSE_FILE s3 - else - docker-compose $DR_MINIO_COMPOSE_FILE -p s3 --log-level ERROR up -d - fi - fi eval CUSTOM_TARGET=$(echo 
s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Uploading files to $CUSTOM_TARGET" aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DIR/custom_files/ $CUSTOM_TARGET @@ -40,15 +31,6 @@ function dr-increment-upload-model { } function dr-download-custom-files { - if [[ "${DR_CLOUD,,}" == "azure" || "${DR_CLOUD,,}" == "local" ]]; - then - if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; - then - docker stack deploy $DR_MINIO_COMPOSE_FILE s3 - else - docker-compose $DR_MINIO_COMPOSE_FILE -p s3 --log-level ERROR up -d - fi - fi eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Downloading files from $CUSTOM_TARGET" aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $CUSTOM_TARGET $DIR/custom_files/ From 9269fd88c72b7c9b4ebb45cfe7a3f68d24b8042b Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Wed, 30 Dec 2020 16:14:50 +0100 Subject: [PATCH 192/428] Moving to sagemaker 3.0.2 --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 95ca5fb1..3a2820e9 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -3,6 +3,6 @@ "containers": { "rl_coach": "v3.0.5", "robomaker": "3.0.6", - "sagemaker": "3.0.1" + "sagemaker": "3.0.2" } } From 3bdc3b71cd1f39cea0718ad7bfa1743511b6390e Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 31 Dec 2020 17:37:34 +0100 Subject: [PATCH 193/428] Add Head-to-Model - remove Tournament (#95) * Remove tournament part 1 * Enable basic HEAD_TO_MODEL * Tuning Head-to-Model * Fixing bug with display name in non-H2H --- bin/scripts_wrapper.sh | 8 +-- defaults/sample-tournament.json | 24 -------- defaults/template-run.env | 7 ++- scripts/evaluation/prepare-config.py | 80 ++++++++++++++++++++----- scripts/tournament/prepare-config.py | 90 ---------------------------- scripts/tournament/start.sh | 85 -------------------------- 
scripts/tournament/stop.sh | 15 ----- 7 files changed, 71 insertions(+), 238 deletions(-) delete mode 100644 defaults/sample-tournament.json delete mode 100755 scripts/tournament/prepare-config.py delete mode 100755 scripts/tournament/start.sh delete mode 100755 scripts/tournament/stop.sh diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index 99184754..2492d63f 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -60,12 +60,7 @@ function dr-stop-evaluation { function dr-start-tournament { - dr-update-env - $DIR/scripts/tournament/start.sh "$@" -} - -function dr-stop-tournament { - ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/tournament && ./stop.sh" + echo "Tournaments are no longer supported. Use Head-to-Model evaluation instead." } @@ -73,6 +68,7 @@ function dr-start-loganalysis { ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/log-analysis && ./start.sh" } + function dr-stop-loganalysis { eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') if [ -n "$LOG_ANALYSIS_ID" ]; then diff --git a/defaults/sample-tournament.json b/defaults/sample-tournament.json deleted file mode 100644 index d22eeb0c..00000000 --- a/defaults/sample-tournament.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "racers": [ - { - "racer_name": "Racer1", - "s3_bucket": "bucket", - "s3_prefix": "Model1" - }, - { - "racer_name": "Racer2", - "s3_bucket": "bucket", - "s3_prefix": "Model2" - }, - { - "racer_name": "Racer3", - "s3_bucket": "bucket", - "s3_prefix": "Model3" - }, - { - "racer_name": "Racer4", - "s3_bucket": "bucket", - "s3_prefix": "Model4" - } - ] -} diff --git a/defaults/template-run.env b/defaults/template-run.env index cdd613a3..e1622415 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -12,6 +12,11 @@ DR_EVAL_IS_CONTINUOUS=True DR_EVAL_OFF_TRACK_PENALTY=5.0 DR_EVAL_COLLISION_PENALTY=5.0 DR_EVAL_SAVE_MP4=False +DR_EVAL_OPP_S3_MODEL_PREFIX=rl-deepracer-sagemaker +DR_EVAL_OPP_CAR_BODY_SHELL_TYPE=deepracer 
+DR_EVAL_OPP_CAR_NAME=FasterCar +DR_EVAL_OPP_DISPLAY_NAME=$DR_EVAL_OPP_CAR_NAME +DR_EVAL_OPP_RACER_NAME=$DR_EVAL_OPP_CAR_NAME DR_TRAIN_CHANGE_START_POSITION=True DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False DR_TRAIN_START_POSITION_OFFSET=0.0 @@ -25,8 +30,6 @@ DR_LOCAL_S3_PRETRAINED_CHECKPOINT=last DR_LOCAL_S3_CUSTOM_FILES_PREFIX=custom_files DR_LOCAL_S3_TRAINING_PARAMS_FILE=training_params.yaml DR_LOCAL_S3_EVAL_PARAMS_FILE=evaluation_params.yaml -DR_LOCAL_S3_TOURNAMENT_PARAMS_FILE=tournament-params.yaml -DR_LOCAL_S3_TOURNAMENT_JSON_FILE=tournament.json DR_LOCAL_S3_MODEL_METADATA_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/model_metadata.json DR_LOCAL_S3_HYPERPARAMETERS_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/hyperparameters.json DR_LOCAL_S3_REWARD_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/reward_function.py diff --git a/scripts/evaluation/prepare-config.py b/scripts/evaluation/prepare-config.py index e3b0fdd8..3d8d6115 100755 --- a/scripts/evaluation/prepare-config.py +++ b/scripts/evaluation/prepare-config.py @@ -12,39 +12,56 @@ def str2bool(v): return v.lower() in ("yes", "true", "t", "1") config = {} +config['CAR_COLOR'] = [] +config['BODY_SHELL_TYPE'] = [] +config['RACER_NAME'] = [] +config['DISPLAY_NAME'] = [] +config['MODEL_S3_PREFIX'] = [] +config['MODEL_S3_BUCKET'] = [] +config['SIMTRACE_S3_PREFIX'] = [] +config['SIMTRACE_S3_BUCKET'] = [] +config['KINESIS_VIDEO_STREAM_NAME'] = [] +config['METRICS_S3_BUCKET'] = [] +config['METRICS_S3_OBJECT_KEY'] = [] +config['MP4_S3_BUCKET'] = [] +config['MP4_S3_OBJECT_PREFIX'] = [] # Basic configuration; including all buckets etc. 
config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1') config['JOB_TYPE'] = 'EVALUATION' config['KINESIS_VIDEO_STREAM_NAME'] = os.environ.get('DR_KINESIS_STREAM_NAME', 'my-kinesis-stream') config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy') -config['MODEL_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') -config['MODEL_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') -config['SIMTRACE_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') -config['SIMTRACE_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') + +config['MODEL_S3_PREFIX'].append(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker')) +config['MODEL_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) +config['SIMTRACE_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) +config['SIMTRACE_S3_PREFIX'].append(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker')) # Metrics -config['METRICS_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') +config['METRICS_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) metrics_prefix = os.environ.get('DR_LOCAL_S3_METRICS_PREFIX', None) if metrics_prefix is not None: - config['METRICS_S3_OBJECT_KEY'] = '{}/EvaluationMetrics-{}.json'.format(metrics_prefix, str(round(time.time()))) + config['METRICS_S3_OBJECT_KEY'].append('{}/EvaluationMetrics-{}.json'.format(metrics_prefix, str(round(time.time())))) else: - config['METRICS_S3_OBJECT_KEY'] = 'DeepRacer-Metrics/EvaluationMetrics-{}.json'.format(str(round(time.time()))) + config['METRICS_S3_OBJECT_KEY'].append('DeepRacer-Metrics/EvaluationMetrics-{}.json'.format(str(round(time.time())))) # MP4 configuration / sav save_mp4 = str2bool(os.environ.get("DR_EVAL_SAVE_MP4", "False")) if save_mp4: - config['MP4_S3_BUCKET'] = config['MODEL_S3_BUCKET'] - config['MP4_S3_OBJECT_PREFIX'] = 
'{}/{}'.format(config['MODEL_S3_PREFIX'],'mp4') + config['MP4_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) + config['MP4_S3_OBJECT_PREFIX'].append('{}/{}'.format(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'bucket'),'mp4')) # Car and training -config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') -config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar') +body_shell_type = os.environ.get('DR_CAR_BODY_SHELL_TYPE', 'deepracer') +config['BODY_SHELL_TYPE'].append(body_shell_type) +if body_shell_type == 'deepracer': + config['CAR_COLOR'].append(os.environ.get('DR_CAR_COLOR', 'Red')) +config['DISPLAY_NAME'].append(os.environ.get('DR_DISPLAY_NAME', 'racer1')) +config['RACER_NAME'].append(os.environ.get('DR_RACER_NAME', 'racer1')) + config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') config['NUMBER_OF_TRIALS'] = os.environ.get('DR_EVAL_NUMBER_OF_TRIALS', '5') -config['DISPLAY_NAME'] = os.environ.get('DR_DISPLAY_NAME', 'racer1') -config['RACER_NAME'] = os.environ.get('DR_RACER_NAME', 'racer1') config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') is_continous = str2bool(os.environ.get('DR_EVAL_IS_CONTINUOUS', 'False')) @@ -81,17 +98,48 @@ def str2bool(v): config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False') config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2') +# Head to Model +if config['RACE_TYPE'] == 'HEAD_TO_MODEL': + config['MODEL_S3_PREFIX'].append(os.environ.get('DR_EVAL_OPP_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker')) + config['MODEL_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) + config['SIMTRACE_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) + config['SIMTRACE_S3_PREFIX'].append(os.environ.get('DR_EVAL_OPP_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker')) + + # Metrics + 
config['METRICS_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) + metrics_prefix = os.environ.get('DR_EVAL_OPP_S3_METRICS_PREFIX', '{}/{}'.format(os.environ.get('DR_EVAL_OPP_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker'),'metrics')) + if metrics_prefix is not None: + config['METRICS_S3_OBJECT_KEY'].append('{}/EvaluationMetrics-{}.json'.format(metrics_prefix, str(round(time.time())))) + else: + config['METRICS_S3_OBJECT_KEY'].append('DeepRacer-Metrics/EvaluationMetrics-{}.json'.format(str(round(time.time())))) + + # MP4 configuration / sav + save_mp4 = str2bool(os.environ.get("DR_EVAL_SAVE_MP4", "False")) + if save_mp4: + config['MP4_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) + config['MP4_S3_OBJECT_PREFIX'].append('{}/{}'.format(os.environ.get('DR_EVAL_OPP_MODEL_PREFIX', 'bucket'),'mp4')) + + # Car and training + config['DISPLAY_NAME'].append(os.environ.get('DR_EVAL_OPP_DISPLAY_NAME', 'racer1')) + config['RACER_NAME'].append(os.environ.get('DR_EVAL_OPP_RACER_NAME', 'racer1')) + + body_shell_type = os.environ.get('DR_EVAL_OPP_CAR_BODY_SHELL_TYPE', 'deepracer') + config['BODY_SHELL_TYPE'].append(body_shell_type) + config['VIDEO_JOB_TYPE'] = 'EVALUATION' + config['CAR_COLOR'] = ['Purple', 'Orange'] + config['MODEL_NAME'] = config['DISPLAY_NAME'] + # S3 Setup / write and upload file s3_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None) s3_region = config['AWS_REGION'] -s3_bucket = config['MODEL_S3_BUCKET'] -s3_prefix = config['MODEL_S3_PREFIX'] +s3_bucket = config['MODEL_S3_BUCKET'][0] +s3_prefix = config['MODEL_S3_PREFIX'][0] s3_mode = os.environ.get('DR_LOCAL_S3_AUTH_MODE','profile') if s3_mode == 'profile': s3_profile = os.environ.get('DR_LOCAL_S3_PROFILE', 'default') else: # mode is 'role' s3_profile = None -s3_yaml_name = os.environ.get('DR_LOCAL_S3_EVAL_PARAMS_FILE', 'eval-params.yaml') +s3_yaml_name = os.environ.get('DR_LOCAL_S3_EVAL_PARAMS_FILE', 'eval_params.yaml') yaml_key = 
os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) session = boto3.session.Session(profile_name=s3_profile) diff --git a/scripts/tournament/prepare-config.py b/scripts/tournament/prepare-config.py deleted file mode 100755 index 362374a0..00000000 --- a/scripts/tournament/prepare-config.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/python3 - -import boto3 -import sys -import os -import time -import json -import io -import yaml - -def str2bool(v): - return v.lower() in ("yes", "true", "t", "1") - -config = {} - -# Basic configuration; common for all racers - -tournament_s3_prefix = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'tournament') -tournament_s3_bucket = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') - -config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1') -config['JOB_TYPE'] = 'EVALUATION' -config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy') -config['RACE_TYPE'] = 'HEAD_TO_MODEL' -config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') -config['NUMBER_OF_TRIALS'] = os.environ.get('DR_EVAL_NUMBER_OF_TRIALS', '5') - -is_continous = str2bool(os.environ.get('DR_EVAL_IS_CONTINUOUS', 'False')) -if is_continous: - config['NUMBER_OF_RESETS'] = '10000' - config['IS_CONTINUOUS'] = os.environ.get('DR_EVAL_IS_CONTINUOUS', 'True') - -config['OFF_TRACK_PENALTY'] = os.environ.get('DR_EVAL_OFF_TRACK_PENALTY', '5.0') - -# Tournament bucket for logs, and overall storage -tournament_config = os.environ.get('DR_LOCAL_S3_TOURNAMENT_JSON_FILE', 'tournament.json') -print("Reading in tournament file {}".format(tournament_config)) - -config['RACER_NAME'] = [] -config['DISPLAY_NAME'] = [] -config['MODEL_S3_PREFIX'] = [] -config['MODEL_S3_BUCKET'] = [] -config['SIMTRACE_S3_PREFIX'] = [] -config['SIMTRACE_S3_BUCKET'] = [] -config['KINESIS_VIDEO_STREAM_NAME'] = [] -config['METRICS_S3_BUCKET'] = [] -config['METRICS_S3_PREFIX'] = [] -config['MP4_S3_BUCKET'] = [] -config['MP4_S3_OBJECT_PREFIX'] = [] 
-config['MODEL_METADATA_FILE_S3_KEY'] = [] - -with open(tournament_config) as tournament_config_json: - tournament_config_data = json.load(tournament_config_json) - for r in tournament_config_data['racers']: - config['RACER_NAME'].append(r['racer_name']) - config['DISPLAY_NAME'].append(r['racer_name']) - config['MODEL_S3_PREFIX'].append(r['s3_prefix']) - config['MODEL_S3_BUCKET'].append(r['s3_bucket']) - config['MODEL_METADATA_FILE_S3_KEY'].append("{}/model/model_metadata.json".format(r['s3_prefix'])) - config['KINESIS_VIDEO_STREAM_NAME'].append("None") - config['SIMTRACE_S3_BUCKET'].append(tournament_s3_bucket) - config['SIMTRACE_S3_PREFIX'].append("{}/{}/simtrace/".format(tournament_s3_prefix, r['racer_name'])) - config['MP4_S3_BUCKET'].append(tournament_s3_bucket) - config['MP4_S3_OBJECT_PREFIX'].append("{}/{}/mp4/".format(tournament_s3_prefix, r['racer_name'])) - config['METRICS_S3_BUCKET'].append(tournament_s3_bucket) - config['METRICS_S3_PREFIX'].append("{}/{}/metrics/".format(tournament_s3_prefix, r['racer_name'])) - -# S3 Setup / write and upload file -s3_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None) -s3_region = config['AWS_REGION'] -s3_bucket = tournament_s3_bucket -s3_prefix = tournament_s3_prefix -s3_mode = os.environ.get('DR_LOCAL_S3_AUTH_MODE','profile') -if s3_mode == 'profile': - s3_profile = os.environ.get('DR_LOCAL_S3_PROFILE', 'default') -else: # mode is 'role' - s3_profile = None -s3_yaml_name = os.environ.get('DR_LOCAL_S3_TOURNAMENT_PARAMS_FILE', 'tournament-params.yaml') -yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) - -session = boto3.session.Session(profile_name=s3_profile) -s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_endpoint_url) - -yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) -local_yaml_path = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'tmp', 'tournament-params-' + str(round(time.time())) + '.yaml')) - -with open(local_yaml_path, 'w') as 
yaml_file: - yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) - -s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) diff --git a/scripts/tournament/start.sh b/scripts/tournament/start.sh deleted file mode 100755 index 5e0f8f45..00000000 --- a/scripts/tournament/start.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env bash - -source $DR_DIR/bin/scripts_wrapper.sh - -usage(){ - echo "Usage: $0 [-q] [-f yaml-file]" - echo " -q Quiet - does not start log tracing." - echo " -f filename Tournament Yaml configuration." - echo " -w Wipe tournament / restart." - exit 1 -} - -trap ctrl_c INT - -function ctrl_c() { - echo "Requested to stop." - exit 1 -} - -while getopts ":wqf:" opt; do -case $opt in -q) OPT_QUIET="QUIET" -;; -f) OPT_YAML_FILE="$OPTARG" -;; -h) usage -;; -w) OPT_WIPE="WIPE" -;; -\?) echo "Invalid option -$OPTARG" >&2 -usage -;; -esac -done - -# set evaluation specific environment variables -S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX" -STACK_NAME="deepracer-eval-$DR_RUN_ID" - -export ROBOMAKER_COMMAND="./run.sh run tournament.launch" -export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_TOURNAMENT_PARAMS_FILE} - -#Check if files are available -S3_FILES=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls ${S3_PATH} | wc -l) -if [[ $S3_FILES > 0 ]]; -then - if [[ -z $OPT_WIPE ]]; - then - echo "Selected path $S3_PATH exists. Continuing execution of tournament." - else - echo "Wiping path $S3_PATH." 
- aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 rm --recursive ${S3_PATH} - echo "Creating Robomaker configuration in $S3_PATH/$DR_CURRENT_PARAMS_FILE" - python3 $DR_DIR/scripts/tournament/prepare-config.py - fi -else - echo "Creating Robomaker configuration in $S3_PATH/$DR_CURRENT_PARAMS_FILE" - python3 $DR_DIR/scripts/tournament/prepare-config.py -fi - -if [ ${DR_ROBOMAKER_MOUNT_LOGS,,} = "true" ]; -then - COMPOSE_FILES="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-mount.yml" - export DR_MOUNT_DIR="$DR_DIR/data/logs/robomaker/$DR_LOCAL_S3_MODEL_PREFIX" - mkdir -p $DR_MOUNT_DIR -else - COMPOSE_FILES="$DR_EVAL_COMPOSE_FILE" -fi - -# Check if we will use Docker Swarm or Docker Compose -if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; -then - docker stack deploy $COMPOSE_FILES $STACK_NAME -else - docker-compose $COMPOSE_FILES --log-level ERROR -p $STACK_NAME up -d -fi - -# Request to be quiet. Quitting here. -if [ -n "$OPT_QUIET" ]; then - exit 0 -fi - -# Trigger requested log-file -dr-logs-robomaker -w 15 -e - diff --git a/scripts/tournament/stop.sh b/scripts/tournament/stop.sh deleted file mode 100755 index 4743dd4a..00000000 --- a/scripts/tournament/stop.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash - -STACK_NAME="deepracer-eval-$DR_RUN_ID" -RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX} - -# Check if we will use Docker Swarm or Docker Compose -if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; -then - docker stack rm $STACK_NAME -else - COMPOSE_FILES=$(echo ${DR_EVAL_COMPOSE_FILE} | cut -f1-2 -d\ ) - export DR_CURRENT_PARAMS_FILE="" - export ROBOMAKER_COMMAND="" - docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR down -fi \ No newline at end of file From 0ae3f244b2f1f936ec795b000482e127624e91e0 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Thu, 31 Dec 2020 17:42:50 +0100 Subject: [PATCH 194/428] Update documentation to cover Head-to-Head --- docs/head-to-head.md | 26 ++++++++++++++++++++++++++ docs/index.md | 2 +- docs/tournament.md | 
36 ------------------------------------ 3 files changed, 27 insertions(+), 37 deletions(-) create mode 100644 docs/head-to-head.md delete mode 100644 docs/tournament.md diff --git a/docs/head-to-head.md b/docs/head-to-head.md new file mode 100644 index 00000000..f8ea6ba1 --- /dev/null +++ b/docs/head-to-head.md @@ -0,0 +1,26 @@ +# Head-to-Head Race (Beta) + +It is possible to run a head-to-head race, similar to the races in the brackets +run by AWS in the Virtual Circuits to determine the winner of the head-to-bot races. + +This replaces the "Tournament Mode". + +## Introduction + +The concept is that you have two models racing each other, one Purple and one Orange Car. One car +is powered by our primary configured model, and the second car is powered by the model in `DR_EVAL_OPP_S3_MODEL_PREFIX` + +## Configuration + +### run.env + +Configure `run.env` with the following parameters: +* `DR_RACE_TYPE` should be `HEAD_TO_MODEL`. +* `DR_EVAL_OPP_S3_MODEL_PREFIX` will be the S3 prefix for the secondary model. +* `DR_EVAL_OPP_CAR_NAME` is the display name of this model. + +Metrics, Traces and Videos will be stored in each models' prefix. + +## Run + +Run the race with `dr-start-evaluation`; one race will be run. 
\ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 01ac6fb4..50487bdb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -50,7 +50,7 @@ DRfC supports a wide set of features to ensure that you can focus on creating th * [GPU Accelerated OpenGL for Robomaker](opengl.md) * [Having multiple GPUs in one Computer](multi_gpu.md) * [Installing on Windows](windows.md) -* [Run a Head-to-Head Tournament](tournament.md) +* [Run a Head-to-Head Race](head-to-head.md) # Support diff --git a/docs/tournament.md b/docs/tournament.md deleted file mode 100644 index 762d9892..00000000 --- a/docs/tournament.md +++ /dev/null @@ -1,36 +0,0 @@ -# Head-to-Head Tournament (Beta) - -It is possible to run a head-to-head tournament, similar to the elimination brackets -run by AWS in the Virtual Circuits to determine the winner of the head-to-bot races. - -## Introduction - -The concept for tournament is that you have a set of models, each in their own path -(S3 bucket + prefix). Additionally you define one prefix where all the outcomes will be stored. - -Each race in the tournament will require you to start and stop the tournament execution; the code will update the outcome prefix with the current status. - -## Configuration - -### run.env - -Configure `run.env` with the following parameters: -* `DR_LOCAL_S3_MODEL_PREFIX` will be the path where all the outcomes are stored. -* `DR_LOCAL_S3_TOURNAMENT_JSON_FILE` is the local filesystem path to your tournament configuation -* `DR_LOCAL_S3_TOURNAMENT_PARAMS_FILE` is the path where the generated tournament parameters are uploaded - in S3. Can be left unchanged in most cases. -* `DR_EVAL_NUMBER_OF_TRIALS`, `DR_EVAL_IS_CONTINUOUS`, `DR_EVAL_OFF_TRACK_PENALTY`, - `DR_EVAL_COLLISION_PENALTY` and `DR_EVAL_SAVE_MP4` to be configured as a normal evaluation run. - - -### tournament.json - -Create a `tournament.json` based on `defaults/sample-tournament.json`. You will have one entry per model. 
-Required configuration per racer is: -* `racer_name`: The display name of the racer -* `s3_bucket`: The S3 bucket where the model for this racer is stored -* `s3_prefix`: The S3 prefix where the model for this racer is stored. - -## Run - -Run the tournament with `dr-start-tournament`; one race will be run. Once completed you need to do `dr-stop-tournament` and `dr-start-tournament` to make it run the next race. Iterate until done. \ No newline at end of file From b108d33374406dee05256dbefcc1c8f30d6d29ab Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 3 Jan 2021 10:22:38 +0000 Subject: [PATCH 195/428] Merge branch 'master' into dev From 396f46eedf2c9da94aa523769089e8cd93f81cd4 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 9 Jan 2021 17:50:43 +0100 Subject: [PATCH 196/428] Fix to enable non-local Minio --- bin/activate.sh | 6 ++++-- docker/docker-compose-endpoint.yml | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 3063de88..905fa97b 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -97,6 +97,7 @@ then elif [[ "${DR_CLOUD,,}" == "local" ]]; then export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" + export DR_MINIO_URL="http://minio:9000" DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" @@ -104,9 +105,10 @@ then elif [[ "${DR_CLOUD,,}" == "remote" ]]; then export DR_LOCAL_S3_ENDPOINT_URL="$DR_REMOTE_MINIO_URL" + export DR_MINIO_URL="$DR_REMOTE_MINIO_URL" DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" - DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP 
$DIR/docker/docker-compose-training.yml" - DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml" + DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" + DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" else DR_LOCAL_PROFILE_ENDPOINT_URL="" DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml" diff --git a/docker/docker-compose-endpoint.yml b/docker/docker-compose-endpoint.yml index a14b009f..83d5eb4c 100644 --- a/docker/docker-compose-endpoint.yml +++ b/docker/docker-compose-endpoint.yml @@ -3,7 +3,7 @@ version: '3.7' services: rl_coach: environment: - - S3_ENDPOINT_URL=http://minio:9000 + - S3_ENDPOINT_URL=${DR_MINIO_URL} robomaker: environment: - - S3_ENDPOINT_URL=http://minio:9000 + - S3_ENDPOINT_URL=${DR_MINIO_URL} From becb226be8d420df0891f56fba0ce3ef4b8a548a Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 9 Jan 2021 18:17:30 +0100 Subject: [PATCH 197/428] Fix to ensure Azure compatibility --- bin/activate.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/activate.sh b/bin/activate.sh index 905fa97b..8b53c241 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -90,6 +90,7 @@ fi if [[ "${DR_CLOUD,,}" == "azure" ]]; then export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" + export DR_MINIO_URL="http://minio:9000" DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" From 440bbf4fdf69c207c37cde98d374d9aab8b5c9d0 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sat, 9 Jan 
2021 20:16:21 +0000 Subject: [PATCH 198/428] Ensure that Minio Compose file is undefined if not needed --- bin/activate.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/activate.sh b/bin/activate.sh index 8b53c241..e4ccffdd 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -110,6 +110,7 @@ then DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" + DR_MINIO_COMPOSE_FILE="" else DR_LOCAL_PROFILE_ENDPOINT_URL="" DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml" From 3843aa30d1de1d1ce2458fab342ecc0d80826f8a Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sat, 9 Jan 2021 21:14:16 +0000 Subject: [PATCH 199/428] Bump to 3.1.0-dev sagemaker --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 24682e41..702c5a00 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -3,6 +3,6 @@ "containers": { "rl_coach": "v3.1.0-dev", "robomaker": "3.1.0-dev", - "sagemaker": "3.0.2" + "sagemaker": "3.1.0-dev" } } From b2a4c8c7d8351d35564003e759c1c210e49f9cbc Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Mon, 18 Jan 2021 19:03:49 +0000 Subject: [PATCH 200/428] Enable RTF_OVERRIDE --- defaults/template-run.env | 2 ++ docker/docker-compose-eval.yml | 1 + docker/docker-compose-training.yml | 1 + 3 files changed, 4 insertions(+) diff --git a/defaults/template-run.env b/defaults/template-run.env index e1622415..98e6ebb8 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -17,12 +17,14 @@ DR_EVAL_OPP_CAR_BODY_SHELL_TYPE=deepracer 
DR_EVAL_OPP_CAR_NAME=FasterCar DR_EVAL_OPP_DISPLAY_NAME=$DR_EVAL_OPP_CAR_NAME DR_EVAL_OPP_RACER_NAME=$DR_EVAL_OPP_CAR_NAME +#DR_EVAL_RTF=1.0 DR_TRAIN_CHANGE_START_POSITION=True DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False DR_TRAIN_START_POSITION_OFFSET=0.0 DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05 DR_TRAIN_MULTI_CONFIG=False DR_TRAIN_MIN_EVAL_TRIALS=5 +#DR_TRAIN_RTF=1.0 DR_LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker DR_LOCAL_S3_PRETRAINED=False DR_LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained diff --git a/docker/docker-compose-eval.yml b/docker/docker-compose-eval.yml index 1ef8fa8d..b16d103b 100644 --- a/docker/docker-compose-eval.yml +++ b/docker/docker-compose-eval.yml @@ -27,3 +27,4 @@ services: - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} - ENABLE_GUI=${DR_GUI_ENABLE} - ROLLOUT_IDX=0 + - RTF_OVERRIDE=${DR_EVAL_RTF} diff --git a/docker/docker-compose-training.yml b/docker/docker-compose-training.yml index 91f5b9b7..d33d45a2 100644 --- a/docker/docker-compose-training.yml +++ b/docker/docker-compose-training.yml @@ -39,3 +39,4 @@ services: - ENABLE_GUI=${DR_GUI_ENABLE} - CUDA_VISIBLE_DEVICES - MULTI_CONFIG + - RTF_OVERRIDE=${DR_TRAIN_RTF} From d65d4eee840c23102f9f99231466806145137546 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 24 Jan 2021 11:07:42 +0000 Subject: [PATCH 201/428] Renaming adaptations --- README.md | 15 ++------------- bin/activate.sh | 2 +- bin/init.sh | 2 +- defaults/dependencies.json | 2 +- docker/docker-compose-eval.yml | 2 +- docker/docker-compose-training.yml | 2 +- docs/index.md | 13 +------------ docs/installation.md | 2 +- utils/sample-createspot.sh | 2 +- 9 files changed, 10 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index b7cf57f2..ab5e9771 100644 --- a/README.md +++ b/README.md @@ -3,18 +3,7 @@ Provides a quick and easy way to get up and running with a DeepRacer training en ## Introduction -DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex 
(https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://github.com/larsll/deepracer-build) repository. - -Main differences to the work done by Alex is: -* Runtime S3 storage is setup to fit the connected cloud platform: - * Azure: Local 'virtual' S3 instance (minio) is now using an Azure Storage Account / Blob Storage as a back-end. This allows for access between sesssions using e.g. Storage Explorer (https://azure.microsoft.com/en-us/features/storage-explorer/). - * AWS: Directly connects to a real S3 bucket. -* Robomaker and Log Analysis containers are extended with required drivers to enable Tensorflow to use the GPU. Containers are all pre-compiled and available from Docker Hub. -* Configuration has been reorganized : - * `custom_files/hyperparameters.json` stores the runtime hyperparameters, which logically belongs together with the model_metadata.json and rewards.py files. - * `system.env` contains system-wide constants (expected to be configured only at setup) - * `run.env` contains user session configuration (pretraining, track etc.) as well as information about where to upload your model (S3 bucket and prefix). - * `docker/.env` remains the home for more static configuration. This is not expected to change between sessions. +DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). 
With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities are in the [Deepracer Build](https://github.com/aws-deepracer-community/deepracer) repository. ## Main Features @@ -43,7 +32,7 @@ DRfC supports a wide set of features to ensure that you can focus on creating th ## Documentation -Full documentation can be found on the [Deepracer-for-Cloud GitHub Pages](https://larsll.github.io/deepracer-for-cloud). +Full documentation can be found on the [Deepracer-for-Cloud GitHub Pages](https://aws-deepracer-community.github.io/deepracer-for-cloud). ## Support diff --git a/bin/activate.sh b/bin/activate.sh index 52d046fa..54630a66 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -171,7 +171,7 @@ if ! verlte $DEPENDENCY_VERSION $ROBOMAKER_VER; then echo "WARNING: Incompatible version of Deepracer Robomaker. Expected >$DEPENDENCY_VERSION. Got $ROBOMAKER_VER." fi -COACH_VER=$(docker inspect larsll/deepracer-rlcoach:$DR_COACH_IMAGE 2> /dev/null | jq -r .[].Config.Labels.version) +COACH_VER=$(docker inspect awsdeepracercommunity/deepracer-rlcoach:$DR_COACH_IMAGE 2> /dev/null | jq -r .[].Config.Labels.version) if [ -z "$COACH_VER" ]; then COACH_VER=$DR_COACH_IMAGE; fi if ! verlte $DEPENDENCY_VERSION $COACH_VER; then echo "WARNING: Incompatible version of Deepracer-for-Cloud Coach. Expected >$DEPENDENCY_VERSION. Got $COACH_VER." 
diff --git a/bin/init.sh b/bin/init.sh index ebf8e597..c8a11cbf 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -142,7 +142,7 @@ else fi sed -i "s//$SAGEMAKER_VERSION/g" $INSTALL_DIR/system.env -docker pull larsll/deepracer-rlcoach:$COACH_VERSION +docker pull awsdeepracercommunity/deepracer-rlcoach:$COACH_VERSION docker pull awsdeepracercommunity/deepracer-robomaker:$ROBOMAKER_VERSION docker pull awsdeepracercommunity/deepracer-sagemaker:$SAGEMAKER_VERSION diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 3a2820e9..0d3b2f53 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,7 +1,7 @@ { "master_version": "3.0", "containers": { - "rl_coach": "v3.0.5", + "rl_coach": "3.0.5", "robomaker": "3.0.6", "sagemaker": "3.0.2" } diff --git a/docker/docker-compose-eval.yml b/docker/docker-compose-eval.yml index 9bbe4fbe..15bc0396 100644 --- a/docker/docker-compose-eval.yml +++ b/docker/docker-compose-eval.yml @@ -7,7 +7,7 @@ networks: services: rl_coach: - image: larsll/deepracer-rlcoach:${DR_COACH_IMAGE} + image: awsdeepracercommunity/deepracer-rlcoach:${DR_COACH_IMAGE} command: ["/bin/bash", "-c", "echo No work for coach in Evaluation Mode"] robomaker: image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE} diff --git a/docker/docker-compose-training.yml b/docker/docker-compose-training.yml index 3b8eb7ed..253ff2a6 100644 --- a/docker/docker-compose-training.yml +++ b/docker/docker-compose-training.yml @@ -7,7 +7,7 @@ networks: services: rl_coach: - image: larsll/deepracer-rlcoach:${DR_COACH_IMAGE} + image: awsdeepracercommunity/deepracer-rlcoach:${DR_COACH_IMAGE} environment: - SAGEMAKER_IMAGE=${DR_SAGEMAKER_IMAGE} - PRETRAINED=${DR_LOCAL_S3_PRETRAINED} diff --git a/docs/index.md b/docs/index.md index 01ac6fb4..59f70e24 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,18 +2,7 @@ Provides a quick and easy way to get up and running with a DeepRacer training environment in AWS or Azure, using either the Azure 
[N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing), or locally on your own desktop or server. -DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://gitbub.com/larsll/deepracer-build) repository. - -Main differences to the work done by Alex is: -* Runtime S3 storage is setup to fit the connected cloud platform: - * Azure: Local 'virtual' S3 instance (minio) is now using an Azure Storage Account / Blob Storage as a back-end. This allows for access between sesssions using e.g. Storage Explorer (https://azure.microsoft.com/en-us/features/storage-explorer/). - * AWS: Directly connects to a real S3 bucket. - * Local: Local 'virtual' S3 instance (minio) storing files locally on the server. -* Robomaker and Log Analysis containers are extended with required drivers to enable Tensorflow to use the GPU. Containers are all pre-compiled and available from Docker Hub. -* Configuration has been reorganized : - * `custom_files/hyperparameters.json` stores the runtime hyperparameters, which logically belongs together with the model_metadata.json and rewards.py files. - * `system.env` contains system-wide constants (expected to be configured only at setup) - * `run.env` contains user session configuration (pretraining, track etc.) as well as information about where to upload your model (S3 bucket and prefix). 
+DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://gitbub.com/aws-deepracer-community/deepracer) repository. # Main Features diff --git a/docs/installation.md b/docs/installation.md index 32617230..bba6e3b1 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -34,7 +34,7 @@ Depending on your needs as well as specific needs of the cloud platform you can The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine. - git clone https://github.com/larsll/deepracer-for-cloud.git + git clone https://github.com/awsdeepracercommunity/deepracer-for-cloud.git **For cloud setup** execute: diff --git a/utils/sample-createspot.sh b/utils/sample-createspot.sh index f4867702..dc79c2e5 100644 --- a/utils/sample-createspot.sh +++ b/utils/sample-createspot.sh @@ -138,4 +138,4 @@ aws ec2 run-instances \ --iam-instance-profile Arn=arn:aws:iam::<####acct_num####>:instance-profile/<####role_name####> \ --instance-market-options MarketType=spot \ --user-data "#!/bin/bash - su -c 'git clone https://github.com/larsll/deepracer-for-cloud.git && echo "$S3_LOCATION/node-config" > /home/ubuntu/deepracer-for-cloud/autorun.s3url && /home/ubuntu/deepracer-for-cloud/bin/prepare.sh' - ubuntu" + su -c 'git clone https://github.com/awsdeepracercommunity/deepracer-for-cloud.git && echo "$S3_LOCATION/node-config" > /home/ubuntu/deepracer-for-cloud/autorun.s3url && /home/ubuntu/deepracer-for-cloud/bin/prepare.sh' - ubuntu" From 
a8062fd0046cb3bd2ea0536a8c46ad8caa57375c Mon Sep 17 00:00:00 2001 From: Tomasz Ptak Date: Wed, 3 Feb 2021 08:14:17 +0000 Subject: [PATCH 202/428] Fix git repository urls --- docs/installation.md | 2 +- utils/sample-createspot.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index bba6e3b1..d5e6b045 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -34,7 +34,7 @@ Depending on your needs as well as specific needs of the cloud platform you can The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine. - git clone https://github.com/awsdeepracercommunity/deepracer-for-cloud.git + git clone https://github.com/aws-deepracer-community/deepracer-for-cloud.git **For cloud setup** execute: diff --git a/utils/sample-createspot.sh b/utils/sample-createspot.sh index dc79c2e5..564174e2 100644 --- a/utils/sample-createspot.sh +++ b/utils/sample-createspot.sh @@ -138,4 +138,4 @@ aws ec2 run-instances \ --iam-instance-profile Arn=arn:aws:iam::<####acct_num####>:instance-profile/<####role_name####> \ --instance-market-options MarketType=spot \ --user-data "#!/bin/bash - su -c 'git clone https://github.com/awsdeepracercommunity/deepracer-for-cloud.git && echo "$S3_LOCATION/node-config" > /home/ubuntu/deepracer-for-cloud/autorun.s3url && /home/ubuntu/deepracer-for-cloud/bin/prepare.sh' - ubuntu" + su -c 'git clone https://github.com/aws-deepracer-community/deepracer-for-cloud.git && echo "$S3_LOCATION/node-config" > /home/ubuntu/deepracer-for-cloud/autorun.s3url && /home/ubuntu/deepracer-for-cloud/bin/prepare.sh' - ubuntu" From 92c175904ac4f3acac4c29a87805628f85c2621f Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Wed, 3 Feb 2021 14:57:56 +0100 Subject: [PATCH 203/428] Fix broken URL --- docs/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/installation.md 
b/docs/installation.md index bba6e3b1..d5e6b045 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -34,7 +34,7 @@ Depending on your needs as well as specific needs of the cloud platform you can The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine. - git clone https://github.com/awsdeepracercommunity/deepracer-for-cloud.git + git clone https://github.com/aws-deepracer-community/deepracer-for-cloud.git **For cloud setup** execute: From 9e4a8d71734c8f735fc8c92f184a6c5a0a95daeb Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Wed, 3 Feb 2021 18:54:20 +0100 Subject: [PATCH 204/428] Update dependencies.json --- defaults/dependencies.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 2a7f4e5e..deea5ffe 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,8 +1,8 @@ { "master_version": "3.1", "containers": { - "rl_coach": "3.1.0-dev", - "robomaker": "3.1.0-dev", - "sagemaker": "3.1.0-dev" + "rl_coach": "3.1.0", + "robomaker": "3.1.0", + "sagemaker": "3.1.0" } } From ee8d9cf75faed3736f0b725c07807e1574e4c8d6 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Thu, 4 Feb 2021 20:20:55 +0100 Subject: [PATCH 205/428] Bump version --- utils/Dockerfile.sagemaker-gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/Dockerfile.sagemaker-gpu b/utils/Dockerfile.sagemaker-gpu index e72da699..44c596ab 100644 --- a/utils/Dockerfile.sagemaker-gpu +++ b/utils/Dockerfile.sagemaker-gpu @@ -1,2 +1,2 @@ -FROM awsdeepracercommunity/deepracer-sagemaker:3.0.1-rc1-gpu +FROM awsdeepracercommunity/deepracer-sagemaker:3.1.0-gpu ENV CUDA_VISIBLE_DEVICES=1 From 9191fdc529235a4ac7dcd89671667cb6cee2d696 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Thu, 4 Feb 2021 21:24:24 +0100 Subject: [PATCH 206/428] Adding SAC sample --- 
defaults/model_metadata_sac.json | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 defaults/model_metadata_sac.json diff --git a/defaults/model_metadata_sac.json b/defaults/model_metadata_sac.json new file mode 100644 index 00000000..bf99366d --- /dev/null +++ b/defaults/model_metadata_sac.json @@ -0,0 +1,8 @@ +{ + "action_space": {"speed": {"high": 2, "low": 1}, "steering_angle": {"high": 30, "low": -30}}, + "sensor": ["FRONT_FACING_CAMERA"], + "neural_network": "DEEP_CONVOLUTIONAL_NETWORK_SHALLOW", + "training_algorithm": "sac", + "action_space_type": "continuous", + "version": "3" +} From 4495704bbf585cf3e8475bd4d3c909dcc40537ea Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 20 Feb 2021 19:18:18 +0100 Subject: [PATCH 207/428] Moving to community loganalysis --- defaults/template-system.env | 1 + scripts/log-analysis/start.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/defaults/template-system.env b/defaults/template-system.env index 2935c781..2e121b7a 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -9,6 +9,7 @@ DR_KINESIS_STREAM_NAME=None DR_KINESIS_STREAM_ENABLE=False DR_SAGEMAKER_IMAGE= DR_ROBOMAKER_IMAGE= +DR_ANALYSIS_IMAGE=cpu DR_COACH_IMAGE= DR_WORKERS=1 DR_ROBOMAKER_MOUNT_LOGS=False diff --git a/scripts/log-analysis/start.sh b/scripts/log-analysis/start.sh index 5e7024c7..1ada0093 100755 --- a/scripts/log-analysis/start.sh +++ b/scripts/log-analysis/start.sh @@ -6,6 +6,6 @@ docker run --rm -d -p "8888:8888" \ -v `pwd`/../../data/analysis:/workspace/analysis \ --name loganalysis \ --network sagemaker-local \ - larsll/deepracer-loganalysis:latest + awsdeepracercommunity/deepracer-analysis:$DR_ANALYSIS_IMAGE docker logs -f loganalysis \ No newline at end of file From 45e32e0f9412a54f8d2d750209a814d5b2ffb5eb Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sat, 20 Feb 2021 19:48:50 +0000 Subject: [PATCH 208/428] Changing loganalysis command --- bin/scripts_wrapper.sh | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index 2492d63f..a9b65e8b 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -285,7 +285,7 @@ function dr-logs-loganalysis { function dr-url-loganalysis { eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') if [ -n "$LOG_ANALYSIS_ID" ]; then - docker exec "$LOG_ANALYSIS_ID" bash -c "source .venv/bin/activate && jupyter notebook list" + docker exec "$LOG_ANALYSIS_ID" bash -c "jupyter notebook list" else echo "Log-analysis is not running." fi From 177baba39d3a0db92f9557bf80f3ff202001cf60 Mon Sep 17 00:00:00 2001 From: Sorin Date: Fri, 26 Feb 2021 17:57:49 +0200 Subject: [PATCH 209/428] docs/installation.md: Updated Troubleshooting section (#54) * docs/installation.md: Updated Troubleshooting section Co-authored-by: Sorin Patrasoiu Co-authored-by: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> --- docs/installation.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/installation.md b/docs/installation.md index d5e6b045..3461429d 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -149,4 +149,13 @@ After a while you will see the sagemaker logs on the screen. ## Troubleshooting -If things do not start as expected - e.g. you get a message "Sagemaker is not running" then run `docker ps -a` to see if the containers are running or if they stopped due to errors. You can use `docker logs -f ` to check the errors. 
+Here are some hints for troubleshooting specific issues you may encounter + +### Local training troubleshooting + +| Issue | Troubleshooting hint | +|------------- | ---------------------| +Get messages like "Sagemaker is not running" | Run `docker -ps a` to see if the containers are running or if they stopped due to some errors +Check docker errors for specific container | Run `docker logs -f ` +Get message "Error response from daemon: could not choose an IP address to advertise since this system has multiple addresses on interface ..." when running `./bin/init.sh -c local -a cpu` | It means you have multiple IP addresses and you need to specify one within `./bin/init.sh`.
If you don't care which one to use, you can get the first one by running ifconfig \| grep $(route \| awk '/^default/ {print $8}') -a1 \| grep -o -P '(?<=inet ).*(?= netmask).
Edit `./bin/init.sh` and locate line `docker swarm init` and change it to `docker swarm init --advertise-addr `.
Rerun `./bin/init.sh -c local -a cpu` + From ce55f451f561f78c093f4ebf9250658fe8db6534 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 27 Feb 2021 19:28:58 +0100 Subject: [PATCH 210/428] Moving to 3.1.1 --- defaults/dependencies.json | 6 +++--- utils/Dockerfile.sagemaker-gpu | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index deea5ffe..51e3ea2d 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,8 +1,8 @@ { "master_version": "3.1", "containers": { - "rl_coach": "3.1.0", - "robomaker": "3.1.0", - "sagemaker": "3.1.0" + "rl_coach": "3.1.1", + "robomaker": "3.1.1", + "sagemaker": "3.1.1" } } diff --git a/utils/Dockerfile.sagemaker-gpu b/utils/Dockerfile.sagemaker-gpu index 44c596ab..8a42d33b 100644 --- a/utils/Dockerfile.sagemaker-gpu +++ b/utils/Dockerfile.sagemaker-gpu @@ -1,2 +1,2 @@ -FROM awsdeepracercommunity/deepracer-sagemaker:3.1.0-gpu +FROM awsdeepracercommunity/deepracer-sagemaker:3.1.1-gpu-nv ENV CUDA_VISIBLE_DEVICES=1 From 038c0777485bca5a1c8e34465db8c1293c19a068 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Tue, 2 Mar 2021 17:09:23 +0100 Subject: [PATCH 211/428] Fix dr-url-loganalysis --- bin/scripts_wrapper.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index a9b65e8b..75629661 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -285,7 +285,7 @@ function dr-logs-loganalysis { function dr-url-loganalysis { eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') if [ -n "$LOG_ANALYSIS_ID" ]; then - docker exec "$LOG_ANALYSIS_ID" bash -c "jupyter notebook list" + docker exec "$LOG_ANALYSIS_ID" bash -c "jupyter server list" else echo "Log-analysis is not running." 
fi From df38117a585efe2ead5a8cc48ef938d9bbd5c5e3 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Wed, 3 Mar 2021 16:09:48 +0100 Subject: [PATCH 212/428] Upgrade to Robomaker 3.1.2 --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 51e3ea2d..6be94b9a 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "3.1", "containers": { "rl_coach": "3.1.1", - "robomaker": "3.1.1", + "robomaker": "3.1.2", "sagemaker": "3.1.1" } } From ab90b185dbcd83351a3fc46a3a04e24df499f527 Mon Sep 17 00:00:00 2001 From: Jerry Gamblin Date: Wed, 3 Mar 2021 11:54:27 -0600 Subject: [PATCH 213/428] Update Installation.md (#57) * Update installation.md * Update reference.md Co-authored-by: Jerry Gamblin <@jgamblin> --- docs/installation.md | 89 ++++++++++++++++++++++++-------------------- docs/reference.md | 8 ++-- 2 files changed, 52 insertions(+), 45 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 3461429d..9f85f6d7 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -5,41 +5,48 @@ Depending on your needs as well as specific needs of the cloud platform you can configure your VM to your liking. Both CPU-only as well as GPU systems are supported. **AWS**: + * EC2 instance of type G3, G4, P2 or P3 - recommendation is g4dn.2xlarge - for GPU enabled training. C5 or M6 types - recommendation is c5.2xlarge - for CPU training. - * Ubuntu 18.04 - * Minimum 30 GB, preferred 40 GB of OS disk. - * Ephemeral Drive connected - * Minimum of 8 GB GPU-RAM if running with GPU. - * Recommended at least 6 VCPUs + * Ubuntu 18.04 + * Minimum 30 GB, preferred 40 GB of OS disk. + * Ephemeral Drive connected + * Minimum of 8 GB GPU-RAM if running with GPU. + * Recommended at least 6 VCPUs * S3 bucket. Preferrably in same region as EC2 instance. 
**Azure**: + * N-Series VM that comes with NVIDIA Graphics Adapter - recommendation is NC6_Standard - * Ubuntu 18.04 - * Standard 30 GB OS drive is sufficient to get started. - * Recommended to add an additional 32 GB data disk if you want to use the Log Analysis container. - * Minimum 8 GB GPU-RAM - * Recommended at least 6 VCPUs + * Ubuntu 18.04 + * Standard 30 GB OS drive is sufficient to get started. + * Recommended to add an additional 32 GB data disk if you want to use the Log Analysis container. + * Minimum 8 GB GPU-RAM + * Recommended at least 6 VCPUs * Storage Account with one Blob container configured for Access Key authentication. **Local**: + * A modern, comparatively powerful, Intel based system. - * Ubuntu 18.04 or 20.04, other Linux-dristros likely to work. - * 4 core-CPU, equivalent to 8 vCPUs; the more the better. - * NVIDIA Graphics adapter with minimum 8 GB RAM for Sagemaker to run GPU. Robomaker enabled GPU instances need ~1 GB each. - * System RAM + GPU RAM should be at least 32 GB. + * Ubuntu 18.04 or 20.04, other Linux-dristros likely to work. + * 4 core-CPU, equivalent to 8 vCPUs; the more the better. + * NVIDIA Graphics adapter with minimum 8 GB RAM for Sagemaker to run GPU. Robomaker enabled GPU instances need ~1 GB each. + * System RAM + GPU RAM should be at least 32 GB. * Running DRfC Ubuntu 20.04 on Windows using Windows Subsystem for Linux 2 is possible. See [Installing on Windows](windows.md) ## Installation The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine. - git clone https://github.com/aws-deepracer-community/deepracer-for-cloud.git - +```shell +git clone https://github.com/aws-deepracer-community/deepracer-for-cloud.git +``` + **For cloud setup** execute: - - cd deepracer-for-cloud && ./bin/prepare.sh - + +```shell +cd deepracer-for-cloud && ./bin/prepare.sh +``` + This will prepare the VM by partitioning additional drives as well as installing all prerequisites. 
After a reboot it will continuee to run `./bin/init.sh` setting up the full repository and downloading the core Docker images. Depending on your environment this may take up to 30 minutes. The scripts will create a file `DONE` once completed. The installation script will adapt `.profile` to ensure that all settings are applied on login. Otherwise run the activation with `source bin/activate.sh`. @@ -68,6 +75,7 @@ In AWS it is possible to set up authentication to S3 in two ways: Integrated sig #### IAM Role To use IAM Roles: + * An empty S3 bucket in the same region as the EC2 instance. * An IAM Role that has permissions to: * Access both the *new* S3 bucket as well as the DeepRacer bucket. @@ -75,24 +83,24 @@ To use IAM Roles: * AmazonKinesisVideoStreamsFullAccess if you want to stream to Kinesis * CloudWatch * An EC2 instance with the defined IAM Role assigned. -* Configure `run.env` as follows: +* Configure `system.env` as follows: * `DR_LOCAL_S3_PROFILE=default` * `DR_LOCAL_S3_BUCKET=` -* Configure `system.env` as follows: * `DR_UPLOAD_S3_PROFILE=default` * `DR_UPLOAD_S3_BUCKET=` * Run `dr-update` for configuration to take effect. #### Manual setup + For access with IAM user: + * An empty S3 bucket in the same region as the EC2 instance. * A real AWS IAM user set up with access keys: * User should have permissions to access the *new* bucket as well as the dedicated DeepRacer S3 bucket. - * Use `aws configure` to configure this into the default profile. -* Configure `run.env` as follows: + * Use `aws configure` to configure this into the default profile. +* Configure `system.env` as follows: * `DR_LOCAL_S3_PROFILE=default` * `DR_LOCAL_S3_BUCKET=` -* Configure `system.env` as follows: * `DR_UPLOAD_S3_PROFILE=default` * `DR_UPLOAD_S3_BUCKET=` * Run `dr-update` for configuration to take effect. 
@@ -100,17 +108,17 @@ For access with IAM user: ### Azure In Azure mode the script-set requires the following: + * A storage account with a blob container set up with access keys: - * Use `aws configure --profile ` to configure this into a specific profile. - * `` can be defined by the user, but do not use `default`. - * Access Key ID is the Storage Account name. - * Secret Access Key is the Access Key for the Storage Account. - * The blob container is equivalent to the S3 bucket. + * Use `aws configure --profile ` to configure this into a specific profile. + * `` can be defined by the user, but do not use `default`. + * Access Key ID is the Storage Account name. + * Secret Access Key is the Access Key for the Storage Account. + * The blob container is equivalent to the S3 bucket. * A real AWS IAM user configured with `aws configure` to enable upload of models into AWS DeepRacer. -* Configure `run.env` as follows: - * `DR_LOCAL_S3_PROFILE=` - * `DR_LOCAL_S3_BUCKET=` * Configure `system.env` as follows: + * `DR_LOCAL_S3_PROFILE=default` + * `DR_LOCAL_S3_BUCKET=` * `DR_UPLOAD_S3_PROFILE=default` * `DR_UPLOAD_S3_BUCKET=` * Run `dr-update` for configuration to take effect. @@ -124,24 +132,24 @@ If you want to use awscli (`aws`) to manually move files then use `aws $DR_LOCAL Local mode runs a minio server that hosts the data in the `docker/volumes` directory. It is otherwise command-compatible with the Azure setup; as the data is accessible via Minio and not via native S3. In Local mode the script-set requires the following: + * Configure the Minio credentials with `aws configure --profile minio`. The default configuration will use the `minio` profile to configure MINIO. You can choose any username or password, but username needs to be at least length 3, and password at least length 8. * A real AWS IAM user configured with `aws configure` to enable upload of models into AWS DeepRacer. 
-* Configure `run.env` as follows: - * `DR_LOCAL_S3_PROFILE=minio` - * `DR_LOCAL_S3_BUCKET=bucket` * Configure `system.env` as follows: + * `DR_LOCAL_S3_PROFILE=default` + * `DR_LOCAL_S3_BUCKET=` * `DR_UPLOAD_S3_PROFILE=default` * `DR_UPLOAD_S3_BUCKET=` * Run `dr-update` for configuration to take effect. ## First Run -For the first run the following final steps are needed. This creates a training run with all default values in +For the first run the following final steps are needed. This creates a training run with all default values in * Define your custom files in `custom_files/` - samples can be found in `defaults` which you must copy over: - * `hyperparameters.json` - definining the training hyperparameters - * `model_metadata.json` - defining the action space and sensors - * `reward_function.py` - defining the reward function + * `hyperparameters.json` - definining the training hyperparameters + * `model_metadata.json` - defining the action space and sensors + * `reward_function.py` - defining the reward function * Upload the files into the bucket with `dr-upload-custom-files`. This will also start minio if required. * Start training with `dr-start-training` @@ -157,5 +165,4 @@ Here are some hints for troubleshooting specific issues you may encounter |------------- | ---------------------| Get messages like "Sagemaker is not running" | Run `docker -ps a` to see if the containers are running or if they stopped due to some errors Check docker errors for specific container | Run `docker logs -f ` -Get message "Error response from daemon: could not choose an IP address to advertise since this system has multiple addresses on interface ..." when running `./bin/init.sh -c local -a cpu` | It means you have multiple IP addresses and you need to specify one within `./bin/init.sh`.
If you don't care which one to use, you can get the first one by running ifconfig \| grep $(route \| awk '/^default/ {print $8}') -a1 \| grep -o -P '(?<=inet ).*(?= netmask).
Edit `./bin/init.sh` and locate line `docker swarm init` and change it to `docker swarm init --advertise-addr `.
Rerun `./bin/init.sh -c local -a cpu` - +Get message "Error response from daemon: could not choose an IP address to advertise since this system has multiple addresses on interface ..." when running `./bin/init.sh -c local -a cpu` | It means you have multiple IP addresses and you need to specify one within `./bin/init.sh`.
If you don't care which one to use, you can get the first one by running ```ifconfig \| grep $(route \| awk '/^default/ {print $8}') -a1 \| grep -o -P '(?<=inet ).*(?= netmask)```.
Edit `./bin/init.sh` and locate line `docker swarm init` and change it to `docker swarm init --advertise-addr `.
Rerun `./bin/init.sh -c local -a cpu` diff --git a/docs/reference.md b/docs/reference.md index ce72689c..cbeebe4c 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1,6 +1,7 @@ # Deepracer-for-Cloud Reference ## Environment Variables + The scripts assume that two files `system.env` containing constant configuration values and `run.env` with run specific values is populated with the required values. Which values go into which file is not really important. | Variable | Description | @@ -66,12 +67,11 @@ The scripts assume that two files `system.env` containing constant configuration | `DR_WEBVIEWER_PORT` | Port for the web-viewer proxy which enables the streaming of all robomaker workers at once.| | `CUDA_VISIBLE_DEVICES` | Used in multi-GPU configurations. See additional documentation for more information about this feature.| - ## Commands | Command | Description | |---------|-------------| -| `dr-update` | Loads in all scripts and environment variables again.| +| `dr-update` | Loads in all scripts and environment variables again.| | `dr-update-env` | Loads in all environment variables from `system.env` and `run.env`.| | `dr-upload-custom-files` | Uploads changed configuration files from `custom_files/` into `s3://{DR_LOCAL_S3_BUCKET}/custom_files`.| | `dr-download-custom-files` | Downloads changed configuration files from `s3://{DR_LOCAL_S3_BUCKET}/custom_files` into `custom_files/`.| @@ -81,8 +81,8 @@ The scripts assume that two files `system.env` containing constant configuration | `dr-start-evaluation` | Starts a evaluation session in the local VM based on current configuration.| | `dr-stop-evaluation` | Stops the current local evaluation session. 
Uploads log files.| | `dr-start-loganalysis` | Starts a Jupyter log-analysis container, available on port 8888.| -| `dr-start-loganalysis` | Stops the Jupyter log-analysis container.| -| `dr-start-viewer` | Starts an NGINX proxy to stream all the robomaker streams; accessible from remote.| +| `dr-stop-loganalysis` | Stops the Jupyter log-analysis container.| +| `dr-start-viewer` | Starts an NGINX proxy to stream all the robomaker streams; accessible remotly.| | `dr-stop-viewer` | Stops the NGINX proxy.| | `dr-logs-sagemaker` | Displays the logs from the running Sagemaker container.| | `dr-logs-robomaker` | Displays the logs from the running Robomaker container.| From ec0c4f34513c904c665c32a224903d922933a409 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Wed, 3 Mar 2021 20:06:24 +0100 Subject: [PATCH 214/428] Extend upload with model import into DeepRacer console (#55) * Initial import model script * Add installation instructions. * Adding documentation and error handling in script. 
--- bin/init.sh | 2 +- defaults/template-system.env | 1 + docs/upload.md | 12 ++++++- scripts/upload/import-model.py | 58 ++++++++++++++++++++++++++++++++++ scripts/upload/upload-model.sh | 14 +++++++- 5 files changed, 84 insertions(+), 3 deletions(-) create mode 100755 scripts/upload/import-model.py diff --git a/bin/init.sh b/bin/init.sh index c7f5ecfc..5de31b6e 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -105,7 +105,7 @@ else sed -i "s//not-defined/g" $INSTALL_DIR/system.env echo "Please run 'aws configure --profile minio' to set the credentials" fi - +sed -i "s//to-be-defined/g" $INSTALL_DIR/system.env sed -i "s//$OPT_CLOUD/g" $INSTALL_DIR/system.env sed -i "s//$AWS_REGION/g" $INSTALL_DIR/system.env diff --git a/defaults/template-system.env b/defaults/template-system.env index 2e121b7a..85e38f05 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -2,6 +2,7 @@ DR_CLOUD= DR_AWS_APP_REGION= DR_UPLOAD_S3_PROFILE=default DR_UPLOAD_S3_BUCKET= +DR_UPLOAD_S3_ROLE= DR_LOCAL_S3_BUCKET=bucket DR_LOCAL_S3_PROFILE= DR_GUI_ENABLE=False diff --git a/docs/upload.md b/docs/upload.md index 62730710..1d533bb1 100644 --- a/docs/upload.md +++ b/docs/upload.md @@ -30,11 +30,21 @@ There are several useful switches to the upload command: * d - dry-Run mode, does not perform any write or delete operatios on target * b - uploads best checkpoint instead of default which is last checkpoint * p prefix - uploads model into specified S3 prefix + * i - imports model using the prefix as the model name + * I name - import model with a specific model name" + +### Import +If you want to use the import switches (`-i` or `-I`) there are a few pre-requisites. + +* Python packages to be installed with `pip install`: + * pandas + * deepracer-utils +* Install boto3 service `deepracer` with `python -m deepracer install-cli --force`. +* Create an IAM Role which the Deepracer service can use to access S3. Declare the ARN in `DR_UPLOAD_S3_ROLE` in `system.env`. 
### Managing your models You should decide how you're going to manage your models. Upload to AWS does not preserve all the files created locally so if you delete your local files you will find it hard to go back to a previous model and resume training. - ### Create file formatted for physical car, and upload to S3 You can also create the file in the format necessary to run on the physical car directly from DRfC, without going through the AWS console. This is executed by running 'dr-upload-car-zip'; it will copy files out of the running sagemaker container, format them into the proper .tar.gz file, and upload that file to `s3://DR_LOCAL_S3_BUCKET/DR_LOCAL_S3_PREFIX`. One of the limitations of this approach is that it only uses the latest checkpoint, and does not have the option to use the "best" checkpoint, or an earlier checkpoint. Another limitation is that the sagemaker container must be running at the time this command is executed. diff --git a/scripts/upload/import-model.py b/scripts/upload/import-model.py new file mode 100755 index 00000000..9b3e038a --- /dev/null +++ b/scripts/upload/import-model.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 + +import boto3 +import sys +import os +import time +import json +import io +import yaml +from botocore.loaders import UnknownServiceError + +try: + import pandas as pd + import deepracer +except ImportError: + print("You need to install pandas and deepracer-utils to use the import function.") + exit(1) + +# Read in command +aws_profile = sys.argv[1] +aws_s3_role = sys.argv[2] +aws_s3_bucket = sys.argv[3] +aws_s3_prefix = sys.argv[4] +dr_model_name = sys.argv[5] + +if not aws_s3_role: + print("You must configure an IAM role with access to the S3 bucket in variable DR_UPLOAD_S3_ROLE ") + exit(1) + +session = boto3.session.Session(region_name='us-east-1', profile_name=aws_profile) + +try: + dr = session.client('deepracer') +except UnknownServiceError: + print ("Boto3 service 'deepracer' is not installed. 
Cannot import model.") + print ("Install with 'pip install deepracer-utils' and 'python -m deepracer install-cli --force'") + exit(1) + +# Load model to check if it already exists +a = dr.list_models(ModelType='REINFORCEMENT_LEARNING', MaxResults=25) +model_dict = a['Models'] +while "NextToken" in a: + a = dr.list_models(ModelType='REINFORCEMENT_LEARNING', MaxResults=25, NextToken=a["NextToken"]) + model_dict.extend(a['Models']) + +models = pd.DataFrame.from_dict(model_dict) + +if models[models['ModelName']==dr_model_name].size > 0: + sys.exit('Model {} already exists.'.format(dr_model_name)) + +# Import from S3 +print('Importing from s3://{}/{}'.format(aws_s3_bucket,aws_s3_prefix)) +response = dr.import_model(Name=dr_model_name, ModelArtifactsS3Path='s3://{}/{}'.format(aws_s3_bucket,aws_s3_prefix), RoleArn=aws_s3_role, Type='REINFORCEMENT_LEARNING') + +if response['ResponseMetadata']['HTTPStatusCode'] == 200: + print('Model importing as {}'.format(response['ModelArn'])) +else: + sys.exit('Error occcured when uploading') \ No newline at end of file diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index dbd36d72..4b3b4d3a 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -7,6 +7,8 @@ usage(){ echo " -d Dry-Run mode. Does not perform any write or delete operatios on target." echo " -b Uploads best checkpoint. Default is last checkpoint." echo " -p model Uploads model in specified S3 prefix." + echo " -i Import model with the upload name" + echo " -I name Import model with a specific name" exit 1 } @@ -17,7 +19,7 @@ function ctrl_c() { exit 1 } -while getopts ":fwdhbp:c:" opt; do +while getopts ":fwdhbp:c:iI:" opt; do case $opt in b) OPT_CHECKPOINT="Best" ;; @@ -31,6 +33,10 @@ p) OPT_PREFIX="$OPTARG" ;; w) OPT_WIPE="--delete" ;; +i) OPT_IMPORT="$DR_UPLOAD_S3_PREFIX" +;; +I) OPT_IMPORT="$OPTARG" +;; h) usage ;; \?) 
echo "Invalid option -$OPTARG" >&2 @@ -171,3 +177,9 @@ aws ${DR_UPLOAD_PROFILE} s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} ${OPT aws ${DR_UPLOAD_PROFILE} s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} aws ${DR_UPLOAD_PROFILE} s3 cp ${PARAMS_FILE} ${TARGET_PARAMS_FILE_S3_KEY} ${OPT_DRYRUN} aws ${DR_UPLOAD_PROFILE} s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} ${OPT_DRYRUN} + +# After upload trigger the import +if [[ -n "${OPT_IMPORT}" ]]; +then + $DR_DIR/scripts/upload/import-model.py "${DR_UPLOAD_S3_PROFILE}" "${DR_UPLOAD_S3_ROLE}" "${TARGET_S3_BUCKET}" "${TARGET_S3_PREFIX}" "${OPT_IMPORT}" +fi \ No newline at end of file From 7b9b5b0860ad54d08bbe39058d35584708ea97d4 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 4 Mar 2021 19:01:42 +0000 Subject: [PATCH 215/428] Add more documentation --- utils/start-local-browser.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/start-local-browser.sh b/utils/start-local-browser.sh index 7eaf8f33..5a8800d6 100755 --- a/utils/start-local-browser.sh +++ b/utils/start-local-browser.sh @@ -47,7 +47,7 @@ FILE=$DR_DIR/tmp/streams-$DR_RUN_ID.html # Check if we will use Docker Swarm or Docker Compose if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then - echo "This script does not support swarm mode." + echo "This script does not support swarm mode. Use `dr-start-viewer`." exit fi From 646af6e6416ffe3c0eabd711caa9f7164a1426ac Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Mon, 8 Mar 2021 10:46:40 +0100 Subject: [PATCH 216/428] Fixing bug #58 --- docs/multi_gpu.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index 70f0518b..96bd482c 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -56,6 +56,6 @@ Sagemaker is more critical to place, but also more complicated, as you will have A template is in `utils/Dockerfile.sagemaker-gpu`. 
Again the number is the applicable CUDA number. -Build the image with `docker build -t awsdeepracercommunity/deepracer-sagemaker:gpu-x -f utils/Dockerfile.sagemaker-gpu` with x being anything you like. +Build the image with `docker build -t awsdeepracercommunity/deepracer-sagemaker:gpu-x -f utils/Dockerfile.sagemaker-gpu .` with x being anything you like. -Update `system.env` to use the new image. \ No newline at end of file +Update `system.env` to use the new image. From a787a48a48ae6a54e0da90df781465978e25ed34 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 8 Apr 2021 08:17:53 +0200 Subject: [PATCH 217/428] Script that uses API to submit models and monitor results (#59) * Initial submission script * Additional debugging * Updated and tested * Ensure directory exists * Black formatting * Using the deepracer_client wrapper * Optimizing API calls --- utils/submit-monitor.py | 297 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 297 insertions(+) create mode 100755 utils/submit-monitor.py diff --git a/utils/submit-monitor.py b/utils/submit-monitor.py new file mode 100755 index 00000000..633f8ba7 --- /dev/null +++ b/utils/submit-monitor.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 + +import sys +import getopt +import os +import pickle +import urllib.request + +import boto3 + +try: + import pandas as pd + from deepracer import boto3_enhancer +except ImportError: + print("You need to install pandas and deepracer-utils to use this utility.") + sys.exit(1) + +dr = None + +def main(): + + # Parse Arguments + try: + opts, _ = getopt.getopt( + sys.argv[1:], + "lvsghm:b:", + ["logs", "verbose", "summary", "graphics", "help", "model=", "board="], + ) + except getopt.GetoptError as err: + # print help information and exit: + print(err) # will print something like "option -a not recognized" + usage() + sys.exit(2) + + logs_path = "{}/data/logs/leaderboards".format(os.environ.get("DR_DIR", None)) + + 
download_logs = False + download_videos = False + verbose = False + create_summary = False + model_name = None + leaderboard_guid = None + + for opt, arg in opts: + if opt in ("-l", "--logs"): + download_logs = True + elif opt in ("-g", "--graphics"): + download_videos = True + elif opt in ("-v", "--verbose"): + verbose = True + elif opt in ("-s", "--summary"): + create_summary = True + elif opt in ("-m", "--model"): + model_name = arg.strip() + elif opt in ("-b", "--board"): + leaderboard_guid = arg.strip() + elif opt in ("-h", "--help"): + usage() + sys.exit() + + # Prepare Boto3 + session = boto3.session.Session( + region_name="us-east-1", + profile_name=os.environ.get("DR_UPLOAD_S3_PROFILE", None), + ) + + global dr + dr = boto3_enhancer.deepracer_client(session=session) + + # Find the ARN for my model + my_model = find_model(model_name) + + if my_model is not None: + my_model_arn = my_model["ModelArn"].values[0] + if verbose: + print("Found ModelARN for model {}: {}".format(model_name, my_model_arn)) + else: + print("Did not find model with name {}".format(model_name)) + sys.exit(1) + + + # Find the leaderboard + leaderboard_arn = find_leaderboard(leaderboard_guid) + + if leaderboard_arn is not None: + if verbose: + print("Found Leaderboard with ARN {}".format(leaderboard_arn)) + else: + print("Did not find Leaderboard with ARN {}".format(leaderboard_arn)) + sys.exit(1) + + # Load summary from file if we are interested in it! 
+ if create_summary: + + pkl_f = "{}/{}/summary.pkl".format(logs_path, leaderboard_guid) + if os.path.isfile(pkl_f): + infile = open(pkl_f, "rb") + my_submissions = pickle.load(infile) + infile.close() + else: + my_submissions = {} + my_submissions["LeaderboardSubmissions"] = [] + + dir_path = os.path.dirname(pkl_f) + os.makedirs(dir_path, exist_ok=True) + + # Collect data about latest submission + submission_response = dr.get_latest_user_submission(LeaderboardArn=leaderboard_arn) + latest_submission = submission_response["LeaderboardSubmission"] + if latest_submission: + jobid = latest_submission["ActivityArn"].split("/", 1)[1] + print( + "Job {} has status {}".format( + jobid, latest_submission["LeaderboardSubmissionStatusType"] + ) + ) + + if latest_submission["LeaderboardSubmissionStatusType"] == "SUCCESS": + if download_logs: + download_file( + "{}/{}/robomaker-{}-{}.log".format( + logs_path, + leaderboard_guid, + latest_submission["SubmissionTime"], + jobid, + ), + dr.get_asset_url( + Arn=latest_submission["ActivityArn"], + AssetType="ROBOMAKER_CLOUDWATCH_LOG", + )["Url"], + ) + if download_videos: + download_file( + "{}/{}/video-{}-{}.mp4".format( + logs_path, + leaderboard_guid, + latest_submission["SubmissionTime"], + jobid, + ), + latest_submission["SubmissionVideoS3path"], + ) + + # Submit again + _ = dr.create_leaderboard_submission( + ModelArn=my_model_arn, LeaderboardArn=leaderboard_arn + ) + print("Submitted {} to {}.".format(model_name, leaderboard_arn)) + + elif latest_submission["LeaderboardSubmissionStatusType"] == "ERROR": + print("Error in previous submission") + if download_logs: + download_file( + "{}/{}/robomaker-{}.log".format( + logs_path, leaderboard_guid, latest_submission["SubmissionTime"] + ), + dr.get_asset_url( + Arn=latest_submission["ActivityArn"], + AssetType="ROBOMAKER_CLOUDWATCH_LOG", + )["Url"], + ) + + # Submit again + _ = dr.create_leaderboard_submission( + ModelArn=my_model_arn, LeaderboardArn=leaderboard_arn + ) + 
print("Submitted {} to {}.".format(model_name, leaderboard_arn)) + + # Maintain our summary + if create_summary: + for idx, i in enumerate(my_submissions["LeaderboardSubmissions"]): + if i["SubmissionTime"] == latest_submission["SubmissionTime"]: + del my_submissions["LeaderboardSubmissions"][idx] + my_submissions["LeaderboardSubmissions"].append(latest_submission) + + # Save summary + outfile = open(pkl_f, "wb") + pickle.dump(my_submissions, outfile) + outfile.close() + + # Display summary + if verbose: + display_submissions(my_submissions) + + +def download_file(f_name, url): + + dir_path = os.path.dirname(f_name) + os.makedirs(dir_path, exist_ok=True) + if not os.path.isfile(f_name): + print("Downloading {}".format(os.path.basename(f_name))) + urllib.request.urlretrieve(url, f_name) + +def find_model(model_name): + + m_response = dr.list_models(ModelType="REINFORCEMENT_LEARNING", MaxResults=25) + model_dict = m_response["Models"] + models = pd.DataFrame.from_dict(model_dict) + my_model = models[models["ModelName"] == model_name] + + if my_model.size > 0: + return my_model + + while "NextToken" in m_response: + m_response = dr.list_models( + ModelType="REINFORCEMENT_LEARNING", + MaxResults=50, + NextToken=m_response["NextToken"], + ) + model_dict = m_response["Models"] + + models = pd.DataFrame.from_dict(model_dict) + my_model = models[models["ModelName"] == model_name] + if my_model.size > 0: + return my_model + + return None + +def find_leaderboard(leaderboard_guid): + leaderboard_arn = "arn:aws:deepracer:::leaderboard/{}".format(leaderboard_guid) + + l_response = dr.list_leaderboards(MaxResults=25) + lboards_dict = l_response["Leaderboards"] + leaderboards = pd.DataFrame.from_dict(l_response["Leaderboards"]) + if leaderboards[leaderboards["Arn"] == leaderboard_arn].size > 0: + return leaderboard_arn + + while "NextToken" in l_response: + l_response = dr.list_leaderboards( + MaxResults=50, NextToken=l_response["NextToken"] + ) + lboards_dict = 
l_response["Leaderboards"] + + leaderboards = pd.DataFrame.from_dict(lboards_dict) + if leaderboards[leaderboards["Arn"] == leaderboard_arn].size > 0: + return leaderboard_arn + + return None + +def display_submissions(submissions_dict): + # Display status + my_columns = [ + "SubmissionTime", + "TotalLapTime", + "BestLapTime", + "ResetCount", + "CollisionCount", + "OffTrackCount", + "Model", + "JobId", + "Status", + ] + my_submissions_df = pd.DataFrame.from_dict( + submissions_dict["LeaderboardSubmissions"] + ) + my_submissions_df["SubmissionTime"] = ( + my_submissions_df["SubmissionTime"] + .values.astype(dtype="datetime64[ms]") + .astype(dtype="datetime64[s]") + ) + my_submissions_df["TotalLapTime"] = my_submissions_df["TotalLapTime"].values.astype( + dtype="datetime64[ms]" + ) + my_submissions_df["TotalLapTime"] = ( + my_submissions_df["TotalLapTime"].dt.strftime("%M:%S.%f").str[:-4] + ) + my_submissions_df["BestLapTime"] = my_submissions_df["BestLapTime"].values.astype( + dtype="datetime64[ms]" + ) + my_submissions_df["BestLapTime"] = ( + my_submissions_df["BestLapTime"].dt.strftime("%M:%S.%f").str[:-4] + ) + my_submissions_df["JobId"] = my_submissions_df["ActivityArn"].str.split("/").str[1] + my_submissions_df["Status"] = my_submissions_df["LeaderboardSubmissionStatusType"] + my_submissions_df[[None, None, "Model"]] = my_submissions_df.ModelArn.str.split( + "/", expand=True, + ) + + # Display + print("") + print(my_submissions_df[my_columns]) + + +def usage(): + print( + "Usage: submit-monitor.py [-v] [-s] [-l] [-g] -m -b " + ) + print(" -v Verbose output.") + print(" -s Store a summary of all submissions.") + print(" -l Download robomaker logfiles.") + print(" -g Download video recordings.") + print(" -m Display name of the model to submit.") + print(" -b GUID (not ARN) of the leaderboard to submit to.") + sys.exit(1) + + +if __name__ == "__main__": + main() From 471d6401da45399b85afbf3fa788a2d73ca9ae31 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen 
Date: Sun, 25 Apr 2021 09:01:56 +0000 Subject: [PATCH 218/428] Add Eval Reward functiond debug parameter. --- defaults/template-run.env | 1 + docker/docker-compose-eval.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/defaults/template-run.env b/defaults/template-run.env index 98e6ebb8..def7da0b 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -17,6 +17,7 @@ DR_EVAL_OPP_CAR_BODY_SHELL_TYPE=deepracer DR_EVAL_OPP_CAR_NAME=FasterCar DR_EVAL_OPP_DISPLAY_NAME=$DR_EVAL_OPP_CAR_NAME DR_EVAL_OPP_RACER_NAME=$DR_EVAL_OPP_CAR_NAME +DR_EVAL_DEBUG_REWARD=False #DR_EVAL_RTF=1.0 DR_TRAIN_CHANGE_START_POSITION=True DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False diff --git a/docker/docker-compose-eval.yml b/docker/docker-compose-eval.yml index 7967ce21..853fcb17 100644 --- a/docker/docker-compose-eval.yml +++ b/docker/docker-compose-eval.yml @@ -17,6 +17,7 @@ services: environment: - DISPLAY_N=:0 - CUDA_VISIBLE_DEVICES + - DEBUG_REWARD=${DR_EVAL_DEBUG_REWARD} - WORLD_NAME=${DR_WORLD_NAME} - NUMBER_OF_TRIALS=${DR_NUMBER_OF_EPISODES} - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} From bd0662c4ba31b4ec7db03fa6253daa68d2cd8c70 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 25 Apr 2021 09:04:08 +0000 Subject: [PATCH 219/428] Move to version 4.0.2-dev of Robomaker --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index deea5ffe..df13c5b5 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "3.1", "containers": { "rl_coach": "3.1.0", - "robomaker": "3.1.0", + "robomaker": "4.0.2-dev", "sagemaker": "3.1.0" } } From 2f129180db59e0e322444bfa498c0c9af7346b83 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 25 Apr 2021 09:45:29 +0000 Subject: [PATCH 220/428] Error in submit-monitor when logs are unavailable --- utils/submit-monitor.py | 56 
++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/utils/submit-monitor.py b/utils/submit-monitor.py index 633f8ba7..4b7df594 100755 --- a/utils/submit-monitor.py +++ b/utils/submit-monitor.py @@ -7,6 +7,7 @@ import urllib.request import boto3 +from botocore.exceptions import ClientError try: import pandas as pd @@ -17,6 +18,7 @@ dr = None + def main(): # Parse Arguments @@ -78,7 +80,6 @@ def main(): print("Did not find model with name {}".format(model_name)) sys.exit(1) - # Find the leaderboard leaderboard_arn = find_leaderboard(leaderboard_guid) @@ -117,18 +118,23 @@ def main(): if latest_submission["LeaderboardSubmissionStatusType"] == "SUCCESS": if download_logs: - download_file( - "{}/{}/robomaker-{}-{}.log".format( - logs_path, - leaderboard_guid, - latest_submission["SubmissionTime"], - jobid, - ), - dr.get_asset_url( + try: + f_url = dr.get_asset_url( Arn=latest_submission["ActivityArn"], AssetType="ROBOMAKER_CLOUDWATCH_LOG", - )["Url"], - ) + )["Url"] + download_file( + "{}/{}/robomaker-{}-{}.log".format( + logs_path, + leaderboard_guid, + latest_submission["SubmissionTime"], + jobid, + ), + f_url, + ) + except ClientError: + print(("WARNING: Logfile for job {} not available.").format(jobid)) + if download_videos: download_file( "{}/{}/video-{}-{}.mp4".format( @@ -149,15 +155,22 @@ def main(): elif latest_submission["LeaderboardSubmissionStatusType"] == "ERROR": print("Error in previous submission") if download_logs: - download_file( - "{}/{}/robomaker-{}.log".format( - logs_path, leaderboard_guid, latest_submission["SubmissionTime"] - ), - dr.get_asset_url( + try: + f_url = dr.get_asset_url( Arn=latest_submission["ActivityArn"], AssetType="ROBOMAKER_CLOUDWATCH_LOG", - )["Url"], - ) + )["Url"] + download_file( + "{}/{}/robomaker-{}-{}.log".format( + logs_path, + leaderboard_guid, + latest_submission["SubmissionTime"], + jobid, + ), + f_url, + ) + except ClientError: + print(("WARNING: Logfile for job {} 
not available.").format(jobid)) # Submit again _ = dr.create_leaderboard_submission( @@ -190,16 +203,17 @@ def download_file(f_name, url): print("Downloading {}".format(os.path.basename(f_name))) urllib.request.urlretrieve(url, f_name) + def find_model(model_name): m_response = dr.list_models(ModelType="REINFORCEMENT_LEARNING", MaxResults=25) model_dict = m_response["Models"] models = pd.DataFrame.from_dict(model_dict) my_model = models[models["ModelName"] == model_name] - + if my_model.size > 0: return my_model - + while "NextToken" in m_response: m_response = dr.list_models( ModelType="REINFORCEMENT_LEARNING", @@ -215,6 +229,7 @@ def find_model(model_name): return None + def find_leaderboard(leaderboard_guid): leaderboard_arn = "arn:aws:deepracer:::leaderboard/{}".format(leaderboard_guid) @@ -236,6 +251,7 @@ def find_leaderboard(leaderboard_guid): return None + def display_submissions(submissions_dict): # Display status my_columns = [ From dd54d0e2b13cc531eedc6f05660812db79e34ee9 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Tue, 27 Apr 2021 20:52:10 +0200 Subject: [PATCH 221/428] Improving local virtual X support (#62) * Improving local virtual X support * Updated documentation --- .gitignore | 1 + defaults/template-system.env | 1 + docker/docker-compose-eval.yml | 1 - docker/docker-compose-training.yml | 1 - docs/opengl.md | 9 ++++++--- scripts/evaluation/start.sh | 28 ++++++++++++++++++++++++++-- scripts/training/start.sh | 21 +++++++++++++++++++-- utils/setup-xorg.sh | 4 ++-- utils/start-xorg.sh | 19 ++++++++++++++----- 9 files changed, 69 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 8b122011..41e3337d 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ data/ tmp/ autorun.s3url nohup.out +start.sh \ No newline at end of file diff --git a/defaults/template-system.env b/defaults/template-system.env index 85e38f05..6586e4b8 100644 --- a/defaults/template-system.env +++ 
b/defaults/template-system.env @@ -18,5 +18,6 @@ DR_CLOUD_WATCH_ENABLE=False DR_DOCKER_STYLE=swarm DR_HOST_X=False DR_WEBVIEWER_PORT=8100 +# DR_DISPLAY=:99 # DR_REMOTE_MINIO_URL=http://mynas:9000 # CUDA_VISIBLE_DEVICES=0 \ No newline at end of file diff --git a/docker/docker-compose-eval.yml b/docker/docker-compose-eval.yml index 7967ce21..0046efdc 100644 --- a/docker/docker-compose-eval.yml +++ b/docker/docker-compose-eval.yml @@ -15,7 +15,6 @@ services: ports: - "${DR_ROBOMAKER_EVAL_PORT}:8080" environment: - - DISPLAY_N=:0 - CUDA_VISIBLE_DEVICES - WORLD_NAME=${DR_WORLD_NAME} - NUMBER_OF_TRIALS=${DR_NUMBER_OF_EPISODES} diff --git a/docker/docker-compose-training.yml b/docker/docker-compose-training.yml index e6b94a3d..40e8b041 100644 --- a/docker/docker-compose-training.yml +++ b/docker/docker-compose-training.yml @@ -28,7 +28,6 @@ services: - "${DR_ROBOMAKER_TRAIN_PORT}:8080" - "${DR_ROBOMAKER_GUI_PORT}:5900" environment: - - DISPLAY_N=:0 - WORLD_NAME=${DR_WORLD_NAME} - SAGEMAKER_SHARED_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} - SAGEMAKER_SHARED_S3_BUCKET=${DR_LOCAL_S3_BUCKET} diff --git a/docs/opengl.md b/docs/opengl.md index c2c79966..765befe4 100644 --- a/docs/opengl.md +++ b/docs/opengl.md @@ -14,6 +14,8 @@ On a Ubuntu desktop running Unity there are hardly any additional steps required Before running `dr-start-training` ensure that environment variables `DISPLAY` and `XAUTHORITY` are defined. +NOTE: Users have experienced issues to start training process from remote (SSH / RDP), as a local X session may not be running / you are not allowed to connect to it. Workaround is to start an additional X server following the steps for Headless Server. + With recent Nvidia drivers you can confirm that the setup is working by running `nvidia-smi` on the host and see that `gzserver` is listed as running on the GPU. Older drivers (e.g. 390 for NVS 315) may not support showing which processes are running on the GPU. 
## Headless Server @@ -21,13 +23,14 @@ With recent Nvidia drivers you can confirm that the setup is working by running Also a headless server with a GPU, e.g. an EC2 instance, or a local computer with a displayless GPU (e.g. Tesla K40, K80, M40). * Ensure that a Nvidia driver and nvidia-docker is installed; review `bin/prepare.sh` for steps if you do not want to directly run the script. -* Setup an X-server on the host. `utils\setup-xorg.sh` is a basic installation script. +* Setup an X-server on the host. `utils/setup-xorg.sh` is a basic installation script. * Configure DRfC using the following settings in `system.env`: * `DR_HOST_X=True`; uses the local X server rather than starting one within the docker container. * `DR_ROBOMAKER_IMAGE`; choose the tag for an OpenGL enabled image - e.g. `cpu-gl-avx` for an image where Tensorflow will use CPU or `gpu-gl` for an image where also Tensorflow will use the GPU. + * `DR_DISPLAY`; the X display that the headless X server will start on. (Default is `:99`.) -Before training ensure that the server is running, including VNC if you want to connect. Ensure that environment variables `DISPLAY` and `XAUTHORITY` are defined. +Start up the X server with `utils/start-xorg.sh`. -Basic start-up including creation of variables can be achieved with `source utils\start-xorg.sh`. +If `DR_GUI_ENABLE=True` then a VNC server will be started on port 5900 so that you can connect and interact with the Gazebo UI. With recent Nvidia drivers you can confirm that the setup is working by running `nvidia-smi` on the host and see that `gzserver` is listed as running on the GPU. 
diff --git a/scripts/evaluation/start.sh b/scripts/evaluation/start.sh index fc600a58..6b0f1ff8 100755 --- a/scripts/evaluation/start.sh +++ b/scripts/evaluation/start.sh @@ -47,12 +47,36 @@ fi echo "Creating Robomaker configuration in $S3_PATH/$DR_CURRENT_PARAMS_FILE" python3 $DR_DIR/scripts/evaluation/prepare-config.py +# Check if we are using Host X -- ensure variables are populated +if [[ "${DR_HOST_X,,}" == "true" ]]; +then + if [[ -n "$DR_DISPLAY" ]]; then + ROBO_DISPLAY=$DR_DISPLAY + else + ROBO_DISPLAY=$DISPLAY + fi + + if ! DISPLAY=$ROBO_DISPLAY timeout 1s xset q &>/dev/null; then + echo "No X Server running on display $ROBO_DISPLAY. Exiting" + exit 0 + fi + + if [[ -z "$XAUTHORITY" ]]; then + export XAUTHORITY=~/.Xauthority + if [[ ! -f "$XAUTHORITY" ]]; then + echo "No XAUTHORITY defined. .Xauthority does not exist. Stopping." + exit 0 + fi + fi +fi + + # Check if we will use Docker Swarm or Docker Compose if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; then - docker stack deploy $COMPOSE_FILES $STACK_NAME + DISPLAY=$ROBO_DISPLAY docker stack deploy $COMPOSE_FILES $STACK_NAME else - docker-compose $COMPOSE_FILES --log-level ERROR -p $STACK_NAME up -d + DISPLAY=$ROBO_DISPLAY docker-compose $COMPOSE_FILES --log-level ERROR -p $STACK_NAME up -d fi # Request to be quiet. Quitting here. diff --git a/scripts/training/start.sh b/scripts/training/start.sh index a81b691e..6b8a36eb 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -113,9 +113,25 @@ fi # Check if we are using Host X -- ensure variables are populated if [[ "${DR_HOST_X,,}" == "true" ]]; then + if [[ -n "$DR_DISPLAY" ]]; then + ROBO_DISPLAY=$DR_DISPLAY + else + ROBO_DISPLAY=$DISPLAY + fi + + if ! DISPLAY=$ROBO_DISPLAY timeout 1s xset q &>/dev/null; then + echo "No X Server running on display $ROBO_DISPLAY. Exiting" + exit 0 + fi + if [[ -z "$XAUTHORITY" ]]; then export XAUTHORITY=~/.Xauthority + if [[ ! -f "$XAUTHORITY" ]]; then + echo "No XAUTHORITY defined. 
.Xauthority does not exist. Stopping." + exit 0 + fi fi + fi # Check if we will use Docker Swarm or Docker Compose @@ -137,9 +153,10 @@ then exit 0 fi - docker stack deploy $COMPOSE_FILES $STACK_NAME + DISPLAY=$ROBO_DISPLAY docker stack deploy $COMPOSE_FILES $STACK_NAME + else - docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR up -d --scale robomaker=$DR_WORKERS + DISPLAY=$ROBO_DISPLAY docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR up -d --scale robomaker=$DR_WORKERS fi # Request to be quiet. Quitting here. diff --git a/utils/setup-xorg.sh b/utils/setup-xorg.sh index 91c146b0..101226e6 100755 --- a/utils/setup-xorg.sh +++ b/utils/setup-xorg.sh @@ -9,9 +9,9 @@ sudo apt-get install xinit xserver-xorg-legacy x11-xserver-utils x11-utils \ # Configure sudo sed -i -e "s/console/anybody/" /etc/X11/Xwrapper.config BUS_ID=$(nvidia-xconfig --query-gpu-info | grep "PCI BusID" | cut -f2- -d: | sed -e 's/^[[:space:]]*//' | head -1) -sudo nvidia-xconfig --busid=$BUS_ID --enable-all-gpus -o /etc/X11/xorg.conf +sudo nvidia-xconfig --busid=$BUS_ID -o $DR_DIR/tmp/xorg.conf -sudo tee -a /etc/X11/xorg.conf << EOF +sudo tee -a $DR_DIR/tmp/xorg.conf << EOF Section "DRI" Mode 0666 diff --git a/utils/start-xorg.sh b/utils/start-xorg.sh index 50058088..c701d233 100755 --- a/utils/start-xorg.sh +++ b/utils/start-xorg.sh @@ -1,11 +1,20 @@ #!/bin/bash -export DISPLAY=:0 +export DISPLAY=$DR_DISPLAY -nohup xinit /usr/bin/jwm -- $DISPLAY & -sleep 1 -xrandr -s 1400x900 -x11vnc -bg -forever -no6 -nopw -rfbport 5901 -rfbportv6 -1 -loop -display WAIT$DISPLAY & +nohup sudo xinit /usr/bin/jwm -- /usr/lib/xorg/Xorg $DISPLAY -config $DR_DIR/tmp/xorg.conf > $DR_DIR/tmp/xorg.log 2>&1 & sleep 1 +if [[ "${DR_GUI_ENABLE,,}" == "true" ]]; then + xrandr -s 1400x900 + x11vnc -bg -forever -no6 -nopw -rfbport 5901 -rfbportv6 -1 -loop -display WAIT$DISPLAY & + sleep 1 +fi + xauth generate $DISPLAY export XAUTHORITY=~/.Xauthority + +if timeout 1s xset q &>/dev/null; then + echo "X 
Server started on display $DISPLAY" +else + echo "Server failed to start on display $DISPLAY" +fi \ No newline at end of file From 9f32118d524a88e8c6034172a48f6c78c7a5a133 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sun, 2 May 2021 17:53:30 +0200 Subject: [PATCH 222/428] Moving to 4.0 --- defaults/dependencies.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 01222502..d100dd99 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,8 +1,8 @@ { - "master_version": "3.1", + "master_version": "4.0", "containers": { - "rl_coach": "3.1.1", + "rl_coach": "4.0.0", "robomaker": "4.0.2-dev", - "sagemaker": "3.1.1" + "sagemaker": "4.0.0" } } From b722aa2173e6967ce9cc718a56d8ae9350247790 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sun, 2 May 2021 19:46:15 +0200 Subject: [PATCH 223/428] Moving to Robomaker 4.0.2 --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index d100dd99..861fab0d 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "4.0", "containers": { "rl_coach": "4.0.0", - "robomaker": "4.0.2-dev", + "robomaker": "4.0.2", "sagemaker": "4.0.0" } } From 3463ece2d8334fec94efc80af53ff02ae81cc237 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Mon, 3 May 2021 21:30:48 +0200 Subject: [PATCH 224/428] Moving back to 4.0.2-dev --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 861fab0d..d100dd99 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "4.0", "containers": { "rl_coach": "4.0.0", - "robomaker": "4.0.2", + "robomaker": "4.0.2-dev", "sagemaker": "4.0.0" } } From 
6c0ef1e785a04593a3830c7069faa142fc541c83 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Tue, 4 May 2021 19:30:35 +0200 Subject: [PATCH 225/428] Move to 4.0.3 --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 861fab0d..ea424f4b 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "4.0", "containers": { "rl_coach": "4.0.0", - "robomaker": "4.0.2", + "robomaker": "4.0.3", "sagemaker": "4.0.0" } } From 38d7bbe7e33528468f45912499bf1958d3f5f740 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Fri, 14 May 2021 17:28:33 +0200 Subject: [PATCH 226/428] Upgrade to 4.0 (#63) * Add Eval Reward functiond debug parameter. * Move to version 4.0.2-dev of Robomaker * Moving to 4.0 * Moving to Robomaker 4.0.2 * Moving back to 4.0.2-dev * Move to 4.0.3 --- defaults/dependencies.json | 8 ++++---- defaults/template-run.env | 1 + docker/docker-compose-eval.yml | 1 + 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 6be94b9a..ea424f4b 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,8 +1,8 @@ { - "master_version": "3.1", + "master_version": "4.0", "containers": { - "rl_coach": "3.1.1", - "robomaker": "3.1.2", - "sagemaker": "3.1.1" + "rl_coach": "4.0.0", + "robomaker": "4.0.3", + "sagemaker": "4.0.0" } } diff --git a/defaults/template-run.env b/defaults/template-run.env index 98e6ebb8..def7da0b 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -17,6 +17,7 @@ DR_EVAL_OPP_CAR_BODY_SHELL_TYPE=deepracer DR_EVAL_OPP_CAR_NAME=FasterCar DR_EVAL_OPP_DISPLAY_NAME=$DR_EVAL_OPP_CAR_NAME DR_EVAL_OPP_RACER_NAME=$DR_EVAL_OPP_CAR_NAME +DR_EVAL_DEBUG_REWARD=False #DR_EVAL_RTF=1.0 DR_TRAIN_CHANGE_START_POSITION=True DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False 
diff --git a/docker/docker-compose-eval.yml b/docker/docker-compose-eval.yml index 0046efdc..585b3949 100644 --- a/docker/docker-compose-eval.yml +++ b/docker/docker-compose-eval.yml @@ -16,6 +16,7 @@ services: - "${DR_ROBOMAKER_EVAL_PORT}:8080" environment: - CUDA_VISIBLE_DEVICES + - DEBUG_REWARD=${DR_EVAL_DEBUG_REWARD} - WORLD_NAME=${DR_WORLD_NAME} - NUMBER_OF_TRIALS=${DR_NUMBER_OF_EPISODES} - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} From 50adcc7a7e212180573bdfd5d45de8f8f97d808a Mon Sep 17 00:00:00 2001 From: "Matias (Matt) Kreder" Date: Fri, 14 May 2021 12:29:39 -0300 Subject: [PATCH 227/428] Add an arn argument so that we can use submit-monitor for community events (#64) * Add arn so that we can use submit-monitor for community events Co-authored-by: Matias Kreder --- utils/submit-monitor.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/utils/submit-monitor.py b/utils/submit-monitor.py index 4b7df594..59d9181c 100755 --- a/utils/submit-monitor.py +++ b/utils/submit-monitor.py @@ -30,7 +30,7 @@ def main(): ) except getopt.GetoptError as err: # print help information and exit: - print(err) # will print something like "option -a not recognized" + print(err) # will print something like "option -x not recognized" usage() sys.exit(2) @@ -42,6 +42,7 @@ def main(): create_summary = False model_name = None leaderboard_guid = None + leaderboard_arn = None for opt, arg in opts: if opt in ("-l", "--logs"): @@ -80,8 +81,12 @@ def main(): print("Did not find model with name {}".format(model_name)) sys.exit(1) + if leaderboard_guid.startswith('arn'): + leaderboard_arn = leaderboard_guid + # Find the leaderboard - leaderboard_arn = find_leaderboard(leaderboard_guid) + if not leaderboard_arn: + leaderboard_arn = find_leaderboard(leaderboard_guid) if leaderboard_arn is not None: if verbose: @@ -305,7 +310,7 @@ def usage(): print(" -l Download robomaker logfiles.") print(" -g Download video recordings.") print(" -m Display name of the model to 
submit.") - print(" -b GUID (not ARN) of the leaderboard to submit to.") + print(" -b GUID or ARN of the leaderboard to submit to.") sys.exit(1) From 68a95fdc6e22b19af5e0d1867692dff95501c9e1 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 21 May 2021 20:05:37 +0000 Subject: [PATCH 228/428] Fix resubmission if FAILED --- utils/submit-monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/submit-monitor.py b/utils/submit-monitor.py index 4b7df594..4b838dbc 100755 --- a/utils/submit-monitor.py +++ b/utils/submit-monitor.py @@ -152,7 +152,7 @@ def main(): ) print("Submitted {} to {}.".format(model_name, leaderboard_arn)) - elif latest_submission["LeaderboardSubmissionStatusType"] == "ERROR": + elif latest_submission["LeaderboardSubmissionStatusType"] == "ERROR" or latest_submission["LeaderboardSubmissionStatusType"] == "FAILED": print("Error in previous submission") if download_logs: try: From 90d6fdeaaa378056c98d5ce874a156812fe3a5a5 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 6 Jun 2021 13:52:20 +0200 Subject: [PATCH 229/428] Upgrading to v4.0.5 --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index ea424f4b..f2de3f05 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "4.0", "containers": { "rl_coach": "4.0.0", - "robomaker": "4.0.3", + "robomaker": "4.0.5", "sagemaker": "4.0.0" } } From 876e94445105427d5432950df7d0d5bcbfc9be89 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Mon, 28 Jun 2021 17:50:13 +0000 Subject: [PATCH 230/428] Moving to 4.0.6 images --- defaults/dependencies.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index ea424f4b..48cab2e0 100644 --- a/defaults/dependencies.json +++ 
b/defaults/dependencies.json @@ -1,8 +1,8 @@ { "master_version": "4.0", "containers": { - "rl_coach": "4.0.0", - "robomaker": "4.0.3", + "rl_coach": "4.0.6-dev", + "robomaker": "4.0.6-dev", "sagemaker": "4.0.0" } } From 4de11a53c84822433aad899760fc0ff6e69915ce Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Wed, 7 Jul 2021 13:11:25 +0200 Subject: [PATCH 231/428] Merging Dev / 4.0.6 (#65) * Add Eval Reward functiond debug parameter. * Moving to 4.0.6 images --- defaults/dependencies.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index f2de3f05..2b02e7ce 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,8 +1,8 @@ { "master_version": "4.0", "containers": { - "rl_coach": "4.0.0", - "robomaker": "4.0.5", + "rl_coach": "4.0.6", + "robomaker": "4.0.6", "sagemaker": "4.0.0" } } From 2878d9f0691ba51cb75a77520e9165e44828a72d Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sat, 10 Jul 2021 11:48:04 +0000 Subject: [PATCH 232/428] Enable configuration of reset distance --- defaults/template-run.env | 1 + scripts/evaluation/prepare-config.py | 1 + 2 files changed, 2 insertions(+) diff --git a/defaults/template-run.env b/defaults/template-run.env index def7da0b..94cfbb4c 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -18,6 +18,7 @@ DR_EVAL_OPP_CAR_NAME=FasterCar DR_EVAL_OPP_DISPLAY_NAME=$DR_EVAL_OPP_CAR_NAME DR_EVAL_OPP_RACER_NAME=$DR_EVAL_OPP_CAR_NAME DR_EVAL_DEBUG_REWARD=False +DR_EVAL_RESET_BEHIND_DIST=1.0 #DR_EVAL_RTF=1.0 DR_TRAIN_CHANGE_START_POSITION=True DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False diff --git a/scripts/evaluation/prepare-config.py b/scripts/evaluation/prepare-config.py index 3d8d6115..6eec9428 100755 --- a/scripts/evaluation/prepare-config.py +++ b/scripts/evaluation/prepare-config.py @@ -68,6 +68,7 @@ def str2bool(v): if is_continous: 
config['NUMBER_OF_RESETS'] = '10000' config['IS_CONTINUOUS'] = os.environ.get('DR_EVAL_IS_CONTINUOUS', 'True') + config['RESET_BEHIND_DIST'] = os.environ.get('DR_EVAL_RESET_BEHIND_DIST', '1.0') config['OFF_TRACK_PENALTY'] = os.environ.get('DR_EVAL_OFF_TRACK_PENALTY', '5.0') config['COLLISION_PENALTY'] = os.environ.get('DR_COLLISION_PENALTY', '5.0') From 8e88313fa2194486e672051146c747e335d2667d Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 10 Jul 2021 13:56:21 +0200 Subject: [PATCH 233/428] Enable configuration of reset distance (#66) * Enable configuration of reset distance --- defaults/template-run.env | 1 + scripts/evaluation/prepare-config.py | 1 + 2 files changed, 2 insertions(+) diff --git a/defaults/template-run.env b/defaults/template-run.env index def7da0b..94cfbb4c 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -18,6 +18,7 @@ DR_EVAL_OPP_CAR_NAME=FasterCar DR_EVAL_OPP_DISPLAY_NAME=$DR_EVAL_OPP_CAR_NAME DR_EVAL_OPP_RACER_NAME=$DR_EVAL_OPP_CAR_NAME DR_EVAL_DEBUG_REWARD=False +DR_EVAL_RESET_BEHIND_DIST=1.0 #DR_EVAL_RTF=1.0 DR_TRAIN_CHANGE_START_POSITION=True DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False diff --git a/scripts/evaluation/prepare-config.py b/scripts/evaluation/prepare-config.py index 3d8d6115..6eec9428 100755 --- a/scripts/evaluation/prepare-config.py +++ b/scripts/evaluation/prepare-config.py @@ -68,6 +68,7 @@ def str2bool(v): if is_continous: config['NUMBER_OF_RESETS'] = '10000' config['IS_CONTINUOUS'] = os.environ.get('DR_EVAL_IS_CONTINUOUS', 'True') + config['RESET_BEHIND_DIST'] = os.environ.get('DR_EVAL_RESET_BEHIND_DIST', '1.0') config['OFF_TRACK_PENALTY'] = os.environ.get('DR_EVAL_OFF_TRACK_PENALTY', '5.0') config['COLLISION_PENALTY'] = os.environ.get('DR_COLLISION_PENALTY', '5.0') From 1f918a88e5b7b207cbd17f929a94f76dd33ebbfe Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sun, 11 Jul 2021 12:13:37 +0200 Subject: [PATCH 234/428] Ensure Kinesis stream 
name is blank if not used --- defaults/template-system.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/template-system.env b/defaults/template-system.env index 6586e4b8..fccecf54 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -6,7 +6,7 @@ DR_UPLOAD_S3_ROLE= DR_LOCAL_S3_BUCKET=bucket DR_LOCAL_S3_PROFILE= DR_GUI_ENABLE=False -DR_KINESIS_STREAM_NAME=None +DR_KINESIS_STREAM_NAME= DR_KINESIS_STREAM_ENABLE=False DR_SAGEMAKER_IMAGE= DR_ROBOMAKER_IMAGE= From 288dcaa3ab10eed2808f336df55ed2f747fdfed3 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sun, 11 Jul 2021 12:41:36 +0200 Subject: [PATCH 235/428] Version update --- utils/Dockerfile.sagemaker-gpu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/Dockerfile.sagemaker-gpu b/utils/Dockerfile.sagemaker-gpu index 8a42d33b..c481021b 100644 --- a/utils/Dockerfile.sagemaker-gpu +++ b/utils/Dockerfile.sagemaker-gpu @@ -1,2 +1,2 @@ -FROM awsdeepracercommunity/deepracer-sagemaker:3.1.1-gpu-nv -ENV CUDA_VISIBLE_DEVICES=1 +FROM awsdeepracercommunity/deepracer-sagemaker:4.0.6-gpu +ENV CUDA_VISIBLE_DEVICES=0 \ No newline at end of file From 270541cb1af69a849468dbe155929ff386e9d247 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Thu, 26 Aug 2021 14:45:55 +0200 Subject: [PATCH 236/428] Added bot-car penalty variable --- defaults/template-run.env | 1 + scripts/evaluation/prepare-config.py | 1 + scripts/training/prepare-config.py | 2 ++ 3 files changed, 4 insertions(+) diff --git a/defaults/template-run.env b/defaults/template-run.env index 94cfbb4c..71d516ef 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -52,3 +52,4 @@ DR_H2B_NUMBER_OF_BOT_CARS=3 DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS=2.0 DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS=False DR_H2B_BOT_CAR_SPEED=0.2 +DR_H2B_BOT_CAR_PENALTY=5.0 \ No newline at end of file diff --git a/scripts/evaluation/prepare-config.py b/scripts/evaluation/prepare-config.py index 
6eec9428..8dc39d3a 100755 --- a/scripts/evaluation/prepare-config.py +++ b/scripts/evaluation/prepare-config.py @@ -98,6 +98,7 @@ def str2bool(v): config['MIN_DISTANCE_BETWEEN_BOT_CARS'] = os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS', '2.0') config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False') config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2') + config['PENALTY_SECONDS'] = os.environ.get('DR_H2B_BOT_CAR_PENALTY', '2.0') # Head to Model if config['RACE_TYPE'] == 'HEAD_TO_MODEL': diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index 554c39b7..087176d2 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -73,6 +73,7 @@ config['MIN_DISTANCE_BETWEEN_BOT_CARS'] = os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS', '2.0') config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False') config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2') + config['PENALTY_SECONDS'] = os.environ.get('DR_H2B_BOT_CAR_PENALTY', '2.0') s3_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None) s3_region = config['AWS_REGION'] @@ -180,6 +181,7 @@ config.update({'MIN_DISTANCE_BETWEEN_BOT_CARS': os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS')}) config.update({'RANDOMIZE_BOT_CAR_LOCATIONS': os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS')}) config.update({'BOT_CAR_SPEED': os.environ.get('DR_H2B_BOT_CAR_SPEED')}) + config.update({'PENALTY_SECONDS': os.environ.get('DR_H2B_BOT_CAR_PENALTY')}) else: config.pop('IS_LANE_CHANGE', None) config.pop('LOWER_LANE_CHANGE_TIME', None) From 8c410e65bd97816863c87e0577d138825631eac4 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 26 Aug 2021 17:36:39 +0200 Subject: [PATCH 237/428] Move robomaker to 4.0.7 --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 2b02e7ce..c990f625 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "4.0", "containers": { "rl_coach": "4.0.6", - "robomaker": "4.0.6", + "robomaker": "4.0.7", "sagemaker": "4.0.0" } } From f28016232c11df902ef351d3d87d81bc698ec298 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 5 Sep 2021 14:18:26 +0200 Subject: [PATCH 238/428] Update dependencies.json --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index c990f625..96a4d014 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "4.0", "containers": { "rl_coach": "4.0.6", - "robomaker": "4.0.7", + "robomaker": "4.0.8", "sagemaker": "4.0.0" } } From 14a401ac1617c5f20cfbd8661b804471159e161e Mon Sep 17 00:00:00 2001 From: "Daryl.Jezierski" Date: Tue, 21 Sep 2021 13:12:44 -0400 Subject: [PATCH 239/428] console port to static and updated root user env --- docker/docker-compose-local.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docker/docker-compose-local.yml b/docker/docker-compose-local.yml index 7254ea35..f2eea31b 100644 --- a/docker/docker-compose-local.yml +++ b/docker/docker-compose-local.yml @@ -1,3 +1,4 @@ + version: '3.7' networks: @@ -10,14 +11,14 @@ services: image: minio/minio ports: - "9000:9000" - command: server /data + - "9001:9001" + command: server /data --console-address ":9001" environment: - - MINIO_ACCESS_KEY=${DR_LOCAL_ACCESS_KEY_ID} - - MINIO_SECRET_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} + - MINIO_ROOT_USER=${DR_LOCAL_ACCESS_KEY_ID} + - MINIO_ROOT_PASSWORD=${DR_LOCAL_SECRET_ACCESS_KEY} - MINIO_UID - MINIO_GID - MINIO_USERNAME - MINIO_GROUPNAME volumes: - ${DR_DIR}/data/minio:/data - From 
de9da8778b87e5dc68e7f7bab5831392b8426446 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oberfrank=20Rezs=C5=91?= Date: Sat, 25 Sep 2021 11:11:36 +0200 Subject: [PATCH 240/428] fix gitbub.com typo --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index e7bc5921..68d32ec2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,7 +2,7 @@ Provides a quick and easy way to get up and running with a DeepRacer training environment in AWS or Azure, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing), or locally on your own desktop or server. -DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://gitbub.com/aws-deepracer-community/deepracer) repository. +DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. 
All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://github.com/aws-deepracer-community/deepracer) repository. # Main Features From 5117c77d4fb7fa54180002054c2c2b4968f1e1d1 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Tue, 5 Oct 2021 20:54:16 +0200 Subject: [PATCH 241/428] Enable increment directly from upload script --- scripts/upload/upload-model.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index 4b3b4d3a..f0b3ab24 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -9,6 +9,7 @@ usage(){ echo " -p model Uploads model in specified S3 prefix." echo " -i Import model with the upload name" echo " -I name Import model with a specific name" + echo " -1 Increment upload name with 1 (dr-increment-upload-model)" exit 1 } @@ -19,13 +20,13 @@ function ctrl_c() { exit 1 } -while getopts ":fwdhbp:c:iI:" opt; do +while getopts ":fwdhbp:c:1iI:" opt; do case $opt in b) OPT_CHECKPOINT="Best" ;; c) OPT_CHECKPOINT_NUM="$OPTARG" ;; -f) OPT_FORCE="True" +f) OPT_FORCE="-f" ;; d) OPT_DRYRUN="--dryrun" ;; @@ -37,6 +38,8 @@ i) OPT_IMPORT="$DR_UPLOAD_S3_PREFIX" ;; I) OPT_IMPORT="$OPTARG" ;; +1) OPT_INCREMENT="Yes" +;; h) usage ;; \?) 
echo "Invalid option -$OPTARG" >&2 @@ -65,6 +68,11 @@ then exit 1 fi +if [[ -n "${OPT_INCREMENT}" ]]; +then + $DR_DIR/scripts/upload/increment.sh ${OPT_FORCE} +fi + SOURCE_S3_BUCKET=${DR_LOCAL_S3_BUCKET} if [[ -n "${OPT_PREFIX}" ]]; then From e3048de56659016075d0efd1f98d275360059382 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Wed, 6 Oct 2021 08:33:47 +0200 Subject: [PATCH 242/428] Bugfix - ensure incremented model is used --- scripts/upload/increment.sh | 2 ++ scripts/upload/upload-model.sh | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/upload/increment.sh b/scripts/upload/increment.sh index e22ac05c..7ec4f75b 100755 --- a/scripts/upload/increment.sh +++ b/scripts/upload/increment.sh @@ -72,6 +72,8 @@ else exit 1 fi +export DR_UPLOAD_S3_PREFIX=$(eval echo "${NEW_UPLOAD_MODEL}") + if [[ -n "${OPT_WIPE}" ]]; then MODEL_DIR_S3=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${DR_LOCAL_S3_BUCKET}/${NEW_UPLOAD_MODEL} ) diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index f0b3ab24..207dd301 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -53,6 +53,11 @@ then echo "*** DRYRUN MODE ***" fi +if [[ -n "${OPT_INCREMENT}" ]]; +then + source $DR_DIR/scripts/upload/increment.sh ${OPT_FORCE} +fi + export TARGET_S3_BUCKET=${DR_UPLOAD_S3_BUCKET} export TARGET_S3_PREFIX=${DR_UPLOAD_S3_PREFIX} @@ -68,11 +73,6 @@ then exit 1 fi -if [[ -n "${OPT_INCREMENT}" ]]; -then - $DR_DIR/scripts/upload/increment.sh ${OPT_FORCE} -fi - SOURCE_S3_BUCKET=${DR_LOCAL_S3_BUCKET} if [[ -n "${OPT_PREFIX}" ]]; then From 581c5798e672dab61dc16bc3582d5803ea2d7da1 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Wed, 6 Oct 2021 08:38:04 +0200 Subject: [PATCH 243/428] Fix import bug --- scripts/upload/upload-model.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index 207dd301..ef4a688f 100755 --- 
a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -56,6 +56,7 @@ fi if [[ -n "${OPT_INCREMENT}" ]]; then source $DR_DIR/scripts/upload/increment.sh ${OPT_FORCE} + OPT_IMPORT="$DR_UPLOAD_S3_PREFIX" fi export TARGET_S3_BUCKET=${DR_UPLOAD_S3_BUCKET} From 9ec4e93c7538d659fbe5035e9922fc29ef8c1061 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sun, 10 Oct 2021 20:05:48 +0200 Subject: [PATCH 244/428] Allow restart viewer through start training --- bin/scripts_wrapper.sh | 40 ++++++++++++++++++--------------------- scripts/training/start.sh | 12 ++++++++++-- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/bin/scripts_wrapper.sh b/bin/scripts_wrapper.sh index 75629661..ac6ff4cd 100644 --- a/bin/scripts_wrapper.sh +++ b/bin/scripts_wrapper.sh @@ -3,19 +3,19 @@ function dr-upload-custom-files { eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) echo "Uploading files to $CUSTOM_TARGET" - aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DIR/custom_files/ $CUSTOM_TARGET + aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DR_DIR/custom_files/ $CUSTOM_TARGET } function dr-upload-model { - dr-update-env && ${DIR}/scripts/upload/upload-model.sh "$@" + dr-update-env && ${DR_DIR}/scripts/upload/upload-model.sh "$@" } function dr-download-model { - dr-update-env && ${DIR}/scripts/upload/download-model.sh "$@" + dr-update-env && ${DR_DIR}/scripts/upload/download-model.sh "$@" } function dr-upload-car-zip { - dr-update-env && ${DIR}/scripts/upload/upload-car.sh "$@" + dr-update-env && ${DR_DIR}/scripts/upload/upload-car.sh "$@" } function dr-list-aws-models { @@ -27,35 +27,35 @@ function dr-set-upload-model { } function dr-increment-upload-model { - dr-update-env && ${DIR}/scripts/upload/increment.sh "$@" && dr-update-env + dr-update-env && ${DR_DIR}/scripts/upload/increment.sh "$@" && dr-update-env } function dr-download-custom-files { eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) 
echo "Downloading files from $CUSTOM_TARGET" - aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $CUSTOM_TARGET $DIR/custom_files/ + aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $CUSTOM_TARGET $DR_DIR/custom_files/ } function dr-start-training { dr-update-env - $DIR/scripts/training/start.sh "$@" + $DR_DIR/scripts/training/start.sh "$@" } function dr-increment-training { - dr-update-env && ${DIR}/scripts/training/increment.sh "$@" && dr-update-env + dr-update-env && ${DR_DIR}/scripts/training/increment.sh "$@" && dr-update-env } function dr-stop-training { - ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/training && ./stop.sh" + ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/training && ./stop.sh" } function dr-start-evaluation { dr-update-env - $DIR/scripts/evaluation/start.sh "$@" + $DR_DIR/scripts/evaluation/start.sh "$@" } function dr-stop-evaluation { - ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/evaluation && ./stop.sh" + ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/evaluation && ./stop.sh" } @@ -65,14 +65,14 @@ function dr-start-tournament { function dr-start-loganalysis { - ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/log-analysis && ./start.sh" + ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/log-analysis && ./start.sh" } function dr-stop-loganalysis { eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') if [ -n "$LOG_ANALYSIS_ID" ]; then - ROBOMAKER_COMMAND="" bash -c "cd $DIR/scripts/log-analysis && ./stop.sh" + ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/log-analysis && ./stop.sh" else echo "Log-analysis is not running." 
fi @@ -292,22 +292,18 @@ function dr-url-loganalysis { } function dr-view-stream { - ${DIR}/utils/start-local-browser.sh "$@" + ${DR_DIR}/utils/start-local-browser.sh "$@" } function dr-start-viewer { - dr-update-env - $DIR/scripts/viewer/start.sh "$@" + $DR_DIR/scripts/viewer/start.sh "$@" } function dr-stop-viewer { - dr-update-env - $DIR/scripts/viewer/stop.sh "$@" + $DR_DIR/scripts/viewer/stop.sh "$@" } function dr-update-viewer { - dr-update-env - $DIR/scripts/viewer/stop.sh "$@" - $DIR/scripts/viewer/start.sh "$@" - + $DR_DIR/scripts/viewer/stop.sh "$@" + $DR_DIR/scripts/viewer/start.sh "$@" } diff --git a/scripts/training/start.sh b/scripts/training/start.sh index 6b8a36eb..9fa05937 100755 --- a/scripts/training/start.sh +++ b/scripts/training/start.sh @@ -3,11 +3,12 @@ source $DR_DIR/bin/scripts_wrapper.sh usage(){ - echo "Usage: $0 [-w] [-q | -s | -r [n] | -a ]" + echo "Usage: $0 [-w] [-q | -s | -r [n] | -a ] [-v]" echo " -w Wipes the target AWS DeepRacer model structure before upload." echo " -q Do not output / follow a log when starting." echo " -a Follow all Sagemaker and Robomaker logs." echo " -s Follow Sagemaker logs (default)." + echo " -v Updates the viewer webpage." echo " -r [n] Follow Robomaker logs for worker n (default worker 0 / replica 1)." exit 1 } @@ -21,7 +22,7 @@ function ctrl_c() { OPT_DISPLAY="SAGEMAKER" -while getopts ":whqsar:" opt; do +while getopts ":whqsavr:" opt; do case $opt in w) OPT_WIPE="WIPE" ;; @@ -40,6 +41,8 @@ r) # Check if value is in numeric format. ((OPTIND--)) fi ;; +v) OPT_VIEWER="VIEWER" +;; h) usage ;; \?) echo "Invalid option -$OPTARG" >&2 @@ -159,6 +162,11 @@ else DISPLAY=$ROBO_DISPLAY docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR up -d --scale robomaker=$DR_WORKERS fi +# Viewer +if [ -n "$OPT_VIEWER" ]; then + (sleep 5; dr-update-viewer) +fi + # Request to be quiet. Quitting here. 
if [ -n "$OPT_QUIET" ]; then exit 0 From 4095cde865ec2b6f79a6344e4b1af0920fbefb75 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 16 Oct 2021 20:46:05 +0200 Subject: [PATCH 245/428] Move to 4.0.9 --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 96a4d014..497cce04 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "4.0", "containers": { "rl_coach": "4.0.6", - "robomaker": "4.0.8", + "robomaker": "4.0.9", "sagemaker": "4.0.0" } } From 2b4b1efb6f92ddc22bf424e314069d34dff043f5 Mon Sep 17 00:00:00 2001 From: Yuan Yao Date: Wed, 27 Oct 2021 12:55:19 -0700 Subject: [PATCH 246/428] Add quotes to S3 download source URL --- scripts/upload/download-model.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/upload/download-model.sh b/scripts/upload/download-model.sh index 2677ec9d..9f5958fb 100755 --- a/scripts/upload/download-model.sh +++ b/scripts/upload/download-model.sh @@ -45,7 +45,7 @@ then echo "*** DRYRUN MODE ***" fi -SOURCE_S3_URL=${OPT_SOURCE} +SOURCE_S3_URL="${OPT_SOURCE}" if [[ -z "${SOURCE_S3_URL}" ]]; then @@ -69,11 +69,11 @@ WORK_DIR=${DR_DIR}/tmp/download mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}/config ${WORK_DIR}/full # Check if metadata-files are available -REWARD_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp ${SOURCE_REWARD_FILE_S3_KEY} ${WORK_DIR}/config/ --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) -METADATA_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp ${SOURCE_METADATA_S3_KEY} ${WORK_DIR}/config/ --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f 2> /dev/null) -HYPERPARAM_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp ${SOURCE_HYPERPARAM_FILE_S3_KEY} ${WORK_DIR}/config/ --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink 
-f 2> /dev/null) +REWARD_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp "${SOURCE_REWARD_FILE_S3_KEY}" ${WORK_DIR}/config/ --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) +METADATA_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp "${SOURCE_METADATA_S3_KEY}" ${WORK_DIR}/config/ --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f 2> /dev/null) +HYPERPARAM_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp "${SOURCE_HYPERPARAM_FILE_S3_KEY}" ${WORK_DIR}/config/ --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink -f 2> /dev/null) -if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$HYPERPARAM_FILE" ]; +if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$HYPERPARAM_FILE" ]; then echo "All meta-data files found. Source model ${SOURCE_S3_URL} valid." else @@ -94,7 +94,7 @@ then fi cd ${WORK_DIR} -aws ${DR_UPLOAD_PROFILE} s3 sync ${SOURCE_S3_URL} ${WORK_DIR}/full/ ${OPT_DRYRUN} +aws ${DR_UPLOAD_PROFILE} s3 sync "${SOURCE_S3_URL}" ${WORK_DIR}/full/ ${OPT_DRYRUN} aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 sync ${WORK_DIR}/full/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ ${OPT_DRYRUN} ${OPT_WIPE} if [[ -n "${OPT_CONFIG}" ]]; From d8cec72636cd99d4a836d7a776884a2cb6b52ca2 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Thu, 4 Nov 2021 15:48:01 +0000 Subject: [PATCH 247/428] Fixing a bug --- utils/submit-monitor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/utils/submit-monitor.py b/utils/submit-monitor.py index 77ae0eed..1b02f67d 100755 --- a/utils/submit-monitor.py +++ b/utils/submit-monitor.py @@ -186,7 +186,10 @@ def main(): # Maintain our summary if create_summary: for idx, i in enumerate(my_submissions["LeaderboardSubmissions"]): - if i["SubmissionTime"] == latest_submission["SubmissionTime"]: + if "SubmissionTime" in i: + if i["SubmissionTime"] == latest_submission["SubmissionTime"]: + del my_submissions["LeaderboardSubmissions"][idx] + else: del 
my_submissions["LeaderboardSubmissions"][idx] my_submissions["LeaderboardSubmissions"].append(latest_submission) From 927c96815574384c58c733d438fb61f829e8d283 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 13 Nov 2021 20:21:06 +0100 Subject: [PATCH 248/428] Version 4.0.10 of Robomaker --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 497cce04..1a4bbad0 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "4.0", "containers": { "rl_coach": "4.0.6", - "robomaker": "4.0.9", + "robomaker": "4.0.10", "sagemaker": "4.0.0" } } From 04cfba679be640674a3671ff036bc7fb0571c1fb Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 13 Nov 2021 20:25:54 +0100 Subject: [PATCH 249/428] Adding some further doc --- docs/reference.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index cbeebe4c..6fc6e12a 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -17,7 +17,7 @@ The scripts assume that two files `system.env` containing constant configuration | `DR_EVAL_IS_CONTINUOUS` | If False, your evaluation trial will end if you car goes off track or is in a collision. If True, your car will take the penalty times as configured in those parameters, but continue evaluating the trial.| | `DR_EVAL_OFF_TRACK_PENALTY` | Number of seconds penalty time added for an off track during evaluation. Only takes effect if `DR_EVAL_IS_CONTINUOUS` is set to True.| | `DR_EVAL_COLLISION_PENALTY` | Number of seconds penalty time added for a collision during evaluation. Only takes effect if `DR_EVAL_IS_CONTINUOUS` is set to True.| -| `DR_EVAL_SAVE_MP4` | TODO | +| `DR_EVAL_SAVE_MP4` | Set to `True` to save MP4 of an evaluation run. | | `DR_TRAIN_CHANGE_START_POSITION` | Determines if the racer shall round-robin the starting position during training sessions. 
(Recommended to be `True` for initial training.)| | `DR_TRAIN_ALTERNATE_DRIVING_DIRECTION` | `True` or `False`. If `True`, the car will alternate driving between clockwise and counter-clockwise each episode.| | `DR_TRAIN_START_POSITION_OFFSET` | Used to control where to start the training from on first episode.| @@ -39,7 +39,7 @@ The scripts assume that two files `system.env` containing constant configuration | `DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES` | Minimum distance in meters between obstacles.| | `DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS` | If True, obstacle locations will randomly change after each episode.| | `DR_OA_IS_OBSTACLE_BOT_CAR` | If True, obstacles will appear as a stationary car instead of a box.| -| `DR_OA_OBJECT_POSITIONS` | TODO.| +| `DR_OA_OBJECT_POSITIONS` | Positions of boxes on the track. Tuples consisting of progress (fraction [0..1]) and inside or outside lane (-1 or 1). Example: `"0.23,-1;0.46,1"`| | `DR_H2B_IS_LANE_CHANGE` | If True, bot cars will change lanes based on configuration.| | `DR_H2B_LOWER_LANE_CHANGE_TIME` | Minimum time in seconds before car will change lanes.| | `DR_H2B_UPPER_LANE_CHANGE_TIME` | Maximum time in seconds before car will change langes.| @@ -48,7 +48,7 @@ The scripts assume that two files `system.env` containing constant configuration | `DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS` | Minimum distance between bot cars.| | `DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS` | If True, bot car locations will randomly change after each episode.| | `DR_H2B_BOT_CAR_SPEED` | How fast the bot cars go in meters per second.| -| `DR_CLOUD` | Can be `azure`, `aws` or `local`; determines how the storage will be configured.| +| `DR_CLOUD` | Can be `azure`, `aws`, `local` or `remote`; determines how the storage will be configured.| | `DR_AWS_APP_REGION` | (AWS only) Region for other AWS resources (e.g. 
Kinesis) | | `DR_UPLOAD_S3_PROFILE` | AWS Cli profile to be used that holds the 'real' S3 credentials needed to upload a model into AWS DeepRacer.| | `DR_UPLOAD_S3_BUCKET` | Name of the AWS DeepRacer bucket where models will be uploaded. (Typically starts with `aws-deepracer-`.)| From f92b8950d9a97c45a8cbc2f4a6f368adaee96c84 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sun, 14 Nov 2021 14:44:40 +0100 Subject: [PATCH 250/428] Enhance Evaluation Mode --- defaults/template-run.env | 1 + scripts/evaluation/prepare-config.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/defaults/template-run.env b/defaults/template-run.env index 71d516ef..6bc4976f 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -9,6 +9,7 @@ DR_RACER_NAME=$DR_CAR_NAME DR_ENABLE_DOMAIN_RANDOMIZATION=False DR_EVAL_NUMBER_OF_TRIALS=3 DR_EVAL_IS_CONTINUOUS=True +DR_EVAL_ENABLE_RESETS=True DR_EVAL_OFF_TRACK_PENALTY=5.0 DR_EVAL_COLLISION_PENALTY=5.0 DR_EVAL_SAVE_MP4=False diff --git a/scripts/evaluation/prepare-config.py b/scripts/evaluation/prepare-config.py index 8dc39d3a..d0220880 100755 --- a/scripts/evaluation/prepare-config.py +++ b/scripts/evaluation/prepare-config.py @@ -65,10 +65,13 @@ def str2bool(v): config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') is_continous = str2bool(os.environ.get('DR_EVAL_IS_CONTINUOUS', 'False')) +is_reset = str2bool(os.environ.get('DR_EVAL_ENABLE_RESETS', 'False')) if is_continous: - config['NUMBER_OF_RESETS'] = '10000' config['IS_CONTINUOUS'] = os.environ.get('DR_EVAL_IS_CONTINUOUS', 'True') + +if is_reset: config['RESET_BEHIND_DIST'] = os.environ.get('DR_EVAL_RESET_BEHIND_DIST', '1.0') + config['NUMBER_OF_RESETS'] = '10000' config['OFF_TRACK_PENALTY'] = os.environ.get('DR_EVAL_OFF_TRACK_PENALTY', '5.0') config['COLLISION_PENALTY'] = os.environ.get('DR_COLLISION_PENALTY', '5.0') From 87e435a73e6a3bff33821aceb57b274003d58789 Mon Sep 17 00:00:00 2001 From: Lars 
Ludvigsen Date: Sun, 14 Nov 2021 15:43:33 +0100 Subject: [PATCH 251/428] Improved resets - require robomaker 4.0.11 --- defaults/template-run.env | 2 +- scripts/evaluation/prepare-config.py | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/defaults/template-run.env b/defaults/template-run.env index 6bc4976f..a521eb9b 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -9,7 +9,7 @@ DR_RACER_NAME=$DR_CAR_NAME DR_ENABLE_DOMAIN_RANDOMIZATION=False DR_EVAL_NUMBER_OF_TRIALS=3 DR_EVAL_IS_CONTINUOUS=True -DR_EVAL_ENABLE_RESETS=True +DR_EVAL_MAX_RESETS=100 DR_EVAL_OFF_TRACK_PENALTY=5.0 DR_EVAL_COLLISION_PENALTY=5.0 DR_EVAL_SAVE_MP4=False diff --git a/scripts/evaluation/prepare-config.py b/scripts/evaluation/prepare-config.py index d0220880..57a12de4 100755 --- a/scripts/evaluation/prepare-config.py +++ b/scripts/evaluation/prepare-config.py @@ -63,15 +63,10 @@ def str2bool(v): config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') config['NUMBER_OF_TRIALS'] = os.environ.get('DR_EVAL_NUMBER_OF_TRIALS', '5') config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') +config['RESET_BEHIND_DIST'] = os.environ.get('DR_EVAL_RESET_BEHIND_DIST', '1.0') -is_continous = str2bool(os.environ.get('DR_EVAL_IS_CONTINUOUS', 'False')) -is_reset = str2bool(os.environ.get('DR_EVAL_ENABLE_RESETS', 'False')) -if is_continous: - config['IS_CONTINUOUS'] = os.environ.get('DR_EVAL_IS_CONTINUOUS', 'True') - -if is_reset: - config['RESET_BEHIND_DIST'] = os.environ.get('DR_EVAL_RESET_BEHIND_DIST', '1.0') - config['NUMBER_OF_RESETS'] = '10000' +config['IS_CONTINUOUS'] = os.environ.get('DR_EVAL_IS_CONTINUOUS', 'True') +config['NUMBER_OF_RESETS'] = os.environ.get('DR_EVAL_MAX_RESETS', '0') config['OFF_TRACK_PENALTY'] = os.environ.get('DR_EVAL_OFF_TRACK_PENALTY', '5.0') config['COLLISION_PENALTY'] = os.environ.get('DR_COLLISION_PENALTY', '5.0') From d7ba389efe002657a23de0b378a77c232d13253f Mon Sep 17 
00:00:00 2001 From: Lars Ludvigsen Date: Sun, 14 Nov 2021 15:57:04 +0100 Subject: [PATCH 252/428] Enable choice of checkpoint for eval --- defaults/template-run.env | 1 + scripts/evaluation/prepare-config.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/defaults/template-run.env b/defaults/template-run.env index a521eb9b..90959b5a 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -13,6 +13,7 @@ DR_EVAL_MAX_RESETS=100 DR_EVAL_OFF_TRACK_PENALTY=5.0 DR_EVAL_COLLISION_PENALTY=5.0 DR_EVAL_SAVE_MP4=False +DR_EVAL_CHECKPOINT=last DR_EVAL_OPP_S3_MODEL_PREFIX=rl-deepracer-sagemaker DR_EVAL_OPP_CAR_BODY_SHELL_TYPE=deepracer DR_EVAL_OPP_CAR_NAME=FasterCar diff --git a/scripts/evaluation/prepare-config.py b/scripts/evaluation/prepare-config.py index 57a12de4..9bfb33f2 100755 --- a/scripts/evaluation/prepare-config.py +++ b/scripts/evaluation/prepare-config.py @@ -51,6 +51,9 @@ def str2bool(v): config['MP4_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) config['MP4_S3_OBJECT_PREFIX'].append('{}/{}'.format(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'bucket'),'mp4')) +# Checkpoint +config['EVAL_CHECKPOINT'] = os.environ.get('DR_EVAL_CHECKPOINT', 'last') + # Car and training body_shell_type = os.environ.get('DR_CAR_BODY_SHELL_TYPE', 'deepracer') config['BODY_SHELL_TYPE'].append(body_shell_type) From a59409f8251a651826175a55bdf81154ee3f0f9e Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Sun, 28 Nov 2021 21:04:56 +0000 Subject: [PATCH 253/428] FIx download of logs. 
--- utils/submit-monitor.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/utils/submit-monitor.py b/utils/submit-monitor.py index 1b02f67d..1688e700 100755 --- a/utils/submit-monitor.py +++ b/utils/submit-monitor.py @@ -3,6 +3,7 @@ import sys import getopt import os +import traceback import pickle import urllib.request @@ -126,10 +127,10 @@ def main(): try: f_url = dr.get_asset_url( Arn=latest_submission["ActivityArn"], - AssetType="ROBOMAKER_CLOUDWATCH_LOG", + AssetType="LOGS", )["Url"] download_file( - "{}/{}/robomaker-{}-{}.log".format( + "{}/{}/robomaker-{}-{}.tar.gz".format( logs_path, leaderboard_guid, latest_submission["SubmissionTime"], @@ -139,6 +140,7 @@ def main(): ) except ClientError: print(("WARNING: Logfile for job {} not available.").format(jobid)) + traceback.print_exc() if download_videos: download_file( @@ -163,10 +165,10 @@ def main(): try: f_url = dr.get_asset_url( Arn=latest_submission["ActivityArn"], - AssetType="ROBOMAKER_CLOUDWATCH_LOG", + AssetType="LOGS", )["Url"] download_file( - "{}/{}/robomaker-{}-{}.log".format( + "{}/{}/robomaker-{}-{}.tar.gz".format( logs_path, leaderboard_guid, latest_submission["SubmissionTime"], @@ -176,6 +178,7 @@ def main(): ) except ClientError: print(("WARNING: Logfile for job {} not available.").format(jobid)) + traceback.print_exc() # Submit again _ = dr.create_leaderboard_submission( From 8d41067c04df334ed7e4cf9769fc4e3bbdbdef23 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 11 Dec 2021 12:13:10 +0100 Subject: [PATCH 254/428] Adjusting KINESIS documentation --- docs/reference.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index 6fc6e12a..479aeda0 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -54,8 +54,8 @@ The scripts assume that two files `system.env` containing constant configuration | `DR_UPLOAD_S3_BUCKET` | Name of the AWS DeepRacer bucket where models will be uploaded. 
(Typically starts with `aws-deepracer-`.)| | `DR_LOCAL_S3_PROFILE` | Name of AWS profile with credentials to be used. Stored in `~/.aws/credentials` unless AWS IAM Roles are used.| | `DR_GUI_ENABLE` | Enable or disable the Gazebo GUI in Robomaker | -| `DR_KINESIS_STREAM_NAME` | Kinesis stream name | -| `DR_KINESIS_STREAM_ENABLE` | Enable or disable Kinesis Stream | +| `DR_KINESIS_STREAM_NAME` | Kinesis stream name. Used if you actually publish to the AWS KVS service. Leave blank if you do not want this. | +| `DR_KINESIS_STREAM_ENABLE` | Enable or disable 'Kinesis Stream', True both publishes to a AWS KVS stream (if name not None), and to the topic `/racecar/deepracer/kvs_stream`. Leave True if you want to watch the car racing. | | `DR_SAGEMAKER_IMAGE` | Determines which sagemaker image will be used for training.| | `DR_ROBOMAKER_IMAGE` | Determines which robomaker image will be used for training or evaluation.| | `DR_COACH_IMAGE` | Determines which coach image will be used for training.| From fd21df864a5b98533d72c7d6e4da1788a032e879 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 11 Dec 2021 19:48:53 +0100 Subject: [PATCH 255/428] Change default value to enable viewer --- defaults/template-system.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/template-system.env b/defaults/template-system.env index fccecf54..807f35a4 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -7,7 +7,7 @@ DR_LOCAL_S3_BUCKET=bucket DR_LOCAL_S3_PROFILE= DR_GUI_ENABLE=False DR_KINESIS_STREAM_NAME= -DR_KINESIS_STREAM_ENABLE=False +DR_KINESIS_STREAM_ENABLE=True DR_SAGEMAKER_IMAGE= DR_ROBOMAKER_IMAGE= DR_ANALYSIS_IMAGE=cpu From 955848ab8ada940094be76fa15e308c59dffdfbc Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 18 Dec 2021 11:28:38 +0100 Subject: [PATCH 256/428] Update Slack reference --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/docs/index.md b/docs/index.md index 68d32ec2..b3cb5add 100644 --- a/docs/index.md +++ b/docs/index.md @@ -43,5 +43,5 @@ DRfC supports a wide set of features to ensure that you can focus on creating th # Support -* For general support it is suggested to join the [AWS DeepRacing Community](https://deepracing.io/). The Community Slack has a channel #dr-drfc-setup where the community provides active support. +* For general support it is suggested to join the [AWS DeepRacing Community](https://deepracing.io/). The Community Slack has a channel #dr-training-local where the community provides active support. * Create a GitHub issue if you find an actual code issue, or where updates to documentation would be required. From 389c114d8726dd5e174652fb59a9916d0eb0ad57 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Tue, 21 Dec 2021 21:58:03 +0100 Subject: [PATCH 257/428] Update model_metadata.json --- defaults/model_metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/model_metadata.json b/defaults/model_metadata.json index 023866f9..5f3eba9a 100644 --- a/defaults/model_metadata.json +++ b/defaults/model_metadata.json @@ -25,5 +25,5 @@ "neural_network": "DEEP_CONVOLUTIONAL_NETWORK_SHALLOW", "training_algorithm": "clipped_ppo", "action_space_type": "discrete", - "version": "3" + "version": "4" } From 9fb80b0158160fd89db136e45ffb5e80fbec5fc2 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Tue, 21 Dec 2021 21:58:13 +0100 Subject: [PATCH 258/428] Update model_metadata_sac.json --- defaults/model_metadata_sac.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/model_metadata_sac.json b/defaults/model_metadata_sac.json index bf99366d..07c8ac04 100644 --- a/defaults/model_metadata_sac.json +++ b/defaults/model_metadata_sac.json @@ -4,5 +4,5 @@ "neural_network": "DEEP_CONVOLUTIONAL_NETWORK_SHALLOW", 
"training_algorithm": "sac", "action_space_type": "continuous", - "version": "3" + "version": "4" } From 0a57dcd8d6cf7edfd6f317924ac55773d9abb3aa Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 8 Jan 2022 17:29:02 +0100 Subject: [PATCH 259/428] Update dependencies.json --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 1a4bbad0..6aa906fb 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "4.0", "containers": { "rl_coach": "4.0.6", - "robomaker": "4.0.10", + "robomaker": "4.0.11", "sagemaker": "4.0.0" } } From 0d520030b834f573e70b138dc8f81841f3dec020 Mon Sep 17 00:00:00 2001 From: dartjason <51768630+dartjason@users.noreply.github.com> Date: Sat, 22 Jan 2022 06:38:01 -0500 Subject: [PATCH 260/428] Update Windows documentation (#82) * Update windows.md --- docs/windows.md | 58 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/docs/windows.md b/docs/windows.md index bbca8289..f6da6ae1 100644 --- a/docs/windows.md +++ b/docs/windows.md @@ -2,35 +2,75 @@ ## Prerequisites -The basic installation steps to get a NVIDIA GPU / CUDA enabled Ubuntu subsystem on Windows can be found in the [Cuda on WSL User Guide](https://docs.nvidia.com/cuda/wsl-user-guide/index.html). +The basic installation steps to get a NVIDIA GPU / CUDA enabled Ubuntu subsystem on Windows can be found in the [Cuda on WSL User Guide](https://docs.nvidia.com/cuda/wsl-user-guide/index.html). Ensure your windows has an updated [nvidia cuda enabled driver](https://developer.nvidia.com/cuda/wsl/download) that will work with WSL. + +The further instructions assume that you have a basic working WSL using the default Ubuntu distribution. -The further instructions assume that you have a working Nvidia enabled Docker. 
## Additional steps -The `bin/prepare.sh` will not work for a Ubuntu WSL installation, hence additional steps will be required. +The typical `bin/prepare.sh` script will not work for a Ubuntu WSL installation, hence alternate steps will be required. ### Adding required packages -Install the additional packages with the following command: +Install additional packages with the following command: ``` sudo apt-get install jq awscli python3-boto3 docker-compose ``` -### Configure Docker - -To ensure we always have a GPU enabled Docker container, run: +### Install and configure docker and nvidia-docker ``` +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - +sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" +sudo apt-get update && sudo apt-get install -y --no-install-recommends docker-ce docker-ce-cli containerd.io + +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - +curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list + cat /etc/docker/daemon.json | jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' | sudo tee /etc/docker/daemon.json sudo usermod -a -G docker $(id -un) ``` + ### Install DRfC -You can now run `bin/init.sh -a gpu -c local` to setup DRfC. +You can now run `bin/init.sh -a gpu -c local` to setup DRfC, and follow the typical DRfC startup instructions ## Known Issues * `init.sh` is not able to detect the GPU given differences in the Nvidia drivers, and the WSL2 Linux Kernel. You need to manually set the GPU image in `system.env`. -* Docker does not start automatically when you launch Ubuntu. Start it with `sudo service docker start`. \ No newline at end of file +* Docker does not start automatically when you launch Ubuntu. 
Start it manually with `sudo service docker start` + + You can also configure the service to start automatically using the Windows Task Scheduler + + *1)* Create a new file at /etc/init-wsl (sudo vi /etc/init-wsl) with the following contents. + + ``` + #!/bin/sh + service start docker + ``` + + *2)* Make the script executable `sudo chmod +x /etc/init-wsl` + + *3)* Open Task Scheduler in Windows 10 + + - On the left, click **Task Scheduler Library** option, and then on the right, click **Create Task** + + - In **General** Tab, Enter Name **WSL Startup**, and select **Run whether user is logged on or not** and **Run with highest privileges** options. + + - In **Trigger** tab, click New ... > Begin the task: **At startup** > OK + + - In **Actions** tab, click New ... > Action: **Start a program** + + program/script: **wsl** + + add arguments: **-u root /etc/init-wsl** + + - Click OK to exit + + *4)* You can run the task manually to confirm, or after Windows reboot docker should now automatically start. + +* Video streams may not load using the localhost address. To access the html video streams from your windows browser, you may need to use the IP address of the WSL VM. From a WSL terminal, determine your IP address by the command 'ip addr' and look for **eth0** then **inet** (e.g. ip = 172.29.38.21). Then from your windows browser (edge, chrome, etc) navigate to **ip:8080** (e.g. 
172.29.38.21:8080) + From 93e68aeedd48cc11958ab7ed5a8ced5f28fda80f Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Tue, 25 Jan 2022 19:54:07 +0100 Subject: [PATCH 261/428] Update multi_worker.md --- docs/multi_worker.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/multi_worker.md b/docs/multi_worker.md index 6b285124..23f09094 100644 --- a/docs/multi_worker.md +++ b/docs/multi_worker.md @@ -18,7 +18,7 @@ Note; Sagemaker will stop collecting experiences once you have reached 10.000 st ## Training with different parameters for each worker -It is also possible to use different configurations between workers, such as different tracks (WORLD_NAME). To enable, set DR_MULTI_CONFIG=True inside run.env, then make copies of defaults/template-worker.env in the main deepracer-for-cloud directory with format worker-2.env, worker-3.env, etc. (So alongside run.env, you should have woker-2.env, worker-3.env, etc. run.env is still used for worker 1) Modify the worker env files with your desired changes, which can be more than just the world_name. These additional worker env files are only used if you are training with multiple workers. +It is also possible to use different configurations between workers, such as different tracks (WORLD_NAME). To enable, set DR_TRAIN_MULTI_CONFIG=True inside run.env, then make copies of defaults/template-worker.env in the main deepracer-for-cloud directory with format worker-2.env, worker-3.env, etc. (So alongside run.env, you should have woker-2.env, worker-3.env, etc. run.env is still used for worker 1) Modify the worker env files with your desired changes, which can be more than just the world_name. These additional worker env files are only used if you are training with multiple workers. 
## Watching the streams From 389b799494048703d226e93c57da38d57867cf5d Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 27 Jan 2022 08:35:24 +0100 Subject: [PATCH 262/428] Update dependencies.json --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 6aa906fb..e1d17f90 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,7 +1,7 @@ { "master_version": "4.0", "containers": { - "rl_coach": "4.0.6", + "rl_coach": "4.0.11", "robomaker": "4.0.11", "sagemaker": "4.0.0" } From 80ca1d54ad428238bf34c5d869e362af4dca5ad3 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 12 Feb 2022 12:09:42 +0100 Subject: [PATCH 263/428] Video documentation --- bin/activate.sh | 4 ++-- docs/index.md | 1 + docs/video.md | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 docs/video.md diff --git a/bin/activate.sh b/bin/activate.sh index e40c5cbf..abc53b68 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -42,8 +42,8 @@ function dr-update-env { export DR_ROBOMAKER_EVAL_PORT=$(expr 8180 + $DR_RUN_ID) export DR_ROBOMAKER_GUI_PORT=$(expr 5900 + $DR_RUN_ID) else - export DR_ROBOMAKER_TRAIN_PORT="8080-8100" - export DR_ROBOMAKER_EVAL_PORT="8080-8100" + export DR_ROBOMAKER_TRAIN_PORT="8080-8089" + export DR_ROBOMAKER_EVAL_PORT="8080-8089" export DR_ROBOMAKER_GUI_PORT="5901-5920" fi diff --git a/docs/index.md b/docs/index.md index b3cb5add..a65ef1cb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -40,6 +40,7 @@ DRfC supports a wide set of features to ensure that you can focus on creating th * [Having multiple GPUs in one Computer](multi_gpu.md) * [Installing on Windows](windows.md) * [Run a Head-to-Head Race](head-to-head.md) +* [Watching the car](video.md) # Support diff --git a/docs/video.md b/docs/video.md new file mode 100644 index 
00000000..c4a1f882 --- /dev/null +++ b/docs/video.md @@ -0,0 +1,36 @@ +# Watching the car + +There are multiple ways to watch the car during training and evaluation. The ports and 'features' depend on the docker mode (swarm vs. compose) as well as between training and evaluation. + +## Training using Viewer + +DRfC has a built in viewer that supports showing the video stream from up to 6 workers on one webpage. + +The view can be started with `dr-start-viewer` and is available on `http://localhost:8100` or `http://127.0.0.1:8100`. The viewer must be updated if training is restarted using `dr-update-viewer`, as it needs to connect to the new containers. + +It is also possible to automatically start/update the viewer using the `-v` flag to `dr-start-training`. + +## ROS Stream Viewer + +The ROS Stream Viewer is a built in ROS feature that will stream any topic in ROS that publishing ROSImg messages. The viewer starts automatically. + +### Ports + +| Docker Mode | Training | Evaluation | Comment +| -------- | -------- | -------- | -------- | +| swarm | 8080 + `DR_RUN_ID` | 8180 + `DR_RUN_ID` | Default 8080/8180. Multiple workers share one port, press F5 to cycle between them. +| compose | 8080-8089 | 8080-8089 | Each worker gets a unique port. + +### Topics + +| Topic | Description | +| -------- | -------- | +| `/racecar/camera/zed/rgb/image_rect_color` | In-car video stream. This is used for inference. | +| `/racecar/main_camera/zed/rgb/image_rect_color` | Camera following the car. Stream without overlay | +| `/sub_camera/zed/rgb/image_rect_color` | Top-view of the track | +| `/racecar/deepracer/kvs_stream` | Camera following the car. Stream with overlay. Different overlay in Training and Evaluation | +| `/racecar/deepracer/main_camera_stream` | Same as `kvs_stream`, topic used for MP4 production. 
Only active in Evaluation if `DR_EVAL_SAVE_MP4=True` | + +## Saving Evaluation to File + +During evaluation (`dr-start-evaluation`), if `DR_EVAL_SAVE_MP4=True` then three MP4 files are created in the S3 bucket's MP4 folder. They contain the in-car camera, top-camera and the camera following the car. \ No newline at end of file From 9f5611984a118b9eb581ac462d9f5e4f7a5619ea Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 13 Feb 2022 22:38:02 +0100 Subject: [PATCH 264/428] Moving default to 4.0.12 --- defaults/dependencies.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index e1d17f90..5554b523 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,8 +1,8 @@ { "master_version": "4.0", "containers": { - "rl_coach": "4.0.11", - "robomaker": "4.0.11", + "rl_coach": "4.0.12", + "robomaker": "4.0.12", "sagemaker": "4.0.0" } } From 9b9e01732a4bd9ee8800d6c6462f1606984db829 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 19 Feb 2022 19:07:33 +0100 Subject: [PATCH 265/428] Make minio folder available --- scripts/log-analysis/start.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/log-analysis/start.sh b/scripts/log-analysis/start.sh index 1ada0093..6c4bb5d0 100755 --- a/scripts/log-analysis/start.sh +++ b/scripts/log-analysis/start.sh @@ -4,6 +4,7 @@ docker run --rm -d -p "8888:8888" \ -v `pwd`/../../data/logs:/workspace/logs \ -v `pwd`/../../docker/volumes/.aws:/root/.aws \ -v `pwd`/../../data/analysis:/workspace/analysis \ +-v `pwd`/../../data/minio:/workspace/minio \ --name loganalysis \ --network sagemaker-local \ awsdeepracercommunity/deepracer-analysis:$DR_ANALYSIS_IMAGE From fdc97dab195df5d7cd25669bef85d338e4c34ac0 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Mon, 21 Feb 2022 20:13:09 +0100 Subject: [PATCH 266/428] Enable cloning of model before evaluation --- 
scripts/evaluation/start.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/scripts/evaluation/start.sh b/scripts/evaluation/start.sh index 6b0f1ff8..b5f4a7bd 100755 --- a/scripts/evaluation/start.sh +++ b/scripts/evaluation/start.sh @@ -3,8 +3,9 @@ source $DR_DIR/bin/scripts_wrapper.sh usage(){ - echo "Usage: $0 [-q]" + echo "Usage: $0 [-q] [-c]" echo " -q Quiet - does not start log tracing." + echo " -c Clone - copies model into new prefix before evaluating." exit 1 } @@ -15,10 +16,12 @@ function ctrl_c() { exit 1 } -while getopts ":q" opt; do +while getopts ":qc" opt; do case $opt in q) OPT_QUIET="QUIET" ;; +c) OPT_CLONE="CLONE" +;; h) usage ;; \?) echo "Invalid option -$OPTARG" >&2 @@ -27,6 +30,13 @@ usage esac done +# clone if required +if [ -n "$OPT_CLONE" ]; then + echo "Cloning model into s3://$DR_LOCAL_S3_BUCKET/${DR_LOCAL_S3_MODEL_PREFIX}-E" + aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX/model s3://$DR_LOCAL_S3_BUCKET/${DR_LOCAL_S3_MODEL_PREFIX}-E/model + aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX/ip s3://$DR_LOCAL_S3_BUCKET/${DR_LOCAL_S3_MODEL_PREFIX}-E/ip + export DR_LOCAL_S3_MODEL_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX}-E +fi # set evaluation specific environment variables S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX" From 3ec75b0a98014c9f36b8fd36191ae2ed0e9ee75a Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 3 Mar 2022 19:50:31 +0100 Subject: [PATCH 267/428] Update multi_gpu.md --- docs/multi_gpu.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index 96bd482c..927ad104 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -54,7 +54,7 @@ The number is the CUDA number of the GPU you want the Robomakers to use. 
Sagemaker is more critical to place, but also more complicated, as you will have to build a new Docker image for it to work. -A template is in `utils/Dockerfile.sagemaker-gpu`. Again the number is the applicable CUDA number. +A template is in `utils/Dockerfile.sagemaker-gpu`. Open it to alter the source image in `FROM`, and adapt `CUDA_VISIBLE_DEVICES`. Build the image with `docker build -t awsdeepracercommunity/deepracer-sagemaker:gpu-x -f utils/Dockerfile.sagemaker-gpu .` with x being anything you like. From 75c1ed0b81065f6d08d7c4ceac12176b3c3570f4 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 2 Apr 2022 23:20:47 +0200 Subject: [PATCH 268/428] Version 5 (#86) * Upgrade to Ubuntu 20.04 * Prepare for version 5 * Typo fix * Update dependencies.json * Update init.sh No separate mkl image --- bin/activate.sh | 6 +++--- bin/init.sh | 2 +- bin/prepare.sh | 15 +++++++++------ defaults/dependencies.json | 8 ++++---- docs/installation.md | 6 +++--- utils/Dockerfile.gpu-detect | 2 +- 6 files changed, 21 insertions(+), 18 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index abc53b68..c99c94cf 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -138,16 +138,16 @@ if [[ "${DR_CLOUD_WATCH_ENABLE,,}" == "true" ]]; then fi ## Check if we have an AWS IAM assumed role, or if we need to set specific credentials. 
-if [ $(aws --output json sts get-caller-identity 2> /dev/null | jq '.Arn' | awk /assumed-role/ | wc -l ) -eq 0 ]; +if [ "${DR_CLOUD,,}" == "aws" ] && [ $(aws --output json sts get-caller-identity 2> /dev/null | jq '.Arn' | awk /assumed-role/ | wc -l ) -gt 0 ]; then + export DR_LOCAL_S3_AUTH_MODE="role" +else export DR_LOCAL_ACCESS_KEY_ID=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_access_key_id | xargs) export DR_LOCAL_SECRET_ACCESS_KEY=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs) DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-keys.yml" DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-keys.yml" export DR_UPLOAD_PROFILE="--profile $DR_UPLOAD_S3_PROFILE" export DR_LOCAL_S3_AUTH_MODE="profile" -else - export DR_LOCAL_S3_AUTH_MODE="role" fi export DR_TRAIN_COMPOSE_FILE diff --git a/bin/init.sh b/bin/init.sh index 5de31b6e..e84d3cb2 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -113,7 +113,7 @@ sed -i "s//$AWS_REGION/g" $INSTALL_DIR/system.env if [[ "${OPT_ARCH}" == "gpu" ]]; then SAGEMAKER_TAG="gpu" elif [[ -n "${CPU_INTEL}" ]]; then - SAGEMAKER_TAG="cpu-avx-mkl" + SAGEMAKER_TAG="cpu" else SAGEMAKER_TAG="cpu" fi diff --git a/bin/prepare.sh b/bin/prepare.sh index 9585a33d..d510b45a 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -7,6 +7,8 @@ function ctrl_c() { exit 1 } + + DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ## Patch system @@ -60,14 +62,15 @@ fi ## Adding Nvidia Drivers if [[ "${ARCH}" == "gpu" ]]; then - sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub - sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list' - sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > 
/etc/apt/sources.list.d/cuda_learn.list' - sudo bash -c 'apt update && apt install -y nvidia-driver-440-server cuda-minimal-build-10-2 --no-install-recommends -o Dpkg::Options::="--force-overwrite"' + distribution=$(. /etc/os-release;echo $ID$VERSION_ID | sed 's/\.//') + sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/7fa2af80.pub + echo "deb http://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda.list + echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/$distribution/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda_learn.list + sudo apt update && sudo apt install -y nvidia-driver-470-server cuda-minimal-build-11-4 --no-install-recommends -o Dpkg::Options::="--force-overwrite" fi ## Adding AWSCli -sudo apt-get install -y awscli python3-boto3 +sudo apt-get install -y --no-install-recommends awscli python3-boto3 ## Installing Docker curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - @@ -90,7 +93,7 @@ sudo systemctl restart docker sudo usermod -a -G docker $(id -un) ## Installing Docker Compose -sudo curl -L https://github.com/docker/compose/releases/download/1.25.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose +sudo curl -L https://github.com/docker/compose/releases/download/1.29.2/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose sudo chmod +x /usr/local/bin/docker-compose ## Reboot to load driver -- continue install if in cloud-init diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 5554b523..e28bda49 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,8 +1,8 @@ { - "master_version": "4.0", + "master_version": "5.0", "containers": { - "rl_coach": "4.0.12", - "robomaker": "4.0.12", - "sagemaker": "4.0.0" + "rl_coach": "5.0.0", + "robomaker": "5.0.1", + "sagemaker": "5.0.0" } } diff --git 
a/docs/installation.md b/docs/installation.md index 9f85f6d7..430fc4b4 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -7,7 +7,7 @@ Depending on your needs as well as specific needs of the cloud platform you can **AWS**: * EC2 instance of type G3, G4, P2 or P3 - recommendation is g4dn.2xlarge - for GPU enabled training. C5 or M6 types - recommendation is c5.2xlarge - for CPU training. - * Ubuntu 18.04 + * Ubuntu 20.04 * Minimum 30 GB, preferred 40 GB of OS disk. * Ephemeral Drive connected * Minimum of 8 GB GPU-RAM if running with GPU. @@ -17,7 +17,7 @@ Depending on your needs as well as specific needs of the cloud platform you can **Azure**: * N-Series VM that comes with NVIDIA Graphics Adapter - recommendation is NC6_Standard - * Ubuntu 18.04 + * Ubuntu 20.04 * Standard 30 GB OS drive is sufficient to get started. * Recommended to add an additional 32 GB data disk if you want to use the Log Analysis container. * Minimum 8 GB GPU-RAM @@ -27,7 +27,7 @@ Depending on your needs as well as specific needs of the cloud platform you can **Local**: * A modern, comparatively powerful, Intel based system. - * Ubuntu 18.04 or 20.04, other Linux-dristros likely to work. + * Ubuntu 20.04, other Linux-dristros likely to work. * 4 core-CPU, equivalent to 8 vCPUs; the more the better. * NVIDIA Graphics adapter with minimum 8 GB RAM for Sagemaker to run GPU. Robomaker enabled GPU instances need ~1 GB each. * System RAM + GPU RAM should be at least 32 GB. 
diff --git a/utils/Dockerfile.gpu-detect b/utils/Dockerfile.gpu-detect index 81d78e96..08a069d2 100644 --- a/utils/Dockerfile.gpu-detect +++ b/utils/Dockerfile.gpu-detect @@ -1,4 +1,4 @@ -FROM nvidia/cuda:10.2-base +FROM nvidia/cuda:11.4.2-base-ubuntu18.04 RUN apt-get update && apt-get install -y --no-install-recommends wget python3 RUN wget https://gist.githubusercontent.com/f0k/63a664160d016a491b2cbea15913d549/raw/f25b6b38932cfa489150966ee899e5cc899bf4a6/cuda_check.py CMD ["python3","cuda_check.py"] \ No newline at end of file From 00aa1985976df2786fff646a5b20d565141bfd6c Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Mon, 4 Apr 2022 13:17:53 +0200 Subject: [PATCH 269/428] pkg-config needed for clean setup --- utils/setup-xorg.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/setup-xorg.sh b/utils/setup-xorg.sh index 101226e6..35526151 100755 --- a/utils/setup-xorg.sh +++ b/utils/setup-xorg.sh @@ -4,7 +4,7 @@ # Install additional packages sudo apt-get install xinit xserver-xorg-legacy x11-xserver-utils x11-utils \ - menu mesa-utils xterm jwm x11vnc -y --no-install-recommends + menu mesa-utils xterm jwm x11vnc pkg-config -y --no-install-recommends # Configure sudo sed -i -e "s/console/anybody/" /etc/X11/Xwrapper.config From 433da995b72844f0971b2cfaa60790d42cf6bed2 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Mon, 4 Apr 2022 13:20:07 +0200 Subject: [PATCH 270/428] Ensure .Xauthority exists --- utils/setup-xorg.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/setup-xorg.sh b/utils/setup-xorg.sh index 35526151..4df308e8 100755 --- a/utils/setup-xorg.sh +++ b/utils/setup-xorg.sh @@ -11,6 +11,8 @@ sudo sed -i -e "s/console/anybody/" /etc/X11/Xwrapper.config BUS_ID=$(nvidia-xconfig --query-gpu-info | grep "PCI BusID" | cut -f2- -d: | sed -e 's/^[[:space:]]*//' | head -1) sudo nvidia-xconfig --busid=$BUS_ID -o $DR_DIR/tmp/xorg.conf +touch ~/.Xauthority + sudo tee -a $DR_DIR/tmp/xorg.conf << EOF Section "DRI" From 
6daf51f8114b864a806164eef986fb0d66bc58a1 Mon Sep 17 00:00:00 2001 From: Jochem Lugtenburg Date: Thu, 28 Apr 2022 19:11:14 +0200 Subject: [PATCH 271/428] Update cuda repository key (#88) --- bin/prepare.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/prepare.sh b/bin/prepare.sh index d510b45a..1c20e15e 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -14,7 +14,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ## Patch system sudo apt-get update && sudo apt-mark hold grub-pc && sudo DEBIAN_FRONTEND=noninteractive apt-get -y -o \ DPkg::options::="--force-confdef" -o DPkg::options::="--force-confold" -qq --force-yes upgrade && \ - sudo apt-get install --no-install-recommends -y jq + sudo apt-get install --no-install-recommends -y jq source $DIR/detect.sh echo "Detected cloud type ${CLOUD_NAME}" @@ -63,7 +63,7 @@ fi if [[ "${ARCH}" == "gpu" ]]; then distribution=$(. /etc/os-release;echo $ID$VERSION_ID | sed 's/\.//') - sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/7fa2af80.pub + sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/3bf863cc.pub echo "deb http://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda.list echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/$distribution/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda_learn.list sudo apt update && sudo apt install -y nvidia-driver-470-server cuda-minimal-build-11-4 --no-install-recommends -o Dpkg::Options::="--force-overwrite" From ad8d82c77292576068c0930ef0af904cbb4cb506 Mon Sep 17 00:00:00 2001 From: jochem725 Date: Thu, 28 Apr 2022 19:16:14 +0200 Subject: [PATCH 272/428] Use https to fetch cuda key --- bin/prepare.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/prepare.sh b/bin/prepare.sh index 9585a33d..958a245f 100755 
--- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -12,7 +12,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" ## Patch system sudo apt-get update && sudo apt-mark hold grub-pc && sudo DEBIAN_FRONTEND=noninteractive apt-get -y -o \ DPkg::options::="--force-confdef" -o DPkg::options::="--force-confold" -qq --force-yes upgrade && \ - sudo apt-get install --no-install-recommends -y jq + sudo apt-get install --no-install-recommends -y jq source $DIR/detect.sh echo "Detected cloud type ${CLOUD_NAME}" @@ -60,7 +60,7 @@ fi ## Adding Nvidia Drivers if [[ "${ARCH}" == "gpu" ]]; then - sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub + sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list' sudo bash -c 'echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda_learn.list' sudo bash -c 'apt update && apt install -y nvidia-driver-440-server cuda-minimal-build-10-2 --no-install-recommends -o Dpkg::Options::="--force-overwrite"' From d5f67455498e66b18c1ca95559637377af4238ad Mon Sep 17 00:00:00 2001 From: jochem725 Date: Thu, 28 Apr 2022 19:18:45 +0200 Subject: [PATCH 273/428] Fix distribution --- bin/prepare.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/prepare.sh b/bin/prepare.sh index 98c3674d..b334f87e 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -63,7 +63,7 @@ fi if [[ "${ARCH}" == "gpu" ]]; then distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID | sed 's/\.//') - sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub + sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/3bf863cc.pub echo "deb http://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda.list echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/$distribution/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda_learn.list sudo apt update && sudo apt install -y nvidia-driver-470-server cuda-minimal-build-11-4 --no-install-recommends -o Dpkg::Options::="--force-overwrite" From 430adcb8501f6d7e42a4c45ade6aea6bdb149249 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Wed, 1 Jun 2022 21:21:50 +0200 Subject: [PATCH 274/428] Updating Keys (#91) --- bin/prepare.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/prepare.sh b/bin/prepare.sh index b334f87e..02f18a0d 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -64,6 +64,7 @@ if [[ "${ARCH}" == "gpu" ]]; then distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID | sed 's/\.//') sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/3bf863cc.pub + sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/$distribution/x86_64/7fa2af80.pub echo "deb http://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda.list echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/$distribution/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda_learn.list sudo apt update && sudo apt install -y nvidia-driver-470-server cuda-minimal-build-11-4 --no-install-recommends -o Dpkg::Options::="--force-overwrite" From 3d37b2ea8560f98eab416d8429278458993004d5 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Wed, 1 Jun 2022 21:38:17 +0200 Subject: [PATCH 275/428] Update dependencies.json --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index e28bda49..077ff45f 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "5.0", "containers": { "rl_coach": "5.0.0", - "robomaker": "5.0.1", + "robomaker": "5.0.3", "sagemaker": "5.0.0" } } From 14827dd20d2958449138bd92538fb823f0332eb2 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 16 Jun 2022 12:39:30 +0200 Subject: [PATCH 276/428] Update Dockerfile.sagemaker-gpu --- utils/Dockerfile.sagemaker-gpu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/Dockerfile.sagemaker-gpu b/utils/Dockerfile.sagemaker-gpu index c481021b..a73d3979 100644 --- a/utils/Dockerfile.sagemaker-gpu +++ b/utils/Dockerfile.sagemaker-gpu @@ -1,2 +1,2 @@ -FROM awsdeepracercommunity/deepracer-sagemaker:4.0.6-gpu -ENV CUDA_VISIBLE_DEVICES=0 \ No 
newline at end of file +FROM awsdeepracercommunity/deepracer-sagemaker:5.0.0-gpu +ENV CUDA_VISIBLE_DEVICES=0 From 66cfb61fda154bcee7389cdd2c418fcc57a95b57 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 26 Jun 2022 13:42:54 +0200 Subject: [PATCH 277/428] Extend Troubleshooting --- docs/installation.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/installation.md b/docs/installation.md index 430fc4b4..456942a3 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -166,3 +166,4 @@ Here are some hints for troubleshooting specific issues you may encounter Get messages like "Sagemaker is not running" | Run `docker -ps a` to see if the containers are running or if they stopped due to some errors Check docker errors for specific container | Run `docker logs -f ` Get message "Error response from daemon: could not choose an IP address to advertise since this system has multiple addresses on interface ..." when running `./bin/init.sh -c local -a cpu` | It means you have multiple IP addresses and you need to specify one within `./bin/init.sh`.
If you don't care which one to use, you can get the first one by running ```ifconfig \| grep $(route \| awk '/^default/ {print $8}') -a1 \| grep -o -P '(?<=inet ).*(?= netmask)```.
Edit `./bin/init.sh` and locate line `docker swarm init` and change it to `docker swarm init --advertise-addr `.
Rerun `./bin/init.sh -c local -a cpu` +I don't have any of the `dr-*` commands | Run `source bin/activate.sh`. From 94d056a0717f3e3bed23890208c493aed0188573 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Tue, 2 Aug 2022 11:02:56 +0200 Subject: [PATCH 278/428] Update dependencies.json --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 077ff45f..70824ed2 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "5.0", "containers": { "rl_coach": "5.0.0", - "robomaker": "5.0.3", + "robomaker": "5.0.4", "sagemaker": "5.0.0" } } From 4afada5b0ac9707cb307e516d19544763874ffce Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Tue, 2 Aug 2022 18:51:01 +0200 Subject: [PATCH 279/428] Update dependencies.json --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 70824ed2..f493f736 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "5.0", "containers": { "rl_coach": "5.0.0", - "robomaker": "5.0.4", + "robomaker": "5.0.5", "sagemaker": "5.0.0" } } From 0c802509d8349538f7cfaf95b02279fcd8697992 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 4 Aug 2022 08:12:28 +0200 Subject: [PATCH 280/428] Create a unique path for Evaluation Simtrace (#95) * Fixing evaluation trace path * Fix parameter sequence --- scripts/evaluation/prepare-config.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/evaluation/prepare-config.py b/scripts/evaluation/prepare-config.py index 9bfb33f2..11277060 100755 --- a/scripts/evaluation/prepare-config.py +++ b/scripts/evaluation/prepare-config.py 
@@ -1,6 +1,7 @@ #!/usr/bin/python3 import boto3 +from datetime import datetime import sys import os import time @@ -11,6 +12,8 @@ def str2bool(v): return v.lower() in ("yes", "true", "t", "1") +eval_time = datetime.now().strftime('%Y%m%d%H%M%S') + config = {} config['CAR_COLOR'] = [] config['BODY_SHELL_TYPE'] = [] @@ -35,15 +38,17 @@ def str2bool(v): config['MODEL_S3_PREFIX'].append(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker')) config['MODEL_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) config['SIMTRACE_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) -config['SIMTRACE_S3_PREFIX'].append(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker')) +config['SIMTRACE_S3_PREFIX'].append( + '{}/evaluation-{}'.format(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker'), eval_time) +) # Metrics config['METRICS_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) metrics_prefix = os.environ.get('DR_LOCAL_S3_METRICS_PREFIX', None) if metrics_prefix is not None: - config['METRICS_S3_OBJECT_KEY'].append('{}/EvaluationMetrics-{}.json'.format(metrics_prefix, str(round(time.time())))) + config['METRICS_S3_OBJECT_KEY'].append('{}/EvaluationMetrics-{}.json'.format(metrics_prefix, eval_time)) else: - config['METRICS_S3_OBJECT_KEY'].append('DeepRacer-Metrics/EvaluationMetrics-{}.json'.format(str(round(time.time())))) + config['METRICS_S3_OBJECT_KEY'].append('DeepRacer-Metrics/EvaluationMetrics-{}.json'.format(eval_time)) # MP4 configuration / sav save_mp4 = str2bool(os.environ.get("DR_EVAL_SAVE_MP4", "False")) From ca98771c890f43944aca072410b671ce38f2d0a9 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 4 Aug 2022 08:25:07 +0200 Subject: [PATCH 281/428] Changing approach to Multi-GPU (#92) * Changing approach to Multi-GPU * Increase rl_coach and sagemaker version * Adjust dependencies * Add warning --- bin/activate.sh | 7 
+++++++ defaults/dependencies.json | 2 +- defaults/template-system.env | 3 ++- docker/docker-compose-eval.yml | 2 +- docker/docker-compose-training.yml | 3 ++- docs/multi_gpu.md | 16 ++++------------ utils/Dockerfile.sagemaker-gpu | 2 -- 7 files changed, 17 insertions(+), 18 deletions(-) delete mode 100644 utils/Dockerfile.sagemaker-gpu diff --git a/bin/activate.sh b/bin/activate.sh index c99c94cf..38c9b8ac 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -86,6 +86,13 @@ else export DR_DOCKER_FILE_SEP="-f" fi +# Check if CUDA_VISIBLE_DEVICES is configured. +if [[ -n "${CUDA_VISIBLE_DEVICES}" ]]; then + echo "WARNING: You have CUDA_VISIBLE_DEVICES defined. The will no longer work as" + echo " expected. To control GPU assignment use DR_ROBOMAKER_CUDA_DEVICES" + echo " and DR_SAGEMAKER_CUDA_DEVICES and rlcoach v5.0.1 or later." +fi + # Prepare the docker compose files depending on parameters if [[ "${DR_CLOUD,,}" == "azure" ]]; then diff --git a/defaults/dependencies.json b/defaults/dependencies.json index f493f736..a5174099 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -1,7 +1,7 @@ { "master_version": "5.0", "containers": { - "rl_coach": "5.0.0", + "rl_coach": "5.0.1", "robomaker": "5.0.5", "sagemaker": "5.0.0" } diff --git a/defaults/template-system.env b/defaults/template-system.env index 807f35a4..d1dd4000 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -20,4 +20,5 @@ DR_HOST_X=False DR_WEBVIEWER_PORT=8100 # DR_DISPLAY=:99 # DR_REMOTE_MINIO_URL=http://mynas:9000 -# CUDA_VISIBLE_DEVICES=0 \ No newline at end of file +# DR_ROBOMAKER_CUDA_DEVICES=0 +# DR_SAGEMAKER_CUDA_DEVICES=0 \ No newline at end of file diff --git a/docker/docker-compose-eval.yml b/docker/docker-compose-eval.yml index 585b3949..429ea83a 100644 --- a/docker/docker-compose-eval.yml +++ b/docker/docker-compose-eval.yml @@ -15,7 +15,7 @@ services: ports: - "${DR_ROBOMAKER_EVAL_PORT}:8080" environment: - - CUDA_VISIBLE_DEVICES + - 
CUDA_VISIBLE_DEVICES=${DR_ROBOMAKER_CUDA_DEVICES} - DEBUG_REWARD=${DR_EVAL_DEBUG_REWARD} - WORLD_NAME=${DR_WORLD_NAME} - NUMBER_OF_TRIALS=${DR_NUMBER_OF_EPISODES} diff --git a/docker/docker-compose-training.yml b/docker/docker-compose-training.yml index 40e8b041..a8f150dc 100644 --- a/docker/docker-compose-training.yml +++ b/docker/docker-compose-training.yml @@ -18,6 +18,7 @@ services: - MODEL_S3_BUCKET=${DR_LOCAL_S3_BUCKET} - HYPERPARAMETER_FILE_S3_KEY=${DR_LOCAL_S3_HYPERPARAMETERS_KEY} - MODELMETADATA_FILE_S3_KEY=${DR_LOCAL_S3_MODEL_METADATA_KEY} + - CUDA_VISIBLE_DEVICES=${DR_SAGEMAKER_CUDA_DEVICES} volumes: - "/var/run/docker.sock:/var/run/docker.sock" - "/tmp/sagemaker:/tmp/sagemaker" @@ -36,6 +37,6 @@ services: - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} - ENABLE_GUI=${DR_GUI_ENABLE} - - CUDA_VISIBLE_DEVICES + - CUDA_VISIBLE_DEVICES=${DR_ROBOMAKER_CUDA_DEVICES} - MULTI_CONFIG - RTF_OVERRIDE=${DR_TRAIN_RTF} diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index 927ad104..037a4de5 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -42,20 +42,12 @@ In this case the CUDA device #0 is the GTX 1650 and the CUDA device #1 is the Te ### Selecting Device -#### Robomaker -To control the Robomaker then add the following to `system.env`: +To control the CUDA assignment for Sagemaker abd Robomaker then the following to variables in `system.env`: ``` -CUDA_VISIBLE_DEVICES=1 +DR_ROBOMAKER_CUDA_DEVICES=0 +DR_SAGEMAKER_CUDA_DEVICES=1 ``` -The number is the CUDA number of the GPU you want the Robomakers to use. -#### Sagemaker +The number is the CUDA number of the GPU you want the containers to use. -Sagemaker is more critical to place, but also more complicated, as you will have to build a new Docker image for it to work. - -A template is in `utils/Dockerfile.sagemaker-gpu`. Open it to alter the source image in `FROM`, and adapt `CUDA_VISIBLE_DEVICES`. 
- -Build the image with `docker build -t awsdeepracercommunity/deepracer-sagemaker:gpu-x -f utils/Dockerfile.sagemaker-gpu .` with x being anything you like. - -Update `system.env` to use the new image. diff --git a/utils/Dockerfile.sagemaker-gpu b/utils/Dockerfile.sagemaker-gpu deleted file mode 100644 index a73d3979..00000000 --- a/utils/Dockerfile.sagemaker-gpu +++ /dev/null @@ -1,2 +0,0 @@ -FROM awsdeepracercommunity/deepracer-sagemaker:5.0.0-gpu -ENV CUDA_VISIBLE_DEVICES=0 From 251f43e68063b0529fbf688083e2628f9042d5fd Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 4 Aug 2022 08:29:14 +0200 Subject: [PATCH 282/428] Fix Docker Setup Issue #83 (#93) * Fix #83 * Adding some logging. --- bin/prepare.sh | 9 ++++++++- defaults/docker-daemon.json | 9 +++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 defaults/docker-daemon.json diff --git a/bin/prepare.sh b/bin/prepare.sh index 02f18a0d..e0713585 100755 --- a/bin/prepare.sh +++ b/bin/prepare.sh @@ -85,7 +85,14 @@ then curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list sudo apt-get update && sudo apt-get install -y --no-install-recommends nvidia-docker2 nvidia-container-toolkit nvidia-container-runtime - cat /etc/docker/daemon.json | jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' | sudo tee /etc/docker/daemon.json + if [ -f "/etc/docker/daemon.json" ]; + then + echo "Altering /etc/docker/daemon.json with default-rutime nvidia." + cat /etc/docker/daemon.json | jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' | sudo tee /etc/docker/daemon.json + else + echo "Creating /etc/docker/daemon.json with default-rutime nvidia." 
+ sudo cp $DIR/../defaults/docker-daemon.json /etc/docker/daemon.json + fi fi sudo systemctl enable docker sudo systemctl restart docker diff --git a/defaults/docker-daemon.json b/defaults/docker-daemon.json new file mode 100644 index 00000000..c0fc2e4b --- /dev/null +++ b/defaults/docker-daemon.json @@ -0,0 +1,9 @@ +{ + "runtimes": { + "nvidia": { + "path": "nvidia-container-runtime", + "runtimeArgs": [] + } + }, + "default-runtime": "nvidia" +} \ No newline at end of file From 1421f68b485165a9ca157784dfb11afc0927720a Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Thu, 4 Aug 2022 20:00:22 +0200 Subject: [PATCH 283/428] Upload 'snapshot' to local S3 --- scripts/upload/upload-model.sh | 63 +++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index ef4a688f..6771b848 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -1,15 +1,16 @@ #!/bin/bash usage(){ - echo "Usage: $0 [-f] [-w] [-d] [-b] [-c ] [-p ]" + echo "Usage: $0 [-f] [-w] [-d] [-b] [-1] [-i] [-I] [-L] [-c ] [-p ]" echo " -f Force upload. No confirmation question." echo " -w Wipes the target AWS DeepRacer model structure before upload." echo " -d Dry-Run mode. Does not perform any write or delete operatios on target." echo " -b Uploads best checkpoint. Default is last checkpoint." - echo " -p model Uploads model in specified S3 prefix." + echo " -p model Uploads model from specified S3 prefix." 
echo " -i Import model with the upload name" echo " -I name Import model with a specific name" echo " -1 Increment upload name with 1 (dr-increment-upload-model)" + echo " -L Upload model to the local S3 bucket" exit 1 } @@ -20,7 +21,7 @@ function ctrl_c() { exit 1 } -while getopts ":fwdhbp:c:1iI:" opt; do +while getopts ":fwdhbp:c:1iI:L" opt; do case $opt in b) OPT_CHECKPOINT="Best" ;; @@ -38,6 +39,8 @@ i) OPT_IMPORT="$DR_UPLOAD_S3_PREFIX" ;; I) OPT_IMPORT="$OPTARG" ;; +L) OPT_LOCAL="Local" +;; 1) OPT_INCREMENT="Yes" ;; h) usage @@ -59,8 +62,38 @@ then OPT_IMPORT="$DR_UPLOAD_S3_PREFIX" fi -export TARGET_S3_BUCKET=${DR_UPLOAD_S3_BUCKET} -export TARGET_S3_PREFIX=${DR_UPLOAD_S3_PREFIX} +SOURCE_S3_BUCKET=${DR_LOCAL_S3_BUCKET} +if [[ -n "${OPT_PREFIX}" ]]; +then + SOURCE_S3_MODEL_PREFIX=${OPT_PREFIX} +else + SOURCE_S3_MODEL_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} +fi +SOURCE_S3_CONFIG=${DR_LOCAL_S3_CUSTOM_FILES_PREFIX} +SOURCE_S3_REWARD=${DR_LOCAL_S3_REWARD_KEY} +SOURCE_S3_METRICS="${DR_LOCAL_S3_METRICS_PREFIX}/TrainingMetrics.json" + +TARGET_S3_PREFIX=${DR_UPLOAD_S3_PREFIX} + +if [[ -z "${OPT_LOCAL}" ]]; +then + TARGET_S3_BUCKET=${DR_UPLOAD_S3_BUCKET} + UPLOAD_PROFILE=${DR_UPLOAD_PROFILE} +else + if [[ -n "${OPT_IMPORT}" ]]; + then + echo "Combination of -i and -L is not permitted." + exit 1 + fi + if [[ "${DR_UPLOAD_S3_PREFIX}" = "${SOURCE_S3_MODEL_PREFIX}" ]]; + then + echo "Target equals source. Exiting." 
+ exit 1 + fi + + TARGET_S3_BUCKET=${DR_LOCAL_S3_BUCKET} + UPLOAD_PROFILE=${DR_LOCAL_PROFILE_ENDPOINT_URL} +fi if [[ -z "${DR_UPLOAD_S3_BUCKET}" ]]; then @@ -74,16 +107,6 @@ then exit 1 fi -SOURCE_S3_BUCKET=${DR_LOCAL_S3_BUCKET} -if [[ -n "${OPT_PREFIX}" ]]; -then - SOURCE_S3_MODEL_PREFIX=${OPT_PREFIX} -else - SOURCE_S3_MODEL_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} -fi -SOURCE_S3_CONFIG=${DR_LOCAL_S3_CUSTOM_FILES_PREFIX} -SOURCE_S3_REWARD=${DR_LOCAL_S3_REWARD_KEY} -SOURCE_S3_METRICS="${DR_LOCAL_S3_METRICS_PREFIX}/TrainingMetrics.json" export WORK_DIR=${DR_DIR}/tmp/upload/ mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}model ${WORK_DIR}ip @@ -181,11 +204,11 @@ fi # echo "" > ${WORK_DIR}model/.ready cd ${WORK_DIR} echo ${CHECKPOINT_JSON} > ${WORK_DIR}model/deepracer_checkpoints.json -aws ${DR_UPLOAD_PROFILE} s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ ${OPT_DRYRUN} ${OPT_WIPE} -aws ${DR_UPLOAD_PROFILE} s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} ${OPT_DRYRUN} -aws ${DR_UPLOAD_PROFILE} s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} -aws ${DR_UPLOAD_PROFILE} s3 cp ${PARAMS_FILE} ${TARGET_PARAMS_FILE_S3_KEY} ${OPT_DRYRUN} -aws ${DR_UPLOAD_PROFILE} s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} ${OPT_DRYRUN} +aws ${UPLOAD_PROFILE} s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ ${OPT_DRYRUN} ${OPT_WIPE} +aws ${UPLOAD_PROFILE} s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} ${OPT_DRYRUN} +aws ${UPLOAD_PROFILE} s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} +aws ${UPLOAD_PROFILE} s3 cp ${PARAMS_FILE} ${TARGET_PARAMS_FILE_S3_KEY} ${OPT_DRYRUN} +aws ${UPLOAD_PROFILE} s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} ${OPT_DRYRUN} # After upload trigger the import if [[ -n "${OPT_IMPORT}" ]]; From 9e624715c47c53855219555fb33bb03f2f439728 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Fri, 5 Aug 2022 10:02:48 +0200 Subject: [PATCH 
284/428] Fixing error message when using -1 flag with -L --- scripts/upload/upload-model.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/upload/upload-model.sh b/scripts/upload/upload-model.sh index 6771b848..d8365080 100755 --- a/scripts/upload/upload-model.sh +++ b/scripts/upload/upload-model.sh @@ -59,7 +59,10 @@ fi if [[ -n "${OPT_INCREMENT}" ]]; then source $DR_DIR/scripts/upload/increment.sh ${OPT_FORCE} - OPT_IMPORT="$DR_UPLOAD_S3_PREFIX" + if [[ -n ${OPT_IMPORT} ]]; + then + OPT_IMPORT="$DR_UPLOAD_S3_PREFIX" + fi fi SOURCE_S3_BUCKET=${DR_LOCAL_S3_BUCKET} From e970fafe1965e70f937d68ab35b52fe1761b1886 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 22 Sep 2022 19:29:36 +0200 Subject: [PATCH 285/428] Update dependencies.json --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index f493f736..8200799f 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "5.0", "containers": { "rl_coach": "5.0.0", - "robomaker": "5.0.5", + "robomaker": "5.0.6", "sagemaker": "5.0.0" } } From d77007bf984d18a6f777026889bc4a5d78e07c3d Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Thu, 22 Sep 2022 20:13:14 +0200 Subject: [PATCH 286/428] Automatically add minio credentials --- bin/init.sh | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/bin/init.sh b/bin/init.sh index e84d3cb2..a84842d7 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -103,7 +103,14 @@ else AWS_REGION="us-east-1" sed -i "s//minio/g" $INSTALL_DIR/system.env sed -i "s//not-defined/g" $INSTALL_DIR/system.env - echo "Please run 'aws configure --profile minio' to set the credentials" + + aws configure --profile minio get aws_access_key_id > /dev/null 2> /dev/null + + if [[ "$?" 
-ne 0 ]]; then + echo "Creating default minio credentials in AWS profile 'minio'" + aws configure --profile minio set aws_access_key_id $(openssl rand -base64 12) + aws configure --profile minio set aws_secret_access_key $(openssl rand -base64 12) + fi fi sed -i "s//to-be-defined/g" $INSTALL_DIR/system.env sed -i "s//$OPT_CLOUD/g" $INSTALL_DIR/system.env @@ -154,10 +161,10 @@ docker pull awsdeepracercommunity/deepracer-sagemaker:$SAGEMAKER_VERSION # create the network sagemaker-local if it doesn't exit SAGEMAKER_NW='sagemaker-local' -docker swarm init +docker swarm init SWARM_NODE=$(docker node inspect self | jq .[0].ID -r) -docker node update --label-add Sagemaker=true $SWARM_NODE -docker node update --label-add Robomaker=true $SWARM_NODE +docker node update --label-add Sagemaker=true $SWARM_NODE > /dev/null 2> /dev/null +docker node update --label-add Robomaker=true $SWARM_NODE > /dev/null 2> /dev/null docker network ls | grep -q $SAGEMAKER_NW if [ $? -ne 0 ] then From 7c0d9d7bb5ef049e6e4e0be3c089f31938a8c043 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Thu, 22 Sep 2022 20:15:14 +0200 Subject: [PATCH 287/428] Adding a config --- bin/init.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/init.sh b/bin/init.sh index a84842d7..416d303c 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -110,6 +110,7 @@ else echo "Creating default minio credentials in AWS profile 'minio'" aws configure --profile minio set aws_access_key_id $(openssl rand -base64 12) aws configure --profile minio set aws_secret_access_key $(openssl rand -base64 12) + aws configure --profile minio set region us-east-1 fi fi sed -i "s//to-be-defined/g" $INSTALL_DIR/system.env From fd23668e804fdee22649e2bb24a0a179b91d315e Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Thu, 22 Sep 2022 21:21:53 +0200 Subject: [PATCH 288/428] Tweaking code --- bin/init.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/bin/init.sh b/bin/init.sh index 416d303c..7269ae16 100755 
--- a/bin/init.sh +++ b/bin/init.sh @@ -101,16 +101,17 @@ elif [[ "${OPT_CLOUD}" == "remote" ]]; then echo "Please define DR_REMOTE_MINIO_URL in system.env to point to remote minio instance." else AWS_REGION="us-east-1" - sed -i "s//minio/g" $INSTALL_DIR/system.env + MINIO_PROFILE="minio" + sed -i "s//$MINIO_PROFILE/g" $INSTALL_DIR/system.env sed -i "s//not-defined/g" $INSTALL_DIR/system.env - aws configure --profile minio get aws_access_key_id > /dev/null 2> /dev/null + aws configure --profile $MINIO_PROFILE get aws_access_key_id > /dev/null 2> /dev/null if [[ "$?" -ne 0 ]]; then - echo "Creating default minio credentials in AWS profile 'minio'" - aws configure --profile minio set aws_access_key_id $(openssl rand -base64 12) - aws configure --profile minio set aws_secret_access_key $(openssl rand -base64 12) - aws configure --profile minio set region us-east-1 + echo "Creating default minio credentials in AWS profile '$MINIO_PROFILE'" + aws configure --profile $MINIO_PROFILE set aws_access_key_id $(openssl rand -base64 12) + aws configure --profile $MINIO_PROFILE set aws_secret_access_key $(openssl rand -base64 12) + aws configure --profile $MINIO_PROFILE set region us-east-1 fi fi sed -i "s//to-be-defined/g" $INSTALL_DIR/system.env From 0c5a2f554ebf7885ba7a021bb32bf657b4be1111 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 24 Sep 2022 18:52:12 +0200 Subject: [PATCH 289/428] Create dr-aws --- bin/activate.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/activate.sh b/bin/activate.sh index 38c9b8ac..4050ffea 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -196,6 +196,9 @@ if ! verlte $DEPENDENCY_VERSION $COACH_VER; then echo "WARNING: Incompatible version of Deepracer-for-Cloud Coach. Expected >$DEPENDENCY_VERSION. Got $COACH_VER." 
fi +## Create a dr-aws command +alias dr-aws='aws $DR_LOCAL_PROFILE_ENDPOINT_URL' + source $SCRIPT_DIR/scripts_wrapper.sh function dr-update { From b32a9699f11c8615cf105517ba39ac704285a6d5 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sat, 24 Sep 2022 18:53:57 +0200 Subject: [PATCH 290/428] Rename to dr-local-aws --- bin/activate.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 4050ffea..94f59908 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -196,8 +196,8 @@ if ! verlte $DEPENDENCY_VERSION $COACH_VER; then echo "WARNING: Incompatible version of Deepracer-for-Cloud Coach. Expected >$DEPENDENCY_VERSION. Got $COACH_VER." fi -## Create a dr-aws command -alias dr-aws='aws $DR_LOCAL_PROFILE_ENDPOINT_URL' +## Create a dr-local-aws command +alias dr-local-aws='aws $DR_LOCAL_PROFILE_ENDPOINT_URL' source $SCRIPT_DIR/scripts_wrapper.sh From 7f92532256ee383f8bc1e9e088c6989a28733a1a Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 1 Oct 2022 14:41:06 +0200 Subject: [PATCH 291/428] Update dependencies.json --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index ce82bf49..4c099ebc 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "5.0", "containers": { "rl_coach": "5.0.6", - "robomaker": "5.0.6", + "robomaker": "5.0.7", "sagemaker": "5.0.0" } } From 06f9ebd019ea97ef3986a498892adffecaa3e637 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sun, 2 Oct 2022 16:10:43 +0200 Subject: [PATCH 292/428] Isolate Robomaker in Evalulation --- docker/docker-compose-eval.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/docker-compose-eval.yml b/docker/docker-compose-eval.yml index 585b3949..671dbc27 100644 --- a/docker/docker-compose-eval.yml +++ b/docker/docker-compose-eval.yml @@ -28,3 
+28,4 @@ services: - ENABLE_GUI=${DR_GUI_ENABLE} - ROLLOUT_IDX=0 - RTF_OVERRIDE=${DR_EVAL_RTF} + - ROS_MASTER_URI=http://localhost:11311/ From 680192c08db1bdfa77c4f6f1ce3230559fcd8c06 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Mon, 3 Oct 2022 16:30:16 +0200 Subject: [PATCH 293/428] Option to upload to local S3 --- scripts/upload/upload-car.sh | 56 ++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/scripts/upload/upload-car.sh b/scripts/upload/upload-car.sh index 2e575f73..e40a7a96 100755 --- a/scripts/upload/upload-car.sh +++ b/scripts/upload/upload-car.sh @@ -1,5 +1,34 @@ #!/bin/bash +usage(){ + echo "Usage: $0 [-L] [-f]" + echo " -f Force. Do not ask for confirmation." + echo " -L Upload model to the local S3 bucket." + exit 1 +} + +trap ctrl_c INT + +function ctrl_c() { + echo "Requested to stop." + exit 1 +} + + +while getopts ":Lf" opt; do +case $opt in +L) OPT_LOCAL="Local" +;; +f) OPT_FORCE="force" +;; +h) usage +;; +\?) echo "Invalid option -$OPTARG" >&2 +usage +;; +esac +done + # This script creates the tar.gz file necessary to operate inside a deepracer physical car # The file is created directly from within the sagemaker container, using the most recent checkpoint @@ -10,7 +39,7 @@ then for CONTAINER in $SAGEMAKER_CONTAINERS; do CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER) CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $1') - echo $CONTAINER_NAME + echo "Found Sagemaker container: $CONTAINER_NAME" done fi @@ -25,6 +54,27 @@ cd $DR_DIR/tmp/car_upload/model #create a tar.gz file containing all of these files tar -czvf carfile.tar.gz * -#upload to s3 -aws ${DR_UPLOAD_PROFILE} s3 cp carfile.tar.gz s3://${DR_UPLOAD_S3_BUCKET}/${DR_UPLOAD_S3_PREFIX}/carfile.tar.gz +# Upload files +if [[ -z "${OPT_FORCE}" ]]; +then + if [[ -n "${OPT_LOCAL}" ]]; + then + echo "Ready to upload car model to local s3://${DR_LOCAL_S3_BUCKET}/${DR_UPLOAD_S3_PREFIX}." 
+ else + echo "Ready to upload car model to remote s3://${DR_UPLOAD_S3_BUCKET}/${DR_UPLOAD_S3_PREFIX}." + fi + read -r -p "Are you sure? [y/N] " response + if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] + then + echo "Aborting." + exit 1 + fi +fi +#upload to s3 +if [[ -n "${OPT_LOCAL}" ]]; +then + aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 cp carfile.tar.gz s3://${DR_LOCAL_S3_BUCKET}/${DR_UPLOAD_S3_PREFIX}/carfile.tar.gz +else + aws ${DR_UPLOAD_PROFILE} s3 cp carfile.tar.gz s3://${DR_UPLOAD_S3_BUCKET}/${DR_UPLOAD_S3_PREFIX}/carfile.tar.gz +fi \ No newline at end of file From 44a53e2c3b0e6272dc997c7f62c8c0d6f6dfcecb Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Wed, 5 Oct 2022 21:13:13 +0200 Subject: [PATCH 294/428] Obstacle Type --- defaults/template-run.env | 1 + defaults/template-worker.env | 1 + scripts/evaluation/prepare-config.py | 1 + scripts/training/prepare-config.py | 3 +++ 4 files changed, 6 insertions(+) diff --git a/defaults/template-run.env b/defaults/template-run.env index 90959b5a..9bc1aa7a 100644 --- a/defaults/template-run.env +++ b/defaults/template-run.env @@ -45,6 +45,7 @@ DR_OA_NUMBER_OF_OBSTACLES=6 DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES=2.0 DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS=False DR_OA_IS_OBSTACLE_BOT_CAR=False +DR_OA_OBSTACLE_TYPE=box_obstacle DR_OA_OBJECT_POSITIONS= DR_H2B_IS_LANE_CHANGE=False DR_H2B_LOWER_LANE_CHANGE_TIME=3.0 diff --git a/defaults/template-worker.env b/defaults/template-worker.env index 11aabd5b..863ae773 100644 --- a/defaults/template-worker.env +++ b/defaults/template-worker.env @@ -10,6 +10,7 @@ DR_OA_NUMBER_OF_OBSTACLES=6 DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES=2.0 DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS=False DR_OA_IS_OBSTACLE_BOT_CAR=False +DR_OA_OBSTACLE_TYPE=box_obstacle DR_OA_OBJECT_POSITIONS= DR_H2B_IS_LANE_CHANGE=False DR_H2B_LOWER_LANE_CHANGE_TIME=3.0 diff --git a/scripts/evaluation/prepare-config.py b/scripts/evaluation/prepare-config.py index 11277060..a4215e72 100755 --- a/scripts/evaluation/prepare-config.py +++ 
b/scripts/evaluation/prepare-config.py @@ -85,6 +85,7 @@ def str2bool(v): config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0') config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True') config['IS_OBSTACLE_BOT_CAR'] = os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false') + config['OBSTACLE_TYPE'] = os.environ.get('DR_OA_OBSTACLE_TYPE', 'box_obstacle') object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") if object_position_str != "": diff --git a/scripts/training/prepare-config.py b/scripts/training/prepare-config.py index 087176d2..cc943796 100755 --- a/scripts/training/prepare-config.py +++ b/scripts/training/prepare-config.py @@ -54,6 +54,7 @@ config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0') config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True') config['IS_OBSTACLE_BOT_CAR'] = os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false') + config['OBSTACLE_TYPE'] = os.environ.get('DR_OA_OBSTACLE_TYPE', 'box_obstacle') object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") if object_position_str != "": @@ -155,6 +156,8 @@ config.update({'MIN_DISTANCE_BETWEEN_OBSTACLES': os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES')}) config.update({'RANDOMIZE_OBSTACLE_LOCATIONS': os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS')}) config.update({'IS_OBSTACLE_BOT_CAR': os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR')}) + config.update({'OBSTACLE_TYPE': os.environ.get('DR_OA_OBSTACLE_TYPE', 'box_obstacle')}) + object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") if object_position_str != "": object_positions = [] From a2db4df0a624ace87b89afcc7ff27f35fe9751fe Mon Sep 17 00:00:00 2001 From: Mayur Madnani <10260496+mayurmadnani@users.noreply.github.com> Date: Sun, 9 Oct 2022 10:49:54 +0530 Subject: [PATCH 295/428] working tag before single drive xl 
implementation --- docker/docker-compose-local.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/docker-compose-local.yml b/docker/docker-compose-local.yml index f2eea31b..eff18914 100644 --- a/docker/docker-compose-local.yml +++ b/docker/docker-compose-local.yml @@ -8,7 +8,7 @@ networks: services: minio: - image: minio/minio + image: minio/minio:RELEASE.2022-05-26T05-48-41Z ports: - "9000:9000" - "9001:9001" From 9e54afc00547f107274656ca9e8cc1018e0e5454 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Thu, 13 Oct 2022 21:58:39 +0200 Subject: [PATCH 296/428] Update import-model.py Allow profile to be empty (useful when using EC2 instance with role assigned) --- scripts/upload/import-model.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/upload/import-model.py b/scripts/upload/import-model.py index 9b3e038a..c61e9654 100755 --- a/scripts/upload/import-model.py +++ b/scripts/upload/import-model.py @@ -27,8 +27,11 @@ print("You must configure an IAM role with access to the S3 bucket in variable DR_UPLOAD_S3_ROLE ") exit(1) -session = boto3.session.Session(region_name='us-east-1', profile_name=aws_profile) - +if len(aws_profile) > 1: + session = boto3.session.Session(region_name='us-east-1', profile_name=aws_profile) +else: + session = boto3.session.Session(region_name='us-east-1') + try: dr = session.client('deepracer') except UnknownServiceError: @@ -55,4 +58,4 @@ if response['ResponseMetadata']['HTTPStatusCode'] == 200: print('Model importing as {}'.format(response['ModelArn'])) else: - sys.exit('Error occcured when uploading') \ No newline at end of file + sys.exit('Error occcured when uploading') From e984aa9c4288bd8db7626d9ad30b3f80a74869ab Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sun, 23 Oct 2022 15:24:50 +0200 Subject: [PATCH 297/428] Abort if path has spaces --- bin/init.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git 
a/bin/init.sh b/bin/init.sh index 7269ae16..562625e5 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -10,6 +10,11 @@ function ctrl_c() { SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" INSTALL_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." >/dev/null 2>&1 && pwd )" +if [[ "$INSTALL_DIR" == *\ * ]]; then + echo "Deepracer-for-Cloud cannot be installed in path with spaces. Exiting." + exit 1 +fi + OPT_ARCH="gpu" OPT_CLOUD="" From 4ea80b550502858e3b8d99f94fb5bc734d3439ea Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Sun, 23 Oct 2022 22:32:08 +0200 Subject: [PATCH 298/428] Simplify dependencies --- scripts/upload/import-model.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/upload/import-model.py b/scripts/upload/import-model.py index c61e9654..e5ff076f 100755 --- a/scripts/upload/import-model.py +++ b/scripts/upload/import-model.py @@ -12,6 +12,7 @@ try: import pandas as pd import deepracer + from deepracer import boto3_enhancer except ImportError: print("You need to install pandas and deepracer-utils to use the import function.") exit(1) @@ -31,9 +32,10 @@ session = boto3.session.Session(region_name='us-east-1', profile_name=aws_profile) else: session = boto3.session.Session(region_name='us-east-1') - + +global dr try: - dr = session.client('deepracer') + dr = boto3_enhancer.deepracer_client(session=session) except UnknownServiceError: print ("Boto3 service 'deepracer' is not installed. 
Cannot import model.") print ("Install with 'pip install deepracer-utils' and 'python -m deepracer install-cli --force'") From 570da13d8f945033c87e54fd3e90fd4b5d969e11 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Thu, 3 Nov 2022 22:01:37 +0100 Subject: [PATCH 299/428] Workaround for minio --- bin/activate.sh | 8 ++++++++ defaults/template-system.env | 1 + docker/docker-compose-local.yml | 2 +- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/bin/activate.sh b/bin/activate.sh index 94f59908..2310a67a 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -93,6 +93,14 @@ if [[ -n "${CUDA_VISIBLE_DEVICES}" ]]; then echo " and DR_SAGEMAKER_CUDA_DEVICES and rlcoach v5.0.1 or later." fi +# Check if CUDA_VISIBLE_DEVICES is configured. +if [[ -z "${DR_MINIO_IMAGE}" ]]; then + echo "WARNING: You have not configured DR_MINIO_IMAGE in system.env." + echo " System will default to tag RELEASE.2022-10-24T18-35-07Z" + export DR_MINIO_IMAGE="RELEASE.2022-10-24T18-35-07Z" +fi + + # Prepare the docker compose files depending on parameters if [[ "${DR_CLOUD,,}" == "azure" ]]; then diff --git a/defaults/template-system.env b/defaults/template-system.env index d1dd4000..37bed17a 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -10,6 +10,7 @@ DR_KINESIS_STREAM_NAME= DR_KINESIS_STREAM_ENABLE=True DR_SAGEMAKER_IMAGE= DR_ROBOMAKER_IMAGE= +DR_MINIO_IMAGE=latest DR_ANALYSIS_IMAGE=cpu DR_COACH_IMAGE= DR_WORKERS=1 diff --git a/docker/docker-compose-local.yml b/docker/docker-compose-local.yml index eff18914..5d29115a 100644 --- a/docker/docker-compose-local.yml +++ b/docker/docker-compose-local.yml @@ -8,7 +8,7 @@ networks: services: minio: - image: minio/minio:RELEASE.2022-05-26T05-48-41Z + image: minio/minio:${DR_MINIO_IMAGE} ports: - "9000:9000" - "9001:9001" From d883a5d4af9bb6d9a54df17c839b147ccaf71e03 Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Thu, 3 Nov 2022 22:05:46 +0100 Subject: [PATCH 300/428] Exclude non-local installs --- 
bin/activate.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bin/activate.sh b/bin/activate.sh index 2310a67a..5d92565e 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -94,13 +94,12 @@ if [[ -n "${CUDA_VISIBLE_DEVICES}" ]]; then fi # Check if CUDA_VISIBLE_DEVICES is configured. -if [[ -z "${DR_MINIO_IMAGE}" ]]; then +if [ "${DR_CLOUD,,}" == "local" ] && [ -z "${DR_MINIO_IMAGE}" ]; then echo "WARNING: You have not configured DR_MINIO_IMAGE in system.env." echo " System will default to tag RELEASE.2022-10-24T18-35-07Z" export DR_MINIO_IMAGE="RELEASE.2022-10-24T18-35-07Z" fi - # Prepare the docker compose files depending on parameters if [[ "${DR_CLOUD,,}" == "azure" ]]; then From 008d6e89dee1e506eda08f91634507d83d03b1db Mon Sep 17 00:00:00 2001 From: Lars Ludvigsen Date: Thu, 3 Nov 2022 22:09:45 +0100 Subject: [PATCH 301/428] Added doc --- docs/reference.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/reference.md b/docs/reference.md index 479aeda0..d51d5373 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -58,6 +58,7 @@ The scripts assume that two files `system.env` containing constant configuration | `DR_KINESIS_STREAM_ENABLE` | Enable or disable 'Kinesis Stream', True both publishes to a AWS KVS stream (if name not None), and to the topic `/racecar/deepracer/kvs_stream`. Leave True if you want to watch the car racing. | | `DR_SAGEMAKER_IMAGE` | Determines which sagemaker image will be used for training.| | `DR_ROBOMAKER_IMAGE` | Determines which robomaker image will be used for training or evaluation.| +| `DR_MINIO_IMAGE` | Determines which Minio image will be used. | | `DR_COACH_IMAGE` | Determines which coach image will be used for training.| | `DR_WORKERS` | Number of Robomaker workers to be used for training. 
See additional documentation for more information about this feature.| | `DR_ROBOMAKER_MOUNT_LOGS` | TODO.| From 03ea758bd7ac7710121e3c5014c836c6ab96ecb7 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 6 Nov 2022 20:02:59 +0100 Subject: [PATCH 302/428] Robomaker 5.0.8 as default --- defaults/dependencies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/dependencies.json b/defaults/dependencies.json index 4c099ebc..d9d5a45b 100644 --- a/defaults/dependencies.json +++ b/defaults/dependencies.json @@ -2,7 +2,7 @@ "master_version": "5.0", "containers": { "rl_coach": "5.0.6", - "robomaker": "5.0.7", + "robomaker": "5.0.8", "sagemaker": "5.0.0" } } From a0fc74fa3b5a3725bcd4a95b5aea2732ff8b58f8 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 6 Nov 2022 20:05:03 +0100 Subject: [PATCH 303/428] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ab5e9771..de196d31 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # DeepRacer-For-Cloud -Provides a quick and easy way to get up and running with a DeepRacer training environment in Azure or AWS, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing). +Provides a quick and easy way to get up and running with a DeepRacer training environment using a cloud virtual machine or a local compter, such [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing) or the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu). + +DRfC runs on Ubuntu 20.04 or 22.04. 
GPU acceleration requires a NVIDIA GPU, preferrably with more than 8GB of VRAM. ## Introduction From 029aa666399180c812d87276d47945bd09206a5b Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sun, 6 Nov 2022 22:38:48 +0100 Subject: [PATCH 304/428] Update reference --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index de196d31..43143dc4 100644 --- a/README.md +++ b/README.md @@ -38,5 +38,5 @@ Full documentation can be found on the [Deepracer-for-Cloud GitHub Pages](https: ## Support -* For general support it is suggested to join the [AWS DeepRacing Community](https://deepracing.io/). The Community Slack has a channel #dr-drfc-setup where the community provides active support. +* For general support it is suggested to join the [AWS DeepRacing Community](https://deepracing.io/). The Community Slack has a channel #dr-training-local where the community provides active support. * Create a GitHub issue if you find an actual code issue, or where updates to documentation would be required. 
From 9fea338c8ce42d4aeefc155c3c5bf5d247753fb0 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen <59617571+larsll@users.noreply.github.com> Date: Sat, 3 Dec 2022 20:35:41 +0100 Subject: [PATCH 305/428] Enable externally mounted bundle in Robomaker (#105) * Enable externally mounted bundle in Robomaker * Enable external scripts mounted into container --- bin/activate.sh | 6 ++++++ bin/init.sh | 2 +- defaults/template-system.env | 1 + docker/docker-compose-simapp.yml | 9 +++++++++ docs/reference.md | 3 ++- 5 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 docker/docker-compose-simapp.yml diff --git a/bin/activate.sh b/bin/activate.sh index 5d92565e..ff489105 100644 --- a/bin/activate.sh +++ b/bin/activate.sh @@ -151,6 +151,12 @@ if [[ "${DR_CLOUD_WATCH_ENABLE,,}" == "true" ]]; then DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-cwlog.yml" fi +# Enable local simapp mount +if [[ -d "${DR_ROBOMAKER_MOUNT_SIMAPP_DIR,,}" ]]; then + DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-simapp.yml" + DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-simapp.yml" +fi + ## Check if we have an AWS IAM assumed role, or if we need to set specific credentials. 
if [ "${DR_CLOUD,,}" == "aws" ] && [ $(aws --output json sts get-caller-identity 2> /dev/null | jq '.Arn' | awk /assumed-role/ | wc -l ) -gt 0 ]; then diff --git a/bin/init.sh b/bin/init.sh index 562625e5..317113ff 100755 --- a/bin/init.sh +++ b/bin/init.sh @@ -70,7 +70,7 @@ cd $INSTALL_DIR # create directory structure for docker volumes mkdir -p $INSTALL_DIR/data $INSTALL_DIR/data/minio $INSTALL_DIR/data/minio/bucket -mkdir -p $INSTALL_DIR/data/logs $INSTALL_DIR/data/analysis $INSTALL_DIR/tmp +mkdir -p $INSTALL_DIR/data/logs $INSTALL_DIR/data/analysis $INSTALL_DIR/data/scripts $INSTALL_DIR/tmp sudo mkdir -p /tmp/sagemaker sudo chmod -R g+w /tmp/sagemaker diff --git a/defaults/template-system.env b/defaults/template-system.env index 37bed17a..5aa3c45e 100644 --- a/defaults/template-system.env +++ b/defaults/template-system.env @@ -15,6 +15,7 @@ DR_ANALYSIS_IMAGE=cpu DR_COACH_IMAGE= DR_WORKERS=1 DR_ROBOMAKER_MOUNT_LOGS=False +# DR_ROBOMAKER_MOUNT_SIMAPP_DIR= DR_CLOUD_WATCH_ENABLE=False DR_DOCKER_STYLE=swarm DR_HOST_X=False diff --git a/docker/docker-compose-simapp.yml b/docker/docker-compose-simapp.yml new file mode 100644 index 00000000..f19f49d1 --- /dev/null +++ b/docker/docker-compose-simapp.yml @@ -0,0 +1,9 @@ +version: '3.7' + +services: + robomaker: + volumes: + - '${DR_ROBOMAKER_MOUNT_SIMAPP_DIR}/sagemaker_rl_agent:/opt/install/sagemaker_rl_agent' + - '${DR_ROBOMAKER_MOUNT_SIMAPP_DIR}/deepracer_simulation_environment:/opt/install/deepracer_simulation_environment' + - '${DR_DIR}/data/logs:/logs' + - '${DR_DIR}/data/scripts:/scripts' \ No newline at end of file diff --git a/docs/reference.md b/docs/reference.md index d51d5373..69c7b89e 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -61,7 +61,8 @@ The scripts assume that two files `system.env` containing constant configuration | `DR_MINIO_IMAGE` | Determines which Minio image will be used. 
| | `DR_COACH_IMAGE` | Determines which coach image will be used for training.| | `DR_WORKERS` | Number of Robomaker workers to be used for training. See additional documentation for more information about this feature.| -| `DR_ROBOMAKER_MOUNT_LOGS` | TODO.| +| `DR_ROBOMAKER_MOUNT_LOGS` | True to get logs mounted to `$DR_DIR/data/logs/robomaker/$DR_LOCAL_S3_MODEL_PREFIX`| +| `DR_ROBOMAKER_MOUNT_SIMAPP_DIR` | Path to the altered Robomaker bundle, e.g. `/home/ubuntu/deepracer-simapp/bundle`.| | `DR_CLOUD_WATCH_ENABLE` | Send log files to AWS CloudWatch.| | `DR_DOCKER_STYLE` | Valid Options are `Swarm` and `Compose`. Use Compose for openGL optimized containers.| | `DR_HOST_X` | Uses the host X-windows server, rather than starting one inside of Robomaker. Required for OpenGL images.| From e77a4a5689e6a568bc81e37d9d43379711a0f30c Mon Sep 17 00:00:00 2001 From: anjrew Date: Fri, 30 Dec 2022 20:01:27 +0100 Subject: [PATCH 306/428] Viewer: Add 'Select' options for car, camera type and quality. (#110) * Viewer: Add 'Select' options for car, camera type and quality. - Add HTML template for easier future development. - Add banner when user tries to view more than 6 workers cars at the same time. 
(Browser mas max 6) - Synchronize state between url and select options * From PR comments:\n - Add a field to change the width of the picture\n - Reduce the amount of whitespace --- .gitignore | 4 +- scripts/viewer/index.template.html | 402 +++++++++++++++++++++++++++++ scripts/viewer/start.sh | 43 ++- 3 files changed, 439 insertions(+), 10 deletions(-) create mode 100755 scripts/viewer/index.template.html diff --git a/.gitignore b/.gitignore index 41e3337d..661a09d7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,10 +6,12 @@ recording/ recording /*.env /*.bak +/*.tar /*.json DONE data/ tmp/ autorun.s3url nohup.out -start.sh \ No newline at end of file +start.sh +_ \ No newline at end of file diff --git a/scripts/viewer/index.template.html b/scripts/viewer/index.template.html new file mode 100755 index 00000000..074cab78 --- /dev/null +++ b/scripts/viewer/index.template.html @@ -0,0 +1,402 @@ + + + + + DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC + + + + + +
+ +
+
+ +
+ + + + + \ No newline at end of file diff --git a/scripts/viewer/start.sh b/scripts/viewer/start.sh index 11872988..6abd0e40 100755 --- a/scripts/viewer/start.sh +++ b/scripts/viewer/start.sh @@ -1,12 +1,13 @@ #!/usr/bin/env bash usage(){ - echo "Usage: $0 [-t topic] [-w width] [-h height] [-q quality] -b [browser-command]" + echo "Usage: $0 [-t topic] [-w width] [-h height] [-q quality] -b [browser-command] -p [port]" echo " -w Width of individual stream." echo " -h Heigth of individual stream." echo " -q Quality of the stream image." echo " -t Topic to follow - default /racecar/deepracer/kvs_stream" echo " -b Browser command (default: firefox --new-tab)" + echo " -p The port to use " exit 1 } @@ -23,8 +24,9 @@ WIDTH=480 HEIGHT=360 QUALITY=75 BROWSER="firefox --new-tab" +PORT=$DR_WEBVIEWER_PORT -while getopts ":w:h:q:t:b:" opt; do +while getopts ":w:h:q:t:b:p:" opt; do case $opt in w) WIDTH="$OPTARG" ;; @@ -36,12 +38,16 @@ t) TOPIC="$OPTARG" ;; b) BROWSER="$OPTARG" ;; +p) PORT="$OPTARG" +;; \?) echo "Invalid option -$OPTARG" >&2 usage ;; esac done +DR_WEBVIEWER_PORT=$PORT + export DR_VIEWER_HTML=$DR_DIR/tmp/streams-$DR_RUN_ID.html export DR_NGINX_CONF=$DR_DIR/tmp/streams-$DR_RUN_ID.conf @@ -53,7 +59,7 @@ server { index index.html index.htm; } EOF -echo "DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC
DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC
" > $DR_VIEWER_HTML + if [[ "${DR_DOCKER_STYLE,,}" != "swarm" ]]; then ROBOMAKER_CONTAINERS=$(docker ps --format "{{.ID}} {{.Names}}" --filter name="deepracer-${DR_RUN_ID}" | grep robomaker | cut -f1 -d\ ) @@ -70,15 +76,26 @@ if [ -z "$ROBOMAKER_CONTAINERS" ]; then exit fi - +# Expose the diamensions to the HTML template +export QUALITY +export WIDTH +export HEIGHT +# Create .js array of robomakers to pass to the HTML template +export ROBOMAKER_CONTAINERS_HTML="" +for c in $ROBOMAKER_CONTAINERS; do + ROBOMAKER_CONTAINERS_HTML+="'$c'," +done +SCRIPT_PATH="${BASH_SOURCE:-$0}" +ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" +ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" +INDEX_HTML_TEMPLATE="${ABS_DIRECTORY}/index.template.html" +# Replace all variables in HTML template and create the viewer html file +envsubst < "${INDEX_HTML_TEMPLATE}" > $DR_VIEWER_HTML + +# Add proxy paths in the NGINX file for c in $ROBOMAKER_CONTAINERS; do - C_URL="/$c/stream?topic=${TOPIC}&quality=${QUALITY}&width=${WIDTH}&height=${HEIGHT}" - C_IMG="
" - echo $C_IMG >> $DR_VIEWER_HTML echo " location /$c { proxy_pass http://$c:8080; rewrite /$c/(.*) /\$1 break; }" >> $DR_NGINX_CONF done - -echo "
" >> $DR_VIEWER_HTML echo "}" >> $DR_NGINX_CONF # Check if we will use Docker Swarm or Docker Compose @@ -103,3 +120,11 @@ if [[ -n "${DISPLAY}" && "${DR_HOST_X,,}" == "true" ]]; then $BROWSER "http://127.0.01:8100" & fi +CURRENT_CONTAINER_HASH=$(docker ps | grep dr_viewer | head -c 12) + +IP_ADDRESSES="$( hostname -I)"; +echo "The viewer will avaliable on the following hosts after initialization:" +for ip in $IP_ADDRESSES; +do + echo "http://${ip}:${PORT}" +done \ No newline at end of file From 3a92a6b53d0b3013514f5197b7cfb02d0e3dec04 Mon Sep 17 00:00:00 2001 From: Lars Lorentz Ludvigsen Date: Fri, 30 Dec 2022 19:22:19 +0000 Subject: [PATCH 307/428] Style fixes --- scripts/viewer/index.template.html | 36 +++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/scripts/viewer/index.template.html b/scripts/viewer/index.template.html index 074cab78..c5478c28 100755 --- a/scripts/viewer/index.template.html +++ b/scripts/viewer/index.template.html @@ -2,7 +2,7 @@ - DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC + DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX