Skip to content

Commit 4fd228b

Browse files
blumuWilliam Blum
andauthored
Support for stable-baseline3 RL algorithms (#86)
This PR implements changes to the CyberBattleEnv's observation and action space to be consumable by the gym RL algorithms from https://stable-baselines3.readthedocs.io/en/master/ Changes include: - Observation fields can now be optionally padded to the shape expected by their corresponding gym space. (Requires more memory but is needed to train with stable-baseline agents) - Add gym wrappers to flatten the Action and Observation spaces from CyberBattleSim - Add option to CyberBattleEnv to allow invalid moves and return negative reward instead - Flatten multi-dimensioned `MultiBinary` spaces * works with spaces.MultiBinary([list]) and spaces.MultiBinary(number) * working with `nodes_privilegelevel` * works with `leaked_credentials` * works with `credential_cache_matrix` * works with `discovered_nodes_properties` - Add a `stable-baseline` test notebook - Fix some python 3.8 warnings Co-authored-by: William Blum <william.blum@microsoft.com>
1 parent 9b4d294 commit 4fd228b

12 files changed

Lines changed: 452 additions & 142 deletions

createstubs.sh

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ createstub() {
1818
echo stub $name already created
1919
fi
2020
}
21+
param=$1
22+
if [[ $param == "--recreate" ]]; then
23+
echo 'Deleting typing directory'
24+
rm -Rf typings/
25+
fi
26+
27+
echo 'Creating stubs'
2128

2229
mkdir -p typings/
2330

@@ -30,23 +37,21 @@ createstub ordered_set
3037
createstub asciichartpy
3138
createstub networkx
3239
createstub boolean
40+
createstub IPython
3341

3442

3543
if [ ! -d "typings/gym" ]; then
3644
pyright --createstub gym
3745
# Patch gym stubs
3846
echo ' spaces = ...' >> typings/gym/spaces/dict.pyi
3947
echo ' nvec = ...' >> typings/gym/spaces/space.pyi
48+
echo ' spaces = ...' >> typings/gym/spaces/space.pyi
49+
echo ' spaces = ...' >> typings/gym/spaces/tuple.pyi
50+
echo ' n = ...' >> typings/gym/spaces/multi_binary.pyi
4051
else
4152
echo stub gym already created
4253
fi
4354

44-
if [ ! -d "typings/IPython" ]; then
45-
pyright --createstub IPython.core.display
46-
else
47-
echo stub 'IPython' already created
48-
fi
49-
5055

5156
echo 'Typing stub generation completed'
5257

cyberbattle/_env/cyberbattle_env.py

Lines changed: 112 additions & 82 deletions
Large diffs are not rendered by default.
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
"""Space flattening wrappers for the CyberBattleEnv gym environment.
2+
"""
3+
from collections import OrderedDict
4+
from sqlite3 import NotSupportedError
5+
from gym import spaces
6+
import numpy as np
7+
from cyberbattle._env.cyberbattle_env import DummySpace, CyberBattleEnv, Action
8+
from gym.core import ObservationWrapper, ActionWrapper
9+
10+
11+
class FlattenObservationWrapper(ObservationWrapper):
    """
    Flatten all nested dictionaries and tuples from the
    observation space of a CyberBattleSim environment `CyberBattleEnv`.
    The resulting observation space is a dictionary containing only
    subspaces of types: `Discrete`, `MultiBinary`, and `MultiDiscrete`.
    """

    def flatten_multibinary_space(self, space: spaces.Space) -> spaces.Space:
        """Return a 1-D `MultiBinary` space equivalent to `space` when it is a
        multi-dimensional `MultiBinary`; return `space` unchanged otherwise."""
        if isinstance(space, spaces.MultiBinary):
            # space.n is either a scalar (already flat) or a shape tuple/list/array
            if type(space.n) in [tuple, list, np.ndarray]:
                flatten_dim = np.multiply.reduce(space.n)
                print(f'// MultiBinary flattened from {space.n} -> {flatten_dim}')
                return spaces.MultiBinary(flatten_dim)
            else:
                print(f'// MultiBinary already flat: {space.n}')
                return space
        else:
            return space

    def __init__(self, env: CyberBattleEnv, ignore_fields=None):
        """Wrap `env` and expose a flattened observation space.

        env: the CyberBattleEnv to wrap.
        ignore_fields: observation keys to drop from the flattened space
            (defaults to ['action_mask']).
        """
        ObservationWrapper.__init__(self, env)
        self.env = env
        # Fix: avoid a mutable default argument; materialize the default here.
        self.ignore_fields = ['action_mask'] if ignore_fields is None else ignore_fields

        space_dict = OrderedDict({})
        for key, space in env.observation_space.spaces.items():
            if key in self.ignore_fields:
                print('Filtering out field', key)
            elif isinstance(space, spaces.Dict):
                # one flattened entry per sub-key of the nested Dict
                for k2, subspace in space.items():
                    space_dict[f"{key}_{k2}"] = self.flatten_multibinary_space(subspace)
            elif isinstance(space, spaces.Tuple):
                # one flattened entry per position in the Tuple
                for i, subspace in enumerate(space.spaces):
                    space_dict[f"{key}_{i}"] = self.flatten_multibinary_space(subspace)
            elif isinstance(space, spaces.MultiBinary):
                space_dict[key] = self.flatten_multibinary_space(space)
            elif isinstance(space, (spaces.Discrete, spaces.MultiDiscrete)):
                space_dict[key] = space
            elif isinstance(space, DummySpace):
                print(f'warning: unsupported observation space: {space} : {type(space)}')
            else:
                raise NotImplementedError(f"Case not handled: {key} - type {type(space)}")

        self.observation_space = spaces.Dict(space_dict)

    def flatten_multibinary_observation(self, space, o):
        """Flatten observation `o` to match `flatten_multibinary_space(space)`:
        reshape multi-dimensional MultiBinary values into a flat tuple,
        pass everything else through unchanged."""
        if isinstance(space, spaces.MultiBinary) and \
                type(space.n) in [tuple, list, np.ndarray] and \
                len(space.n) > 1:
            flatten_dim = np.multiply.reduce(space.n)
            return tuple(o.reshape(flatten_dim))
        else:
            return o

    def observation(self, observation: dict):
        """Map a raw CyberBattleEnv observation dict onto the flattened space
        declared in `__init__` (same keys, nested entries expanded)."""
        o = OrderedDict({})
        for key, space in self.env.observation_space.spaces.items():
            value = observation[key]
            if key in self.ignore_fields:
                continue
            elif isinstance(space, spaces.Dict):
                for subkey, subspace in space.items():
                    o[f"{key}_{subkey}"] = self.flatten_multibinary_observation(subspace, value[subkey])
            elif isinstance(space, spaces.Tuple):
                for i, subspace in enumerate(space.spaces):
                    o[f"{key}_{i}"] = self.flatten_multibinary_observation(subspace, value[i])
            elif isinstance(space, spaces.MultiBinary):
                o[key] = self.flatten_multibinary_observation(space, value)
            elif isinstance(space, (spaces.Discrete, spaces.MultiDiscrete)):
                o[key] = value
            elif isinstance(space, DummySpace):
                # unsupported spaces were excluded from the flattened space; skip their values too
                continue
            else:
                raise NotImplementedError(f"Case not handled: {key} - type {type(space)}")

        return o
89+
90+
class FlattenActionWrapper(ActionWrapper):
    """
    Flatten all nested dictionaries and tuples from the
    action space of a CyberBattleSim environment `CyberBattleEnv`.
    The resulting action space is a single `MultiDiscrete` vector:
    [action kind, source node, target node, port, credential index].
    """

    def __init__(self, env: CyberBattleEnv):
        ActionWrapper.__init__(self, env)
        self.env = env

        self.action_space = spaces.MultiDiscrete([
            # connect, local vulnerabilities, remote vulnerabilities
            1 + env.bounds.local_attacks_count + env.bounds.remote_attacks_count,

            # source node
            env.bounds.maximum_node_count,

            # target node
            env.bounds.maximum_node_count,

            # target port (for connect action only)
            env.bounds.port_count,

            # credential used (for connect action only)
            env.bounds.maximum_total_credentials
        ]
        )

    def action(self, action: np.ndarray) -> Action:
        """Translate a flat MultiDiscrete action vector into a CyberBattleEnv
        action dict.

        Kind 0 is 'connect'; the next `local_attacks_count` kinds are local
        vulnerabilities; the following `remote_attacks_count` kinds are remote
        vulnerabilities.

        Raises ValueError if the action kind is out of range.
        """
        action_type = action[0]
        if action_type == 0:
            # connect takes (source node, target node, port, credential index)
            return {'connect': action[1:5]}

        action_type -= 1
        if action_type < self.env.bounds.local_attacks_count:
            return {'local_vulnerability': np.array([action[1], action_type])}

        action_type -= self.env.bounds.local_attacks_count
        if action_type < self.env.bounds.remote_attacks_count:
            return {'remote_vulnerability': np.array([action[1], action[2], action_type])}

        # Fix: previously raised sqlite3.NotSupportedError — an accidental
        # auto-import from the DB-API with no relation to gym actions.
        # An out-of-range action vector is a plain ValueError.
        raise ValueError(f'Unsupported action: {action}')

    def reverse_action(self, action):
        """Inverse mapping (CyberBattleEnv action dict -> flat vector) is not implemented."""
        raise NotImplementedError

cyberbattle/_env/graph_wrapper.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -182,15 +182,15 @@ def __add_node(self, observation):
182182
creds = onp.full(self._bounds.maximum_total_credentials, -1, dtype=onp.int8)
183183
self.__graph.add_node(
184184
node_index,
185-
name=observation['discovered_nodes'][node_index],
185+
name=observation['_discovered_nodes'][node_index],
186186
privilege_level=None, flags=None, # these are set by __update_nodes()
187187
credentials=creds,
188188
has_leaked_creds=False,
189189
)
190190

191191
def __update_edges(self, observation):
192-
g_orig = observation['explored_network']
193-
node_ids = {n: i for i, n in enumerate(observation['discovered_nodes'])}
192+
g_orig = observation['_explored_network']
193+
node_ids = {n: i for i, n in enumerate(observation['_discovered_nodes'])}
194194
for (from_name, to_name), edge_properties in g_orig.edges.items():
195195
self.__graph.add_edge(node_ids[from_name], node_ids[to_name], **edge_properties)
196196

cyberbattle/agents/baseline/agent_wrapper.py

Lines changed: 33 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,11 @@ def get(self, a: StateAugmentation, node) -> ndarray:
6969
node_prop = a.observation['discovered_nodes_properties']
7070

7171
# list of all properties set/unset on the node
72-
# Remap to get rid of unknown value 0: 1 -> 1, and -1 -> 0 (and 0-> 0)
7372
assert node < len(node_prop), f'invalid node index {node} (not discovered yet)'
74-
remapped = np.array((1 + node_prop[node]) / 2, dtype=np.int_)
73+
74+
# Remap to get rid of the unknown value (2):
75+
# 1->1, 0->0, 2->0
76+
remapped = np.array(node_prop[node] % 2, dtype=np.int_)
7577
return remapped
7678

7779

@@ -85,7 +87,7 @@ def __init__(self, p: EnvironmentBounds):
8587
def get(self, a: StateAugmentation, node) -> ndarray:
8688
assert node is not None, 'feature only valid in the context of a node'
8789

88-
discovered_node_count = len(a.observation['discovered_nodes_properties'])
90+
discovered_node_count = a.observation['discovered_node_count']
8991

9092
assert node < discovered_node_count, f'invalid node index {node} (not discovered yet)'
9193

@@ -110,13 +112,14 @@ def __init__(self, p: EnvironmentBounds):
110112
super().__init__(p, [2] * p.property_count)
111113

112114
def get(self, a: StateAugmentation, node) -> ndarray:
113-
node_prop = np.array(a.observation['discovered_nodes_properties'])
115+
n = a.observation['discovered_node_count']
116+
node_prop = np.array(a.observation['discovered_nodes_properties'])[:n]
114117

115118
# keep last window of entries
116119
node_prop_window = node_prop[-self.window_size:, :]
117120

118-
# Remap to get rid of unknown value 0: 1 -> 1, and -1 -> 0 (and 0-> 0)
119-
node_prop_window_remapped = np.int32((1 + node_prop_window) / 2)
121+
# Remap to get rid of the unknown value (2)
122+
node_prop_window_remapped = np.int32(node_prop_window % 2)
120123

121124
countby = np.sum(node_prop_window_remapped, axis=0)
122125

@@ -131,9 +134,11 @@ def __init__(self, p: EnvironmentBounds):
131134
super().__init__(p, [2] * p.port_count)
132135

133136
def get(self, a: StateAugmentation, node):
134-
ccm = a.observation['credential_cache_matrix']
137+
n = a.observation['credential_cache_length']
135138
known_credports = np.zeros(self.env_properties.port_count, dtype=np.int32)
136-
known_credports[np.int32(ccm[:, 1])] = 1
139+
if n > 0:
140+
ccm = np.array(a.observation['credential_cache_matrix'])[:n]
141+
known_credports[np.int32(ccm[:, 1])] = 1
137142
return known_credports
138143

139144

@@ -145,9 +150,11 @@ def __init__(self, p: EnvironmentBounds):
145150
super().__init__(p, [2] * p.port_count)
146151

147152
def get(self, a: StateAugmentation, node):
148-
ccm = a.observation['credential_cache_matrix']
149153
known_credports = np.zeros(self.env_properties.port_count, dtype=np.int32)
150-
known_credports[np.int32(ccm[-self.window_size:, 1])] = 1
154+
n = a.observation['credential_cache_length']
155+
if n > 0:
156+
ccm = np.array(a.observation['credential_cache_matrix'])[:n]
157+
known_credports[np.int32(ccm[-self.window_size:, 1])] = 1
151158
return known_credports
152159

153160

@@ -158,8 +165,13 @@ def __init__(self, p: EnvironmentBounds):
158165
super().__init__(p, [p.maximum_total_credentials + 1] * p.port_count)
159166

160167
def get(self, a: StateAugmentation, node):
161-
ccm = a.observation['credential_cache_matrix']
162-
return np.bincount(np.int32(ccm[:, 1]), minlength=self.env_properties.port_count)
168+
n = a.observation['credential_cache_length']
169+
if n > 0:
170+
ccm = np.array(a.observation['credential_cache_matrix'])[:n]
171+
ports = np.int32(ccm[:, 1])
172+
else:
173+
ports = np.zeros(0)
174+
return np.bincount(ports, minlength=self.env_properties.port_count)
163175

164176

165177
class Feature_discovered_credential_count(Feature):
@@ -169,7 +181,8 @@ def __init__(self, p: EnvironmentBounds):
169181
super().__init__(p, [p.maximum_total_credentials + 1])
170182

171183
def get(self, a: StateAugmentation, node):
172-
return [len(a.observation['credential_cache_matrix'])]
184+
n = a.observation['credential_cache_length']
185+
return [n]
173186

174187

175188
class Feature_discovered_node_count(Feature):
@@ -179,7 +192,7 @@ def __init__(self, p: EnvironmentBounds):
179192
super().__init__(p, [p.maximum_node_count + 1])
180193

181194
def get(self, a: StateAugmentation, node):
182-
return [len(a.observation['discovered_nodes_properties'])]
195+
return [a.observation['discovered_node_count']]
183196

184197

185198
class Feature_discovered_notowned_node_count(Feature):
@@ -190,10 +203,10 @@ def __init__(self, p: EnvironmentBounds, clip: Optional[int]):
190203
super().__init__(p, [self.clip + 1])
191204

192205
def get(self, a: StateAugmentation, node):
193-
node_props = a.observation['discovered_nodes_properties']
194-
discovered = len(node_props)
206+
discovered = a.observation['discovered_node_count']
207+
node_props = np.array(a.observation['discovered_nodes_properties'][:discovered])
195208
# here we assume that a node is owned just if all its properties are known
196-
owned = np.count_nonzero(np.all(node_props != 0, axis=1))
209+
owned = np.count_nonzero(np.all(node_props != 2, axis=1))
197210
diff = discovered - owned
198211
return [min(diff, self.clip)]
199212

@@ -355,7 +368,7 @@ def specialize_to_gymaction(self, source_node: np.int32, observation, abstract_a
355368

356369
abstract_action_index_int = int(abstract_action_index)
357370

358-
node_prop = np.array(observation['discovered_nodes_properties'])
371+
discovered_nodes_count = observation['discovered_node_count']
359372

360373
if abstract_action_index_int < self.n_local_actions:
361374
vuln = abstract_action_index_int
@@ -365,8 +378,6 @@ def specialize_to_gymaction(self, source_node: np.int32, observation, abstract_a
365378
if abstract_action_index_int < self.n_remote_actions:
366379
vuln = abstract_action_index_int
367380

368-
discovered_nodes_count = len(node_prop)
369-
370381
if discovered_nodes_count <= 1:
371382
return None
372383

@@ -382,11 +393,11 @@ def specialize_to_gymaction(self, source_node: np.int32, observation, abstract_a
382393
abstract_action_index_int -= self.n_remote_actions
383394
port = np.int32(abstract_action_index_int)
384395

385-
discovered_credentials = np.array(observation['credential_cache_matrix'])
386-
n_discovered_creds = len(discovered_credentials)
396+
n_discovered_creds = observation['credential_cache_length']
387397
if n_discovered_creds <= 0:
388398
# no credential available in the cache: cannot produce a valid connect action
389399
return None
400+
discovered_credentials = np.array(observation['credential_cache_matrix'])[:n_discovered_creds]
390401

391402
nodes_not_owned = discovered_nodes_notowned(observation)
392403

cyberbattle/agents/baseline/baseline_test.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import cyberbattle.agents.baseline.agent_dql as dqla
1313
import cyberbattle.agents.baseline.agent_wrapper as w
1414
import cyberbattle.agents.baseline.learner as learner
15+
import cyberbattle.agents.baseline.agent_tabularqlearning as tqa
1516

1617
logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format="%(levelname)s: %(message)s")
1718

@@ -69,3 +70,24 @@ def test_agent_training() -> None:
6970
)
7071

7172
assert random_run
73+
74+
75+
def test_tabularq_agent_training() -> None:
76+
tabularq_run = learner.epsilon_greedy_search(
77+
cyberbattlechain,
78+
ep,
79+
learner=tqa.QTabularLearner(
80+
ep,
81+
gamma=0.015, learning_rate=0.01, exploit_percentile=100),
82+
episode_count=training_episode_count,
83+
iteration_count=iteration_count,
84+
epsilon=0.90,
85+
epsilon_exponential_decay=5000,
86+
epsilon_minimum=0.01,
87+
verbosity=Verbosity.Quiet,
88+
render=False,
89+
plot_episodes_length=False,
90+
title="Tabular Q-learning"
91+
)
92+
93+
assert tabularq_run

0 commit comments

Comments
 (0)