akelleh · sotomsa · Aug 27, 2018 · Sep 12, 2018 · Sep 12, 2018 · Sep 12, 2018
diff --git a/causality/estimation/adjustments.py b/causality/estimation/adjustments.py
@@ -1,10 +1,14 @@
 from networkx.algorithms import is_directed_acyclic_graph
+import networkx as nx
+#from pgmpy.models import BayesianModel
+from causality.estimation.utils import utils
+from itertools import combinations
 
 class AdjustmentException(Exception):
+    """Raised when a graph or a choice of causes/effects cannot be adjusted for."""
     pass
 
 class AdjustForDirectCauses(object):
-    def __init__(self): 
+    def __init__(self):
         pass
 
     def find_predecessors(self, g, causes):
@@ -17,11 +21,93 @@ def assumptions_satisfied(self, g, causes, effects, predecessors):
         if not is_directed_acyclic_graph(g):
             return False
         if not len(set(effects).intersection(set(causes).union(predecessors))) == 0:
-            return False 
-        return True 
+            return False
+        return True
 
     def admissable_set(self, g, causes, effects):
+        """Return the nodes found by find_predecessors(g, causes).
+
+        The result is only returned after assumptions_satisfied() accepts
+        the graph, the causes, the effects and those predecessors.
+
+        Raises
+        ------
+        AdjustmentException
+            If assumptions_satisfied() returns False.
+        """
         predecessors = self.find_predecessors(g, causes)
         if not self.assumptions_satisfied(g, causes, effects, predecessors):
             raise AdjustmentException("Failed to satisfy adjustment assumptions")
         return predecessors
+
+class backDoorAdjustments(object):
+    """Find minimal adjustment sets that satisfy the back-door criterion.
+
+    A candidate set is built only from nodes that are neither causes,
+    effects, nor descendants of a cause.  It is accepted when it leaves no
+    active trail between any cause and any effect in the back-door graph,
+    i.e. the input graph with every edge leaving a cause removed.
+    """
+
+    def __init__(self):
+        pass
+
+    def assumptions_satisfied(self, g, causes, effects):
+        """Validate the arguments of the back-door search.
+
+        Parameters
+        ----------
+        g : networkx.DiGraph
+            Causal graph.
+        causes, effects : sized collections of nodes
+            Both must be non-empty.
+
+        Returns
+        -------
+        bool
+            True when every check passes; a failed check raises instead.
+
+        Raises
+        ------
+        AdjustmentException
+            If g is not a directed acyclic graph, or if causes or effects
+            is empty.
+        """
+        if not is_directed_acyclic_graph(g):
+            raise AdjustmentException("Suplied Graph is not Directed and Acyclic")
+        if len(causes) == 0 or len(effects) == 0:
+            raise AdjustmentException("Causes/Effects can not be empty")
+        return True
+
+    def __are_causes_dseparated_from_effects(self, g, s, causes, effects):
+        """Return True when, given s, no cause has an active trail to any effect.
+
+        s is forwarded unchanged as the ``observed`` argument of
+        utils.is_active_trail, so pass it as a list or tuple of nodes.
+        """
+        # One helper instance is enough for every cause/effect pair.
+        trail_checker = utils()
+        return not any(trail_checker.is_active_trail(g, cause, effect, observed=s)
+                       for cause in causes
+                       for effect in effects)
+
+    def minimal_backdoor_admissable_sets(self, g, causes, effects):
+        """Yield every minimal back-door admissible set as a frozenset.
+
+        Candidate sets are tried by increasing size and any superset of an
+        already accepted set is skipped, so only minimal sets are produced.
+        All subsets of the candidate nodes may be visited, so the running
+        time grows exponentially with the number of candidate nodes.
+
+        Parameters
+        ----------
+        g : networkx.DiGraph
+            Causal graph; it is not modified.
+        causes, effects : sized collections of nodes
+            Nodes whose causal relation is being studied.
+
+        Yields
+        ------
+        frozenset
+            A minimal admissible set.  When causes and effects are already
+            d-separated in the back-door graph, the empty frozenset is the
+            only value yielded.
+
+        Raises
+        ------
+        AdjustmentException
+            On invalid arguments (see assumptions_satisfied) or, once every
+            candidate has been tried, if no admissible set exists.  Because
+            this is a generator, nothing is raised before iteration starts.
+        """
+        # Check arguments
+        self.assumptions_satisfied(g, causes, effects)
+
+        # Work on a private copy of g.  Nodes are copied explicitly so that
+        # isolated nodes (which an edge list alone would lose) are kept.
+        backDoorGraph = nx.DiGraph()
+        backDoorGraph.add_nodes_from(g.nodes())
+        backDoorGraph.add_edges_from(g.edges())
+
+        # Collect the descendants of the causes while the copy is still intact
+        descendantsOfCauses = set()
+        for cause in causes:
+            descendantsOfCauses.update(nx.descendants(backDoorGraph, cause))
+
+        # Create the back door graph: drop every edge leaving a cause.
+        # list() takes a snapshot, because the edges are being deleted from
+        # the very graph the out_edges() result refers to.
+        for cause in causes:
+            backDoorGraph.remove_edges_from(list(backDoorGraph.out_edges(cause)))
+
+        # Possible adjustment nodes are those from the original graph that:
+        # i) Are not causes
+        # ii) Are not effects
+        # iii) Are not descendants of the causes
+        possibleAdjustmentNodes = set(backDoorGraph.nodes()).difference(set(causes),
+                                                                        set(effects),
+                                                                        descendantsOfCauses)
+        # Keep track of which sets have been added
+        minAdmissablesSets = set()
+
+        # If the empty set d-separates causes and effects in the back door graph
+        # then the empty set is the one and only minimal admissable set
+        if self.__are_causes_dseparated_from_effects(backDoorGraph, (), causes, effects):
+            yield frozenset()
+            return
+
+        # Check all non-empty subsets of possibleAdjustmentNodes, smallest first
+        for size in range(1, len(possibleAdjustmentNodes) + 1):
+            for s in combinations(possibleAdjustmentNodes, size):
+                candidate = frozenset(s)
+                # A superset of an accepted set can not be minimal
+                if any(candidate.issuperset(found) for found in minAdmissablesSets):
+                    continue
+                # Only accept s if it d-separates all causes from all effects
+                if self.__are_causes_dseparated_from_effects(backDoorGraph, s, causes, effects):
+                    minAdmissablesSets.add(candidate)
+                    yield candidate
+
+        # If after checking all combinations we don't find any admissable set then raise an Exception
+        if len(minAdmissablesSets) == 0:
+            raise AdjustmentException("Failed to satisfy adjustment assumptions")
diff --git a/causality/estimation/utils.py b/causality/estimation/utils.py
@@ -0,0 +1,138 @@
+import networkx as nx
+
+# This file contains the functions ported from pgmpy that Causality
+# needs in order to check for active trails in the sense of
+# d-connectedness.
+# They carry only the minimal changes required to make them work as
+# standalone functions.
+# The function doing all the heavy work is active_trail_nodes.
+
+class utils(object):
+    """d-separation helpers ported from pgmpy.
+
+    Every method receives the graph ``g`` explicitly and only relies on
+    ``g.nodes()``, ``g.predecessors(node)`` and ``g.successors(node)``.
+    """
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def _as_node_list(nodes):
+        """Normalise None, a single node or a collection of nodes to a list.
+
+        Lists, tuples, sets and frozensets are unpacked; any other value,
+        including falsy ones such as ``0``, is taken to be one node.
+        """
+        if nodes is None:
+            return []
+        if isinstance(nodes, (list, tuple, set, frozenset)):
+            return list(nodes)
+        return [nodes]
+
+    def _get_ancestors_of(self, g, obs_nodes_list):
+        """
+        Returns the set of all ancestors of all the observed nodes, including
+        the observed nodes themselves.
+        Parameters
+        ----------
+        g : directed graph
+            graph in which the ancestors are looked up
+        obs_nodes_list: node, or list/tuple/set of nodes
+            all the observed nodes
+        Raises
+        ------
+        ValueError
+            if one of the observed nodes is not in g
+        Examples
+        --------
+        >>> g = nx.DiGraph([('D', 'G'), ('I', 'G'), ('G', 'L'), ('I', 'L')])
+        >>> sorted(utils()._get_ancestors_of(g, 'G'))
+        ['D', 'G', 'I']
+        >>> sorted(utils()._get_ancestors_of(g, ['G', 'I']))
+        ['D', 'G', 'I']
+        """
+        obs_nodes_list = self._as_node_list(obs_nodes_list)
+
+        graph_nodes = g.nodes()
+        for node in obs_nodes_list:
+            if node not in graph_nodes:
+                raise ValueError('Node {s} not in not in graph'.format(s=node))
+
+        ancestors_list = set()
+        nodes_list = set(obs_nodes_list)
+        while nodes_list:
+            node = nodes_list.pop()
+            # Expand a node only the first time it is met
+            if node not in ancestors_list:
+                nodes_list.update(g.predecessors(node))
+            ancestors_list.add(node)
+        return ancestors_list
+
+    def active_trail_nodes(self, g, variables, observed=None):
+        """
+        Returns a dictionary with the given variables as keys and the set of
+        all nodes reachable through an active trail from that respective
+        variable as values.
+        Parameters
+        ----------
+        g : directed graph
+            graph in which the trails are searched
+        variables: node, or list/tuple of nodes
+            variables whose active trails are to be found.
+        observed : node, or list/tuple/set of nodes (optional)
+            If given the active trails would be computed assuming these nodes to be observed.
+        Raises
+        ------
+        ValueError
+            if one of the observed nodes is not in g
+        Examples
+        --------
+        >>> g = nx.DiGraph([('diff', 'grades'), ('intel', 'grades')])
+        >>> utils().active_trail_nodes(g, 'diff') == {'diff': {'diff', 'grades'}}
+        True
+        >>> trails = utils().active_trail_nodes(g, ['diff', 'intel'], observed='grades')
+        >>> trails == {'diff': {'diff', 'intel'}, 'intel': {'diff', 'intel'}}
+        True
+        References
+        ----------
+        Details of the algorithm can be found in 'Probabilistic Graphical Model
+        Principles and Techniques' - Koller and Friedman
+        Page 75 Algorithm 3.1
+        """
+        # "observed is None" rather than truthiness, so that a falsy node
+        # label such as 0 still counts as an observation.
+        observed_list = self._as_node_list(observed)
+        ancestors_list = self._get_ancestors_of(g, observed_list)
+        # Set copy for constant-time membership tests inside the traversal
+        observed_nodes = set(observed_list)
+
+        # Direction in which the trail reached the current node
+        # up ->  it arrived from one of the node's children
+        # down -> it arrived from one of the node's parents
+
+        active_trails = {}
+        for start in variables if isinstance(variables, (list, tuple)) else [variables]:
+            visit_list = {(start, 'up')}
+            traversed_list = set()
+            active_nodes = set()
+            while visit_list:
+                node, direction = visit_list.pop()
+                if (node, direction) in traversed_list:
+                    continue
+                traversed_list.add((node, direction))
+
+                is_observed = node in observed_nodes
+                if not is_observed:
+                    active_nodes.add(node)
+
+                if direction == 'up':
+                    # An observed node blocks a trail that came from a child
+                    if not is_observed:
+                        for parent in g.predecessors(node):
+                            visit_list.add((parent, 'up'))
+                        for child in g.successors(node):
+                            visit_list.add((child, 'down'))
+                elif direction == 'down':
+                    if not is_observed:
+                        for child in g.successors(node):
+                            visit_list.add((child, 'down'))
+                    # v-structure: the trail may turn back up only when the
+                    # node is observed or is an ancestor of an observed node
+                    if node in ancestors_list:
+                        for parent in g.predecessors(node):
+                            visit_list.add((parent, 'up'))
+            active_trails[start] = active_nodes
+        return active_trails
+
+    def is_active_trail(self, g, start, end, observed=None):
+        """
+        Returns True if there is any active trail between start and end node
+        Parameters
+        ----------
+        g : directed graph
+            graph in which the trail is searched
+        start : Graph Node
+        end : Graph Node
+        observed : node, or list/tuple/set of nodes (optional)
+            If given the active trail would be computed assuming these nodes to be observed.
+        Examples
+        --------
+        >>> g = nx.DiGraph([('diff', 'grades'), ('intel', 'grades'),
+        ...                 ('grades', 'letter'), ('intel', 'sat')])
+        >>> utils().is_active_trail(g, 'diff', 'intel')
+        False
+        >>> utils().is_active_trail(g, 'grades', 'sat')
+        True
+        """
+        return end in self.active_trail_nodes(g, start, observed)[start]