From 955555da2b1370e2f8fbd8a7f5d55174542e237d Mon Sep 17 00:00:00 2001 From: Sofia Faltenbacher Date: Thu, 16 Jan 2025 11:49:44 +0100 Subject: [PATCH 1/2] refactor(independence_tests): use combinations instead of permutations (less tests) also: first steps for generator tests, some more tests for graph.are_nodes_d_separated --- .../constraint/independence_tests/common.py | 10 +- tests/test_generators.py | 233 +++++++++++++++++- tests/test_path_classification.py | 59 +++++ tests/test_pc_e2e.py | 3 +- 4 files changed, 294 insertions(+), 11 deletions(-) diff --git a/causy/causal_discovery/constraint/independence_tests/common.py b/causy/causal_discovery/constraint/independence_tests/common.py index 42e960b..a2778cb 100644 --- a/causy/causal_discovery/constraint/independence_tests/common.py +++ b/causy/causal_discovery/constraint/independence_tests/common.py @@ -96,12 +96,12 @@ def process( """ results = [] already_deleted_edges = set() - for nodes in itertools.permutations(nodes): - x: NodeInterface = graph.nodes[nodes[0]] - y: NodeInterface = graph.nodes[nodes[1]] - z: NodeInterface = graph.nodes[nodes[2]] + for node in nodes: + remaining_nodes = [n for n in nodes if n != node] + x: NodeInterface = graph.nodes[remaining_nodes[0]] + y: NodeInterface = graph.nodes[remaining_nodes[1]] + z: NodeInterface = graph.nodes[node] - # Avoid division by zero if x is None or y is None or z is None: return diff --git a/tests/test_generators.py b/tests/test_generators.py index 3080e88..9d79f68 100644 --- a/tests/test_generators.py +++ b/tests/test_generators.py @@ -1,6 +1,6 @@ from causy.causal_discovery.constraint.algorithms.pc import PC_EDGE_TYPES from causy.common_pipeline_steps.calculation import CalculatePearsonCorrelations -from causy.generators import PairsWithNeighboursGenerator +from causy.generators import PairsWithNeighboursGenerator, AllCombinationsGenerator from causy.graph_model import graph_model_factory from causy.causal_discovery.constraint.independence_tests.common 
import ( CorrelationCoefficientTest, @@ -14,14 +14,125 @@ class GeneratorsTestCase(CausyTestCase): SEED = 1 - def test_pairs_with_neighbours_generator(self): + # TODO, wip + def test_pairs_with_neighbours_generator_two_nodes(self): rdnv = self.seeded_random.normalvariate sample_generator = IIDSampleGenerator( edges=[ SampleEdge(NodeReference("X"), NodeReference("Y"), 1), + ], + random=lambda: rdnv(0, 1), + ) + + algo = graph_model_factory( + Algorithm( + pipeline_steps=[ + CalculatePearsonCorrelations(), + CorrelationCoefficientTest(threshold=0.005), + PartialCorrelationTest(threshold=0.005), + ], + edge_types=PC_EDGE_TYPES, + extensions=[], + name="PC", + ) + ) + test_data, graph = sample_generator.generate(1000) + tst = algo() + tst.create_graph_from_data(test_data) + tst.create_all_possible_edges() + tst.execute_pipeline_steps() + result = PairsWithNeighboursGenerator( + comparison_settings=ComparisonSettings(min=2, max=4) + ).generate(tst.graph.graph, tst) + all_results = [] + + for i in result: + all_results.append(i) + pass + + def test_pairs_with_neighbours_generator_three_nodes_one_neighbour(self): + rdnv = self.seeded_random.normalvariate + sample_generator = IIDSampleGenerator( + edges=[ + SampleEdge(NodeReference("X"), NodeReference("Y"), 1), + SampleEdge(NodeReference("X"), NodeReference("Z"), 1), + ], + random=lambda: rdnv(0, 1), + ) + + algo = graph_model_factory( + Algorithm( + pipeline_steps=[ + CalculatePearsonCorrelations(), + CorrelationCoefficientTest(threshold=0.005), + PartialCorrelationTest(threshold=0.005), + ], + edge_types=PC_EDGE_TYPES, + extensions=[], + name="PC", + ) + ) + test_data, graph = sample_generator.generate(1000) + tst = algo() + tst.create_graph_from_data(test_data) + tst.create_all_possible_edges() + tst.execute_pipeline_steps() + result = PairsWithNeighboursGenerator( + comparison_settings=ComparisonSettings(min=3, max=4) + ).generate(tst.graph.graph, tst) + all_results = [] + + for i in result: + all_results.append(i) + 
pass + + def test_pairs_with_neighbours_generator_three_nodes_two_neighbours(self): + rdnv = self.seeded_random.normalvariate + sample_generator = IIDSampleGenerator( + edges=[ + SampleEdge(NodeReference("X"), NodeReference("Y"), 1), + SampleEdge(NodeReference("X"), NodeReference("Z"), 1), + SampleEdge(NodeReference("Y"), NodeReference("Z"), 1), + ], + random=lambda: rdnv(0, 1), + ) + + algo = graph_model_factory( + Algorithm( + pipeline_steps=[ + CalculatePearsonCorrelations(), + CorrelationCoefficientTest(threshold=0.005), + PartialCorrelationTest(threshold=0.005), + ], + edge_types=PC_EDGE_TYPES, + extensions=[], + name="PC", + ) + ) + test_data, graph = sample_generator.generate(1000) + tst = algo() + tst.create_graph_from_data(test_data) + tst.create_all_possible_edges() + tst.execute_pipeline_steps() + result = PairsWithNeighboursGenerator( + comparison_settings=ComparisonSettings(min=3, max=4) + ).generate(tst.graph.graph, tst) + all_results = [] + + for i in result: + all_results.append(i) + pass + + def test_pairs_with_neighbours_generator_four_nodes_fully_connected(self): + rdnv = self.seeded_random.normalvariate + sample_generator = IIDSampleGenerator( + edges=[ + SampleEdge(NodeReference("X"), NodeReference("Y"), 1), + SampleEdge(NodeReference("X"), NodeReference("Z"), 1), + SampleEdge(NodeReference("X"), NodeReference("W"), 1), SampleEdge(NodeReference("Y"), NodeReference("Z"), 1), - SampleEdge(NodeReference("Z"), NodeReference("W"), 1), SampleEdge(NodeReference("Y"), NodeReference("W"), 1), + SampleEdge(NodeReference("Z"), NodeReference("W"), 1), ], random=lambda: rdnv(0, 1), ) @@ -49,4 +160,118 @@ def test_pairs_with_neighbours_generator(self): all_results = [] for i in result: - all_results.extend(i) + all_results.append(i) + pass + + def test_all_combinations_generator_two_nodes(self): + rdnv = self.seeded_random.normalvariate + sample_generator = IIDSampleGenerator( + edges=[ + SampleEdge(NodeReference("X"), NodeReference("Y"), 1), + ], + 
random=lambda: rdnv(0, 1), + ) + + algo = graph_model_factory( + Algorithm( + pipeline_steps=[ + CalculatePearsonCorrelations(), + CorrelationCoefficientTest(threshold=0.005), + PartialCorrelationTest(threshold=0.005), + ], + edge_types=PC_EDGE_TYPES, + extensions=[], + name="PC", + ) + ) + + test_data, graph = sample_generator.generate(1000) + tst = algo() + tst.create_graph_from_data(test_data) + tst.create_all_possible_edges() + tst.execute_pipeline_steps() + result = AllCombinationsGenerator( + comparison_settings=ComparisonSettings(min=2, max=2) + ).generate(tst.graph.graph, tst) + all_results = [] + + for i in result: + all_results.append(i) + pass + + def test_all_combinations_generator(self): + rdnv = self.seeded_random.normalvariate + sample_generator = IIDSampleGenerator( + edges=[ + SampleEdge(NodeReference("X"), NodeReference("Y"), 1), + SampleEdge(NodeReference("X"), NodeReference("Z"), 1), + ], + random=lambda: rdnv(0, 1), + ) + + algo = graph_model_factory( + Algorithm( + pipeline_steps=[ + CalculatePearsonCorrelations(), + CorrelationCoefficientTest(threshold=0.005), + PartialCorrelationTest(threshold=0.005), + ], + edge_types=PC_EDGE_TYPES, + extensions=[], + name="PC", + ) + ) + test_data, graph = sample_generator.generate(1000) + tst = algo() + tst.create_graph_from_data(test_data) + tst.create_all_possible_edges() + tst.execute_pipeline_steps() + result = AllCombinationsGenerator( + comparison_settings=ComparisonSettings(min=3, max=3) + ).generate(tst.graph.graph, tst) + all_results = [] + + for i in result: + all_results.append(i) + + pass + + def test_all_combinations_generator_four_nodes_fully_connected(self): + rdnv = self.seeded_random.normalvariate + sample_generator = IIDSampleGenerator( + edges=[ + SampleEdge(NodeReference("X"), NodeReference("Y"), 1), + SampleEdge(NodeReference("X"), NodeReference("Z"), 1), + SampleEdge(NodeReference("X"), NodeReference("W"), 1), + SampleEdge(NodeReference("Y"), NodeReference("Z"), 1), + 
SampleEdge(NodeReference("Y"), NodeReference("W"), 1), + SampleEdge(NodeReference("Z"), NodeReference("W"), 1), + ], + random=lambda: rdnv(0, 1), + ) + + algo = graph_model_factory( + Algorithm( + pipeline_steps=[ + CalculatePearsonCorrelations(), + CorrelationCoefficientTest(threshold=0.005), + PartialCorrelationTest(threshold=0.005), + ], + edge_types=PC_EDGE_TYPES, + extensions=[], + name="PC", + ) + ) + test_data, graph = sample_generator.generate(1000) + tst = algo() + tst.create_graph_from_data(test_data) + tst.create_all_possible_edges() + tst.execute_pipeline_steps() + result = AllCombinationsGenerator( + comparison_settings=ComparisonSettings(min=2, max=4) + ).generate(tst.graph.graph, tst) + all_results = [] + + for i in result: + all_results.append(i) + pass diff --git a/tests/test_path_classification.py b/tests/test_path_classification.py index a71e85d..abe5eee 100644 --- a/tests/test_path_classification.py +++ b/tests/test_path_classification.py @@ -1,5 +1,7 @@ +from causy.causal_discovery.constraint.algorithms import PC from causy.edge_types import DirectedEdge from causy.graph import GraphBaseAccessMixin, GraphManager +from causy.sample_generator import IIDSampleGenerator, SampleEdge, NodeReference from tests.utils import CausyTestCase @@ -175,6 +177,63 @@ def test_are_nodes_d_separated_open_path_mediated(self): graph.add_directed_edge(node2, node3, {"test": "test"}) self.assertFalse(graph.are_nodes_d_separated(node1, node3, [])) + def test_are_nodes_d_separated_blocked_path_mediated(self): + new_graph_manager = GraphManager + new_graph_manager.__bases__ = ( + GraphBaseAccessMixin, + DirectedEdge.GraphAccessMixin, + ) + graph = new_graph_manager() + node1 = graph.add_node("test1", [1, 2, 3]) + node2 = graph.add_node("test2", [1, 2, 3]) + node3 = graph.add_node("test3", [1, 2, 3]) + graph.add_directed_edge(node1, node2, {"test": "test"}) + graph.add_directed_edge(node2, node3, {"test": "test"}) + (self.assertTrue(graph.are_nodes_d_separated(node1, 
node3, [node2]))) + + def test_are_nodes_d_separated_blocked_path_mediated_after_pc(self): + rdnv = self.seeded_random.normalvariate + model = IIDSampleGenerator( + edges=[ + SampleEdge(NodeReference("X"), NodeReference("Y"), 5), + SampleEdge(NodeReference("Y"), NodeReference("Z"), 6), + ], + random=lambda: rdnv(0, 1), + ) + sample_size = 1000 + test_data, graph = model.generate(sample_size) + + tst = PC() + tst.create_graph_from_data(test_data) + tst.create_all_possible_edges() + tst.execute_pipeline_steps() + + self.assertGraphStructureIsEqual(tst.graph, graph) + + self.assertEqual( + True, + tst.graph.are_nodes_d_separated( + tst.graph.node_by_id("X"), + tst.graph.node_by_id("Z"), + [tst.graph.node_by_id("Y")], + ), + ) + + def test_are_nodes_d_separated_open_path_cpdag(self): + new_graph_manager = GraphManager + new_graph_manager.__bases__ = ( + GraphBaseAccessMixin, + DirectedEdge.GraphAccessMixin, + ) + graph = new_graph_manager() + node1 = graph.add_node("test1", [1, 2, 3]) + node2 = graph.add_node("test2", [1, 2, 3]) + node3 = graph.add_node("test3", [1, 2, 3]) + # TODO: check add edge again to see if this is the correct way to test cpdags with undirected edges + graph.add_edge(node1, node2, {"test": "test"}) + graph.add_edge(node3, node2, {"test": "test"}) + (self.assertTrue(graph.are_nodes_d_separated(node1, node3, [node2]))) + def test_are_nodes_d_separated_open_path_confounder(self): new_graph_manager = GraphManager new_graph_manager.__bases__ = ( diff --git a/tests/test_pc_e2e.py b/tests/test_pc_e2e.py index 362cd6b..ef325bf 100644 --- a/tests/test_pc_e2e.py +++ b/tests/test_pc_e2e.py @@ -88,8 +88,7 @@ def test_pc_number_of_all_proposed_actions_three_nodes(self): pc_results = tst.execute_pipeline_steps() self.assertEqual(len(pc_results[0].all_proposed_actions), 3) self.assertEqual(len(pc_results[1].all_proposed_actions), 3) - # TODO: think about whether the pairs with neighbours generator returns what we want, but the counting seems correct -
self.assertEqual(len(pc_results[2].all_proposed_actions), 4) + self.assertEqual(len(pc_results[2].all_proposed_actions), 3) def test_pc_number_of_actions_three_nodes(self): """ From bc4c55a1a5a5b5fff4a9840558ce9d0dec62a5cf Mon Sep 17 00:00:00 2001 From: Sofia Faltenbacher Date: Thu, 16 Jan 2025 12:05:02 +0100 Subject: [PATCH 2/2] feat(independence_tests): also track dependent triples --- .../constraint/independence_tests/common.py | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/causy/causal_discovery/constraint/independence_tests/common.py b/causy/causal_discovery/constraint/independence_tests/common.py index a2778cb..3712e81 100644 --- a/causy/causal_discovery/constraint/independence_tests/common.py +++ b/causy/causal_discovery/constraint/independence_tests/common.py @@ -144,7 +144,8 @@ def process( u=x, v=y, action=TestResultAction.REMOVE_EDGE_UNDIRECTED, - data={"separatedBy": [z]}, + data={"separatedBy": [z], + "triple": [x, y, [z]]}, ) ) already_deleted_edges.add((x, y)) @@ -155,6 +156,7 @@ def process( u=x, v=y, action=TestResultAction.DO_NOTHING, + data={"triple": [x, y, [z]]}, ) ) @@ -236,11 +238,11 @@ def process( p_value = 2 * (1 - stats.norm.cdf(z_value)) # If the p value is smaller than the threshold, the null hypothesis (conditional independence) is rejected, otherwise we accept it and delete the edge + nodes_set = set([graph.nodes[n] for n in nodes]) if p_value > self.threshold: logger.debug( f"Nodes {graph.nodes[nodes[0]].name} and {graph.nodes[nodes[1]].name} are uncorrelated given nodes {','.join([graph.nodes[on].name for on in other_neighbours])}" ) - nodes_set = set([graph.nodes[n] for n in nodes]) return TestResult( u=graph.nodes[nodes[0]], v=graph.nodes[nodes[1]], @@ -248,14 +250,22 @@ def process( data={ "separatedBy": list( nodes_set - {graph.nodes[nodes[0]], graph.nodes[nodes[1]]} - ) - }, + ), + "triple": [graph.nodes[nodes[0]], graph.nodes[nodes[1]], list(nodes_set - {graph.nodes[nodes[0]],
graph.nodes[nodes[1]]})], + } ) else: return TestResult( u=graph.nodes[nodes[0]], v=graph.nodes[nodes[1]], action=TestResultAction.DO_NOTHING, + data={ + "separatedBy": list( + nodes_set - {graph.nodes[nodes[0]], graph.nodes[nodes[1]]} + ), + "triple": [graph.nodes[nodes[0]], graph.nodes[nodes[1]], + list(nodes_set - {graph.nodes[nodes[0]], graph.nodes[nodes[1]]})], + } ) @@ -317,12 +327,11 @@ def process( partial_correlation.item(), self.threshold, ) - + nodes_set = set([graph.nodes[n] for n in nodes]) if abs(t) < critical_t: logger.debug( f"Nodes {graph.nodes[nodes[0]].name} and {graph.nodes[nodes[1]].name} are uncorrelated given nodes {','.join([graph.nodes[on].name for on in other_neighbours])}" ) - nodes_set = set([graph.nodes[n] for n in nodes]) return TestResult( u=graph.nodes[nodes[0]], v=graph.nodes[nodes[1]], @@ -330,7 +339,8 @@ def process( data={ "separatedBy": list( nodes_set - {graph.nodes[nodes[0]], graph.nodes[nodes[1]]} - ) + ), + "triple": [graph.nodes[nodes[0]], graph.nodes[nodes[1]], list(nodes_set - {graph.nodes[nodes[0]], graph.nodes[nodes[1]]})] }, ) else: @@ -338,4 +348,8 @@ def process( u=graph.nodes[nodes[0]], v=graph.nodes[nodes[1]], action=TestResultAction.DO_NOTHING, + data={ + "triple": [graph.nodes[nodes[0]], graph.nodes[nodes[1]], + list(nodes_set - {graph.nodes[nodes[0]], graph.nodes[nodes[1]]})] + }, )