Skip to content

Commit

Permalink
Add data cleaning scenario
Browse files Browse the repository at this point in the history
  • Loading branch information
xJoskiy committed Jan 20, 2025
1 parent c4b7eb7 commit 224b370
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 1 deletion.
3 changes: 2 additions & 1 deletion examples/datasets/taxes_2.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ NewYork,5000,0.3
Wisconsin,5000,0.15
Wisconsin,6000,0.2
Wisconsin,4000,0.1
Wisconsin,3000,0.3
Texas,1000,0.15
Texas,2000,0.25
Texas,3000,0.3
Texas,5000,0.05
Texas,4000,0.1
87 changes: 87 additions & 0 deletions examples/expert/data_cleaning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from typing import Dict, List, Tuple
from collections import defaultdict
import matplotlib.pyplot as plt
import desbordante as db
import networkx as nx
import time



class DataCleaner:
    """Greedy cleaner that removes records until no violations remain.

    Every violation ``(v1, v2)`` is stored as an undirected edge between two
    record identifiers; a pair with ``v1 == v2`` is stored as a self-loop.
    ``clean`` repeatedly drops the node with the most incident edges until the
    graph has no edges left. The dropped nodes are collected in
    ``removed_nodes`` in removal order.

    Attributes:
        graph: adjacency lists keyed by node. Each neighbour appears at most
            once, so ``len(graph[node])`` is the number of distinct records
            that ``node`` conflicts with (itself included for a self-loop).
        nodes: nodes that are still present in the graph.
        removed_nodes: nodes removed so far by ``clean``.
    """

    def __init__(self, violations: List[Tuple[int, int]]) -> None:
        """Build the conflict graph from ``violations``.

        Args:
            violations: pairs of record identifiers that violate the
                constraint together. Repeated pairs (in either order) are
                collapsed into a single edge.
        """
        self.graph: Dict[int, List[int]] = defaultdict(list)
        for v1, v2 in violations:
            forward = self.graph[v1]
            if v2 not in forward:
                # For v1 == v2 this records the self-loop without discarding
                # neighbours that were added by earlier pairs.
                forward.append(v2)
            if v1 != v2:
                backward = self.graph[v2]
                if v1 not in backward:
                    backward.append(v1)
        self.nodes: List[int] = list(self.graph.keys())
        self.removed_nodes: List[int] = []

    def __remove_highest_degree_node(self) -> None:
        """Remove the node with the longest adjacency list from the graph.

        Ties are resolved in favour of the node that was inserted first,
        because ``max`` returns the first maximal element it encounters.
        """
        max_key = max(self.graph, key=lambda node: len(self.graph[node]))
        # Pop first so that a self-loop never makes us mutate the list we are
        # iterating over.
        for neighbour in self.graph.pop(max_key):
            if neighbour != max_key:
                self.graph[neighbour].remove(max_key)

        self.nodes.remove(max_key)
        self.removed_nodes.append(max_key)

    def __has_edges(self) -> bool:
        """Return True while at least one node still has a neighbour."""
        return any(self.graph.values())

    def clean(self) -> None:
        """Remove highest-degree nodes until the graph has no edges."""
        print("Cleaning algorithm started")
        while self.__has_edges():
            self.__remove_highest_degree_node()
        print("Cleaning algorithm finished")

    def draw(self, is_blocked: bool = True) -> None:
        """Plot the current state of the graph with matplotlib.

        Args:
            is_blocked: forwarded to ``plt.show(block=...)``.
        """
        plt.figure()
        G = nx.Graph()
        G.add_nodes_from(self.nodes)
        for node, neighbours in self.graph.items():
            for neighbour in neighbours:
                G.add_edge(node, neighbour)
        nx.draw(G, with_labels=True)
        plt.show(block=is_blocked)


def main():
    """Check a denial constraint on the taxes dataset and suggest removals.

    Loads ``taxes_2.csv``, verifies the denial constraint with Desbordante's
    DC verification algorithm while collecting violations, draws the
    violation graph before and after greedy cleaning, and prints the records
    that ``DataCleaner`` removed.
    """
    # Path relative to the repository root, so run the example from there.
    TABLE_1 = 'examples/datasets/taxes_2.csv'
    DC = "!(s.State == t.State and s.Salary < t.Salary and s.FedTaxRate > t.FedTaxRate)"
    SEPARATOR = ','
    HAS_HEADER = True

    print("Data loading started")
    verificator = db.dc_verification.algorithms.Default()
    verificator.load_data(table=(TABLE_1, SEPARATOR, HAS_HEADER))
    print("Data loading finished")

    DO_COLLECT_VIOLATIONS = True

    print("Algo execution started")

    verificator.execute(denial_constraint=DC, do_collect_violations=DO_COLLECT_VIOLATIONS)

    print("Algo execution finished")

    dc_holds = verificator.dc_holds()

    print("DC " + DC + " holds: " + str(dc_holds))

    # NOTE(review): presumably pairs of record indices that violate the DC
    # together -- confirm against the Desbordante API.
    violations = verificator.get_violations()
    cleaner = DataCleaner(violations)

    # Show the conflict graph without blocking, clean it, then show the result.
    cleaner.draw(False)
    cleaner.clean()
    cleaner.draw()

    nodes = sorted(cleaner.removed_nodes)
    # Join outside the f-string: reusing double quotes inside a replacement
    # field is a syntax error on Python versions older than 3.12.
    removed = ", ".join(map(str, nodes))
    print(f"Records to be removed: {removed}")


# Run the example only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()

0 comments on commit 224b370

Please sign in to comment.