Skip to content

Commit 17a9b8e

Browse files
Issue-10: Replace naive implementation with apriori
1 parent 337c178 commit 17a9b8e

File tree

4 files changed

+77
-1
lines changed

4 files changed

+77
-1
lines changed

mbdiff/apriori.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from efficient_apriori import apriori
2+
import numpy as np
3+
4+
def df_to_apriori(df):
    """Convert a pandas DataFrame into transactions for efficient-apriori.

    Each row becomes one transaction: a tuple of ``"<column>:<value>"``
    strings, one per non-missing cell, in column order.

    Args:
        df: DataFrame whose cells are rendered with ``str`` formatting.

    Returns:
        A list with one tuple of item strings per row of *df*.
    """
    transactions = []
    for _, row in df.iterrows():
        # dropna() recognises every missing-value marker (None, any float
        # NaN, pd.NA), unlike an identity test against the np.nan singleton
        # which lets other NaN objects through as "col:nan" items.
        present = row.dropna()
        transactions.append(
            tuple(f"{col}:{val}" for col, val in present.items())
        )
    return transactions
18+
19+
20+
def format_res(apriori_results):
    """Convert apriori rules to the dict form Mb Diff expects.

    Args:
        apriori_results: iterable of rule objects exposing an ``lhs``
            iterable of ``"<column>:<value>"`` item strings.

    Returns:
        A list with one ``{column: value}`` dict per rule, built from the
        rule's left-hand side. Values are strings.

    Raises:
        ValueError: if an item string contains no ``":"`` separator.
    """
    explanations = []
    for rule in apriori_results:
        # maxsplit=1: only the first colon separates column from value, so
        # values that themselves contain ":" (e.g. "12:30") stay intact
        # instead of breaking the two-way unpacking.
        explanations.append(dict(term.split(":", 1) for term in rule.lhs))
    return explanations
31+
32+
33+
def explain(df, min_support, max_order):
    """Mine association rules from *df* and return those implying an outlier.

    The frame is turned into transactions, passed to ``apriori`` with
    *min_support* and ``max_length=max_order``, and only the rules whose
    right-hand side contains the item ``"outlier:outlier"`` are kept. The
    kept rules are returned in the dict form produced by ``format_res``.
    """
    transactions = df_to_apriori(df)
    _itemsets, rules = apriori(transactions, min_support, max_length=max_order)
    outlier_rules = []
    for rule in rules:
        if "outlier:outlier" in rule.rhs:
            outlier_rules.append(rule)
    return format_res(outlier_rules)

mbdiff/diff.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1+
from typing import List
12
from pandas import DataFrame, read_csv
23
from mbdiff.diff_query import DiffQuery, InvalidQuery
34
from mbdiff.risk_ratio import risk_ratio
45
from mbdiff.attribute_mining import get_combs
56
from numpy import nan
7+
from mbdiff.apriori import explain as aprio_explain
68

79

810
def diff_file(
@@ -26,11 +28,32 @@ def diff(
2628
):
2729
"""Return explanations and the attribute combinations that are invalid or below the support criterion."""
2830
query.mark_groups(df)
29-
ignored_cols = ["outlier"]
31+
ignored_cols = []
3032
# ignore all non categorical columns
3133
for i, column in enumerate(df.columns):
3234
if df.dtypes[i] != "object":
3335
ignored_cols.append(column)
36+
sel_cols = [col for col in df.columns if col not in ignored_cols]
37+
df = df[sel_cols]
38+
return _apriori_diff(df, max_order, min_support, min_risk, ignored_cols)
39+
40+
41+
def _apriori_diff(df: DataFrame, max_order: int, min_support: float, min_risk: float, ignored_cols: List):
    """Mine explanations close to macrobase DIFF, using Apriori.

    Args:
        df: frame handed to the apriori explanation miner and to
            ``risk_ratio``.
        max_order: maximum rule length passed to the miner.
        min_support: minimum support passed to the miner.
        min_risk: an explanation is kept only if its risk ratio is
            strictly greater than this value.
        ignored_cols: unused here; kept so the signature matches
            ``_naive_impl``.

    Returns:
        ``(results, invalid)`` where ``results`` is a list of
        ``(risk_ratio, explanation)`` tuples and ``invalid`` is a list of
        the explanations whose risk ratio is NaN or not above *min_risk*.
    """
    results, invalid = [], []
    for explanation in aprio_explain(df, min_support, max_order):
        rr = risk_ratio(explanation, df)
        # Any comparison with NaN is False, so a single "greater than" test
        # rejects every NaN (not just the numpy ``nan`` singleton that an
        # identity check would catch) along with ratios at or below the
        # threshold.
        if rr > min_risk:
            results.append((rr, explanation))
        else:
            invalid.append(explanation)
    return results, invalid
53+
54+
55+
def _naive_impl(df: DataFrame, max_order: int, min_support: float, min_risk: float, ignored_cols: List):
56+
"""My own naive implementation of attribute mining, akin to apriori but without minimization."""
3457
combinations = get_combs(df, max_order, min_support, ignored_cols)
3558
results, invalid = [], []
3659
for combination in combinations:

mbdiff/explanation.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""Module for abstracting away different implementations of explanation finding."""
2+
from typing import List
3+
4+
5+
class Explanation:
    """Representation of a single explanation.

    Attributes are stored exactly as passed to the constructor:
    ``attrs`` (the attribute terms), ``support`` and ``score``.
    """

    def __init__(self, attrs: List[str], support, score) -> None:
        self.attrs = attrs
        self.support = support
        self.score = score

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(attrs={self.attrs!r}, "
            f"support={self.support!r}, score={self.score!r})"
        )

    def __str__(self) -> str:
        """Return ``score``, ``support`` and each attr as tab-separated text."""
        # str() each attr so a stray non-string term cannot make join() raise.
        attrs = "\t".join(str(attr) for attr in self.attrs)
        return f"{self.score}\t{self.support}\t" + attrs

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@ six==1.15.0
1919
toml==0.10.2
2020
typed-ast==1.4.1
2121
typing-extensions==3.7.4.3
22+
efficient-apriori==1.1.1

0 commit comments

Comments
 (0)