Issue-5: Report invalid and below threshold combinations

PiotrZakrzewski · PiotrZakrzewski · commit 66517850dd79 · 2020-11-24T08:21:51.000+01:00
diff --git a/mbdiff/__main__.py b/mbdiff/__main__.py
@@ -1,7 +1,7 @@
 import click
 from mbdiff.diff_query import DiffQuery
 from mbdiff.diff import diff_file
-from mbdiff.pretty_print import present_explanations
+from mbdiff.pretty_print import present_explanations, present_invalid
 
 
 @click.command()
@@ -13,10 +13,19 @@
 def main(data, min_support, min_risk, max_order, query):
     metric, op, value = query.split()
     query = DiffQuery(metric, op, value)
-    explanations = diff_file(data, query, max_order, min_risk, min_support)
-    print("Explanations")
-    explanations = sorted(explanations, key=lambda x: x[0], reverse=True)
-    print(present_explanations(explanations))
+    explanations, invalid = diff_file(data, query, max_order, min_risk, min_support)
+    if explanations:
+        explanations = sorted(explanations, key=lambda x: x[0], reverse=True)
+        print("Explanations")
+        print(present_explanations(explanations))
+    else:
+        print("Could not find any explanations for this input")
+    if invalid:
+        print("Attribute combinations below thresholds")
+        print(present_invalid(invalid))
+    else:
+        print("There were no invalid or below threshold attribute combinations")
+
 
 
 if __name__ == "__main__":
diff --git a/mbdiff/diff.py b/mbdiff/diff.py
@@ -2,6 +2,7 @@
 from mbdiff.diff_query import DiffQuery
 from mbdiff.risk_ratio import risk_ratio
 from mbdiff.attribute_mining import get_combs
+from numpy import nan
 
 
 def diff_file(
@@ -10,7 +11,7 @@ def diff_file(
     max_order: int,
     min_risk: float,
     min_support: float,
-) -> list:
+):
     """Given a tab delimited file and a distinguishing metric return explanations."""
     df = read_csv(path_to_df)
     print("Outliers:")
@@ -20,17 +21,22 @@ def diff_file(
 
 def diff(
     df: DataFrame, query: DiffQuery, max_order: int, min_risk: float, min_support: float
-) -> list:
+) -> tuple:
+    """Return (explanations, invalid): (rr, combination) pairs with rr >= min_risk, and the rest."""
     query.mark_groups(df)
     ignored_cols = ["outlier"]
     # ignore all non categorical columns
     for i, column in enumerate(df.columns):
         if df.dtypes[i] != "object":
             ignored_cols.append(column)
     combinations = get_combs(df, max_order, min_support, ignored_cols)
-    results = []
+    results, invalid = [], []
     for combination in combinations:
         rr = risk_ratio(combination, df)
-        if rr >= min_risk:
-            results.append((rr, combination))
-    return results
+        # NaN fails every ordering comparison, so it is tested explicitly (rr != rr).
+        if rr != rr or rr < min_risk:  # rr == min_risk still counts as an explanation
+            invalid.append(combination)
+        else:
+            results.append((rr, combination))
+
+    return results, invalid
diff --git a/mbdiff/pretty_print.py b/mbdiff/pretty_print.py
@@ -1,5 +1,5 @@
 """Present explanations in a more approachable way."""
-from typing import List
+from typing import List, Dict
 from tabulate import tabulate
 from pandas import DataFrame
 
@@ -14,3 +14,8 @@ def present_explanations(explanations: List) -> str:
     # NaN means "any value", represent as "-" just like in the original paper
     pres_df.fillna("-", inplace=True)
     return tabulate(pres_df, headers="keys")
+
+def present_invalid(combinations: List[Dict]) -> str:
+    """Tabulate rejected attribute combinations: one row per combination, keys as columns."""
+    pres_df = DataFrame(combinations).fillna("-")  # "-" for unset attributes, as in present_explanations
+    return tabulate(pres_df, headers="keys")
diff --git a/mbdiff/risk_ratio.py b/mbdiff/risk_ratio.py
@@ -1,4 +1,5 @@
 from pandas import DataFrame
+from numpy import nan
 
 
 def calc_support(df, column, value) -> float:
@@ -20,11 +21,11 @@ def risk_ratio(attr_combination: dict, df) -> float:
     top_d = a0 + ai
     denom_d = b0 + bi
     if top_d == 0 or denom_d == 0:
-        return 0
+        return nan
     top = a0 / top_d
     denom = b0 / denom_d
     if denom < 0.01:
-        return 0.0
+        return nan
     return top / denom
 
 
diff --git a/mbdiff/tests/test_risk_ratio.py b/mbdiff/tests/test_risk_ratio.py
@@ -1,11 +1,12 @@
 from mbdiff.risk_ratio import calc_support, risk_ratio
 import pytest
+from numpy import nan
 
 
 def test_risk_basic_null_1(df_outliers):
-    """Due to lack of cat2 in the inliers the result will be 0.0."""
+    """Due to lack of cat2 in the inliers the risk ratio is undefined, i.e. NaN."""
     comb = {"cats": "cat2"}
-    assert risk_ratio(comb, df_outliers) == pytest.approx(0.0)
+    assert risk_ratio(comb, df_outliers) == pytest.approx(nan, nan_ok=True)
 
 
 def test_risk_basic_null_2(df_outliers):