Skip to content

Commit 4909830

Browse files
authored
Merge pull request #30 from vecxoz/dev
Fix #29 and maintenance for tests
2 parents cd7f6ef + 25de6f3 commit 4909830

7 files changed

+318
-165
lines changed

tests/test_func_api_classification_binary.py

Lines changed: 61 additions & 41 deletions
Large diffs are not rendered by default.

tests/test_func_api_classification_multiclass.py

Lines changed: 61 additions & 41 deletions
Large diffs are not rendered by default.

tests/test_func_api_regression.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from scipy.sparse import coo_matrix
2525
from sklearn.model_selection import cross_val_predict
2626
from sklearn.model_selection import cross_val_score
27-
from sklearn.model_selection import train_test_split
27+
# from sklearn.model_selection import train_test_split
2828
from sklearn.model_selection import KFold
2929
from sklearn.datasets import load_boston
3030
from sklearn.metrics import mean_absolute_error
@@ -39,7 +39,27 @@
3939

4040
boston = load_boston()
4141
X, y = boston.data, boston.target
42-
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
42+
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
43+
44+
45+
# Make train/test split by hand to avoid strange errors, probably related to the testing suite:
46+
# https://github.com/scikit-learn/scikit-learn/issues/1684
47+
# https://github.com/scikit-learn/scikit-learn/issues/1704
48+
# Note: Python 2.7, 3.4 - OK, but 3.5, 3.6 - error
49+
50+
np.random.seed(0)
51+
ind = np.arange(500)
52+
np.random.shuffle(ind)
53+
54+
ind_train = ind[:400]
55+
ind_test = ind[400:]
56+
57+
X_train = X[ind_train]
58+
X_test = X[ind_test]
59+
60+
y_train = y[ind_train]
61+
y_test = y[ind_test]
62+
4363

4464
#-------------------------------------------------------------------------------
4565
#-------------------------------------------------------------------------------

tests/test_sklearn_api_classification_binary.py

Lines changed: 59 additions & 39 deletions
Large diffs are not rendered by default.

tests/test_sklearn_api_classification_multiclass.py

Lines changed: 59 additions & 39 deletions
Large diffs are not rendered by default.

tests/test_sklearn_api_regression.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from sklearn.base import RegressorMixin
2626
from sklearn.model_selection import cross_val_predict
2727
from sklearn.model_selection import cross_val_score
28-
from sklearn.model_selection import train_test_split
28+
# from sklearn.model_selection import train_test_split
2929
from sklearn.model_selection import KFold
3030
from sklearn.model_selection import GridSearchCV
3131
from sklearn.model_selection import RandomizedSearchCV
@@ -50,7 +50,27 @@
5050

5151
boston = load_boston()
5252
X, y = boston.data, boston.target
53-
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
53+
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
54+
55+
56+
# Make train/test split by hand to avoid strange errors, probably related to the testing suite:
57+
# https://github.com/scikit-learn/scikit-learn/issues/1684
58+
# https://github.com/scikit-learn/scikit-learn/issues/1704
59+
# Note: Python 2.7, 3.4 - OK, but 3.5, 3.6 - error
60+
61+
np.random.seed(0)
62+
ind = np.arange(500)
63+
np.random.shuffle(ind)
64+
65+
ind_train = ind[:400]
66+
ind_test = ind[400:]
67+
68+
X_train = X[ind_train]
69+
X_test = X[ind_test]
70+
71+
y_train = y[ind_train]
72+
y_test = y[ind_test]
73+
5474

5575
# -----------------------------------------------------------------------------
5676
# Scikit-learn INcompatible estimator

vecstack/coresk.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -851,6 +851,35 @@ def _estimator_action(self, estimator, X_train, y_train, X_test,
851851
# -------------------------------------------------------------------------
852852
# -------------------------------------------------------------------------
853853

854+
def _random_choice(self, n, size, bound=2**30):
855+
"""
856+
Memory efficient (but slower) version of np.random.choice
857+
858+
Parameters:
859+
===========
860+
n : int
861+
Upper value for range to chose from: [0, n).
862+
This parameter is bounded (see bound).
863+
size: int
864+
Number of values to chose
865+
bound : int
866+
Upper random int for backward compatibility
867+
with some older numpy versions
868+
869+
Returns:
870+
========
871+
ids : 1d numpy array of shape (size, ) dtype=np.int32
872+
"""
873+
ids = []
874+
while len(ids) < size:
875+
rnd = np.random.randint(min(bound, n))
876+
if rnd not in ids:
877+
ids.append(rnd)
878+
return np.array(ids, dtype=np.int32)
879+
880+
# -------------------------------------------------------------------------
881+
# -------------------------------------------------------------------------
882+
854883
def _get_footprint(self, X, n_items=1000):
855884
"""Selects ``n_items`` random elements from 2d numpy array or
856885
sparse matrix (or all elements if their number is less or equal
@@ -861,7 +890,11 @@ def _get_footprint(self, X, n_items=1000):
861890
r, c = X.shape
862891
n = r * c
863892
# np.random.seed(0) # for development
864-
ids = np.random.choice(n, min(n_items, n), replace=False)
893+
894+
# OOM with large arrays (see #29)
895+
# ids = np.random.choice(n, min(n_items, n), replace=False)
896+
897+
ids = self._random_choice(n, min(n_items, n))
865898

866899
for i in ids:
867900
row = i // c

0 commit comments

Comments
 (0)