Skip to content

Commit

Permalink
Merge pull request #30 from flennerhag/dev
Browse files Browse the repository at this point in the history
1.2
  • Loading branch information
flennerhag authored May 18, 2017
2 parents 3720221 + 9273cbd commit 40c6563
Show file tree
Hide file tree
Showing 43 changed files with 3,296 additions and 1,123 deletions.
178 changes: 106 additions & 72 deletions benchmarks/scale_comp.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,76 @@
"""ML-ENSEMBLE
Comparison of multithreading performance as dataset scale increases.
Comparison of multiprocessing performance as data scales.
Example Output
--------------
ML-ENSEMBLE
Threading performance test for data set dimensioned up to (1000000, 10)
Threading performance test for data set dimensioned up to (10000, 50)
Available CPUs: 4
Ensemble architecture
Num layers: 2
Fit per base layer estimator: 4 + 1
layer-1 | Estimators: ['RandomForestRegressor', 'GradientBoostingRegressor', 'ElasticNet', 'KNeighborsRegressor'].
layer-2 | Meta Estimator: Lasso
Fit per base layer estimator: 2 + 1
layer-1 | Estimators: ['svr-1', 'svr-2', 'svr-3', 'svr-4'].
layer-2 | Meta Estimator: svr
FIT TIMES
samples | cores: 1 | cores: 2 | cores: 4 |
samples
1000 SuperLearner (1) : 0.88s | BlendEnsemble (1) : 0.35s |
1000 SuperLearner (4) : 0.71s | BlendEnsemble (4) : 0.41s |
2000 SuperLearner (1) : 2.82s | BlendEnsemble (1) : 0.76s |
2000 SuperLearner (4) : 1.51s | BlendEnsemble (4) : 0.59s |
3000 SuperLearner (1) : 6.04s | BlendEnsemble (1) : 1.56s |
3000 SuperLearner (4) : 2.96s | BlendEnsemble (4) : 0.90s |
4000 SuperLearner (1) : 10.94s | BlendEnsemble (1) : 2.79s |
4000 SuperLearner (4) : 7.92s | BlendEnsemble (4) : 1.53s |
5000 SuperLearner (1) : 18.45s | BlendEnsemble (1) : 4.58s |
5000 SuperLearner (4) : 8.52s | BlendEnsemble (4) : 2.26s |
6000 SuperLearner (1) : 27.48s | BlendEnsemble (1) : 7.24s |
6000 SuperLearner (4) : 15.06s | BlendEnsemble (4) : 3.41s |
7000 SuperLearner (1) : 38.73s | BlendEnsemble (1) : 8.62s |
7000 SuperLearner (4) : 18.21s | BlendEnsemble (4) : 4.41s |
8000 SuperLearner (1) : 52.08s | BlendEnsemble (1) : 12.10s |
8000 SuperLearner (4) : 23.43s | BlendEnsemble (4) : 4.95s |
9000 SuperLearner (1) : 61.70s | BlendEnsemble (1) : 14.58s |
9000 SuperLearner (4) : 28.55s | BlendEnsemble (4) : 8.45s |
10000 SuperLearner (1) : 75.76s | BlendEnsemble (1) : 18.72s |
10000 SuperLearner (4) : 32.71s | BlendEnsemble (4) : 7.52s |
Benchmark done | 00:09:00
Plotting results... done.
Figure written to .../scale_comp_1.png
"""

import os
import numpy as np

import os
from mlens.ensemble import SuperLearner, Subsemble, BlendEnsemble
from mlens.ensemble import SuperLearner, BlendEnsemble
from mlens.utils import print_time

from sklearn.datasets import make_friedman1
from sklearn.svm import SVR
from sklearn.base import clone


from sklearn.neighbors import KNeighborsRegressor
from sklearn.datasets import make_friedman1
from time import perf_counter

PLOT = True
if PLOT:
try:
import matplotlib.pyplot as plt
plt.ion()

except ImportError:
print("Could not import matplotlib. Will ignore PLOT flag.")
PLOT = False

MAX = int(50000)
STEP = int(5000)
COLS = 5
ENS = [SuperLearner, BlendEnsemble]
KWG = [{'folds': 2}, {}]
MAX = int(1e4)
STEP = int(1e3)
COLS = 50

SEED = 2017
np.random.seed(SEED)
Expand All @@ -57,56 +80,59 @@ def build_ensemble(kls, **kwargs):
"""Generate ensemble of class kls."""

ens = kls(**kwargs)

ens.add([KNeighborsRegressor(), KNeighborsRegressor(),
KNeighborsRegressor(), KNeighborsRegressor()])

ens.add(KNeighborsRegressor())

ens.add([SVR() for _ in range(4)])
ens.add_meta(SVR())
return ens


if __name__ == '__main__':

print("\nML-ENSEMBLE\n")
print("Threading performance test for data set dimensioned up "
"to (%i, %i)" % (MAX, COLS))

c = os.cpu_count()
print("Available CPUs: %i\n" % c)
cores = [1, c]

cores = [int(np.floor(i)) for i in np.linspace(1, c, 3)]

ens = [[build_ensemble(kls, n_jobs=i, )
for kls in [SuperLearner, Subsemble, BlendEnsemble]]
ens = [[build_ensemble(kls, n_jobs=i, **kwd)
for kls, kwd in zip(ENS, KWG)]
for i in cores]

###########################################################################
# PRINTED MESSAGE
print("\nML-ENSEMBLE\n")
print("Threading performance test for data set "
"dimensioned up to (%i, %i)" % (MAX, COLS))
print("Available CPUs: %i\n" % c)
print('Ensemble architecture')
print("Num layers: %i" % ens[0][0].layers.n_layers)
print("Fit per base layer estimator: %i + 1" % ens[0][0].folds)

for lyr in ens[0][0].layers.layers:
if int(lyr[-1]) == ens[0][0].layers.n_layers:
continue
print('%s | Estimators: %r.' % (lyr, [e for e, _ in
ens[0][0].layers.layers[
lyr].estimators]))
print("%s | Meta Estimator: %s" % ('layer-2', ens[0][0].layers.layers[
'layer-2'].estimators[0][0]))

print('%s | Estimators: %r.' %
(lyr, [e for e, _ in ens[0][0].layers.layers[lyr].estimators]))

print("%s | Meta Estimator: %s" %
('layer-2', ens[0][0].layers.layers['layer-2'].estimators[0][0]))

print('\nFIT TIMES')
print('%7s' % 'samples', flush=True)

ts = perf_counter()
###########################################################################
# ESTIMATION
times = {i: {kls().__class__.__name__: []
for kls in [SuperLearner, Subsemble, BlendEnsemble]}
for kls in [SuperLearner, BlendEnsemble]}
for i in cores}

ts = perf_counter()
for s in range(STEP, MAX + STEP, STEP):
X, y = make_friedman1(n_samples=s, random_state=SEED)

X, y = make_friedman1(n_samples=s, n_features=COLS, random_state=SEED)

# Iterate over number of cores to fit with
for n, etypes in zip(cores, ens):

print('%7i' % s, end=" ", flush=True)

# Iterate over ensembles with given number of cores
for e in etypes:
name = e.__class__.__name__
e = clone(e)
Expand All @@ -117,33 +143,41 @@ def build_ensemble(kls, **kwargs):

times[n][name].append(t1)

print('%s (%i) : %4.2f |' % (name, n, t1), end=" ", flush=True)
print('%s (%i) : %6.2fs |' % (name, n, t1),
end=" ", flush=True)
print()
print()

print_time(ts, "Benchmark done")

if PLOT:
print("Plotting results...", end=" ", flush=True)
x = range(STEP, MAX + STEP, STEP)

cm = [plt.cm.rainbow(i) for i in np.linspace(0, 1.0,
int(3 *len(cores)))
]
plt.figure(figsize=(8, 8))

i = 0
for n in cores:
for s, e in times[n].items():
ax = plt.plot(x, e, color=cm[i], marker='.',
label='%s (%i)' % (s, n))
i += 1

plt.title('Benchmark of time to fit')
plt.xlabel('Sample size')
plt.ylabel('Time to fit (sec)')
plt.legend(frameon=False)

f = os.path.join('scale_comp.png')
plt.savefig(f, bbox_inches='tight', dpi=600)
print("done.\nFigure written to %s" % f)
try:
import matplotlib.pyplot as plt

plt.ion()
print("Plotting results...", end=" ", flush=True)

plt.figure(figsize=(8, 8))

x = range(STEP, MAX + STEP, STEP)
cm = [plt.cm.rainbow(i)
for i in np.linspace(0, 1.0, int(3 * len(cores)))]

i = 0
for n in cores:
for s, e in times[n].items():
ax = plt.plot(x, e, color=cm[i], marker='.',
label='%s (%i)' % (s, n))
i += 1

plt.title('Benchmark of time to fit')
plt.xlabel('Sample size')
plt.ylabel('Time to fit (sec)')
plt.legend(frameon=False)

f = os.path.join(os.getcwd(), 'scale_benchmark.png')
plt.savefig(f, bbox_inches='tight', dpi=600)
print("done.\nFigure written to %s" % f)

except ImportError:
print("Could not import matplotlib. Will ignore PLOT flag.")
2 changes: 1 addition & 1 deletion docs/benchmarks.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
.. _benchmarks:

Performance Benchmarks
Performance benchmarks
======================

The Friedman Regression Problem 1
Expand Down
9 changes: 6 additions & 3 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, as shown here.
#
import subprocess
import os
import sys
from mlens import __version__
import subprocess
import sphinx_rtd_theme


# Ensure numpydoc is installed (invokes pip whenever this config is executed)
subprocess.run(['pip', 'install', 'numpydoc'])

Expand Down Expand Up @@ -59,6 +61,7 @@

# Intersphinx options
intersphinx_mapping = {'http://scikit-learn.org/stable/': None,
'http://docs.scipy.org/doc/numpy/': None,
'http://matplotlib.org/': None,
'http://pandas.pydata.org/pandas-docs/stable/': None}

Expand All @@ -84,9 +87,9 @@
# built documents.
#
# The short X.Y version.
version = '0.0.1'
version = str(__version__)[:5]
# The full version, including alpha/beta/rc tags.
release = '0.0.1.dev0'
release = str(__version__)

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
Loading

0 comments on commit 40c6563

Please sign in to comment.