From c97487553b019ed5f1d12e26bacfeed9d8864aba Mon Sep 17 00:00:00 2001 From: Igor Babuschkin Date: Sun, 25 Oct 2015 13:30:54 +0000 Subject: [PATCH 1/7] Add normalization code --- mle/model.py | 50 +++++++++++++++++++++++++++++++++++++++++-------- mle/variable.py | 4 ++-- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/mle/model.py b/mle/model.py index f96819b..1e36177 100644 --- a/mle/model.py +++ b/mle/model.py @@ -1,10 +1,12 @@ import logging from time import clock +import math import numpy as np from scipy.optimize import minimize from theano import function, gof, shared, config import theano.tensor as T +from scipy.integrate import nquad from mle.util import memoize @@ -17,7 +19,8 @@ def __init__(self): self.submodels = dict() def fit(self, data, init, method='BFGS', verbose=False): - data_args = [] + bounds = [] + lengths = [] shared_params = [] for var in self.observed: try: @@ -28,8 +31,11 @@ def fit(self, data, init, method='BFGS', verbose=False): except: raise ValueError('The fitted dataset must support string indexing') - data_args.append(np.array(data[var.name])) - shared_params.append((var, shared(data[var.name].astype(config.floatX), borrow=True))) + bounds.append((var._lower, var._upper)) + this_data = np.array(data[var.name]) + this_data = this_data[(this_data > var._lower) & (this_data < var._upper)] + lengths.append(len(this_data)) + shared_params.append((var, shared(this_data.astype(config.floatX), borrow=True))) const = [] x0 = [] @@ -41,13 +47,41 @@ def fit(self, data, init, method='BFGS', verbose=False): else: x0.append(init[par.name]) - logp = function(self.constant + self.floating, -T.sum(self._logp), - givens=shared_params, allow_input_downcast=True) - g_logp = function(self.constant + self.floating, T.grad(-T.sum(self._logp), self.floating), - givens=shared_params, allow_input_downcast=True) + scalars = [T.dscalar(x.name) for x in self.observed] + toscalar = list(zip(self.observed, scalars)) + + pdf = function(scalars + self.constant + self.floating, + T.exp(self._logp), + givens=toscalar, + rebuild_strict=False, + allow_input_downcast=True) + + assert(len(set(lengths)) == 1) + N = lengths[0] + + def normalization(parameters): + ret = nquad(pdf, bounds, args=parameters)[0] + print(bounds) + print(ret) + return ret + + logp = function(self.constant + self.floating, + -T.sum(self._logp), + givens=shared_params, + allow_input_downcast=True) + + g_logp = function(self.constant + self.floating, + T.grad(-T.sum(self._logp), self.floating), + givens=shared_params, + allow_input_downcast=True) def func(pars): - return logp(*(const + list(pars))) + val = logp(*(const + list(pars))) + print(val) + if np.isinf(val): + return 1e6 + else: + return val - N * math.log(normalization(const + list(pars))) def g_func(pars): return np.array(g_logp(*(const + list(pars)))) diff --git a/mle/variable.py b/mle/variable.py index b80cc5c..69111c8 100644 --- a/mle/variable.py +++ b/mle/variable.py @@ -20,7 +20,7 @@ def var(name, label=None, observed=False, const=False, vector=False, lower=None, var._label = label var._observed = observed var._const = observed or const - var._lower = lower or -np.inf - var._upper = upper or np.inf + var._lower = -np.inf if lower is None else lower + var._upper = np.inf if upper is None else upper return var From 14c992b3e5fcdb9f6c5b516b117aac06616d1ffb Mon Sep 17 00:00:00 2001 From: Igor Babuschkin Date: Sun, 25 Oct 2015 16:04:01 +0000 Subject: [PATCH 2/7] Fix sign for normalization --- mle/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mle/model.py b/mle/model.py index 1e36177..2414e31 100644 --- a/mle/model.py +++ b/mle/model.py @@ -81,7 +81,7 @@ def func(pars): if np.isinf(val): return 1e6 else: - return val - N * math.log(normalization(const + list(pars))) + return val + N * math.log(normalization(const + list(pars))) def g_func(pars): return np.array(g_logp(*(const + list(pars)))) From 1c6cb925e689f45a7680000f2e57bc6175384b7c Mon Sep 17 00:00:00 2001 From: Igor Babuschkin Date: Sun, 25 Oct 2015 16:28:26 +0000 Subject: [PATCH 3/7] Add normalization and plotting function to results --- mle/model.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mle/model.py b/mle/model.py index 2414e31..9115e10 100644 --- a/mle/model.py +++ b/mle/model.py @@ -61,8 +61,6 @@ def fit(self, data, init, method='BFGS', verbose=False): def normalization(parameters): ret = nquad(pdf, bounds, args=parameters)[0] - print(bounds) - print(ret) return ret logp = function(self.constant + self.floating, @@ -77,7 +75,6 @@ def normalization(parameters): def func(pars): val = logp(*(const + list(pars))) - print(val) if np.isinf(val): return 1e6 else: @@ -88,16 +85,28 @@ def g_func(pars): logging.info('Minimizing negative log-likelihood of model...') + names = [x.name for x in self.parameters] + start = clock() if method.upper() == 'MINUIT': from .minuit import fmin_minuit results = fmin_minuit(func, x0, map(str, self.floating), verbose=verbose) else: results = minimize(func, method=method, jac=g_func, x0=x0, options={'disp': True}) - names = [x.name for x in self.parameters] results.x = {n: x for n, x in zip(names, results.x)} fit_time = clock() - start + estimated_values = [results.x[name] for name in names] + + norm = normalization(estimated_values) + results['norm'] = norm + + def plot_func(*args): + pdf_v = np.vectorize(pdf, excluded=range(len(args), len(args) + len(estimated_values))) + return pdf_v(*(list(args) + estimated_values)) / norm + + results['func'] = plot_func + # Add constant parameters to results for par in self.parameters: if par._const: From 6d8c570d1c6e0128e4f200f4e419588ff5fe8ca7 Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Sun, 25 Oct 2015 17:07:59 +0000 Subject: [PATCH 4/7] Fix Python3 support for using iminuit --- mle/minuit.py | 2 +- mle/model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mle/minuit.py b/mle/minuit.py index 2c29a05..2373892 100644 --- a/mle/minuit.py +++ b/mle/minuit.py @@ -27,7 +27,7 @@ def fmin_minuit(func, x0, names=None, verbose=False): print_level = 0 if names is None: - names = map(lambda x: 'param' + str(x), range(len(x0))) + names = list(map(lambda x: 'param' + str(x), range(len(x0)))) else: assert(len(x0) == len(names)) diff --git a/mle/model.py b/mle/model.py index 9115e10..4b31632 100644 --- a/mle/model.py +++ b/mle/model.py @@ -90,7 +90,7 @@ def g_func(pars): start = clock() if method.upper() == 'MINUIT': from .minuit import fmin_minuit - results = fmin_minuit(func, x0, map(str, self.floating), verbose=verbose) + results = fmin_minuit(func, x0, list(map(str, self.floating)), verbose=verbose) else: results = minimize(func, method=method, jac=g_func, x0=x0, options={'disp': True}) results.x = {n: x for n, x in zip(names, results.x)} From 02c8e9fe8852a1801d9af95182f7899731d22593 Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Sun, 25 Oct 2015 17:13:49 +0000 Subject: [PATCH 5/7] Fix flake8 errors --- mle/model.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/mle/model.py b/mle/model.py index 4b31632..ae6a03e 100644 --- a/mle/model.py +++ b/mle/model.py @@ -50,33 +50,32 @@ def fit(self, data, init, method='BFGS', verbose=False): scalars = [T.dscalar(x.name) for x in self.observed] toscalar = list(zip(self.observed, scalars)) - pdf = function(scalars + self.constant + self.floating, - T.exp(self._logp), - givens=toscalar, - rebuild_strict=False, - allow_input_downcast=True) + pdf = function( + scalars + self.constant + self.floating, T.exp(self._logp), + givens=toscalar, rebuild_strict=False, allow_input_downcast=True + ) assert(len(set(lengths)) == 1) N = lengths[0] def normalization(parameters): - ret = nquad(pdf, bounds, args=parameters)[0] + ret = nquad(pdf, bounds, args=parameters)[0] return ret - logp = function(self.constant + self.floating, - -T.sum(self._logp), - givens=shared_params, - allow_input_downcast=True) + logp = function( + self.constant + self.floating, -T.sum(self._logp), + givens=shared_params, allow_input_downcast=True + ) - g_logp = function(self.constant + self.floating, - T.grad(-T.sum(self._logp), self.floating), - givens=shared_params, - allow_input_downcast=True) + g_logp = function( + self.constant + self.floating, T.grad(-T.sum(self._logp), self.floating), + givens=shared_params, allow_input_downcast=True + ) def func(pars): val = logp(*(const + list(pars))) if np.isinf(val): - return 1e6 + return 1e6 else: return val + N * math.log(normalization(const + list(pars))) From 4346eea288f5b2e1a0166685003e2bc705ec8662 Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Sun, 25 Oct 2015 20:56:52 +0000 Subject: [PATCH 6/7] Improve logging and reject invalid integrals --- mle/model.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/mle/model.py b/mle/model.py index ae6a03e..c89c4d4 100644 --- a/mle/model.py +++ b/mle/model.py @@ -1,12 +1,12 @@ import logging -from time import clock import math +from time import clock import numpy as np +from scipy.integrate import nquad from scipy.optimize import minimize from theano import function, gof, shared, config import theano.tensor as T -from scipy.integrate import nquad from mle.util import memoize @@ -73,11 +73,22 @@ def normalization(parameters): ) def func(pars): - val = logp(*(const + list(pars))) + pars = const + list(pars) + func.count += 1 + logging.debug('Starting iteration {} with parameters {}'.format(func.count, pars)) + + val = logp(*pars) if np.isinf(val): return 1e6 + logging.debug(' > Unnormalised log-likelihood = {}'.format(val)) + + norm = normalization(pars) + logging.debug(' > Normalisation = {}'.format(norm)) + if norm == 0: + return np.inf else: - return val + N * math.log(normalization(const + list(pars))) + return val + N*math.log(norm) + func.count = 0 def g_func(pars): return np.array(g_logp(*(const + list(pars)))) @@ -88,7 +99,7 @@ def g_func(pars): start = clock() if method.upper() == 'MINUIT': - from .minuit import fmin_minuit + from mle.minuit import fmin_minuit results = fmin_minuit(func, x0, list(map(str, self.floating)), verbose=verbose) else: results = minimize(func, method=method, jac=g_func, x0=x0, options={'disp': True}) From bfbf11438363cb3f72822f7ac8f58f3c1e5cb2b9 Mon Sep 17 00:00:00 2001 From: Christopher Burr Date: Thu, 5 Nov 2015 17:46:13 +0000 Subject: [PATCH 7/7] Add plot() method to the fit results object --- mle/model.py | 6 +++-- mle/util.py | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 2 deletions(-) diff --git a/mle/model.py b/mle/model.py index c89c4d4..779877c 100644 --- a/mle/model.py +++ b/mle/model.py @@ -8,7 +8,7 @@ from theano import function, gof, shared, config import theano.tensor as T -from mle.util import memoize +from mle import util __all__ = ['Model'] @@ -117,6 +117,8 @@ def plot_func(*args): results['func'] = plot_func + results.plot = util.make_plotter(data, results, self.observed) + # Add constant parameters to results for par in self.parameters: if par._const: @@ -134,7 +136,7 @@ def _add_submodel(self, name, model): def _add_compiled_expr(self, name, expr): - @memoize + @util.memoize def compiler(): logging.info('Compiling {}...'.format(name)) return function(self.observed + self.parameters, expr, allow_input_downcast=True) diff --git a/mle/util.py b/mle/util.py index a51a3a6..2d3fc37 100644 --- a/mle/util.py +++ b/mle/util.py @@ -1,5 +1,9 @@ +import collections import functools +from matplotlib import pylab +import numpy as np +from scipy import integrate from theano import Variable, scan from theano.tensor import arange, grad, stack from theano.gradient import format_as @@ -78,3 +82,65 @@ def hashable(a): return tuple(map(hashable, a)) except: return a + + +def make_plotter(data, results, observables): + """Make a plot function for `results`.""" + assert(len(observables) == 1) + observable = observables[0] + + # TODO Find a nicer solution to having this wrapper + def _pdf(data): + if isinstance(data, collections.Iterable): + return np.array([_pdf(val) for val in data]) + else: + return float(results['func'](data)) + + def _plot(*args, **kwargs): + return plot_fitted_hist(_pdf, data, observable, *args, **kwargs) + + return _plot + + +def plot_fitted_hist(pdf, data, observable, nbins=None, lower=None, upper=None, residuals=True): + data = data[observable.name] + # Configuration + nbins = nbins or 100 + lower = lower or max(observable._lower, min(data)) + upper = upper or min(observable._upper, max(data)) + + figure = pylab.figure(1) + + # Plot the data in a histogram with sqrt(N) error bars + figure.add_axes((.1, .3, .8, .8)) + bin_vals, bin_edges = np.histogram( + data[(lower < data) & (data < upper)], + bins=np.linspace(lower, upper, nbins+1) + ) + bin_centers = bin_edges[:-1] + 0.5*np.diff(bin_edges) + pylab.errorbar( + bin_centers, bin_vals, + xerr=0.5*np.diff(bin_edges), yerr=np.sqrt(bin_vals), fmt='none' + ) + + # Plot fitted pdf + x = np.linspace(lower, upper, nbins*10) + integral = integrate.quad(pdf, lower, upper)[0] + correction = sum(bin_vals*np.diff(bin_edges))/integral + y = pdf(x)*correction + + pylab.plot(x, y, lw=1.5) + pylab.xlim([lower, upper]) + pylab.tick_params(top='off', right='off', left='off') + + if not residuals: + return + + # Make the residual plot + figure.add_axes((.1, .04, .8, .2)) + difference = (pdf(bin_centers)*correction - bin_vals) / np.sqrt(bin_vals) + pylab.bar(bin_edges[:-1], difference, np.diff(bin_edges)) + pylab.xlim([lower, upper]) + pylab.ylim([-4, 4]) + pylab.yticks(np.arange(-4, 5, 1)) + pylab.tick_params(top='off', right='off', left='off')