From 4759d31666ca476a07c09a450e42a847abdc2b3d Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Dec 2013 21:12:34 -0800 Subject: [PATCH 01/31] Initial commit of clustered heatmap --- pandas/tools/plotting.py | 279 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 279 insertions(+) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index c4255e706b19f..184f0bbf7bfa2 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2488,6 +2488,285 @@ def _maybe_convert_date(x): x = conv_func(x) return x +# helper for cleaning up axes by removing ticks, tick labels, frame, etc. +def _clean_axis(ax): + """Remove ticks, tick labels, and frame from axis""" + ax.get_xaxis().set_ticks([]) + ax.get_yaxis().set_ticks([]) + for sp in ax.spines.values(): + sp.set_visible(False) + + +def _color_list_to_matrix_and_cmap(color_list, ind, row=True): + """ + For 'heatmap()' + This only works for 1-column color lists.. + TODO: Support multiple color labels on an element in the heatmap + """ + import matplotlib as mpl + + colors = set(color_list) + col_to_value = {col: i for i, col in enumerate(colors)} + +# ind = column_dendrogram_distances['leaves'] + matrix = np.array([col_to_value[col] for col in color_list])[ind] + print 'matrix.shape', matrix.shape, + print 'len(color_list)', len(color_list) + # Is this row-side or column side? + if row: + new_shape = (len(color_list), 1) + else: + new_shape = (1, len(color_list)) + matrix = matrix.reshape(new_shape) + + cmap = mpl.colors.ListedColormap(colors) + return matrix, cmap + + + + + +def heatmap(df, title=None, colorbar_label='values', + col_side_colors=None, row_side_colors=None, + color_scale='linear', cmap=None, + row_linkage_method='complete', + col_linkage_method='complete', + figsize=None, + label_rows=True, + label_cols=True, + + #col_labels=None, + #row_labels=None, + + xlabel_fontsize=12, + ylabel_fontsize=10, + cluster_cols=True, + cluster_rows=True, + plot_df=None): + + + """ + + @author Olga Botvinnik olga.botvinnik@gmail.com + + @param df: + @param title: + @param colorbar_label: + @param col_side_colors: + @param row_side_colors: + @param color_scale: + @param cmap: + @param figsize: + @param label_rows: Can be boolean or a list of strings, with exactly the + length of the number of rows in df. + @param label_cols: Can be boolean or a list of strings, with exactly the + length of the number of columns in df. + @param col_labels: + @param row_labels: + @param xlabel_fontsize: + @param ylabel_fontsize: + @param cluster_cols: + @param cluster_rows: + @param plot_df: + @return: @rtype: @raise TypeError: + """ + import matplotlib.pyplot as plt + import matplotlib.gridspec as gridspec + import scipy.spatial.distance as distance + import scipy.cluster.hierarchy as sch + import matplotlib as mpl + from collections import Iterable + + almost_black = '#262626' + sch.set_link_color_palette([almost_black]) + if type(plot_df) is None: + plot_df = df + + if any(plot_df.index != df.index): + raise TypeError('plot_df must have the exact same indices as df') + if any(plot_df.columns != df.columns): + raise TypeError('plot_df must have the exact same columns as df') + # make norm + divergent = df.max().max() > 0 and df.min().min() < 0 + + if color_scale == 'log': + vmin = max(np.floor(df.dropna(how='all').min().dropna().min()), 1e-10) + vmax = np.ceil(df.dropna(how='all').max().dropna().max()) + my_norm = mpl.colors.LogNorm(vmin, vmax) + print 'vmax', vmax + print 'vmin', vmin + elif divergent: + abs_max = abs(df.max().max()) + abs_min = abs(df.min().min()) + vmax = max(abs_max, abs_min) + my_norm = mpl.colors.Normalize(vmin=-vmax, vmax=vmax) + else: + my_norm = None + + if cmap is None: + cmap = mpl.cm.RdBu_r if divergent else mpl.cm.Blues_r + cmap.set_bad('white') + + # calculate pairwise distances for rows + row_pairwise_dists = distance.squareform(distance.pdist(df)) + row_clusters = sch.linkage(row_pairwise_dists, method=row_linkage_method) + + # calculate pairwise distances for columns + col_pairwise_dists = distance.squareform(distance.pdist(df.T)) + # cluster + col_clusters = sch.linkage(col_pairwise_dists, method=col_linkage_method) + + # heatmap with row names + dendrogram_height_fraction = df.shape[0] * 0.25 / df.shape[0] + dendrogram_width_fraction = df.shape[1] * 0.25 / df.shape[1] + width_ratios = [dendrogram_width_fraction, 1] \ + if row_side_colors is None else [dendrogram_width_fraction, 0.05, 1] + height_ratios = [dendrogram_height_fraction, 1] \ + if col_side_colors is None else [dendrogram_height_fraction, 0.05, 1] + nrows = 2 if col_side_colors is None else 3 + ncols = 2 if row_side_colors is None else 3 + + print 'width_ratios', width_ratios + print 'height_ratios', height_ratios + + width = df.shape[1] * 0.25 + height = min(df.shape[0] * .75, 40) + if figsize is None: + figsize = (width, height) + print figsize + + fig = plt.figure(figsize=figsize) + heatmap_gridspec = \ + gridspec.GridSpec(nrows, ncols, wspace=0.0, hspace=0.0, + width_ratios=width_ratios, + height_ratios=height_ratios) + # print heatmap_gridspec + + ### col dendrogram ### + column_dendrogram_ax = fig.add_subplot(heatmap_gridspec[0, ncols - 1]) + if cluster_cols: + column_dendrogram_distances = sch.dendrogram(col_clusters, + color_threshold=np.inf, + color_list=[ + ppl.almost_black]) + else: + column_dendrogram_distances = {'leaves': range(df.shape[1])} + _clean_axis(column_dendrogram_ax) + + ### col colorbar ### + if col_side_colors is not None: + column_colorbar_ax = fig.add_subplot(heatmap_gridspec[1, ncols - 1]) + col_side_matrix, col_cmap = _color_list_to_matrix_and_cmap( + col_side_colors, + ind=column_dendrogram_distances['leaves'], + row=False) + column_colorbar_ax_pcolormesh = column_colorbar_ax.pcolormesh( + col_side_matrix, cmap=col_cmap, + edgecolors='white', linewidth=0.1) + column_colorbar_ax.set_xlim(0, col_side_matrix.shape[1]) + _clean_axis(column_colorbar_ax) + + ### row dendrogram ### + row_dendrogram_ax = fig.add_subplot(heatmap_gridspec[nrows - 1, 0]) + if cluster_rows: + row_dendrogram_distances = \ + sch.dendrogram(row_clusters, + color_threshold=np.inf, + orientation='right', + color_list=[ppl.almost_black]) + else: + row_dendrogram_distances = {'leaves': range(df.shape[0])} + _clean_axis(row_dendrogram_ax) + + ### row colorbar ### + if row_side_colors is not None: + row_colorbar_ax = fig.add_subplot(heatmap_gridspec[nrows - 1, 1]) + row_side_matrix, row_cmap = _color_list_to_matrix_and_cmap( + row_side_colors, + ind=row_dendrogram_distances['leaves'], + row=True) + row_colorbar_ax_pcolormesh = row_colorbar_ax.pcolormesh(row_side_matrix, + cmap=row_cmap, + edgecolors='white', + linewidth=0.1) + row_colorbar_ax.set_ylim(0, row_side_matrix.shape[0]) + _clean_axis(row_colorbar_ax) + + ### heatmap #### + heatmap_ax = fig.add_subplot(heatmap_gridspec[nrows - 1, ncols - 1]) + heatmap_ax_pcolormesh = \ + heatmap_ax.pcolormesh(plot_df.ix[row_dendrogram_distances[ + 'leaves'], + column_dendrogram_distances[ + 'leaves']].values, + norm=my_norm, cmap=cmap) + heatmap_ax.set_ylim(0, df.shape[0]) + heatmap_ax.set_xlim(0, df.shape[1]) + _clean_axis(heatmap_ax) + + ## row labels ## + if isinstance(label_rows, Iterable): + if len(label_rows) == df.shape[0]: + yticklabels = label_rows + label_rows = True + else: + raise BaseException("Length of 'label_rows' must be the same as " + "df.shape[0]") + elif label_rows: + yticklabels = df.index[row_dendrogram_distances['leaves']] + + if label_rows: + heatmap_ax.set_yticks(np.arange(df.shape[0]) + 0.5) + heatmap_ax.yaxis.set_ticks_position('right') + heatmap_ax.set_yticklabels(yticklabels, fontsize=ylabel_fontsize) + + # Add title if there is one: + if title is not None: + heatmap_ax.set_title(title) + + ## col labels ## + if isinstance(label_cols, Iterable): + if len(label_cols) == df.shape[0]: + xticklabels = label_rows + label_cols = True + else: + raise BaseException("Length of 'label_cols' must be the same as " + "df.shape[1]") + elif label_rows: + xticklabels = df.columns[column_dendrogram_distances['leaves']] + + if label_cols: + heatmap_ax.set_xticks(np.arange(df.shape[1]) + 0.5) + xticklabels = heatmap_ax.set_xticklabels(xticklabels, + fontsize=xlabel_fontsize) + # rotate labels 90 degrees + for label in xticklabels: + label.set_rotation(90) + + # remove the tick lines + for l in heatmap_ax.get_xticklines() + heatmap_ax.get_yticklines(): + l.set_markersize(0) + + ### scale colorbar ### + scale_colorbar_ax = fig.add_subplot( + heatmap_gridspec[0:(nrows - 1), + 0]) # colorbar for scale in upper left corner + cb = fig.colorbar(heatmap_ax_pcolormesh, + cax=scale_colorbar_ax) # note that we could pass the norm explicitly with norm=my_norm + cb.set_label(colorbar_label) + cb.ax.yaxis.set_ticks_position( + 'left') # move ticks to left side of colorbar to avoid problems with tight_layout + cb.ax.yaxis.set_label_position( + 'left') # move label to left side of colorbar to avoid problems with tight_layout + cb.outline.set_linewidth(0) + # make colorbar labels smaller + yticklabels = cb.ax.yaxis.get_ticklabels() + for t in yticklabels: + t.set_fontsize(t.get_fontsize() - 3) + + fig.tight_layout() + return fig, row_dendrogram_distances, column_dendrogram_distances + if __name__ == '__main__': # import pandas.rpy.common as com # sales = com.load_data('sanfrancisco.home.sales', package='nutshell') From d8eb771ca8af7c7f40da74d562a5a75ee9bf0bd3 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Dec 2013 22:25:17 -0800 Subject: [PATCH 02/31] Fixed bugs that arose while making an example. Funny how that happens! --- pandas/tools/plotting.py | 79 ++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 184f0bbf7bfa2..104e8aed482f8 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2510,8 +2510,6 @@ def _color_list_to_matrix_and_cmap(color_list, ind, row=True): # ind = column_dendrogram_distances['leaves'] matrix = np.array([col_to_value[col] for col in color_list])[ind] - print 'matrix.shape', matrix.shape, - print 'len(color_list)', len(color_list) # Is this row-side or column side? if row: new_shape = (len(color_list), 1) @@ -2526,9 +2524,13 @@ def _color_list_to_matrix_and_cmap(color_list, ind, row=True): -def heatmap(df, title=None, colorbar_label='values', - col_side_colors=None, row_side_colors=None, - color_scale='linear', cmap=None, +def heatmap(df, + title=None, + colorbar_label='values', + col_side_colors=None, + row_side_colors=None, + color_scale='linear', + cmap=None, row_linkage_method='complete', col_linkage_method='complete', figsize=None, @@ -2549,6 +2551,8 @@ def heatmap(df, title=None, colorbar_label='values', @author Olga Botvinnik olga.botvinnik@gmail.com + This is liberally borrowed (with permission) from + @param df: @param title: @param colorbar_label: @@ -2579,7 +2583,7 @@ def heatmap(df, title=None, colorbar_label='values', almost_black = '#262626' sch.set_link_color_palette([almost_black]) - if type(plot_df) is None: + if type(plot_df) is type(None): plot_df = df if any(plot_df.index != df.index): @@ -2626,9 +2630,6 @@ def heatmap(df, title=None, colorbar_label='values', nrows = 2 if col_side_colors is None else 3 ncols = 2 if row_side_colors is None else 3 - print 'width_ratios', width_ratios - print 'height_ratios', height_ratios - width = df.shape[1] * 0.25 height = min(df.shape[0] * .75, 40) if figsize is None: @@ -2643,22 +2644,21 @@ def heatmap(df, title=None, colorbar_label='values', # print heatmap_gridspec ### col dendrogram ### - column_dendrogram_ax = fig.add_subplot(heatmap_gridspec[0, ncols - 1]) + col_dendrogram_ax = fig.add_subplot(heatmap_gridspec[0, ncols - 1]) if cluster_cols: - column_dendrogram_distances = sch.dendrogram(col_clusters, + col_dendrogram = sch.dendrogram(col_clusters, color_threshold=np.inf, - color_list=[ - ppl.almost_black]) + color_list=[almost_black]) else: - column_dendrogram_distances = {'leaves': range(df.shape[1])} - _clean_axis(column_dendrogram_ax) + col_dendrogram = {'leaves': list(range(df.shape[1]))} + _clean_axis(col_dendrogram_ax) ### col colorbar ### if col_side_colors is not None: column_colorbar_ax = fig.add_subplot(heatmap_gridspec[1, ncols - 1]) col_side_matrix, col_cmap = _color_list_to_matrix_and_cmap( col_side_colors, - ind=column_dendrogram_distances['leaves'], + ind=col_dendrogram['leaves'], row=False) column_colorbar_ax_pcolormesh = column_colorbar_ax.pcolormesh( col_side_matrix, cmap=col_cmap, @@ -2669,13 +2669,13 @@ def heatmap(df, title=None, colorbar_label='values', ### row dendrogram ### row_dendrogram_ax = fig.add_subplot(heatmap_gridspec[nrows - 1, 0]) if cluster_rows: - row_dendrogram_distances = \ + row_dendrogram = \ sch.dendrogram(row_clusters, color_threshold=np.inf, orientation='right', - color_list=[ppl.almost_black]) + color_list=[almost_black]) else: - row_dendrogram_distances = {'leaves': range(df.shape[0])} + row_dendrogram = {'leaves': list(range(df.shape[0]))} _clean_axis(row_dendrogram_ax) ### row colorbar ### @@ -2683,23 +2683,22 @@ def heatmap(df, title=None, colorbar_label='values', row_colorbar_ax = fig.add_subplot(heatmap_gridspec[nrows - 1, 1]) row_side_matrix, row_cmap = _color_list_to_matrix_and_cmap( row_side_colors, - ind=row_dendrogram_distances['leaves'], + ind=row_dendrogram['leaves'], row=True) - row_colorbar_ax_pcolormesh = row_colorbar_ax.pcolormesh(row_side_matrix, - cmap=row_cmap, - edgecolors='white', - linewidth=0.1) + row_colorbar_ax.pcolormesh(row_side_matrix, cmap=row_cmap, + edgecolors='white', linewidth=0.01) row_colorbar_ax.set_ylim(0, row_side_matrix.shape[0]) _clean_axis(row_colorbar_ax) ### heatmap #### heatmap_ax = fig.add_subplot(heatmap_gridspec[nrows - 1, ncols - 1]) heatmap_ax_pcolormesh = \ - heatmap_ax.pcolormesh(plot_df.ix[row_dendrogram_distances[ - 'leaves'], - column_dendrogram_distances[ - 'leaves']].values, - norm=my_norm, cmap=cmap) + heatmap_ax.pcolormesh(plot_df.ix[row_dendrogram['leaves'], + col_dendrogram['leaves']].values, + norm=my_norm, cmap=cmap, + edgecolors='white', + lw=0.01) + heatmap_ax.set_ylim(0, df.shape[0]) heatmap_ax.set_xlim(0, df.shape[1]) _clean_axis(heatmap_ax) @@ -2713,7 +2712,7 @@ def heatmap(df, title=None, colorbar_label='values', raise BaseException("Length of 'label_rows' must be the same as " "df.shape[0]") elif label_rows: - yticklabels = df.index[row_dendrogram_distances['leaves']] + yticklabels = df.index[row_dendrogram['leaves']] if label_rows: heatmap_ax.set_yticks(np.arange(df.shape[0]) + 0.5) @@ -2722,18 +2721,18 @@ def heatmap(df, title=None, colorbar_label='values', # Add title if there is one: if title is not None: - heatmap_ax.set_title(title) + col_dendrogram_ax.set_title(title) ## col labels ## if isinstance(label_cols, Iterable): - if len(label_cols) == df.shape[0]: - xticklabels = label_rows + if len(label_cols) == df.shape[1]: + xticklabels = label_cols label_cols = True else: raise BaseException("Length of 'label_cols' must be the same as " "df.shape[1]") - elif label_rows: - xticklabels = df.columns[column_dendrogram_distances['leaves']] + elif label_cols: + xticklabels = df.columns[col_dendrogram['leaves']] if label_cols: heatmap_ax.set_xticks(np.arange(df.shape[1]) + 0.5) @@ -2754,18 +2753,18 @@ def heatmap(df, title=None, colorbar_label='values', cb = fig.colorbar(heatmap_ax_pcolormesh, cax=scale_colorbar_ax) # note that we could pass the norm explicitly with norm=my_norm cb.set_label(colorbar_label) - cb.ax.yaxis.set_ticks_position( - 'left') # move ticks to left side of colorbar to avoid problems with tight_layout - cb.ax.yaxis.set_label_position( - 'left') # move label to left side of colorbar to avoid problems with tight_layout + + # move ticks to left side of colorbar to avoid problems with tight_layout + cb.ax.yaxis.set_ticks_position('left') cb.outline.set_linewidth(0) + # make colorbar labels smaller yticklabels = cb.ax.yaxis.get_ticklabels() for t in yticklabels: t.set_fontsize(t.get_fontsize() - 3) fig.tight_layout() - return fig, row_dendrogram_distances, column_dendrogram_distances + return fig, row_dendrogram, col_dendrogram if __name__ == '__main__': # import pandas.rpy.common as com From 7e2b64d869972aa32fe0c09e120f0a29caa48b58 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Dec 2013 22:29:09 -0800 Subject: [PATCH 03/31] Added citation of Chris DeBoever --- pandas/tools/plotting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 104e8aed482f8..864a33ee22396 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2551,7 +2551,7 @@ def heatmap(df, @author Olga Botvinnik olga.botvinnik@gmail.com - This is liberally borrowed (with permission) from + This is liberally borrowed (with permission) from http://nbviewer.ipython.org/github/ucsd-scientific-python/user-group/blob/master/presentations/20131016/hierarchical_clustering_heatmaps_gridspec.ipynb @param df: @param title: From 94a92c520ce679aaf4bd54921807ecea7719cb31 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Dec 2013 22:48:46 -0800 Subject: [PATCH 04/31] Added some docstrings --- pandas/tools/plotting.py | 44 ++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 864a33ee22396..b77a61d86810a 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2536,6 +2536,8 @@ def heatmap(df, figsize=None, label_rows=True, label_cols=True, + vmin = None, + vmax=None, #col_labels=None, #row_labels=None, @@ -2553,26 +2555,38 @@ def heatmap(df, This is liberally borrowed (with permission) from http://nbviewer.ipython.org/github/ucsd-scientific-python/user-group/blob/master/presentations/20131016/hierarchical_clustering_heatmaps_gridspec.ipynb - @param df: - @param title: - @param colorbar_label: - @param col_side_colors: - @param row_side_colors: - @param color_scale: - @param cmap: - @param figsize: + @param df: The dataframe you want to cluster on + @param title: Title of the figure + @param colorbar_label: What to colorbar (color scale of the heatmap) + @param col_side_colors: Label the columns with a color + @param row_side_colors: Label the rows with a color + @param color_scale: Either 'linear' or 'log' + @param cmap: A matplotlib colormap, default is mpl.cm.Blues_r if data is + sequential, or mpl.cm.RdBu_r if data is divergent (has both positive and + negative numbers) + @param figsize: Size of the figure. The default is a function of the + dataframe size. @param label_rows: Can be boolean or a list of strings, with exactly the length of the number of rows in df. @param label_cols: Can be boolean or a list of strings, with exactly the length of the number of columns in df. - @param col_labels: - @param row_labels: - @param xlabel_fontsize: - @param ylabel_fontsize: - @param cluster_cols: + @param col_labels: If True, label with df.columns. If False, unlabeled. + Else, this can be an iterable to relabel the columns with labels of your own + choosing. This is helpful if you have duplicate column names and pandas + won't let you reindex it. + @param row_labels: If True, label with df.index. If False, unlabeled. + Else, this can be an iterable to relabel the row names with labels of your + own choosing. This is helpful if you have duplicate index names and pandas + won't let you reindex it. + @param xlabel_fontsize: Default 12pt + @param ylabel_fontsize: Default 10pt + @param cluster_cols: Boolean, whether or not to cluster the columns @param cluster_rows: - @param plot_df: - @return: @rtype: @raise TypeError: + @param plot_df: The dataframe you want to plot. This can contain NAs and + other nasty things. + @return: fig, row_dendrogram, col_dendrogram + @rtype: matplotlib.figure.Figure, dict, dict + @raise TypeError: """ import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec From b6ce06c4a3bc0c315e3913a7bef77076104c4026 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Dec 2013 22:50:39 -0800 Subject: [PATCH 05/31] Added some docstrings --- pandas/tools/plotting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index b77a61d86810a..955f312380cfb 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2550,11 +2550,11 @@ def heatmap(df, """ - @author Olga Botvinnik olga.botvinnik@gmail.com This is liberally borrowed (with permission) from http://nbviewer.ipython.org/github/ucsd-scientific-python/user-group/blob/master/presentations/20131016/hierarchical_clustering_heatmaps_gridspec.ipynb + @param df: The dataframe you want to cluster on @param title: Title of the figure @param colorbar_label: What to colorbar (color scale of the heatmap) From 049a36e2ce9db004f713060115264a97a0866ffd Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 4 Dec 2013 22:52:10 -0800 Subject: [PATCH 06/31] Changed exceptions to AssertionErrors --- pandas/tools/plotting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 955f312380cfb..b8624e19d15ee 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2601,9 +2601,9 @@ def heatmap(df, plot_df = df if any(plot_df.index != df.index): - raise TypeError('plot_df must have the exact same indices as df') + raise AssertionError('plot_df must have the exact same indices as df') if any(plot_df.columns != df.columns): - raise TypeError('plot_df must have the exact same columns as df') + raise AssertionError('plot_df must have the exact same columns as df') # make norm divergent = df.max().max() > 0 and df.min().min() < 0 @@ -2723,7 +2723,7 @@ def heatmap(df, yticklabels = label_rows label_rows = True else: - raise BaseException("Length of 'label_rows' must be the same as " + raise AssertionError("Length of 'label_rows' must be the same as " "df.shape[0]") elif label_rows: yticklabels = df.index[row_dendrogram['leaves']] @@ -2743,7 +2743,7 @@ def heatmap(df, xticklabels = label_cols label_cols = True else: - raise BaseException("Length of 'label_cols' must be the same as " + raise AssertionError("Length of 'label_cols' must be the same as " "df.shape[1]") elif label_cols: xticklabels = df.columns[col_dendrogram['leaves']] From a1900623dec73e3887be80d96157a3ad7169815b Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 00:29:29 -0800 Subject: [PATCH 07/31] Fixed relabeling of columns --- pandas/tools/plotting.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index b8624e19d15ee..05f997369a3f9 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2726,9 +2726,10 @@ def heatmap(df, raise AssertionError("Length of 'label_rows' must be the same as " "df.shape[0]") elif label_rows: - yticklabels = df.index[row_dendrogram['leaves']] + yticklabels = df.index if label_rows: + yticklabels = yticklabels[row_dendrogram['leaves']] heatmap_ax.set_yticks(np.arange(df.shape[0]) + 0.5) heatmap_ax.yaxis.set_ticks_position('right') heatmap_ax.set_yticklabels(yticklabels, fontsize=ylabel_fontsize) @@ -2746,9 +2747,10 @@ def heatmap(df, raise AssertionError("Length of 'label_cols' must be the same as " "df.shape[1]") elif label_cols: - xticklabels = df.columns[col_dendrogram['leaves']] + xticklabels = df.columns if label_cols: + xticklabels = xticklabels[col_dendrogram['leaves']] heatmap_ax.set_xticks(np.arange(df.shape[1]) + 0.5) xticklabels = heatmap_ax.set_xticklabels(xticklabels, fontsize=xlabel_fontsize) From e955409309b7bc91c9b66755959cf464f6491ce8 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 07:46:27 -0800 Subject: [PATCH 08/31] Fixed some PEP8 and other formatting stuff --- pandas/tools/plotting.py | 244 ++++++++++++++++++++++----------------- 1 file changed, 137 insertions(+), 107 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 05f997369a3f9..50012787c4317 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -21,6 +21,7 @@ try: # mpl optional import pandas.tseries.converter as conv + conv.register() # needs to override so set_xlim works with str/number except ImportError: pass @@ -30,70 +31,72 @@ # to True. mpl_stylesheet = { 'axes.axisbelow': True, - 'axes.color_cycle': ['#348ABD', - '#7A68A6', - '#A60628', - '#467821', - '#CF4457', - '#188487', - '#E24A33'], - 'axes.edgecolor': '#bcbcbc', - 'axes.facecolor': '#eeeeee', - 'axes.grid': True, - 'axes.labelcolor': '#555555', - 'axes.labelsize': 'large', - 'axes.linewidth': 1.0, - 'axes.titlesize': 'x-large', - 'figure.edgecolor': 'white', - 'figure.facecolor': 'white', - 'figure.figsize': (6.0, 4.0), - 'figure.subplot.hspace': 0.5, - 'font.family': 'monospace', - 'font.monospace': ['Andale Mono', - 'Nimbus Mono L', - 'Courier New', - 'Courier', - 'Fixed', - 'Terminal', - 'monospace'], - 'font.size': 10, - 'interactive': True, - 'keymap.all_axes': ['a'], - 'keymap.back': ['left', 'c', 'backspace'], - 'keymap.forward': ['right', 'v'], - 'keymap.fullscreen': ['f'], - 'keymap.grid': ['g'], - 'keymap.home': ['h', 'r', 'home'], - 'keymap.pan': ['p'], - 'keymap.save': ['s'], - 'keymap.xscale': ['L', 'k'], - 'keymap.yscale': ['l'], - 'keymap.zoom': ['o'], - 'legend.fancybox': True, - 'lines.antialiased': True, - 'lines.linewidth': 1.0, - 'patch.antialiased': True, - 'patch.edgecolor': '#EEEEEE', - 'patch.facecolor': '#348ABD', - 'patch.linewidth': 0.5, - 'toolbar': 'toolbar2', - 'xtick.color': '#555555', - 'xtick.direction': 'in', - 'xtick.major.pad': 6.0, - 'xtick.major.size': 0.0, - 'xtick.minor.pad': 6.0, - 'xtick.minor.size': 0.0, - 'ytick.color': '#555555', - 'ytick.direction': 'in', - 'ytick.major.pad': 6.0, - 'ytick.major.size': 0.0, - 'ytick.minor.pad': 6.0, - 'ytick.minor.size': 0.0 + 'axes.color_cycle': ['#348ABD', + '#7A68A6', + '#A60628', + '#467821', + '#CF4457', + '#188487', + '#E24A33'], + 'axes.edgecolor': '#bcbcbc', + 'axes.facecolor': '#eeeeee', + 'axes.grid': True, + 'axes.labelcolor': '#555555', + 'axes.labelsize': 'large', + 'axes.linewidth': 1.0, + 'axes.titlesize': 'x-large', + 'figure.edgecolor': 'white', + 'figure.facecolor': 'white', + 'figure.figsize': (6.0, 4.0), + 'figure.subplot.hspace': 0.5, + 'font.family': 'monospace', + 'font.monospace': ['Andale Mono', + 'Nimbus Mono L', + 'Courier New', + 'Courier', + 'Fixed', + 'Terminal', + 'monospace'], + 'font.size': 10, + 'interactive': True, + 'keymap.all_axes': ['a'], + 'keymap.back': ['left', 'c', 'backspace'], + 'keymap.forward': ['right', 'v'], + 'keymap.fullscreen': ['f'], + 'keymap.grid': ['g'], + 'keymap.home': ['h', 'r', 'home'], + 'keymap.pan': ['p'], + 'keymap.save': ['s'], + 'keymap.xscale': ['L', 'k'], + 'keymap.yscale': ['l'], + 'keymap.zoom': ['o'], + 'legend.fancybox': True, + 'lines.antialiased': True, + 'lines.linewidth': 1.0, + 'patch.antialiased': True, + 'patch.edgecolor': '#EEEEEE', + 'patch.facecolor': '#348ABD', + 'patch.linewidth': 0.5, + 'toolbar': 'toolbar2', + 'xtick.color': '#555555', + 'xtick.direction': 'in', + 'xtick.major.pad': 6.0, + 'xtick.major.size': 0.0, + 'xtick.minor.pad': 6.0, + 'xtick.minor.size': 0.0, + 'ytick.color': '#555555', + 'ytick.direction': 'in', + 'ytick.major.pad': 6.0, + 'ytick.major.size': 0.0, + 'ytick.minor.pad': 6.0, + 'ytick.minor.size': 0.0 } + def _get_standard_kind(kind): return {'density': 'kde'}.get(kind, kind) + def _get_standard_colors(num_colors=None, colormap=None, color_type='default', color=None): import matplotlib.pyplot as plt @@ -101,6 +104,7 @@ def _get_standard_colors(num_colors=None, colormap=None, color_type='default', if color is None and colormap is not None: if isinstance(colormap, compat.string_types): import matplotlib.cm as cm + cmap = colormap colormap = cm.get_cmap(colormap) if colormap is None: @@ -118,6 +122,7 @@ def _get_standard_colors(num_colors=None, colormap=None, color_type='default', colors = list(colors) elif color_type == 'random': import random + def random_color(column): random.seed(column) return [random.random() for _ in range(3)] @@ -127,7 +132,7 @@ def random_color(column): raise NotImplementedError if len(colors) != num_colors: - multiple = num_colors//len(colors) - 1 + multiple = num_colors // len(colors) - 1 mod = num_colors % len(colors) colors += multiple * colors @@ -135,6 +140,7 @@ def random_color(column): return colors + class _Options(dict): """ Stores pandas plotting options. @@ -262,6 +268,7 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, ax.hist(values, **hist_kwds) elif diagonal in ('kde', 'density'): from scipy.stats import gaussian_kde + y = values gkde = gaussian_kde(y) ind = np.linspace(y.min(), y.max(), 1000) @@ -279,9 +286,9 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, _label_axis(ax, kind='y', label=a, position='left') - if j!= 0: + if j != 0: ax.yaxis.set_visible(False) - if i != n-1: + if i != n - 1: ax.xaxis.set_visible(False) for ax in axes.flat: @@ -290,10 +297,11 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, return axes -def _label_axis(ax, kind='x', label='', position='top', - ticks=True, rotate=False): +def _label_axis(ax, kind='x', label='', position='top', + ticks=True, rotate=False): from matplotlib.artist import setp + if kind == 'x': ax.set_xlabel(label, visible=True) ax.xaxis.set_visible(True) @@ -310,21 +318,22 @@ def _label_axis(ax, kind='x', label='', position='top', return - - - def _gca(): import matplotlib.pyplot as plt + return plt.gca() def _gcf(): import matplotlib.pyplot as plt + return plt.gcf() + def _get_marker_compat(marker): import matplotlib.lines as mlines import matplotlib as mpl + if mpl.__version__ < '1.1.0' and marker == '.': return 'o' if marker not in mlines.lineMarkers: @@ -450,6 +459,7 @@ def f(x): if len(amplitudes) % 2 != 0: result += amplitudes[-1] * sin(harmonic * x) return result + return f n = len(data) @@ -685,6 +695,7 @@ def autocorrelation_plot(series, ax=None): ax: Matplotlib axis object """ import matplotlib.pyplot as plt + n = len(series) data = np.asarray(series) if ax is None: @@ -694,6 +705,7 @@ def autocorrelation_plot(series, ax=None): def r(h): return ((data[:n - h] - mean) * (data[h:] - mean)).sum() / float(n) / c0 + x = np.arange(n) + 1 y = lmap(r, x) z95 = 1.959963984540054 @@ -735,6 +747,7 @@ def grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None, ------- axes: collection of Matplotlib Axes """ + def plot_group(group, ax): ax.hist(group.dropna().values, bins=bins, **kwargs) @@ -816,6 +829,7 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=True, def _validate_color_args(self): from pandas import DataFrame + if 'color' not in self.kwds and 'colors' in self.kwds: warnings.warn(("'colors' is being deprecated. Please use 'color'" "instead of 'colors'")) @@ -823,13 +837,14 @@ def _validate_color_args(self): self.kwds['color'] = colors if ('color' in self.kwds and - (isinstance(self.data, Series) or - isinstance(self.data, DataFrame) and len(self.data.columns) == 1)): + (isinstance(self.data, Series) or + isinstance(self.data, DataFrame) and len( + self.data.columns) == 1)): # support series.plot(color='green') self.kwds['color'] = [self.kwds['color']] if ('color' in self.kwds or 'colors' in self.kwds) and \ - self.colormap is not None: + self.colormap is not None: warnings.warn("'color' and 'colormap' cannot be used " "simultaneously. Using 'color'") @@ -843,6 +858,7 @@ def _validate_color_args(self): def _iter_data(self): from pandas.core.frame import DataFrame + if isinstance(self.data, (Series, np.ndarray)): yield self.label, np.asarray(self.data) elif isinstance(self.data, DataFrame): @@ -1017,6 +1033,7 @@ def legend_title(self): @cache_readonly def plt(self): import matplotlib.pyplot as plt + return plt _need_to_set_index = False @@ -1099,6 +1116,7 @@ def _get_ax(self, i): def on_right(self, i): from pandas.core.frame import DataFrame + if isinstance(self.secondary_y, bool): return self.secondary_y @@ -1126,6 +1144,7 @@ def _get_style(self, i, col_name): def _get_colors(self): from pandas.core.frame import DataFrame + if isinstance(self.data, DataFrame): num_colors = len(self.data.columns) else: @@ -1150,13 +1169,14 @@ def _get_marked_label(self, label, col_num): class KdePlot(MPLPlot): def __init__(self, data, bw_method=None, ind=None, **kwargs): MPLPlot.__init__(self, data, **kwargs) - self.bw_method=bw_method - self.ind=ind + self.bw_method = bw_method + self.ind = ind def _make_plot(self): from scipy.stats import gaussian_kde from scipy import __version__ as spv from distutils.version import LooseVersion + plotf = self._get_plot_function() colors = self._get_colors() for i, (label, y) in enumerate(self._iter_data()): @@ -1201,12 +1221,13 @@ def _post_plot_logic(self): for ax in self.axes: ax.legend(loc='best') + class ScatterPlot(MPLPlot): def __init__(self, data, x, y, **kwargs): MPLPlot.__init__(self, data, **kwargs) self.kwds.setdefault('c', self.plt.rcParams['patch.facecolor']) if x is None or y is None: - raise ValueError( 'scatter requires and x and y column') + raise ValueError('scatter requires and x and y column') if com.is_integer(x) and not self.data.columns.holds_integer(): x = self.data.columns[x] if com.is_integer(y) and not self.data.columns.holds_integer(): @@ -1228,7 +1249,6 @@ def _post_plot_logic(self): class LinePlot(MPLPlot): - def __init__(self, data, **kwargs): self.mark_right = kwargs.pop('mark_right', True) MPLPlot.__init__(self, data, **kwargs) @@ -1238,6 +1258,7 @@ def __init__(self, data, **kwargs): def _index_freq(self): from pandas.core.frame import DataFrame + if isinstance(self.data, (Series, DataFrame)): freq = getattr(self.data.index, 'freq', None) if freq is None: @@ -1259,9 +1280,11 @@ def _is_dynamic_freq(self, freq): def _no_base(self, freq): # hack this for 0.10.1, creating more technical debt...sigh from pandas.core.frame import DataFrame + if (isinstance(self.data, (Series, DataFrame)) and isinstance(self.data.index, DatetimeIndex)): import pandas.tseries.frequencies as freqmod + base = freqmod.get_freq(freq) x = self.data.index if (base <= freqmod.FreqGroup.FR_DAY): @@ -1333,6 +1356,7 @@ def _make_plot(self): def _make_ts_plot(self, data, **kwargs): from pandas.tseries.plotting import tsplot + kwargs = kwargs.copy() colors = self._get_colors() @@ -1342,7 +1366,7 @@ def _make_ts_plot(self, data, **kwargs): def _plot(data, col_num, ax, label, style, **kwds): newlines = tsplot(data, plotf, ax=ax, label=label, - style=style, **kwds) + style=style, **kwds) ax.grid(self.grid) lines.append(newlines[0]) @@ -1402,6 +1426,7 @@ def _maybe_convert_index(self, data): # tsplot converts automatically, but don't want to convert index # over and over for DataFrames from pandas.core.frame import DataFrame + if (isinstance(data.index, DatetimeIndex) and isinstance(data, DataFrame)): freq = getattr(data.index, 'freq', None) @@ -1455,7 +1480,6 @@ def _post_plot_logic(self): class BarPlot(MPLPlot): - _default_rot = {'bar': 90, 'barh': 0} def __init__(self, data, **kwargs): @@ -1467,7 +1491,7 @@ def __init__(self, data, **kwargs): else: self.tickoffset = 0.375 self.bar_width = 0.5 - self.log = kwargs.pop('log',False) + self.log = kwargs.pop('log', False) MPLPlot.__init__(self, data, **kwargs) def _args_adjust(self): @@ -1478,7 +1502,7 @@ def _args_adjust(self): def bar_f(self): if self.kind == 'bar': def f(ax, x, y, w, start=None, **kwds): - return ax.bar(x, y, w, bottom=start,log=self.log, **kwds) + return ax.bar(x, y, w, bottom=start, log=self.log, **kwds) elif self.kind == 'barh': def f(ax, x, y, w, start=None, log=self.log, **kwds): return ax.barh(x, y, w, left=start, **kwds) @@ -1519,7 +1543,7 @@ def _make_plot(self): start = 0 if mpl_le_1_2_1 else None if self.subplots: - rect = bar_f(ax, self.ax_pos, y, self.bar_width, + rect = bar_f(ax, self.ax_pos, y, self.bar_width, start=start, **kwds) ax.set_title(label) elif self.stacked: @@ -1567,8 +1591,8 @@ def _post_plot_logic(self): if name is not None: ax.set_ylabel(name) - # if self.subplots and self.legend: - # self.axes[0].legend(loc='best') + # if self.subplots and self.legend: + # self.axes[0].legend(loc='best') class BoxPlot(MPLPlot): @@ -1585,7 +1609,6 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True, xlim=None, ylim=None, logx=False, logy=False, xticks=None, yticks=None, kind='line', sort_columns=False, fontsize=None, secondary_y=False, **kwds): - """ Make line, bar, or scatter plots of DataFrame series with the index on the x-axis using matplotlib / pylab. @@ -1664,8 +1687,8 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True, raise ValueError('Invalid chart type given %s' % kind) if kind == 'scatter': - plot_obj = klass(frame, x=x, y=y, kind=kind, subplots=subplots, - rot=rot,legend=legend, ax=ax, style=style, + plot_obj = klass(frame, x=x, y=y, kind=kind, subplots=subplots, + rot=rot, legend=legend, ax=ax, style=style, fontsize=fontsize, use_index=use_index, sharex=sharex, sharey=sharey, xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, title=title, grid=grid, @@ -1695,7 +1718,8 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True, else: plot_obj = klass(frame, kind=kind, subplots=subplots, rot=rot, - legend=legend, ax=ax, style=style, fontsize=fontsize, + legend=legend, ax=ax, style=style, + fontsize=fontsize, use_index=use_index, sharex=sharex, sharey=sharey, xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, title=title, grid=grid, figsize=figsize, logx=logx, @@ -1775,6 +1799,7 @@ def plot_series(series, label=None, kind='line', use_index=True, rot=None, be ignored. """ import matplotlib.pyplot as plt + if ax is None and len(plt.get_fignums()) > 0: ax = _gca() if ax.get_yaxis().get_ticks_position().strip().lower() == 'right': @@ -1829,6 +1854,7 @@ def boxplot(data, column=None, by=None, ax=None, fontsize=None, ax : matplotlib.axes.AxesSubplot """ from pandas import Series, DataFrame + if isinstance(data, Series): data = DataFrame({'x': data}) column = 'x' @@ -1838,11 +1864,12 @@ def _get_colors(): return _get_standard_colors(color=kwds.get('color'), num_colors=1) def maybe_color_bp(bp): - if 'color' not in kwds : + if 'color' not in kwds: from matplotlib.artist import setp - setp(bp['boxes'],color=colors[0],alpha=1) - setp(bp['whiskers'],color=colors[0],alpha=1) - setp(bp['medians'],color=colors[2],alpha=1) + + setp(bp['boxes'], color=colors[0], alpha=1) + setp(bp['whiskers'], color=colors[0], alpha=1) + setp(bp['medians'], color=colors[2], alpha=1) def plot_group(grouped, ax): keys, values = zip(*grouped) @@ -1916,7 +1943,8 @@ def format_date_labels(ax, rot): pass -def scatter_plot(data, x, y, by=None, ax=None, figsize=None, grid=False, **kwargs): +def scatter_plot(data, x, y, by=None, ax=None, figsize=None, grid=False, + **kwargs): """ Make a scatter plot from two DataFrame columns @@ -2018,6 +2046,7 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, return axes import matplotlib.pyplot as plt + n = len(data.columns) if layout is not None: @@ -2026,7 +2055,9 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, rows, cols = layout if rows * cols < n: - raise ValueError('Layout of %sx%s is incompatible with %s columns' % (rows, cols, n)) + raise ValueError( + 'Layout of %sx%s is incompatible with %s columns' % ( + rows, cols, n)) else: rows, cols = 1, 1 while rows * cols < n: @@ -2100,9 +2131,9 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, if kwds.get('layout', None) is not None: raise ValueError("The 'layout' keyword is not supported when " "'by' is None") - # hack until the plotting interface is a bit more unified + # hack until the plotting interface is a bit more unified fig = kwds.pop('figure', plt.gcf() if plt.get_fignums() else - plt.figure(figsize=figsize)) + plt.figure(figsize=figsize)) if (figsize is not None and tuple(figsize) != tuple(fig.get_size_inches())): fig.set_size_inches(*figsize, forward=True) @@ -2194,6 +2225,7 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, ret[key] = d else: from pandas.tools.merge import concat + keys, frames = zip(*grouped) if grouped.axis == 0: df = concat(frames, keys=keys, axis=1) @@ -2508,7 +2540,7 @@ def _color_list_to_matrix_and_cmap(color_list, ind, row=True): colors = set(color_list) col_to_value = {col: i for i, col in enumerate(colors)} -# ind = column_dendrogram_distances['leaves'] + # ind = column_dendrogram_distances['leaves'] matrix = np.array([col_to_value[col] for col in color_list])[ind] # Is this row-side or column side? if row: @@ -2521,9 +2553,6 @@ def _color_list_to_matrix_and_cmap(color_list, ind, row=True): return matrix, cmap - - - def heatmap(df, title=None, colorbar_label='values', @@ -2536,7 +2565,7 @@ def heatmap(df, figsize=None, label_rows=True, label_cols=True, - vmin = None, + vmin=None, vmax=None, #col_labels=None, @@ -2547,13 +2576,10 @@ def heatmap(df, cluster_cols=True, cluster_rows=True, plot_df=None): - - """ @author Olga Botvinnik olga.botvinnik@gmail.com - This is liberally borrowed (with permission) from http://nbviewer.ipython.org/github/ucsd-scientific-python/user-group/blob/master/presentations/20131016/hierarchical_clustering_heatmaps_gridspec.ipynb - + This is liberally borrowed (with permission) from http://bit.ly/1eWcYWc @param df: The dataframe you want to cluster on @param title: Title of the figure @@ -2661,8 +2687,8 @@ def heatmap(df, col_dendrogram_ax = fig.add_subplot(heatmap_gridspec[0, ncols - 1]) if cluster_cols: col_dendrogram = sch.dendrogram(col_clusters, - color_threshold=np.inf, - color_list=[almost_black]) + color_threshold=np.inf, + color_list=[almost_black]) else: col_dendrogram = {'leaves': list(range(df.shape[1]))} _clean_axis(col_dendrogram_ax) @@ -2724,7 +2750,7 @@ def heatmap(df, label_rows = True else: raise AssertionError("Length of 'label_rows' must be the same as " - "df.shape[0]") + "df.shape[0]") elif label_rows: yticklabels = df.index @@ -2745,7 +2771,7 @@ def heatmap(df, label_cols = True else: raise AssertionError("Length of 'label_cols' must be the same as " - "df.shape[1]") + "df.shape[1]") elif label_cols: xticklabels = df.columns @@ -2766,8 +2792,10 @@ def heatmap(df, scale_colorbar_ax = fig.add_subplot( heatmap_gridspec[0:(nrows - 1), 0]) # colorbar for scale in upper left corner + + # note that we could pass the norm explicitly with norm=my_norm cb = fig.colorbar(heatmap_ax_pcolormesh, - cax=scale_colorbar_ax) # note that we could pass the norm explicitly with norm=my_norm + cax=scale_colorbar_ax) cb.set_label(colorbar_label) # move ticks to left side of colorbar to avoid problems with tight_layout @@ -2782,6 +2810,7 @@ def heatmap(df, fig.tight_layout() return fig, row_dendrogram, col_dendrogram + if __name__ == '__main__': # import pandas.rpy.common as com # sales = com.load_data('sanfrancisco.home.sales', package='nutshell') @@ -2795,6 +2824,7 @@ def heatmap(df, import pandas.tools.plotting as plots import pandas.core.frame as fr + reload(plots) reload(fr) from pandas.core.frame import DataFrame From c0f03d8ef2514c9a14d5f743818d1b2416a3a42e Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 09:03:46 -0800 Subject: [PATCH 09/31] Changed default sequential colormap to YlGnBu and explained "divergent" better --- pandas/tools/plotting.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 50012787c4317..c6f3ef1721966 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2631,6 +2631,12 @@ def heatmap(df, if any(plot_df.columns != df.columns): raise AssertionError('plot_df must have the exact same columns as df') # make norm + + # Check if the matrix has values both above and below zero, or only above + # or only below zero. If both above and below, then the data is + # "divergent" and we will use a colormap with 0 centered at white, + # negative values blue, and positive values red. Otherwise, we will use + # the YlGnBu colormap. divergent = df.max().max() > 0 and df.min().min() < 0 if color_scale == 'log': @@ -2648,7 +2654,7 @@ def heatmap(df, my_norm = None if cmap is None: - cmap = mpl.cm.RdBu_r if divergent else mpl.cm.Blues_r + cmap = mpl.cm.RdBu_r if divergent else mpl.cm.YlGnBu cmap.set_bad('white') # calculate pairwise distances for rows From f82ebb4783616a0ec1f439d61278403241346da2 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 09:20:28 -0800 Subject: [PATCH 10/31] Changed dictionary comprehension to explicit dict() call on generated (For Travis-CI) --- pandas/tools/plotting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index c6f3ef1721966..516224f252915 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2538,7 +2538,7 @@ def _color_list_to_matrix_and_cmap(color_list, ind, row=True): import matplotlib as mpl colors = set(color_list) - col_to_value = {col: i for i, col in enumerate(colors)} + col_to_value = dict((col, i) for i, col in enumerate(colors)) # ind = column_dendrogram_distances['leaves'] matrix = np.array([col_to_value[col] for col in color_list])[ind] @@ -2623,7 +2623,7 @@ def heatmap(df, almost_black = '#262626' sch.set_link_color_palette([almost_black]) - if type(plot_df) is type(None): + if isinstance(plot_df) is type(None): plot_df = df if any(plot_df.index != df.index): From 7634d20749a8fb2dacda8e0d04f11a547ce2540a Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 09:21:09 -0800 Subject: [PATCH 11/31] fixed incomplete typo undoing --- pandas/tools/plotting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 516224f252915..02efacbbe3a81 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2623,7 +2623,7 @@ def heatmap(df, almost_black = '#262626' sch.set_link_color_palette([almost_black]) - if isinstance(plot_df) is type(None): + if type(plot_df) is type(None): plot_df = df if any(plot_df.index != df.index): From 5ee83c0ab9caddcf58d3855307332031697a0133 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 09:26:16 -0800 Subject: [PATCH 12/31] Fixed yticklabel indexing so yticklabels could be just a list of strings and not necessarily indexable by a list of indices --- pandas/tools/plotting.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 02efacbbe3a81..e789dd2bf8f84 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2623,7 +2623,7 @@ def heatmap(df, almost_black = '#262626' sch.set_link_color_palette([almost_black]) - if type(plot_df) is type(None): + if plot_df is None: plot_df = df if any(plot_df.index != df.index): @@ -2761,7 +2761,7 @@ def heatmap(df, yticklabels = df.index if label_rows: - yticklabels = yticklabels[row_dendrogram['leaves']] + yticklabels = [yticklabels[i] for i in row_dendrogram['leaves']] heatmap_ax.set_yticks(np.arange(df.shape[0]) + 0.5) heatmap_ax.yaxis.set_ticks_position('right') heatmap_ax.set_yticklabels(yticklabels, fontsize=ylabel_fontsize) @@ -2782,7 +2782,7 @@ def heatmap(df, xticklabels = df.columns if label_cols: - xticklabels = xticklabels[col_dendrogram['leaves']] + xticklabels = [xticklabels[i] for i in col_dendrogram['leaves']] heatmap_ax.set_xticks(np.arange(df.shape[1]) + 0.5) xticklabels = heatmap_ax.set_xticklabels(xticklabels, fontsize=xlabel_fontsize) From 542f10d9098f4a53257692510ef960ffb836d437 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 09:30:46 -0800 Subject: [PATCH 13/31] Made AssertionError of label_cols and label_rows more descriptive --- pandas/tools/plotting.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index e789dd2bf8f84..ee04754fa2cde 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2756,7 +2756,8 @@ def heatmap(df, label_rows = True else: raise AssertionError("Length of 'label_rows' must be the same as " - "df.shape[0]") + "df.shape[0] (len(label_rows)={}, df.shape[" + "0]={})".format(len(label_rows), df.shape[0])) elif label_rows: yticklabels = df.index @@ -2777,7 +2778,8 @@ def heatmap(df, label_cols = True else: raise AssertionError("Length of 'label_cols' must be the same as " - "df.shape[1]") + "df.shape[1] (len(label_cols)={}, df.shape[" + "1]={})".format(len(label_cols), df.shape[1])) elif label_cols: xticklabels = df.columns From de47e83f05b0fc7cfdbc8e60d72f55c73649dd2f Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 09:31:30 -0800 Subject: [PATCH 14/31] Removed extraneous print statements --- pandas/tools/plotting.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index ee04754fa2cde..5918e5759c9b1 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2643,8 +2643,6 @@ def heatmap(df, vmin = max(np.floor(df.dropna(how='all').min().dropna().min()), 1e-10) vmax = np.ceil(df.dropna(how='all').max().dropna().max()) my_norm = mpl.colors.LogNorm(vmin, vmax) - print 'vmax', vmax - print 'vmin', vmin elif divergent: abs_max = abs(df.max().max()) abs_min = abs(df.min().min()) From f4c2f279dcb2ea7f5c911c2b2df31f1d24eb67bb Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 09:33:42 -0800 Subject: [PATCH 15/31] Removed more extraneous print statements --- pandas/tools/plotting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 5918e5759c9b1..4f117e22746bf 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2678,7 +2678,7 @@ def heatmap(df, height = min(df.shape[0] * .75, 40) if figsize is None: figsize = (width, height) - print figsize + #print figsize fig = plt.figure(figsize=figsize) heatmap_gridspec = \ From 8f03b1e6abdd6cb355f1dcb98863c99860acbce5 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 10:18:58 -0800 Subject: [PATCH 16/31] Changed docstrings to reStructuredText and added linewidth+edgecolor specifications --- pandas/tools/plotting.py | 61 +++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 4f117e22746bf..75df3637d79f7 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2575,45 +2575,56 @@ def heatmap(df, ylabel_fontsize=10, cluster_cols=True, cluster_rows=True, + linewidth=0.0001, + edgecolor='white', plot_df=None): """ @author Olga Botvinnik olga.botvinnik@gmail.com + This is liberally borrowed (with permission) from http://bit.ly/1eWcYWc - @param df: The dataframe you want to cluster on - @param title: Title of the figure - @param colorbar_label: What to colorbar (color scale of the heatmap) - @param col_side_colors: Label the columns with a color - @param row_side_colors: Label the rows with a color - @param color_scale: Either 'linear' or 'log' - @param cmap: A matplotlib colormap, default is mpl.cm.Blues_r if data is + :param df: The dataframe you want to cluster on + :param title: Title of the figure + :param colorbar_label: What to colorbar (color scale of the heatmap) + :param col_side_colors: Label the columns with a color + :param row_side_colors: Label the rows with a color + :param color_scale: Either 'linear' or 'log' + :param cmap: A matplotlib colormap, default is mpl.cm.Blues_r if data is sequential, or mpl.cm.RdBu_r if data is divergent (has both positive and negative numbers) - @param figsize: Size of the figure. The default is a function of the + :param figsize: Size of the figure. The default is a function of the dataframe size. - @param label_rows: Can be boolean or a list of strings, with exactly the + :param label_rows: Can be boolean or a list of strings, with exactly the length of the number of rows in df. - @param label_cols: Can be boolean or a list of strings, with exactly the + :param label_cols: Can be boolean or a list of strings, with exactly the length of the number of columns in df. - @param col_labels: If True, label with df.columns. If False, unlabeled. + :param col_labels: If True, label with df.columns. If False, unlabeled. Else, this can be an iterable to relabel the columns with labels of your own choosing. This is helpful if you have duplicate column names and pandas won't let you reindex it. - @param row_labels: If True, label with df.index. If False, unlabeled. + :param row_labels: If True, label with df.index. If False, unlabeled. Else, this can be an iterable to relabel the row names with labels of your own choosing. This is helpful if you have duplicate index names and pandas won't let you reindex it. - @param xlabel_fontsize: Default 12pt - @param ylabel_fontsize: Default 10pt - @param cluster_cols: Boolean, whether or not to cluster the columns - @param cluster_rows: - @param plot_df: The dataframe you want to plot. This can contain NAs and + :param xlabel_fontsize: Default 12pt + :param ylabel_fontsize: Default 10pt + :param cluster_cols: Boolean, whether or not to cluster the columns + :param cluster_rows: + :param plot_df: The dataframe you want to plot. This can contain NAs and other nasty things. - @return: fig, row_dendrogram, col_dendrogram - @rtype: matplotlib.figure.Figure, dict, dict - @raise TypeError: + :param row_linkage_method: + :param col_linkage_method: + :param vmin: Minimum value to plot on heatmap + :param vmax: Maximum value to plot on heatmap + :param linewidth: Linewidth of lines around heatmap box elements + (default 0.0001) + :param edgecolor: Color of lines around heatmap box elements (default + white) """ + #@return: fig, row_dendrogram, col_dendrogram + #@rtype: matplotlib.figure.Figure, dict, dict + #@raise TypeError: import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec import scipy.spatial.distance as distance @@ -2655,6 +2666,7 @@ def heatmap(df, cmap = mpl.cm.RdBu_r if divergent else mpl.cm.YlGnBu cmap.set_bad('white') + # TODO: Add optimal leaf ordering for clusters # calculate pairwise distances for rows row_pairwise_dists = distance.squareform(distance.pdist(df)) row_clusters = sch.linkage(row_pairwise_dists, method=row_linkage_method) @@ -2680,6 +2692,8 @@ def heatmap(df, figsize = (width, height) #print figsize + + fig = plt.figure(figsize=figsize) heatmap_gridspec = \ gridspec.GridSpec(nrows, ncols, wspace=0.0, hspace=0.0, @@ -2697,6 +2711,7 @@ def heatmap(df, col_dendrogram = {'leaves': list(range(df.shape[1]))} _clean_axis(col_dendrogram_ax) + # TODO: Allow for array of color labels ### col colorbar ### if col_side_colors is not None: column_colorbar_ax = fig.add_subplot(heatmap_gridspec[1, ncols - 1]) @@ -2706,7 +2721,7 @@ def heatmap(df, row=False) column_colorbar_ax_pcolormesh = column_colorbar_ax.pcolormesh( col_side_matrix, cmap=col_cmap, - edgecolors='white', linewidth=0.1) + edgecolors='white', linewidth=linewidth) column_colorbar_ax.set_xlim(0, col_side_matrix.shape[1]) _clean_axis(column_colorbar_ax) @@ -2730,7 +2745,7 @@ def heatmap(df, ind=row_dendrogram['leaves'], row=True) row_colorbar_ax.pcolormesh(row_side_matrix, cmap=row_cmap, - edgecolors='white', linewidth=0.01) + edgecolors='white', linewidth=linewidth) row_colorbar_ax.set_ylim(0, row_side_matrix.shape[0]) _clean_axis(row_colorbar_ax) @@ -2741,7 +2756,7 @@ def heatmap(df, col_dendrogram['leaves']].values, norm=my_norm, cmap=cmap, edgecolors='white', - lw=0.01) + lw=linewidth) heatmap_ax.set_ylim(0, df.shape[0]) heatmap_ax.set_xlim(0, df.shape[1]) From 2517e7166ed6e4fececad2188cde239711d7e144 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 10:34:55 -0800 Subject: [PATCH 17/31] Fixed log scale vmin/vmax stuff and added some more parameters for user fiddling --- pandas/tools/plotting.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 75df3637d79f7..d2b0730b7982f 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2575,9 +2575,10 @@ def heatmap(df, ylabel_fontsize=10, cluster_cols=True, cluster_rows=True, - linewidth=0.0001, + linewidth=0, edgecolor='white', - plot_df=None): + plot_df=None, + colorbar_ticklabels_fontsize=10): """ @author Olga Botvinnik olga.botvinnik@gmail.com @@ -2618,7 +2619,7 @@ def heatmap(df, :param vmin: Minimum value to plot on heatmap :param vmax: Maximum value to plot on heatmap :param linewidth: Linewidth of lines around heatmap box elements - (default 0.0001) + (default 0) :param edgecolor: Color of lines around heatmap box elements (default white) """ @@ -2651,22 +2652,25 @@ def heatmap(df, divergent = df.max().max() > 0 and df.min().min() < 0 if color_scale == 'log': - vmin = max(np.floor(df.dropna(how='all').min().dropna().min()), 1e-10) - vmax = np.ceil(df.dropna(how='all').max().dropna().max()) + if vmin is None: + vmin = max(np.floor(df.dropna(how='all').min().dropna().min()), 1e-10) + if vmax is None: + vmax = np.ceil(df.dropna(how='all').max().dropna().max()) my_norm = mpl.colors.LogNorm(vmin, vmax) elif divergent: abs_max = abs(df.max().max()) abs_min = abs(df.min().min()) - vmax = max(abs_max, abs_min) - my_norm = mpl.colors.Normalize(vmin=-vmax, vmax=vmax) + vmaxx = max(abs_max, abs_min) + my_norm = mpl.colors.Normalize(vmin=-vmaxx, vmax=vmaxx) else: - my_norm = None + my_norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax) if cmap is None: cmap = mpl.cm.RdBu_r if divergent else mpl.cm.YlGnBu cmap.set_bad('white') # TODO: Add optimal leaf ordering for clusters + # TODO: if color_scale is 'log', should distance also be on np.log(df) ? # calculate pairwise distances for rows row_pairwise_dists = distance.squareform(distance.pdist(df)) row_clusters = sch.linkage(row_pairwise_dists, method=row_linkage_method) @@ -2721,7 +2725,7 @@ def heatmap(df, row=False) column_colorbar_ax_pcolormesh = column_colorbar_ax.pcolormesh( col_side_matrix, cmap=col_cmap, - edgecolors='white', linewidth=linewidth) + edgecolor=edgecolor, linewidth=linewidth) column_colorbar_ax.set_xlim(0, col_side_matrix.shape[1]) _clean_axis(column_colorbar_ax) @@ -2745,7 +2749,7 @@ def heatmap(df, ind=row_dendrogram['leaves'], row=True) row_colorbar_ax.pcolormesh(row_side_matrix, cmap=row_cmap, - edgecolors='white', linewidth=linewidth) + edgecolors=edgecolor, linewidth=linewidth) row_colorbar_ax.set_ylim(0, row_side_matrix.shape[0]) _clean_axis(row_colorbar_ax) @@ -2755,7 +2759,7 @@ def heatmap(df, heatmap_ax.pcolormesh(plot_df.ix[row_dendrogram['leaves'], col_dendrogram['leaves']].values, norm=my_norm, cmap=cmap, - edgecolors='white', + edgecolor=edgecolor, lw=linewidth) heatmap_ax.set_ylim(0, df.shape[0]) @@ -2826,7 +2830,7 @@ def heatmap(df, # make colorbar labels smaller yticklabels = cb.ax.yaxis.get_ticklabels() for t in yticklabels: - t.set_fontsize(t.get_fontsize() - 3) + t.set_fontsize(colorbar_ticklabels_fontsize) fig.tight_layout() return fig, row_dendrogram, col_dendrogram From 8286c8851dcd91a9c7406763f32050b3f7f02dda Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 15:08:12 -0800 Subject: [PATCH 18/31] Added width ratio finder for various colorbar locations (but otherwise other colorbar locations not implemented) --- pandas/tools/plotting.py | 65 ++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index d2b0730b7982f..188abc5d4a9db 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2555,6 +2555,7 @@ def _color_list_to_matrix_and_cmap(color_list, ind, row=True): def heatmap(df, title=None, + title_fontsize=12, colorbar_label='values', col_side_colors=None, row_side_colors=None, @@ -2578,11 +2579,16 @@ def heatmap(df, linewidth=0, edgecolor='white', plot_df=None, - colorbar_ticklabels_fontsize=10): + colorbar_ticklabels_fontsize=10, + colorbar_loc="upper left"): """ @author Olga Botvinnik olga.botvinnik@gmail.com + :param title_fontsize: + :param colorbar_ticklabels_fontsize: + :param colorbar_loc: Can be 'upper left' (in the corner), 'right', + or 'bottom' This is liberally borrowed (with permission) from http://bit.ly/1eWcYWc :param df: The dataframe you want to cluster on @@ -2670,7 +2676,7 @@ def heatmap(df, cmap.set_bad('white') # TODO: Add optimal leaf ordering for clusters - # TODO: if color_scale is 'log', should distance also be on np.log(df) ? + # TODO: if color_scale is 'log', should distance also be on np.log(df)? # calculate pairwise distances for rows row_pairwise_dists = distance.squareform(distance.pdist(df)) row_clusters = sch.linkage(row_pairwise_dists, method=row_linkage_method) @@ -2681,14 +2687,39 @@ def heatmap(df, col_clusters = sch.linkage(col_pairwise_dists, method=col_linkage_method) # heatmap with row names - dendrogram_height_fraction = df.shape[0] * 0.25 / df.shape[0] - dendrogram_width_fraction = df.shape[1] * 0.25 / df.shape[1] - width_ratios = [dendrogram_width_fraction, 1] \ - if row_side_colors is None else [dendrogram_width_fraction, 0.05, 1] - height_ratios = [dendrogram_height_fraction, 1] \ - if col_side_colors is None else [dendrogram_height_fraction, 0.05, 1] - nrows = 2 if col_side_colors is None else 3 - ncols = 2 if row_side_colors is None else 3 + + def get_width_ratios(half_width, side_colors, + colorbar_loc, dimension, side_colors_ratio=0.05): + if colorbar_loc not in ('upper left', 'right', 'bottom'): + raise AssertionError("{} is not a valid 'colorbar_loc' (valid: " + "'upper left', 'right', 'bottom')".format( + colorbar_loc)) + if dimension not in ('height', 'width'): + raise AssertionError("{} is not a valid 'dimension' (valid: " + "'height', 'width')".format( + dimension)) + + ratios = [half_width, half_width] + if side_colors: + ratios += [side_colors_ratio] + + if (colorbar_loc == 'right' and dimension == 'width') or ( + colorbar_loc == 'bottom' and dimension == 'height'): + return ratios + [1, 0.05] + else: + return ratios + [1] + + + col_dendrogram_half_height = df.shape[0] * 0.1 / df.shape[0] + row_dendrogram_half_width = df.shape[1] * 0.1 / df.shape[1] + width_ratios = get_width_ratios(row_dendrogram_half_width, + row_side_colors, + colorbar_loc, dimension='width') + height_ratios = get_width_ratios(col_dendrogram_half_height, + col_side_colors, + colorbar_loc, dimension='height') + nrows = 3 if col_side_colors is None else 4 + ncols = 3 if row_side_colors is None else 4 width = df.shape[1] * 0.25 height = min(df.shape[0] * .75, 40) @@ -2706,7 +2737,7 @@ def heatmap(df, # print heatmap_gridspec ### col dendrogram ### - col_dendrogram_ax = fig.add_subplot(heatmap_gridspec[0, ncols - 1]) + col_dendrogram_ax = fig.add_subplot(heatmap_gridspec[1, ncols - 1]) if cluster_cols: col_dendrogram = sch.dendrogram(col_clusters, color_threshold=np.inf, @@ -2718,7 +2749,7 @@ def heatmap(df, # TODO: Allow for array of color labels ### col colorbar ### if col_side_colors is not None: - column_colorbar_ax = fig.add_subplot(heatmap_gridspec[1, ncols - 1]) + column_colorbar_ax = fig.add_subplot(heatmap_gridspec[2, ncols - 1]) col_side_matrix, col_cmap = _color_list_to_matrix_and_cmap( col_side_colors, ind=col_dendrogram['leaves'], @@ -2730,7 +2761,7 @@ def heatmap(df, _clean_axis(column_colorbar_ax) ### row dendrogram ### - row_dendrogram_ax = fig.add_subplot(heatmap_gridspec[nrows - 1, 0]) + row_dendrogram_ax = fig.add_subplot(heatmap_gridspec[nrows - 1, 1]) if cluster_rows: row_dendrogram = \ sch.dendrogram(row_clusters, @@ -2743,7 +2774,7 @@ def heatmap(df, ### row colorbar ### if row_side_colors is not None: - row_colorbar_ax = fig.add_subplot(heatmap_gridspec[nrows - 1, 1]) + row_colorbar_ax = fig.add_subplot(heatmap_gridspec[nrows - 1, 2]) row_side_matrix, row_cmap = _color_list_to_matrix_and_cmap( row_side_colors, ind=row_dendrogram['leaves'], @@ -2786,7 +2817,7 @@ def heatmap(df, # Add title if there is one: if title is not None: - col_dendrogram_ax.set_title(title) + col_dendrogram_ax.set_title(title, fontsize=title_fontsize) ## col labels ## if isinstance(label_cols, Iterable): @@ -2827,6 +2858,10 @@ def heatmap(df, cb.ax.yaxis.set_ticks_position('left') cb.outline.set_linewidth(0) + ## Make colorbar narrower + #xmin, xmax, ymin, ymax = cb.ax.axis() + #cb.ax.set_xlim(xmin, xmax/0.2) + # make colorbar labels smaller yticklabels = cb.ax.yaxis.get_ticklabels() for t in yticklabels: From fd195b15c882b4520952c6e385466a4dcec3886c Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 15:09:58 -0800 Subject: [PATCH 19/31] Changed clustering to operate on log10 of the data if the color_scale=log --- pandas/tools/plotting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 188abc5d4a9db..e073b7219e05a 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2678,11 +2678,11 @@ def heatmap(df, # TODO: Add optimal leaf ordering for clusters # TODO: if color_scale is 'log', should distance also be on np.log(df)? # calculate pairwise distances for rows - row_pairwise_dists = distance.squareform(distance.pdist(df)) + row_pairwise_dists = distance.squareform(distance.pdist(np.log10(df))) row_clusters = sch.linkage(row_pairwise_dists, method=row_linkage_method) # calculate pairwise distances for columns - col_pairwise_dists = distance.squareform(distance.pdist(df.T)) + col_pairwise_dists = distance.squareform(distance.pdist(np.log10(df.T))) # cluster col_clusters = sch.linkage(col_pairwise_dists, method=col_linkage_method) From ae8698257400969ff206a2efa247ca048c74b120 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 22:23:06 -0800 Subject: [PATCH 20/31] Added acknowledgement to Michael Lovci --- pandas/tools/plotting.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index e073b7219e05a..ff5b5750306a1 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2585,11 +2585,16 @@ def heatmap(df, @author Olga Botvinnik olga.botvinnik@gmail.com + This is liberally borrowed (with permission) from http://bit.ly/1eWcYWc + Many thanks to Christopher DeBoever and Mike Lovci for providing heatmap + guidance. + + :param title_fontsize: :param colorbar_ticklabels_fontsize: :param colorbar_loc: Can be 'upper left' (in the corner), 'right', or 'bottom' - This is liberally borrowed (with permission) from http://bit.ly/1eWcYWc + :param df: The dataframe you want to cluster on :param title: Title of the figure @@ -2639,6 +2644,8 @@ def heatmap(df, import matplotlib as mpl from collections import Iterable + + almost_black = '#262626' sch.set_link_color_palette([almost_black]) if plot_df is None: @@ -2679,12 +2686,12 @@ def heatmap(df, # TODO: if color_scale is 'log', should distance also be on np.log(df)? # calculate pairwise distances for rows row_pairwise_dists = distance.squareform(distance.pdist(np.log10(df))) - row_clusters = sch.linkage(row_pairwise_dists, method=row_linkage_method) + row_linkage = sch.linkage(row_pairwise_dists, method=row_linkage_method) # calculate pairwise distances for columns col_pairwise_dists = distance.squareform(distance.pdist(np.log10(df.T))) # cluster - col_clusters = sch.linkage(col_pairwise_dists, method=col_linkage_method) + col_linkage = sch.linkage(col_pairwise_dists, method=col_linkage_method) # heatmap with row names @@ -2739,7 +2746,7 @@ def get_width_ratios(half_width, side_colors, ### col dendrogram ### col_dendrogram_ax = fig.add_subplot(heatmap_gridspec[1, ncols - 1]) if cluster_cols: - col_dendrogram = sch.dendrogram(col_clusters, + col_dendrogram = sch.dendrogram(col_linkage, color_threshold=np.inf, color_list=[almost_black]) else: @@ -2764,7 +2771,7 @@ def get_width_ratios(half_width, side_colors, row_dendrogram_ax = fig.add_subplot(heatmap_gridspec[nrows - 1, 1]) if cluster_rows: row_dendrogram = \ - sch.dendrogram(row_clusters, + sch.dendrogram(row_linkage, color_threshold=np.inf, orientation='right', color_list=[almost_black]) From ff63dd9498800c393c37bfb334c18334df3874f6 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 22:31:27 -0800 Subject: [PATCH 21/31] Added fastcluster as optional import for really large matrices --- pandas/tools/plotting.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index ff5b5750306a1..f4cb000e4985b 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2645,6 +2645,23 @@ def heatmap(df, from collections import Iterable + if df.shape[0] > 1000 or df.shape[1] > 1000: + try: + import fastcluster + linkage_function = fastcluster.linkage + except ImportError: + raise warnings.warn('Module "fastcluster" not found. The ' + 'dataframe ' + 'provided has ' + 'shape {}, and one ' + 'of the dimensions has greater than 1000 ' + 'variables. Calculating linkage on such a ' + 'matrix will take a long time with vanilla ' + '"scipy.cluster.hierarchy.linkage", and we ' + 'suggest fastcluster for such large datasets'\ + .format(df.shape), RuntimeWarning) + else: + linkage_function = sch.linkage almost_black = '#262626' sch.set_link_color_palette([almost_black]) @@ -2686,12 +2703,12 @@ def heatmap(df, # TODO: if color_scale is 'log', should distance also be on np.log(df)? # calculate pairwise distances for rows row_pairwise_dists = distance.squareform(distance.pdist(np.log10(df))) - row_linkage = sch.linkage(row_pairwise_dists, method=row_linkage_method) + row_linkage = linkage_function(row_pairwise_dists, method=row_linkage_method) # calculate pairwise distances for columns col_pairwise_dists = distance.squareform(distance.pdist(np.log10(df.T))) # cluster - col_linkage = sch.linkage(col_pairwise_dists, method=col_linkage_method) + col_linkage = linkage_function(col_pairwise_dists, method=col_linkage_method) # heatmap with row names From c1918126de1da074466d24f8bb9ab06315efd206 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 22:33:18 -0800 Subject: [PATCH 22/31] added fastcluster argument to heatmap function --- pandas/tools/plotting.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index f4cb000e4985b..3d54ecf943528 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2580,7 +2580,8 @@ def heatmap(df, edgecolor='white', plot_df=None, colorbar_ticklabels_fontsize=10, - colorbar_loc="upper left"): + colorbar_loc="upper left", + use_fastcluster=False): """ @author Olga Botvinnik olga.botvinnik@gmail.com @@ -2645,7 +2646,7 @@ def heatmap(df, from collections import Iterable - if df.shape[0] > 1000 or df.shape[1] > 1000: + if (df.shape[0] > 1000 or df.shape[1] > 1000) or use_fastcluster: try: import fastcluster linkage_function = fastcluster.linkage From ee8bdec646b86f0efb237813324256a075aebaa0 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 5 Dec 2013 22:38:55 -0800 Subject: [PATCH 23/31] Was clustering on log10(df) all the time?? fixed to only do so when color_scale='log' --- pandas/tools/plotting.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 3d54ecf943528..fee332bee870f 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2703,11 +2703,13 @@ def heatmap(df, # TODO: Add optimal leaf ordering for clusters # TODO: if color_scale is 'log', should distance also be on np.log(df)? # calculate pairwise distances for rows - row_pairwise_dists = distance.squareform(distance.pdist(np.log10(df))) + if color_scale == 'log': + df = np.log10(df) + row_pairwise_dists = distance.squareform(distance.pdist(df)) row_linkage = linkage_function(row_pairwise_dists, method=row_linkage_method) # calculate pairwise distances for columns - col_pairwise_dists = distance.squareform(distance.pdist(np.log10(df.T))) + col_pairwise_dists = distance.squareform(distance.pdist(df.T)) # cluster col_linkage = linkage_function(col_pairwise_dists, method=col_linkage_method) From 7caf10be19ad15a2bea5011361230df75fbc4c85 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 6 Dec 2013 01:40:47 -0800 Subject: [PATCH 24/31] added metric option for calculating distances --- pandas/tools/plotting.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index fee332bee870f..509e8cdad996c 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2581,7 +2581,8 @@ def heatmap(df, plot_df=None, colorbar_ticklabels_fontsize=10, colorbar_loc="upper left", - use_fastcluster=False): + use_fastcluster=False, + metric='euclidean'): """ @author Olga Botvinnik olga.botvinnik@gmail.com @@ -2705,11 +2706,13 @@ def heatmap(df, # calculate pairwise distances for rows if color_scale == 'log': df = np.log10(df) - row_pairwise_dists = distance.squareform(distance.pdist(df)) + row_pairwise_dists = distance.squareform(distance.pdist(df, + metric=metric)) row_linkage = linkage_function(row_pairwise_dists, method=row_linkage_method) # calculate pairwise distances for columns - col_pairwise_dists = distance.squareform(distance.pdist(df.T)) + col_pairwise_dists = distance.squareform(distance.pdist(df.T, + metric=metric)) # cluster col_linkage = linkage_function(col_pairwise_dists, method=col_linkage_method) From 87e989d005eae37370ac72c11b5ed5dbb4c1a4bf Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 6 Dec 2013 01:40:59 -0800 Subject: [PATCH 25/31] added metric option for calculating distances --- pandas/tools/plotting.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 509e8cdad996c..b473eed5718f0 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2586,7 +2586,6 @@ def heatmap(df, """ @author Olga Botvinnik olga.botvinnik@gmail.com - This is liberally borrowed (with permission) from http://bit.ly/1eWcYWc Many thanks to Christopher DeBoever and Mike Lovci for providing heatmap guidance. From b7b23409cdcdda549bff91d00673fe825102a0a2 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 6 Dec 2013 01:41:20 -0800 Subject: [PATCH 26/31] removed extraneous spaces --- pandas/tools/plotting.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index b473eed5718f0..facda5994014a 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2568,10 +2568,6 @@ def heatmap(df, label_cols=True, vmin=None, vmax=None, - - #col_labels=None, - #row_labels=None, - xlabel_fontsize=12, ylabel_fontsize=10, cluster_cols=True, From 4c31c6cc7556eeb4c951dea90a81ca4e0ffc3487 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 6 Dec 2013 13:34:37 -0800 Subject: [PATCH 27/31] set default linkage method to average --- pandas/tools/plotting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index facda5994014a..f59108f25f0e5 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2561,8 +2561,8 @@ def heatmap(df, row_side_colors=None, color_scale='linear', cmap=None, - row_linkage_method='complete', - col_linkage_method='complete', + row_linkage_method='average', + col_linkage_method='average', figsize=None, label_rows=True, label_cols=True, From a2a13132cd701bd57d0a4287a133636ee8591a0d Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Sat, 7 Dec 2013 16:16:45 -0800 Subject: [PATCH 28/31] changed any(df) to df.any() --- pandas/tools/plotting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index f59108f25f0e5..2db5141805fa4 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2665,9 +2665,9 @@ def heatmap(df, if plot_df is None: plot_df = df - if any(plot_df.index != df.index): + if (plot_df.index != df.index).any(): raise AssertionError('plot_df must have the exact same indices as df') - if any(plot_df.columns != df.columns): + if (plot_df.columns != df.columns).any(): raise AssertionError('plot_df must have the exact same columns as df') # make norm From 0707d0f95bb8c92e05d9137e9ede67785a3f5ffd Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 16 Dec 2013 08:53:34 -0800 Subject: [PATCH 29/31] Changed AssertionError's to ValueError's --- pandas/tools/plotting.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 2db5141805fa4..98efe8e0d37a7 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2622,7 +2622,7 @@ def heatmap(df, :param cluster_rows: :param plot_df: The dataframe you want to plot. This can contain NAs and other nasty things. - :param row_linkage_method: + :param row_linkage_method: :param col_linkage_method: :param vmin: Minimum value to plot on heatmap :param vmax: Maximum value to plot on heatmap @@ -2641,6 +2641,7 @@ def heatmap(df, import matplotlib as mpl from collections import Iterable + #if cluster if (df.shape[0] > 1000 or df.shape[1] > 1000) or use_fastcluster: try: @@ -2666,9 +2667,9 @@ def heatmap(df, plot_df = df if (plot_df.index != df.index).any(): - raise AssertionError('plot_df must have the exact same indices as df') + raise ValueError('plot_df must have the exact same indices as df') if (plot_df.columns != df.columns).any(): - raise AssertionError('plot_df must have the exact same columns as df') + raise ValueError('plot_df must have the exact same columns as df') # make norm # Check if the matrix has values both above and below zero, or only above From b2e4552dc4c92c81cceab4b21bbfa0d387a94304 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 16 Dec 2013 09:14:50 -0800 Subject: [PATCH 30/31] Added docstring explaining get_width_ratios --- pandas/tools/plotting.py | 54 +++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 98efe8e0d37a7..cdd01c50da3ad 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2714,8 +2714,52 @@ def heatmap(df, # heatmap with row names - def get_width_ratios(half_width, side_colors, + def get_width_ratios(shape, side_colors, colorbar_loc, dimension, side_colors_ratio=0.05): + """ + Figures out the ratio of each subfigure within the larger figure. + The dendrograms currently are 2*half_dendrogram, which is a proportion of + the dataframe shape. Right now, this only supports the colormap in + the upper left. The full figure map looks like: + + 0.1 0.1 0.05 1.0 + 0.1 cb column + 0.1 dendrogram + 0.05 col colors + | r d r + | o e o + | w n w + | d + 1.0| r c heatmap + | o o + | g l + | r o + | a r + | m s + + The colorbar is half_dendrogram of the whitespace in the corner between + the row and column dendrogram. Otherwise, it's too big and its + corners touch the heatmap, which I didn't like. + + For example, if there are side_colors, need to provide an extra value + in the ratio tuples, with the width side_colors_ratio. But if there + aren't any side colors, then the tuple is of size 3 (half_dendrogram, + half_dendrogram, 1.0), and if there are then the tuple is of size 4 ( + half_dendrogram, half_dendrogram, 0.05, 1.0) + + :param side_colors: + :type side_colors: + :param colorbar_loc: + :type colorbar_loc: + :param dimension: + :type dimension: + :param side_colors_ratio: + :type side_colors_ratio: + :return: + :rtype: + """ + i = 0 if dimension == 'height' else 1 + half_dendrogram = shape[i] * 0.1/shape[i] if colorbar_loc not in ('upper left', 'right', 'bottom'): raise AssertionError("{} is not a valid 'colorbar_loc' (valid: " "'upper left', 'right', 'bottom')".format( @@ -2725,7 +2769,7 @@ def get_width_ratios(half_width, side_colors, "'height', 'width')".format( dimension)) - ratios = [half_width, half_width] + ratios = [half_dendrogram, half_dendrogram] if side_colors: ratios += [side_colors_ratio] @@ -2736,12 +2780,10 @@ def get_width_ratios(half_width, side_colors, return ratios + [1] - col_dendrogram_half_height = df.shape[0] * 0.1 / df.shape[0] - row_dendrogram_half_width = df.shape[1] * 0.1 / df.shape[1] - width_ratios = get_width_ratios(row_dendrogram_half_width, + width_ratios = get_width_ratios(df.shape, row_side_colors, colorbar_loc, dimension='width') - height_ratios = get_width_ratios(col_dendrogram_half_height, + height_ratios = get_width_ratios(df.shape, col_side_colors, colorbar_loc, dimension='height') nrows = 3 if col_side_colors is None else 4 From d6785a2dad67b3c4ae0e8a289ffb8ec6dcc06439 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 16 Dec 2013 14:56:25 -0800 Subject: [PATCH 31/31] Unified row and column linkage methods --- pandas/tools/plotting.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index cdd01c50da3ad..ba5ae3b0cb52c 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -2561,8 +2561,7 @@ def heatmap(df, row_side_colors=None, color_scale='linear', cmap=None, - row_linkage_method='average', - col_linkage_method='average', + linkage_method='average', figsize=None, label_rows=True, label_cols=True, @@ -2704,13 +2703,13 @@ def heatmap(df, df = np.log10(df) row_pairwise_dists = distance.squareform(distance.pdist(df, metric=metric)) - row_linkage = linkage_function(row_pairwise_dists, method=row_linkage_method) + row_linkage = linkage_function(row_pairwise_dists, method=linkage_method) # calculate pairwise distances for columns col_pairwise_dists = distance.squareform(distance.pdist(df.T, metric=metric)) # cluster - col_linkage = linkage_function(col_pairwise_dists, method=col_linkage_method) + col_linkage = linkage_function(col_pairwise_dists, method=linkage_method) # heatmap with row names