Heatmap with Dendrogram in Python + matplotlib

In R, you can draw a heatmap with clustered dendrograms for each axis of x and y just by typing heatmap (x), and the gene expression level is familiar to bioinformaticians who mutter every day. It is a function of, but when I tried to do the same thing in the Python + matplotlib environment, there was not much information, so I tried it by trial and error.

I will publish it as a memorandum.

Data preparation

First, as appropriate data, create data that shows the increase or decrease in gene expression level for each cell tissue. The tissue choices and gene choices here are all appropriate. We'll tweak the random-generated data into a pandas.DataFrame object so that it looks like it when clustered later.

`python`


#!/usr/bin/env python3

genes = [
    'HIST1H4H', 'SPRN', 'DNLZ', 'PYCARD', 'PRDM11', 'DKKL1', 'CYBRD1', 'DMAP1',
    'MT1E', 'PDGFRL', 'SERTM1', 'PIFO', 'FAM109A', 'ST5', 'ABCA7', 'FAM160A1',
    'SAMD15', 'NUAK1', 'GLTP', 'HIST3H2A', 'SCN5A', 'PNPLA5', 'SFRP5', 'CCBE1',
    'PTCD1', 'RFTN1', 'SYTL2', 'FAM65B', 'NFKBIZ', 'RHOG', 'KIF3A', 'DYRK1B',
    'NXPH2', 'APLN', 'ZNF526', 'NRIP3', 'KCNMA1', 'MTSS1', 'ZNF566', 'TNC',
    'GPX2', 'AQP3', 'TSACC', 'SNX15', 'TRIM22', 'THAP6', 'GRIP1', 'DLGAP3',
]
tissues = [
    'brain', 'kidney', 'lung', 'heart',
    'liver', 'pancreas', 'testis', 'placenta',
]


def draw_heatmap(a):
    print(a.head())

def _main():
    from random import choice
    import numpy as np
    from pandas import DataFrame
    v = np.random.random([4, len(tissues)]) * 1.2 - 0.6
    w = np.random.random([3, len(genes)]) * 0.8 - 0.4
    v = np.vstack([choice(v) for x in genes])
    w = np.vstack([choice(w) for x in tissues]).T
    a = DataFrame(v + w, index=genes, columns=tissues)
    draw_heatmap(a)

if __name__ == '__main__':
    _main()

`Execution result`


             brain    kidney      lung     heart     liver  pancreas  \
HIST1H4H -0.085630  0.074054  0.058026 -0.142751 -0.767515 -0.348885   
SPRN     -0.424203  0.251821 -0.012052 -0.037645 -0.000477  0.714727   
DNLZ      0.372402  0.532086  0.097971 -0.102806 -0.727570  0.109148   
PYCARD   -0.561378  0.114647  0.706732  0.681139  0.718306  0.577552   
PRDM11   -0.698969 -0.022945  0.240133  0.214540  0.251708  0.439960   

            testis  placenta  
HIST1H4H  0.324709 -0.319531  
SPRN     -0.815679 -0.010529  
DNLZ      0.402956 -0.241284  
PYCARD   -0.728135  0.077015  
PRDM11   -0.773637  0.031513

From now on, we will only play with the draw_heatmap (a) function.

For quick display

If you want to check only the data for the time being instead of creating materials, you can display it by inserting the data into ʻimshow ()ofmatplotlib` as follows.

def draw_heatmap(a):
    from matplotlib import pyplot as plt
    plt.imshow(a, aspect='auto', interpolation='none')
    plt.colorbar()
    plt.xticks(range(a.shape[1]), a.columns)
    plt.yticks(range(a.shape[0]), a.index)
    plt.show()

Make it look good

We will make it look a little better than the above example.

The scale is unnecessary, so turn it off.
The label is hard to see, so adjust the size and position.
It seems that there are relatively many color coding such as red for Down regulation, black for Neutral, green for Up regulation, etc., which is a remnant of the two-color microarray, so I will follow this.
Since the scale is positive and negative and not symmetrical, specify the upper and lower limits.

from matplotlib.colors import LinearSegmentedColormap
microarray_cmap = LinearSegmentedColormap('microarray', {
    'red': [(0.0, 1.0, 1.0), (0.5, 0.2, 0.2), (1.0, 0.0, 0.0)],
    'green': [(0.0, 0.0, 0.0), (0.5, 0.2, 0.2), (1.0, 1.0, 1.0)],
    'blue': [(0.0, 0.0, 0.0), (0.5, 0.2, 0.2), (1.0, 0.0, 0.0)],
})

def draw_heatmap(a, cmap=microarray_cmap):
    from matplotlib import pyplot as plt
    plt.imshow(a, aspect='auto', interpolation='none',
               cmap=cmap, vmin=-1.0, vmax=1.0)
    plt.colorbar()
    plt.xticks(range(a.shape[1]), a.columns, rotation=15)
    plt.yticks(range(a.shape[0]), a.index, size='small')
    plt.gca().get_xaxis().set_ticks_position('none')
    plt.gca().get_yaxis().set_ticks_position('none')
    plt.show()

Colormap that changes from red to black to green is not in the preset of matplotlib, so I define it by myself with LinearSegmentedColormap. If 0 is completely black, the difference near 0 is too low in brightness to be understood, so I dare to make the middle dark gray.

Cluster

It is easier to see by clustering genes with similar expression patterns.

def draw_heatmap(a, cmap=microarray_cmap):
    from matplotlib import pyplot as plt
    from scipy.spatial.distance import pdist
    from scipy.cluster.hierarchy import linkage, dendrogram

    metric = 'euclidean'
    method = 'average'

    plt.subplot(1, 2, 1)
    ylinkage =linkage(pdist(a, metric=metric), method=method, metric=metric)
    ydendro = dendrogram(ylinkage, orientation='right', no_labels=True,
                         distance_sort='descending')
    a = a.ix[[a.index[i] for i in ydendro['leaves']]]

    plt.subplot(1, 2, 2)
    plt.imshow(a, aspect='auto', interpolation='none',
               cmap=cmap, vmin=-1.0, vmax=1.0)
    plt.colorbar()
    plt.xticks(range(a.shape[1]), a.columns, rotation=15)
    plt.yticks(range(a.shape[0]), a.index, size='small')
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.gca().invert_yaxis()
    plt.show()

Use the Scipy function to perform hierarchical clustering and draw side by side with the dendrogram.

Various distance definitions can be used in addition to euclidean. See Scipy Documentation for more information.
Various hierarchical clustering methods other than average can be used. See Scipy Documentation for more information.
After drawing the dendrogram, replace the rows of the DataFrame according to the order of the leaves.
If you want clustering but don't need the dendrogram, specify dendrogram (..., no_plot = True).
For the y-axis, ʻimshow ()has the origin at the upper left, whiledendrogram () has the origin at the lower left, so use ʻinvert_yaxis () to swap the directions of the axes.

Make it look good

Makes the above example look good.

Connect the dendrogram and heatmap and move the y-axis label to the right.
I don't need the dendrogram frame, so I'll erase it.
I don't need the color of the dendrogram, so I only use black.
Reduce the width of the dendrogram a little more.

def draw_heatmap(a, cmap=microarray_cmap):
    from matplotlib import pyplot as plt
    from mpl_toolkits.axes_grid1 import make_axes_locatable
    from scipy.spatial.distance import pdist
    from scipy.cluster.hierarchy import linkage, dendrogram

    metric = 'euclidean'
    method = 'average'

    main_axes = plt.gca()
    divider = make_axes_locatable(main_axes)

    plt.sca(divider.append_axes("left", 1.0, pad=0))
    ylinkage = linkage(pdist(a, metric=metric), method=method, metric=metric)
    ydendro = dendrogram(ylinkage, orientation='right', no_labels=True,
                         distance_sort='descending',
                         link_color_func=lambda x: 'black')
    plt.gca().set_axis_off()
    a = a.ix[[a.index[i] for i in ydendro['leaves']]]

    plt.sca(main_axes)
    plt.imshow(a, aspect='auto', interpolation='none',
               cmap=cmap, vmin=-1.0, vmax=1.0)
    plt.colorbar(pad=0.15)
    plt.gca().yaxis.tick_right()
    plt.xticks(range(a.shape[1]), a.columns, rotation=15, size='small')
    plt.yticks(range(a.shape[0]), a.index, size='small')
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.gca().invert_yaxis()
    plt.show()

I'm using make_axes_locatable () instead of subplot ().
The dendrogram frame is hidden with set_axis_off () and the line is black only with link_color_func.
yaxis.tick_right () brings the y-axis label to the right. Since it overlaps with the color bar as it is, the interval is widened as colorbar (pad = ...).

The x-axis is also clustered.

It is not necessary for time-series data, but it may be easier to understand if the x-axis is also clustered for data for each organization. What you do is basically the same as before.

def draw_heatmap(a, cmap=microarray_cmap):
    from matplotlib import pyplot as plt
    from mpl_toolkits.axes_grid1 import make_axes_locatable
    from scipy.spatial.distance import pdist
    from scipy.cluster.hierarchy import linkage, dendrogram

    metric = 'euclidean'
    method = 'average'

    main_axes = plt.gca()
    divider = make_axes_locatable(main_axes)
    xdendro_axes = divider.append_axes("top", 0.5, pad=0)
    ydendro_axes = divider.append_axes("left", 1.0, pad=0)

    plt.sca(xdendro_axes)
    xlinkage = linkage(pdist(a.T, metric=metric), method=method, metric=metric)
    xdendro = dendrogram(xlinkage, orientation='top', no_labels=True,
                         distance_sort='descending',
                         link_color_func=lambda x: 'black')
    plt.gca().set_axis_off()
    a = a[[a.columns[i] for i in xdendro['leaves']]]

    plt.sca(ydendro_axes)
    ylinkage = linkage(pdist(a, metric=metric), method=method,
                       metric=metric)
    ydendro = dendrogram(ylinkage, orientation='right', no_labels=True,
                         distance_sort='descending',
                         link_color_func=lambda x: 'black')
    plt.gca().set_axis_off()
    a = a.ix[[a.index[i] for i in ydendro['leaves']]]

    plt.sca(main_axes)
    plt.imshow(a, aspect='auto', interpolation='none',
               cmap=cmap, vmin=-1.0, vmax=1.0)
    plt.colorbar(pad=0.15)
    plt.gca().yaxis.tick_right()
    plt.xticks(range(a.shape[1]), a.columns, rotation=15, size='small')
    plt.yticks(range(a.shape[0]), a.index, size='x-small')
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.gca().invert_yaxis()
    plt.show()

After that, just paste it in the paper with savefig ('hoge.eps') and so on.

By the way, I feel that the meaning of dendrogram (orientation = ...) is different between landscape and portrait ... Scipy bug?