In R, you can draw a heatmap with clustered dendrograms for each axis of x and y just by typing heatmap (x)
, and the gene expression level is familiar to bioinformaticians who mutter every day. It is a function of, but when I tried to do the same thing in the Python + matplotlib environment, there was not much information, so I tried it by trial and error.
I will publish it as a memorandum.
First, as appropriate data, create data that shows the increase or decrease in gene expression level for each cell tissue.
The tissue choices and gene choices here are all appropriate.
We'll tweak the random-generated data into a pandas.DataFrame
object so that it looks like it when clustered later.
python
#!/usr/bin/env python3
genes = [
'HIST1H4H', 'SPRN', 'DNLZ', 'PYCARD', 'PRDM11', 'DKKL1', 'CYBRD1', 'DMAP1',
'MT1E', 'PDGFRL', 'SERTM1', 'PIFO', 'FAM109A', 'ST5', 'ABCA7', 'FAM160A1',
'SAMD15', 'NUAK1', 'GLTP', 'HIST3H2A', 'SCN5A', 'PNPLA5', 'SFRP5', 'CCBE1',
'PTCD1', 'RFTN1', 'SYTL2', 'FAM65B', 'NFKBIZ', 'RHOG', 'KIF3A', 'DYRK1B',
'NXPH2', 'APLN', 'ZNF526', 'NRIP3', 'KCNMA1', 'MTSS1', 'ZNF566', 'TNC',
'GPX2', 'AQP3', 'TSACC', 'SNX15', 'TRIM22', 'THAP6', 'GRIP1', 'DLGAP3',
]
tissues = [
'brain', 'kidney', 'lung', 'heart',
'liver', 'pancreas', 'testis', 'placenta',
]
def draw_heatmap(a):
print(a.head())
def _main():
from random import choice
import numpy as np
from pandas import DataFrame
v = np.random.random([4, len(tissues)]) * 1.2 - 0.6
w = np.random.random([3, len(genes)]) * 0.8 - 0.4
v = np.vstack([choice(v) for x in genes])
w = np.vstack([choice(w) for x in tissues]).T
a = DataFrame(v + w, index=genes, columns=tissues)
draw_heatmap(a)
if __name__ == '__main__':
_main()
Execution result
brain kidney lung heart liver pancreas \
HIST1H4H -0.085630 0.074054 0.058026 -0.142751 -0.767515 -0.348885
SPRN -0.424203 0.251821 -0.012052 -0.037645 -0.000477 0.714727
DNLZ 0.372402 0.532086 0.097971 -0.102806 -0.727570 0.109148
PYCARD -0.561378 0.114647 0.706732 0.681139 0.718306 0.577552
PRDM11 -0.698969 -0.022945 0.240133 0.214540 0.251708 0.439960
testis placenta
HIST1H4H 0.324709 -0.319531
SPRN -0.815679 -0.010529
DNLZ 0.402956 -0.241284
PYCARD -0.728135 0.077015
PRDM11 -0.773637 0.031513
From now on, we will only play with the draw_heatmap (a)
function.
If you want to check only the data for the time being instead of creating materials, you can display it by inserting the data into ʻimshow ()of
matplotlib` as follows.
def draw_heatmap(a):
from matplotlib import pyplot as plt
plt.imshow(a, aspect='auto', interpolation='none')
plt.colorbar()
plt.xticks(range(a.shape[1]), a.columns)
plt.yticks(range(a.shape[0]), a.index)
plt.show()
We will make it look a little better than the above example.
from matplotlib.colors import LinearSegmentedColormap
microarray_cmap = LinearSegmentedColormap('microarray', {
'red': [(0.0, 1.0, 1.0), (0.5, 0.2, 0.2), (1.0, 0.0, 0.0)],
'green': [(0.0, 0.0, 0.0), (0.5, 0.2, 0.2), (1.0, 1.0, 1.0)],
'blue': [(0.0, 0.0, 0.0), (0.5, 0.2, 0.2), (1.0, 0.0, 0.0)],
})
def draw_heatmap(a, cmap=microarray_cmap):
from matplotlib import pyplot as plt
plt.imshow(a, aspect='auto', interpolation='none',
cmap=cmap, vmin=-1.0, vmax=1.0)
plt.colorbar()
plt.xticks(range(a.shape[1]), a.columns, rotation=15)
plt.yticks(range(a.shape[0]), a.index, size='small')
plt.gca().get_xaxis().set_ticks_position('none')
plt.gca().get_yaxis().set_ticks_position('none')
plt.show()
Colormap that changes from red to black to green is not in the preset of matplotlib, so I define it by myself with LinearSegmentedColormap
.
If 0 is completely black, the difference near 0 is too low in brightness to be understood, so I dare to make the middle dark gray.
It is easier to see by clustering genes with similar expression patterns.
def draw_heatmap(a, cmap=microarray_cmap):
from matplotlib import pyplot as plt
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram
metric = 'euclidean'
method = 'average'
plt.subplot(1, 2, 1)
ylinkage =linkage(pdist(a, metric=metric), method=method, metric=metric)
ydendro = dendrogram(ylinkage, orientation='right', no_labels=True,
distance_sort='descending')
a = a.ix[[a.index[i] for i in ydendro['leaves']]]
plt.subplot(1, 2, 2)
plt.imshow(a, aspect='auto', interpolation='none',
cmap=cmap, vmin=-1.0, vmax=1.0)
plt.colorbar()
plt.xticks(range(a.shape[1]), a.columns, rotation=15)
plt.yticks(range(a.shape[0]), a.index, size='small')
plt.gca().xaxis.set_ticks_position('none')
plt.gca().yaxis.set_ticks_position('none')
plt.gca().invert_yaxis()
plt.show()
Use the Scipy function to perform hierarchical clustering and draw side by side with the dendrogram.
dendrogram (..., no_plot = True)
.has the origin at the upper left, while
dendrogram () has the origin at the lower left, so use ʻinvert_yaxis ()
to swap the directions of the axes.Makes the above example look good.
def draw_heatmap(a, cmap=microarray_cmap):
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram
metric = 'euclidean'
method = 'average'
main_axes = plt.gca()
divider = make_axes_locatable(main_axes)
plt.sca(divider.append_axes("left", 1.0, pad=0))
ylinkage = linkage(pdist(a, metric=metric), method=method, metric=metric)
ydendro = dendrogram(ylinkage, orientation='right', no_labels=True,
distance_sort='descending',
link_color_func=lambda x: 'black')
plt.gca().set_axis_off()
a = a.ix[[a.index[i] for i in ydendro['leaves']]]
plt.sca(main_axes)
plt.imshow(a, aspect='auto', interpolation='none',
cmap=cmap, vmin=-1.0, vmax=1.0)
plt.colorbar(pad=0.15)
plt.gca().yaxis.tick_right()
plt.xticks(range(a.shape[1]), a.columns, rotation=15, size='small')
plt.yticks(range(a.shape[0]), a.index, size='small')
plt.gca().xaxis.set_ticks_position('none')
plt.gca().yaxis.set_ticks_position('none')
plt.gca().invert_yaxis()
plt.show()
make_axes_locatable ()
instead of subplot ()
.set_axis_off ()
and the line is black only with link_color_func
.yaxis.tick_right ()
brings the y-axis label to the right. Since it overlaps with the color bar as it is, the interval is widened as colorbar (pad = ...)
.It is not necessary for time-series data, but it may be easier to understand if the x-axis is also clustered for data for each organization. What you do is basically the same as before.
def draw_heatmap(a, cmap=microarray_cmap):
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram
metric = 'euclidean'
method = 'average'
main_axes = plt.gca()
divider = make_axes_locatable(main_axes)
xdendro_axes = divider.append_axes("top", 0.5, pad=0)
ydendro_axes = divider.append_axes("left", 1.0, pad=0)
plt.sca(xdendro_axes)
xlinkage = linkage(pdist(a.T, metric=metric), method=method, metric=metric)
xdendro = dendrogram(xlinkage, orientation='top', no_labels=True,
distance_sort='descending',
link_color_func=lambda x: 'black')
plt.gca().set_axis_off()
a = a[[a.columns[i] for i in xdendro['leaves']]]
plt.sca(ydendro_axes)
ylinkage = linkage(pdist(a, metric=metric), method=method,
metric=metric)
ydendro = dendrogram(ylinkage, orientation='right', no_labels=True,
distance_sort='descending',
link_color_func=lambda x: 'black')
plt.gca().set_axis_off()
a = a.ix[[a.index[i] for i in ydendro['leaves']]]
plt.sca(main_axes)
plt.imshow(a, aspect='auto', interpolation='none',
cmap=cmap, vmin=-1.0, vmax=1.0)
plt.colorbar(pad=0.15)
plt.gca().yaxis.tick_right()
plt.xticks(range(a.shape[1]), a.columns, rotation=15, size='small')
plt.yticks(range(a.shape[0]), a.index, size='x-small')
plt.gca().xaxis.set_ticks_position('none')
plt.gca().yaxis.set_ticks_position('none')
plt.gca().invert_yaxis()
plt.show()
After that, just paste it in the paper with savefig ('hoge.eps')
and so on.
By the way, I feel that the meaning of dendrogram (orientation = ...)
is different between landscape and portrait ... Scipy bug?
Recommended Posts