A well-known measure of the similarity between compounds is the "Tanimoto coefficient". However, quite a few people simply say "Tanimoto coefficient" without saying how it was calculated, and I often find myself wondering, "Isn't that an insufficient explanation?"
The following 10 compounds are used as examples.
# Example compounds as SMILES strings (10 in total). Some entries are
# deliberately close analogues of the entry directly before them.
smiles = [
    'C1=CC=C2C3CC(CNC3)CN2C1=O',
    'CN1c2c(C(N(C)C1=O)=O)[nH0](CC(CO)O)c[nH0]2',
    'CN1C2CC(CC1C1C2O1)OC(C(c1ccccc1)CO)=O',
    # Similar to the compound above
    'CN1C2CC(CC1C1C2O1)OC(C(c1cccnc1)CO)=O',
    'CN(C=1C(=O)N(c2ccccc2)N(C1C)C)C',
    # Similar to the compound above
    'CN(C=1C(=O)N(C2CCCCC2)N(C1C)C)C',
    'OCC1C(C(C(C(OCC2C(C(C(C(OC(c3ccccc3)C#N)O2)O)O)O)O1)O)O)O',
    'OCc1ccccc1OC1C(C(C(C(CO)O1)O)O)O',
    # Similar to the compound above
    'OCc1cc(N)ccc1OC1C(C(C(C(CO)O1)O)O)O',
    '[nH0]1c(OC)c2c([nH0]cc[nH0]2)[nH0]c1',
]
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem

# Parse every SMILES string into an RDKit molecule.
mols = [Chem.MolFromSmiles(smile) for smile in smiles]

# Chem.MolFromSmiles returns None (it does not raise) when a SMILES string
# cannot be parsed. Stop here and name the offending input instead of failing
# later with an unrelated error inside the fingerprint call.
_unparsed = [smi for smi, mol in zip(smiles, mols) if mol is None]
if _unparsed:
    raise ValueError(f"Could not parse SMILES: {_unparsed}")

# Feature-based Morgan fingerprints with radius 3.
fps = [AllChem.GetMorganFingerprint(mol, 3, useFeatures=True) for mol in mols]

# Full pairwise Tanimoto matrix: row i holds the similarity of molecule i to
# every molecule in the list, including itself.
sim_matrix = [DataStructs.BulkTanimotoSimilarity(fp, fps) for fp in fps]
Plotting a histogram of these values shows the distribution of the Tanimoto coefficients. The entries with a Tanimoto coefficient of 1 appear to be each molecule compared with itself, but there are also other pairs of molecules with high Tanimoto coefficients. Find out (or guess) which pairs they are.
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Histogram (20 bins) over every entry of the pairwise similarity matrix.
plt.hist(np.ravel(sim_matrix), bins=20)
plt.grid()
plt.show()
The Morgan fingerprint also has various parameters, and changing them also changes the value of the Tanimoto coefficient.
# Repeat the comparison with feature-based Morgan fingerprints of smaller
# radius (2, then 1) and plot one histogram per radius.
for _radius in (2, 1):
    fps = [
        AllChem.GetMorganFingerprint(m, _radius, useFeatures=True)
        for m in mols
    ]
    sim_matrix = [
        DataStructs.BulkTanimotoSimilarity(query, fps) for query in fps
    ]
    plt.hist(np.ravel(sim_matrix), bins=20)
    plt.grid()
    plt.show()
# Radius-3 Morgan fingerprints as fixed-length bit vectors, first with 1024
# bits and then with 2048 bits; one histogram per bit-vector length.
for _n_bits in (1024, 2048):
    fps = [
        AllChem.GetMorganFingerprintAsBitVect(m, 3, _n_bits) for m in mols
    ]
    sim_matrix = [
        DataStructs.BulkTanimotoSimilarity(query, fps) for query in fps
    ]
    plt.hist(np.ravel(sim_matrix), bins=20)
    plt.grid()
    plt.show()
Only two other kinds of fingerprint are tried here, because calculating every variant would be too much work. One important point is that the results below contain 10 or more entries with a Tanimoto coefficient of 1.0. Ten of those entries are each molecule compared with itself, so any beyond that come from pairs of different molecules. In other words, be aware that the Tanimoto coefficient can be 1.0 even when the two molecules are not the same.
# Same pairwise comparison with two other fingerprint types: MACCS keys
# first, then the RDKit fingerprint. One histogram is drawn for each.
for _make_fp in (AllChem.GetMACCSKeysFingerprint, Chem.RDKFingerprint):
    fps = [_make_fp(m) for m in mols]
    sim_matrix = [
        DataStructs.BulkTanimotoSimilarity(query, fps) for query in fps
    ]
    plt.hist(np.ravel(sim_matrix), bins=20)
    plt.grid()
    plt.show()
from rdkit.Chem import rdFMCS

# Atom-based Tanimoto coefficient derived from the maximum common
# substructure (MCS) of each pair, using FindMCS with its default settings:
#     |MCS atoms| / (|atoms in A| + |atoms in B| - |MCS atoms|)
# Every ordered pair is evaluated, including each molecule with itself.
matrix = []
for mol1 in mols:
    a1 = len(mol1.GetAtoms())  # invariant for the inner loop
    for mol2 in mols:
        mcs = rdFMCS.FindMCS([mol1, mol2])
        a2 = len(mol2.GetAtoms())
        matrix.append(mcs.numAtoms / (a1 + a2 - mcs.numAtoms))

plt.hist(np.array(matrix).flatten(), bins=20)
plt.grid()
plt.show()
from rdkit.Chem import rdFMCS

# Atom-based MCS Tanimoto coefficient as
#     |MCS atoms| / (|atoms in A| + |atoms in B| - |MCS atoms|)
# but with atomCompare=CompareAny passed to FindMCS, so the MCS search is
# not restricted by the default atom-comparison rule.
# Every ordered pair is evaluated, including each molecule with itself.
matrix = []
for mol1 in mols:
    a1 = len(mol1.GetAtoms())  # invariant for the inner loop
    for mol2 in mols:
        mcs = rdFMCS.FindMCS(
            [mol1, mol2],
            atomCompare=rdFMCS.AtomCompare.CompareAny,
        )
        a2 = len(mol2.GetAtoms())
        matrix.append(mcs.numAtoms / (a1 + a2 - mcs.numAtoms))

plt.hist(np.array(matrix).flatten(), bins=20)
plt.grid()
plt.show()
from rdkit.Chem import rdFMCS

# Bond-based Tanimoto coefficient derived from the maximum common
# substructure (MCS) of each pair, using FindMCS with its default settings:
#     |MCS bonds| / (|bonds in A| + |bonds in B| - |MCS bonds|)
# Every ordered pair is evaluated, including each molecule with itself.
matrix = []
for mol1 in mols:
    a1 = len(mol1.GetBonds())  # bond count; invariant for the inner loop
    for mol2 in mols:
        mcs = rdFMCS.FindMCS([mol1, mol2])
        a2 = len(mol2.GetBonds())
        matrix.append(mcs.numBonds / (a1 + a2 - mcs.numBonds))

plt.hist(np.array(matrix).flatten(), bins=20)
plt.grid()
plt.show()
from rdkit.Chem import rdFMCS

# Bond-based MCS Tanimoto coefficient as
#     |MCS bonds| / (|bonds in A| + |bonds in B| - |MCS bonds|)
# but with bondCompare=CompareOrderExact passed to FindMCS instead of the
# default bond-comparison rule.
# Every ordered pair is evaluated, including each molecule with itself.
matrix = []
for mol1 in mols:
    a1 = len(mol1.GetBonds())  # bond count; invariant for the inner loop
    for mol2 in mols:
        mcs = rdFMCS.FindMCS(
            [mol1, mol2],
            bondCompare=rdFMCS.BondCompare.CompareOrderExact,
        )
        a2 = len(mol2.GetBonds())
        matrix.append(mcs.numBonds / (a1 + a2 - mcs.numBonds))

plt.hist(np.array(matrix).flatten(), bins=20)
plt.grid()
plt.show()
Recommended Posts