2D Molecular Graph
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | #!/usr/bin/env python
# -*- coding: utf-8 -*-
""" Similarity comparison between molecules whose formulas are specified using
the OpenSMILES format.
Since a SMILES string is intrinsically a graph representation of a molecule,
it can be easily used by the marginalized graph kernel.
"""
import numpy as np
import pandas as pd
from graphdot import Graph
from graphdot.kernel.marginalized import MarginalizedGraphKernel
from graphdot.microkernel import (
TensorProduct,
SquareExponential,
KroneckerDelta
)
# Sample molecules, each given as an OpenSMILES string.
smiles_list = [
    'CC',  # ethane
    'CCO',  # ethanol
    'CCN',  # ethylamine
    'C=C',  # ethene
    'CC=C',  # propene
    'CC=CC',  # 2-butene
]

# Convert each SMILES string into a molecular graph.
# nodes (atoms) have the 'aromatic', 'charge', 'element', 'hcount' attributes
# edges (bonds) have the 'order' attribute
graphs = [Graph.from_smiles(smi) for smi in smiles_list]

# Define the node and edge kernelets: one base kernel per attribute, combined
# with TensorProduct. The keyword names must match the graph attribute names
# listed above.
knode = TensorProduct(
    aromatic=KroneckerDelta(0.8),
    charge=SquareExponential(1.0),
    element=KroneckerDelta(0.5),
    hcount=SquareExponential(1.0),
)
kedge = TensorProduct(order=KroneckerDelta(0.5))

# Compose the marginalized graph kernel and compute the pairwise similarity
# matrix over all sample molecules.
kernel = MarginalizedGraphKernel(knode, kedge, q=0.05)
R = kernel(graphs)

# Normalize so that every molecule has unit self-similarity:
#     K[i, j] = R[i, j] / sqrt(R[i, i] * R[j, j])
# Broadcasting scales row i and column j directly, which avoids building two
# dense diagonal matrices and multiplying through them.
# NOTE(review): assumes R is a square 2-D NumPy array -- confirm against the
# return type of MarginalizedGraphKernel.__call__.
d = np.diag(R)**-0.5
K = d[:, None] * R * d[None, :]
print(pd.DataFrame(K, columns=smiles_list, index=smiles_list))
|
Expected output:
CC CCO CCN C=C CC=C CC=CC
CC 1.000000 0.301240 0.320140 0.081422 0.168009 0.184527
CCO 0.301240 1.000000 0.688769 0.242019 0.533106 0.565582
CCN 0.320140 0.688769 1.000000 0.234430 0.473560 0.484795
C=C 0.081422 0.242019 0.234430 1.000000 0.361879 0.246465
CC=C 0.168009 0.533106 0.473560 0.361879 1.000000 0.827114
CC=CC 0.184527 0.565582 0.484795 0.246465 0.827114 1.000000