2D Molecular Graph

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Similarity comparison between molecules whose formulas are specified using
the OpenSMILES format.

Since a SMILES string is intrinsically a graph representation of a molecule,
it can be easily used by the marginalized graph kernel.
"""
import numpy as np
import pandas as pd
from graphdot import Graph
from graphdot.kernel.marginalized import MarginalizedGraphKernel
from graphdot.microkernel import (
    TensorProduct,
    SquareExponential,
    KroneckerDelta
)

# Build the sample molecules as SMILES strings.
smiles_list = [
    'CC',  # ethane
    'CCO',  # ethanol
    'CCN',  # ethylamine
    'C=C',  # ethene
    'CC=C',  # propene
    'CC=CC',  # 2-butene
]

# Convert each SMILES string to a molecular graph.
# Nodes (atoms) have the 'aromatic', 'charge', 'element' and 'hcount'
# attributes; edges (bonds) have the 'order' attribute.
graphs = [Graph.from_smiles(smi) for smi in smiles_list]

# Define the node and edge base kernels. Each keyword names the graph
# attribute that the corresponding microkernel compares; the per-attribute
# results are combined as a tensor product.
knode = TensorProduct(aromatic=KroneckerDelta(0.8),
                      charge=SquareExponential(1.0),
                      element=KroneckerDelta(0.5),
                      hcount=SquareExponential(1.0))

kedge = TensorProduct(order=KroneckerDelta(0.5))

# Compose the marginalized graph kernel and compute the pairwise similarity
# matrix. `q` is the stopping probability of the underlying random walk.
kernel = MarginalizedGraphKernel(knode, kedge, q=0.05)

R = kernel(graphs)

# Normalize the similarity matrix so that every self-similarity equals one:
#     K[i, j] = R[i, j] / sqrt(R[i, i] * R[j, j])
# Broadcasting scales the rows and columns directly, avoiding the two dense
# diagonal-matrix products while multiplying in the same order.
d = np.diag(R)**-0.5
K = d[:, None] * R * d[None, :]

print(pd.DataFrame(K, columns=smiles_list, index=smiles_list))

Expected output:

             CC       CCO       CCN       C=C      CC=C     CC=CC
CC     1.000000  0.301240  0.320140  0.081422  0.168009  0.184527
CCO    0.301240  1.000000  0.688769  0.242019  0.533106  0.565582
CCN    0.320140  0.688769  1.000000  0.234430  0.473560  0.484795
C=C    0.081422  0.242019  0.234430  1.000000  0.361879  0.246465
CC=C   0.168009  0.533106  0.473560  0.361879  1.000000  0.827114
CC=CC  0.184527  0.565582  0.484795  0.246465  0.827114  1.000000