"""Tools for generating chemical features."""
from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
from functools import cache, wraps
from carabiner.cast import cast
from carabiner.decorators import return_none_on_error, vectorize
from descriptastorus.descriptors import MakeGenerator
from pandas import DataFrame, Series
import numpy as np
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
from rdkit.Chem import Mol
try:
from rdkit.Chem.AllChem import FingeprintGenerator64 as FingerprintGenerator64, GetMorganGenerator
except ImportError: # typo in some rdkit versions
from rdkit.Chem.rdFingerprintGenerator import FingerprintGenerator64, GetMorganGenerator
from .cleaning import clean_smiles
from .converting import _x2mol, _mol2x, _smiles2mol, _convert_input_to_smiles
def _feature_matrix(f: Callable[[Any], DataFrame]) -> Callable[[Any], Union[DataFrame, Tuple[np.ndarray, np.ndarray]]]:
    """Decorate a feature calculator so that it accepts an optional column `prefix`.

    The returned wrapper takes `prefix` as its first parameter and forwards every
    other argument to `f` unchanged. Because `prefix` occupies the first positional
    slot, the inputs of the wrapped calculator should be passed by keyword.

    Parameters
    ----------
    f : Callable
        Feature calculator returning either a DataFrame or any other object
        (e.g. a tuple of arrays).

    Returns
    -------
    Callable
        Wrapper around `f`. When `prefix` is not `None` and `f` returned a
        DataFrame, every feature column is renamed to `"{prefix}_{column}"`.
        Metadata columns keep their names. Non-DataFrame results are returned
        untouched.

    """
    # Metadata columns produced in this module are named "meta_...". The legacy
    # "_meta" spelling is still honoured so previously excluded names stay excluded.
    meta_prefixes = ("meta_", "_meta")

    @wraps(f)
    def _f(prefix: Optional[str] = None,
           *args, **kwargs) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]:
        feature_matrix = f(*args, **kwargs)
        if prefix is not None and isinstance(feature_matrix, DataFrame):
            new_cols = {
                col: f"{prefix}_{col}"
                for col in feature_matrix.columns
                # str() keeps non-string column labels from raising here.
                if not str(col).startswith(meta_prefixes)
            }
            feature_matrix = feature_matrix.rename(columns=new_cols)
        return feature_matrix

    return _f
def _get_descriptastorus_features(
    smiles: Iterable[str],
    generator: str = "RDKit2DHistogramNormalized"
) -> Union[DataFrame, Tuple[np.ndarray, List[str]]]:
    """Run a descriptastorus generator over a collection of SMILES.

    Parameters
    ----------
    smiles : Iterable of str
        SMILES strings; they are passed through `clean_smiles` before use.
    generator : str, optional
        Name of the descriptastorus generator to build.
        Default: `"RDKit2DHistogramNormalized"`.

    Returns
    -------
    tuple
        The rows returned by the generator's `processMols`, stacked along axis 0,
        and the list of column names reported by the generator's `GetColumns`.

    """
    descriptor_generator = MakeGenerator((generator, ))
    cleaned_smiles = cast(clean_smiles(smiles), to=list)
    molecules = cast(_smiles2mol(cleaned_smiles), to=list)
    rows = descriptor_generator.processMols(molecules, cleaned_smiles)
    stacked = np.stack(rows, axis=0)
    column_names = [name for name, _ in descriptor_generator.GetColumns()]
    return stacked, column_names
[docs]
@_feature_matrix
@_convert_input_to_smiles
def calculate_2d_features(
    strings: Union[Iterable[str], str],
    normalized: bool = True,
    histogram_normalized: bool = True,
    return_dataframe: bool = False,
    *args, **kwargs
) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]:
    """Calculate 2d features from string representation.

    Parameters
    ----------
    strings : str
        Input string representation(s).
    input_representation : str
        Representation type (not read here; presumably consumed by the
        `_convert_input_to_smiles` decorator).
    normalized : bool, optional
        Whether to return normalized features. Default: `True`.
    histogram_normalized : bool, optional
        Whether to return histogram normalized features (faster). Only consulted
        when `normalized` is true. Default: `True`.
    return_dataframe : bool, optional
        Whether to return a Pandas DataFrame instead of a numpy Array. Default: `False`.

    Returns
    -------
    DataFrame, Tuple of numpy Arrays
        If `return_dataframe = True`, a DataFrame indexed by the input strings with
        named feature columns, a `"meta_feature_type"` column, and the final column
        called `"meta_feature_valid"` being the validity indicator.
        Otherwise returns a tuple of Arrays with the first being the matrix of
        features and the second being the vector of validity indicators.

    Examples
    --------
    >>> features, validity = calculate_2d_features(strings='CCC')
    >>> features[:,:3]
    array([[4.22879602e-01, 1.30009101e-04, 2.00014001e-05]])
    >>> validity
    array([1.])
    >>> features, validity = calculate_2d_features(strings=['CCC', 'CCCO'])
    >>> features[:,:3]
    array([[4.22879602e-01, 1.30009101e-04, 2.00014001e-05],
           [7.38891722e-01, 6.00042003e-04, 5.00035002e-05]])
    >>> validity
    array([1., 1.])
    >>> calculate_2d_features(strings=['CCC', 'CCCO'], return_dataframe=True).meta_feature_valid
    CCC     True
    CCCO    True
    Name: meta_feature_valid, dtype: bool
    >>> ## Unusal valence
    >>> s = "O=S(=O)(OCC1OC(OC2(COS(=O)(=O)O[AlH3](O)O)OC(COS(=O)(=O)O[AlH3](O)O)C(OS(=O)(=O)O[AlH3](O)O)C2OS(=O)(=O)O[AlH3](O)O)C(OS(=O)(=O)O[AlH3](O)O)C(OS(=O)(=O)O[AlH3](O)O)C1OS(=O)(=O)O[AlH3](O)O)O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O"
    >>> calculate_2d_features(strings=s)[0].shape
    (1, 200)
    >>> s = 'CCc1c(C(=O)N2CC(c3nnc4c3CCC4)C2)nc(C)c1C(=O)OC'
    >>> calculate_2d_features(strings=s)[1]
    array([1.])

    """
    # Pick the descriptastorus generator matching the requested normalization.
    if not normalized:
        generator_name = "RDKit2D"
    elif histogram_normalized:
        generator_name = "RDKit2DHistogramNormalized"
    else:
        generator_name = "RDKit2DNormalized"

    strings = cast(strings, to=list)
    matrix, column_names = _get_descriptastorus_features(
        strings,
        generator=generator_name,
    )

    if not return_dataframe:
        # The first column is the generator's "calculated" flag; the rest are features.
        return matrix[:,1:], matrix[:,0]

    frame = DataFrame(matrix, index=strings, columns=column_names)
    calculated_column = f"{generator_name}_calculated"
    frame["meta_feature_type"] = generator_name
    frame["meta_feature_valid"] = frame[calculated_column] == 1.
    return frame.drop(columns=[calculated_column])
[docs]
@cache
def smiles_to_3d(
    smiles: str,
    seed: int = 42
) -> dict:
    """Embed a SMILES string in 3D and compute its RDKit 3D descriptors.

    Results are memoized per `(smiles, seed)` by `functools.cache`. The cached
    dictionary object itself is handed back on repeated calls, so callers should
    treat it as read-only.

    Parameters
    ----------
    smiles : str
        SMILES string; it is passed through `clean_smiles` before conversion.
    seed : int, optional
        Random seed given to the ETKDGv3 embedding parameters. Default: `42`.

    Returns
    -------
    dict
        The descriptors from `CalcMolDescriptors3D` plus `"meta_feature_valid": True`
        on success. If the molecule could not be built, or hydrogen addition,
        embedding or descriptor calculation raised `ValueError` or
        `AtomValenceException`, only `{"meta_feature_valid": False}` is returned.

    """
    from rdkit import Chem
    from rdkit.Chem import AllChem
    from rdkit.Chem.rdchem import AtomValenceException
    from rdkit.Chem.Descriptors3D import CalcMolDescriptors3D

    mol = _smiles2mol(clean_smiles(smiles))
    if mol is None:
        return {"meta_feature_valid": False}

    params = AllChem.ETKDGv3()
    params.randomSeed = seed
    try:
        # AddHs returns a new molecule rather than modifying its argument, so the
        # result must be kept for the hydrogens to take part in the embedding.
        mol = Chem.AddHs(mol)
        AllChem.EmbedMolecule(mol, params)
        desc = CalcMolDescriptors3D(mol)
    except (ValueError, AtomValenceException):
        return {"meta_feature_valid": False}

    desc["meta_feature_valid"] = True
    return desc
[docs]
@_feature_matrix
@_convert_input_to_smiles
def calculate_3d_features(
    strings: Union[Iterable[str], str],
    seed: int = 42,
    return_dataframe: bool = False,
    *args, **kwargs
) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]:
    """Calculate 3d features from string representation.

    Parameters
    ----------
    strings : str
        Input string representation(s).
    input_representation : str
        Representation type (not read here; presumably consumed by the
        `_convert_input_to_smiles` decorator).
    seed : int
        Seed for reproducible randomness; forwarded to `smiles_to_3d`.
    return_dataframe : bool, optional
        Whether to return a Pandas DataFrame instead of a numpy Array. Default: `False`.

    Returns
    -------
    DataFrame, Tuple of numpy Arrays
        If `return_dataframe = True`, a DataFrame indexed by the input strings with
        named feature columns, a `"meta_feature_valid"` validity column and a
        `"meta_feature_type"` column.
        Otherwise returns a tuple of Arrays with the first being the float matrix of
        features and the second being the boolean vector of validity indicators.
        Descriptors missing for a molecule are filled with `0.`.

    Examples
    --------
    >>> features, validity = calculate_3d_features(strings='CCC')
    >>> features.shape
    (1, 11)
    >>> int(sum(validity))
    1
    >>> features, validity = calculate_3d_features(strings=['CCC', 'CCCO'])
    >>> features.shape
    (2, 11)
    >>> int(sum(validity))
    2
    >>> calculate_3d_features(strings=['CCC', 'CCCO'], return_dataframe=True).meta_feature_valid
    CCC     True
    CCCO    True
    Name: meta_feature_valid, dtype: bool
    >>> ## Unusal valence
    >>> s = "O=S(=O)(OCC1OC(OC2(COS(=O)(=O)O[AlH3](O)O)OC(COS(=O)(=O)O[AlH3](O)O)C(OS(=O)(=O)O[AlH3](O)O)C2OS(=O)(=O)O[AlH3](O)O)C(OS(=O)(=O)O[AlH3](O)O)C(OS(=O)(=O)O[AlH3](O)O)C1OS(=O)(=O)O[AlH3](O)O)O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O"
    >>> calculate_3d_features(strings=s)[0].shape
    (1, 0)
    >>> s = 'CCc1c(C(=O)N2CC(c3nnc4c3CCC4)C2)nc(C)c1C(=O)OC'
    >>> int(sum(calculate_3d_features(strings=s)[1]))
    1

    """
    strings = cast(strings, to=list)
    # Forward the seed so the documented parameter actually controls the embedding.
    descriptors = [smiles_to_3d(s, seed=seed) for s in strings]
    # Rows of failed molecules only carry the validity flag; their features become 0.
    feature_matrix = DataFrame(descriptors).fillna(0.)

    if return_dataframe:
        feature_matrix.index = strings
        return feature_matrix.assign(meta_feature_type="RDKit3D")

    # Extract the two parts separately: converting the mixed bool/float frame in one
    # go would yield object-dtype arrays.
    validity = feature_matrix["meta_feature_valid"].to_numpy(dtype=bool)
    features = (
        feature_matrix
        .drop(columns=["meta_feature_valid"])
        .to_numpy(dtype=float)
    )
    return features, validity
def _fast_fingerprint(
    generator: "FingerprintGenerator64",
    mol: "Mol",
    to_np: bool = True
) -> Optional[Union[str, np.ndarray]]:
    """Compute a bit fingerprint for one molecule, returning `None` on failure.

    Parameters
    ----------
    generator : FingerprintGenerator64
        Object whose `GetFingerprint(mol)` result provides `ToBitString()`.
    mol : Mol
        Molecule to fingerprint.
    to_np : bool, optional
        If true, return the bits as an unsigned 8-bit array of zeros and ones;
        otherwise return the bit string itself. Default: `True`.

    Returns
    -------
    numpy Array, str or None
        The fingerprint, or `None` if computing it raised an exception.

    """
    try:
        fp_string = generator.GetFingerprint(mol).ToBitString()
    except Exception:
        # Best-effort by design (e.g. `mol` is None for an unparsable input), but
        # KeyboardInterrupt / SystemExit must still propagate.
        return None
    if to_np:
        # Reinterpret the ASCII '0'/'1' bytes and shift them to numeric 0/1.
        return np.frombuffer(fp_string.encode(), 'u1') - ord('0')
    return fp_string
[docs]
@_feature_matrix
@_convert_input_to_smiles
def calculate_fingerprints(
    strings: Union[Iterable[str], str],
    fp_type: str = 'morgan',
    radius: int = 2,
    chiral: bool = True,
    on_bits: bool = True,
    return_dataframe: bool = False,
    *args, **kwargs
) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]:
    """Calculate the binary fingerprint of string representation(s).

    Only Morgan fingerprints are allowed.

    Parameters
    ----------
    strings : str
        Input string representation(s).
    input_representation : str
        Representation type (not read here; presumably consumed by the
        `_convert_input_to_smiles` decorator).
    fp_type : str, optional
        Which fingerprint type to calculate. Default: `'morgan'`.
    radius : int, optional
        Atom radius for fingerprints. Default: `2`.
    chiral : bool, optional
        Whether to take chirality into account. Default: `True`.
    on_bits : bool, optional
        Whether to return the non-zero indices (joined with `';'` into one string
        per molecule) instead of the full binary vector. Default: `True`.
    return_dataframe : bool, optional
        Whether to return a Pandas DataFrame instead of a numpy Array. Default: `False`.

    Returns
    -------
    DataFrame, Tuple of numpy Arrays
        If `return_dataframe = True`, a DataFrame indexed by the input strings with
        feature columns named `fp_0`, `fp_1`, ..., a `"meta_feature_type"` column and
        the final column called `"meta_feature_valid"` being the validity indicator.
        Otherwise returns a tuple with the first item being the matrix of
        features and the second being the list of validity indicators.
        A molecule is valid when its fingerprint could be computed and has at
        least one bit set; in the binary-vector mode invalid molecules get an
        all-zero row.

    Raises
    ------
    NotImplementedError
        If `fp_type` is not `'morgan'`.

    Examples
    --------
    >>> bits, validity = calculate_fingerprints(strings='CCC')
    >>> bits.tolist()
    [['80;294;1057;1344']]
    >>> sum(validity) # doctest: +NORMALIZE_WHITESPACE
    1
    >>> bits, validity = calculate_fingerprints(strings=['CCC', 'CCCO'])
    >>> bits.tolist()
    [['80;294;1057;1344'], ['80;222;294;473;794;807;1057;1277']]
    >>> sum(validity) # doctest: +NORMALIZE_WHITESPACE
    2
    >>> np.sum(calculate_fingerprints(strings=['CCC', 'CCCO'], on_bits=False)[0], axis=-1)
    array([4, 8])
    >>> calculate_fingerprints(strings=['CCC', 'CCCO'], return_dataframe=True).meta_feature_valid
    CCC     True
    CCCO    True
    Name: meta_feature_valid, dtype: bool

    """
    if fp_type.casefold() == 'morgan':
        generator_class = GetMorganGenerator
    else:
        raise NotImplementedError(f"Fingerprint type {fp_type} not supported!")

    fp_generator = generator_class(
        radius=radius,
        includeChirality=chiral,
    )
    try:
        fp_size = fp_generator.GetOptions().fpSize
    except AttributeError:  # 'FingerprintGenerator64' object has no attribute 'GetOptions' in older rdkit versions (e.g.2022.9.5)
        test_smiles = "CCCC"
        fp_size = _fast_fingerprint(
            fp_generator,
            _smiles2mol(test_smiles),
            to_np=True,
        ).size

    strings = cast(strings, to=list)
    bit_vectors = [
        _fast_fingerprint(fp_generator, _smiles2mol(s), to_np=True)
        for s in strings
    ]
    # Validity is derived from the raw result so that both output modes agree:
    # a failed fingerprint (None) or one without any set bit is invalid.
    validity = [
        bits is not None and bool(bits.any())
        for bits in bit_vectors
    ]

    if on_bits:
        fingerprints = [
            [';'.join(map(str, np.flatnonzero(bits).tolist())) if bits is not None else '']
            for bits in bit_vectors
        ]
    else:
        # Integer zeros for failures keep the stacked matrix integer-typed.
        fingerprints = [
            bits.astype(int) if bits is not None else np.zeros((fp_size, ), dtype=int)
            for bits in bit_vectors
        ]
    feature_matrix = np.stack(fingerprints, axis=0)

    if not return_dataframe:
        return feature_matrix, validity

    # The stacked matrix is always 2-d: one `fp_0` column of bit-index strings when
    # `on_bits` is true, otherwise one column per fingerprint bit.
    return DataFrame(
        feature_matrix,
        columns=[f"fp_{i}" for i in range(feature_matrix.shape[1])],
        index=strings,
    ).assign(
        meta_feature_type=fp_type.casefold(),
        meta_feature_valid=validity,
    )
# Registry mapping feature-type names to their calculators. `calculate_feature`
# iterates it in insertion order when asked for "all", so the order here fixes
# the order in which feature blocks are concatenated.
_FEATURE_CALCULATORS = {
    "2d": calculate_2d_features,
    "3d": calculate_3d_features,
    "fp": calculate_fingerprints,
}
[docs]
def calculate_feature(
    feature_type: Union[str, Iterable[str]] = "all",
    return_dataframe: bool = False,
    *args, **kwargs) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]:
    """Calculate the binary fingerprint or descriptor vector of string representation(s).

    Parameters
    ----------
    feature_type : str or Iterable of str, optional
        Key(s) of `_FEATURE_CALCULATORS` to compute, or `"all"` for every
        registered calculator. Default: `"all"`.
    return_dataframe : bool, optional
        Whether to return a Pandas DataFrame instead of numpy Arrays. Default: `False`.
    *args, **kwargs
        Forwarded unchanged to every selected calculator.

    Returns
    -------
    DataFrame, Tuple of numpy Arrays
        If `return_dataframe = True`, the feature columns of all selected
        calculators side by side, followed by one `meta_feature_valid_{type}`
        column per calculator. Otherwise a tuple of the feature matrices
        concatenated along axis 1 and the validity vectors stacked along axis 0
        and cast to int.

    Examples
    --------
    >>> calculate_feature("2d", strings=['CCC', 'CCO'])[0].shape
    (2, 200)
    >>> calculate_feature("3d", strings=['CCC', 'CCO'])[0].shape
    (2, 11)
    >>> calculate_feature("fp", on_bits=False, strings=['CCC', 'CCO'])[0].shape
    (2, 2048)
    >>> calculate_feature("all", on_bits=False, strings=['CCC', 'CCO'])[0].shape
    (2, 2259)

    """
    requested = list(_FEATURE_CALCULATORS) if feature_type == "all" else feature_type
    requested = cast(requested, to=list)

    # Resolve every name first so an unknown type fails before any computation.
    calculators = {name: _FEATURE_CALCULATORS[name] for name in requested}
    results = {}
    for name, calculator in calculators.items():
        results[name] = calculator(*args, return_dataframe=return_dataframe, **kwargs)

    if not return_dataframe:
        features = np.concatenate([result[0] for result in results.values()], axis=1)
        validity = np.stack([result[1] for result in results.values()], axis=0).astype(int)
        return features, validity

    import pandas as pd

    feature_frames = [
        frame.drop(columns=["meta_feature_valid", "meta_feature_type"])
        for frame in results.values()
    ]
    validity_frames = [
        frame[["meta_feature_valid"]].rename(
            columns={"meta_feature_valid": f"meta_feature_valid_{name}"}
        )
        for name, frame in results.items()
    ]
    return pd.concat(feature_frames + validity_frames, axis=1)