# Source code for schemist.features

"""Tools for generating chemical features."""

from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
from functools import cache, wraps

from carabiner.cast import cast
from carabiner.decorators import return_none_on_error, vectorize
from descriptastorus.descriptors import MakeGenerator
from pandas import DataFrame, Series
import numpy as np
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
from rdkit.Chem import Mol

try:
    from rdkit.Chem.AllChem import FingeprintGenerator64 as FingerprintGenerator64, GetMorganGenerator
except ImportError: # typo in some rdkit versions
    from rdkit.Chem.rdFingerprintGenerator import FingerprintGenerator64, GetMorganGenerator

from .cleaning import clean_smiles
from .converting import _x2mol, _mol2x, _smiles2mol, _convert_input_to_smiles

def _feature_matrix(f: Callable[[Any], DataFrame]) -> Callable[[Any], Union[DataFrame, Tuple[np.ndarray, np.ndarray]]]:

    """Decorate a feature calculator so it takes an optional column prefix.

    The returned wrapper accepts `prefix` as its FIRST positional parameter
    and forwards every remaining positional and keyword argument to `f`
    unchanged. When `prefix` is given and `f` returns a DataFrame, each
    column whose name does not start with `'_meta'` is renamed to
    `f"{prefix}_{column}"`. Any other return value is passed through as-is.

    Parameters
    ----------
    f : Callable
        Feature calculator to wrap.

    Returns
    -------
    Callable
        Wrapper with signature `(prefix=None, *args, **kwargs)`.

    """

    @wraps(f)
    def _f(prefix: Optional[str] = None,
           *args, **kwargs) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]:

        result = f(*args, **kwargs)

        # Only DataFrame results have column names that can take a prefix.
        if prefix is None or not isinstance(result, DataFrame):
            return result

        # NOTE(review): the calculators in this module name their metadata
        # columns `meta_feature_*` (no leading underscore), so this filter
        # does not exclude them and they are prefixed too - confirm intended.
        renamed = {}
        for name in result.columns:
            if name.startswith('_meta'):
                continue
            renamed[name] = f"{prefix}_{name}"

        return result.rename(columns=renamed)

    return _f


def _get_descriptastorus_features(
    smiles: Iterable[str], 
    generator: str = "RDKit2DHistogramNormalized"
) -> Union[DataFrame, Tuple[np.ndarray, List[str]]]:

    """Run a descriptastorus generator over a collection of SMILES.

    The SMILES are cleaned with `clean_smiles`, converted to molecules with
    `_smiles2mol`, and both lists are handed to the generator's
    `processMols`.

    Parameters
    ----------
    smiles : Iterable[str]
        Input SMILES strings.
    generator : str, optional
        Name of the descriptastorus generator to build with `MakeGenerator`.
        Default: `"RDKit2DHistogramNormalized"`.

    Returns
    -------
    Tuple[np.ndarray, List[str]]
        The per-molecule results stacked along axis 0, and the column names
        reported by the generator's `GetColumns()`. Callers in this module
        treat the first column as the "calculated" (validity) flag.

    """

    descriptor_generator = MakeGenerator((generator, ))
    cleaned_smiles = cast(clean_smiles(smiles), to=list)
    molecules = cast(_smiles2mol(cleaned_smiles), to=list)
    rows = descriptor_generator.processMols(molecules, cleaned_smiles)
    matrix = np.stack(rows, axis=0)
    column_names = [name for name, _ in descriptor_generator.GetColumns()]
    return matrix, column_names



# [docs]
@_feature_matrix
@_convert_input_to_smiles
def calculate_2d_features(
    strings: Union[Iterable[str], str], 
    normalized: bool = True, 
    histogram_normalized: bool = True,
    return_dataframe: bool = False,
    *args, **kwargs
) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]:

    """Calculate 2d descriptor features from string representation(s).

    The descriptastorus generator is chosen from the two flags: `"RDKit2D"`
    when `normalized` is false, otherwise `"RDKit2DHistogramNormalized"` or
    `"RDKit2DNormalized"` depending on `histogram_normalized`.

    Parameters
    ----------
    strings : str
        Input string representation(s).
    input_representation : str
        Representation type. Not a parameter of this function itself;
        presumably consumed by the `_convert_input_to_smiles` decorator.
    normalized : bool, optional
        Whether to return normalized features. Default: `True`.
    histogram_normalized : bool, optional
        Whether to return histogram normalized features (faster). Only
        consulted when `normalized` is true. Default: `True`.
    return_dataframe : bool, optional
        Whether to return a Pandas DataFrame instead of a numpy Array. Default: `False`.

    Returns
    -------
    DataFrame, Tuple of numpy Arrays
        If `return_dataframe = True`, a DataFrame indexed by the input strings 
        with named feature columns, a `"meta_feature_type"` column holding the 
        generator name, and the final column called `"meta_feature_valid"` 
        being the validity indicator.
        Otherwise returns a tuple of Arrays with the first being the matrix of 
        features and the second being the vector of validity indicators.

    Examples
    --------
    >>> features, validity = calculate_2d_features(strings='CCC')
    >>> features[:,:3]
    array([[4.22879602e-01, 1.30009101e-04, 2.00014001e-05]])
    >>> validity
    array([1.])
    >>> features, validity = calculate_2d_features(strings=['CCC', 'CCCO'])
    >>> features[:,:3]
    array([[4.22879602e-01, 1.30009101e-04, 2.00014001e-05],
           [7.38891722e-01, 6.00042003e-04, 5.00035002e-05]])
    >>> validity
    array([1., 1.])
    >>> calculate_2d_features(strings=['CCC', 'CCCO'], return_dataframe=True).meta_feature_valid
    CCC     True
    CCCO    True
    Name: meta_feature_valid, dtype: bool
    >>> ## Unusual valence
    >>> s = "O=S(=O)(OCC1OC(OC2(COS(=O)(=O)O[AlH3](O)O)OC(COS(=O)(=O)O[AlH3](O)O)C(OS(=O)(=O)O[AlH3](O)O)C2OS(=O)(=O)O[AlH3](O)O)C(OS(=O)(=O)O[AlH3](O)O)C(OS(=O)(=O)O[AlH3](O)O)C1OS(=O)(=O)O[AlH3](O)O)O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O"
    >>> calculate_2d_features(strings=s)[0].shape
    (1, 200)
    >>> s = 'CCc1c(C(=O)N2CC(c3nnc4c3CCC4)C2)nc(C)c1C(=O)OC'
    >>> calculate_2d_features(strings=s)[1]
    array([1.])

    """  

    # Pick the descriptastorus generator from the normalization flags.
    if not normalized:
        generator_name = "RDKit2D"
    elif histogram_normalized:
        generator_name = "RDKit2DHistogramNormalized"
    else:
        generator_name = "RDKit2DNormalized"

    strings = cast(strings, to=list)
    values, column_names = _get_descriptastorus_features(
        strings,
        generator=generator_name,
    )

    if not return_dataframe:
        # Column 0 is the validity flag; the rest are the features.
        return values[:,1:], values[:,0]

    table = DataFrame(values, index=strings, columns=column_names)
    # Park the generator's "<name>_calculated" flag under a temporary name so
    # it can be re-added as a boolean in the last column position.
    table = table.rename(
        columns={f"{generator_name}_calculated": "meta_feature_valid0"}
    )
    table = table.assign(
        meta_feature_type=generator_name,
        meta_feature_valid=table['meta_feature_valid0'] == 1.,
    )
    return table.drop(columns=['meta_feature_valid0'])




# [docs]
@cache
def smiles_to_3d(
    smiles: str,
    seed: int = 42
) -> dict:

    """Embed a SMILES string in 3D and calculate RDKit 3D descriptors.

    The SMILES is cleaned and parsed, explicit hydrogens are added, a
    conformer is embedded with ETKDGv3 using `seed`, and
    `CalcMolDescriptors3D` is evaluated on the result.

    Results are memoised per `(smiles, seed)` by `functools.cache`, so the
    returned dictionary is shared between calls and should not be mutated
    by callers.

    Parameters
    ----------
    smiles : str
        SMILES string of a single molecule.
    seed : int, optional
        Random seed for conformer embedding. Default: `42`.

    Returns
    -------
    dict
        Descriptor names mapped to values, plus `"meta_feature_valid": True`.
        If the SMILES cannot be parsed, or embedding / descriptor calculation
        raises `ValueError` or `AtomValenceException`, the dictionary only
        contains `"meta_feature_valid": False`.

    """

    from rdkit import Chem
    from rdkit.Chem import AllChem
    from rdkit.Chem.rdchem import AtomValenceException
    from rdkit.Chem.Descriptors3D import CalcMolDescriptors3D

    mol = _smiles2mol(clean_smiles(smiles))
    if mol is None:
        return {"meta_feature_valid": False}

    params = AllChem.ETKDGv3()
    params.randomSeed = seed
    # AddHs returns a NEW molecule and leaves its argument untouched, so the
    # result has to be kept for the hydrogens to take part in the embedding.
    mol = Chem.AddHs(mol)
    try:
        AllChem.EmbedMolecule(mol, params)
        desc = CalcMolDescriptors3D(mol)
    except (ValueError, AtomValenceException):
        return {"meta_feature_valid": False}

    desc["meta_feature_valid"] = True
    return desc




# [docs]
@_feature_matrix
@_convert_input_to_smiles
def calculate_3d_features(
    strings: Union[Iterable[str], str],
    seed: int = 42,
    return_dataframe: bool = False,
    *args, **kwargs
) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]:

    """Calculate 3d features from string representation.

    Each string is passed to `smiles_to_3d` together with `seed`; the
    resulting descriptor dictionaries are assembled into a table, with
    missing descriptors (failed molecules) filled with `0.`.

    Parameters
    ----------
    strings : str
        Input string representation(s).
    input_representation : str
        Representation type. Not a parameter of this function itself;
        presumably consumed by the `_convert_input_to_smiles` decorator.
    seed : int
        Seed for reproducible randomness, forwarded to `smiles_to_3d`.
    return_dataframe : bool, optional
        Whether to return a Pandas DataFrame instead of a numpy Array. Default: `False`.

    Returns
    -------
    DataFrame, Tuple of numpy Arrays
        If `return_dataframe = True`, a DataFrame indexed by the input strings 
        with named feature columns, a `"meta_feature_valid"` validity 
        indicator column, and a final `"meta_feature_type"` column set to 
        `"RDKit3D"`.
        Otherwise returns a tuple of Arrays with the first being the matrix of 
        features and the second being the vector of validity indicators.

    Examples
    --------
    >>> features, validity = calculate_3d_features(strings='CCC')
    >>> features.shape
    (1, 11)
    >>> sum(validity)
    1
    >>> features, validity = calculate_3d_features(strings=['CCC', 'CCCO'])
    >>> features.shape
    (2, 11)
    >>> sum(validity)
    2
    >>> calculate_3d_features(strings=['CCC', 'CCCO'], return_dataframe=True).meta_feature_valid
    CCC     True
    CCCO    True
    Name: meta_feature_valid, dtype: bool
    >>> ## Unusual valence
    >>> s = "O=S(=O)(OCC1OC(OC2(COS(=O)(=O)O[AlH3](O)O)OC(COS(=O)(=O)O[AlH3](O)O)C(OS(=O)(=O)O[AlH3](O)O)C2OS(=O)(=O)O[AlH3](O)O)C(OS(=O)(=O)O[AlH3](O)O)C(OS(=O)(=O)O[AlH3](O)O)C1OS(=O)(=O)O[AlH3](O)O)O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O.O[AlH3](O)O"
    >>> calculate_3d_features(strings=s)[0].shape
    (1, 0)
    >>> s = 'CCc1c(C(=O)N2CC(c3nnc4c3CCC4)C2)nc(C)c1C(=O)OC'
    >>> sum(calculate_3d_features(strings=s)[1])
    1

    """ 

    strings = cast(strings, to=list)
    # Forward the caller's seed; without it the embedding always used the
    # default seed of `smiles_to_3d`, whatever was requested here.
    descriptors = [
        smiles_to_3d(s, seed=seed) for s in strings
    ]

    # Failed molecules only carry the validity flag, leaving NaN elsewhere.
    feature_matrix = DataFrame(descriptors).fillna(0.)

    if return_dataframe:
        feature_matrix.index = strings
        return feature_matrix.assign(meta_feature_type="RDKit3D")

    # Move the validity flag to column 0 (its position depends on whether
    # the first molecule succeeded), then split it from the features.
    feature_columns = [
        col for col in feature_matrix if col != "meta_feature_valid"
    ]
    values = feature_matrix[["meta_feature_valid"] + feature_columns].values
    return values[:,1:], values[:,0]



def _fast_fingerprint(
    generator: "FingerprintGenerator64", 
    mol: "Mol",
    to_np: bool = True
) -> Optional[Union[str, np.ndarray]]:

    """Compute a bit-vector fingerprint for one molecule.

    Parameters
    ----------
    generator : FingerprintGenerator64
        Fingerprint generator; its `GetFingerprint(mol).ToBitString()` is used.
    mol : Mol
        Molecule to fingerprint.
    to_np : bool, optional
        If `True`, return the bits as a numpy array with one element per 
        character of the bit string; otherwise return the bit string itself. 
        Default: `True`.

    Returns
    -------
    numpy Array, str or None
        The fingerprint, or `None` if the generator raised an error (for 
        example because `mol` is not a usable molecule).

    """

    try:
        fp_string = generator.GetFingerprint(mol).ToBitString()
    except Exception:
        # Best-effort: a failed molecule yields None. Catching Exception
        # rather than everything lets KeyboardInterrupt / SystemExit through.
        return None

    if to_np:
        # Reinterpret the ASCII '0'/'1' bytes as integers without a Python loop.
        return np.frombuffer(fp_string.encode(), 'u1') - ord('0')
    return fp_string
    


# [docs]
@_feature_matrix
@_convert_input_to_smiles
def calculate_fingerprints(
    strings: Union[Iterable[str], str], 
    fp_type: str = 'morgan',
    radius: int = 2,
    chiral: bool = True, 
    on_bits: bool = True,
    return_dataframe: bool = False,
    *args, **kwargs
) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]:
    
    """Calculate the binary fingerprint of string representation(s).

    Only Morgan fingerprints are allowed.

    Parameters
    ----------
    strings : str
        Input string representation(s).
    input_representation : str
        Representation type. Not a parameter of this function itself;
        presumably consumed by the `_convert_input_to_smiles` decorator.
    fp_type : str, optional
        Which fingerprint type to calculate (case-insensitive). Default: `'morgan'`.
    radius : int, optional
        Atom radius for fingerprints. Default: `2`.
    chiral : bool, optional
        Whether to take chirality into account. Default: `True`.
    on_bits : bool, optional
        Whether to return the non-zero indices (as one `';'`-joined string 
        per molecule) instead of the full binary vector. Default: `True`.
    return_dataframe : bool, optional
        Whether to return a Pandas DataFrame instead of a numpy Array. Default: `False`.

    Returns
    -------
    DataFrame, Tuple of numpy Arrays
        If `return_dataframe = True`, a DataFrame with named feature columns, and 
        the final column called `"meta_feature_valid"` being the validity indicator.
        Otherwise returns a tuple with the first being the matrix of 
        features and the second being the list of validity indicators.
        Molecules that could not be fingerprinted are marked invalid; their 
        features are an empty string (`on_bits=True`) or all zeros 
        (`on_bits=False`).

    Raises
    ------
    NotImplementedError
        If `fp_type` is not `'morgan'`.
    
    Examples
    --------
    >>> bits, validity = calculate_fingerprints(strings='CCC')
    >>> bits.tolist()
    [['80;294;1057;1344']]
    >>> sum(validity)  # doctest: +NORMALIZE_WHITESPACE
    1 
    >>> bits, validity = calculate_fingerprints(strings=['CCC', 'CCCO'])
    >>> bits.tolist()
    [['80;294;1057;1344'], ['80;222;294;473;794;807;1057;1277']]
    >>> sum(validity)  # doctest: +NORMALIZE_WHITESPACE
    2
    >>> np.sum(calculate_fingerprints(strings=['CCC', 'CCCO'], on_bits=False)[0], axis=-1)
    array([4, 8])
    >>> calculate_fingerprints(strings=['CCC', 'CCCO'], return_dataframe=True).meta_feature_valid
    CCC     True
    CCCO    True
    Name: meta_feature_valid, dtype: bool

    """
    
    if fp_type.casefold() == 'morgan':
        generator_class = GetMorganGenerator
    else:
        raise NotImplementedError(f"Fingerprint type {fp_type} not supported!")
    
    fp_generator = generator_class(
        radius=radius, 
        includeChirality=chiral,
    )
    strings = cast(strings, to=list)
    mols = (_smiles2mol(s) for s in strings)
    # Always fetch numpy bit arrays (None on failure); both output modes are
    # derived from them without parsing the bit string digit by digit.
    fp_arrays = [
        _fast_fingerprint(fp_generator, mol, to_np=True) 
        for mol in mols
    ]

    if on_bits:

        # One ';'-joined string of on-bit indices per molecule; a failed
        # molecule gets an empty string.
        fingerprints = [
            [';'.join(map(str, np.flatnonzero(fp).tolist()))] 
            if fp is not None 
            else ['']
            for fp in fp_arrays
        ]
        validity = [len(fp[0]) > 0 for fp in fingerprints]
    
    else:

        try:
            fp_size = fp_generator.GetOptions().fpSize
        except AttributeError:  # 'FingerprintGenerator64' object has no attribute 'GetOptions' in older rdkit versions (e.g.2022.9.5)
            test_smiles = "CCCC"
            fp_size = _fast_fingerprint(
                fp_generator, 
                _smiles2mol(test_smiles), 
                to_np=True,
            ).size

        # Validity must come from whether fingerprinting succeeded: the zero
        # vector used as a placeholder would pass any value-based check.
        validity = [fp is not None for fp in fp_arrays]
        fingerprints = [
            fp.astype(int) 
            if fp is not None 
            else np.zeros((fp_size, ), dtype=int)
            for fp in fp_arrays
        ]
        
    feature_matrix = np.stack(fingerprints, axis=0)

    if return_dataframe:
        # NOTE(review): stacking the single-element lists built for on_bits
        # gives a 2-d (n, 1) array (see the doctest output above), so the
        # 1-d branch looks unreachable and the on-bits column is named
        # `fp_0` rather than `fp_bits` - confirm which name is intended.
        if feature_matrix.ndim == 1:  # on_bits only
            feature_matrix = DataFrame(
                feature_matrix, 
                columns=['fp_bits'],
                index=strings,
            )
        else:
            feature_matrix = DataFrame(
                feature_matrix,
                columns=[f"fp_{i}" for i in range(feature_matrix.shape[1])],
                index=strings,
            )
        return feature_matrix.assign(
            meta_feature_type=fp_type.casefold(), 
            meta_feature_valid=validity,
        )
    else:
        return feature_matrix, validity



# Registry mapping feature-type keys to their calculator functions.
# `calculate_feature` looks calculators up here and expands
# `feature_type="all"` to every key, in the order listed.
_FEATURE_CALCULATORS = {
    "2d": calculate_2d_features, 
    "3d": calculate_3d_features, 
    "fp": calculate_fingerprints,
}


# [docs]
def calculate_feature(
    feature_type: Union[str, Iterable[str]] = "all",
    return_dataframe: bool = False,
    *args, **kwargs) -> Union[DataFrame, Tuple[np.ndarray, np.ndarray]]:
    
    """Calculate the binary fingerprint or descriptor vector of string representation(s).

    Dispatches to one or more of the calculators registered in 
    `_FEATURE_CALCULATORS` and joins their outputs column-wise.

    Parameters
    ----------
    feature_type : str or iterable of str, optional
        One or more of the registered keys (`"2d"`, `"3d"`, `"fp"`), or 
        `"all"` for every registered calculator. Default: `"all"`.
    return_dataframe : bool, optional
        Whether to return a Pandas DataFrame instead of numpy Arrays. Default: `False`.
    *args, **kwargs
        Forwarded unchanged to every selected calculator. The calculators 
        take `prefix` as their first positional parameter, so pass `strings` 
        by keyword as in the examples.

    Returns
    -------
    DataFrame, Tuple of numpy Arrays
        If `return_dataframe = True`, the feature columns of all selected 
        calculators side by side, followed by one validity column per 
        calculator named `"meta_feature_valid_<type>"`.
        Otherwise a tuple of the feature matrices concatenated along axis 1, 
        and the validity vectors stacked along axis 0 (one row per feature 
        type) and cast to `int`.

    Raises
    ------
    KeyError
        If any requested feature type is not registered.
    
    Examples
    --------
    >>> calculate_feature("2d", strings=['CCC', 'CCO'])[0].shape
    (2, 200)
    >>> calculate_feature("3d", strings=['CCC', 'CCO'])[0].shape
    (2, 11)
    >>> calculate_feature("fp", on_bits=False, strings=['CCC', 'CCO'])[0].shape
    (2, 2048)
    >>> calculate_feature("all", on_bits=False, strings=['CCC', 'CCO'])[0].shape
    (2, 2259)

    """
    # Only a plain string can mean "all"; avoid comparing containers to it.
    if isinstance(feature_type, str) and feature_type == "all":
        feature_type = list(_FEATURE_CALCULATORS)
    feature_type = cast(feature_type, to=list)

    # Fail before computing anything, and say which types are supported.
    unknown = [_type for _type in feature_type if _type not in _FEATURE_CALCULATORS]
    if unknown:
        raise KeyError(
            f"Unsupported feature type(s) {unknown}; "
            f"choose from {list(_FEATURE_CALCULATORS)} or 'all'."
        )

    featurizers = {_type: _FEATURE_CALCULATORS[_type] for _type in feature_type}
    dfs = {
        _type: f(*args, return_dataframe=return_dataframe, **kwargs)
        for _type, f in featurizers.items()
    }

    if return_dataframe:
        import pandas as pd
        # Feature columns first, then one validity column per feature type.
        feature_frames = [
            _df.drop(columns=["meta_feature_valid", "meta_feature_type"])
            for _df in dfs.values()
        ]
        validity_frames = [
            _df[["meta_feature_valid"]].rename(columns={"meta_feature_valid": f"meta_feature_valid_{_type}"})
            for _type, _df in dfs.items()
        ]
        return pd.concat(feature_frames + validity_frames, axis=1)

    features = np.concatenate([_df[0] for _df in dfs.values()], axis=1)
    validity = np.stack([_df[1] for _df in dfs.values()], axis=0).astype(int)
    return features, validity