Source code for schemist.generating

"""Tools for enumerating compounds. Currently only works with peptides."""

from typing import Callable, Iterable, Optional, Tuple, Union

from functools import partial
from itertools import chain, islice, product, repeat
from math import ceil, expm1, floor
from random import choice, choices, random, seed

from carabiner import print_err
from carabiner.decorators import vectorize, return_none_on_error
from carabiner.random import sample_iter
from rdkit.Chem import Mol, rdChemReactions
import numpy as np

from .converting import (_x2mol, _mol2x,
                         _convert_input_to_smiles)

AA = tuple('GALVITSMCPFYWHKRDENQ')
dAA = tuple(aa.casefold() for aa in AA)

REACTIONS = {'N_to_C_cyclization': '([N;H1:5][C:1][C:2](=[O:6])[O:3].[N;H2:4][C:7][C:8](=[O:9])[N;H1:10])>>[N;H1:5][C:1][C:2](=[O:6])[N;H1:4][C:7][C:8](=[O:9])[N;H1:10].[O;H2:3]',
             'cysteine_to_chloroacetyl_cyclization': '([N;H1:5][C:2](=[O:6])[C:1][Cl:3].[S;H1:4][C;H2:7][C:8])>>[N;H1:5][C:2](=[O:6])[C:1][S:4][C;H2:7][C:8]',
             'cysteine_to_N_cyclization':'([N;H1:5][C:2](=[O:6])[C:1][N;H2:3].[S;H1:4][C;H2:7][C:8])>>[N;H1:5][C:2](=[O:6])[C:1][S:4][C;H2:7][C:8].[N;H3:3]'}

def _get_alphabet(alphabet: Optional[Iterable[str]] = None,
                  d_aa_only: bool = False,
                  include_d_aa: bool = False) -> Tuple[str]:

    alphabet = alphabet or AA
    alphabet_lower = tuple(set(aa.casefold() for aa in AA))

    if d_aa_only:
        alphabet = alphabet_lower
    elif include_d_aa:
        alphabet = tuple(set(chain(alphabet, alphabet_lower)))

    return alphabet

    


[docs]
def all_peptides_of_one_length(length: int,
                               alphabet: Optional[Iterable[str]] = None,
                               d_aa_only: bool = False,
                               include_d_aa: bool = False) -> Iterable[str]:
    
    """
    
    """
    
    alphabet = _get_alphabet(alphabet=alphabet,
                             d_aa_only=d_aa_only,
                             include_d_aa=include_d_aa)

    return (''.join(peptide)
            for peptide in product(alphabet, repeat=length))




[docs]
def all_peptides_in_length_range(max_length: int,
                                 min_length: int = 1,
                                 by: int = 1,
                                 alphabet: Optional[Iterable[str]] = None,
                                 d_aa_only: bool = False,
                                 include_d_aa: bool = False,
                                 *args, **kwargs) -> Iterable[str]:

    """
    
    """
    
    length_range = range(*sorted([min_length, max_length + 1]), by)
    peptide_maker = partial(all_peptides_of_one_length, 
                            alphabet=alphabet,
                            d_aa_only=d_aa_only,
                            include_d_aa=include_d_aa,
                            *args, **kwargs)
    
    return chain.from_iterable(peptide_maker(length=length) 
                               for length in length_range)



def _number_of_peptides(max_length: int,
                        min_length: int = 1,
                        by: int = 1,
                        alphabet: Optional[Iterable[str]] = None,
                        d_aa_only: bool = False,
                        include_d_aa: bool = False):
    
    alphabet = _get_alphabet(alphabet=alphabet,
                             d_aa_only=d_aa_only,
                             include_d_aa=include_d_aa)
    n_peptides = [len(alphabet) ** length 
                  for length in range(*sorted([min_length, max_length + 1]), by)]
    
    return n_peptides


def _naive_sample_peptides_in_length_range(max_length: int,
                                           min_length: int = 1,
                                           by: int = 1,
                                           n: Optional[Union[float, int]] = None,
                                           alphabet: Optional[Iterable[str]] = None,
                                           d_aa_only: bool = False,
                                           include_d_aa: bool = False,
                                           set_seed: Optional[int] = None):
    
    alphabet = _get_alphabet(alphabet=alphabet,
                             d_aa_only=d_aa_only,
                             include_d_aa=include_d_aa)
    n_peptides = _number_of_peptides(max_length=max_length,
                                     min_length=min_length,
                                     by=by,
                                     alphabet=alphabet,
                                     d_aa_only=d_aa_only,
                                     include_d_aa=include_d_aa)
    lengths = list(range(*sorted([min_length, max_length + 1]), by))
    weight_per_length = [n / min(n_peptides) for n in n_peptides]
    weighted_lengths = list(chain.from_iterable(repeat(l, ceil(w)) for l, w in zip(lengths, weight_per_length)))

    lengths_sample = (choice(weighted_lengths) for _ in range(n))
    return (''.join(choices(list(alphabet), k=k)) for k in lengths_sample)



[docs]
def sample_peptides_in_length_range(max_length: int,
                                    min_length: int = 1,
                                    by: int = 1,
                                    n: Optional[Union[float, int]] = None,
                                    alphabet: Optional[Iterable[str]] = None,
                                    d_aa_only: bool = False,
                                    include_d_aa: bool = False,
                                    naive_sampling_cutoff: float = 5e-3,
                                    reservoir_sampling: bool = True,
                                    indexes: Optional[Iterable[int]] = None,
                                    set_seed: Optional[int] = None,
                                    *args, **kwargs) -> Iterable[str]:

    """

    """

    seed(set_seed)

    alphabet = _get_alphabet(alphabet=alphabet,
                             d_aa_only=d_aa_only,
                             include_d_aa=include_d_aa)

    n_peptides = sum(len(alphabet) ** length 
                     for length in range(*sorted([min_length, max_length + 1]), by))
    if n is None:
        n_requested = n_peptides
    elif n >= 1.:
        n_requested = min(floor(n), n_peptides)
    elif n < 1.:
        n_requested = floor(n * n_peptides)

    frac_requested = n_requested / n_peptides
    
    # approximation of birthday problem
    p_any_collision = -expm1(-n_requested * (n_requested - 1.) / (2. * n_peptides))
    n_collisons = n_requested * (1. - ((n_peptides - 1.) / n_peptides) ** (n_requested - 1.))
    frac_collisions = n_collisons / n_requested

    print_err(f"Sampling {n_requested} ({frac_requested * 100.} %) peptides from "
              f"length {min_length} to {max_length} ({n_peptides} combinations). "
              f"Probability of collision if drawing randomly is {p_any_collision}, "
              f"with {n_collisons} ({100. * frac_collisions} %) collisions on average.")

    if frac_collisions < naive_sampling_cutoff and n_peptides > 2e9:

        print_err("> Executing naive sampling. ")

        peptides = _naive_sample_peptides_in_length_range(max_length, min_length, by, 
                                                          n=n_requested,
                                                          alphabet=alphabet,
                                                          d_aa_only=d_aa_only,
                                                          include_d_aa=include_d_aa)

    else:

        print_err("> Executing exhaustive sampling.")

        all_peptides = all_peptides_in_length_range(max_length, min_length, by, 
                                                    alphabet=alphabet,
                                                    d_aa_only=d_aa_only,
                                                    include_d_aa=include_d_aa,
                                                    *args, **kwargs)

        if n is None:

            peptides = all_peptides
            
        elif n >= 1.:
            
            if reservoir_sampling:
                peptides = sample_iter(all_peptides, k=n_requested,
                                       shuffle_output=False)
            else:
                peptides = (pep for pep in all_peptides 
                            if random() <= frac_requested)

        elif n < 1.:

            peptides = (pep for pep in all_peptides 
                        if random() <= n)

    if indexes is not None:

        indexes = (int(ix) if (isinstance(ix, str) and ix.isdigit()) or isinstance(ix, int) or isinstance(ix, float) 
                   else None 
                   for ix in islice(indexes, 3))
        indexes = [ix if (ix is None or ix >= 0) else None 
                   for ix in indexes]
        
        if len(indexes) > 1:
            if n is not None and n >=1. and indexes[0] > n:
                raise ValueError(f"Minimum slice ({indexes[0]}) is higher than number of items ({n}).")

        peptides = islice(peptides, *indexes)

    return peptides



def _reactor(smarts: str) -> Callable[[Mol], Union[Mol, None]]:

    rxn = rdChemReactions.ReactionFromSmarts(smarts)
    reaction_function = rxn.RunReactants

    @vectorize
    @return_none_on_error
    def reactor(s: Mol) -> Mol:

        return reaction_function([s])[0][0]
    
    return reactor



[docs]
@_convert_input_to_smiles
def react(strings: Union[str, Iterable[str]], 
          reaction: str = 'N_to_C_cyclization', 
          output_representation: str = 'smiles',
          **kwargs) -> Union[str, Iterable[str]]:
    
    """
    
    """

    try:
        _this_reaction = REACTIONS[reaction]
    except KeyError:
        raise KeyError(f"Reaction {reaction} is not available. Try: " +
                        ", ".join(list(REACTIONS)))

    # strings = cast(strings, to=list)
    # print_err((strings))

    reactor = _reactor(_this_reaction)
    mols = _x2mol(strings)
    mols = reactor(mols)

    return _mol2x(mols, 
                  output_representation=output_representation,
                  **kwargs)