Source code for src.obj.motif_vector

import numpy as np
import jax.numpy as jnp
import yaml
from ..domains.motif_space import _return_motif_categories
from collections import namedtuple
import nifty8 as ift

from .units import make_unit
from .units import transform_unit_to_dict, transform_dict_to_unit

from ..domains.motif_space import MotifSpace
from ..domains.hamming_space import HammingSpace

from ..utils.save import create_directory_path_if_not_already_existing


[docs]
def MotifVector(motiflength : int,
        alphabet : list,
        unit : str):
    """
    returns a factory that builds MotifVector namedtuples which all share
    the given motiflength, alphabet and unit.

    Parameters:
    -----------
    motiflength : int
    alphabet : list
    unit : str
        handed to make_unit

    Returns:
    --------
    makeMotifVector : callable
        takes a dictionary of motif arrays (or nifty Fields) and returns a
        namedtuple with the fields
        'motifs', 'motiflength', 'alphabet', 'number_of_letters', 'unit'.
    """
    # The result is not used; the call itself is kept.
    # NOTE(review): presumably this validates motiflength -- confirm.
    _return_motif_categories(motiflength=motiflength)

    shared_properties = dict(
        motiflength=motiflength,
        alphabet=alphabet,
        number_of_letters=len(alphabet),
        unit=make_unit(unit),
    )
    field_names = ('motifs', *shared_properties)

    def makeMotifVector(motif_vector_dct : dict):
        record_type = namedtuple('MotifVector', field_names)
        # non-Field entries are wrapped into nifty Fields (in place)
        motif_fields = _motif_vector_dct_with_fields(motif_vector_dct, alphabet)
        # the motif domain is always created with unit 'bits'
        motif_domain = MotifSpace.make(alphabet, motiflength, units=make_unit('bits'))
        motifs = ift.MultiField.from_dict(motif_fields, motif_domain)
        return record_type(motifs=motifs, **shared_properties)

    return makeMotifVector



[docs]
def add_zebra_fluctuation_to_motif_vector(motif_vector : MotifVector,
                                          zebra_fluctuation : float = 0.,
                                          braze_fluctuation : float = 0.,
                                          aa_fluctuation : float = 0.,
                                          bb_fluctuation : float = 0.,
                                          fourmer_fluctuation : float = 0.
    ) -> MotifVector:
    """
    returns a new MotifVector in which selected entries of the motif array
    have been perturbed.

    Four entries are changed relative to their own value
    (entry += fluctuation*entry):
        [0,0,2,0] by zebra_fluctuation
        [0,1,1,0] by braze_fluctuation
        [0,0,1,0] by aa_fluctuation
        [0,1,2,0] by bb_fluctuation
    Three entries ([0,0,2,1], [1,1,1,2], [2,0,2,0]) are increased by
    fourmer_fluctuation times the value that entry [0,0,2,0] had before
    any modification.

    Note: the entries are addressed with four hard-coded indices.
    """
    array = _motif_vector_as_array(motif_vector)

    # reference value, read before any entry is altered
    reference = array[0,0,2,0]

    relative_changes = (
        ((0,0,2,0), zebra_fluctuation),
        ((0,1,1,0), braze_fluctuation),
        ((0,0,1,0), aa_fluctuation),
        ((0,1,2,0), bb_fluctuation),
    )
    for entry, fluctuation in relative_changes:
        array[entry] += fluctuation*array[entry]

    for entry in ((0,0,2,1), (1,1,1,2), (2,0,2,0)):
        array[entry] += fourmer_fluctuation*reference

    make_motif_vector = MotifVector(motif_vector.motiflength,
                                    motif_vector.alphabet,
                                    motif_vector.unit)
    return make_motif_vector(
        _array_to_motif_vector_dct(array,
                                   motif_vector.motiflength,
                                   motif_vector.alphabet))



[docs]
def convert_homogeneous_dimers_to_zebra_dimers(
        motif_vector : MotifVector,
        zebra_dimer_concentration : float
    ) -> MotifVector:
    """
    returns a new MotifVector in which zebra_dimer_concentration has been
    added to the entries [0,0,2,0] and [0,1,1,0] of the motif array and
    subtracted from the entries [0,1,2,0] and [0,0,1,0].

    Note: if zebra_dimer_concentration is negative, homogeneous dimers are added and zebra_dimers are reduced.
    """
    array = _motif_vector_as_array(motif_vector)

    gaining_entries = ((0,0,2,0), (0,1,1,0))
    losing_entries = ((0,1,2,0), (0,0,1,0))
    for entry in gaining_entries:
        array[entry] += zebra_dimer_concentration
    for entry in losing_entries:
        array[entry] -= zebra_dimer_concentration

    make_motif_vector = MotifVector(motif_vector.motiflength,
                                    motif_vector.alphabet,
                                    motif_vector.unit)
    return make_motif_vector(
        _array_to_motif_vector_dct(array,
                                   motif_vector.motiflength,
                                   motif_vector.alphabet))



[docs]
def convert_dimers_to_tetramers(
        motif_vector : MotifVector,
        zebra_tetramer_concentration : float
    ) -> MotifVector:
    """
    returns a new MotifVector in which dimer concentration has been moved
    into tetramer entries of the motif array.

    For zebra_tetramer_concentration >= 0 the dimer entries [0,0,2,0] and
    [0,1,1,0] are reduced by the given amount and six tetramer entries
    ([0,0,2,1], [1,1,1,2], [2,0,2,0], [0,1,1,2], [2,0,2,1], [1,1,1,0])
    are each increased by half of it.

    Note: if zebra_tetramer_concentration is negative, homogeneous dimers are converted to tetramers instead.
    In that case the magnitude of the value is used: the dimer entries
    [0,1,2,0] and [0,0,1,0] are reduced by it and the entries
    [0,0,1,1], [1,0,1,1], [1,0,1,0], [0,1,2,2], [2,1,2,2], [2,1,2,0]
    are each increased by half of it.

    Parameters:
    -----------
    motif_vector : MotifVector
    zebra_tetramer_concentration : float
        the sign selects which dimers are converted (see above),
        the magnitude is the converted dimer concentration.

    Returns:
    --------
    motif_vector : MotifVector
    """
    motif_array = _motif_vector_as_array(motif_vector)

    if zebra_tetramer_concentration >= 0.:
        converted = zebra_tetramer_concentration
        tetramer_entries = ((0,0,2,1), (1,1,1,2), (2,0,2,0),
                            (0,1,1,2), (2,0,2,1), (1,1,1,0))
        dimer_entries = ((0,0,2,0), (0,1,1,0))
    else:
        # The sign only selects the homogeneous species. Using the raw
        # negative value here would reduce the tetramer entries and increase
        # the dimer entries, i.e. the opposite of the documented conversion.
        converted = -zebra_tetramer_concentration
        tetramer_entries = ((0,0,1,1), (1,0,1,1), (1,0,1,0),
                            (0,1,2,2), (2,1,2,2), (2,1,2,0))
        dimer_entries = ((0,1,2,0), (0,0,1,0))

    for entry in tetramer_entries:
        motif_array[entry] += .5*converted
    for entry in dimer_entries:
        motif_array[entry] -= converted

    return MotifVector(motif_vector.motiflength, motif_vector.alphabet, motif_vector.unit)(_array_to_motif_vector_dct(motif_array, motif_vector.motiflength, motif_vector.alphabet))


def _motif_vector_dct_with_fields(motif_vector_dct,
                                  alphabet : list) -> dict:
    """
    wraps every entry of motif_vector_dct that is not yet a nifty Field
    into a Field over a HammingSpace.

    The dictionary is modified in place and also returned. Entries are
    converted with np.asarray before their number of dimensions is read,
    so plain (nested) sequences are accepted as well as arrays.

    Parameters:
    -----------
    motif_vector_dct : dict
        values are nifty Fields or array-likes
    alphabet : list

    Returns:
    --------
    motif_vector_dct : dict
        the same dictionary object, all values being nifty Fields
    """
    # Only existing keys are reassigned, the items are snapshotted first so
    # the dictionary is not iterated while being written to.
    for key, value in list(motif_vector_dct.items()):
        if isinstance(value, ift.Field):
            continue
        values = np.asarray(value)
        motif_vector_dct[key] = ift.Field(
            ift.DomainTuple.make(HammingSpace(alphabet, values.ndim)),
            values)
    return motif_vector_dct

def _create_empty_motif_vector_dct(motiflength : int,
        alphabet : list = ['a','b']) -> dict:
    """
    returns a dictionary with one zero-filled array per motif category.

    Strands of length 1 to motiflength-2 get an array with one axis per
    letter. Of the last three motif categories the second-to-last one gets
    an array with motiflength axes, the other two get motiflength-1 axes.
    Every axis has len(alphabet) entries.

    Parameters:
    -----------
    motiflength : int
    alphabet : list (optional)
        only its length is used
        default : ['a','b']

    Returns:
    --------
    empty_motif_vector : dict
    """
    letters_per_axis = len(alphabet)

    def zeros_with_axes(number_of_axes):
        return np.zeros((letters_per_axis,)*number_of_axes)

    blank = {
        _return_motif_categories()[0].format(length): zeros_with_axes(length)
        for length in range(1,motiflength-1)
    }

    categories = _return_motif_categories(motiflength=motiflength)
    for name in categories[-3:]:
        number_of_axes = motiflength if name==categories[-2] else motiflength-1
        blank[name] = zeros_with_axes(number_of_axes)
    return blank


[docs]
def isinstance_motifvector(obj : object,
                           print_statements : bool = True
                           ) -> bool:
    """
    checks whether obj has the structure of a MotifVector.

    obj is accepted if
      * it is a namedtuple (a tuple with '_asdict' and '_fields'),
      * its fields are exactly
        'motifs', 'motiflength', 'alphabet', 'number_of_letters', 'unit',
      * every key of obj.motifs is one of the motif categories returned by
        _return_motif_categories for obj.motiflength.

    Objects that are not namedtuples or that lack a required field yield
    False instead of raising an AttributeError.

    Parameters:
    -----------
    obj : object
    print_statements : bool (optional)
        whether the reason for a rejection is printed
        default : True

    Returns:
    --------
    is_motif_vector : bool
    """
    def reject(message):
        if print_statements:
            print(message)
        return False

    # Structural check first: everything below relies on the namedtuple API.
    if not (isinstance(obj, tuple)
            and hasattr(obj, '_asdict')
            and hasattr(obj, '_fields')):
        return reject('Not a MotifVector, not a namedtuple.')

    expected_fields = ('motifs', 'motiflength', 'alphabet', 'number_of_letters', 'unit')
    present_fields = obj._asdict().keys()
    for key in present_fields:
        if key not in expected_fields:
            return reject('Not a MotifVector, unexpected key: {}.'.format(key))
    # obj.motiflength and obj.motifs are accessed below, so required fields
    # must be present before going on.
    for key in expected_fields:
        if key not in present_fields:
            return reject('Not a MotifVector, missing key: {}.'.format(key))

    allowed_categories = list(_return_motif_categories(motiflength=obj.motiflength))
    for key in obj.motifs.keys():
        if key not in allowed_categories:
            return reject('Not a MotifVector, unexpected key in motifs field: {}.'.format(key))
    return True



[docs]
def are_compatible_motif_vectors(mv1 : MotifVector, mv2 : MotifVector) -> bool:
    """
    checks whether two MotifVectors agree in motiflength, alphabet and unit.

    Both arguments are first checked with isinstance_motifvector; the
    reason for a negative result is printed.

    Parameters:
    -----------
    mv1 : MotifVector
    mv2 : MotifVector

    Returns:
    --------
    compatible : bool
    """
    for candidate in (mv1, mv2):
        if not isinstance_motifvector(candidate):
            print('Object is not a MotifVector')
            return False

    first_properties = mv1._asdict()
    second_properties = mv2._asdict()
    for name in ('motiflength', 'alphabet', 'unit'):
        # np.prod turns an element-wise comparison into one truth value
        agree = np.prod(first_properties[name]==second_properties[name])
        if not agree:
            print('MotifVectors not compatible: {} mismatch.'.format(name))
            return False
    return True


def _motif_indices_in_motifs_array(motif_vector_array : np.ndarray,
        motiflength : int,
        is_beginning : bool
        ) -> tuple:
    """
    returns the index-tuple that addresses, inside an array with
    motiflength axes, the block into which motif_vector_array fits.

    The tuple starts with 0 for a beginning and with slice(1, n+1)
    otherwise, continues with slice(0, n), then with slice(1, n+1) for the
    remaining axes of motif_vector_array and is padded with 0 up to
    motiflength entries (n being the size of the first axis of
    motif_vector_array).

    Parameters:
    -----------
    motif_vector_array : np.ndarray,
    motiflength : int,
    is_beginning : bool
        ignored (treated as False) if motif_vector_array already has
        motiflength axes

    Returns:
    --------
    motif_indices : tuple
    """
    strandlength = len(motif_vector_array.shape)
    number_of_letters = motif_vector_array.shape[0]

    # an array that fills all axes cannot be a beginning
    begins = bool(is_beginning) and strandlength != motiflength

    occupied = slice(1,number_of_letters+1)
    if begins:
        indices = (0,)
        remaining_axes = strandlength-1
    else:
        indices = (occupied,)
        remaining_axes = strandlength-2

    indices += (slice(0,number_of_letters),)
    # a non-positive count contributes nothing
    indices += (occupied,)*remaining_axes
    return indices + (0,)*(motiflength-len(indices))


[docs]
def categories_indices(motiflength : int,
                       alphabet : list
                       ) -> dict:
    """
    returns a dictionary with motif categories as keys and index-tuples as their values.

    Each index-tuple has motiflength entries. Categories for strands of
    length 1 to motiflength-2 are taken from the front of the category
    list and are always treated as beginnings; the last three categories
    are handled separately, only the first of them being a beginning.

    Parameters:
    -----------
    motiflength : int
    alphabet : list
        only its length is used

    Returns:
    --------
    categories_indices : dict
    """
    category_names = _return_motif_categories(motiflength)
    number_of_letters = len(alphabet)

    occupied = slice(1,number_of_letters+1)
    any_letter = slice(0,number_of_letters)
    has_letter_only_axis = motiflength>2

    def build_indices(strandlength, is_beginning):
        indices = (0,) if is_beginning else (occupied,)
        if has_letter_only_axis:
            indices += (any_letter,)
        indices += (occupied,)*(
                strandlength
                -int(has_letter_only_axis)
                -int(not is_beginning)
                )
        return indices + (0,)*(motiflength-len(indices))

    # this range is empty unless motiflength > 2
    indices_by_category = {
        category_names[strandlength-1]: build_indices(strandlength, True)
        for strandlength in range(1,motiflength-1)
    }

    for name in category_names[-3:]:
        is_beginning = (name==category_names[-3])
        # only the second-to-last category spans all motiflength axes
        spans_all_axes = (name==category_names[-2])
        strandlength = motiflength if spans_all_axes else motiflength-1
        indices_by_category[name] = build_indices(strandlength, is_beginning)
    return indices_by_category



[docs]
def category_indices(motif_category : str,
                     motiflength : int,
                     alphabet : list
                     ) -> tuple:
    """
    returns the array-index-tuple for a certain motif_category.

    Looks motif_category up in the dictionary built by categories_indices;
    a KeyError is raised if the category is not contained in it.
    """
    indices_by_category = categories_indices(motiflength, alphabet)
    return indices_by_category[motif_category]


def _transform_sequence_array_to_motif_array(
        sequence_array,
        motiflength : int,
        sequence_is_beginning : bool = True
        ) -> np.ndarray:
    """
    embeds an array that only tracks letters (occupied nucleotides) into a
    zero-filled array that explicitly tracks empty spots with index zero.
    Note that the second spot is always a letter (for motiflength > 2 the
    second axis has no entry for an empty spot).

    Parameters:
    -----------
    sequence_array : nd-array
    motiflength : int
    sequence_is_beginning : boolean (optional)
        whether the sequence is a beginning, i.e. the first spot is empty,
        else it is considered as end, or as continuation if all spots are
        letters. Ignored if sequence_array has motiflength axes.
        default : True

    Returns:
    --------
    motif_array : nd-array
        array with motiflength axes, zero everywhere except for the block
        holding sequence_array
    """
    strandlength = len(sequence_array.shape)
    number_of_letters = sequence_array.shape[0]

    # a sequence that occupies every spot cannot be a beginning
    begins = bool(sequence_is_beginning) and strandlength != motiflength
    letter_only_axes = int(motiflength>2)

    with_empty_spot = number_of_letters+1
    motif_array = np.zeros(
        (with_empty_spot,)
        + (number_of_letters,)*letter_only_axes
        + (with_empty_spot,)*(motiflength-1-letter_only_axes)
    )

    letters = slice(1,None)
    target = [0 if begins else letters]
    if letter_only_axes:
        target.append(slice(None))
    # a non-positive count contributes nothing
    target.extend([letters]*(strandlength-1-letter_only_axes+int(begins)))
    # all remaining spots are empty
    target.extend([0]*(motiflength-len(target)))

    motif_array[tuple(target)] = sequence_array
    return motif_array

def _motif_vector_as_array(motif_vector : MotifVector
        ) -> np.ndarray:
    """
    collects all motif categories of a motif vector in one numpy-array
    with motiflength axes.

    Every category is embedded with
    _transform_sequence_array_to_motif_array and the results are summed.
    All categories but the last two are embedded as beginnings.

    Parameters:
    -----------
    motif_vector : MotifVector

    Returns:
    --------
    motif_array : np.ndarray
    """
    motiflength = motif_vector.motiflength
    number_of_letters = motif_vector.number_of_letters
    categories = _return_motif_categories(motiflength)

    letter_only_axes = int(motiflength>2)
    dense = np.zeros(
        (number_of_letters+1,)
        + (number_of_letters,)*letter_only_axes
        + (number_of_letters+1,)*(motiflength-1-letter_only_axes)
    )

    first_non_beginning = len(categories)-2
    for position, category in enumerate(categories):
        dense += _transform_sequence_array_to_motif_array(
                motif_vector.motifs[category].val,
                motiflength,
                sequence_is_beginning = position < first_non_beginning
                )
    return dense

def _array_to_motif_vector_dct(motif_vector_array : np.ndarray,
        motiflength : int,
        alphabet : list,
        ) -> dict:
    """
    splits a motif array into a dictionary with one sub-array per motif
    category.

    The sub-arrays are obtained by indexing motif_vector_array with
    integers and slices. Blocks whose first index is 0 are stored under
    the strand / beginning categories, blocks whose first index is
    slice(1, None) under the last two categories.

    Parameters:
    -----------
    motif_vector_array : np.ndarray
        array with motiflength axes, first axis of size len(alphabet)+1
    motiflength : int
    alphabet : list
        only its length is used

    Returns:
    --------
    motif_vector_dct : dict

    Raises:
    -------
    ValueError
        if the number of axes differs from motiflength or the first axis
        does not have len(alphabet)+1 entries
    """
    array_shape = motif_vector_array.shape
    if len(array_shape) != motiflength:
        raise ValueError("motiflength inconsistent with array shape")
    number_of_letters = len(alphabet)
    if array_shape[0]-1 != number_of_letters:
        raise ValueError("alphabet shape ({alphabet_shape}) inconsistent with motif_vector_array shape ({motif_vector_array_shape})".format(
            alphabet_shape = len(alphabet),
            motif_vector_array_shape = motif_vector_array.shape
                    ))

    letter_only_axes = int(motiflength>2)
    letters = slice(1,None)

    def padded(indices):
        # remaining spots are empty, i.e. index 0
        return indices + (0,)*(motiflength-len(indices))

    extracted = {}
    # blocks with an empty first spot
    for strandlength in range(1,motiflength):
        if strandlength == (motiflength-1):
            name = _return_motif_categories()[-3]
        else:
            name = _return_motif_categories()[0].format(strandlength)
        indices = (0,)
        indices += (slice(None),)*letter_only_axes
        indices += (letters,)*(strandlength-letter_only_axes)
        extracted[name] = motif_vector_array[padded(indices)]

    # blocks with a letter on the first spot
    categories = _return_motif_categories(motiflength=motiflength)
    for name in categories[-2:]:
        spans_all_axes = int(name==categories[-2])
        indices = (letters,)
        indices += (slice(None),)*letter_only_axes
        indices += (letters,)*(motiflength-2-letter_only_axes+spans_all_axes)
        extracted[name] = motif_vector_array[padded(indices)]
    return extracted


[docs]
def save_motif_vector(archive_path : str,
        motif_vector : MotifVector
        ) -> None:
    """
    stores a motif vector in two files whose names start with archive_path.

    The motif array (see _motif_vector_as_array) is written with jnp.save
    to archive_path+'motifs'; motiflength, alphabet and the unit (as
    dictionary) are written as yaml to archive_path+'properties.yaml'.

    Note: archive_path is used as plain string prefix, no path separator
    is inserted.

    Parameters:
    -----------
    archive_path : str
    motif_vector : MotifVector
    """
    create_directory_path_if_not_already_existing(archive_path)

    dense_motifs = _motif_vector_as_array(motif_vector)
    jnp.save(archive_path+'motifs', dense_motifs)

    with open(archive_path+'properties.yaml','w') as yaml_file:
        properties = {
            'motiflength':motif_vector.motiflength,
            'alphabet':motif_vector.alphabet,
            'unit':transform_unit_to_dict(motif_vector.unit),
        }
        yaml.dump(properties, yaml_file, indent=4)



[docs]
def load_motif_vector(archive_path : str
        ) -> MotifVector:
    """
    restores a motif vector from the files
    archive_path+'properties.yaml' and archive_path+'motifs.npy'.

    Parameters:
    -----------
    archive_path : str
        plain string prefix of both file names

    Returns:
    --------
    motif_vector : MotifVector
    """
    properties_filename = archive_path+'properties'+'.yaml'
    motifs_filename = archive_path+'motifs'+'.npy'

    with open(properties_filename, 'r') as yaml_file:
        properties = yaml.safe_load(yaml_file)
    # the unit is stored as dictionary and has to be rebuilt
    properties['unit'] = transform_dict_to_unit(properties['unit'])

    make_motif_vector = MotifVector(**properties)
    # NOTE(review): jnp.load returns a JAX array; presumably its dtype
    # depends on the JAX x64 setting -- confirm precision requirements.
    dense_motifs = jnp.load(motifs_filename)
    return make_motif_vector(
            _array_to_motif_vector_dct(dense_motifs,
                                       properties['motiflength'],
                                       properties['alphabet'])
            )