# Source code for src.obj.motif_breakage_vector

import numpy as np
import nifty8 as ift
from collections import namedtuple
from ..obj.units import make_unit, Unit

from ..domains.motif_space import _return_motif_categories
from ..domains.motif_breakage_space import MotifBreakageSpace


# [docs]  (documentation link marker left over from the rendered source page)
def MotifBreakageVector(motiflength : int,
                        alphabet : list,
                        unit : Unit):
    """
    builds a constructor for motif breakage vectors that share the same
    motiflength, alphabet and unit.

    Parameters:
    -----------
    motiflength : int
    alphabet : list
    unit : Unit

    Returns:
    --------
    makeMotifBreakageVector : callable
        takes a dictionary of raw breakage values and returns a namedtuple
        called 'MotifBreakageVector' with the fields breakages, motiflength,
        alphabet, number_of_letters and unit (in that order).
    """
    # The result is not used below; the call is kept so that whatever
    # _return_motif_categories does for this motiflength still happens.
    motif_categories = _return_motif_categories(motiflength=motiflength)[1:]

    shared_fields = dict(
        motiflength=motiflength,
        alphabet=alphabet,
        number_of_letters=len(alphabet),
        unit=unit,
    )

    def makeMotifBreakageVector(motif_breakage_vector_dct : dict):
        # wrap the raw dictionary into a MultiField living on the breakage space
        domain = MotifBreakageSpace.make(alphabet, motiflength)
        breakages = ift.MultiField.from_raw(domain, motif_breakage_vector_dct)

        # 'breakages' comes first, followed by the shared properties
        field_names = ['breakages']
        field_names.extend(shared_fields)
        vector_type = namedtuple('MotifBreakageVector', field_names)
        return vector_type(breakages=breakages, **shared_fields)

    return makeMotifBreakageVector


def _create_empty_motif_breakage_dct(motiflength : int,
        alphabet : list = ['a','b']) -> dict:
    """
    creates a dictionary of zero-filled arrays, one per breakage category.

    Parameters:
    -----------
    motiflength : int
    alphabet : list (optional)
        only its length is used; the list is never modified here.
        default : ['a','b']

    Returns:
    --------
    empty_motif_vector : dict
        maps '<category>_<hyphen_index>' to a numpy array of zeros with one
        axis of size len(alphabet) per position of the category.
    """
    number_of_letters = len(alphabet)
    zero_arrays = {}

    # strands of length 2 .. motiflength-2, broken at spots 1 .. strandlength-1
    for strandlength in range(2, motiflength-1):
        strand_key = _return_motif_categories()[0].format(strandlength)
        strand_shape = (number_of_letters,)*strandlength
        zero_arrays.update(
            (strand_key + '_{}'.format(spot), np.zeros(strand_shape))
            for spot in range(1, strandlength)
        )

    # the last three categories; the second to last one has one axis more
    all_categories = _return_motif_categories(motiflength=motiflength)
    longer_category = all_categories[-2]
    for category in all_categories[-3:]:
        rank = motiflength - 1
        if category == longer_category:
            rank += 1
        category_shape = (number_of_letters,)*rank
        zero_arrays.update(
            (category + '_{}'.format(spot), np.zeros(category_shape))
            for spot in range(rank)
        )
    return zero_arrays

def _breakage_array_indices(
        motiflength : int,
        strandlength : int,
        breakage_spot : int,
        sequence_is_beginning : bool
    ) -> tuple:
    """
    sets up the indices of the breakage array given the parameters.

    The returned tuple is meant to index an array with motiflength axes.
    It consists of three kinds of entries:
    slice(None) for the two axes next to the breakage,
    slice(1,None) for axes that skip index 0, and
    0 for axes that are pinned to index 0.
    For the parameter combinations used in this module the two slice(None)
    entries end up on the axes motiflength-motiflength//2-1 and
    motiflength-motiflength//2.

    Parameters:
    -----------
    motiflength : int
    strandlength : int
        number of axes of the sequence array that is addressed
    breakage_spot : int
        number of strand positions in front of the breakage
    sequence_is_beginning : bool
        if True, at least one axis in front of the breakage is pinned to 0

    Returns:
    --------
    indices_sequence_arrays : tuple of slices and ints
    """
    second_half = motiflength//2
    first_half = motiflength - second_half

    # parts of the strand that do not fit on their own side of the breakage
    end_overlap = max(0, strandlength-breakage_spot-second_half)
    beginning_overlap = max(0, breakage_spot-first_half)
    # parts of the strand that stay on their own side of the breakage
    end_length_without_overlap = strandlength-breakage_spot-end_overlap
    beginning_length_without_overlap = min(breakage_spot, first_half)
    # axes that are not covered by the strand and therefore pinned to 0
    zeros_between_end_overlap_and_beginning = max(
        int(sequence_is_beginning),
        first_half-end_overlap-beginning_length_without_overlap)
    zeros_between_end_and_beginning_overlap = max(
        0,
        second_half-end_length_without_overlap-beginning_overlap)

    # multiplying a tuple by a non-positive number yields an empty tuple
    indices_sequence_arrays = (slice(1,None),)*end_overlap
    indices_sequence_arrays += (0,)*zeros_between_end_overlap_and_beginning
    indices_sequence_arrays += (slice(1,None),)*(beginning_length_without_overlap-1)
    indices_sequence_arrays += (slice(None),)*2
    indices_sequence_arrays += (slice(1,None),)*(end_length_without_overlap-1)
    indices_sequence_arrays += (0,)*zeros_between_end_and_beginning_overlap
    indices_sequence_arrays += (slice(1,None),)*beginning_overlap
    return indices_sequence_arrays

def _transform_sequence_array_to_motif_breakage_array(
        sequence_array,
        motiflength : int,
        breakage_spot : int,
        sequence_is_beginning : bool = True
        ) -> np.ndarray:
    """
    transforms an array that only tracks letters (occupied nucleotides)
    into an array that explicitly tracks empty spots with zeros.
    Note that the two spots next to the breakage are always letters, though.

    Parameters:
    -----------
    sequence_array : nd-array
        one axis per strand position, each of size number_of_letters
    motiflength : int
    breakage_spot : int
        number of strand positions in front of the breakage
    sequence_is_beginning : boolean (optional)
        whether the sequence is beginning, i.e. the first spot is zero,
        else it will be considered as end or continuation if all spots are
        letters
        default : True

    Returns:
    --------
    motif_array : nd-array
        array with motiflength axes; the two axes next to the breakage have
        size number_of_letters, all other axes have size number_of_letters+1
    """
    strandlength = len(sequence_array.shape)
    # a strand that fills the whole motif cannot be a beginning
    if strandlength == motiflength:
        sequence_is_beginning = False
    number_of_letters = sequence_array.shape[0]

    indices_sequence_arrays = _breakage_array_indices(motiflength, strandlength, breakage_spot, sequence_is_beginning)

    # The letter-only axes have to sit where _breakage_array_indices puts its
    # two slice(None) entries. Hard-coding a single leading axis only matches
    # for motiflength <= 4 and makes the assignment below fail for longer
    # motifs, so the number of padded axes is derived from the same split.
    axes_before_breakage = motiflength - motiflength//2 - 1
    axes_after_breakage = motiflength - 2 - axes_before_breakage
    motif_array_shape = (number_of_letters+1,)*axes_before_breakage
    motif_array_shape += (number_of_letters,)*2
    motif_array_shape += (number_of_letters+1,)*axes_after_breakage
    motif_array = np.zeros(motif_array_shape)

    motif_array[indices_sequence_arrays] = sequence_array
    return motif_array

def _motif_breakage_vector_as_array(
        motif_breakage_vector : MotifBreakageVector
        ) -> np.ndarray:
    """
    transforms a motif breakage vector into a single numpy-array by summing
    the embedded arrays of all its breakage categories.

    Parameters:
    -----------
    motif_breakage_vector : MotifBreakageVector
        namedtuple as produced by the MotifBreakageVector constructor;
        the fields motiflength, number_of_letters and breakages are read.

    Returns:
    --------
    motif_breakage_array : np.ndarray
        array with motiflength axes; the two axes next to the breakage have
        size number_of_letters, all other axes have size number_of_letters+1
    """
    motiflength = motif_breakage_vector.motiflength
    number_of_letters = motif_breakage_vector.number_of_letters
    breakages = motif_breakage_vector.breakages

    category_names = _return_motif_categories()
    strand_name = category_names[0]
    motif_names = [category_names[ii] for ii in range(1, 4)]

    first_half = motiflength - motiflength//2
    # breakage spots of the three motif categories, in the order of motif_names
    breakage_spot_ranges = (
        range(1, first_half),
        range(first_half, first_half+1),
        range(first_half, motiflength-1),
    )

    # (key in breakages, breakage spot, sequence_is_beginning)
    contributions = [
        (strand_name.format(strandlength)+'_{}'.format(breakage_spot), breakage_spot, True)
        for strandlength in range(1, motiflength-1)
        for breakage_spot in range(1, strandlength)
    ]
    contributions += [
        (name+'_{}'.format(breakage_spot), breakage_spot, name=='beginning')
        for name, breakage_spots in zip(motif_names, breakage_spot_ranges)
        for breakage_spot in breakage_spots
    ]

    # Same layout as the arrays returned by
    # _transform_sequence_array_to_motif_breakage_array: the letter-only axes
    # sit directly around the breakage. A single hard-coded leading axis only
    # matches the index layout for motiflength <= 4.
    axes_before_breakage = first_half - 1
    axes_after_breakage = motiflength - 2 - axes_before_breakage
    motif_array_shape = (number_of_letters+1,)*axes_before_breakage
    motif_array_shape += (number_of_letters,)*2
    motif_array_shape += (number_of_letters+1,)*axes_after_breakage
    motif_array = np.zeros(motif_array_shape)

    for key, breakage_spot, sequence_is_beginning in contributions:
        motif_array += _transform_sequence_array_to_motif_breakage_array(
            breakages[key].val,
            motiflength,
            breakage_spot,
            sequence_is_beginning = sequence_is_beginning
        )
    return motif_array

def _array_to_motif_breakage_vector(
        motif_breakage_vector_array : np.ndarray,
        motiflength : int,
        alphabet : list,
        unit : Unit
        ) -> dict:
    """
    splits a motif breakage array into the arrays of its breakage categories.

    Parameters:
    -----------
    motif_breakage_vector_array : np.ndarray
        array with motiflength axes whose first axis has size len(alphabet)+1
    motiflength : int
    alphabet : list
        only its length is used
    unit : Unit
        not used in this function

    Returns:
    --------
    motif_vector_dct : dict
        maps '<category>_<breakage_spot>' to the part of
        motif_breakage_vector_array selected by _breakage_array_indices.
        Note that a plain dictionary is returned, not a MotifBreakageVector.

    Raises:
    -------
    ValueError
        if the number of axes differs from motiflength or if the size of the
        first axis is not len(alphabet)+1
    """
    if motiflength != len(motif_breakage_vector_array.shape):
        raise ValueError("motiflength inconsistent with array shape")
    number_of_letters = len(alphabet)
    if number_of_letters != motif_breakage_vector_array.shape[0]-1:
        raise ValueError("alphabet shape ({alphabet_shape}) inconsistent with motif_breakage_vector_array shape ({motif_vector_array_shape})".format(
            alphabet_shape = len(alphabet),
            motif_vector_array_shape = motif_breakage_vector_array.shape
                    ))

    category_names = _return_motif_categories()
    strand_name = category_names[0]
    motif_names = [category_names[ii] for ii in range(1, 4)]

    first_half = motiflength - motiflength//2
    # breakage spots of the three motif categories, in the order of motif_names
    breakage_spot_ranges = (
        range(1, first_half),
        range(first_half, first_half+1),
        range(first_half, motiflength-1),
    )

    motif_vector_dct = {}
    # strands that are shorter than the motif are always treated as beginning
    for strandlength in range(1, motiflength-1):
        for breakage_spot in range(1, strandlength):
            key = strand_name.format(strandlength)+'_{}'.format(breakage_spot)
            current_indices = _breakage_array_indices(motiflength, strandlength, breakage_spot, sequence_is_beginning=True)
            motif_vector_dct[key] = motif_breakage_vector_array[current_indices]
    for name, breakage_spots in zip(motif_names, breakage_spot_ranges):
        sequence_is_beginning = name=='beginning'
        # only the continuation covers the full motif, the others are one shorter
        strandlength = motiflength-int(name!='continuation')
        for breakage_spot in breakage_spots:
            key = name+'_{}'.format(breakage_spot)
            current_indices = _breakage_array_indices(motiflength, strandlength, breakage_spot, sequence_is_beginning)
            motif_vector_dct[key] = motif_breakage_vector_array[current_indices]
    return motif_vector_dct


# [docs]  (documentation link marker left over from the rendered source page)
def isinstance_motifbreakagevector(obj) -> bool:
    """
    checks whether obj looks like a motif breakage vector.

    An object is accepted if it is a namedtuple with exactly the fields
    motiflength, alphabet, number_of_letters, unit and breakages, and if
    every key of its breakages field is a key of the MotifBreakageSpace
    built from its alphabet and motiflength. The reason for a rejection is
    printed.

    Parameters:
    -----------
    obj : any

    Returns:
    --------
    is_motif_breakage_vector : bool
    """
    expected_fields = ('motiflength', 'alphabet', 'number_of_letters', 'unit', 'breakages')

    # Check the container type first, otherwise objects that are no
    # namedtuples raise an AttributeError instead of yielding False.
    if not (isinstance(obj, tuple) and hasattr(obj, '_asdict')):
        print('Not a MotifBreakageVector, not a namedtuple: {}.'.format(type(obj).__name__))
        return False

    present_fields = obj._asdict().keys()
    for key in expected_fields:
        if key not in present_fields:
            print('Not a MotifBreakageVector, missing key: {}.'.format(key))
            return False
    for key in present_fields:
        if key not in expected_fields:
            print('Not a MotifBreakageVector, unexpected key: {}.'.format(key))
            return False

    allowed_breakage_keys = MotifBreakageSpace.make(obj.alphabet, obj.motiflength).keys()
    for key in obj.breakages.keys():
        if key not in allowed_breakage_keys:
            print('Not a MotifBreakageVector, unexpected key in breakages field: {}.'.format(key))
            return False
    return True