Source code for src.obj.motif_vector

import numpy as np
import jax.numpy as jnp
import yaml
from ..domains.motif_space import _return_motif_categories
from collections import namedtuple
import nifty8 as ift

from .units import make_unit
from .units import transform_unit_to_dict, transform_dict_to_unit

from ..domains.motif_space import MotifSpace
from ..domains.hamming_space import HammingSpace

from ..utils.save import create_directory_path_if_not_already_existing

def MotifVector(motiflength : int, alphabet : list, unit : str):
    """
    returns a factory that builds MotifVector namedtuples for a fixed
    motiflength, alphabet and unit.

    Parameters:
    -----------
    motiflength : int
    alphabet : list
    unit : str
        handed to make_unit; the result is stored in the 'unit' field

    Returns:
    --------
    makeMotifVector : callable
        takes a dictionary of motif arrays (or fields) and returns a
        namedtuple with the fields
        ('motifs', 'motiflength', 'alphabet', 'number_of_letters', 'unit')
    """
    # NOTE(review): the return value of this call is never used. It is kept
    # in case the call validates motiflength -- confirm, then remove.
    _return_motif_categories(motiflength=motiflength)

    shared_properties = {
        'motiflength' : motiflength,
        'alphabet' : alphabet,
        'number_of_letters' : len(alphabet),
        'unit' : make_unit(unit),
    }

    def makeMotifVector(motif_vector_dct : dict):
        field_names = ('motifs',) + tuple(shared_properties.keys())
        motif_vector_type = namedtuple('MotifVector', field_names)
        motif_fields = _motif_vector_dct_with_fields(motif_vector_dct, alphabet)
        # the domain of the motifs is always created with the unit 'bits',
        # independent of the unit stored in the properties
        motif_domain = MotifSpace.make(alphabet, motiflength, units=make_unit('bits'))
        motifs = ift.MultiField.from_dict(motif_fields, motif_domain)
        return motif_vector_type(motifs=motifs, **shared_properties)

    return makeMotifVector
def add_zebra_fluctuation_to_motif_vector(motif_vector : MotifVector,
        zebra_fluctuation : float = 0.,
        braze_fluctuation : float = 0.,
        aa_fluctuation : float = 0.,
        bb_fluctuation : float = 0.,
        fourmer_fluctuation : float = 0.
        ) -> MotifVector:
    """
    perturbs selected entries of the motif array and returns the result as
    a new MotifVector.

    Note: the entries are addressed with hard-coded four-component indices
    with values up to 2, so the motif array has to be four-dimensional.

    Parameters:
    -----------
    motif_vector : MotifVector
    zebra_fluctuation : float (optional)
        relative change of entry [0,0,2,0]
    braze_fluctuation : float (optional)
        relative change of entry [0,1,1,0]
    aa_fluctuation : float (optional)
        relative change of entry [0,0,1,0]
    bb_fluctuation : float (optional)
        relative change of entry [0,1,2,0]
    fourmer_fluctuation : float (optional)
        entries [0,0,2,1], [1,1,1,2] and [2,0,2,0] are each increased by
        this factor times the unperturbed value of entry [0,0,2,0]

    Returns:
    --------
    motif_vector : MotifVector
    """
    motif_array = _motif_vector_as_array(motif_vector)

    # read the reference value before entry [0,0,2,0] itself is perturbed
    reference_value = motif_array[0,0,2,0]

    relative_changes = (
        ((0,0,2,0), zebra_fluctuation),
        ((0,1,1,0), braze_fluctuation),
        ((0,0,1,0), aa_fluctuation),
        ((0,1,2,0), bb_fluctuation),
    )
    for entry, fluctuation in relative_changes:
        motif_array[entry] += fluctuation*motif_array[entry]

    for entry in ((0,0,2,1), (1,1,1,2), (2,0,2,0)):
        motif_array[entry] += fourmer_fluctuation*reference_value

    make_motif_vector = MotifVector(
        motif_vector.motiflength, motif_vector.alphabet, motif_vector.unit
    )
    return make_motif_vector(
        _array_to_motif_vector_dct(
            motif_array, motif_vector.motiflength, motif_vector.alphabet
        )
    )
def convert_homogeneous_dimers_to_zebra_dimers(
        motif_vector : MotifVector,
        zebra_dimer_concentration : float
        ) -> MotifVector:
    """
    shifts concentration between four dimer entries of the motif array and
    returns the result as a new MotifVector.

    Entries [0,0,2,0] and [0,1,1,0] are increased by
    zebra_dimer_concentration, entries [0,1,2,0] and [0,0,1,0] are decreased
    by the same amount.

    Note: if zebra_dimer_concentration is negative, homogeneous dimers
    are added and zebra_dimers are reduced.

    Parameters:
    -----------
    motif_vector : MotifVector
    zebra_dimer_concentration : float

    Returns:
    --------
    motif_vector : MotifVector
    """
    motif_array = _motif_vector_as_array(motif_vector)

    for gaining_entry in ((0,0,2,0), (0,1,1,0)):
        motif_array[gaining_entry] += zebra_dimer_concentration
    for losing_entry in ((0,1,2,0), (0,0,1,0)):
        motif_array[losing_entry] -= zebra_dimer_concentration

    make_motif_vector = MotifVector(
        motif_vector.motiflength, motif_vector.alphabet, motif_vector.unit
    )
    return make_motif_vector(
        _array_to_motif_vector_dct(
            motif_array, motif_vector.motiflength, motif_vector.alphabet
        )
    )
def convert_dimers_to_tetramers(
        motif_vector : MotifVector,
        zebra_tetramer_concentration : float
        ) -> MotifVector:
    """
    shifts concentration between two dimer entries and six longer-motif
    entries of the motif array and returns the result as a new MotifVector.

    Half of zebra_tetramer_concentration is added to each of six entries
    and the full value is subtracted from two dimer entries. Which entries
    are touched depends on the sign of zebra_tetramer_concentration:

    - value >= 0: [0,0,2,1], [1,1,1,2], [2,0,2,0], [0,1,1,2], [2,0,2,1],
      [1,1,1,0] are increased; [0,0,2,0] and [0,1,1,0] are decreased.
    - value < 0: the (negative) half value is added to [0,0,1,1],
      [1,0,1,1], [1,0,1,0], [0,1,2,2], [2,1,2,2], [2,1,2,0] and the
      (negative) value is subtracted from [0,1,2,0] and [0,0,1,0].

    NOTE(review): for a negative value the six entries therefore shrink and
    the two dimer entries grow. The previous description ("homogeneous
    dimers are converted to tetramers instead") suggests the opposite
    direction -- confirm which one is intended.

    Parameters:
    -----------
    motif_vector : MotifVector
    zebra_tetramer_concentration : float

    Returns:
    --------
    motif_vector : MotifVector
    """
    motif_array = _motif_vector_as_array(motif_vector)

    if zebra_tetramer_concentration >= 0.:
        half_weight_entries = (
            (0,0,2,1), (1,1,1,2), (2,0,2,0),
            (0,1,1,2), (2,0,2,1), (1,1,1,0),
        )
        dimer_entries = ((0,0,2,0), (0,1,1,0))
    else:
        half_weight_entries = (
            (0,0,1,1), (1,0,1,1), (1,0,1,0),
            (0,1,2,2), (2,1,2,2), (2,1,2,0),
        )
        dimer_entries = ((0,1,2,0), (0,0,1,0))

    for entry in half_weight_entries:
        motif_array[entry] += .5*zebra_tetramer_concentration
    for entry in dimer_entries:
        motif_array[entry] -= zebra_tetramer_concentration

    make_motif_vector = MotifVector(
        motif_vector.motiflength, motif_vector.alphabet, motif_vector.unit
    )
    return make_motif_vector(
        _array_to_motif_vector_dct(
            motif_array, motif_vector.motiflength, motif_vector.alphabet
        )
    )
def _motif_vector_dct_with_fields(motif_vector_dct, alphabet : list) -> dict:
    """
    returns a dictionary in which every value of motif_vector_dct is an
    ift.Field.

    Values that already are ift.Field instances are passed through
    unchanged. Every other value is converted with np.asarray and wrapped
    in a field over a HammingSpace whose second argument is the number of
    array dimensions.

    The input dictionary is not modified; a new dictionary is returned.

    Parameters:
    -----------
    motif_vector_dct : dict
        maps motif categories to arrays (or array-likes) or ift.Field
    alphabet : list

    Returns:
    --------
    motif_vector_dct : dict
    """
    fields = {}
    for key, value in motif_vector_dct.items():
        if not isinstance(value, ift.Field):
            # convert first, so that array-likes without a .shape attribute
            # (e.g. nested lists) are accepted as well
            values = np.asarray(value)
            value = ift.Field(
                ift.DomainTuple.make(HammingSpace(alphabet, values.ndim)),
                values)
        fields[key] = value
    return fields


def _create_empty_motif_vector_dct(motiflength : int,
        alphabet : list = None
        ) -> dict:
    """
    returns a dictionary with a zero-filled array for every motif category.

    Parameters:
    -----------
    motiflength : int
    alphabet : list (optional)
        default : ['a','b']

    Returns:
    --------
    empty_motif_vector : dict
        one array per strand length 1, ..., motiflength-2 and one array for
        each of the last three categories of
        _return_motif_categories(motiflength=motiflength)
    """
    if alphabet is None:
        alphabet = ['a','b']
    number_of_letters = len(alphabet)
    empty_motif_vector = {}

    # the first default category serves as a name template that is
    # formatted with the strand length
    strand_category_template = _return_motif_categories()[0]
    for strandlength in range(1,motiflength-1):
        category = strand_category_template.format(strandlength)
        empty_motif_vector[category] = np.zeros((number_of_letters,)*strandlength)

    motif_categories = _return_motif_categories(motiflength=motiflength)
    for category in motif_categories[-3:]:
        # only the second-to-last category has the full motiflength,
        # the other two have one dimension less
        category_length = motiflength-1 + (category==motif_categories[-2])
        empty_motif_vector[category] = np.zeros((number_of_letters,)*category_length)
    return empty_motif_vector
def isinstance_motifvector(obj : object,
        print_statements : bool = True
        ) -> bool:
    """
    checks whether obj has the structure of a MotifVector.

    An object is accepted if
    - it is a namedtuple (a tuple with '_asdict' and '_fields'),
    - its fields are exactly 'motifs', 'motiflength', 'alphabet',
      'number_of_letters' and 'unit',
    - every key of obj.motifs is one of the categories returned by
      _return_motif_categories(motiflength=obj.motiflength).

    Objects that are not namedtuples or that lack fields are rejected with
    False instead of raising an AttributeError.

    Parameters:
    -----------
    obj : object
    print_statements : bool (optional)
        whether the reason for a rejection is printed
        default : True

    Returns:
    --------
    is_motif_vector : bool
    """
    def _reject(message : str) -> bool:
        if print_statements:
            print(message)
        return False

    # the structural check has to come first, everything below relies on
    # the namedtuple interface
    is_namedtuple = (
        isinstance(obj, tuple)
        and hasattr(obj, '_asdict')
        and hasattr(obj, '_fields')
    )
    if not is_namedtuple:
        return _reject('Not a MotifVector, object is not a namedtuple.')

    required_keys = ['motifs','motiflength', 'alphabet', 'number_of_letters', 'unit']
    present_keys = list(obj._fields)
    for key in required_keys:
        if key not in present_keys:
            return _reject('Not a MotifVector, missing key: {}.'.format(key))
    for key in present_keys:
        if key not in required_keys:
            return _reject('Not a MotifVector, unexpected key: {}.'.format(key))

    if not hasattr(obj.motifs, 'keys'):
        return _reject('Not a MotifVector, motifs field has no keys.')

    motif_categories = list(_return_motif_categories(motiflength=obj.motiflength))
    for key in obj.motifs.keys():
        if key not in motif_categories:
            return _reject(
                'Not a MotifVector, unexpected key in motifs field: {}.'.format(key)
            )
    return True
def are_compatible_motif_vectors(mv1 : MotifVector, mv2 : MotifVector) -> bool:
    """
    checks whether two MotifVectors agree in motiflength, alphabet and unit.

    The reason for a rejection is printed.

    Parameters:
    -----------
    mv1 : MotifVector
    mv2 : MotifVector

    Returns:
    --------
    are_compatible : bool
        False if one of the objects is not a MotifVector or if one of the
        three properties differs, else True
    """
    for candidate in (mv1, mv2):
        if not isinstance_motifvector(candidate):
            print('Object is not a MotifVector')
            return False

    first_properties = mv1._asdict()
    second_properties = mv2._asdict()
    for key in ('motiflength', 'alphabet', 'unit'):
        # the product reduces an element-wise comparison to a single value
        properties_match = np.prod(first_properties[key]==second_properties[key])
        if not properties_match:
            print('MotifVectors not compatible: {} mismatch.'.format(key))
            return False
    return True
def _motif_indices_in_motifs_array(motif_vector_array : np.ndarray, motiflength : int, is_beginning : bool ) -> tuple: """ returns an index-tuple for the array in which the motif_sequence_array fits. Parameters: ----------- motif_vector_array : np.ndarray, motiflength : int, is_beginning : bool Returns: -------- motif_indices : tuple """ if len(motif_vector_array.shape)==motiflength: is_beginning = False strandlength = len(motif_vector_array.shape) number_of_letters = motif_vector_array.shape[0] current_indices = (0,)*int(is_beginning) current_indices += (slice(1,number_of_letters+1),)*int(not is_beginning) current_indices += (slice(0,number_of_letters),) current_indices += (slice(1,number_of_letters+1),)*int( strandlength -1 -int(not is_beginning) ) current_indices += (0,)*(motiflength-len(current_indices)) return current_indices
def categories_indices(motiflength : int,
        alphabet : list
        ) -> dict:
    """
    returns a dictionary that maps every motif category to the index-tuple
    of its block in the motif array.

    Parameters:
    -----------
    motiflength : int
    alphabet : list

    Returns:
    --------
    categories_indices : dict
        keys are the first motiflength-2 and the last three entries of
        _return_motif_categories(motiflength), values are tuples of length
        motiflength
    """
    keys = _return_motif_categories(motiflength)
    number_of_letters = len(alphabet)
    letter_spot = slice(0,number_of_letters)
    occupied_spot = slice(1,number_of_letters+1)
    indices_by_category = {}

    # strands shorter than motiflength-1: empty first spot, one letter spot,
    # strandlength-1 further occupied spots, zeros for the rest
    for strandlength in range(1,motiflength-1):
        strand_indices = (0, letter_spot) + (occupied_spot,)*(strandlength-1)
        strand_indices += (0,)*(motiflength-len(strand_indices))
        indices_by_category[keys[strandlength-1]] = strand_indices

    for key in keys[-3:]:
        # only the third-to-last category has an empty first spot and only
        # the second-to-last category has the full motiflength
        is_beginning = (key==keys[-3])
        strandlength = motiflength-int(key!=keys[-2])
        motif_indices = (0,)*int(is_beginning)
        motif_indices += (occupied_spot,)*int(not is_beginning)
        motif_indices += (letter_spot,)*(motiflength>2)
        motif_indices += (occupied_spot,)*int(
            strandlength - int(motiflength>2) - int(not is_beginning)
        )
        motif_indices += (0,)*(motiflength-len(motif_indices))
        indices_by_category[key] = motif_indices
    return indices_by_category
def category_indices(motif_category : str,
        motiflength : int,
        alphabet : list
        ) -> tuple:
    """
    returns the array-index-tuple for a single motif_category.

    Parameters:
    -----------
    motif_category : str
        has to be a key of categories_indices(motiflength, alphabet),
        otherwise a KeyError is raised
    motiflength : int
    alphabet : list

    Returns:
    --------
    category_indices : tuple
    """
    indices_by_category = categories_indices(motiflength, alphabet)
    return indices_by_category[motif_category]
def _transform_sequence_array_to_motif_array(
        sequence_array,
        motiflength : int,
        sequence_is_beginning : bool = True
        ) -> np.ndarray:
    """
    embeds an array that only tracks letters (occupied nucleotides) into a
    zero-filled array that explicitly tracks empty spots with index zero.
    Note that the second spot is always a letter, though.

    Parameters:
    -----------
    sequence_array : nd-array
    motiflength : int
    sequence_is_beginning : boolean (optional)
        whether the sequence is a beginning, i.e. the first spot is empty,
        else it is considered as end or continuation.
        Treated as False if sequence_array has motiflength dimensions.
        default : True

    Returns:
    --------
    motif_array : nd-array
        zeros everywhere except for the block that holds sequence_array
    """
    strandlength = len(sequence_array.shape)
    number_of_letters = sequence_array.shape[0]
    if strandlength == motiflength:
        # all spots are letters, so the first spot cannot be empty
        sequence_is_beginning = False

    # the second axis has no empty spot (only for motiflength > 2)
    letter_only_axes = int(motiflength>2)
    empty_first_spot = int(sequence_is_beginning)

    motif_array = np.zeros(
        (number_of_letters+1,)
        + (number_of_letters,)*letter_only_axes
        + (number_of_letters+1,)*(motiflength-1-letter_only_axes)
    )

    target_indices = (0,)*empty_first_spot
    target_indices += (slice(1,None),)*(1-empty_first_spot)
    target_indices += (slice(None),)*letter_only_axes
    target_indices += (slice(1,None),)*(
        strandlength-1-letter_only_axes+empty_first_spot
    )
    # trailing spots stay empty
    target_indices += (0,)*(motiflength-len(target_indices))

    motif_array[target_indices] = sequence_array
    return motif_array


def _motif_vector_as_array(motif_vector : MotifVector
        ) -> np.ndarray:
    """
    transforms a motif vector into a single numpy-array by summing the
    embedded arrays of all motif categories.

    Parameters:
    -----------
    motif_vector : MotifVector

    Returns:
    --------
    motif_array : np.ndarray
    """
    motiflength = motif_vector.motiflength
    number_of_letters = motif_vector.number_of_letters
    motif_categories = _return_motif_categories(motiflength)

    letter_only_axes = int(motiflength>2)
    motif_array = np.zeros(
        (number_of_letters+1,)
        + (number_of_letters,)*letter_only_axes
        + (number_of_letters+1,)*(motiflength-1-letter_only_axes)
    )

    # all but the last two categories are embedded as beginnings,
    # the last two as continuation / end
    category_groups = (
        (motif_categories[:-2], True),
        (motif_categories[-2:], False),
    )
    for categories, is_beginning in category_groups:
        for motif_category in categories:
            motif_array += _transform_sequence_array_to_motif_array(
                motif_vector.motifs[motif_category].val,
                motiflength,
                sequence_is_beginning = is_beginning
            )
    return motif_array


def _array_to_motif_vector_dct(motif_vector_array : np.ndarray,
        motiflength : int,
        alphabet : list,
        ) -> dict:
    """
    splits a motif array into a dictionary with one sub-array per motif
    category.

    Note: the sub-arrays are obtained by indexing motif_vector_array; for a
    numpy input they are views and not copies.

    Parameters:
    -----------
    motif_vector_array : np.ndarray
    motiflength : int
    alphabet : list

    Returns:
    --------
    motif_vector_dct : dict

    Raises:
    -------
    ValueError
        if motif_vector_array does not have motiflength dimensions or if
        its first axis does not have len(alphabet)+1 entries
    """
    if motiflength != len(motif_vector_array.shape):
        raise ValueError("motiflength inconsistent with array shape")
    number_of_letters = len(alphabet)
    if number_of_letters != motif_vector_array.shape[0]-1:
        raise ValueError("alphabet shape ({alphabet_shape}) inconsistent with motif_vector_array shape ({motif_vector_array_shape})".format(
            alphabet_shape = len(alphabet),
            motif_vector_array_shape = motif_vector_array.shape
            ))

    letter_only_axes = int(motiflength>2)
    motif_vector_dct = {}

    # categories with an empty first spot (index 0 on the first axis)
    for strandlength in range(1,motiflength):
        default_categories = _return_motif_categories()
        if strandlength == (motiflength-1):
            category = default_categories[-3]
        else:
            category = default_categories[0].format(strandlength)
        selection = (0,) + (slice(None),)*letter_only_axes
        selection += (slice(1,None),)*(strandlength-letter_only_axes)
        selection += (0,)*(motiflength-len(selection))
        motif_vector_dct[category] = motif_vector_array[selection]

    # the last two categories have an occupied first spot
    motif_categories = _return_motif_categories(motiflength=motiflength)
    for category in motif_categories[-2:]:
        # the second-to-last category occupies one spot more
        has_full_length = int(category==motif_categories[-2])
        selection = (slice(1,None),) + (slice(None),)*letter_only_axes
        selection += (slice(1,None),)*(
            motiflength-2-letter_only_axes+has_full_length
        )
        selection += (0,)*(motiflength-len(selection))
        motif_vector_dct[category] = motif_vector_array[selection]

    return motif_vector_dct
def save_motif_vector(archive_path : str,
        motif_vector : MotifVector
        ) -> None:
    """
    writes a motif vector to archive_path.

    The motif array is stored under archive_path+'motifs' and the
    properties motiflength, alphabet and unit under
    archive_path+'properties.yaml'.

    Note: the file names are appended to archive_path by plain string
    concatenation, no path separator is inserted.

    Parameters:
    -----------
    archive_path : str
    motif_vector : MotifVector
    """
    create_directory_path_if_not_already_existing(archive_path)

    array_target = archive_path+'motifs'
    properties_target = archive_path+'properties.yaml'

    jnp.save(array_target, _motif_vector_as_array(motif_vector))

    with open(properties_target,'w') as yaml_file:
        properties = {
            'motiflength':motif_vector.motiflength,
            'alphabet':motif_vector.alphabet,
            'unit':transform_unit_to_dict(motif_vector.unit),
        }
        yaml.dump(properties, yaml_file, indent=4)
def load_motif_vector(archive_path : str
        ) -> MotifVector:
    """
    loads a motif vector from the files archive_path+'properties.yaml' and
    archive_path+'motifs.npy'.

    Note: the file names are appended to archive_path by plain string
    concatenation, no path separator is inserted.

    Parameters:
    -----------
    archive_path : str

    Returns:
    --------
    motif_vector : MotifVector
    """
    dct_filename = archive_path+'properties'+'.yaml'
    array_filename = archive_path+'motifs'+'.npy'

    with open(dct_filename, 'r') as yaml_file:
        motif_vector_properties = yaml.safe_load(yaml_file)
    motif_vector_properties['unit'] = transform_dict_to_unit(motif_vector_properties['unit'])
    makeMotifVector = MotifVector(**motif_vector_properties)

    # Load with numpy instead of jax.numpy: jnp.load converts the result to
    # a jax array, which is cast to 32 bit unless jax is configured for
    # 64 bit, so stored float64 values would silently lose precision.
    motif_array = np.load(array_filename)

    return makeMotifVector(
        _array_to_motif_vector_dct(
            motif_array,
            motif_vector_properties['motiflength'],
            motif_vector_properties['alphabet']
        )
    )