# Source code for src.read.strand_motifs_trajectory

import numpy as np
from typing import Union
from .config import symbol_config as read_symbol_config
from ..obj.motif_trajectory import MotifTrajectory
from ..obj.motif_vector import MotifVector, are_compatible_motif_vectors
from ..obj.times_vector import TimesVector
from ..obj.units import Unit
from warnings import warn


[docs]
def strand_motifs_trajectory(filepaths : list,
                             alphabet : list,
                             motiflength : int = 4,
                             times_unit : Unit = None,
                             skiprows : int =2
                             ) -> MotifTrajectory:
    """
    reads from the complexes.txt of the RNAReactor simulation output and returns corresponding
    concentration vectors in motif space.

    The files are read in the given order and their motif vectors and time
    points are concatenated; files that contain no time points are skipped.

    PARAMETERS:
    -----------
    filepaths : string or list of strings
        Output file(s) of the RNAReactor simulation
    alphabet : list
        Letters that may occur in the strands; passed on to the motif
        vector construction
    motiflength : int, optional
        Length of the motifs the strands are decomposed into;
        default : 4
    times_unit : Unit, optional
        Unit attached to the time points; if None it is looked up via
        `read_symbol_config('time', unitformat=True)`
    skiprows : int, optional
        Skip the first `skiprows` lines, including comments
        when reading the file;
        default : 2

    RETURN:
    -------
    strand_motifs_trajectory : MotifTrajectory

    RAISES:
    -------
    ValueError
        if `filepaths` is neither a string nor a list, or if the motif vectors
        read from a later file are not compatible with the first motif vector
    """
    # `filepaths is str` compared against the type object itself and was never
    # true, so a single path string always ended in the ValueError below.
    if isinstance(filepaths, str):
        filepaths = [filepaths,]
    if not isinstance(filepaths, list):
        raise ValueError("filepaths needs to be list.")
    motif_vectors = []
    times = []
    if times_unit is None:
        times_unit = read_symbol_config('time', unitformat=True)

    for filepath in filepaths:
        _, current_times, sequence_trajectory = steps_and_times_and_sequence_trajectory_from_complexes_txt(
                filepath, skiprows=skiprows)
        if len(current_times) == 0:
            continue
        motif_vectors += _transform_sequence_trajectory_into_motif_vector_list(sequence_trajectory, alphabet, motiflength)
        # Any previously collected time point is enough to compare against;
        # the former `len(times)>1` skipped these checks when the earlier
        # files had contributed exactly one time point.
        if times:
            if not are_compatible_motif_vectors(motif_vectors[0],motif_vectors[-1]):
                raise ValueError("Non compatible motif vectors.")
            if (times[-1]>current_times[0]):
                warn("Times of initialized motif trajectory will not be chronologically.")
        times += list(current_times)
    times = TimesVector(times,times_unit)
    return MotifTrajectory(motif_vectors,times)



[docs]
def steps_and_times_and_complexes_from_complexes_txt(filepath : str,
        skiprows : int = 2,
        ) -> Union[np.array, np.array, list]:
    """
    Read the step, time and complexes columns of a complexes.txt file.

    Every data line is expected to hold three whitespace separated columns:
    step, total physical time and the complexes written as a nested list
    without whitespace. `None` entries inside the complexes column are
    read as the string "none".

    Parameters:
    -----------
    filepath : str,
    skiprows : int = 2,
        skip the first <skiprows> lines of the complex.txt file

    Returns:
    --------
    steps : nd-array with dtype int
    total_physical_time : nd-array with dtype np.float64
    complexes : list
        with every element of list is the list of complexes at that time
        the list of complexes at given time is again a list of the format
        complexes[time_index][complex_index]=[number_of_complex : int, structure_of_complex : str]

    For a zero-byte file (a warning is issued) and for a file without any
    data lines after the skipped rows all three return values are empty.
    """
    from json import loads

    # Probe for a zero-byte file inside a context manager so that the file
    # handle is closed again (the former bare open() was never closed).
    with open(filepath) as complexes_file:
        file_is_empty = complexes_file.readline() == ''
    if file_is_empty:
        warn("complexes.txt file is empty")
        nl = np.empty((0,3), dtype=str)
    else:
        nl = np.loadtxt(filepath, skiprows=skiprows, dtype=str, ndmin = 2)
        if nl.size == 0:
            # Only skipped header lines were present: normalise to the same
            # empty three-column table as for a zero-byte file so that the
            # column indexing below does not fail.
            nl = np.empty((0,3), dtype=str)
    steps = np.array(nl[:,0], dtype = int)
    total_physical_time = np.array(nl[:,1], dtype = np.float64)
    # The complexes column uses Python's `None`; quote it so that the
    # entry can be parsed as JSON.
    number_and_structure_of_complexes = [
            loads(complexes_entry.replace('None','"none"'))
            for complexes_entry in nl[:,2]
            ]
    return steps, total_physical_time, number_and_structure_of_complexes



[docs]
def steps_and_times_and_sequence_trajectory_from_complexes_txt(filepath : str,
        skiprows : int = 2,
        ) -> Union[np.array, np.array, list]:
    """
    Read a complexes.txt file and count, per time point, how often every
    strand sequence occurs.

    Parameters:
    -----------
    filepath : str,
    skiprows : int = 2,
        skip the first <skiprows> lines of the complex.txt file

    Returns:
    --------
    steps : nd-array with dtype int
    total_physical_time : nd-array with dtype np.float64
    sequence_number_trajectory : list of dict
        one dictionary per time point, mapping a strand sequence (str) to the
        summed number of complexes in which that strand occurs; a strand that
        occurs several times within one complex is counted once per occurrence
    """
    steps, total_physical_time, complexes_per_time = steps_and_times_and_complexes_from_complexes_txt(filepath,skiprows=skiprows)
    sequence_number_trajectory = []
    for time_idx, _ in enumerate(total_physical_time):
        counts = {}
        for complex_entry in complexes_per_time[time_idx]:
            copies = complex_entry[0]
            structure = complex_entry[1]
            upper_string, lower_string = _extract_upper_and_lower_strands_as_continuous_strings_from_structure_of_complex(structure)
            upper_strands = _cut_strands_as_continuous_string_into_separate_strand_sequences(upper_string)
            lower_strands = _cut_strands_as_continuous_string_into_separate_strand_sequences(lower_string)
            # upper strands are booked first, then the lower ones
            for strand in [*upper_strands, *lower_strands]:
                counts[strand] = counts.get(strand, 0) + copies
        sequence_number_trajectory.append(counts)
    return steps, total_physical_time, sequence_number_trajectory


def _reverse_segments(list_of_strand_segments):
    """
    Return the segments in reversed order, each segment itself reversed
    character by character.
    """
    reversed_segments = []
    for segment in list_of_strand_segments[::-1]:
        reversed_segments.append(segment[::-1])
    return reversed_segments

def _extract_upper_and_lower_segments(structure_of_complex : list) -> Union[np.array,np.array]:
    """
    Split a structure given as rows of [upper, lower] entries into the
    column of upper segments and the column of lower segments.
    """
    segment_table = np.array(structure_of_complex)
    return segment_table[:, 0], segment_table[:, 1]

def _transform_list_of_segments_to_string(list_of_segments : list,
        list_of_strings_that_are_replaced : list = (('|',0),('-',''),('X',0),('x',0),('none',''),('enon',''),('00','0'))
        ) -> str:
    """
    transforms a list_of_segments into strands as continuous string

    Every segment is stripped of blanks, of leading '5' and of trailing '3'
    characters; the segments are joined and the replacements are applied one
    after the other in the given order. A non-empty result is guaranteed to
    start and end with '0'.

    Parameters:
    -----------
    list_of_segments : list (or array) of str
        is not modified
    list_of_strings_that_are_replaced : sequence of (old, new) pairs
        both entries are converted with str() before replacing

    Returns:
    --------
    strands_as_continuous_string : str
        empty if nothing is left after the replacements
    """
    # Work on cleaned copies: writing the cleaned segments back into
    # `list_of_segments` would silently alter the caller's data.
    cleaned_segments = [
            segment.replace(' ','').lstrip('5').rstrip('3')
            for segment in list_of_segments
            ]
    strands_as_continuous_string = ''.join(cleaned_segments)
    for string_replace_tuple in list_of_strings_that_are_replaced:
        strands_as_continuous_string = strands_as_continuous_string.replace(
                str(string_replace_tuple[0]),str(string_replace_tuple[1]))
    if strands_as_continuous_string:
        if not strands_as_continuous_string.startswith('0'):
            strands_as_continuous_string = '0'+strands_as_continuous_string
        if not strands_as_continuous_string.endswith('0'):
            strands_as_continuous_string = strands_as_continuous_string +'0'
    return strands_as_continuous_string

def _extract_upper_and_lower_strands_as_continuous_strings_from_structure_of_complex(structure_of_complex : list) -> Union[str,str]:
    """
    takes structure_of_complex from the RNA Strand Reactor output
    and returns the upper strands and the lower strands, each as one
    continuous string.

    Parameters:
    ----------
    structure_of_complex : list or numpy.array

    Returns:
    --------
    upper_strands_as_continuous_string : str
    lower_strands_as_continuous_string : str
        built from the lower segments after reversing them
    """
    # segments are continuous parts of a complex without the end or beginning of a strand
    upper_segments, lower_segments = _extract_upper_and_lower_segments(structure_of_complex)
    lower_segments_reversed = _reverse_segments(lower_segments)
    upper_string = _transform_list_of_segments_to_string(upper_segments)
    lower_string = _transform_list_of_segments_to_string(lower_segments_reversed)
    return upper_string, lower_string

def _cut_strands_as_continuous_string_into_separate_strand_sequences(strands_as_continuous_string : str) -> list:
    """
    cut a continuous string of strands at every '0' and return the pieces
    enclosed between two consecutive '0' characters

    Anything in front of the first '0' or behind the last '0' is dropped.
    """
    pieces = strands_as_continuous_string.split('0')
    # the first and the last piece are not enclosed by two '0' characters
    return pieces[1:-1]

def _transform_sequence_trajectory_into_motif_vector_list(
        sequence_trajectory : list,
        alphabet : list,
        motiflength : int) -> list:
    """
    Convert every sequence vector of the trajectory into a motif vector.

    Returns:
    --------
    motif_vectors : list
        list of MotifVectors, one per entry of sequence_trajectory and in
        the same order
    """
    return [
        _transform_sequence_vector_into_motif_vector(sequence_vector,
                alphabet,
                motiflength)
        for sequence_vector in sequence_trajectory
    ]

def _translate_letters_to_numbers(alphabet:list,
        zero_is_a_letter : bool = False):
    """
    Build the dictionary that maps every letter of the alphabet to its
    position in the alphabet.

    With zero_is_a_letter set, '0' is mapped to 0 and all positions are
    shifted up by one.
    """
    translation = {'0': 0} if zero_is_a_letter else {}
    translation.update({
        letter: position + zero_is_a_letter
        for position, letter in enumerate(alphabet)
    })
    return translation

def _transform_motif_string_to_index_tuple(motif_string : str,
        alphabet : list) -> tuple:
    """
    transforms a motif into the tuple of the alphabet positions of its letters,
    e.g. 'ATC' with alphabet ['A','C','T'] becomes (0,2,1)

    PARAMETERS:
    -----------
    motif_string : str (or any iterable of letters)
    alphabet : list
        letters that may occur in the motif

    RETURNS:
    --------
    indices : tuple of int

    RAISES:
    -------
    KeyError
        if the motif contains a letter that is not part of the alphabet;
        this includes '0', because the translation is built without the
        zero letter
    """
    # The docstring has to be the first statement of the function; it used to
    # follow an assignment and was therefore not attached to the function.
    transdict = _translate_letters_to_numbers(alphabet)
    return tuple(transdict[letter] for letter in motif_string)

def _transform_sequence_vector_into_motif_vector(sequence_vector : dict,
        alphabet : list,
        motiflength : int) -> MotifVector:
    """
    Distribute the occupation numbers of whole strand sequences onto the
    motif categories.

    A strand shorter than motiflength-1 is booked as a whole under
    `motif_categories[0]` formatted with the strand length. Any other
    strand contributes its first motiflength-1 letters to
    `motif_categories[-3]`, its last motiflength-1 letters to
    `motif_categories[-1]` and every window of motiflength consecutive
    letters to `motif_categories[-2]`.

    Parameters:
    -----------
    sequence_vector : dict
        maps a strand sequence (str) to its occupation number
    alphabet : list
    motiflength : int

    Returns:
    --------
    the result of calling a `MotifVector(motiflength, alphabet, '1')`
    with the filled dictionary
    """
    from ..obj.motif_vector import _create_empty_motif_vector_dct
    motif_vector_dct = _create_empty_motif_vector_dct(motiflength,alphabet=alphabet)
    from ..domains.motif_space import _motif_categories
    motif_categories = _motif_categories()
    edge_length = motiflength - 1

    def _book(category, motif_string, amount):
        # add `amount` to the entry of `motif_string` within `category`
        index_tuple = _transform_motif_string_to_index_tuple(motif_string, alphabet)
        motif_vector_dct[category][index_tuple] += amount

    for sequence, occupation_number in sequence_vector.items():
        strandlength = len(sequence)
        if strandlength < edge_length:
            _book(motif_categories[0].format(strandlength), sequence, occupation_number)
            continue
        # beginning
        _book(motif_categories[-3], sequence[:edge_length], occupation_number)
        # ending
        _book(motif_categories[-1], sequence[-edge_length:], occupation_number)
        # continuations
        for window_start in range(strandlength - motiflength + 1):
            window = sequence[window_start:window_start + motiflength]
            _book(motif_categories[-2], window, occupation_number)
    empty_motif_vector = MotifVector(motiflength,alphabet,'1')
    return empty_motif_vector(motif_vector_dct)