Version 4.1.5
Main Page | Class Hierarchy | Class List | File List | Class Members | Related Pages

PhasedMarkov Class Reference

Phased Markov modelling, estimation and simulation. More...

#include <seqpp/PhasedMarkov.h>

Inheritance diagram for PhasedMarkov:

Markov PhasedMTDMarkov PhasedPMarkov PhasedVLMarkov MTDMarkov PMarkov VLMarkov List of all members.

Public Member Functions

 PhasedMarkov (const string &markov_file, bool calc_rank=false)
 Constructor 1 : read a configuration file.
 PhasedMarkov (const SequenceSet &seqset, short phase, short initial_phase=0, bool calc_rank=false, const string &prior_alpha_file=string())
 Constructor 2 : Estimate the transition matrices on the sequences of seqset.
 PhasedMarkov (const Sequence &seq, short phase, short initial_phase=0, bool calc_rank=false, const string &prior_alpha_file=string())
 Constructor 3 : Estimate the transition matrices on the sequence seq.
 PhasedMarkov (const PhasedMarkov &phm)
 Constructor 4 : Copy constructor.
 PhasedMarkov ()
 Constructor 5 : Default constructor.
 PhasedMarkov (short size, short order, short phase, bool alloc=true, const string &prior_alpha_file=string())
 Constructor 6 : Minimal Constructor.
 PhasedMarkov (const PhasedMarkov &M1, const PhasedMarkov &M2, const float p)
 Constructor 7 : Creation of a "mixed" Markov chain M = p*M1 + (1-p)*M2.
 PhasedMarkov (const SequenceSet &seqset, const vector< int > &Indseq, short phase, short initial_phase=0, bool calc_rank=false, const string &prior_alpha_file=string())
 Constructor 8 : Estimation of the transition matrix based on the sequences of seqset given in Indseq.
 PhasedMarkov (const gsl_rng *r, short size, short order, short phase, bool calc_rank=false)
 Constructor 9 : random markov matrices.
 PhasedMarkov (unsigned long **count, short size, short order, short phase, short initial_phase=0, bool calc_rank=false, const string &prior_alpha_file=string())
 Constructor 10 : Estimate the transition matrices on a word-count.
virtual ~PhasedMarkov ()
 Destructor.
template<class TSeq>
void estimate (const TSeq &tseq, short phase, short initial_phase, unsigned long beg, unsigned long end, bool calc_rank=false, bool count_again=true)
 Estimate the transition matrices on the sequence/sequenceset tseq.
void estimate (const string &count_file, bool calc_rank=false)
 Estimate the transition matrices from a file containing the count.
void estimate (unsigned long **count, bool decal_required, bool calc_rank=false)
 Estimate the transition matrices from a word count.
const double ** markov_matrices () const
 access to the markov matrix(ces)
const double * markov_matrix (short numphase) const
 access to the numphase-th markov matrix
void draw_markov_matrices (const gsl_rng *r)
 draw at random the markov matrices
virtual void new_markov_matrices ()
 allocate the memory for markov matrices
virtual void free_markov_matrices ()
 free the memory allocated for markov matrices
double total_variation (const PhasedMarkov &M)
 Total variation distance between *this and M.
void compute_stat_laws (bool force=false)
 Compute the stationary laws.
const double * stat_law (short numphase=0) const
 access to the stationary distribution in phase numphase
void free_stat_laws ()
 free the memory allocated for stationary laws
void compute_init_law (double *MuInit, const SequenceSet &seqset) const
 Get the empirical relative frequency of the first order+1 letters on the set of sequences "seqset".
virtual int compute_rank ()
 Computes the rank of convergence of the Markov Chain.
virtual long nb_parameters () const
 return the number of effective parameters
void link_to_translator (const Translator &trans)
 link to a Translator object to use proba methods with strings
double proba_c (const string &word, Coder &coder, short numphase=0) const
 Stationary probability of a word (of size greater than _order), conditional on its first letters (call link_to_translator first!).
double proba (const string &word, Coder &coder, short numphase=0) const
 Stationary probability of a word.
double proba_c (const vector< short > &word, Coder &coder, short numphase=0) const
 Stationary probability of a word (of size greater than _order), conditional on its first letters.
double proba (const vector< short > &word, Coder &coder, short numphase=0) const
 Stationary probability of a word.
double proba_c (long word, int lw=-1, long jump=-1, short numphase=0) const
 Stationary probability of a word (of size greater than _order), conditional on its first letters.
double proba (long word, int lw=-1, long jump=-1, short numphase=0) const
 Stationary probability of a word.
double proba_c (const long *seq, long tbeg, long tend, short numphase=0) const
 Stationary probability of the word seq[tbeg...tend] (of size greater than _order), conditional on its first letters, with seq[tend] in phase numphase.
double proba (const long *seq, long tbeg, long tend, short numphase=0) const
 Stationary probability of the word seq[tbeg...tend] with seq[tend] in phase numphase.
double log_likelihood (const SequenceSet &seqset, short initial_phase=0, short numphase=-1) const
 Log-likelihood of a set of sequences
double log_ratio_likelihood (const SequenceSet &seqset, const PhasedMarkov &M, short initial_phase1=0, short initial_phase2=0) const
 Calculation of the logarithm of the ratio of the probabilities of observing "seqset" under "this" distribution and under "M".
double log_likelihood (const Sequence &seq, short initial_phase=0, short numphase=-1) const
 loglikelihood of a sequence
double log_ratio_likelihood (const Sequence &seq, const PhasedMarkov &M, short initial_phase1=0, short initial_phase2=0) const
 Calculation of the logarithm of the ratio of the probability of observing "seq" under "this" distribution and "M".
template<class TSeq>
double BIC (const TSeq &tseq, short initial_phase=0) const
 BIC of sequences (BIC = -2*loglikelihood + nbparam*log(length)).
template<class TSeq>
double AIC (const TSeq &tseq, short initial_phase=0) const
 AIC of a set of sequences (AIC = -2*loglikelihood + 2*nbparam).
template<class TSeq1, class TSeq2>
double post_log_likelihood (const TSeq1 &tseq_train, const TSeq2 &tseq_eval, bool force=false, short initial_phase_train=0, short initial_phase_eval=0)
 compute the mean posterior likelihood over the parameters
void print (const string &FileOut)
 Print a summary of the object.
void print (ofstream &Out) const
 Print a summary of the object.
int tell_size () const
 Returns the alphabet size.
int tell_rank () const
 Returns the convergence rank.
int tell_order () const
 Returns the order.
int tell_phase () const
 Returns the phase.
int nMu () const
 size of the stat law vector
int nPi () const
 size of the matrix
double Pi (int index, int p=0) const
 Access to Markov matrix Pi.
double & operator() (int index, int p=0)
 () operator for Markov matrix Pi elements
double Mu (int index, int p=0) const
 Access to stationary vector Mu elements.
bool isPis () const
 _Pis != NULL ?
bool isMus () const
 _Mus != NULL ?
short nextPhase (short p) const
 Give the phase following p.
short prevPhase (short p) const
 Give the phase preceding p.
bool Stochasticity ()
 Verify the stochasticity of the _Pis[] and rescale them if necessary.
void file_to_count (const string &src_file, unsigned long **dest_count)
 fill a count from a file

Protected Member Functions

bool isNextPhase () const
 _nextPhase != NULL ?
bool isPrevPhase () const
 _prevPhase != NULL ?

Protected Attributes

short _phase
 Phase of the model.
double ** _Pis
 Pointer to "Matrices" (in vector format) of transition probabilities for each phase.
double ** _containers
 Container of "Matrices" (in vector format) of transition probabilities for each phase.
double ** _Mus
 Vector of stationary probabilities for each phase.
short _size
 Size of the alphabet.
short _order
 Order of the model (the same at each phase).
long _nPi
 Dim of Pi :_size^(_order+1).
long _nMu
 Dim of Mu :_size^_order.
long _nb_param
 number of effective parameters
int _rank
 How many steps to converge to Mu ?
long _jump
 jump to the codes of _order+1 letters when Sequence-like code
short * _nextPhase
 (Optimization) For each phase, give the next phase
short * _prevPhase
 (Optimization) For each phase, give the previous phase
const Translator * _trans
 link to a translator object for the use of proba methods
double _postloglike
 current posterior likelihood from a training [set of] sequence
vector< vector< double > > _prior_alpha
 prior on the counts, one value per alphabet element, and for each phase

Detailed Description

Phased Markov modelling, estimation and simulation.

This is a generalization of a Markov chain that uses different transition matrices depending on the position considered in the sequence. The number of phases is variable.
For example, if we consider 3 phases and denote by Pi1, Pi2 and Pi3 the three transition matrices, the Markov sequences are generated by cycling through the matrix indices 123123123123... In DNA modelling (genomics), this is useful to take into account the fact that a coding region is read in blocks of 3 bases. The order of the Markov model, i.e. the number of previous states necessary to determine the distribution of the current state, is variable. It is assumed here that this order is the same in all the phases.

Methods are implemented for Markovian transition matrix estimation, stationary distribution computation, word probabilities, total variation distance between two Markovian matrices, and more. The efficiency of the eigenproblem computations is ensured by the use of the implicitly restarted Arnoldi algorithm.
Simulations are also possible.


Constructor & Destructor Documentation

PhasedMarkov::PhasedMarkov ( const SequenceSet &  seqset,
short  phase,
short  initial_phase = 0,
bool  calc_rank = false,
const string &  prior_alpha_file = string()
)

Constructor 2 : Estimate the transition matrices on the sequences of seqset.

Parameters:
seqset set of sequences for estimation
phase selected phase
initial_phase phase of the first element of each sequence
calc_rank compute the convergence rank if true
prior_alpha_file file containing the alpha for the a priori law, one value per alphabet element, and for each phase (separated by a "#Phase i")

PhasedMarkov::PhasedMarkov ( const Sequence &  seq,
short  phase,
short  initial_phase = 0,
bool  calc_rank = false,
const string &  prior_alpha_file = string()
)

Constructor 3 : Estimate the transition matrices on the sequence seq.

Parameters:
seq sequence for estimation
phase selected phase
initial_phase phase of the first element of the sequence
calc_rank compute the convergence rank if true
prior_alpha_file file containing the alpha for the a priori law, one value per alphabet element, and for each phase (separated by a "#Phase i")

PhasedMarkov::PhasedMarkov ( short  size,
short  order,
short  phase,
bool  alloc = true,
const string &  prior_alpha_file = string()
)

Constructor 6 : Minimal Constructor.

Initialises the constants of the model but not the matrices nor the stat laws

Parameters:
size alphabet size
order markovian order
phase selected phase
alloc true for matrices memory allocation
prior_alpha_file file containing the alpha for the a priori law, one value per alphabet element, and for each phase (separated by a "#Phase i")

PhasedMarkov::PhasedMarkov ( const PhasedMarkov &  M1,
const PhasedMarkov &  M2,
const float  p
)

Constructor 7 : Creation of a "mixed" Markov chain M = p*M1 + (1-p)*M2.

Parameters:
M1 first Markov chain object
M2 second Markov chain object
p weight of M1 in the resulting M (with 0<=p<=1)

PhasedMarkov::PhasedMarkov ( const SequenceSet &  seqset,
const vector< int > &  Indseq,
short  phase,
short  initial_phase = 0,
bool  calc_rank = false,
const string &  prior_alpha_file = string()
)

Constructor 8 : Estimation of the transition matrix based on the sequences of seqset given in Indseq.

Parameters:
seqset set of sequences for estimation
Indseq indices of the selected sequences
phase selected phase
initial_phase phase of the first element of each sequence
calc_rank compute the convergence rank if true
prior_alpha_file file containing the alpha for the a priori law, one value per alphabet element, and for each phase (separated by a "#Phase i")

PhasedMarkov::PhasedMarkov ( const gsl_rng *  r,
short  size,
short  order,
short  phase,
bool  calc_rank = false
)

Constructor 9 : random markov matrices.

Parameters:
r gsl random generator
size alphabet size
order markovian order
phase selected phase
calc_rank compute the convergence rank if true
GSL usage example:
    const gsl_rng_type * T;
    // Choose a default generator and seed
    // from environment variables
    gsl_rng_env_setup();
    // Newly created instance of the generator
    T = gsl_rng_default;
    gsl_rng * r = gsl_rng_alloc (T);
    // Initialize/seed the random number generator
    gsl_rng_set( r, (long)getpid() );
    ...
    ...
    gsl_rng_free( r );

PhasedMarkov::PhasedMarkov ( unsigned long **  count,
short  size,
short  order,
short  phase,
short  initial_phase = 0,
bool  calc_rank = false,
const string &  prior_alpha_file = string()
)

Constructor 10 : Estimate the transition matrices on a word-count.

Parameters:
count count of all the coded words (base size) of size order+1 for each phase, for estimation
size alphabet size
order markovian order
phase selected phase
initial_phase phase of the first element of each sequence
calc_rank compute the convergence rank if true
prior_alpha_file file containing the alpha for the a priori law, one value per alphabet element, and for each phase (separated by a "#Phase i")


Member Function Documentation

template<class TSeq>
double PhasedMarkov::AIC ( const TSeq &  tseq,
short  initial_phase = 0
) const [inline]
 

AIC of a set of sequences (AIC = -2*loglikelihood + 2*nbparam).

Parameters:
tseq a sequence or a set of sequences
initial_phase phase of the first element of each sequence

template<class TSeq>
double PhasedMarkov::BIC ( const TSeq &  tseq,
short  initial_phase = 0
) const [inline]
 

BIC of sequences (BIC = -2*loglikelihood + nbparam*log(length)).

Parameters:
tseq a sequence or a set of sequences
initial_phase phase of the first element of each sequence

void PhasedMarkov::draw_markov_matrices ( const gsl_rng *  r  ) 
 

draw at random the markov matrices

Parameters:
r gsl random generator
GSL usage example:
    const gsl_rng_type * T;
    // Choose a default generator and seed
    // from environment variables
    gsl_rng_env_setup();
    // Newly created instance of the generator
    T = gsl_rng_default;
    gsl_rng * r = gsl_rng_alloc (T);
    // Initialize/seed the random number generator
    gsl_rng_set( r, (long)getpid() );
    ...
    ...
    gsl_rng_free( r );

void PhasedMarkov::estimate ( unsigned long **  count,
bool  decal_required,
bool  calc_rank = false
)

Estimate the transition matrices from a word count.

Parameters:
count count of all the coded words (base size) of size order+1 for each phase, for estimation
decal_required necessary when using a count of words ranging from 1-words to (_order+1)-words
calc_rank compute the convergence rank if true

void PhasedMarkov::estimate ( const string &  count_file,
bool  calc_rank = false
) [inline]


Estimate the transition matrices from a file containing the count.

Parameters:
count_file file containing the count in the adapted format
calc_rank compute the convergence rank if true

Reimplemented in Markov.

template<class TSeq>
void PhasedMarkov::estimate ( const TSeq &  tseq,
short  phase,
short  initial_phase,
unsigned long  beg,
unsigned long  end,
bool  calc_rank = false,
bool  count_again = true
) [inline]


Estimate the transition matrices on the sequence/sequenceset tseq.

Parameters:
tseq sequence/sequenceset for estimation
phase selected phase
initial_phase phase of the first element of each sequence
beg begin position in sequence(s) if subsequences
end end position in sequence(s) if subsequences
calc_rank compute the convergence rank if true
count_again false if the word count was already performed before calling this method

void PhasedMarkov::file_to_count ( const string &  src_file,
unsigned long **  dest_count
)

fill a count from a file

Parameters:
src_file file containing the count, phase per phase, in alphabetical order
dest_count destination count (must be _phase X _nPi)

double PhasedMarkov::log_likelihood ( const Sequence &  seq,
short  initial_phase = 0,
short  numphase = -1
) const


Log-likelihood of a sequence

Parameters:
seq sequence
initial_phase phase of the first element of the sequence
numphase likelihood restricted to phase numphase only, or -1 for the sum over all the phases

double PhasedMarkov::log_likelihood ( const SequenceSet &  seqset,
short  initial_phase = 0,
short  numphase = -1
) const


Log-likelihood of a set of sequences

Parameters:
seqset set of sequences
initial_phase phase of the first element of each sequence
numphase likelihood restricted to phase numphase only, or -1 for the sum over all the phases

double PhasedMarkov::log_ratio_likelihood ( const Sequence &  seq,
const PhasedMarkov &  M,
short  initial_phase1 = 0,
short  initial_phase2 = 0
) const


Calculation of the logarithm of the ratio of the probabilities of observing "seq" under "this" distribution and under "M".

Parameters:
seq sequence
M alternative Markov chain
initial_phase1 phase of the first element of the sequence considered in model *this
initial_phase2 phase of the first element of the sequence considered in model M
REMARK: NO verification is done on the compatibility of the Markov chains.
CAREFUL: this computation is performed by scanning the sequence, to avoid problems with too-low likelihood values.

double PhasedMarkov::log_ratio_likelihood ( const SequenceSet &  seqset,
const PhasedMarkov &  M,
short  initial_phase1 = 0,
short  initial_phase2 = 0
) const


Calculation of the logarithm of the ratio of the probabilities of observing "seqset" under "this" distribution and under "M".

Parameters:
seqset set of sequences
M alternative Markov chain
initial_phase1 phase of the first element of each sequence considered in model *this
initial_phase2 phase of the first element of each sequence considered in model M
REMARK: NO verification is done on the compatibility of the Markov chains.
CAREFUL: this computation is performed by scanning the sequences, to avoid problems with too-low likelihood values.

double PhasedMarkov::Mu ( int  index,
int  p = 0
) const [inline]


Access to stationary vector Mu elements.

Parameters:
index index of the word
p selected phase

double& PhasedMarkov::operator() ( int  index,
int  p = 0
) [inline]
 

() operator for Markov matrix Pi elements

Parameters:
index index of the word
p selected phase

double PhasedMarkov::Pi ( int  index,
int  p = 0
) const [inline]
 

Access to Markov matrix Pi.

Parameters:
index index of the word
p selected phase

template<class TSeq1, class TSeq2>
double PhasedMarkov::post_log_likelihood ( const TSeq1 &  tseq_train,
const TSeq2 &  tseq_eval,
bool  force = false,
short  initial_phase_train = 0,
short  initial_phase_eval = 0
) [inline]
 

compute the mean posterior likelihood over the parameters

Parameters:
tseq_train [set of] sequence for the training step
tseq_eval [set of] sequence for the evaluation step
force "true" to force the re-calculation on tseq_train. Default => "false"
initial_phase_train phase of the first element of each sequence for the training step
initial_phase_eval phase of the first element of each sequence for the evaluation step

void PhasedMarkov::print ( const string &  FileOut  )  [inline]
 

Print a summary of the object.

The estimation results can be saved in such a representation:

    # 1 <- Order of the phased Markov chain
    # 2 <- Phase
    # 4 <- Alphabet size
    # 19 steps <- Convergence to the stationnary distribution
    # Phase n°0
    # Transition matrix:
    0.3945322543    0.1652811616    0.1535033485    0.2866832356
    etc...........
    # Stationnary Probability:
    0.3127105148    0.2114684268    0.1783495332    0.2974715251
    # Phase n°1
    # Transition matrix:
    0.3923961961    0.163516403     0.1521005152    0.2919868858
    etc................
    # Stationnary Probability:
    0.3135417652    0.2089660861    0.1771006767    0.300391472

double PhasedMarkov::proba ( const long *  seq,
long  tbeg,
long  tend,
short  numphase = 0
) const


Stationary probability of the word seq[tbeg...tend] with seq[tend] in phase numphase.

Parameters:
seq Sequence-like coded sequence (see Sequence). The Markov order used for the code must be the same as _order
tbeg begin position of the word
tend end position of the word
numphase phase of the last letter of word

double PhasedMarkov::proba ( long  word,
int  lw = -1,
long  jump = -1,
short  numphase = 0
) const


Stationary probability of a word.

Parameters:
word word as a Sequence-coded-like integer (see Sequence)
lw length of the word. Default => order+1
jump Sequence-coded-like jump (see Sequence). Default => jump[order]
numphase phase of the last letter of word

double PhasedMarkov::proba ( const vector< short > &  word,
Coder &  coder,
short  numphase = 0
) const


Stationary probability of a word.

Parameters:
word word as a vector of short
coder required Coder object, from a PrimaryCount object for example
numphase phase of the last letter of word

double PhasedMarkov::proba ( const string &  word,
Coder &  coder,
short  numphase = 0
) const


Stationary probability of a word.

Parameters:
word word as a string (a Translator object is required to convert the string to int)
coder required Coder object, from a PrimaryCount object for example
numphase phase of the last letter of word

double PhasedMarkov::proba_c ( const long *  seq,
long  tbeg,
long  tend,
short  numphase = 0
) const


Stationary probability of the word seq[tbeg...tend] (of size greater than _order), conditional on its first letters, with seq[tend] in phase numphase.

Parameters:
seq Sequence-like coded sequence (see Sequence). The Markov order used for the code must be the same as _order
tbeg begin position of the word
tend end position of the word
numphase phase of the last letter of word

double PhasedMarkov::proba_c ( long  word,
int  lw = -1,
long  jump = -1,
short  numphase = 0
) const


Stationary probability of a word (of size greater than _order), conditional on its first letters.

Parameters:
word word as a Sequence-coded-like integer (see Sequence)
lw length of the word. Default => order+1
jump Sequence-coded-like jump (see Sequence). Default => jump[order]
numphase phase of the last letter of word

double PhasedMarkov::proba_c ( const vector< short > &  word,
Coder &  coder,
short  numphase = 0
) const


Stationary probability of a word (of size greater than _order), conditional on its first letters.

Parameters:
word word as a vector of short
coder required Coder object, from a PrimaryCount object for example
numphase phase of the last letter of word

double PhasedMarkov::proba_c ( const string &  word,
Coder &  coder,
short  numphase = 0
) const


Stationary probability of a word (of size greater than _order), conditional on its first letters (call link_to_translator first!).

Parameters:
word word as a string (a Translator object is required to convert the string to int)
coder required Coder object, from a PrimaryCount object for example
numphase phase of the last letter of word


The documentation for this class was generated from the following files:



Download seq++ 4.1.5
Download previous versions
Statistique & Genome Home


Generated on Thu Aug 4 18:34:05 2005 for seqpp by doxygen 1.3.9.1