IsoSpec
Loading...
Searching...
No Matches
marginalTrek++.cpp
1/*
2 * Copyright (C) 2015-2020 Mateusz Łącki and Michał Startek.
3 *
4 * This file is part of IsoSpec.
5 *
6 * IsoSpec is free software: you can redistribute it and/or modify
7 * it under the terms of the Simplified ("2-clause") BSD licence.
8 *
9 * IsoSpec is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 * You should have received a copy of the Simplified BSD Licence
14 * along with IsoSpec. If not, see <https://opensource.org/licenses/BSD-2-Clause>.
15 */
16
17
18#include <cmath>
19#include <algorithm>
20#include <vector>
21#include <cstdlib>
22#include <queue>
23#include <utility>
24#include <cstring>
25#include <string>
26#include <limits>
27#include <memory>
28#include "platform.h"
29#include "marginalTrek++.h"
30#include "conf.h"
31#include "allocator.h"
32#include "operators.h"
33#include "summator.h"
34#include "element_tables.h"
35#include "misc.h"
36
37
38namespace IsoSpec
39{
40
42
50void writeInitialConfiguration(const int atomCnt, const int isotopeNo, const double* lprobs, int* res)
51{
56
57 // This approximates the mode (heuristics: the mean is close to the mode).
58 for(int i = 0; i < isotopeNo; ++i)
59 res[i] = static_cast<int>( atomCnt * exp(lprobs[i]) ) + 1;
60
61 // The number of assigned atoms above.
62 int s = 0;
63
64 for(int i = 0; i < isotopeNo; ++i) s += res[i];
65
66 int diff = atomCnt - s;
67
68 // Too little: enlarging fist index.
69 if( diff > 0 ){
70 res[0] += diff;
71 }
72 // Too much: trying to redistribute the difference: hopefully the first element is the largest.
73 if( diff < 0 ){
74 diff = abs(diff);
75 int i = 0;
76
77 while( diff > 0){
78 int coordDiff = res[i] - diff;
79
80 if( coordDiff >= 0 ){
81 res[i] -= diff;
82 diff = 0;
83 } else {
84 res[i] = 0;
85 i++;
86 diff = abs(coordDiff);
87 }
88 }
89 }
90
91 // What we computed so far will be very close to the mode: hillclimb the rest of the way
92
93 bool modified = true;
94 double LP = unnormalized_logProb(res, lprobs, isotopeNo);
95 double NLP;
96
97 while(modified)
98 {
99 modified = false;
100 for(int ii = 0; ii < isotopeNo; ii++)
101 for(int jj = 0; jj < isotopeNo; jj++)
102 if(ii != jj && res[ii] > 0)
103 {
104 res[ii]--;
105 res[jj]++;
106 NLP = unnormalized_logProb(res, lprobs, isotopeNo);
107 if(NLP > LP || (NLP == LP && ii > jj))
108 {
109 modified = true;
110 LP = NLP;
111 }
112 else
113 {
114 res[ii]++;
115 res[jj]--;
116 }
117 }
118 }
119}
120
121
122double* getMLogProbs(const double* probs, int isoNo)
123{
129 for(int ii = 0; ii < isoNo; ii++)
130 if(probs[ii] <= 0.0 || probs[ii] > 1.0)
131 throw std::invalid_argument("All isotope probabilities p must fulfill: 0.0 < p <= 1.0");
132
133 double* ret = new double[isoNo];
134
135 // here we change the table of probabilities and log it.
136 for(int i = 0; i < isoNo; i++)
137 {
138 ret[i] = log(probs[i]);
139 for(int j = 0; j < ISOSPEC_NUMBER_OF_ISOTOPIC_ENTRIES; j++)
140 if(elem_table_probability[j] == probs[i])
141 {
142 ret[i] = elem_table_log_probability[j];
143 break;
144 }
145 }
146 return ret;
147}
148
149double get_loggamma_nominator(int x)
150{
151 // calculate log gamma of the nominator calculated in the binomial exression.
152 double ret = lgamma(x+1);
153 return ret;
154}
155
156int verify_atom_cnt(int atomCnt)
157{
158 #if !ISOSPEC_BUILDING_OPENMS
159 if(ISOSPEC_G_FACT_TABLE_SIZE-1 <= atomCnt)
160 throw std::length_error("Subisotopologue too large, size limit (that is, the maximum number of atoms of a single element in a molecule) is: " + std::to_string(ISOSPEC_G_FACT_TABLE_SIZE-1));
161 #endif
162 return atomCnt;
163}
164
166 const double* _masses,
167 const double* _probs,
168 int _isotopeNo,
169 int _atomCnt
170) :
171disowned(false),
172isotopeNo(_isotopeNo),
173atomCnt(verify_atom_cnt(_atomCnt)),
175atom_masses(array_copy<double>(_masses, _isotopeNo)),
176loggamma_nominator(get_loggamma_nominator(_atomCnt)),
177mode_conf(nullptr)
178// Deliberately not initializing mode_lprob
179{}
180
182disowned(false),
183isotopeNo(other.isotopeNo),
184atomCnt(other.atomCnt),
185atom_lProbs(array_copy<double>(other.atom_lProbs, isotopeNo)),
186atom_masses(array_copy<double>(other.atom_masses, isotopeNo)),
188{
189 if(other.mode_conf == nullptr)
190 {
191 mode_conf = nullptr;
192 // Deliberately not initializing mode_lprob. In this state other.mode_lprob is uninitialized too.
193 }
194 else
195 {
196 mode_conf = array_copy<int>(other.mode_conf, isotopeNo);
197 mode_lprob = other.mode_lprob;
198 }
199}
200
201
202// the move-constructor: used in the specialization of the marginal.
204disowned(other.disowned),
205isotopeNo(other.isotopeNo),
206atomCnt(other.atomCnt),
210{
211 other.disowned = true;
212 if(other.mode_conf == nullptr)
213 {
214 mode_conf = nullptr;
215 // Deliberately not initializing mode_lprob. In this state other.mode_lprob is uninitialized too.
216 }
217 else
218 {
219 mode_conf = other.mode_conf;
220 mode_lprob = other.mode_lprob;
221 }
222}
223
225{
226 if(!disowned)
227 {
228 delete[] atom_masses;
229 delete[] atom_lProbs;
230 delete[] mode_conf;
231 }
232}
233
234
236{
237 Conf res = new int[isotopeNo];
239 return res;
240}
241
242void Marginal::setupMode()
243{
244 ISOSPEC_IMPOSSIBLE(mode_conf != nullptr);
246 mode_lprob = logProb(mode_conf);
247}
248
249
251{
252 double ret_mass = std::numeric_limits<double>::infinity();
253 for(unsigned int ii = 0; ii < isotopeNo; ii++)
254 if( ret_mass > atom_masses[ii] )
255 ret_mass = atom_masses[ii];
256 return ret_mass*atomCnt;
257}
258
260{
261 double ret_mass = 0.0;
262 for(unsigned int ii = 0; ii < isotopeNo; ii++)
263 if( ret_mass < atom_masses[ii] )
264 ret_mass = atom_masses[ii];
265 return ret_mass*atomCnt;
266}
267
269{
270 double found_prob = -std::numeric_limits<double>::infinity();
271 size_t found_idx = 0;
272 for(unsigned int ii = 0; ii < isotopeNo; ii++)
273 if( found_prob < atom_lProbs[ii] )
274 {
275 found_prob = atom_lProbs[ii];
276 found_idx = ii;
277 }
278 return found_idx;
279}
280
282{
283 size_t idx = getMonoisotopicAtomIndex();
284 return atom_masses[idx]*atomCnt;
285}
286
288{
289 double ret = 0.0;
290 for(unsigned int ii = 0; ii < isotopeNo; ii++)
291 ret += exp(atom_lProbs[ii]) * atom_masses[ii];
292 return ret;
293}
294
295double Marginal::variance() const
296{
297 double ret = 0.0;
298 double mean = getAtomAverageMass();
299 for(size_t ii = 0; ii < isotopeNo; ii++)
300 {
301 double msq = atom_masses[ii] - mean;
302 ret += exp(atom_lProbs[ii]) * msq * msq;
303 }
304 return ret * atomCnt;
305}
306
307double Marginal::getLogSizeEstimate(double logEllipsoidRadius) const
308{
309 if(isotopeNo <= 1)
310 return -std::numeric_limits<double>::infinity();
311
312 const double i = static_cast<double>(isotopeNo);
313 const double k = i - 1.0;
314 const double n = static_cast<double>(atomCnt);
315
316 double sum_lprobs = 0.0;
317 for(int jj = 0; jj < i; jj++)
318 sum_lprobs += atom_lProbs[jj];
319
320 double log_V_simplex = k * log(n) - lgamma(i);
321 double log_N_simplex = lgamma(n+i) - lgamma(n+1.0) - lgamma(i);
322 double log_V_ellipsoid = (k * (log(n) + logpi + logEllipsoidRadius) + sum_lprobs) * 0.5 - lgamma((i+1)*0.5);
323
324 return log_N_simplex + log_V_ellipsoid - log_V_simplex;
325}
326
327
328// this is roughly an equivalent of IsoSpec-Threshold-Generator
330 Marginal&& m,
331 int tabSize,
332 int
333) :
334Marginal(std::move(m)),
335current_count(0),
336orderMarginal(atom_lProbs, isotopeNo),
337pq(),
338allocator(isotopeNo, tabSize),
339min_lprob(*std::min_element(atom_lProbs, atom_lProbs+isotopeNo))
340{
341 int* initialConf = allocator.makeCopy(mode_conf);
342
343 pq.push({mode_lprob, initialConf});
344
345 current_count = 0;
346
347 fringe.resize_and_wipe(1);
348
349 current_bucket = 0;
350 initialized_until = 1;
351
352 add_next_conf();
353}
354
355
356bool MarginalTrek::add_next_conf()
357{
362 if(pq.empty())
363 {
364 current_bucket++;
365 while(current_bucket < initialized_until && fringe[current_bucket].empty())
366 {
367// std::cout << "EMPTY bucket, id: " << current_bucket << std::endl;
368 current_bucket++;
369 }
370
371// std::cout << "Entering bucket, size: " << fringe[current_bucket].size() << std::endl;
372
373 if(current_bucket >= initialized_until)
374 return false;
375
376 // std::cout << "Fringe size at pop: " << fringe[current_bucket].size() << std::endl;
377 pq = std::priority_queue<ProbAndConfPtr, pod_vector<ProbAndConfPtr> >(std::less<ProbAndConfPtr>(), pod_vector<ProbAndConfPtr>(std::move(fringe[current_bucket])));
378 };
379
380 double logprob = pq.top().first;
381 Conf topConf = pq.top().second;
382
383 pq.pop();
384 ++current_count;
385
386 _confs.push_back(topConf);
387
388 _conf_masses.push_back(calc_mass(topConf, atom_masses, isotopeNo));
389 _conf_lprobs.push_back(logprob);
390
391 for( unsigned int j = 0; j < isotopeNo; ++j )
392 {
393 if( topConf[j] > mode_conf[j])
394 continue;
395
396 if( topConf[j] > 0 )
397 {
398 for( unsigned int i = 0; i < isotopeNo; ++i )
399 {
400 if( topConf[i] < mode_conf[i] )
401 continue;
402 // Growing index different than decreasing one AND
403 // Remain on simplex condition.
404 if( i != j ){
405 Conf acceptedCandidate = allocator.makeCopy(topConf);
406
407 ++acceptedCandidate[i];
408 --acceptedCandidate[j];
409
410 double new_prob = logProb(acceptedCandidate);
411 size_t bucket_nr = bucket_no(new_prob);
412
413 if(bucket_nr >= initialized_until)
414 {
415 // std::cout << "Extending to: " << bucket_nr << std::endl;
416 initialized_until = bucket_nr+1;
417 fringe.resize_and_wipe(initialized_until);
418 }
419
420 ISOSPEC_IMPOSSIBLE(bucket_nr < current_bucket);
421 if(bucket_nr == current_bucket)
422 pq.push({new_prob, acceptedCandidate});
423 else
424 fringe[bucket_nr].push_back({new_prob, acceptedCandidate});
425
426 }
427
428 if( topConf[i] > mode_conf[i] )
429 break;
430 }
431 }
432 if( topConf[j] < mode_conf[j] )
433 break;
434 }
435
436 return true;
437}
438
439
440MarginalTrek::~MarginalTrek()
441{
442 const size_t fringe_size = fringe.size();
443 for(size_t ii = 0; ii < fringe_size; ii++)
444 fringe[ii].clear();
445}
446
447
448
450 double lCutOff,
451 bool sort,
452 int tabSize,
453 int
454) : Marginal(std::move(m)),
455allocator(isotopeNo, tabSize)
456{
457 Conf currentConf = allocator.makeCopy(mode_conf);
458 if(logProb(currentConf) >= lCutOff)
459 {
460 configurations.push_back(currentConf);
461 lProbs.push_back(mode_lprob);
462 }
463
464 unsigned int idx = 0;
465
466 std::unique_ptr<double[]> prob_partials(new double[isotopeNo]);
467 std::unique_ptr<double[]> prob_part_acc(new double[isotopeNo+1]);
468 prob_part_acc[0] = loggamma_nominator;
469
470 while(idx < configurations.size())
471 {
472 currentConf = configurations[idx];
473 idx++;
474
475 for(size_t ii = 0; ii < isotopeNo; ii++)
476 prob_partials[ii] = minuslogFactorial(currentConf[ii]) + currentConf[ii] * atom_lProbs[ii];
477
478 for(unsigned int ii = 0; ii < isotopeNo; ii++ )
479 {
480 if(currentConf[ii] > mode_conf[ii])
481 continue;
482
483 if(currentConf[ii] != 0)
484 {
485 double prev_partial_ii = prob_partials[ii];
486 currentConf[ii]--;
487 prob_partials[ii] = minuslogFactorial(currentConf[ii]) + currentConf[ii] * atom_lProbs[ii];
488
489 for(unsigned int jj = 0; jj < isotopeNo; jj++ )
490 {
491 prob_part_acc[jj+1] = prob_part_acc[jj] + prob_partials[jj];
492
493 if(currentConf[jj] < mode_conf[jj])
494 continue;
495
496 if( ii != jj )
497 {
498 double logp = prob_part_acc[jj] + minuslogFactorial(1+currentConf[jj]) + (1+currentConf[jj]) * atom_lProbs[jj];
499 for(size_t kk = jj+1; kk < isotopeNo; kk++)
500 logp += prob_partials[kk];
501
502 if (logp >= lCutOff)
503 {
504 auto tmp = allocator.makeCopy(currentConf);
505 tmp[jj]++;
506 configurations.push_back(tmp);
507 lProbs.push_back(logp);
508 }
509 }
510 else
511 prob_part_acc[jj+1] = prob_part_acc[jj] + prob_partials[jj];
512
513 if (currentConf[jj] > mode_conf[jj])
514 break;
515 }
516 currentConf[ii]++;
517 prob_partials[ii] = prev_partial_ii;
518 }
519
520 if(currentConf[ii] < mode_conf[ii])
521 break;
522 }
523 }
524
525 no_confs = configurations.size();
526 confs = configurations.data();
527
528 if(sort && no_confs > 0)
529 {
530 std::unique_ptr<size_t[]> order_arr(get_inverse_order(lProbs.data(), no_confs));
531 impose_order(order_arr.get(), no_confs, lProbs.data(), confs);
532 }
533
534 probs = new double[no_confs];
535 masses = new double[no_confs];
536
537
538 for(unsigned int ii = 0; ii < no_confs; ii++)
539 {
540 probs[ii] = exp(lProbs[ii]);
541 masses[ii] = calc_mass(confs[ii], atom_masses, isotopeNo);
542 }
543
544 lProbs.push_back(-std::numeric_limits<double>::infinity());
545}
546
547
549{
550 if(masses != nullptr)
551 delete[] masses;
552 if(probs != nullptr)
553 delete[] probs;
554}
555
556
557
558
559
560
561
563: Marginal(std::move(m)), current_threshold(1.0), allocator(isotopeNo, tabSize),
564equalizer(isotopeNo), keyHasher(isotopeNo)
565{
566 fringe.push_back(mode_conf);
567 lProbs.push_back(std::numeric_limits<double>::infinity());
568 fringe_unn_lprobs.push_back(unnormalized_logProb(mode_conf));
569 lProbs.push_back(-std::numeric_limits<double>::infinity());
570 guarded_lProbs = lProbs.data()+1;
571}
572
573bool LayeredMarginal::extend(double new_threshold, bool do_sort)
574{
575 new_threshold -= loggamma_nominator;
576 if(fringe.empty())
577 return false;
578
579 lProbs.pop_back(); // Remove the +inf guardian
580
581 pod_vector<Conf> new_fringe;
582 pod_vector<double> new_fringe_unn_lprobs;
583
584 while(!fringe.empty())
585 {
586 Conf currentConf = fringe.back();
587 fringe.pop_back();
588
589 double opc = fringe_unn_lprobs.back();
590
591 fringe_unn_lprobs.pop_back();
592 if(opc < new_threshold)
593 {
594 new_fringe.push_back(currentConf);
595 new_fringe_unn_lprobs.push_back(opc);
596 }
597
598 else
599 {
600 configurations.push_back(currentConf);
601 lProbs.push_back(opc+loggamma_nominator);
602 for(unsigned int ii = 0; ii < isotopeNo; ii++ )
603 {
604 if(currentConf[ii] > mode_conf[ii])
605 continue;
606
607 if(currentConf[ii] > 0)
608 {
609 currentConf[ii]--;
610 for(unsigned int jj = 0; jj < isotopeNo; jj++ )
611 {
612 if(currentConf[jj] < mode_conf[jj])
613 continue;
614
615 if( ii != jj )
616 {
617 Conf nc = allocator.makeCopy(currentConf);
618 nc[jj]++;
619
620 double lpc = unnormalized_logProb(nc);
621 if(lpc >= new_threshold)
622 {
623 fringe.push_back(nc);
624 fringe_unn_lprobs.push_back(lpc);
625 }
626 else
627 {
628 new_fringe.push_back(nc);
629 new_fringe_unn_lprobs.push_back(lpc);
630 }
631 }
632
633 if(currentConf[jj] > mode_conf[jj])
634 break;
635 }
636 currentConf[ii]++;
637 }
638
639 if(currentConf[ii] < mode_conf[ii])
640 break;
641 }
642 }
643 }
644
645 current_threshold = new_threshold;
646 fringe.swap(new_fringe);
647 fringe_unn_lprobs.swap(new_fringe_unn_lprobs);
648
649 if(do_sort)
650 {
651 size_t to_sort_size = configurations.size() - probs.size();
652 if(to_sort_size > 0)
653 {
654 std::unique_ptr<size_t[]> order_arr(get_inverse_order(lProbs.data()+1+probs.size(), to_sort_size));
655 double* P = lProbs.data()+1+probs.size();
656 Conf* C = configurations.data()+probs.size();
657 size_t* O = order_arr.get();
658 impose_order(O, to_sort_size, P, C);
659 }
660 }
661
662 if(probs.capacity() * 2 < configurations.size() + 2)
663 {
664 // Reserve space for new values
665 probs.reserve(configurations.size());
666 masses.reserve(configurations.size());
667 } // Otherwise we're growing slowly enough that standard reallocations on push_back work better - we waste some extra memory
668 // but don't reallocate on every call
669
670// printVector(lProbs);
671 for(unsigned int ii = probs.size(); ii < configurations.size(); ii++)
672 {
673 probs.push_back(exp(lProbs[ii+1]));
674 masses.push_back(calc_mass(configurations[ii], atom_masses, isotopeNo));
675 }
676
677 lProbs.push_back(-std::numeric_limits<double>::infinity()); // Restore guardian
678
679 guarded_lProbs = lProbs.data()+1; // Vector might have reallocated its backing storage
680
681 return true;
682}
683
684
686{
687 double ret = std::numeric_limits<double>::infinity();
688 for(pod_vector<double>::const_iterator it = masses.cbegin(); it != masses.cend(); ++it)
689 if(*it < ret)
690 ret = *it;
691 return ret;
692}
693
694
696{
697 double ret = -std::numeric_limits<double>::infinity();
698 for(pod_vector<double>::const_iterator it = masses.cbegin(); it != masses.cend(); ++it)
699 if(*it > ret)
700 ret = *it;
701 return ret;
702}
703
704/* =============================================================== */
705
706template<bool add_guards>
708: Marginal(std::move(m)), current_threshold(1.0), extended_to_idx(0)
709{
710 original_indexes.resize(isotopeNo);
711 for(size_t ii = 0; ii < isotopeNo; ++ii)
712 original_indexes[ii] = ii;
713
714 std::sort(original_indexes.begin(), original_indexes.end(), [&](int a, int b) {
715 return atom_lProbs[a] > atom_lProbs[b];
716 });
717
718
719 masses.reserve(isotopeNo);
720 probs.reserve(isotopeNo);
721
722 if constexpr (add_guards)
723 {
724 lProbs.reserve(isotopeNo+2);
725 lProbs.push_back(std::numeric_limits<double>::infinity());
726 }
727 else
728 lProbs.reserve(isotopeNo);
729
730 for(size_t idx : original_indexes)
731 {
732 lProbs.push_back(atom_lProbs[idx]);
733 probs.push_back(exp(lProbs.back()));
734 masses.push_back(atom_masses[idx]);
735 }
736
737 if constexpr (add_guards)
738 {
739 lProbs.push_back(-std::numeric_limits<double>::infinity());
740 guarded_lProbs = lProbs.data()+1;
741 }
742 else
743 guarded_lProbs = lProbs.data();
744}
745
746template class SingleAtomMarginal<true>;
748
749} // namespace IsoSpec
bool extend(double new_threshold, bool do_sort=true)
Extend the set of computed subisotopologues to those above the new threshold.
double get_max_mass() const
Get the maximal mass in current layer.
double get_min_mass() const
Get the minimal mass in current layer.
LayeredMarginal(Marginal &&m, int tabSize=1000, int hashSize=1000)
Move constructor: specializes the Marginal class.
The marginal distribution class (a subisotopologue).
Conf computeModeConf() const
The the probability of the mode subisotopologue.
Marginal(const double *_masses, const double *_probs, int _isotopeNo, int _atomCnt)
Class constructor.
const unsigned int atomCnt
double getLightestConfMass() const
Get the mass of the lightest subisotopologue.
const unsigned int isotopeNo
const double *const atom_masses
double variance() const
Calculate the variance of the theoretical distribution describing the subisotopologue.
ISOSPEC_FORCE_INLINE double unnormalized_logProb(Conf conf) const
Calculate the log-probability of a given subisotopologue.
const double loggamma_nominator
double getHeaviestConfMass() const
Get the mass of the heaviest subisotopologue.
double getAtomAverageMass() const
The average mass of a single atom.
double getMonoisotopicConfMass() const
Get the mass of the monoisotopic subisotopologue.
virtual ~Marginal()
Destructor.
size_t getMonoisotopicAtomIndex() const
Get the index of the monoisotopic (most probable) atom.
double getLogSizeEstimate(double logEllipsoidRadius) const
Return estimated logarithm of size of the marginal at a given ellipsoid radius.
const double *const atom_lProbs
MarginalTrek(Marginal &&m, int tabSize=1000, int hashSize=1000)
Move constructor: specializes the Marginal class.
virtual ~PrecalculatedMarginal()
Destructor.
PrecalculatedMarginal(Marginal &&m, double lCutOff, bool sort=true, int tabSize=1000, int hashSize=1000)
The move constructor (disowns the Marginal).
SingleAtomMarginal(Marginal &&m, int tabSize=1000, int hashSize=1000)
Move constructor: specializes the Marginal class.
double * getMLogProbs(const double *probs, int isoNo)
void writeInitialConfiguration(const int atomCnt, const int isotopeNo, const double *lprobs, int *res)
Find one of the most probable subisotopologues.