IsoSpec
Loading...
Searching...
No Matches
fixedEnvelopes.cpp
1/*
2 * Copyright (C) 2015-2020 Mateusz Łącki and Michał Startek.
3 *
4 * This file is part of IsoSpec.
5 *
6 * IsoSpec is free software: you can redistribute it and/or modify
7 * it under the terms of the Simplified ("2-clause") BSD licence.
8 *
9 * IsoSpec is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 * You should have received a copy of the Simplified BSD Licence
14 * along with IsoSpec. If not, see <https://opensource.org/licenses/BSD-2-Clause>.
15 */
16
17#include "fixedEnvelopes.h"
18#include <limits>
19#include <cassert>
20#include "isoMath.h"
21
22namespace IsoSpec
23{
24
25FixedEnvelope::FixedEnvelope(const FixedEnvelope& other) :
26_masses(array_copy<double>(other._masses, other._confs_no)),
27_probs(array_copy<double>(other._probs, other._confs_no)),
28_confs(array_copy_nptr<int>(other._confs, other._confs_no*other.allDim)),
29_confs_no(other._confs_no),
30allDim(other.allDim),
31sorted_by_mass(other.sorted_by_mass),
32sorted_by_prob(other.sorted_by_prob),
33total_prob(other.total_prob)
34{}
35
36FixedEnvelope::FixedEnvelope(FixedEnvelope&& other) :
37_masses(other._masses),
38_probs(other._probs),
39_confs(other._confs),
40_confs_no(other._confs_no),
41allDim(other.allDim),
42sorted_by_mass(other.sorted_by_mass),
43sorted_by_prob(other.sorted_by_prob),
44total_prob(other.total_prob)
45{
46other._masses = nullptr;
47other._probs = nullptr;
48other._confs = nullptr;
49other._confs_no = 0;
50other.total_prob = 0.0;
51}
52
53FixedEnvelope::FixedEnvelope(double* in_masses, double* in_probs, size_t in_confs_no, bool masses_sorted, bool probs_sorted, double _total_prob) :
54_masses(in_masses),
55_probs(in_probs),
56_confs(nullptr),
57_confs_no(in_confs_no),
58allDim(0),
59sorted_by_mass(masses_sorted),
60sorted_by_prob(probs_sorted),
61total_prob(_total_prob)
62{}
63
64FixedEnvelope FixedEnvelope::operator+(const FixedEnvelope& other) const
65{
66 double* nprobs = reinterpret_cast<double*>(malloc(sizeof(double) * (_confs_no+other._confs_no)));
67 if(nprobs == nullptr)
68 throw std::bad_alloc();
69 double* nmasses = reinterpret_cast<double*>(malloc(sizeof(double) * (_confs_no+other._confs_no)));
70 if(nmasses == nullptr)
71 {
72 free(nprobs);
73 throw std::bad_alloc();
74 }
75
76 memcpy(nprobs, _probs, sizeof(double) * _confs_no);
77 memcpy(nmasses, _masses, sizeof(double) * _confs_no);
78
79 memcpy(nprobs+_confs_no, other._probs, sizeof(double) * other._confs_no);
80 memcpy(nmasses+_confs_no, other._masses, sizeof(double) * other._confs_no);
81
82 return FixedEnvelope(nmasses, nprobs, _confs_no + other._confs_no);
83}
84
85FixedEnvelope FixedEnvelope::operator*(const FixedEnvelope& other) const
86{
87 double* nprobs = reinterpret_cast<double*>(malloc(sizeof(double) * _confs_no * other._confs_no));
88 if(nprobs == nullptr)
89 throw std::bad_alloc();
90 // deepcode ignore CMemoryLeak: It's not a memleak: the memory is passed to FixedEnvelope which
91 // deepcode ignore CMemoryLeak: takes ownership of it, and will properly free() it in destructor.
92 double* nmasses = reinterpret_cast<double*>(malloc(sizeof(double) * _confs_no * other._confs_no));
93 if(nmasses == nullptr)
94 {
95 free(nprobs);
96 throw std::bad_alloc();
97 }
98
99 size_t tgt_idx = 0;
100
101 for(size_t ii = 0; ii < _confs_no; ii++)
102 for(size_t jj = 0; jj < other._confs_no; jj++)
103 {
104 nprobs[tgt_idx] = _probs[ii] * other._probs[jj];
105 nmasses[tgt_idx] = _masses[ii] + other._masses[jj];
106 tgt_idx++;
107 }
108
109 return FixedEnvelope(nmasses, nprobs, tgt_idx);
110}
111
112void FixedEnvelope::sort_by_mass()
113{
114 if(sorted_by_mass)
115 return;
116
117 sort_by(_masses);
118
119 sorted_by_mass = true;
120 sorted_by_prob = false;
121}
122
123
124void FixedEnvelope::sort_by_prob()
125{
126 if(sorted_by_prob)
127 return;
128
129 sort_by(_probs);
130
131 sorted_by_prob = true;
132 sorted_by_mass = false;
133}
134
135template<typename T> void reorder_array(T* arr, size_t* order, size_t size, bool can_destroy = false)
136{
137 if(!can_destroy)
138 {
139 size_t* order_c = new size_t[size];
140 memcpy(order_c, order, sizeof(size_t)*size);
141 order = order_c;
142 }
143
144 for(size_t ii = 0; ii < size; ii++)
145 while(order[ii] != ii)
146 {
147 std::swap(arr[ii], arr[order[ii]]);
148 std::swap(order[order[ii]], order[ii]);
149 }
150
151 if(!can_destroy)
152 delete[] order;
153}
154
155void FixedEnvelope::sort_by(double* order)
156{
157 if(_confs_no <= 1)
158 return;
159
160 size_t* indices = new size_t[_confs_no];
161
162 for(size_t ii = 0; ii < _confs_no; ii++)
163 indices[ii] = ii;
164
165 std::sort<size_t*>(indices, indices + _confs_no, TableOrder<double>(order));
166
167 size_t* inverse = new size_t[_confs_no];
168
169 for(size_t ii = 0; ii < _confs_no; ii++)
170 inverse[indices[ii]] = ii;
171
172 delete[] indices;
173
174 reorder_array(_masses, inverse, _confs_no);
175 reorder_array(_probs, inverse, _confs_no, _confs == nullptr);
176 if(_confs != nullptr)
177 {
178 int* swapspace = new int[allDim];
179 for(size_t ii = 0; ii < _confs_no; ii++)
180 while(inverse[ii] != ii)
181 {
182 memcpy(swapspace, &_confs[ii*allDim], allDimSizeofInt);
183 memcpy(&_confs[ii*allDim], &_confs[inverse[ii]*allDim], allDimSizeofInt);
184 memcpy(&_confs[inverse[ii]*allDim], swapspace, allDimSizeofInt);
185 std::swap(inverse[inverse[ii]], inverse[ii]);
186 }
187 delete[] swapspace;
188 }
189 delete[] inverse;
190}
191
192
193double FixedEnvelope::get_total_prob()
194{
195 if(std::isnan(total_prob))
196 {
197 total_prob = 0.0;
198 for(size_t ii = 0; ii < _confs_no; ii++)
199 total_prob += _probs[ii];
200 }
201 return total_prob;
202}
203
204void FixedEnvelope::scale(double factor)
205{
206 for(size_t ii = 0; ii < _confs_no; ii++)
207 _probs[ii] *= factor;
208 total_prob *= factor;
209}
210
211void FixedEnvelope::normalize()
212{
213 double tp = get_total_prob();
214 if(tp != 1.0)
215 {
216 scale(1.0/tp);
217 total_prob = 1.0;
218 }
219}
220
221void FixedEnvelope::shift_mass(double value)
222{
223 for(size_t ii = 0; ii < _confs_no; ii++)
224 _masses[ii] += value;
225}
226
227void FixedEnvelope::resample(size_t samples, double beta_bias)
228{
229 if(_confs_no == 0)
230 throw std::logic_error("Resample called on an empty spectrum");
231
232 double pprob = 0.0;
233 double cprob = 0.0;
234 size_t pidx = -1; // Overflows - but it doesn't matter.
235
236 _probs[_confs_no-1] = (std::numeric_limits<double>::max)();
237
238 while(samples > 0)
239 {
240 pprob += _probs[++pidx];
241 _probs[pidx] = 0.0;
242 double covered_part = (pprob - cprob) / (1.0 - cprob);
243 while(samples * covered_part < beta_bias && samples > 0)
244 {
245 cprob += rdvariate_beta_1_b(samples) * (1.0 - cprob);
246 while(pprob < cprob)
247 {
248 pprob += _probs[++pidx];
249 _probs[pidx] = 0.0;
250 }
251 _probs[pidx] += 1.0;
252 samples--;
253 covered_part = (pprob - cprob) / (1.0 - cprob);
254 }
255 if(samples <= 0)
256 break;
257 size_t nrtaken = rdvariate_binom(samples, covered_part);
258 _probs[pidx] += static_cast<double>(nrtaken);
259 samples -= nrtaken;
260 cprob = pprob;
261 }
262
263 pidx++;
264 memset(_probs + pidx, 0, sizeof(double)*(_confs_no - pidx));
265}
266
267FixedEnvelope FixedEnvelope::LinearCombination(const std::vector<const FixedEnvelope*>& spectra, const std::vector<double>& intensities)
268{
269 return LinearCombination(spectra.data(), intensities.data(), spectra.size());
270}
271
272FixedEnvelope FixedEnvelope::LinearCombination(const FixedEnvelope* const * spectra, const double* intensities, size_t size)
273{
274 size_t ret_size = 0;
275 for(size_t ii = 0; ii < size; ii++)
276 ret_size += spectra[ii]->_confs_no;
277
278 double* newprobs = reinterpret_cast<double*>(malloc(sizeof(double)*ret_size));
279 if(newprobs == nullptr)
280 throw std::bad_alloc();
281 double* newmasses = reinterpret_cast<double*>(malloc(sizeof(double)*ret_size));
282 if(newmasses == nullptr)
283 {
284 free(newprobs);
285 throw std::bad_alloc();
286 }
287
288 size_t cntr = 0;
289 for(size_t ii = 0; ii < size; ii++)
290 {
291 double mul = intensities[ii];
292 for(size_t jj = 0; jj < spectra[ii]->_confs_no; jj++)
293 newprobs[jj+cntr] = spectra[ii]->_probs[jj] * mul;
294 memcpy(newmasses + cntr, spectra[ii]->_masses, sizeof(double) * spectra[ii]->_confs_no);
295 cntr += spectra[ii]->_confs_no;
296 }
297 return FixedEnvelope(newmasses, newprobs, cntr);
298}
299
300double FixedEnvelope::WassersteinDistance(FixedEnvelope& other)
301{
302 double ret = 0.0;
303 if((get_total_prob()*0.999 > other.get_total_prob()) || (other.get_total_prob() > get_total_prob()*1.001))
304 throw std::logic_error("Spectra must be normalized before computing Wasserstein Distance");
305
306 if(_confs_no == 0 || other._confs_no == 0)
307 return 0.0;
308
309 sort_by_mass();
310 other.sort_by_mass();
311
312 size_t idx_this = 0;
313 size_t idx_other = 0;
314
315 double acc_prob = 0.0;
316 double last_point = 0.0;
317
318
319 while(idx_this < _confs_no && idx_other < other._confs_no)
320 {
321 if(_masses[idx_this] < other._masses[idx_other])
322 {
323 ret += (_masses[idx_this] - last_point) * std::abs(acc_prob);
324 acc_prob += _probs[idx_this];
325 last_point = _masses[idx_this];
326 idx_this++;
327 }
328 else
329 {
330 ret += (other._masses[idx_other] - last_point) * std::abs(acc_prob);
331 acc_prob -= other._probs[idx_other];
332 last_point = other._masses[idx_other];
333 idx_other++;
334 }
335 }
336
337 acc_prob = std::abs(acc_prob);
338
339 while(idx_this < _confs_no)
340 {
341 ret += (_masses[idx_this] - last_point) * acc_prob;
342 acc_prob -= _probs[idx_this];
343 last_point = _masses[idx_this];
344 idx_this++;
345 }
346
347 while(idx_other < other._confs_no)
348 {
349 ret += (other._masses[idx_other] - last_point) * acc_prob;
350 acc_prob -= other._probs[idx_other];
351 last_point = other._masses[idx_other];
352 idx_other++;
353 }
354
355 return ret;
356}
357
358
359double FixedEnvelope::OrientedWassersteinDistance(FixedEnvelope& other)
360{
361 double ret = 0.0;
362 if((get_total_prob()*0.999 > other.get_total_prob()) || (other.get_total_prob() > get_total_prob()*1.001))
363 throw std::logic_error("Spectra must be normalized before computing Wasserstein Distance");
364
365 if(_confs_no == 0 || other._confs_no == 0)
366 return 0.0;
367
368 sort_by_mass();
369 other.sort_by_mass();
370
371 size_t idx_this = 0;
372 size_t idx_other = 0;
373
374 double acc_prob = 0.0;
375 double last_point = 0.0;
376
377
378 while(idx_this < _confs_no && idx_other < other._confs_no)
379 {
380 if(_masses[idx_this] < other._masses[idx_other])
381 {
382 ret += (_masses[idx_this] - last_point) * acc_prob;
383 acc_prob += _probs[idx_this];
384 last_point = _masses[idx_this];
385 idx_this++;
386 }
387 else
388 {
389 ret += (other._masses[idx_other] - last_point) * acc_prob;
390 acc_prob -= other._probs[idx_other];
391 last_point = other._masses[idx_other];
392 idx_other++;
393 }
394 }
395
396 while(idx_this < _confs_no)
397 {
398 ret += (_masses[idx_this] - last_point) * acc_prob;
399 acc_prob -= _probs[idx_this];
400 last_point = _masses[idx_this];
401 idx_this++;
402 }
403
404 while(idx_other < other._confs_no)
405 {
406 ret += (other._masses[idx_other] - last_point) * acc_prob;
407 acc_prob -= other._probs[idx_other];
408 last_point = other._masses[idx_other];
409 idx_other++;
410 }
411
412 return ret;
413}
414
415double FixedEnvelope::AbyssalWassersteinDistance(FixedEnvelope& other, double abyss_depth, double other_scale)
416{
417 sort_by_mass();
418 other.sort_by_mass();
419
420 std::vector<std::pair<double, double>> carried;
421
422 size_t idx_this = 0;
423 size_t idx_other = 0;
424
425 //std::cout << "AAA" << std::endl;
426
427 auto finished = [&]() -> bool { return idx_this >= _confs_no && idx_other >= other._confs_no; };
428 auto next = [&]() -> std::pair<double, double> {
429 if(idx_this >= _confs_no || (idx_other < other._confs_no && _masses[idx_this] > other._masses[idx_other]))
430 {
431 std::pair<double, double> res = std::pair<double, double>(other._masses[idx_other], other._probs[idx_other]*other_scale);
432 idx_other++;
433 return res;
434 }
435 else
436 {
437 std::pair<double, double> res = std::pair<double, double>(_masses[idx_this], -_probs[idx_this]);
438 idx_this++;
439 return res;
440 }
441 };
442 double accd = 0.0;
443 double condemned = 0.0;
444
445 while(!finished())
446 {
447 auto pair = next();
448 double m = pair.first;
449 double p = pair.second;
450 if(!carried.empty() && carried[0].second * p > 0.0)
451 {
452 carried.emplace_back(m, p);
453 continue;
454 }
455
456 while(!carried.empty())
457 {
458 double cm = carried.back().first;
459 double cp = carried.back().second;
460 if(m - cm >= abyss_depth)
461 {
462 for(auto it = carried.cbegin(); it != carried.cend(); it++)
463 condemned += fabs(it->second);
464 carried.clear();
465 break;
466 }
467 if((cp+p)*p > 0.0)
468 {
469 accd += fabs((m-cm)*cp);
470 p += cp;
471 carried.pop_back();
472 }
473 else
474 {
475 accd += fabs((m-cm)*p);
476 cp += p;
477 p = 0.0;
478 break;
479 }
480 }
481 if(p != 0.0)
482 carried.emplace_back(m, p);
483 //std::cout << m << " " << p << std::endl;
484 }
485
486 for(auto it = carried.cbegin(); it != carried.cend(); it++)
487 condemned += fabs(it->second);
488
489 return accd + condemned * abyss_depth * 0.5;
490}
491
492#if 0
493double FixedEnvelope::ScaledAbyssalWassersteinDistance(FixedEnvelope * const * others, double abyss_depth, const double* other_scales, const size_t N)
494{
495 sort_by_mass();
496
497 std::priority_queue<std::pair<double, size_t>> PQ;
498 std::unique_ptr<size_t[]> indices = std::make_unique<size_t[]>(N);
499 memset(indices.get(), 0, sizeof(size_t)*N);
500
501 for(size_t ii=0; ii<N; ii++)
502 {
503 others[ii]->sort_by_mass();
504 if(others[ii]->_confs_no > 0)
505 PQ.emplace_back({others._probs[0], ii});
506 }
507
508
509
510
511 std::vector<std::pair<double, double>> carried;
512
513 size_t idx_this = 0;
514 size_t idx_other = 0;
515
516 //std::cout << "AAA" << std::endl;
517
518 auto finished = [&]() -> bool { return idx_this >= _confs_no && PQ.empty(); };
519 auto next = [&]() -> std::tuple<double, double, size_t> {
520 if(idx_this >= _confs_no || (idx_other < other._confs_no && _masses[idx_this] > other._masses[idx_other]))
521 {
522 std::pair<double, double> res = std::pair<double, double>(other._masses[idx_other], other._probs[idx_other]*other_scale);
523 idx_other++;
524 return res;
525 }
526 else
527 {
528 std::pair<double, double> res = std::pair<double, double>(_masses[idx_this], -_probs[idx_this]);
529 idx_this++;
530 return res;
531 }
532 };
533 double accd = 0.0;
534 double condemned = 0.0;
535
536 while(!finished())
537 {
538 auto [m, p] = next();
539 if(!carried.empty() && carried[0].second * p > 0.0)
540 {
541 carried.emplace_back(m, p);
542 continue;
543 }
544
545 while(!carried.empty())
546 {
547 auto& [cm, cp] = carried.back();
548 if(m - cm >= abyss_depth)
549 {
550 for(auto it = carried.cbegin(); it != carried.cend(); it++)
551 condemned += fabs(it->second);
552 carried.clear();
553 break;
554 }
555 if((cp+p)*p > 0.0)
556 {
557 accd += fabs((m-cm)*cp);
558 p += cp;
559 carried.pop_back();
560 }
561 else
562 {
563 accd += fabs((m-cm)*p);
564 cp += p;
565 p = 0.0;
566 break;
567 }
568 }
569 if(p != 0.0)
570 carried.emplace_back(m, p);
571 //std::cout << m << " " << p << std::endl;
572 }
573
574 for(auto it = carried.cbegin(); it != carried.cend(); it++)
575 condemned += fabs(it->second);
576
577 return accd + condemned * abyss_depth * 0.5;
578}
579
580#endif
581
582#if 0
583double AbyssalWassersteinDistanceGrad(FixedEnvelope* const* envelopes, const double* scales, double* ret_gradient, size_t N, double abyss_depth_exp, double abyss_depth_the)
584{
585return 0.0;
586 std::unique_ptr<size_t[]> env_idx = std::make_unique<size_t[]>(N+1);
587 memset(env_idx.get(), 0, (N+1)*sizeof(size_t));
588 memset(ret_gradient, 0, (N+1)*sizeof(double));
589
590 auto pq_cmp = [](std::pair<double, size_t>& p1, std::pair<double, size_t>& p2) { return p1.first > p2.first; };
591 std::priority_queue<std::pair<double, size_t>, std::vector<std::pair<double, size_t>>, decltype(pq_cmp)> PQ(pq_cmp);
592
593 for(size_t ii=0; ii<=N; ii++)
594 {
595 envelopes[ii]->sort_by_mass();
596 if(envelopes[ii]->_confs_no > 0)
597 {
598 std::cout << "Initial push: " << envelopes[ii]->_masses[0] << " " << ii << '\n';
599 PQ.push({envelopes[ii]->_masses[0], ii});
600 }
601 }
602
603 std::vector<std::tuple<double, double, size_t>> carried;
604
605 auto next = [&]() -> std::tuple<double, double, size_t> {
606 auto [mass, eidx] = PQ.top();
607 double prob = envelopes[eidx]->_probs[env_idx[eidx]];
608 PQ.pop();
609 if(eidx == 0)
610 prob = -prob;
611 else
612 prob = prob * scales[eidx];
613 std::cout << "Next: " << mass << " " << prob << " " << eidx << '\n';
614 env_idx[eidx]++;
615 if(env_idx[eidx] < envelopes[eidx]->_confs_no)
616 PQ.push({envelopes[eidx]->_masses[env_idx[eidx]], eidx});
617
618 return {mass, prob, eidx};
619 };
620 double accd = 0.0;
621 double condemned = 0.0;
622 //double flow;
623 const double max_flow_dist = abyss_depth_exp + abyss_depth_the;
624 max_flow_dist *= 2.0;
625
626 while(!PQ.empty())
627 {
628 auto [m, p, eidx] = next();
629 if(!carried.empty() && std::get<1>(carried[0]) * p > 0.0)
630 {
631 carried.emplace_back(m, p, eidx);
632 continue;
633 }
634
635 while(!carried.empty())
636 {
637 auto& [cm, cp, ceidx] = carried.back();
638 if(m - cm >= max_flow_dist)
639 {
640 for(auto it = carried.cbegin(); it != carried.cend(); it++)
641 condemned += fabs(std::get<1>(*it));
642 carried.clear();
643 break;
644 }
645 std::cout << "accing\n";
646 if((cp+p)*p > 0.0)
647 {
648 accd += fabs((m-cm)*cp);
649 p += cp;
650 carried.pop_back();
651 std::cout << "cprob:" << cp << '\n';
652 }
653 else
654 {
655 accd += fabs((m-cm)*p);
656 cp += p;
657 std::cout << "prob:" << p << '\n';
658 p = 0.0;
659 break;
660 }
661 }
662 if(p != 0.0)
663 carried.emplace_back(m, p, eidx);
664 //std::cout << m << " " << p << std::endl;
665 }
666
667 for(auto it = carried.cbegin(); it != carried.cend(); it++)
668 condemned += fabs(std::get<1>(*it));
669
670 std::cout << "ISO:" << accd << " " << condemned << '\n';
671 return accd + condemned * max_flow_dist * 0.5;
672 while(!PQ.empty())
673 {
674 auto [m, p, eidx] = next();
675 if(!carried.empty() && (std::get<1>(carried[0]) * p > 0.0))
676 {
677 carried.emplace_back(m, p, eidx);
678 continue;
679 }
680
681 while(!carried.empty())
682 {
683 auto& [cm, cp, ceidx] = carried.back();
684 if(m - cm >= max_flow_dist)
685 {
686 for(auto it = carried.cbegin(); it != carried.cend(); it++)
687 {
688 flow = fabs(std::get<1>(*it));
689 const size_t target_idx = std::get<2>(*it);
690 flow *= target_idx == 0 ? abyss_depth_exp : abyss_depth_the;
691 ret_gradient[target_idx] += flow;
692 condemned += flow;
693 std::cout << "condemnin': " << m << " " << p << " " << eidx << " | " << cm << " " << cp << " " << ceidx << '\n';
694 }
695 carried.clear();
696 break;
697 }
698 if((cp+p)*p > 0.0)
699 {
700 flow = fabs((m-cm)*cp);
701 accd += flow;
702 p += cp;
703 ret_gradient[ceidx] += flow;
704 carried.pop_back();
705 }
706 else
707 {
708 flow = fabs((m-cm)*p);
709 accd += flow;
710 cp += p;
711 ret_gradient[eidx] += flow;
712 p = 0.0;
713 break;
714 }
715 }
716 if(p != 0.0)
717 carried.emplace_back(m, p, eidx);
718 //std::cout << m << " " << p << std::endl;
719 }
720
721 for(auto it = carried.cbegin(); it != carried.cend(); it++)
722 condemned += fabs(std::get<1>(*it));
723
724
725 return accd + condemned * (abyss_depth_exp + abyss_depth_the) * 0.5;
726}
727#endif
728
729
730std::tuple<double, double, double> FixedEnvelope::WassersteinMatch(FixedEnvelope& other, double flow_distance, double other_scale)
731{
732 if(_confs_no == 0)
733 return {0.0, other.get_total_prob() * other_scale, 0.0};
734
735 double unmatched1 = 0.0;
736 double unmatched2 = 0.0;
737 double massflow = 0.0;
738
739 sort_by_mass();
740 other.sort_by_mass();
741
742 size_t idx_this = 0;
743 size_t idx_other = 0;
744 double used_prob_this = 0.0;
745 double used_prob_other = 0.0;
746
747 while(idx_this < _confs_no && idx_other < other._confs_no)
748 {
749 bool moved = true;
750 while(moved && idx_this < _confs_no && idx_other < other._confs_no)
751 {
752 moved = false;
753 if(_masses[idx_this] < other._masses[idx_other] - flow_distance)
754 {
755 unmatched1 += _probs[idx_this] - used_prob_this;
756 used_prob_this = 0.0;
757 idx_this++;
758 moved = true;
759 }
760 if(other._masses[idx_other] < _masses[idx_this] - flow_distance)
761 {
762 unmatched2 += other._probs[idx_other]*other_scale - used_prob_other;
763 used_prob_other = 0.0;
764 idx_other++;
765 moved = true;
766 }
767 }
768 if(idx_this < _confs_no && idx_other < other._confs_no)
769 {
770 assert(_probs[idx_this] - used_prob_this >= 0.0);
771 assert(other._probs[idx_other]*other_scale - used_prob_other >= 0.0);
772
773 if(_probs[idx_this] - used_prob_this < other._probs[idx_other]*other_scale - used_prob_other)
774 {
775 massflow += _probs[idx_this] - used_prob_this;
776 used_prob_other += _probs[idx_this] - used_prob_this;
777 assert(used_prob_other >= 0.0);
778 used_prob_this = 0.0;
779 idx_this++;
780 }
781 else
782 {
783 massflow += other._probs[idx_other]*other_scale - used_prob_other;
784 used_prob_this += other._probs[idx_other]*other_scale - used_prob_other;
785 assert(used_prob_this >= 0.0);
786 used_prob_other = 0.0;
787 idx_other++;
788 }
789 }
790 }
791
792 unmatched1 -= used_prob_this;
793 unmatched2 -= used_prob_other;
794
795 for(; idx_this < _confs_no; idx_this++)
796 unmatched1 += _probs[idx_this];
797 for(; idx_other < other._confs_no; idx_other++)
798 unmatched2 += other._probs[idx_other]*other_scale;
799
800 return {unmatched1, unmatched2, massflow};
801}
802
803FixedEnvelope FixedEnvelope::bin(double bin_width, double middle)
804{
805 sort_by_mass();
806
807 FixedEnvelope ret;
808
809 if(_confs_no == 0)
810 return ret;
811
812 ret.reallocate_memory<false>(ISOSPEC_INIT_TABLE_SIZE);
813
814 if(bin_width == 0)
815 {
816 double curr_mass = _masses[0];
817 double accd_prob = _probs[0];
818 for(size_t ii = 1; ii<_confs_no; ii++)
819 {
820 if(curr_mass != _masses[ii])
821 {
822 ret.store_conf(curr_mass, accd_prob);
823 curr_mass = _masses[ii];
824 accd_prob = _probs[ii];
825 }
826 else
827 accd_prob += _probs[ii];
828 }
829 ret.store_conf(curr_mass, accd_prob);
830 return ret;
831 }
832
833 size_t ii = 0;
834
835 double half_width = 0.5*bin_width;
836 double hwmm = half_width-middle;
837
838 while(ii < _confs_no)
839 {
840 double current_bin_middle = floor((_masses[ii]+hwmm)/bin_width)*bin_width + middle;
841 double current_bin_end = current_bin_middle + half_width;
842 double bin_prob = 0.0;
843
844 while(ii < _confs_no && _masses[ii] <= current_bin_end)
845 {
846 bin_prob += _probs[ii];
847 ii++;
848 }
849 ret.store_conf(current_bin_middle, bin_prob);
850 }
851
852 return ret;
853}
854
855template<bool tgetConfs> void FixedEnvelope::reallocate_memory(size_t new_size)
856{
857 current_size = new_size;
858 // FIXME: Handle overflow gracefully here. It definitely could happen for people still stuck on 32 bits...
859 _masses = reinterpret_cast<double*>(realloc(_masses, new_size * sizeof(double)));
860 if(_masses == nullptr)
861 throw std::bad_alloc();
862 tmasses = _masses + _confs_no;
863
864 _probs = reinterpret_cast<double*>(realloc(_probs, new_size * sizeof(double)));
865 if(_probs == nullptr)
866 throw std::bad_alloc();
867 tprobs = _probs + _confs_no;
868
869 constexpr_if(tgetConfs)
870 {
871 _confs = reinterpret_cast<int*>(realloc(_confs, new_size * allDimSizeofInt));
872 if(_confs == nullptr)
873 throw std::bad_alloc();
874 tconfs = _confs + (allDim * _confs_no);
875 }
876}
877
878void FixedEnvelope::slow_reallocate_memory(size_t new_size)
879{
880 current_size = new_size;
881 // FIXME: Handle overflow gracefully here. It definitely could happen for people still stuck on 32 bits...
882 _masses = reinterpret_cast<double*>(realloc(_masses, new_size * sizeof(double)));
883 if(_masses == nullptr)
884 throw std::bad_alloc();
885 tmasses = _masses + _confs_no;
886
887 _probs = reinterpret_cast<double*>(realloc(_probs, new_size * sizeof(double)));
888 if(_probs == nullptr)
889 throw std::bad_alloc();
890 tprobs = _probs + _confs_no;
891
892 if(_confs != nullptr)
893 {
894 _confs = reinterpret_cast<int*>(realloc(_confs, new_size * allDimSizeofInt));
895 if(_confs == nullptr)
896 throw std::bad_alloc();
897 tconfs = _confs + (allDim * _confs_no);
898 }
899}
900
901template<bool tgetConfs> void FixedEnvelope::threshold_init(Iso&& iso, double threshold, bool absolute)
902{
903 IsoThresholdGenerator generator(std::move(iso), threshold, absolute);
904
905 size_t tab_size = generator.count_confs();
906 this->allDim = generator.getAllDim();
907 this->allDimSizeofInt = this->allDim * sizeof(int);
908
909 this->reallocate_memory<tgetConfs>(tab_size);
910
911 double* ttmasses = this->_masses;
912 double* ttprobs = this->_probs;
913 ISOSPEC_MAYBE_UNUSED int* ttconfs;
914 constexpr_if(tgetConfs)
915 ttconfs = _confs;
916
917 while(generator.advanceToNextConfiguration())
918 {
919 *ttmasses = generator.mass(); ttmasses++;
920 *ttprobs = generator.prob(); ttprobs++;
921 constexpr_if(tgetConfs) { generator.get_conf_signature(ttconfs); ttconfs += allDim; }
922 }
923
924 this->_confs_no = tab_size;
925}
926
927template void FixedEnvelope::threshold_init<true>(Iso&& iso, double threshold, bool absolute);
928template void FixedEnvelope::threshold_init<false>(Iso&& iso, double threshold, bool absolute);
929
930
931template<bool tgetConfs> void FixedEnvelope::total_prob_init(Iso&& iso, double target_total_prob, bool optimize)
932{
933 if(target_total_prob <= 0.0)
934 return;
935
936 if(target_total_prob >= 1.0)
937 {
938 threshold_init<tgetConfs>(std::move(iso), 0.0, true);
939 return;
940 }
941
942 IsoLayeredGenerator generator(std::move(iso), 1000, 1000, true, std::min<double>(target_total_prob, 0.9999));
943
944 this->allDim = generator.getAllDim();
945 this->allDimSizeofInt = this->allDim*sizeof(int);
946
947
948 this->reallocate_memory<tgetConfs>(ISOSPEC_INIT_TABLE_SIZE);
949
950 size_t last_switch = 0;
951 double prob_at_last_switch = 0.0;
952 double prob_so_far = 0.0;
953 double layer_delta;
954
955 const double sum_above = log1p(-target_total_prob) - 2.3025850929940455; // log(0.1);
956
957 do
958 { // Store confs until we accumulate more prob than needed - and, if optimizing,
959 // store also the rest of the last layer
960 while(generator.advanceToNextConfigurationWithinLayer())
961 {
962 this->template addConfILG<tgetConfs>(generator);
963 prob_so_far += *(tprobs-1); // The just-stored probability
964 if(prob_so_far >= target_total_prob)
965 {
966 if (optimize)
967 {
968 while(generator.advanceToNextConfigurationWithinLayer())
969 this->template addConfILG<tgetConfs>(generator);
970 break;
971 }
972 else
973 return;
974 }
975 }
976 if(prob_so_far >= target_total_prob)
977 break;
978
979 last_switch = this->_confs_no;
980 prob_at_last_switch = prob_so_far;
981
982 layer_delta = sum_above - log1p(-prob_so_far);
983 layer_delta = (std::max)((std::min)(layer_delta, -0.1), -5.0);
984 } while(generator.nextLayer(layer_delta));
985
986 if(!optimize || prob_so_far <= target_total_prob)
987 return;
988
989 // Right. We have extra configurations and we have been asked to produce an optimal p-set, so
990 // now we shall trim unneeded configurations, using an algorithm dubbed "quicktrim"
991 // - similar to the quickselect algorithm, except that we use the cumulative sum of elements
992 // left of pivot to decide whether to go left or right, instead of the positional index.
993 // We'll be sorting by the prob array, permuting the other ones in parallel.
994
995 int* conf_swapspace = nullptr;
996 constexpr_if(tgetConfs)
997 conf_swapspace = reinterpret_cast<int*>(malloc(this->allDimSizeofInt));
998
999 size_t start = last_switch;
1000 size_t end = this->_confs_no;
1001 double sum_to_start = prob_at_last_switch;
1002
1003 while(start < end)
1004 {
1005 // Partition part
1006 size_t len = end - start;
1007#if ISOSPEC_BUILDING_R
1008 size_t pivot = len/2 + start;
1009#else
1010 size_t pivot = random_gen() % len + start; // Using Mersenne twister directly - we don't
1011 // need a very uniform distribution just for pivot
1012 // selection
1013#endif
1014 double pprob = this->_probs[pivot];
1015 swap<tgetConfs>(pivot, end-1, conf_swapspace);
1016
1017 double new_csum = sum_to_start;
1018
1019 size_t loweridx = start;
1020 for(size_t ii = start; ii < end-1; ii++)
1021 if(this->_probs[ii] > pprob)
1022 {
1023 swap<tgetConfs>(ii, loweridx, conf_swapspace);
1024 new_csum += this->_probs[loweridx];
1025 loweridx++;
1026 }
1027
1028 swap<tgetConfs>(end-1, loweridx, conf_swapspace);
1029
1030 // Selection part
1031 if(new_csum < target_total_prob)
1032 {
1033 start = loweridx + 1;
1034 sum_to_start = new_csum + this->_probs[loweridx];
1035 }
1036 else
1037 end = loweridx;
1038 }
1039
1040 constexpr_if(tgetConfs)
1041 free(conf_swapspace);
1042
1043 if(end <= current_size/2)
1044 // Overhead in memory of 2x or more, shrink to fit
1045 this->template reallocate_memory<tgetConfs>(end);
1046
1047 this->_confs_no = end;
1048}
1049
1050template void FixedEnvelope::total_prob_init<true>(Iso&& iso, double target_total_prob, bool optimize);
1051template void FixedEnvelope::total_prob_init<false>(Iso&& iso, double target_total_prob, bool optimize);
1052
1053template<bool tgetConfs> void FixedEnvelope::stochastic_init(Iso&& iso, size_t _no_molecules, double _precision, double _beta_bias)
1054{
1055 IsoStochasticGenerator generator(std::move(iso), _no_molecules, _precision, _beta_bias);
1056
1057 this->allDim = generator.getAllDim();
1058 this->allDimSizeofInt = this->allDim * sizeof(int);
1059
1060 this->reallocate_memory<tgetConfs>(ISOSPEC_INIT_TABLE_SIZE);
1061
1062 while(generator.advanceToNextConfiguration())
1063 addConfILG<tgetConfs, IsoStochasticGenerator>(generator);
1064}
1065
1066template void FixedEnvelope::stochastic_init<true>(Iso&& iso, size_t _no_molecules, double _precision, double _beta_bias);
1067template void FixedEnvelope::stochastic_init<false>(Iso&& iso, size_t _no_molecules, double _precision, double _beta_bias);
1068
1069double FixedEnvelope::empiric_average_mass()
1070{
1071 double ret = 0.0;
1072 for(size_t ii = 0; ii < _confs_no; ii++)
1073 {
1074 ret += _masses[ii] * _probs[ii];
1075 }
1076 return ret / get_total_prob();
1077}
1078
1079double FixedEnvelope::empiric_variance()
1080{
1081 double ret = 0.0;
1082 double avg = empiric_average_mass();
1083 for(size_t ii = 0; ii < _confs_no; ii++)
1084 {
1085 double msq = _masses[ii] - avg;
1086 ret += msq * msq * _probs[ii];
1087 }
1088
1089 return ret / get_total_prob();
1090}
1091
1092FixedEnvelope FixedEnvelope::Binned(Iso&& iso, double target_total_prob, double bin_width, double bin_middle)
1093{
1094 FixedEnvelope ret;
1095
1096 double min_mass = iso.getLightestPeakMass();
1097 double range_len = iso.getHeaviestPeakMass() - min_mass;
1098 size_t no_bins = static_cast<size_t>(range_len / bin_width) + 2;
1099 double half_width = 0.5*bin_width;
1100 double hwmm = half_width-bin_middle;
1101 size_t idx_min = floor((min_mass + hwmm) / bin_width);
1102 size_t idx_max = idx_min + no_bins;
1103
1104 double* acc;
1105# if ISOSPEC_GOT_MMAN
1106 acc = reinterpret_cast<double*>(mmap(nullptr, sizeof(double)*no_bins, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
1107#else
1108 // This will probably crash for large molecules and high resolutions...
1109 acc = reinterpret_cast<double*>(calloc(no_bins, sizeof(double)));
1110#endif
1111 if(acc == NULL)
1112 throw std::bad_alloc();
1113
1114 acc -= idx_min;
1115
1116 IsoLayeredGenerator ITG(std::move(iso));
1117
1118
1119 bool non_empty;
1120 while((non_empty = ITG.advanceToNextConfiguration()) && ITG.prob() == 0.0)
1121 {}
1122
1123 if(non_empty)
1124 {
1125 double accum_prob = ITG.prob();
1126 size_t nonzero_idx = floor((ITG.mass() + hwmm)/bin_width);
1127 acc[nonzero_idx] = accum_prob;
1128
1129 while(ITG.advanceToNextConfiguration() && accum_prob < target_total_prob)
1130 {
1131 double prob = ITG.prob();
1132 accum_prob += prob;
1133 size_t bin_idx = floor((ITG.mass() + hwmm)/bin_width);
1134 acc[bin_idx] += prob;
1135 }
1136
1137 // Making the assumption that there won't be gaps of more than 10 Da in the spectrum. This is true for all
1138 // molecules made of natural elements.
1139 size_t distance_10da = static_cast<size_t>(10.0/bin_width) + 1;
1140
1141 size_t empty_steps = 0;
1142
1143 ret.reallocate_memory<false>(ISOSPEC_INIT_TABLE_SIZE);
1144
1145 for(size_t ii = nonzero_idx; ii >= idx_min && empty_steps < distance_10da; ii--)
1146 {
1147 if(acc[ii] > 0.0)
1148 {
1149 empty_steps = 0;
1150 ret.store_conf(static_cast<double>(ii)*bin_width + bin_middle, acc[ii]);
1151 }
1152 else
1153 empty_steps++;
1154 }
1155
1156 empty_steps = 0;
1157 for(size_t ii = nonzero_idx+1; ii < idx_max && empty_steps < distance_10da; ii++)
1158 {
1159 if(acc[ii] > 0.0)
1160 {
1161 empty_steps = 0;
1162 ret.store_conf(static_cast<double>(ii)*bin_width + bin_middle, acc[ii]);
1163 }
1164 else
1165 empty_steps++;
1166 }
1167 }
1168
1169 acc += idx_min;
1170
1171# if ISOSPEC_GOT_MMAN
1172 munmap(acc, sizeof(double)*no_bins);
1173#else
1174 free(acc);
1175#endif
1176
1177 return ret;
1178}
1179
1180} // namespace IsoSpec