-
Notifications
You must be signed in to change notification settings - Fork 77
/
ebwt_search.cpp
3463 lines (3325 loc) · 114 KB
/
ebwt_search.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
#include <cassert>
#include <stdexcept>
#include <getopt.h>
#include <vector>
#include <time.h>
#ifndef _WIN32
#include <dirent.h>
#include <signal.h>
#endif
#include "aligner.h"
#include "aligner_0mm.h"
#include "aligner_1mm.h"
#include "aligner_23mm.h"
#include "aligner_metrics.h"
#include "aligner_seed_mm.h"
#include "alphabet.h"
#include "assert_helpers.h"
#include "bitset.h"
#include "ds.h"
#include "ebwt.h"
#include "ebwt_search.h"
#include "endian_swap.h"
#include "formats.h"
#include "hit.h"
#include "pat.h"
#include "range_cache.h"
#include "sam.h"
#include "sequence_io.h"
#include "threading.h"
#include "tokenize.h"
#ifdef CHUD_PROFILING
#include <CHUD/CHUD.h>
#endif
// File-scope option values and shared state for the aligner front end.
// resetOptions() assigns the option variables their defaults and
// parseOptions() then overwrites them from the command line. Variables
// declared without 'static' have external linkage.
static int FNAME_SIZE; // resetOptions() sets this to 200; presumably a filename buffer length -- TODO confirm
// Thread counter: a std::atomic<int> when compiled as C++11 or later,
// otherwise a plain int declared alongside thread_counter_mutex.
// NOTE(review): std::atomic is declared in <atomic>, which is not included
// here; presumably a project header provides it -- confirm.
#if (__cplusplus >= 201103L)
#include <thread>
static std::atomic<int> thread_counter;
#else
static int thread_counter;
static MUTEX_T thread_counter_mutex;
#endif
using namespace std;
static EList<string> mates1; // mated reads (first mate)
static EList<string> mates2; // mated reads (second mate)
static EList<string> mates12; // mated reads (1st/2nd interleaved in 1 file)
static string adjustedEbwtFileBase; // presumably the resolved index basename -- set outside this chunk, TODO confirm
static bool verbose; // be talkative
static bool startVerbose; // be talkative at startup
bool quiet; // print nothing but the alignments
static int sanityCheck; // enable expensive sanity checks
static int format; // read format (resetOptions() default: FASTQ)
static string origString; // reference text, or filename(s)
static int seed; // srandom() seed
static int timing; // whether to report basic timing data
static bool allHits; // for multihits, report just one
static bool rangeMode; // report BWT ranges instead of ref locs
static int showVersion; // just print version and quit?
static int ipause; // pause before matching?
static uint32_t qUpto; // max # of queries to read
static int trim5; // amount to trim from 5' end
static int trim3; // amount to trim from 3' end
static int reportOpps; // whether to report # of other mappings
static int offRate; // keep default offRate
static int isaRate; // keep default isaRate
static int mismatches; // allow 0 mismatches by default
static bool solexaQuals; // quality strings are solexa quals, not phred, and subtract 64 (not 33)
static bool phred64Quals; // quality chars are phred, but must subtract 64 (not 33)
static bool integerQuals; // quality strings are space-separated strings of integers, not ASCII
static int maqLike; // do maq-like searching
static int seedLen; // seed length (changed in Maq 0.6.4 from 24)
static int seedMms; // # mismatches allowed in seed (maq's -n)
static int qualThresh; // max qual-weighted hamming dist (maq's -e)
// NOTE(review): the next two variables carry the same description; the usage
// text reads "--maxbts ... (default: 125, 800 for --best)" -- confirm which
// variable applies in which mode.
static int maxBtsBetter; // max # backtracks allowed in half-and-half mode
static int maxBts; // max # backtracks allowed in half-and-half mode
static int nthreads; // number of pthreads operating concurrently
static bool reorder; // reorder SAM output when running multi-threaded
static int thread_ceiling; // maximum number of threads user wants bowtie to use
static string thread_stealing_dir; // keep track of pids in this directory
static bool thread_stealing; // true iff thread stealing is in use
static output_types outType; // style of output
static bool noRefNames; // true -> print reference indexes; not names
static string dumpAlBase; // basename of same-format files to dump aligned reads to
static string dumpUnalBase; // basename of same-format files to dump unaligned reads to
static string dumpMaxBase; // basename of same-format files to dump reads with more than -m valid alignments to
static uint32_t khits; // number of hits per read; >1 is much slower
static uint32_t mhits; // don't report any hits if there are > mhits
static bool better; // true -> guarantee alignments from best possible stratum
static bool strata; // true -> don't stop at stratum boundaries
static int partitionSz; // output a partitioning key in first field
static int readsPerBatch; // # reads to read from input file at once
static size_t outBatchSz; // # alignments to write to output file at once
static bool noMaqRound; // true -> don't round quals to nearest 10 like maq
static bool fileParallel; // separate threads read separate input files in parallel
static bool useShmem; // use shared memory to hold the index
static bool useMm; // use memory-mapped files to hold the index
static bool mmSweep; // sweep through memory-mapped files immediately after mapping
static bool stateful; // use stateful aligners
static uint32_t prefetchWidth; // number of reads to process in parallel w/ --stateful
static uint32_t minInsert; // minimum insert size (Maq = 0, SOAP = 400)
static uint32_t maxInsert; // maximum insert size (Maq = 250, SOAP = 600)
static bool mate1fw; // -1 mate aligns in fw orientation on fw strand
static bool mate2fw; // -2 mate aligns in rc orientation on fw strand
static bool mateFwSet; // true -> user set --ff/--fr/--rf
static uint32_t mixedThresh; // threshold for when to switch to paired-end mixed mode (see aligner.h)
static uint32_t mixedAttemptLim; // number of attempts to make in "mixed mode" before giving up on orientation
static bool dontReconcileMates; // suppress pairwise all-versus-all way of resolving mates
static uint32_t cacheLimit; // ranges w/ size > limit will be cached
static uint32_t cacheSize; // # words per range cache
static int offBase; // offsets are 0-based by default, but configurable
static bool tryHard; // set very high maxBts, mixedAttemptLim
static uint32_t skipReads; // # reads/read pairs to skip
static bool nofw; // don't align fw orientation of read
static bool norc; // don't align rc orientation of read
static bool strandFix; // attempt to fix strand bias
static bool stats; // print performance stats
static int chunkPoolMegabytes; // max MB to dedicate to best-first search frames per thread
static int chunkSz; // size of single chunk disbursed by ChunkPool
static bool chunkVerbose; // have chunk allocator output status messages?
static bool useV1; // resetOptions() default: true; presumably toggled by --pev2 -- TODO confirm
static bool reportSe; // set by --reportse; meaning not visible in this chunk
static size_t fastaContLen; // first value of the -F pair (continuous-FASTA mode)
static size_t fastaContFreq; // second value of the -F pair (continuous-FASTA mode)
static bool hadoopOut; // print Hadoop status and summary messages
static bool fullRef; // print entire reference name instead of just up to 1st space
static bool samNoQnameTrunc; // don't truncate QNAME field at first whitespace
static bool samNoHead; // don't print any header lines in SAM output
static bool samNoSQ; // don't print @SQ header lines
static string rgs; // SAM outputs for @RG header line
static Bitset suppressOuts(64); // output fields to suppress
static bool sampleMax; // whether to report a random alignment when maxed-out via -m/-M
static int defaultMapq; // default mapping quality to print in SAM mode
static bool printCost; // true -> print stratum and cost
bool showSeed; // true -> print per-read pseudo-random seed
static EList<string> qualities; // comma-separated values given to -Q/--quals
static EList<string> qualities1; // comma-separated values given to --Q1
static EList<string> qualities2; // comma-separated values given to --Q2
static string wrapper; // Type of wrapper script
bool gAllowMateContainment; // true -> alignments where one mate lies inside the other are valid
bool noUnal; // don't print unaligned reads
string ebwtFile; // read serialized Ebwt from this file
MUTEX_T gLock; // global mutex; what it guards is not visible in this chunk
static void resetOptions() {
mates1.clear();
mates2.clear();
mates12.clear();
ebwtFile = "";
adjustedEbwtFileBase = "";
verbose = 0;
startVerbose = 0;
quiet = false;
sanityCheck = 0; // enable expensive sanity checks
format = FASTQ; // default read format is FASTQ
origString = ""; // reference text, or filename(s)
seed = 0; // srandom() seed
timing = 0; // whether to report basic timing data
allHits = false; // for multihits, report just one
rangeMode = false; // report BWT ranges instead of ref locs
showVersion = 0; // just print version and quit?
ipause = 0; // pause before maching?
qUpto = 0xffffffff; // max # of queries to read
trim5 = 0; // amount to trim from 5' end
trim3 = 0; // amount to trim from 3' end
reportOpps = 0; // whether to report # of other mappings
offRate = -1; // keep default offRate
isaRate = -1; // keep default isaRate
mismatches = 0; // allow 0 mismatches by default
solexaQuals = false; // quality strings are solexa quals, not phred, and subtract 64 (not 33)
phred64Quals = false; // quality chars are phred, but must subtract 64 (not 33)
integerQuals = false; // quality strings are space-separated strings of integers, not ASCII
maqLike = 1; // do maq-like searching
seedLen = 28; // seed length (changed in Maq 0.6.4 from 24)
seedMms = 2; // # mismatches allowed in seed (maq's -n)
qualThresh = 70; // max qual-weighted hamming dist (maq's -e)
maxBtsBetter = 125; // max # backtracks allowed in half-and-half mode
maxBts = 800; // max # backtracks allowed in half-and-half mode
nthreads = 1; // number of pthreads operating concurrently
reorder = false; // reorder SAM output
thread_ceiling = 0; // max # threads user asked for
thread_stealing_dir = ""; // keep track of pids in this directory
thread_stealing = false; // true iff thread stealing is in use
FNAME_SIZE = 200;
outType = OUTPUT_FULL; // style of output
noRefNames = false; // true -> print reference indexes; not names
dumpAlBase = ""; // basename of same-format files to dump aligned reads to
dumpUnalBase = ""; // basename of same-format files to dump unaligned reads to
dumpMaxBase = ""; // basename of same-format files to dump reads with more than -m valid alignments to
khits = 1; // number of hits per read; >1 is much slower
mhits = 0xffffffff; // don't report any hits if there are > mhits
better = false; // true -> guarantee alignments from best possible stratum
strata = false; // true -> don't stop at stratum boundaries
partitionSz = 0; // output a partitioning key in first field
readsPerBatch = 16; // # reads to read from input file at once
outBatchSz = 16; // # alignments to wrote to output file at once
noMaqRound = false; // true -> don't round quals to nearest 10 like maq
fileParallel = false; // separate threads read separate input files in parallel
useShmem = false; // use shared memory to hold the index
useMm = false; // use memory-mapped files to hold the index
mmSweep = false; // sweep through memory-mapped files immediately after mapping
stateful = false; // use stateful aligners
prefetchWidth = 1; // number of reads to process in parallel w/ --stateful
minInsert = 0; // minimum insert size (Maq = 0, SOAP = 400)
maxInsert = 250; // maximum insert size (Maq = 250, SOAP = 600)
mate1fw = true; // -1 mate aligns in fw orientation on fw strand
mate2fw = false; // -2 mate aligns in rc orientation on fw strand
mateFwSet = false; // true -> user set mate1fw/mate2fw with --ff/--fr/--rf
mixedThresh = 4; // threshold for when to switch to paired-end mixed mode (see aligner.h)
mixedAttemptLim = 100; // number of attempts to make in "mixed mode" before giving up on orientation
dontReconcileMates = true; // suppress pairwise all-versus-all way of resolving mates
cacheLimit = 5; // ranges w/ size > limit will be cached
cacheSize = 0; // # words per range cache
offBase = 0; // offsets are 0-based by default, but configurable
tryHard = false; // set very high maxBts, mixedAttemptLim
skipReads = 0; // # reads/read pairs to skip
nofw = false; // don't align fw orientation of read
norc = false; // don't align rc orientation of read
strandFix = true; // attempt to fix strand bias
stats = false; // print performance stats
chunkPoolMegabytes = 64; // max MB to dedicate to best-first search frames per thread
chunkSz = 256; // size of single chunk disbursed by ChunkPool (in KB)
chunkVerbose = false; // have chunk allocator output status messages?
useV1 = true;
reportSe = false;
fastaContLen = 0;
fastaContFreq = 0;
hadoopOut = false; // print Hadoop status and summary messages
fullRef = false; // print entire reference name instead of just up to 1st space
samNoQnameTrunc = false; // don't truncate at first whitespace?
samNoHead = false; // don't print any header lines in SAM output
samNoSQ = false; // don't print @SQ header lines
rgs = ""; // SAM outputs for @RG header line
suppressOuts.clear(); // output fields to suppress
sampleMax = false;
defaultMapq = 255;
printCost = false; // true -> print cost and stratum
showSeed = false; // true -> print per-read pseudo-random seed
qualities.clear();
qualities1.clear();
qualities2.clear();
wrapper.clear();
gAllowMateContainment = false; // true -> alignments where one mate lies inside the other are valid
noUnal = false; // true -> do not report unaligned reads
}
// Short-option specification handed to getopt_long() by parseOptions();
// a ':' after a letter marks an option that takes an argument.
// NOTE(review): 'z' is listed twice (once bare, once as "z:") -- confirm
// which form is intended.
static const char *short_options = "fF:qbzhcu:rv:s:at3:5:o:e:n:l:w:p:k:m:M:1:2:I:X:x:z:B:ySCQ:";
// Codes used in long_options for long options that have no
// single-character equivalent. Numbering starts at 256, i.e. above the
// single-character codes used for the short options, and every later
// enumerator takes the next value, so the order here is significant.
// NOTE(review): ARG_integerQuals is the only enumerator not in ALL_CAPS.
enum {
ARG_ORIG = 256,
ARG_SEED,
ARG_RANGE,
ARG_SOLEXA_QUALS,
ARG_MAXBTS,
ARG_VERBOSE,
ARG_STARTVERBOSE,
ARG_QUIET,
ARG_FAST,
ARG_AL,
ARG_UN,
ARG_MAXDUMP,
ARG_REFIDX,
ARG_SANITY,
ARG_OLDBEST,
ARG_BETTER,
ARG_BEST,
ARG_ISARATE,
ARG_PARTITION,
ARG_READS_PER_BATCH,
ARG_integerQuals,
ARG_NOMAQROUND,
ARG_FILEPAR,
ARG_SHMEM,
ARG_MM,
ARG_MMSWEEP,
ARG_STATEFUL,
ARG_PREFETCH_WIDTH,
ARG_FF,
ARG_FR,
ARG_RF,
ARG_MIXED_ATTEMPTS,
ARG_NO_RECONCILE,
ARG_CACHE_LIM,
ARG_CACHE_SZ,
ARG_NO_FW,
ARG_NO_RC,
ARG_SKIP,
ARG_STRAND_FIX,
ARG_STATS,
ARG_ONETWO,
ARG_PHRED64,
ARG_PHRED33,
ARG_CHUNKMBS,
ARG_CHUNKSZ,
ARG_CHUNKVERBOSE,
ARG_STRATA,
ARG_PEV2,
ARG_REPORTSE,
ARG_HADOOPOUT,
ARG_FUZZY,
ARG_FULLREF,
ARG_USAGE,
ARG_SAM_NO_QNAME_TRUNC,
ARG_SAM_NOHEAD,
ARG_SAM_NOSQ,
ARG_SAM_RG,
ARG_SUPPRESS_FIELDS,
ARG_DEFAULT_MAPQ,
ARG_COST,
ARG_SHOWSEED,
ARG_QUALS1,
ARG_QUALS2,
ARG_ALLOW_CONTAIN,
ARG_WRAPPER,
ARG_INTERLEAVED_FASTQ,
ARG_SAM_NO_UNAL,
ARG_THREAD_CEILING,
ARG_THREAD_PIDDIR,
ARG_REORDER_SAM,
};
/**
 * Long-option table handed to getopt_long() by parseOptions().
 *
 * Each row is {name, has_arg, flag, val}. When 'flag' is 0, getopt_long()
 * returns 'val' and parseOptions() dispatches on it; when 'flag' points at
 * an int (--pause, --reportopps, --version), getopt_long() stores 'val'
 * there instead.
 *
 * "--mixthresh" is routed to 'z', the case in parseOptions() that assigns
 * mixedThresh. It used to be routed to 'x', which parseOptions() treats as
 * the index basename, so passing --mixthresh overwrote ebwtFile.
 *
 * NOTE(review): "--phased" is also routed to 'z' but declares no argument,
 * while the 'z' case parses an integer from optarg -- confirm whether
 * --phased should be removed or given its own code.
 */
static struct option long_options[] = {
    {(char*)"verbose",            no_argument,       0,            ARG_VERBOSE},
    {(char*)"startverbose",       no_argument,       0,            ARG_STARTVERBOSE},
    {(char*)"quiet",              no_argument,       0,            ARG_QUIET},
    {(char*)"sanity",             no_argument,       0,            ARG_SANITY},
    {(char*)"pause",              no_argument,       &ipause,      1},
    {(char*)"orig",               required_argument, 0,            ARG_ORIG},
    {(char*)"all",                no_argument,       0,            'a'},
    {(char*)"solexa-quals",       no_argument,       0,            ARG_SOLEXA_QUALS},
    {(char*)"integer-quals",      no_argument,       0,            ARG_integerQuals},
    {(char*)"time",               no_argument,       0,            't'},
    {(char*)"trim3",              required_argument, 0,            '3'},
    {(char*)"trim5",              required_argument, 0,            '5'},
    {(char*)"seed",               required_argument, 0,            ARG_SEED},
    {(char*)"qupto",              required_argument, 0,            'u'},
    {(char*)"al",                 required_argument, 0,            ARG_AL},
    {(char*)"un",                 required_argument, 0,            ARG_UN},
    {(char*)"max",                required_argument, 0,            ARG_MAXDUMP},
    {(char*)"offrate",            required_argument, 0,            'o'},
    {(char*)"isarate",            required_argument, 0,            ARG_ISARATE},
    {(char*)"reportopps",         no_argument,       &reportOpps,  1},
    {(char*)"version",            no_argument,       &showVersion, 1},
    {(char*)"reads-per-batch",    required_argument, 0,            ARG_READS_PER_BATCH},
    {(char*)"maqerr",             required_argument, 0,            'e'},
    {(char*)"seedlen",            required_argument, 0,            'l'},
    {(char*)"seedmms",            required_argument, 0,            'n'},
    {(char*)"filepar",            no_argument,       0,            ARG_FILEPAR},
    {(char*)"help",               no_argument,       0,            'h'},
    {(char*)"threads",            required_argument, 0,            'p'},
    {(char*)"khits",              required_argument, 0,            'k'},
    {(char*)"mhits",              required_argument, 0,            'm'},
    {(char*)"minins",             required_argument, 0,            'I'},
    {(char*)"maxins",             required_argument, 0,            'X'},
    {(char*)"quals",              required_argument, 0,            'Q'},
    {(char*)"Q1",                 required_argument, 0,            ARG_QUALS1},
    {(char*)"Q2",                 required_argument, 0,            ARG_QUALS2},
    {(char*)"best",               no_argument,       0,            ARG_BEST},
    {(char*)"better",             no_argument,       0,            ARG_BETTER},
    {(char*)"oldbest",            no_argument,       0,            ARG_OLDBEST},
    {(char*)"strata",             no_argument,       0,            ARG_STRATA},
    {(char*)"nomaqround",         no_argument,       0,            ARG_NOMAQROUND},
    {(char*)"refidx",             no_argument,       0,            ARG_REFIDX},
    {(char*)"range",              no_argument,       0,            ARG_RANGE},
    {(char*)"maxbts",             required_argument, 0,            ARG_MAXBTS},
    {(char*)"phased",             no_argument,       0,            'z'},
    {(char*)"partition",          required_argument, 0,            ARG_PARTITION},
    {(char*)"stateful",           no_argument,       0,            ARG_STATEFUL},
    {(char*)"prewidth",           required_argument, 0,            ARG_PREFETCH_WIDTH},
    {(char*)"ff",                 no_argument,       0,            ARG_FF},
    {(char*)"fr",                 no_argument,       0,            ARG_FR},
    {(char*)"rf",                 no_argument,       0,            ARG_RF},
    // Routed to 'z' (the mixedThresh handler), not 'x' (the index basename).
    {(char*)"mixthresh",          required_argument, 0,            'z'},
    {(char*)"pairtries",          required_argument, 0,            ARG_MIXED_ATTEMPTS},
    {(char*)"noreconcile",        no_argument,       0,            ARG_NO_RECONCILE},
    {(char*)"cachelim",           required_argument, 0,            ARG_CACHE_LIM},
    {(char*)"cachesz",            required_argument, 0,            ARG_CACHE_SZ},
    {(char*)"nofw",               no_argument,       0,            ARG_NO_FW},
    {(char*)"norc",               no_argument,       0,            ARG_NO_RC},
    {(char*)"offbase",            required_argument, 0,            'B'},
    {(char*)"tryhard",            no_argument,       0,            'y'},
    {(char*)"skip",               required_argument, 0,            's'},
    {(char*)"strandfix",          no_argument,       0,            ARG_STRAND_FIX},
    {(char*)"stats",              no_argument,       0,            ARG_STATS},
    {(char*)"12",                 required_argument, 0,            ARG_ONETWO},
    {(char*)"phred33-quals",      no_argument,       0,            ARG_PHRED33},
    {(char*)"phred64-quals",      no_argument,       0,            ARG_PHRED64},
    {(char*)"solexa1.3-quals",    no_argument,       0,            ARG_PHRED64},
    {(char*)"chunkmbs",           required_argument, 0,            ARG_CHUNKMBS},
    {(char*)"chunksz",            required_argument, 0,            ARG_CHUNKSZ},
    {(char*)"chunkverbose",       no_argument,       0,            ARG_CHUNKVERBOSE},
    {(char*)"mm",                 no_argument,       0,            ARG_MM},
    {(char*)"shmem",              no_argument,       0,            ARG_SHMEM},
    {(char*)"mmsweep",            no_argument,       0,            ARG_MMSWEEP},
    {(char*)"pev2",               no_argument,       0,            ARG_PEV2},
    {(char*)"reportse",           no_argument,       0,            ARG_REPORTSE},
    {(char*)"hadoopout",          no_argument,       0,            ARG_HADOOPOUT},
    {(char*)"fullref",            no_argument,       0,            ARG_FULLREF},
    {(char*)"usage",              no_argument,       0,            ARG_USAGE},
    {(char*)"sam",                no_argument,       0,            'S'},
    {(char*)"sam-no-qname-trunc", no_argument,       0,            ARG_SAM_NO_QNAME_TRUNC},
    {(char*)"sam-nohead",         no_argument,       0,            ARG_SAM_NOHEAD},
    {(char*)"sam-nosq",           no_argument,       0,            ARG_SAM_NOSQ},
    {(char*)"sam-noSQ",           no_argument,       0,            ARG_SAM_NOSQ},
    {(char*)"sam-RG",             required_argument, 0,            ARG_SAM_RG},
    {(char*)"suppress",           required_argument, 0,            ARG_SUPPRESS_FIELDS},
    {(char*)"mapq",               required_argument, 0,            ARG_DEFAULT_MAPQ},
    {(char*)"cost",               no_argument,       0,            ARG_COST},
    {(char*)"showseed",           no_argument,       0,            ARG_SHOWSEED},
    {(char*)"allow-contain",      no_argument,       0,            ARG_ALLOW_CONTAIN},
    {(char*)"wrapper",            required_argument, 0,            ARG_WRAPPER},
    {(char*)"interleaved",        required_argument, 0,            ARG_INTERLEAVED_FASTQ},
    {(char*)"no-unal",            no_argument,       0,            ARG_SAM_NO_UNAL},
    {(char*)"thread-ceiling",     required_argument, 0,            ARG_THREAD_CEILING},
    {(char*)"thread-piddir",      required_argument, 0,            ARG_THREAD_PIDDIR},
    {(char*)"reorder",            no_argument,       0,            ARG_REORDER_SAM},
    {(char*)0,                    0,                 0,            0} // terminator
};
/**
 * Print a summary usage message to the provided output stream.
 *
 * The tool name shown is "bowtie-align-l" or "bowtie-align-s" depending on
 * BOWTIE_64BIT_INDEX, or "bowtie" when the 'wrapper' global is "basic-0"
 * (in which case --large-index is listed as well). The --mm and --shmem
 * lines appear only when the matching build macro is defined.
 *
 * When 'wrapper' is empty, a warning that the binary was run directly is
 * additionally written to cerr, regardless of which stream 'out' is.
 *
 * @param out stream that receives the usage text
 */
static void printUsage(ostream& out) {
#ifdef BOWTIE_64BIT_INDEX
    string tool_name = "bowtie-align-l";
#else
    string tool_name = "bowtie-align-s";
#endif
    if(wrapper == "basic-0") {
        tool_name = "bowtie";
    }
    out << "Usage: " << endl
        << tool_name << " [options]* -x <ebwt> {-1 <m1> -2 <m2> | --12 <r> | --interleaved <i> | <s>} [<hit>]" << endl
        << endl
        << " <ebwt> Index filename prefix (minus trailing .X." + gEbwt_ext + ")." << endl
        << " <m1> Comma-separated list of files containing upstream mates (or the" << endl
        << " sequences themselves, if -c is set) paired with mates in <m2>" << endl
        << " <m2> Comma-separated list of files containing downstream mates (or the" << endl
        << " sequences themselves if -c is set) paired with mates in <m1>" << endl
        << " <r> Comma-separated list of files containing Crossbow-style reads. Can be" << endl
        << " a mixture of paired and unpaired. Specify \"-\" for stdin." << endl
        << " <i> Files with interleaved paired-end FASTQ reads." << endl
        << " <s> Comma-separated list of files containing unpaired reads, or the" << endl
        << " sequences themselves, if -c is set. Specify \"-\" for stdin." << endl
        << " <hit> File to write hits to (default: stdout)" << endl
        << "Input:" << endl
        << " -q query input files are FASTQ .fq/.fastq (default)" << endl
        << " -f query input files are (multi-)FASTA .fa/.mfa" << endl
        << " -F k:<int>,i:<int> query input files are continuous FASTA where reads" << endl
        << " are substrings (k-mers) extracted from a FASTA file <s>" << endl
        << " and aligned at offsets 1, 1+i, 1+2i ... end of reference" << endl
        << " -r query input files are raw one-sequence-per-line" << endl
        << " -c query sequences given on cmd line (as <mates>, <singles>)" << endl
        << " -Q/--quals <file> QV file(s) corresponding to CSFASTA inputs; use with -f -C" << endl
        << " --Q1/--Q2 <file> same as -Q, but for mate files 1 and 2 respectively" << endl
        << " -s/--skip <int> skip the first <int> reads/pairs in the input" << endl
        << " -u/--qupto <int> stop after first <int> reads/pairs (excl. skipped reads)" << endl
        << " -5/--trim5 <int> trim <int> bases from 5' (left) end of reads" << endl
        << " -3/--trim3 <int> trim <int> bases from 3' (right) end of reads" << endl
        << " --phred33-quals input quals are Phred+33 (default)" << endl
        << " --phred64-quals input quals are Phred+64 (same as --solexa1.3-quals)" << endl
        << " --solexa-quals input quals are from GA Pipeline ver. < 1.3" << endl
        << " --solexa1.3-quals input quals are from GA Pipeline ver. >= 1.3" << endl
        << " --integer-quals qualities are given as space-separated integers (not ASCII)" << endl;
    if(wrapper == "basic-0") {
        out << " --large-index force usage of a 'large' index, even if a small one is present" << endl;
    }
    out << "Alignment:" << endl
        << " -v <int> report end-to-end hits w/ <=v mismatches; ignore qualities" << endl
        << " or" << endl
        << " -n/--seedmms <int> max mismatches in seed (can be 0-3, default: -n 2)" << endl
        << " -e/--maqerr <int> max sum of mismatch quals across alignment for -n (def: 70)" << endl
        << " -l/--seedlen <int> seed length for -n (default: 28)" << endl
        << " --nomaqround disable Maq-like quality rounding for -n (nearest 10 <= 30)" << endl
        << " -I/--minins <int> minimum insert size for paired-end alignment (default: 0)" << endl
        << " -X/--maxins <int> maximum insert size for paired-end alignment (default: 250)" << endl
        << " --fr/--rf/--ff -1, -2 mates align fw/rev, rev/fw, fw/fw (default: --fr)" << endl
        << " --nofw/--norc do not align to forward/reverse-complement reference strand" << endl
        << " --maxbts <int> max # backtracks for -n 2/3 (default: 125, 800 for --best)" << endl
        << " --pairtries <int> max # attempts to find mate for anchor hit (default: 100)" << endl
        << " -y/--tryhard try hard to find valid alignments, at the expense of speed" << endl
        << " --chunkmbs <int> max megabytes of RAM for best-first search frames (def: 64)" << endl
        << " --reads-per-batch # of reads to read from input file at once (default: 16)" << endl
        << "Reporting:" << endl
        << " -k <int> report up to <int> good alignments per read (default: 1)" << endl
        << " -a/--all report all alignments per read (much slower than low -k)" << endl
        << " -m <int> suppress all alignments if > <int> exist (def: no limit)" << endl
        << " -M <int> like -m, but reports 1 random hit (MAPQ=0); requires --best" << endl
        << " --best hits guaranteed best stratum; ties broken by quality" << endl
        << " --strata hits in sub-optimal strata aren't reported (requires --best)" << endl
        << "Output:" << endl
        << " -t/--time print wall-clock time taken by search phases" << endl
        << " -B/--offbase <int> leftmost ref offset = <int> in bowtie output (default: 0)" << endl
        << " --quiet print nothing but the alignments" << endl
        << " --refidx refer to ref. seqs by 0-based index rather than name" << endl
        << " --al <fname> write aligned reads/pairs to file(s) <fname>" << endl
        << " --un <fname> write unaligned reads/pairs to file(s) <fname>" << endl
        << " --no-unal suppress SAM records for unaligned reads" << endl
        << " --max <fname> write reads/pairs over -m limit to file(s) <fname>" << endl
        << " --suppress <cols> suppresses given columns (comma-delim'ed) in default output" << endl
        << " --fullref write entire ref name (default: only up to 1st space)" << endl
        << "SAM:" << endl
        << " -S/--sam write hits in SAM format" << endl
        << " --mapq <int> default mapping quality (MAPQ) to print for SAM alignments" << endl
        // The next two lines used to read "supppress" (typo in the help text).
        << " --sam-nohead suppress header lines (starting with @) for SAM output" << endl
        << " --sam-nosq suppress @SQ header lines for SAM output" << endl
        << " --sam-RG <text> add <text> (usually \"lab=value\") to @RG line of SAM header" << endl
        << "Performance:" << endl
        << " -o/--offrate <int> override offrate of index; must be >= index's offrate" << endl
        << " -p/--threads <int> number of alignment threads to launch (default: 1)" << endl
#ifdef BOWTIE_MM
        << " --mm use memory-mapped I/O for index; many 'bowtie's can share" << endl
#endif
#ifdef BOWTIE_SHARED_MEM
        << " --shmem use shared mem for index; many 'bowtie's can share" << endl
#endif
        << "Other:" << endl
        << " --seed <int> seed for random number generator" << endl
        << " --verbose verbose output (for debugging)" << endl
        << " --version print version information and quit" << endl
        << " -h/--help print this usage message" << endl
        ;
    // No wrapper type was supplied, so the binary was launched directly;
    // this notice always goes to cerr, not to 'out'.
    if(wrapper.empty()) {
        cerr << endl
             << "*** Warning ***" << endl
             << tool_name << " was run directly. It is recommended that you run the wrapper script 'bowtie' instead." << endl
             << endl;
    }
}
/**
 * Parse a base-10 integer out of 'arg' and enforce lower <= value <= upper.
 *
 * On failure -- 'arg' is NULL, 'arg' does not start with a number, or the
 * value is out of range -- print 'errmsg' followed by the usage message to
 * cerr and throw the int 1.
 *
 * The previous check, 'endPtr != NULL', was always true after strtol(), so
 * non-numeric input such as "abc" was silently accepted as 0. strtol()
 * reports "no digits consumed" by leaving endPtr equal to the start of the
 * input, which is what is tested now. Characters following a valid number
 * are still ignored, as before.
 *
 * @param lower  smallest acceptable value
 * @param upper  largest acceptable value
 * @param errmsg message printed to cerr when the argument is rejected
 * @param arg    NUL-terminated text to parse
 * @return the parsed value
 * @throws int (value 1) when the argument is rejected
 */
static int parseInt(int lower, int upper, const char *errmsg, const char *arg) {
    long l = 0;
    bool ok = false;
    if (arg != NULL) {
        char *endPtr = NULL;
        l = strtol(arg, &endPtr, 10);
        // endPtr == arg means strtol() performed no conversion at all.
        ok = (endPtr != arg) && l >= lower && l <= upper;
    }
    if (!ok) {
        cerr << errmsg << endl;
        printUsage(cerr);
        throw 1;
    }
    return (int32_t)l;
}
/**
 * Convenience overload: take the text from getopt's 'optarg' and use
 * INT_MAX as the upper bound.
 */
static int parseInt(int lower, const char *errmsg) {
    const char *const text = optarg;
    return parseInt(lower, INT_MAX, errmsg, text);
}
/**
 * Convenience overload: parse the caller-supplied text 'arg' with
 * INT_MAX as the upper bound.
 */
static int parseInt(int lower, const char *errmsg, const char *arg) {
    const int noUpperLimit = INT_MAX;
    return parseInt(lower, noUpperLimit, errmsg, arg);
}
/**
 * Convenience overload: take the text from getopt's 'optarg' and check
 * it against the caller-supplied bounds [lower, upper].
 */
static int parseInt(int lower, int upper, const char *errmsg) {
    const char *const text = optarg;
    return parseInt(lower, upper, errmsg, text);
}
/**
 * Parse a value of type T from the string 's' using stream extraction.
 *
 * The result is value-initialised first, so when extraction fails the
 * function returns T() (0 for arithmetic types, an empty string for
 * std::string) and never an indeterminate value.
 *
 * @param s NUL-terminated text to parse
 * @return the extracted value, or T() if nothing could be extracted
 */
template<typename T>
T parse(const char *s) {
    T tmp = T();
    std::stringstream ss(s);
    ss >> tmp;
    return tmp;
}
/**
 * Parse a pair of Ts from the string 'str', whose two fields are
 * separated by 'delim'.
 *
 * The input must tokenize into at least two fields; any fields after the
 * second are ignored. Previously the second field was read without a size
 * check, so input with no delimiter indexed past the end of the token
 * list. Such input now produces a diagnostic on cerr and throws the int 1,
 * the same failure convention parseInt() uses.
 *
 * @param str   NUL-terminated text of the form "<first><delim><second>"
 * @param delim field separator
 * @return the two parsed values as (first, second)
 * @throws int (value 1) when fewer than two fields are present
 */
template<typename T>
pair<T, T> parsePair(const char *str, char delim) {
    string s(str);
    EList<string> ss;
    tokenize(s, delim, ss);
    if(ss.size() < 2) {
        cerr << "Error: expected two values separated by '" << delim
             << "' but got \"" << s << "\"" << endl;
        throw 1;
    }
    pair<T, T> ret;
    ret.first = parse<T>(ss[0].c_str());
    ret.second = parse<T>(ss[1].c_str());
    return ret;
}
/**
 * Read command-line arguments.
 *
 * Walks argv with getopt_long (using the file-level short_options and
 * long_options tables), stores each recognized option into the
 * corresponding file-level setting, and afterwards cross-checks and
 * normalizes settings that depend on one another.
 *
 * Throws int 0 after printing usage for -h / ARG_USAGE, and int 1 for an
 * unrecognized option or an invalid combination of options.
 */
static void parseOptions(int argc, const char **argv) {
    int option_index = 0;
    int next_option;
    if(startVerbose) { cerr << "Parsing options: "; logTime(cerr, true); }
    do {
        next_option = getopt_long(
            argc, const_cast<char**>(argv),
            short_options, long_options, &option_index);
        switch (next_option) {
            case ARG_WRAPPER: wrapper = optarg; break;
            case '1': tokenize(optarg, ",", mates1); break;
            case '2': tokenize(optarg, ",", mates2); break;
            case ARG_ONETWO: tokenize(optarg, ",", mates12); format = TAB_MATE; break;
            case ARG_INTERLEAVED_FASTQ: tokenize(optarg, ",", mates12); format = INTERLEAVED; break;
            case 'f': format = FASTA; break;
            case 'F': {
                format = FASTA_CONT;
                pair<size_t, size_t> p = parsePair<size_t>(optarg, ',');
                fastaContLen = p.first;
                fastaContFreq = p.second;
                break;
            }
            case 'q': format = FASTQ; break;
            case 'r': format = RAW; break;
            case 'c': format = CMDLINE; break;
            case 'I':
                minInsert = (uint32_t)parseInt(0, "-I arg must be positive");
                break;
            case 'X':
                maxInsert = (uint32_t)parseInt(1, "-X arg must be at least 1");
                break;
            case 'x':
                ebwtFile = optarg;
                break;
            case 's':
                skipReads = (uint32_t)parseInt(0, "-s arg must be positive");
                break;
            case ARG_FF: mate1fw = true; mate2fw = true; mateFwSet = true; break;
            case ARG_RF: mate1fw = false; mate2fw = true; mateFwSet = true; break;
            case ARG_FR: mate1fw = true; mate2fw = false; mateFwSet = true; break;
            case ARG_RANGE: rangeMode = true; break;
            case 'S': outType = OUTPUT_SAM; break;
            case ARG_SHMEM: useShmem = true; break;
            case ARG_SHOWSEED: showSeed = true; break;
            case ARG_ALLOW_CONTAIN: gAllowMateContainment = true; break;
            case ARG_SUPPRESS_FIELDS: {
                EList<string> supp;
                tokenize(optarg, ",", supp);
                for(size_t i = 0; i < supp.size(); i++) {
                    // Field numbers are 1-based on the command line
                    int ii = parseInt(1, "--suppress arg must be at least 1", supp[i].c_str());
                    suppressOuts.set(ii-1);
                }
                break;
            }
            case ARG_MM: {
#ifdef BOWTIE_MM
                useMm = true;
                break;
#else
                cerr << "Memory-mapped I/O mode is disabled because bowtie was not compiled with" << endl
                     << "BOWTIE_MM defined.  Memory-mapped I/O is not supported under Windows.  If you" << endl
                     << "would like to use memory-mapped I/O on a platform that supports it, please" << endl
                     << "refrain from specifying BOWTIE_MM=0 when compiling Bowtie." << endl;
                throw 1;
#endif
            }
            case ARG_MMSWEEP: mmSweep = true; break;
            case ARG_HADOOPOUT: hadoopOut = true; break;
            case ARG_AL: dumpAlBase = optarg; break;
            case ARG_UN: dumpUnalBase = optarg; break;
            case ARG_MAXDUMP: dumpMaxBase = optarg; break;
            case ARG_SOLEXA_QUALS: solexaQuals = true; break;
            case ARG_integerQuals: integerQuals = true; break;
            case ARG_PHRED64: phred64Quals = true; break;
            case ARG_PHRED33: solexaQuals = false; phred64Quals = false; break;
            case ARG_NOMAQROUND: noMaqRound = true; break;
            case ARG_REFIDX: noRefNames = true; break;
            case ARG_STATEFUL: stateful = true; break;
            case ARG_REPORTSE: reportSe = true; break;
            case ARG_FULLREF: fullRef = true; break;
            case ARG_PREFETCH_WIDTH:
                prefetchWidth = parseInt(1, "--prewidth must be at least 1");
                break;
            case 'B':
                offBase = parseInt(-999999, "-B/--offbase cannot be a large negative number");
                break;
            case ARG_SEED:
                seed = parseInt(0, "--seed arg must be at least 0");
                break;
            case 'u':
                qUpto = (uint32_t)parseInt(1, "-u/--qupto arg must be at least 1");
                break;
            case 'k':
                khits = (uint32_t)parseInt(1, "-k arg must be at least 1");
                break;
            case 'Q':
                tokenize(optarg, ",", qualities);
                integerQuals = true;
                break;
            case ARG_QUALS1:
                tokenize(optarg, ",", qualities1);
                integerQuals = true;
                break;
            case ARG_QUALS2:
                tokenize(optarg, ",", qualities2);
                integerQuals = true;
                break;
            case 'M':
                sampleMax = true;
                // fall through: -M takes the same numeric argument as -m
            case 'm':
                mhits = (uint32_t)parseInt(1, "-m arg must be at least 1");
                break;
            case 'z':
                // Message previously named the wrong option ("-x")
                mixedThresh = (uint32_t)parseInt(0, "-z arg must be at least 0");
                break;
            case ARG_MIXED_ATTEMPTS:
                mixedAttemptLim = (uint32_t)parseInt(1, "--pairtries arg must be at least 1");
                break;
            case ARG_CACHE_LIM:
                cacheLimit = (uint32_t)parseInt(1, "--cachelim arg must be at least 1");
                break;
            case ARG_CACHE_SZ:
                cacheSize = (uint32_t)parseInt(1, "--cachesz arg must be at least 1");
                cacheSize *= (1024 * 1024); // convert from MB to B
                break;
            case ARG_NO_RECONCILE:
                dontReconcileMates = true;
                break;
            case 'p':
                nthreads = parseInt(1, "-p/--threads arg must be at least 1");
                break;
            case ARG_THREAD_CEILING:
                thread_ceiling = parseInt(0, "--thread-ceiling must be at least 0");
                break;
            case ARG_THREAD_PIDDIR:
                thread_stealing_dir = optarg;
                break;
            case ARG_REORDER_SAM:
                reorder = true;
                break;
            case ARG_FILEPAR:
                fileParallel = true;
                break;
            case 'v':
                maqLike = 0;
                mismatches = parseInt(0, 3, "-v arg must be at least 0 and at most 3");
                break;
            case '3': trim3 = parseInt(0, "-3/--trim3 arg must be at least 0"); break;
            case '5': trim5 = parseInt(0, "-5/--trim5 arg must be at least 0"); break;
            case 'o': offRate = parseInt(1, "-o/--offrate arg must be at least 1"); break;
            case ARG_ISARATE: isaRate = parseInt(0, "--isarate arg must be at least 0"); break;
            case 'e': qualThresh = parseInt(1, "-e/--err arg must be at least 1"); break;
            case 'n': seedMms = parseInt(0, 3, "-n/--seedmms arg must be at least 0 and at most 3"); maqLike = 1; break;
            case 'l': seedLen = parseInt(5, "-l/--seedlen arg must be at least 5"); break;
            case 'h': printUsage(cout); throw 0; break;
            case ARG_USAGE: printUsage(cout); throw 0; break;
            case 'a': allHits = true; break;
            case 'y': tryHard = true; break;
            case ARG_CHUNKMBS: chunkPoolMegabytes = parseInt(1, "--chunkmbs arg must be at least 1"); break;
            case ARG_CHUNKSZ: chunkSz = parseInt(1, "--chunksz arg must be at least 1"); break;
            case ARG_CHUNKVERBOSE: chunkVerbose = true; break;
            case ARG_BETTER: stateful = true; better = true; break;
            case ARG_BEST: stateful = true; useV1 = false; break;
            case ARG_STRATA: strata = true; break;
            case ARG_VERBOSE: verbose = true; break;
            case ARG_STARTVERBOSE: startVerbose = true; break;
            case ARG_QUIET: quiet = true; break;
            case ARG_SANITY: sanityCheck = true; break;
            case 't': timing = true; break;
            case ARG_NO_FW: nofw = true; break;
            case ARG_NO_RC: norc = true; break;
            case ARG_STATS: stats = true; break;
            case ARG_PEV2: useV1 = false; break;
            case ARG_SAM_NO_QNAME_TRUNC: samNoQnameTrunc = true; break;
            case ARG_SAM_NOHEAD: samNoHead = true; break;
            case ARG_SAM_NOSQ: samNoSQ = true; break;
            case ARG_SAM_NO_UNAL: noUnal = true; break;
            case ARG_SAM_RG: {
                // Multiple occurrences are accumulated, tab-separated
                if(!rgs.empty()) rgs += '\t';
                rgs += optarg;
                break;
            }
            case ARG_COST: printCost = true; break;
            case ARG_DEFAULT_MAPQ:
                defaultMapq = parseInt(0, "--mapq must be positive");
                break;
            case ARG_MAXBTS: {
                maxBts = parseInt(0, "--maxbts must be positive");
                maxBtsBetter = maxBts;
                break;
            }
            case ARG_STRAND_FIX: strandFix = true; break;
            case ARG_PARTITION: partitionSz = parse<int>(optarg); break;
            case ARG_READS_PER_BATCH: {
                // Parse the argument once and reuse the value for both
                // the validity check and the assignment.
                const int batchSz = (optarg == NULL) ? 0 : parse<int>(optarg);
                if(batchSz < 1) {
                    cerr << "--reads-per-batch arg must be at least 1" << endl;
                    printUsage(cerr);
                    throw 1;
                }
                // TODO: should output batch size be controlled separately?
                readsPerBatch = outBatchSz = batchSz;
                break;
            }
            case ARG_ORIG:
                if(optarg == NULL || strlen(optarg) == 0) {
                    cerr << "--orig arg must be followed by a string" << endl;
                    printUsage(cerr);
                    throw 1;
                }
                origString = optarg;
                break;
            case -1: break; /* Done with options. */
            case 0:
                // getopt_long already stored the value through the
                // option's flag pointer; nothing more to do.
                if (long_options[option_index].flag != 0)
                    break;
                // fall through: a 0 return without a flag is unexpected
            default:
                printUsage(cerr);
                throw 1;
        }
    } while(next_option != -1);
    // Decide whether thread stealing is active *before* anything consults
    // it.  Previously this was computed at the very end of the function,
    // so the reorder check below always saw a stale value.
    thread_stealing = thread_ceiling > nthreads;
#ifdef _WIN32
    thread_stealing = false;
#endif
    if (nthreads == 1 && !thread_stealing) {
        reorder = false;
    }
    if (reorder == true && outType != OUTPUT_SAM) {
        cerr << "Bowtie will reorder its output only when outputting SAM." << endl
             << "Please specify the `-S` parameter if you intend on using this option." << endl;
        throw 1;
    }
    //bool paired = mates1.size() > 0 || mates2.size() > 0 || mates12.size() > 0;
    if(rangeMode) {
        // Tell the Ebwt loader to ignore the suffix-array portion of
        // the index.  We don't need it because the user isn't asking
        // for bowtie to report reference positions (just matrix
        // ranges).
        offRate = 32;
    }
    if(!maqLike && mismatches == 3) {
        // Much faster than normal 3-mismatch mode
        stateful = true;
    }
    if(mates1.size() != mates2.size()) {
        cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << mates2.size() << endl
             << "mate files/sequences were specified with -2.  The same number of mate files/" << endl
             << "sequences must be specified with -1 and -2." << endl;
        throw 1;
    }
    // Check for duplicate mate input files
    if(format != CMDLINE) {
        for(size_t i = 0; i < mates1.size(); i++) {
            for(size_t j = 0; j < mates2.size(); j++) {
                if(mates1[i] == mates2[j] && !quiet) {
                    cerr << "Warning: Same mate file \"" << mates1[i] << "\" appears as argument to both -1 and -2" << endl;
                }
            }
        }
    }
    if(tryHard) {
        // Increase backtracking limit to huge number
        maxBts = maxBtsBetter = INT_MAX;
        // Increase number of paired-end scan attempts to huge number
        mixedAttemptLim = UINT_MAX;
    }
    if(!stateful && sampleMax) {
        if(!quiet) {
            cerr << "Warning: -M was specified w/o --best; automatically enabling --best" << endl;
        }
        stateful = true;
    }
    if(strata && !stateful) {
        cerr << "--strata must be combined with --best" << endl;
        throw 1;
    }
    if(strata && !allHits && khits == 1 && mhits == 0xffffffff) {
        cerr << "--strata has no effect unless combined with -m, -a, or -k N where N > 1" << endl;
        throw 1;
    }
    // If both -s and -u are used, we need to adjust qUpto accordingly
    // since it uses patid to know if we've reached the -u limit (and
    // patids are all shifted up by skipReads characters).  The comparison
    // skips the adjustment when the sum would wrap around.
    if(qUpto + skipReads > qUpto) {
        qUpto += skipReads;
    }
    if(useShmem && useMm) {
        // The override must happen whether or not the warning is shown;
        // --quiet should only silence the message.
        if(!quiet) {
            cerr << "Warning: --shmem overrides --mm..." << endl;
        }
        useMm = false;
    }
    if(!mateFwSet) {
        // Set nucleotide space default (--fr)
        mate1fw = true;
        mate2fw = false;
    }
    if(outType != OUTPUT_FULL && suppressOuts.count() > 0) {
        // As above: --quiet only silences the warning, the suppression
        // set is discarded either way.
        if(!quiet) {
            cerr << "Warning: Ignoring --suppress because output type is not default." << endl;
            cerr << "         --suppress is only available for the default output type." << endl;
        }
        suppressOuts.clear();
    }
    if(thread_stealing && thread_stealing_dir.empty()) {
        cerr << "When --thread-ceiling is specified, must also specify --thread-piddir" << endl;
        throw 1;
    }
}
/// File-local C-string pointer, NULL until assigned elsewhere in this file.
/// NOTE(review): presumably holds the program name (argv[0]) - confirm
/// against the code that assigns it.
static const char *argv0 = NULL;
/// Statement macro that wraps up processing of the current read.
/// If get_read_ret.first is set, calls sink->finishRead(*p, true, !skipped)
/// and clears get_read_ret.first; in every case 'skipped' is reset to
/// false afterwards.  Relies on 'get_read_ret', 'sink' and 'skipped'
/// being in scope where the macro is expanded.
#define FINISH_READ(p) \
/* Don't do finishRead if the read isn't legit or if the read was skipped by the doneMask */ \
if(get_read_ret.first) { \
sink->finishRead(*p, true, !skipped); \
get_read_ret.first = false; \
} \
skipped = false;
/// Macro for getting the next read, possibly aborting depending on
/// whether the result is empty or the patid exceeds the limit, and
/// marshaling the read into convenient variables.
///
/// Control flow, in order:
///  - 'break' immediately if get_read_ret.second is already set;
///  - fetch the next pair with p->nextReadPair();
///  - if p->rdid() >= qUpto, overwrite the result with (false, true);
///  - if get_read_ret.first is false, 'break' when .second is set and
///    'continue' otherwise.
/// After that it declares references into p->bufa() (patFw, patRc, qual,
/// qualRev, patFwRev, patRcRev, name), declares 'patid' from p->rdid()
/// and passes it to params.setPatId().
///
/// Because it uses break/continue and declares locals, it must be
/// expanded inside a loop body, with 'get_read_ret', 'qUpto' and 'params'
/// in scope.  The bare x.length() calls discard their result;
/// NOTE(review): presumably they only exist to silence unused-variable
/// warnings - confirm.
#define GET_READ(p) \
if(get_read_ret.second) break; \
get_read_ret = p->nextReadPair(); \
if(p->rdid() >= qUpto) { \
get_read_ret = make_pair(false, true); \
} \
if(!get_read_ret.first) { \
if(get_read_ret.second) { \
break; \
} \
continue; \
} \
BTDnaString& patFw = p->bufa().patFw; \
patFw.length(); \
BTDnaString& patRc = p->bufa().patRc; \
patRc.length(); \
BTString& qual = p->bufa().qual; \
qual.length(); \
BTString& qualRev = p->bufa().qualRev; \
qualRev.length(); \
BTDnaString& patFwRev = p->bufa().patFwRev; \
patFwRev.length(); \
BTDnaString& patRcRev = p->bufa().patRcRev; \
patRcRev.length(); \
BTString& name = p->bufa().name; \
name.length(); \
uint32_t patid = (uint32_t)p->rdid(); \
params.setPatId(patid);
/// Statement macro that tears down a worker's per-thread objects and
/// returns from the enclosing function: hands 'patsrc' back to
/// 'patsrcFact' via destroy() and deletes the factory, does the same for
/// 'sink' / 'sinkFact', then executes a bare 'return;'.  Relies on those
/// four names being in scope, and (because of the bare return) is only
/// usable inside a function returning void.
#define WORKER_EXIT() \
patsrcFact->destroy(patsrc); \
delete patsrcFact; \
sinkFact->destroy(sink); \
delete sinkFact; \
return;
/// Optional profiling hooks.  When CHUD_PROFILING is defined, CHUD_START()
/// and CHUD_STOP() expand to chudStartRemotePerfMonitor("Bowtie") and
/// chudStopRemotePerfMonitor() respectively; otherwise both expand to
/// nothing, so call sites need no conditional compilation of their own.
#ifdef CHUD_PROFILING
#define CHUD_START() chudStartRemotePerfMonitor("Bowtie");
#define CHUD_STOP() chudStopRemotePerfMonitor();
#else
#define CHUD_START()
#define CHUD_STOP()
#endif
/// Heap-allocate a PatternSourcePerThreadFactory bound to the given
/// PatternComposer and buffer limit, using the global skipReads and seed
/// settings.  The caller receives the raw pointer and is responsible for
/// deleting it.  The 'tid' argument is accepted but not used here.
static PatternSourcePerThreadFactory*
createPatsrcFactory(PatternComposer& _patsrc, int tid, uint32_t max_buf) {
    PatternSourcePerThreadFactory* const factory =
        new PatternSourcePerThreadFactory(_patsrc, max_buf, skipReads, seed);
    assert(factory != NULL);
    return factory;
}
/**
* Allocate a HitSinkPerThreadFactory on the heap according to the
* global params and return a pointer to it.
*/
static HitSinkPerThreadFactory*
createSinkFactory(HitSink& _sink, size_t threadId) {
HitSinkPerThreadFactory *sink = NULL;
if(!strata) {
// Unstratified
if(!allHits) {
// First N good; "good" inherently ignores strata
sink = new NGoodHitSinkPerThreadFactory(_sink, khits, mhits, defaultMapq, threadId);
} else {