// ref_read.cpp

#include "alphabet.h"
#include "ref_read.h"

/**
 * Reads past the next ambiguous or unambiguous stretch of sequence
 * from the given FASTA file and returns its length.  Does not do
 * anything with the sequence characters themselves; this is purely for
 * measuring lengths.
 */
RefRecord fastaRefReadSize(FileBuf& in,
                           const RefReadInParams& rparms,
                           bool first,
                           BitpairOutFileBuf* bpout)
{
	int c;
	static int lastc = '>'; // last character seen

	// RefRecord params
	TIndexOffU len = 0; // 'len' counts toward total length
	// 'off' counts number of ambiguous characters before first
	// unambiguous character
	size_t off = 0;

	// Pick off the first carat and any preceding whitespace
	if(first) {
		assert(!in.eof());
		lastc = '>';
		c = in.getPastWhitespace();
		if(in.eof()) {
			// Got eof right away; emit warning
			cerr << "Warning: Empty input file" << endl;
			lastc = -1;
			return RefRecord(0, 0, true);
		}
		assert(c == '>');
	}

	first = true;
	// Skip to the end of the id line; if the next line is either
	// another id line or a comment line, keep skipping
	if(lastc == '>') {
		// Skip to the end of the name line
		do {
			if((c = in.getPastNewline()) == -1) {
				// No more input
				cerr << "Warning: Encountered empty reference sequence" << endl;
				lastc = -1;
				return RefRecord(0, 0, true);
			}
			if(c == '>') {
				cerr << "Warning: Encountered empty reference sequence" << endl;
			}
			// continue until a non-name, non-comment line
		} while (c == '>');
	} else {
		first = false; // not the first in a sequence
		off = 1; // The gap has already been consumed, so count it
		if((c = in.get()) == -1) {
			// Don't emit a warning, since this might legitimately be
			// a gap on the end of the final sequence in the file
			lastc = -1;
			return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
		}
	}

	// Now skip to the first DNA character, counting gap characters
	// as we go
	while(true) {
		int cat = dna4Cat[c];
		if(rparms.nsToAs && cat == 2) c = 'A';
		if(cat == 1) {
			// This is a DNA character
			break; // to read-in loop
                } else if (cat == 2) {
			off++; // skip over gap character and increment
                } else if (c == '>') {
                  if (off > 0 && lastc == '>') {
                    cerr << "Warning: Encountered reference sequence with only "
                            "gaps"
                         << endl;
                  } else if (lastc == '>') {
                    cerr << "Warning: Encountered empty reference sequence"
                         << endl;
                  }
                  lastc = '>';
                  return RefRecord((TIndexOffU)off, 0, first);
                }
                c = in.get();
		if(c == -1) {
			// End-of-file
			if(off > 0 && lastc == '>') {
				cerr << "Warning: Encountered reference sequence with only gaps" << endl;
			} else if(lastc == '>') {
				cerr << "Warning: Encountered empty reference sequence" << endl;
			}
			lastc = -1;
			return RefRecord((TIndexOffU)off, 0, first);
		}
	}
	assert_eq(1, dna4Cat[c]); // C must be unambiguous base

	// in now points just past the first character of a sequence
	// line, and c holds the first character
	while(c != -1 && c != '>') {
		if(rparms.nsToAs && dna4Cat[c] == 2) c = 'A';
		uint8_t cat = dna4Cat[c];
		int cc = toupper(c);
		if(rparms.bisulfite && cc == 'C') c = cc = 'T';
		if(cat == 1) {
			// It's a DNA character
			assert(cc == 'A' || cc == 'C' || cc == 'G' || cc == 'T');
			// Consume it
			len++;
			// Output it
			if(bpout != NULL) {
				// output nucleotide
				bpout->write(asc2dna[c]);
			}
		} else if(cat == 2) {
			// It's an N or a gap
			lastc = c;
			assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
			return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
		} else {
			// Not DNA and not a gap, ignore it
#ifndef NDEBUG
			if(!isspace(c)) {
				cerr << "Unexpected character in sequence: ";
				if(isprint(c)) {
					cerr << ((char)c) << endl;
				} else {
					cerr << "(" << c << ")" << endl;
				}
			}
#endif
		}
		c = in.get();
	}
	lastc = c;
	return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
}

/**
 * Write one line per record to 'os' in the form "first, off, len".
 */
static void
printRecords(ostream& os, const EList<RefRecord>& l) {
	const size_t count = l.size();
	for(size_t idx = 0; idx != count; ++idx) {
		const RefRecord& rec = l[idx];
		os << rec.first << ", " << rec.off << ", " << rec.len << endl;
	}
}

/**
 * Reverse the 'src' list of RefRecords into the 'dst' list.  Don't
 * modify 'src'.
 *
 * The reversal is done in two passes:
 *  1. Walk 'src' back to front and split every record into a
 *     length-only record followed by an offset-only record (in the
 *     reversed text the bases of a record precede its gap).
 *  2. Walk the split list and fuse each offset-only record with the
 *     record after it, unless that next record starts a new sequence.
 *
 * 'dst' is cleared first.  If 'verbose' is set, both lists are printed
 * to cout.  In debug builds, a non-recursive call reverses 'dst' again
 * and asserts that the result matches 'src'; 'recursive' is passed as
 * true for that inner call so the check is not repeated.
 */
void reverseRefRecords(const EList<RefRecord>& src,
					   EList<RefRecord>& dst,
					   bool recursive,
					   bool verbose)
{
	dst.clear();
	{
		EList<RefRecord> cur;
		// Index of the final source record.  Kept as int64_t so that it
		// is compared against the loop index without narrowing; a cast
		// to int would truncate for very long lists and let src[i+1]
		// run past the end on the first iteration.
		const int64_t last = (int64_t)src.size()-1;
		for(int64_t i = last; i >= 0; i--) {
			// A reversed record opens a sequence if its source record
			// was the last of its sequence
			bool first = (i == last || src[i+1].first);
			if(src[i].len) {
				cur.push_back(RefRecord(0, src[i].len, first));
				first = false;
			}
			if(src[i].off) cur.push_back(RefRecord(src[i].off, 0, first));
		}
		for(int64_t i = 0; i < (int64_t)cur.size(); i++) {
			assert(cur[i].off == 0 || cur[i].len == 0);
			if(i < (int64_t)cur.size()-1 && cur[i].off != 0 && !cur[i+1].first) {
				// Fuse the gap with the run of bases that follows it
				dst.push_back(RefRecord(cur[i].off, cur[i+1].len, cur[i].first));
				i++; // the next record has been absorbed
			} else {
				dst.push_back(cur[i]);
			}
		}
	}
	if(verbose) {
		cout << "Source: " << endl;
		printRecords(cout, src);
		cout << "Dest: " << endl;
		printRecords(cout, dst);
	}
#ifndef NDEBUG
	if(!recursive) {
		// Sanity check: reversing twice must reproduce the input
		EList<RefRecord> tmp;
		reverseRefRecords(dst, tmp, true);
		assert_eq(tmp.size(), src.size());
		for(size_t i = 0; i < src.size(); i++) {
			assert_eq(src[i].len, tmp[i].len);
			assert_eq(src[i].off, tmp[i].off);
			assert_eq(src[i].first, tmp[i].first);
		}
	}
#endif
}

/**
 * Scan all of the given FASTA inputs, in order, and record the layout
 * of the reference sequences they contain.
 *
 * For every stretch reported by fastaRefReadSize() a RefRecord is
 * appended to 'recs' (records with no length, no offset and no 'first'
 * flag are dropped).  For every sequence that contains at least one
 * unambiguous character, its total length (ambiguous characters
 * included) is appended to 'plens'.  'numSeqs' is incremented once per
 * record that is flagged as the first of a sequence.  'bpout' is
 * handed through to fastaRefReadSize().
 *
 * Each input is reset (rewound) after it has been scanned.
 *
 * Returns a pair: (total number of unambiguous characters, total
 * number of characters including ambiguous ones).
 *
 * Throws the int 1, after printing an error to cerr, if the running
 * total of unambiguous characters overflows TIndexOffU.
 */
std::pair<size_t, size_t>
fastaRefReadSizes(EList<FileBuf*>& in,
                  EList<RefRecord>& recs,
                  EList<uint32_t>& plens,
                  const RefReadInParams& rparms,
                  BitpairOutFileBuf* bpout,
                  TIndexOff& numSeqs)
{
	TIndexOffU unambigTot = 0; // unambiguous characters over all inputs
	size_t bothTot = 0;        // all characters over all inputs
	assert_gt(in.size(), 0);
	// Running totals for the sequence currently being scanned
	uint32_t both = 0, unambig = 0;
	// For each input istream
	for(size_t i = 0; i < in.size(); i++) {
		bool first = true;
		assert(!in[i]->eof());
		// For each pattern in this istream
		while(!in[i]->eof()) {
			RefRecord rec = fastaRefReadSize(*in[i], rparms, first, bpout);
			// Update plens: a record flagged 'first' closes the previous
			// sequence, which is recorded only if it had unambiguous
			// characters
			if(rec.first) {
				if(unambig > 0) {
					plens.push_back(both);
				}
				both = 0;
				unambig = 0;
			}
#ifndef ACCOUNT_FOR_ALL_GAP_REFS
			// A record without unambiguous characters does not count as
			// the start of a sequence
			if(rec.len == 0) rec.first = false;
#endif
			// Unsigned wrap-around check on the running total
			if((unambigTot + rec.len) < unambigTot) {
#ifdef BOWTIE_64BIT_INDEX
				// NOTE(review): assumes TIndexOffU is 64 bits wide in
				// this configuration, so the limit hit here is 2^64-1
				// -- confirm against the typedef
				cerr << "Error: Reference sequence has more than 2^64-1 characters!  Please divide the" << endl
				     << "reference into smaller chunks and index each independently." << endl;
#else
				cerr << "Error: Reference sequence has more than 2^32-1 characters!  Please try to" << endl
				     << "build a large index instead using the appropiate options." << endl;
#endif
				throw 1;
			}
			// Add the length of this record.
			if(rec.first) numSeqs++;
			unambigTot += rec.len; unambig += rec.len;
			bothTot += rec.len;    both += rec.len;
			bothTot += rec.off;    both += rec.off;
			first = false;
			// Skip records that carry no information
			if(rec.len == 0 && rec.off == 0 && !rec.first) continue;
			recs.push_back(rec);
		}
		// Reset the input stream
		in[i]->reset();
		assert(!in[i]->eof());
#ifndef NDEBUG
		// Check that it's really reset
		int c = in[i]->get();
		assert_eq('>', c);
		in[i]->reset();
		assert(!in[i]->eof());
#endif
	}
	assert_geq(bothTot, 0);
	assert_geq(unambigTot, 0);
	// Record the final sequence of the final input
	if(unambig > 0) {
		plens.push_back(both);
	}
	return make_pair(
		unambigTot, // total number of unambiguous DNA characters read
		bothTot); // total number of DNA characters read, incl. ambiguous ones
}