Skip to content

Commit

Permalink
Fix CRAMReferenceRegion updating. (#1708)
Browse files Browse the repository at this point in the history
* Fix CRAM reference region transitions.

* Add roundtrip tests that fail without this change.

* Add roundtrip tests with 2 and 3 containers aligned to position 1.

* Add more roundtrip tests that validate bases.

* Add a large CRAM roundtrip test and samtools rountdtip test.
  • Loading branch information
cmnbroad authored Jun 4, 2024
1 parent 222d10b commit 127f3de
Show file tree
Hide file tree
Showing 7 changed files with 136 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
* {@link #fetchReferenceBases(int)} or {@link #fetchReferenceBasesByRegion(int, int, int)}. It caches the bases
* from the previous request, along with metadata about the (0-based) start offset, and length of the
* cached bases.
*
* NOTE: this class is not thread-safe/safe for concurrent access across threads.
*/
public class CRAMReferenceRegion {
private static final Log log = Log.getInstance(CRAMReferenceRegion.class);
Expand Down Expand Up @@ -113,17 +115,18 @@ public void fetchReferenceBases(final int referenceIndex) {

// Re-resolve the reference bases if we don't have a current region or if the region we have
// doesn't span the *entire* contig requested.
final SAMSequenceRecord newSequenceRecord = getSAMSequenceRecord(referenceIndex);
if ((referenceIndex != this.referenceIndex) ||
regionStart != 0 ||
(regionLength < referenceBases.length)) {
(regionLength != newSequenceRecord.getSequenceLength())) {
setCurrentSequence(referenceIndex);
referenceBases = referenceSource.getReferenceBases(sequenceRecord, true);
referenceBases = referenceSource.getReferenceBases(newSequenceRecord, true);
if (referenceBases == null) {
throw new IllegalArgumentException(
String.format("A reference must be supplied (reference sequence %s not found).", sequenceRecord));
String.format("A reference must be supplied (reference sequence %s not found).", newSequenceRecord));
}
regionStart = 0;
regionLength = sequenceRecord.getSequenceLength();
regionLength = newSequenceRecord.getSequenceLength();
}
}

Expand Down
104 changes: 90 additions & 14 deletions src/test/java/htsjdk/samtools/CRAMAllEncodingStrategiesTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,43 +10,104 @@
import htsjdk.samtools.util.Tuple;
import htsjdk.utils.SamtoolsTestUtils;
import org.testng.Assert;
import org.testng.SkipException;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.*;
import java.util.*;

/**
* Test roundtripping files through the GATK writer using both the default HTSJDK encoding strategy, plus a variety
* of alternative encoding strategies, in order to stress test the writer implementation. Compares the results with
* the original file, and then roundtrips the newly written CRAM through the samtools writer, validating that samtools
* can consume the HTSJDK-written files with the expected level of roundtrip fidelity (CRAMs don't always roundtrip
* with complete bit-level fidelity, i.e, samtools will resurrect NM/MD tags whether they were present in the original
* file or not unless they are specifically excluded, etc.). So in some case, you can't use full SAMRecord comparisons,
* in which case we fall back to lenient equality and restrict the comparison to read names, bases, alignment start/stop,
* and quality scores.
*/
public class CRAMAllEncodingStrategiesTest extends HtsjdkTest {

private static final File TEST_DATA_DIR = new File("src/test/resources/htsjdk/samtools/cram");
private final CompressorCache compressorCache = new CompressorCache();

@DataProvider(name="roundTripTestFiles")
public Object[][] roundTripTestFiles() {
@DataProvider(name="defaultStrategyRoundTripTestFiles")
public Object[][] defaultStrategyRoundTripTestFiles() {
return new Object[][] {
// a test file with artificially small slices and containers to force multiple slices and containers
{ new File(TEST_DATA_DIR, "NA12878.20.21.1-100.100-SeqsPerSlice.500-unMapped.cram"),
new File(TEST_DATA_DIR, "human_g1k_v37.20.21.1-100.fasta") },
new File(TEST_DATA_DIR, "human_g1k_v37.20.21.1-100.fasta"),
false, false },
// the same file without the artificially small container constraints
{ new File(TEST_DATA_DIR, "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.10m-10m100.cram"),
new File("src/test/resources/htsjdk/samtools/reference/human_g1k_v37.20.21.fasta.gz"),
false, false },
// a test file with only unmapped reads
{ new File(TEST_DATA_DIR, "NA12878.unmapped.cram"),
new File(TEST_DATA_DIR, "human_g1k_v37.20.21.1-100.fasta"),
false, false },
// generated with samtools 1.19 from the gatk bam file CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam
{ new File(TEST_DATA_DIR, "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram"),
new File("src/test/resources/htsjdk/samtools/reference/human_g1k_v37.20.21.fasta.gz"),
true, false },

// these tests use lenient equality to only validate read names, bases, alignment start/stop, and qual scores

// a user-contributed file with reads aligned only to the mito contig that has been rewritten (long ago) with GATK
{ new File(TEST_DATA_DIR, "mitoAlignmentStartTestGATKGen.cram"),
new File(TEST_DATA_DIR, "mitoAlignmentStartTest.fa"), true, false },
// the original user-contributed file with reads aligned only to the mito contig
{ new File(TEST_DATA_DIR, "mitoAlignmentStartTest.cram"),
new File(TEST_DATA_DIR, "mitoAlignmentStartTest.fa"), true, false },
// files created by rewriting the htsjdk test file src/test/resources/htsjdk/samtools/cram/mitoAlignmentStartTest.cram
// using code that replicates the first read (which is aligned to position 1 of the mito contig) either
// 10,000 or 20,000 times, to create a file with 2 or 3 containers, respectively, that have reads aligned to
// position 1 of the contig
{ new File(TEST_DATA_DIR, "mitoAlignmentStartTest_2_containers_aligned_to_pos_1.cram"),
new File(TEST_DATA_DIR, "mitoAlignmentStartTest.fa"), true, false },
{ new File(TEST_DATA_DIR, "mitoAlignmentStartTest_3_containers_aligned_to_pos_1.cram"),
new File(TEST_DATA_DIR, "mitoAlignmentStartTest.fa"), true, false }
};
}

@Test(dataProvider = "roundTripTestFiles")
public final void testRoundTripDefaultEncodingStrategy(final File sourceFile, final File referenceFile) throws IOException {
@Test(dataProvider = "defaultStrategyRoundTripTestFiles")
public final void testRoundTripDefaultEncodingStrategy(
final File sourceFile,
final File referenceFile,
final boolean lenientEquality,
final boolean emitDetail) throws IOException {
// test the default encoding strategy
final CRAMEncodingStrategy testStrategy = new CRAMEncodingStrategy();
final File tempOutCRAM = File.createTempFile("testRoundTrip", ".cram");
tempOutCRAM.deleteOnExit();
CRAMTestUtils.writeToCRAMWithEncodingStrategy(testStrategy, sourceFile, tempOutCRAM, referenceFile);
assertRoundTripFidelity(sourceFile, tempOutCRAM, referenceFile, false);
assertRoundtripFidelityWithSamtools(tempOutCRAM, referenceFile);
assertRoundTripFidelity(sourceFile, tempOutCRAM, referenceFile, lenientEquality, emitDetail);
// test interop with samtools using this encoding
assertRoundtripFidelityWithSamtools(tempOutCRAM, referenceFile, lenientEquality, emitDetail);
}

@DataProvider(name="encodingStrategiesTestFiles")
public Object[][] encodingStrategiesTestFiles() {
return new Object[][] {
{ new File(TEST_DATA_DIR, "NA12878.20.21.1-100.100-SeqsPerSlice.500-unMapped.cram"),
new File(TEST_DATA_DIR, "human_g1k_v37.20.21.1-100.fasta"), false, false },
};
}

@Test(dataProvider = "roundTripTestFiles")
public final void testAllEncodingStrategyCombinations(final File cramSourceFile, final File referenceFile) throws IOException {
@Test(dataProvider = "encodingStrategiesTestFiles")
public final void testAllEncodingStrategyCombinations(
final File cramSourceFile,
final File referenceFile,
final boolean lenientEquality,
final boolean emitDetail) throws IOException {
for (final Tuple<String, CRAMEncodingStrategy> testStrategy : getAllEncodingStrategies()) {
final File tempOutCRAM = File.createTempFile("allEncodingStrategyCombinations", ".cram");
tempOutCRAM.deleteOnExit();
CRAMTestUtils.writeToCRAMWithEncodingStrategy(testStrategy.b, cramSourceFile, tempOutCRAM, referenceFile);
assertRoundTripFidelity(cramSourceFile, tempOutCRAM, referenceFile, false);
assertRoundtripFidelityWithSamtools(tempOutCRAM, referenceFile);
assertRoundTripFidelity(cramSourceFile, tempOutCRAM, referenceFile, lenientEquality, emitDetail);
// test interop with samtools using this encoding
assertRoundtripFidelityWithSamtools(tempOutCRAM, referenceFile, lenientEquality, emitDetail);
}
}

Expand Down Expand Up @@ -82,6 +143,7 @@ public void assertRoundTripFidelity(
final File sourceFile,
final File targetCRAMFile,
final File referenceFile,
final boolean lenientEquality,
final boolean emitDetail) throws IOException {
try (final SamReader sourceReader = SamReaderFactory.makeDefault()
.referenceSequence(referenceFile)
Expand All @@ -91,7 +153,15 @@ public void assertRoundTripFidelity(
final SAMRecordIterator sourceIterator = sourceReader.iterator();
final SAMRecordIterator targetIterator = copyReader.getIterator();
while (sourceIterator.hasNext() && targetIterator.hasNext()) {
if (emitDetail) {
if (lenientEquality) {
final SAMRecord sourceRec = sourceIterator.next();
final SAMRecord targetRec = targetIterator.next();
Assert.assertEquals(targetRec.getReadName(), sourceRec.getReadName());
Assert.assertEquals(targetRec.getAlignmentStart(), sourceRec.getAlignmentStart());
Assert.assertEquals(targetRec.getAlignmentEnd(), sourceRec.getAlignmentEnd());
Assert.assertEquals(targetRec.getReadBases(), sourceRec.getReadBases());
Assert.assertEquals(targetRec.getBaseQualities(), sourceRec.getBaseQualities());
} else if (emitDetail) {
final SAMRecord sourceRec = sourceIterator.next();
final SAMRecord targetRec = targetIterator.next();
if (!sourceRec.equals(targetRec)) {
Expand All @@ -108,13 +178,19 @@ public void assertRoundTripFidelity(
}
}

private void assertRoundtripFidelityWithSamtools(final File sourceCRAM, final File referenceFile) throws IOException {
private void assertRoundtripFidelityWithSamtools(
final File sourceCRAM,
final File referenceFile,
final boolean lenientEquality,
final boolean emitDetail) throws IOException {
if (SamtoolsTestUtils.isSamtoolsAvailable()) {
final File samtoolsOutFile = SamtoolsTestUtils.convertToCRAM(
sourceCRAM,
referenceFile,
"--input-fmt-option decode_md=0 --output-fmt-option store_md=0 --output-fmt-option store_nm=0");
assertRoundTripFidelity(sourceCRAM, samtoolsOutFile, referenceFile, false);
assertRoundTripFidelity(sourceCRAM, samtoolsOutFile, referenceFile, lenientEquality, emitDetail);
} else {
throw new SkipException("samtools is not installed, skipping test");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import htsjdk.HtsjdkTest;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.cram.build.CRAMReferenceRegion;
import htsjdk.samtools.cram.structure.AlignmentContext;
import htsjdk.samtools.cram.structure.CRAMStructureTestHelper;
import htsjdk.samtools.reference.InMemoryReferenceSequenceFile;
import org.testng.Assert;
Expand Down Expand Up @@ -167,6 +168,44 @@ public void testGetReferenceBasesByRegionPositive(
Assert.assertEquals(bases, Arrays.copyOfRange(fullContigBases, requestedOffset, requestedOffset + requestedLength));
}

// Simulate the state transitions that occur when writing a CRAM file; specifically, use transitions that mirror
// the ones that occur when writing a CRAM with the conditions that affect (and that fail without the fix to)
// https://github.com/broadinstitute/gatk/issues/8768, i.e., a container with one or more containers with reads
// aligned to position 1 of a given contig, followed by one or more containers with reads aligned past position 1
// of the same contig.
@Test
public void testSerialStateTransitions() {
// start with an entire reference sequence
final CRAMReferenceRegion cramReferenceRegion = getAlternatingReferenceRegion();
cramReferenceRegion.fetchReferenceBases(CRAMStructureTestHelper.REFERENCE_SEQUENCE_ZERO);
final long fullRegionFragmentLength = cramReferenceRegion.getRegionLength();
Assert.assertEquals(fullRegionFragmentLength, CRAMStructureTestHelper.REFERENCE_CONTIG_LENGTH);

// transition to a shorter reference fragment using fetchReferenceBasesByRegion, then back to the full region
final int SHORT_FRAGMENT_LENGTH = 5;
Assert.assertTrue(SHORT_FRAGMENT_LENGTH < fullRegionFragmentLength);
cramReferenceRegion.fetchReferenceBasesByRegion(CRAMStructureTestHelper.REFERENCE_SEQUENCE_ZERO, 0, SHORT_FRAGMENT_LENGTH);
Assert.assertEquals(cramReferenceRegion.getRegionLength(), SHORT_FRAGMENT_LENGTH);

// now transition back to the full sequence; this is where the bug previously would have occurred
cramReferenceRegion.fetchReferenceBases(CRAMStructureTestHelper.REFERENCE_SEQUENCE_ZERO);
// this assert would fail without the fix
Assert.assertEquals(cramReferenceRegion.getRegionLength(), fullRegionFragmentLength);

// transition to a shorter region fragment length using fetchReferenceBasesByRegion(AlignmentContext), then back to the full region
Assert.assertTrue(SHORT_FRAGMENT_LENGTH < fullRegionFragmentLength);
cramReferenceRegion.fetchReferenceBasesByRegion(
new AlignmentContext(
new ReferenceContext(CRAMStructureTestHelper.REFERENCE_SEQUENCE_ZERO),
1,
SHORT_FRAGMENT_LENGTH));
Assert.assertEquals(cramReferenceRegion.getRegionLength(), SHORT_FRAGMENT_LENGTH);

// now transition back to the full sequence
cramReferenceRegion.fetchReferenceBases(CRAMStructureTestHelper.REFERENCE_SEQUENCE_ZERO);
Assert.assertEquals(cramReferenceRegion.getRegionLength(), fullRegionFragmentLength);
}

@Test
public void testGetReferenceBasesByRegionExceedsContigLength() {
int regionStart = CRAMStructureTestHelper.REFERENCE_CONTIG_LENGTH / 2;
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 127f3de

Please sign in to comment.