diff --git a/src/main/java/htsjdk/variant/variantcontext/VariantContextComparator.java b/src/main/java/htsjdk/variant/variantcontext/VariantContextComparator.java index bcccbeab4e..4cf8b07691 100644 --- a/src/main/java/htsjdk/variant/variantcontext/VariantContextComparator.java +++ b/src/main/java/htsjdk/variant/variantcontext/VariantContextComparator.java @@ -11,19 +11,22 @@ import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TreeSet; /** * A Comparator that orders VariantContexts by the ordering of the contigs/chromosomes in the List - * provided at construction time, then by start position with each contig/chromosome. + * provided at construction time, then by start position with each contig/chromosome, then by Ref + * and finally by order agnostic Alt alleles */ public class VariantContextComparator implements Comparator, Serializable { private static final long serialVersionUID = 1L; public static List getSequenceNameList(final SAMSequenceDictionary dictionary) { - final List list = new ArrayList(); + final List list = new ArrayList<>(); for (final SAMSequenceRecord record : dictionary.getSequences()) { list.add(record.getSequenceName()); } @@ -32,11 +35,20 @@ public static List getSequenceNameList(final SAMSequenceDictionary dicti // For fast lookup of the contig's index in the contig list private final Map contigIndexLookup; + private final Comparator internalComparator; + + private static Comparator makeComparator(final Map contigIndexLookup) { + return Comparator.comparingInt((VariantContext vc) -> contigIndexLookup.get(vc.getContig())) + .thenComparingInt(VariantContext::getStart) + .thenComparingInt(VariantContext::getEnd) + .thenComparing(VariantContext::getReference) + .thenComparing(VariantContext::getAlternateAlleles, new SortedUniquedElementsComparator<>()); + } public VariantContextComparator(final List contigs) { if (contigs.isEmpty()) throw new IllegalArgumentException("One or more contigs must be in the contig list."); - final Map protoContigIndexLookup = new HashMap(); + final Map protoContigIndexLookup = new HashMap<>(); int index = 0; for (final String contig : contigs) { protoContigIndexLookup.put(contig, index++); @@ -47,6 +59,7 @@ public VariantContextComparator(final List contigs) { } this.contigIndexLookup = Collections.unmodifiableMap(protoContigIndexLookup); + this.internalComparator = makeComparator(this.contigIndexLookup); } /** @@ -57,7 +70,7 @@ public VariantContextComparator(final List contigs) { public VariantContextComparator(final Collection headerLines) { if (headerLines.isEmpty()) throw new IllegalArgumentException("One or more header lines must be in the header line collection."); - final Map protoContigIndexLookup = new HashMap(); + final Map protoContigIndexLookup = new HashMap<>(); for (final VCFContigHeaderLine headerLine : headerLines) { protoContigIndexLookup.put(headerLine.getID(), headerLine.getContigIndex()); } @@ -66,12 +79,13 @@ public VariantContextComparator(final Collection headerLine throw new IllegalArgumentException("There are duplicate contigs/chromosomes in the input header line collection."); } - final Set protoIndexValues = new HashSet(protoContigIndexLookup.values()); + final Set protoIndexValues = new HashSet<>(protoContigIndexLookup.values()); if (protoIndexValues.size() != headerLines.size()) { throw new IllegalArgumentException("One or more contigs share the same index number."); } this.contigIndexLookup = Collections.unmodifiableMap(protoContigIndexLookup); + this.internalComparator = makeComparator(this.contigIndexLookup); } public VariantContextComparator(final SAMSequenceDictionary dictionary) { @@ -83,20 +97,24 @@ public int compare(final VariantContext firstVariantContext, final VariantContex // Will throw NullPointerException -- happily -- if either of the chromosomes/contigs aren't // present. This error checking should already have been done in the constructor but it's left // in as defence anyway. - int contigCompare = this.contigIndexLookup.get(firstVariantContext.getContig()).compareTo(this.contigIndexLookup.get(secondVariantContext.getContig())); - contigCompare = contigCompare == 0 ? firstVariantContext.getStart() - secondVariantContext.getStart() : contigCompare; - if (contigCompare == 0) { - // Compare variants that have the same genomic span (chr:start-end) lexicographically by all alleles (ref and alts). - for (int i = 0; i < firstVariantContext.getAlleles().size(); i++) { - // If all previous alleles are identical and the first variant has additional alleles, make the first variant greater. - if (i >= secondVariantContext.getAlleles().size()) { return 1; } - contigCompare = firstVariantContext.getAlleles().get(i).compareTo(secondVariantContext.getAlleles().get(i)); - if (contigCompare != 0) return contigCompare; + return internalComparator.compare(firstVariantContext, secondVariantContext); + } + + //Sorts and uniques the inputs and then compares the values pairwise in their natural order + //until there is a mismatch or 1 of the inputs runs out. + private static class SortedUniquedElementsComparator> implements Comparator>{ + @Override + public int compare(final Collection o1, final Collection o2) { + final Iterator left = new TreeSet<>(o1).iterator(); + final Iterator right = new TreeSet<>(o2).iterator(); + while(left.hasNext() && right.hasNext()){ + final int result = left.next().compareTo(right.next()); + if( result != 0 ){ + return result; + } } + return Boolean.compare(left.hasNext(), right.hasNext()); } - // If all previous alleles are identical and the second variant has additional alleles, make the second variant greater. - if (firstVariantContext.getAlleles().size() < secondVariantContext.getAlleles().size()) { return -1; } - return contigCompare; } /** diff --git a/src/test/java/htsjdk/variant/variantcontext/VariantContextComparatorUnitTest.java b/src/test/java/htsjdk/variant/variantcontext/VariantContextComparatorUnitTest.java index 578f673ae4..52da902d9c 100644 --- a/src/test/java/htsjdk/variant/variantcontext/VariantContextComparatorUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/VariantContextComparatorUnitTest.java @@ -1,75 +1,65 @@ package htsjdk.variant.variantcontext; import htsjdk.HtsjdkTest; +import htsjdk.samtools.util.Locatable; +import htsjdk.tribble.SimpleFeature; import org.testng.Assert; import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.List; /** * Unit tests for VariantContextComparator. */ -public class VariantContextComparatorUnitTest extends HtsjdkTest { - private Allele refA, altG, altT; - - @BeforeSuite - public void before() { - refA = Allele.create("A", true); - altG = Allele.create("G", false); - altT = Allele.create("T", false); +public class VariantContextComparatorUnitTest extends HtsjdkTest{ + + private static final String CHR1 = "chr1" , CHR2 = "chr2"; + + @DataProvider + public Object[][] getVCs(){ + final Locatable chr1_1_1 = new SimpleFeature(CHR1, 1, 1); + final Locatable chr1_11_11 = new SimpleFeature(CHR1, 11, 11); + final Locatable chr2_1_1 = new SimpleFeature(CHR2, 1, 1); + final Locatable chr2_10_10 = new SimpleFeature(CHR2, 10,10); + final VariantContext chr1_1_1_A_G = make(chr1_1_1, Allele.REF_A, Allele.ALT_G); + final VariantContext chr2_1_1_A_G = make(chr2_1_1, Allele.REF_A, Allele.ALT_G); + + final VariantContext chr2_10_10_A_G = make(chr2_10_10, Allele.REF_A, Allele.ALT_G); + final VariantContext chr1_11_11_A_G = make(chr1_11_11, Allele.REF_A, Allele.ALT_G); + final VariantContext chr1_1_A = make(chr1_1_1, Allele.REF_A); + return new Object[][] { + {chr1_1_1_A_G, chr1_1_1_A_G, 0}, //identical should match + {chr1_1_1_A_G, chr2_1_1_A_G, -1}, //chr2 comes after chr1 + {chr1_1_1_A_G, chr2_10_10_A_G, -1}, + {chr1_11_11_A_G, chr2_10_10_A_G, -1}, //chr before pos + {chr1_11_11_A_G, chr1_1_1_A_G, 1}, + {chr1_1_1_A_G, make(chr1_1_1, Allele.REF_A, Allele.ALT_T), -1}, //alleles sort lexicographichally + {make(chr1_1_1, Allele.REF_A, Allele.ALT_G, Allele.ALT_T), chr1_1_1_A_G, 1},//extra alleles sort after fewer + {chr1_1_1_A_G, make(chr1_1_1, Allele.REF_G, Allele.ALT_A), -1}, //mismatch ref takes precendence + {make(chr1_1_1, Allele.REF_T, Allele.ALT_C, Allele.create("AAA"), Allele.ALT_G), // allele order doesn't matter + make(chr1_1_1, Allele.REF_T, Allele.ALT_G, Allele.create("AAA"), Allele.ALT_C), 0}, + {chr1_1_1_A_G, chr1_1_A, 1}, // no alt allele + {chr1_1_A, chr1_1_A, 0} // no alt alleles at all + + }; } - @Test - public void testVariantContextsWithSameSiteSortLexicographicallyByAlleleIdentical() { - final String contig = "chr1"; - final VariantContextComparator comparator = new VariantContextComparator(Collections.singletonList(contig)); - final VariantContextBuilder builder = new VariantContextBuilder("test", contig, 1, 1, Collections.emptyList()); - - final VariantContext variant1 = builder.alleles(Arrays.asList(refA, altG)).make(); - final VariantContext variant2 = builder.alleles(Arrays.asList(refA, altG)).make(); - - final int compare = comparator.compare(variant1, variant2); - Assert.assertEquals(compare, 0); // TODO: What other criteria might we sort by to break this tie? - } - - @Test - public void testVariantContextsWithSameSiteSortLexicographicallyByAllele() { - final String contig = "chr1"; - final VariantContextComparator comparator = new VariantContextComparator(Collections.singletonList(contig)); - final VariantContextBuilder builder = new VariantContextBuilder("test", contig, 1, 1, Collections.emptyList()); - - final VariantContext variant1 = builder.alleles(Arrays.asList(refA, altG)).make(); - final VariantContext variant2 = builder.alleles(Arrays.asList(refA, altT)).make(); - - final int compare = comparator.compare(variant1, variant2); - Assert.assertEquals(compare, -1); + private static VariantContext make(Locatable pos, Allele ref, Allele ... alt){ + final List alleles = new ArrayList<>(); + alleles.add(ref); + alleles.addAll(Arrays.asList(alt)); + return new VariantContextBuilder("test", pos.getContig(), pos.getStart(), pos.getEnd(), alleles).make(); } - @Test - public void testVariantContextsWithSameSiteSortLexicographicallyByAlleleThenExtraAllelesForFirstVariant() { - final String contig = "chr1"; - final VariantContextComparator comparator = new VariantContextComparator(Collections.singletonList(contig)); - final VariantContextBuilder builder = new VariantContextBuilder("test", contig, 1, 1, Collections.emptyList()); - - final VariantContext variant1 = builder.alleles(Arrays.asList(refA, altG, altT)).make(); - final VariantContext variant2 = builder.alleles(Arrays.asList(refA, altG)).make(); - - final int compare = comparator.compare(variant1, variant2); - Assert.assertEquals(compare, 1); - } - - @Test - public void testVariantContextsWithSameSiteSortLexicographicallyByAlleleThenExtraAllelesForSecondVariant() { - final String contig = "chr1"; - final VariantContextComparator comparator = new VariantContextComparator(Collections.singletonList(contig)); - final VariantContextBuilder builder = new VariantContextBuilder("test", contig, 1, 1, Collections.emptyList()); - - final VariantContext variant1 = builder.alleles(Arrays.asList(refA, altG)).make(); - final VariantContext variant2 = builder.alleles(Arrays.asList(refA, altG, altT)).make(); - - final int compare = comparator.compare(variant1, variant2); - Assert.assertEquals(compare, -1); + @Test(dataProvider = "getVCs") + public void testVariantContextComparator(VariantContext left, VariantContext right, int expectedResult){ + final VariantContextComparator comparator = new VariantContextComparator(Arrays.asList(CHR1, CHR2)); + Assert.assertEquals(comparator.compare(left, right), expectedResult); + Assert.assertEquals(comparator.compare(right, left), -expectedResult); // make sure it's symmetrical } }