Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring VariantContextComparator #1629

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,22 @@
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

/**
* A Comparator that orders VariantContexts by the ordering of the contigs/chromosomes in the List
* provided at construction time, then by start position with each contig/chromosome.
* provided at construction time, then by start position with each contig/chromosome, then by Ref
* and finally by order agnostic Alt alleles
*/
public class VariantContextComparator implements Comparator<VariantContext>, Serializable {
private static final long serialVersionUID = 1L;

public static List<String> getSequenceNameList(final SAMSequenceDictionary dictionary) {
final List<String> list = new ArrayList<String>();
final List<String> list = new ArrayList<>();
for (final SAMSequenceRecord record : dictionary.getSequences()) {
list.add(record.getSequenceName());
}
Expand All @@ -32,11 +35,20 @@ public static List<String> getSequenceNameList(final SAMSequenceDictionary dicti

// For fast lookup of the contig's index in the contig list
private final Map<String, Integer> contigIndexLookup;
private final Comparator<VariantContext> internalComparator;

private static Comparator<VariantContext> makeComparator(final Map<String, Integer> contigIndexLookup) {
return Comparator.comparingInt((VariantContext vc) -> contigIndexLookup.get(vc.getContig()))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since you are using a lambda to determine the index given the contig names perhaps that is what you should compose in the constructors:

If the constructor param is a SAMSeqDict then (x) -> dict.getRecord(x).getIndex().
If the constructor param is a List then you create that Map.

Also you can add here the NPE capture into a IAE (or whatever) for missing contigs.

.thenComparingInt(VariantContext::getStart)
.thenComparingInt(VariantContext::getEnd)
.thenComparing(VariantContext::getReference)
.thenComparing(VariantContext::getAlternateAlleles, new SortedUniquedElementsComparator<>());
}

public VariantContextComparator(final List<String> contigs) {
if (contigs.isEmpty()) throw new IllegalArgumentException("One or more contigs must be in the contig list.");

final Map<String, Integer> protoContigIndexLookup = new HashMap<String, Integer>();
final Map<String, Integer> protoContigIndexLookup = new HashMap<>();
int index = 0;
for (final String contig : contigs) {
protoContigIndexLookup.put(contig, index++);
Expand All @@ -47,6 +59,7 @@ public VariantContextComparator(final List<String> contigs) {
}

this.contigIndexLookup = Collections.unmodifiableMap(protoContigIndexLookup);
this.internalComparator = makeComparator(this.contigIndexLookup);
}

/**
Expand All @@ -57,7 +70,7 @@ public VariantContextComparator(final List<String> contigs) {
public VariantContextComparator(final Collection<VCFContigHeaderLine> headerLines) {
if (headerLines.isEmpty()) throw new IllegalArgumentException("One or more header lines must be in the header line collection.");

final Map<String, Integer> protoContigIndexLookup = new HashMap<String, Integer>();
final Map<String, Integer> protoContigIndexLookup = new HashMap<>();
for (final VCFContigHeaderLine headerLine : headerLines) {
protoContigIndexLookup.put(headerLine.getID(), headerLine.getContigIndex());
}
Expand All @@ -66,12 +79,13 @@ public VariantContextComparator(final Collection<VCFContigHeaderLine> headerLine
throw new IllegalArgumentException("There are duplicate contigs/chromosomes in the input header line collection.");
}

final Set<Integer> protoIndexValues = new HashSet<Integer>(protoContigIndexLookup.values());
final Set<Integer> protoIndexValues = new HashSet<>(protoContigIndexLookup.values());
if (protoIndexValues.size() != headerLines.size()) {
throw new IllegalArgumentException("One or more contigs share the same index number.");
}

this.contigIndexLookup = Collections.unmodifiableMap(protoContigIndexLookup);
this.internalComparator = makeComparator(this.contigIndexLookup);
}

public VariantContextComparator(final SAMSequenceDictionary dictionary) {
Expand All @@ -83,20 +97,24 @@ public int compare(final VariantContext firstVariantContext, final VariantContex
// Will throw NullPointerException -- happily -- if either of the chromosomes/contigs aren't
// present. This error checking should already have been done in the constructor but it's left
// in as defence anyway.
int contigCompare = this.contigIndexLookup.get(firstVariantContext.getContig()).compareTo(this.contigIndexLookup.get(secondVariantContext.getContig()));
contigCompare = contigCompare == 0 ? firstVariantContext.getStart() - secondVariantContext.getStart() : contigCompare;
if (contigCompare == 0) {
// Compare variants that have the same genomic span (chr:start-end) lexicographically by all alleles (ref and alts).
for (int i = 0; i < firstVariantContext.getAlleles().size(); i++) {
// If all previous alleles are identical and the first variant has additional alleles, make the first variant greater.
if (i >= secondVariantContext.getAlleles().size()) { return 1; }
contigCompare = firstVariantContext.getAlleles().get(i).compareTo(secondVariantContext.getAlleles().get(i));
if (contigCompare != 0) return contigCompare;
return internalComparator.compare(firstVariantContext, secondVariantContext);
}

//Sorts and uniques the inputs and then compares the values pairwise in their natural order
//until there is a mismatch or 1 of the inputs runs out.
private static class SortedUniquedElementsComparator<T extends Comparable<T>> implements Comparator<Collection<T>>{
@Override
public int compare(final Collection<T> o1, final Collection<T> o2) {
final Iterator<T> left = new TreeSet<>(o1).iterator();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps is worth to add trivial cases to avoid TreeSet constructons since quite often alt-alleles list are just 1 in length?

final Iterator<T> right = new TreeSet<>(o2).iterator();
while(left.hasNext() && right.hasNext()){
final int result = left.next().compareTo(right.next());
if( result != 0 ){
return result;
}
}
return Boolean.compare(left.hasNext(), right.hasNext());
}
// If all previous alleles are identical and the second variant has additional alleles, make the second variant greater.
if (firstVariantContext.getAlleles().size() < secondVariantContext.getAlleles().size()) { return -1; }
return contigCompare;
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,75 +1,65 @@
package htsjdk.variant.variantcontext;

import htsjdk.HtsjdkTest;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.SimpleFeature;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
* Unit tests for VariantContextComparator.
*/
public class VariantContextComparatorUnitTest extends HtsjdkTest {
private Allele refA, altG, altT;

@BeforeSuite
public void before() {
refA = Allele.create("A", true);
altG = Allele.create("G", false);
altT = Allele.create("T", false);
public class VariantContextComparatorUnitTest extends HtsjdkTest{

private static final String CHR1 = "chr1" , CHR2 = "chr2";

@DataProvider
public Object[][] getVCs(){
final Locatable chr1_1_1 = new SimpleFeature(CHR1, 1, 1);
final Locatable chr1_11_11 = new SimpleFeature(CHR1, 11, 11);
final Locatable chr2_1_1 = new SimpleFeature(CHR2, 1, 1);
final Locatable chr2_10_10 = new SimpleFeature(CHR2, 10,10);
final VariantContext chr1_1_1_A_G = make(chr1_1_1, Allele.REF_A, Allele.ALT_G);
final VariantContext chr2_1_1_A_G = make(chr2_1_1, Allele.REF_A, Allele.ALT_G);

final VariantContext chr2_10_10_A_G = make(chr2_10_10, Allele.REF_A, Allele.ALT_G);
final VariantContext chr1_11_11_A_G = make(chr1_11_11, Allele.REF_A, Allele.ALT_G);
final VariantContext chr1_1_A = make(chr1_1_1, Allele.REF_A);
return new Object[][] {
{chr1_1_1_A_G, chr1_1_1_A_G, 0}, //identical should match
{chr1_1_1_A_G, chr2_1_1_A_G, -1}, //chr2 comes after chr1
{chr1_1_1_A_G, chr2_10_10_A_G, -1},
{chr1_11_11_A_G, chr2_10_10_A_G, -1}, //chr before pos
{chr1_11_11_A_G, chr1_1_1_A_G, 1},
{chr1_1_1_A_G, make(chr1_1_1, Allele.REF_A, Allele.ALT_T), -1}, //alleles sort lexicographichally
{make(chr1_1_1, Allele.REF_A, Allele.ALT_G, Allele.ALT_T), chr1_1_1_A_G, 1},//extra alleles sort after fewer
{chr1_1_1_A_G, make(chr1_1_1, Allele.REF_G, Allele.ALT_A), -1}, //mismatch ref takes precendence
{make(chr1_1_1, Allele.REF_T, Allele.ALT_C, Allele.create("AAA"), Allele.ALT_G), // allele order doesn't matter
make(chr1_1_1, Allele.REF_T, Allele.ALT_G, Allele.create("AAA"), Allele.ALT_C), 0},
{chr1_1_1_A_G, chr1_1_A, 1}, // no alt allele
{chr1_1_A, chr1_1_A, 0} // no alt alleles at all

};
}

@Test
public void testVariantContextsWithSameSiteSortLexicographicallyByAlleleIdentical() {
final String contig = "chr1";
final VariantContextComparator comparator = new VariantContextComparator(Collections.singletonList(contig));
final VariantContextBuilder builder = new VariantContextBuilder("test", contig, 1, 1, Collections.emptyList());

final VariantContext variant1 = builder.alleles(Arrays.asList(refA, altG)).make();
final VariantContext variant2 = builder.alleles(Arrays.asList(refA, altG)).make();

final int compare = comparator.compare(variant1, variant2);
Assert.assertEquals(compare, 0); // TODO: What other criteria might we sort by to break this tie?
}

@Test
public void testVariantContextsWithSameSiteSortLexicographicallyByAllele() {
final String contig = "chr1";
final VariantContextComparator comparator = new VariantContextComparator(Collections.singletonList(contig));
final VariantContextBuilder builder = new VariantContextBuilder("test", contig, 1, 1, Collections.emptyList());

final VariantContext variant1 = builder.alleles(Arrays.asList(refA, altG)).make();
final VariantContext variant2 = builder.alleles(Arrays.asList(refA, altT)).make();

final int compare = comparator.compare(variant1, variant2);
Assert.assertEquals(compare, -1);
private static VariantContext make(Locatable pos, Allele ref, Allele ... alt){
final List<Allele> alleles = new ArrayList<>();
alleles.add(ref);
alleles.addAll(Arrays.asList(alt));
return new VariantContextBuilder("test", pos.getContig(), pos.getStart(), pos.getEnd(), alleles).make();
}

@Test
public void testVariantContextsWithSameSiteSortLexicographicallyByAlleleThenExtraAllelesForFirstVariant() {
final String contig = "chr1";
final VariantContextComparator comparator = new VariantContextComparator(Collections.singletonList(contig));
final VariantContextBuilder builder = new VariantContextBuilder("test", contig, 1, 1, Collections.emptyList());

final VariantContext variant1 = builder.alleles(Arrays.asList(refA, altG, altT)).make();
final VariantContext variant2 = builder.alleles(Arrays.asList(refA, altG)).make();

final int compare = comparator.compare(variant1, variant2);
Assert.assertEquals(compare, 1);
}

@Test
public void testVariantContextsWithSameSiteSortLexicographicallyByAlleleThenExtraAllelesForSecondVariant() {
final String contig = "chr1";
final VariantContextComparator comparator = new VariantContextComparator(Collections.singletonList(contig));
final VariantContextBuilder builder = new VariantContextBuilder("test", contig, 1, 1, Collections.emptyList());

final VariantContext variant1 = builder.alleles(Arrays.asList(refA, altG)).make();
final VariantContext variant2 = builder.alleles(Arrays.asList(refA, altG, altT)).make();

final int compare = comparator.compare(variant1, variant2);
Assert.assertEquals(compare, -1);
@Test(dataProvider = "getVCs")
public void testVariantContextComparator(VariantContext left, VariantContext right, int expectedResult){
final VariantContextComparator comparator = new VariantContextComparator(Arrays.asList(CHR1, CHR2));
Assert.assertEquals(comparator.compare(left, right), expectedResult);
Assert.assertEquals(comparator.compare(right, left), -expectedResult); // make sure it's symmetrical
}
}