Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

handle utf-8 encoding #1569

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 6 additions & 12 deletions src/main/java/htsjdk/samtools/util/AsciiWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import java.io.IOException;
import java.io.OutputStream;
import java.io.Writer;
import java.nio.charset.*;
Copy link
Collaborator

@cmnbroad cmnbroad Oct 5, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By convention we try to avoid import *, except in the case where not doing so would require a really large list of imports. In this case, it looks like you're only using StandardCharsets, so you can just import that.


/**
* Fast (I hope) buffered Writer that converts char to byte merely by casting, rather than charset conversion.
Expand Down Expand Up @@ -70,17 +71,10 @@ public void flush() throws IOException {
* All other Writer methods vector through this, so this is the only one that must be overridden.
*/
@Override
public void write(final char[] chars, int offset, int length) throws IOException {
while (length > 0) {
final int charsToConvert = Math.min(length, buffer.length - numBytes);
StringUtil.charsToBytes(chars, offset, charsToConvert, buffer, numBytes);
numBytes += charsToConvert;
offset += charsToConvert;
length -= charsToConvert;
if (numBytes == buffer.length) {
os.write(buffer, 0, numBytes);
numBytes = 0;
}
}
public void write(final char[] chars, int off, int len) throws IOException {
String str = new String(chars,off,len);
byte[] b = str.getBytes(StandardCharsets.UTF_8);
Copy link
Collaborator

@cmnbroad cmnbroad Oct 5, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is going to write all SAM content as UTF-8, but I think the spec only permits UTF-8 in a few places, and requires 7 bit ascii elsewhere. So we may need a different approach.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like we're actually just doing a costly reimplementation of a standard Java Writer here now. It seems like validation of the character ranges should be done in the SamRecord and not here. Maybe this class has outlived it's usefulness and we can just use a standard Writer in place of it.

int bufferLength = b.length;
os.write(b, 0, bufferLength);
}
}
94 changes: 94 additions & 0 deletions src/test/java/htsjdk/samtools/SAMFileRoundTripTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
package htsjdk.samtools;

import htsjdk.*;
import htsjdk.samtools.util.*;
import org.testng.*;
import org.testng.annotations.*;
import java.io.*;

public class SAMFileRoundTripTest extends HtsjdkTest{
private static final File TEST_DATA_DIR = new File("src/test/resources/htsjdk/samtools");

@DataProvider(name = "Utf8PositiveTestCases")
public Object[][] Utf8PositiveTestCases() {
SAMProgramRecord programRecordRoundTrip = new SAMProgramRecord("33");
programRecordRoundTrip.setAttribute("CL","xy");
programRecordRoundTrip.setAttribute("DS","description");

SAMProgramRecord programRecordRoundTripUtf8 = new SAMProgramRecord("33");
programRecordRoundTripUtf8.setAttribute("CL","äカ");
programRecordRoundTripUtf8.setAttribute("DS","\uD83D\uDE00リ");

SAMSequenceRecord sequenceRecordRoundTrip = new SAMSequenceRecord("chr3",101);
sequenceRecordRoundTrip.setAttribute("DS","descriptionhere");
sequenceRecordRoundTrip.setSequenceIndex(2);

SAMSequenceRecord sequenceRecordRoundTripUtf8 = new SAMSequenceRecord("chr3",101);
sequenceRecordRoundTripUtf8.setAttribute("DS","Emoji\uD83D\uDE0A");
sequenceRecordRoundTripUtf8.setSequenceIndex(2);

return new Object[][]{
{"roundtrip.sam", programRecordRoundTrip, "@CO\tcomment here", sequenceRecordRoundTrip},
{"roundtrip_with_utf8.sam", programRecordRoundTripUtf8, "@CO\tKanjiアメリカ\uD83D\uDE00リä", sequenceRecordRoundTripUtf8}
};
}

@Test(dataProvider = "Utf8PositiveTestCases", description = "Test UTF-8 encoding present in permitted fields of a SAM file")
public void Utf8RoundTripPositiveTests(final String inputFile, SAMProgramRecord programRecord,final String commentRecord, SAMSequenceRecord sequenceRecord) throws Exception {
final File input = new File(TEST_DATA_DIR, inputFile);
final File outputFile = File.createTempFile("roundtrip-utf8-out", ".sam");
outputFile.delete();
outputFile.deleteOnExit();
final SAMFileWriterFactory factory = new SAMFileWriterFactory();
try (SamReader reader = SamReaderFactory.makeDefault().open(input);
SAMFileWriter writer = factory.makeSAMWriter(reader.getFileHeader(), false, new FileOutputStream(outputFile))) {
for (SAMRecord rec : reader) {
writer.addAlignment(rec);
}
SAMFileHeader head = reader.getFileHeader();
Assert.assertEquals(head.getProgramRecords().get(0), programRecord);
Assert.assertEquals(head.getComments().get(0),commentRecord );
Assert.assertEquals(head.getSequence("chr3"),sequenceRecord);
}

final String originalsam;
try (InputStream is = new FileInputStream(input)) {
originalsam = IOUtil.readFully(is);
}

final String writtenSam;
try (InputStream is = new FileInputStream(outputFile)) {
writtenSam = IOUtil.readFully(is);
}

Assert.assertEquals(writtenSam, originalsam);
}

@DataProvider(name = "Utf8NegativeTestCases")
public Object[][] Utf8NegativeTestCases() {
return new Object[][]{
{"roundtrip_with_utf8_bad_1.sam", "Invalid character in read bases"},
{"roundtrip_with_utf8_bad_2.sam", "Non-numeric value in POS column"},
{"roundtrip_with_utf8_bad_2.sam", "Non-numeric value in POS column"}
Copy link
Collaborator

@cmnbroad cmnbroad Oct 5, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these tests may be failing somewhat accidentally, because they're testing multi byte chars in fields that are already constrained to specific sets of characters (numeric for POS, or valid bases for read bases). I'm not certain that its demonstrating that the chars are rejected because they're multi-byte; but rather that its just that they don't conform to the constraints. I suspect they would still fail if you used single byte ASCII characters that didn't conform.

It may be find to leave these, but I think we some need additional cases that use multi byte chars in places where there is no constraint other than that they be ASCII.

};
}

@Test(dataProvider = "Utf8NegativeTestCases",description = "Test UTF-8 encoding present in unpermitted fields of a SAM file", expectedExceptions = {IllegalArgumentException.class, SAMFormatException.class })
public void Utf8RoundTripNegativeTest(final String inputFile,final String exceptionString) throws Exception {
final File input = new File(TEST_DATA_DIR, inputFile);
final File outputFile = File.createTempFile("roundtrip-utf8-out", ".sam");
outputFile.delete();
outputFile.deleteOnExit();
final SAMFileWriterFactory factory = new SAMFileWriterFactory();
try (SamReader reader = SamReaderFactory.makeDefault().open(input);
SAMFileWriter writer = factory.makeSAMWriter(reader.getFileHeader(), false, new FileOutputStream(outputFile))) {
for (SAMRecord rec : reader) {
writer.addAlignment(rec);
}
}
catch (final Exception ex) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The expectedExceptions clause indicates that you expect IllegalArgumentException or SAMFormatException. So it would be preferable to narrow the scope of the catch clause to specify the narrower types that you actually expect (you can use multi-catch clause to match more than one type). If some other exception got thrown, like an IO exception, this clause will catch it and the assert will fail, even though the real problem would be completely unrelated.

Assert.assertTrue(ex.getMessage().contains(exceptionString));
throw ex;
Copy link
Collaborator

@cmnbroad cmnbroad Oct 5, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The way this test is constructed makes it hard to tell if its throwing on read, or on write. Lets replace it with separate tests that show that we fail in both cases. Since we will presumably fail when we read a misplaced multibyte character, I think the test that shows that we fail on write will have to create the output programmatically, rather then via roundtrip.

}
}
}
4 changes: 3 additions & 1 deletion src/test/resources/htsjdk/samtools/roundtrip.sam
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
@HD VN:1.6 SO:unsorted
@SQ SN:chr1 LN:101
@SQ SN:chr2 LN:101
@SQ SN:chr3 LN:101
@SQ SN:chr3 LN:101 DS:descriptionhere
@RG ID:0 SM:Hi,Mom!
@RG ID:rg1 PL:ILLUMINA SM:sm1
@PG ID:33 CL:xy DS:description
@CO comment here
A 73 chr2 1 255 10M * 0 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
A 133 * 0 0 * chr2 1 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
B 99 chr1 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
Expand Down
18 changes: 18 additions & 0 deletions src/test/resources/htsjdk/samtools/roundtrip_with_utf8.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
@HD VN:1.6 SO:unsorted
@SQ SN:chr1 LN:101
@SQ SN:chr2 LN:101
@SQ SN:chr3 LN:101 DS:Emoji😊
@RG ID:0 SM:Hi,Mom! DS:Kanjiアメリカ
@RG ID:rg1 PL:ILLUMINA SM:sm1
@PG ID:33 CL:äカ DS:😀リ
@CO Kanjiアメリカ😀リä
A 73 chr2 1 255 10M * 0 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
A 133 * 0 0 * chr2 1 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
B 99 chr1 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
B 147 chr1 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
C 99 chr2 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
C 147 chr2 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
D 99 chr3 1 255 10M = 25 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
D 147 chr3 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
E 99 chr1 2 255 10M = 15 30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
E 147 chr1 15 255 10M = 2 -30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
18 changes: 18 additions & 0 deletions src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_1.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
@HD VN:1.6 SO:unsorted
@SQ SN:chr1 LN:101
@SQ SN:chr2 LN:101
@SQ SN:chr3 LN:101 DS:Emoji😊
@RG ID:0 SM:Hi,Mom! DS:Kanjiアメリカ
@RG ID:rg1 PL:ILLUMINA SM:sm1
@PG ID:33 CL:äカ DS:😀リ
@CO Kanjiアメリカ😀リä
A 73 chr2 1 255 10M * 0 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
A 133 * 0 0 * chr2 1 0 CAリCAGAAGC )'.*.+2,)) RG:Z:rg1
B 99 chr1 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
B 147 chr1 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
C 99 chr2 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
C 147 chr1 😀6 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
D 99 chr3 1 255 10M = 25 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
D 147 chr3 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
E 99 chr1 2 255 10M = 15 30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
E 147 chr1 15 255 10M = 2 -30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
18 changes: 18 additions & 0 deletions src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_2.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
@HD VN:1.6 SO:unsorted
@SQ SN:chr1 LN:101
@SQ SN:chr2 LN:101
@SQ SN:chr3 LN:101 DS:Emoji😊
@RG ID:0 SM:Hi,Mom! DS:Kanjiアメリカ
@RG ID:rg1 PL:ILLUMINA SM:sm1
@PG ID:33 CL:äカ DS:😀リ
@CO Kanjiアメリカ😀リä
A 73 chr2 1 255 10M * 0 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
A 133 * 0 0 * chr2 1 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
B 99 chr1 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
B 147 chr1 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
C 99 chr2 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
C 147 chr1 😀6 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
D 99 chr3 1 255 10M = 25 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
D 147 chr3 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
E 99 chr1 2 255 10M = 15 30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
E 147 chr1 15 255 10M = 2 -30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
18 changes: 18 additions & 0 deletions src/test/resources/htsjdk/samtools/roundtrip_with_utf8_bad_3.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
@HD VN:1.6 SO:unsorted
@SQ SN:chr1 LN:101
@SQ SN:chr2 LN:101
@SQ SN:chr3 LN:101 DS:Emoji😊
@RG ID:0 SM:Hi,Mom! DS:Kanjiアメリカ
@RG ID:rg1 PL:ILLUMINA SM:sm1
@PG ID:33 CL:äカ DS:😀リ
@CO Kanjiアメリカ😀リä
A 73 chr2 1 255 10M * 0 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
A 133 * 0 0 * chr2 1 0 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
B 99 chr1 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) R😀:Z:rg1
B 147 chr1 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
C 99 chr2 1 255 10M = 26 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
C 147 chr1 😀6 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
D 99 chr3 1 255 10M = 25 35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
D 147 chr3 26 255 10M = 1 -35 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
E 99 chr1 2 255 10M = 15 30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1
E 147 chr1 15 255 10M = 2 -30 CAACAGAAGC )'.*.+2,)) RG:Z:rg1