From 31876e71ef6c5e7f26d6e2f5335c6b534f6e18ca Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 11 Jul 2024 11:43:50 +0200 Subject: [PATCH] fix(`decode_bytes`): handle multiple errors This commit fixes a bug that was detected by @mih. Due to this bug, multiple encoding errors in a single input chunk of `decode_bytes` lead to unexpected decoding-exceptions. The commit also adds a regression test to ensure that multiple encoding errors in a single input chunk are handled properly. --- datasalad/itertools/decode_bytes.py | 2 +- datasalad/itertools/tests/test_decode_bytes.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/datasalad/itertools/decode_bytes.py b/datasalad/itertools/decode_bytes.py index 6255360..e52a6f7 100644 --- a/datasalad/itertools/decode_bytes.py +++ b/datasalad/itertools/decode_bytes.py @@ -102,7 +102,7 @@ def handle_decoding_error( raise exc return ( position + exc.end, - joined_data[: position + exc.start].decode(encoding) + joined_data[position : position + exc.start].decode(encoding) + joined_data[position + exc.start : position + exc.end].decode( encoding, errors='backslashreplace' ), diff --git a/datasalad/itertools/tests/test_decode_bytes.py b/datasalad/itertools/tests/test_decode_bytes.py index e597162..d7a55a1 100644 --- a/datasalad/itertools/tests/test_decode_bytes.py +++ b/datasalad/itertools/tests/test_decode_bytes.py @@ -35,3 +35,8 @@ def test_no_empty_strings(): # check that empty strings are not yielded r = tuple(decode_bytes([b'\xc3', b'\xb6'])) assert r == ('ö',) + + +def test_multiple_errors(): + r = ''.join(decode_bytes([b'08 War \xaf No \xaf More \xaf Trouble.shn.mp3'])) + assert r == '08 War \\xaf No \\xaf More \\xaf Trouble.shn.mp3'