From 1231af1c35f4d38ba1caba47eb0946267e820697 Mon Sep 17 00:00:00 2001 From: Keith James Date: Tue, 17 Sep 2024 16:11:00 +0100 Subject: [PATCH] Add local_checksum parameter to Collection::put Adding this parameter allows a callable to be supplied which can obtain per-file checksums during a recursive put operation, e.g. by reading from an accompanying .md5 file or by deciding, based on file size, whether to calculate checksums or to insist that an earlier process supply one pre-calculated. Improve test coverage of checksum operations. --- src/partisan/irods.py | 15 ++++++++-- tests/test_irods.py | 67 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/src/partisan/irods.py b/src/partisan/irods.py index c9780b3..522ed22 100644 --- a/src/partisan/irods.py +++ b/src/partisan/irods.py @@ -2194,7 +2194,7 @@ def put( against the remote checksum calculated by the iRODS server for data objects. local_checksum: A caller-supplied checksum of the local file. This may be a - string, a path to a file containing a string, or a file name + string, a path to a file containing a string, or a file path transformation function. If the latter, it must accept the local path as its only argument and return a string checksum. 
Typically, this is useful when this checksum is available from an earlier process that @@ -2239,9 +2239,9 @@ def put( else: raise ValueError( f"Invalid type for local_checksum: {type(local_checksum)}; must be " - "a string or a path of a file containing a string" + "a string, a path of a file containing a string, or a callable " + "taking a path of a file and returning a string" ) - if fill and self.exists() and self.checksum() == chk: log.info( "Data object already exists in iRODS with matching checksum; skipping", @@ -2656,6 +2656,7 @@ def put( recurse=False, calculate_checksum=False, verify_checksum=False, + local_checksum=None, compare_checksums=False, fill=False, force=True, @@ -2673,6 +2674,12 @@ def put( verify_checksum: Verify the local checksum calculated by the iRODS C API against the remote checksum calculated by the iRODS server for data objects. See DataObject.put() for more information. + local_checksum: A callable that returns a checksum for a local file. See + DataObject.put() for more information. This is called for each file + encountered while recursing, with the file path as its argument. + (Also accepts a string or a path to a file containing a string, as does + DataObject.put(), however this is not useful for collections except in + the edge case where all the files have identical contents). compare_checksums: Compare caller-supplied local checksums to the remote checksums calculated by the iRODS server after the put operation for data objects. If the checksums do not match, raise an error. 
See @@ -2707,6 +2714,7 @@ def put( p, calculate_checksum=calculate_checksum, verify_checksum=verify_checksum, + local_checksum=local_checksum, compare_checksums=compare_checksums, fill=fill, force=force, @@ -2726,6 +2734,7 @@ def put( p, calculate_checksum=calculate_checksum, verify_checksum=verify_checksum, + local_checksum=local_checksum, compare_checksums=compare_checksums, force=force, timeout=timeout, diff --git a/tests/test_irods.py b/tests/test_irods.py index 70036f7..f9c2910 100644 --- a/tests/test_irods.py +++ b/tests/test_irods.py @@ -830,6 +830,73 @@ def test_data_object_put_checksum_no_verify(self, simple_collection): assert obj.size() == 555 assert obj.checksum() == "39a4aa291ca849d601e4e5b8ed627a04" + @m.it("Can put put from a local file with checksum calculated on the fly") + def test_data_object_put_checksum_supplied(self, simple_collection): + obj = DataObject(simple_collection / "new.txt") + assert not obj.exists() + + local_path = Path("./tests/data/simple/data_object/lorem.txt").absolute() + checksum = "39a4aa291ca849d601e4e5b8ed627a04" + + obj.put(local_path, calculate_checksum=True, compare_checksums=True) + assert obj.exists() + assert obj.checksum() == checksum + + @m.it("Can put put from a local file with a supplied local checksum string") + def test_data_object_put_checksum_supplied(self, simple_collection): + obj = DataObject(simple_collection / "new.txt") + assert not obj.exists() + + local_path = Path("./tests/data/simple/data_object/lorem.txt").absolute() + checksum = "39a4aa291ca849d601e4e5b8ed627a04" + obj.put( + local_path, + calculate_checksum=True, + compare_checksums=True, + local_checksum=checksum, + ) + assert obj.exists() + assert obj.checksum() == checksum + + @m.it("Can put put from a local file with a supplied local checksum callable") + def test_data_object_put_callable_supplied(self, simple_collection): + obj = DataObject(simple_collection / "new.txt") + assert not obj.exists() + + local_path = 
Path("./tests/data/simple/data_object/lorem.txt").absolute() + checksum = "39a4aa291ca849d601e4e5b8ed627a04" + obj.put( + local_path, + calculate_checksum=True, + compare_checksums=True, + local_checksum=lambda _: checksum, + ) + assert obj.exists() + assert obj.checksum() == checksum + + @m.it("Raises an error if a supplied local checksum callable does not match") + def test_data_object_put_callable_supplied(self, simple_collection): + obj = DataObject(simple_collection / "new.txt") + assert not obj.exists() + + local_path = Path("./tests/data/simple/data_object/lorem.txt").absolute() + with pytest.raises(ValueError, match="mismatch"): + obj.put( + local_path, + calculate_checksum=True, + compare_checksums=True, + local_checksum=lambda _: "a bad checksum", + ) + + @m.it("Raises an error if a supplied local checksum string does not match") + def test_data_object_put_checksum_supplied_mismatch(self, simple_collection): + obj = DataObject(simple_collection / "new.txt") + assert not obj.exists() + + local_path = Path("./tests/data/simple/data_object/lorem.txt").absolute() + with pytest.raises(ValueError, match="mismatch"): + obj.put(local_path, compare_checksums=True, local_checksum="a bad checksum") + @m.describe("Operations on an existing DataObject") @m.context("When a DataObject exists") @m.it("Can be detected")