oduwsdl · machawk1 · May 17, 2022 · May 17, 2022 · May 17, 2022 · May 17, 2022
diff --git a/ipwb/indexer.py b/ipwb/indexer.py
@@ -17,6 +17,7 @@
 import zlib
 import surt
 import ntpath
+import shutil
 import traceback
 import tempfile
 
@@ -29,6 +30,8 @@
 # from requests.exceptions import ConnectionError
 
 from ipwb.util import iso8601_to_digits14, ipfs_client
+from ipwb.util import is_wacz, extract_warcs_from_wacz
+from ipwb.util import cleanup_warc_files_extracted_from_wacz
 
 import requests
 import datetime
@@ -119,6 +122,24 @@ def index_file_at(warc_paths, encryption_key=None,
     for warc_path in warc_paths:
         verify_file_exists(warc_path)
 
+    # Extract WARCs from any WACZ files
+    warc_paths_to_append = []
+    wacz_paths = []
+    # Always defined and accumulated across every WACZ, because the cleanup
+    # loop at the end of this function iterates it even for plain WARC input
+    dirs_to_cleanup = []
+    for warc_path in warc_paths:
+        if is_wacz(warc_path):
+            (w_paths, tmp_dirs) = extract_warcs_from_wacz(warc_path)
+            warc_paths_to_append += w_paths
+            dirs_to_cleanup += tmp_dirs
+            wacz_paths.append(warc_path)
+
+    # Replace each WACZ path with the WARCs extracted from it, building a
+    # new list so the caller's list is not modified
+    warc_paths = [ptr for ptr in warc_paths if ptr not in wacz_paths]
+    warc_paths = warc_paths + warc_paths_to_append
+
     cdxj_lines = []
 
     if outfile:
@@ -167,6 +185,8 @@ def index_file_at(warc_paths, encryption_key=None,
     cdxj_metadata_lines = generate_cdxj_metadata(cdxj_lines)
     cdxj_lines = cdxj_metadata_lines + cdxj_lines
 
+    cleanup_warc_files_extracted_from_wacz(warc_paths_to_append)
+
     if quiet:
         return cdxj_lines
 
@@ -180,6 +200,10 @@ def index_file_at(warc_paths, encryption_key=None,
     else:
         print('\n'.join(cdxj_lines))
 
+    # Cleanup, e.g., dirs for WARCs from WACZ
+    for dir_to_cleanup in dirs_to_cleanup:
+        shutil.rmtree(dir_to_cleanup)
+
 
 def sanitize_cdxj_line(cdxj_line):
     return cdxj_line

diff --git a/ipwb/util.py b/ipwb/util.py
@@ -12,6 +12,11 @@
 import datetime
 import logging
 import platform
+import tempfile
+
+# For extracting WARCs from WACZ
+import glob
+from zipfile import ZipFile, is_zipfile
 
 from enum import Enum, auto
 
@@ -350,3 +355,79 @@ def check_for_update(_):
         print("The installed version of ipwb is outdated.")
         print(f"* Installed: {current}\n* Latest:    {latest}")
         print("Please run `pip install --upgrade ipwb` to upgrade.")
+
+
+def is_wacz(path):
+    '''
+    Return True if ``path`` is treated as a WACZ file.
+
+    The check is ad hoc: any path accepted by ``zipfile.is_zipfile``
+    counts as a WACZ; the WACZ layout itself is not validated.
+    '''
+    # TODO: add logic to check if wacz
+    # the py-wacz validator inherits many dependencies,
+    # so ad hoc here for now
+    return is_zipfile(path)
+
+
+def get_warc_paths_in_wacz(wacz_path):
+    '''
+    List the names of the file members stored under ``archive/``.
+
+    Directory entries (names ending in ``/``) are left out so that only
+    extractable files are reported.
+    '''
+    with ZipFile(wacz_path) as z:
+        return [w for w in z.namelist()
+                if w.startswith('archive/') and not w.endswith('/')]
+
+
+def extract_warcs_to_disk(wacz_path, warc_paths) -> tuple:
+    '''
+    Extract the given members of a WACZ to temporary directories.
+
+    Each member is extracted into its own ``tempfile.mkdtemp()``
+    directory. Returns a tuple ``(extracted_warc_paths, tmp_dirs)``:
+    the on-disk paths of the extracted files and the temporary
+    directories created for them, retained for later deletion by the
+    caller.
+    '''
+    extracted_warc_paths = []
+    tmp_dirs = []
+    if not warc_paths:
+        # Nothing to extract, so do not open the archive at all
+        return (extracted_warc_paths, tmp_dirs)
+
+    # Open the archive once instead of once per member
+    with ZipFile(wacz_path) as z:
+        for warc in warc_paths:
+            tmp_dir = tempfile.mkdtemp()
+            tmp_dirs.append(tmp_dir)  # For later dir deletion
+            extracted_warc_paths.append(z.extract(warc, tmp_dir))
+
+    return (extracted_warc_paths, tmp_dirs)
+
+
+def extract_warcs_from_wacz(wacz_path):
+    '''
+    Extract every ``archive/`` file of a WACZ to temporary storage.
+
+    Returns the tuple ``(warc_paths_on_disk, dirs_to_cleanup)`` produced
+    by ``extract_warcs_to_disk``.
+    '''
+    warc_paths_in_wacz = get_warc_paths_in_wacz(wacz_path)
+    (warc_paths_on_disk, dirs_to_cleanup) = extract_warcs_to_disk(
+        wacz_path, warc_paths_in_wacz)
+
+    return (warc_paths_on_disk, dirs_to_cleanup)
+
+
+def cleanup_warc_files_extracted_from_wacz(warc_paths):
+    '''
+    Delete the given extracted WARC files on a best-effort basis.
+
+    Paths that are not regular files are skipped. An ``OSError`` raised
+    while deleting is printed and does not stop the remaining deletions.
+    '''
+    for temporary_warc in warc_paths:
+        try:
+            if os.path.isfile(temporary_warc):
+                os.remove(temporary_warc)
+        except OSError as e:
+            print(f'Error: {e.filename}, {e.strerror}')
diff --git a/samples/wacz/my-collection.wacz b/samples/wacz/my-collection.wacz
diff --git a/tests/testUtil.py b/tests/testUtil.py
@@ -57,11 +57,14 @@ def count_cdxj_entries(cdxj_data):
     return urim_count
 
 
-def start_replay(warc_filename):
+def start_replay(filename, samples_dir='warcs'):
     global p
+    if filename.endswith('.wacz'):
+        samples_dir = 'wacz'
+
     path_of_warc = os.path.join(
         Path(os.path.dirname(__file__)).parent,
-        'samples', 'warcs', warc_filename)
+        'samples', samples_dir, filename)
 
     fh, tempfile_path = tempfile.mkstemp(suffix='.cdxj')
     os.close(fh)

diff --git a/tests/test_replay.py b/tests/test_replay.py
@@ -49,6 +49,7 @@ def test_replay_404(warc, lookup, has_md_header):
     ('2mementos_queryString.warc',
      '/memento/20130202100000/memento.us/' +
      'index.php?anotherval=ipsum&someval=lorem', 200, None),
+    ('my-collection.wacz', 'memento/*/memento.us', 200, None),
 ])
 def test_replay_search(warc, lookup, status, location):
     ipwb_test.start_replay(warc)