Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add WACZ support #770

Draft
wants to merge 13 commits into
base: master
Choose a base branch
from
24 changes: 24 additions & 0 deletions ipwb/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import zlib
import surt
import ntpath
import shutil
import traceback
import tempfile

Expand All @@ -29,6 +30,8 @@
# from requests.exceptions import ConnectionError

from ipwb.util import iso8601_to_digits14, ipfs_client
from ipwb.util import is_wacz, extract_warcs_from_wacz
from ipwb.util import cleanup_warc_files_extracted_from_wacz

import requests
import datetime
Expand Down Expand Up @@ -119,6 +122,21 @@ def index_file_at(warc_paths, encryption_key=None,
for warc_path in warc_paths:
verify_file_exists(warc_path)

# Extract WARCs from any WACZ files
warc_paths_to_append = []
wacz_paths = []
for warc_path in warc_paths:
if is_wacz(warc_path):
(w_paths, dirs_to_cleanup) = extract_warcs_from_wacz(warc_path)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The dirs_to_cleanup here is overwritten on each loop iteration, so at the end it will only hold references to the temporary dirs of the last WACZ file for cleanup (unless I am missing something).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch on dirs_to_cleanup not being retained.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the logic of extracted WARCs and temp directories can be simplified. There is a bit too much disk maintenance. The general gist is that paths to WACZ files could be inter-mingled with WARCs here and thus the WARCs extracted from the WACZ files need to be removed but not the WARCs that were passed in.

warc_paths_to_append += w_paths
wacz_paths.append(warc_path)

# Manipulate list of WARCs extracted from WACZ
for ptr in wacz_paths:
warc_paths.remove(ptr)

warc_paths = warc_paths + warc_paths_to_append
machawk1 marked this conversation as resolved.
Show resolved Hide resolved

cdxj_lines = []

if outfile:
Expand Down Expand Up @@ -167,6 +185,8 @@ def index_file_at(warc_paths, encryption_key=None,
cdxj_metadata_lines = generate_cdxj_metadata(cdxj_lines)
cdxj_lines = cdxj_metadata_lines + cdxj_lines

cleanup_warc_files_extracted_from_wacz(warc_paths_to_append)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a big issue, but I think the temporary folders created by the mkdtemp() call will continue to exist (until cleaned up by the OS) because only the files inside them are deleted, not the folders themselves.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docs say that the creator is responsible for the deletion, so I think we should handle this. Given each WARC gets a new temp directory, it might be better to just retain the copy of this directory path and delete it along with its contents instead of deleting the WARC then the directory, which would require tracking the directory path, too. Which approach would you rather be implemented, @ibnesayeed?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

which would require tracking the directory path, too

Not really! It is possible to get the path of the directory if the path of a file it contains is known.

That said, I would perhaps have preferred not holding onto the list of WARC files and, instead, operating on each WARC as we discover it, whether it is a regular WARC file or one extracted from a WACZ file. I would deal with one file at a time and loop over for the next one.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ibnesayeed This seems like it requires a revamp outside of the scope of this GH issue/PR. I agree that dealing with one WARC at a time would likely be more computationally optimal.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand that it would require a change in the workflow. When done, it would be more space efficient, as not all the WARCs would need to be extracted from the WACZ files upfront (duplicating them on disk) before processing them.

It is okay to leave it as things are right now and get back to this when we have a WARC record iterator for the WACZ files, when most of these changes will be rendered useless.


if quiet:
return cdxj_lines

Expand All @@ -180,6 +200,10 @@ def index_file_at(warc_paths, encryption_key=None,
else:
print('\n'.join(cdxj_lines))

# Cleanup, e.g., dirs for WARCs from WACZ
for dir_to_cleanup in dirs_to_cleanup:
shutil.rmtree(dir_to_cleanup)


def sanitize_cdxj_line(cdxj_line):
return cdxj_line
Expand Down
51 changes: 51 additions & 0 deletions ipwb/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
import datetime
import logging
import platform
import tempfile

# For extracting WARCs from WACZ
import glob
from zipfile import ZipFile, is_zipfile

from enum import Enum, auto

Expand Down Expand Up @@ -350,3 +355,49 @@ def check_for_update(_):
print("The installed version of ipwb is outdated.")
print(f"* Installed: {current}\n* Latest: {latest}")
print("Please run `pip install --upgrade ipwb` to upgrade.")


def is_wacz(path):
    """Report whether ``path`` should be treated as a WACZ file.

    Only the container format is inspected: any file that ``is_zipfile``
    accepts is reported as a WACZ.
    """
    # TODO: add logic to check if wacz
    # The py-wacz validator inherits many dependencies,
    # so an ad hoc ZIP check is used here for now.
    looks_like_zip = is_zipfile(path)
    return looks_like_zip


def get_warc_paths_in_wacz(wacz_path):
    """List the archive members stored under ``archive/`` in a WACZ file.

    Args:
        wacz_path: Filesystem path of the WACZ (ZIP) file to inspect.

    Returns:
        Member names, in archive order, that start with ``archive/`` and
        are files rather than directory entries.
    """
    with ZipFile(wacz_path) as z:
        # ZIP files may carry an explicit entry for the ``archive/``
        # directory itself; extracting that would yield a directory, not
        # a WARC, so directory entries are left out.
        return [
            info.filename for info in z.infolist()
            if info.filename.startswith('archive/') and not info.is_dir()
        ]


def extract_warcs_to_disk(wacz_path, warc_paths) -> tuple:
    """Extract the given members of a WACZ file into temporary directories.

    Each member is extracted into its own directory created with
    ``tempfile.mkdtemp()``. The caller is responsible for removing the
    returned directories once the extracted files are no longer needed.

    Args:
        wacz_path: Filesystem path of the WACZ (ZIP) file.
        warc_paths: Member names inside the archive to extract.

    Returns:
        A 2-tuple ``(extracted_warc_paths, tmp_dirs)``: the on-disk paths
        of the extracted members and the temporary directories holding
        them, in the same order as ``warc_paths``.

    Raises:
        KeyError: If a requested member is not present in the archive.
            Temporary directories created so far are removed before the
            exception propagates.
    """
    import shutil  # Local import: only needed on the failure path

    extracted_warc_paths = []
    tmp_dirs = []
    try:
        # Open the archive once rather than once per member.
        with ZipFile(wacz_path) as z:
            for warc in warc_paths:
                tmp_dir = tempfile.mkdtemp()
                # Track the directory before extracting so that it is
                # cleaned up below if this extraction fails.
                tmp_dirs.append(tmp_dir)
                extracted_warc_paths.append(z.extract(warc, tmp_dir))
    except Exception:
        # The caller never receives tmp_dirs on failure, so remove them
        # here instead of leaking them.
        for tmp_dir in tmp_dirs:
            shutil.rmtree(tmp_dir, ignore_errors=True)
        raise

    return (extracted_warc_paths, tmp_dirs)


def extract_warcs_from_wacz(wacz_path):
    """Extract the WARCs contained in a WACZ file to disk.

    Looks up the member names with ``get_warc_paths_in_wacz`` and hands
    them to ``extract_warcs_to_disk``.

    Returns:
        A 2-tuple of the extracted WARC paths and the directories that
        should be cleaned up afterwards.
    """
    members = get_warc_paths_in_wacz(wacz_path)
    extracted_paths, cleanup_dirs = extract_warcs_to_disk(wacz_path, members)
    return (extracted_paths, cleanup_dirs)


def cleanup_warc_files_extracted_from_wacz(warc_paths):
    """Delete the temporary WARC files listed in ``warc_paths``.

    Paths that are not regular files are skipped. An ``OSError`` raised
    while checking or removing a path is reported on stdout and does not
    stop the remaining paths from being processed.
    """
    for extracted in warc_paths:
        try:
            if not os.path.isfile(extracted):
                continue
            os.remove(extracted)
        except OSError as err:
            print(f'Error: {err.filename}, {err.strerror}')
Binary file added samples/wacz/my-collection.wacz
Binary file not shown.
7 changes: 5 additions & 2 deletions tests/testUtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,14 @@ def count_cdxj_entries(cdxj_data):
return urim_count


def start_replay(warc_filename):
def start_replay(filename, samples_dir='warcs'):
global p
if filename.endswith('.wacz'):
samples_dir = 'wacz'

path_of_warc = os.path.join(
Path(os.path.dirname(__file__)).parent,
'samples', 'warcs', warc_filename)
'samples', samples_dir, filename)

fh, tempfile_path = tempfile.mkstemp(suffix='.cdxj')
os.close(fh)
Expand Down
1 change: 1 addition & 0 deletions tests/test_replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def test_replay_404(warc, lookup, has_md_header):
('2mementos_queryString.warc',
'/memento/20130202100000/memento.us/' +
'index.php?anotherval=ipsum&someval=lorem', 200, None),
('my-collection.wacz', 'memento/*/memento.us', 200, None),
])
def test_replay_search(warc, lookup, status, location):
ipwb_test.start_replay(warc)
Expand Down
Loading