Skip to content

Commit

Permalink
Remove the "shard-" prefix from output tar filenames in v2d_to_metadata.py and v2d_to_transcript.py
Browse files: browse the repository at this point in the history
  • Loading branch information
kdu4108 committed Aug 5, 2024
1 parent c25a2d5 commit 7c98e82
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 10 deletions.
9 changes: 4 additions & 5 deletions pseudolabeling/v2d_to_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,16 @@ def process_tar_files(source_directory, target_directory, dataset, skip_existing

os.makedirs(target_directory, exist_ok=True)

for tar_path in tqdm(os.listdir(source_directory)):
if tar_path.endswith(".tar"):
shard_name = "shard-" + os.path.splitext(tar_path)[0] + ".tar"
target_tar_path = os.path.join(target_directory, shard_name)
for filename in tqdm(os.listdir(source_directory)):
if filename.endswith(".tar"):
target_tar_path = os.path.join(target_directory, filename)
print(target_tar_path)

if skip_existing and os.path.exists(target_tar_path):
print(f"Skipping already processed file: {target_tar_path}")
continue

source_tar_path = os.path.join(source_directory, tar_path)
source_tar_path = os.path.join(source_directory, filename)
with tarfile.open(source_tar_path, "r") as tar:
temp_dir = tempfile.mkdtemp()
try:
Expand Down
10 changes: 5 additions & 5 deletions pseudolabeling/v2d_to_transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import shutil
import tarfile
import tempfile
from tqdm import tqdm
from datetime import timedelta


Expand All @@ -19,17 +20,16 @@ def process_tar_files(source_directory, target_directory, skip_existing=True):
"""Extract, process, and re-package JSON files in TAR archives."""
os.makedirs(target_directory, exist_ok=True)

for tar_path in os.listdir(source_directory):
if tar_path.endswith(".tar"):
shard_name = "shard-" + os.path.splitext(tar_path)[0] + ".tar"
target_tar_path = os.path.join(target_directory, shard_name)
for filename in tqdm(os.listdir(source_directory)):
if filename.endswith(".tar"):
target_tar_path = os.path.join(target_directory, filename)
print(target_tar_path)

if skip_existing and os.path.exists(target_tar_path):
print(f"Skipping already processed file: {target_tar_path}")
continue

source_tar_path = os.path.join(source_directory, tar_path)
source_tar_path = os.path.join(source_directory, filename)
with tarfile.open(source_tar_path, "r") as tar:
temp_dir = tempfile.mkdtemp()
try:
Expand Down

0 comments on commit 7c98e82

Please sign in to comment.