-
Notifications
You must be signed in to change notification settings - Fork 2
/
initial-tagging.py
64 lines (51 loc) · 1.83 KB
/
initial-tagging.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""Create project-tags to be added to the catalog
Do this once with tags alone, to avoid re-extracting all datasets.
TODO: refactor and include in load_tabby.
"""
from argparse import ArgumentParser
import json
from pathlib import Path
from datalad_catalog.schema_utils import get_metadata_item
from datalad_tabby.io import load_tabby
from utils import mint_dataset_id
def _as_list(value):
    """Return ``value`` as a list.

    ``None`` becomes an empty list, a list or tuple is copied, and any other
    value (typically a single string) is wrapped in a one-element list.
    """
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _missing_project_tags(projects, keywords):
    """Return the upper-cased project names not yet present in ``keywords``.

    Each project name is upper-cased and compared for exact, case-sensitive
    equality against the entries of ``keywords``. The result follows the
    order of ``projects`` and lists every tag at most once.
    """
    tags = []
    for project in projects:
        tag = project.upper()
        # The second test prevents duplicates when two project entries differ
        # only in case.
        if tag not in keywords and tag not in tags:
            tags.append(tag)
    return tags


parser = ArgumentParser()
parser.add_argument("superds", type=Path, help="Superdataset location")
parser.add_argument("outfile", type=Path, help="Output metadata file")
args = parser.parse_args()

metadata_items = []
# every dataset tabby file found below <superds>/.datalad/tabby
tabby_files = (args.superds / ".datalad" / "tabby").rglob("dataset*tsv")

for tabby in tabby_files:
    # todo: handle encoding
    record = load_tabby(
        tabby,
        cpaths=[Path(__file__).parent / "conventions"],
    )

    # dataset ID and version
    dataset_id = mint_dataset_id(record.get("name"), record.get("crc-project"))
    dataset_version = record.get("version")

    # project names and keywords
    # "crc-project" may be a single string or a list, so it is normalised.
    # "keywords" is normalised the same way so that the membership test is an
    # exact match against list entries and never a substring match against a
    # lone string (assumes keywords follows the same single-or-list
    # convention as crc-project -- todo: confirm against load_tabby).
    projects = _as_list(record["crc-project"])
    keywords = _as_list(record.get("keywords"))
    new_keywords = _missing_project_tags(projects, keywords)

    if not new_keywords:
        print("Nothing to add for", tabby)

    # NOTE(review): an item is emitted even when new_keywords is empty;
    # confirm whether such datasets should be skipped instead.
    metadata_items.append(
        get_metadata_item(
            item_type="dataset",
            dataset_id=dataset_id,
            dataset_version=dataset_version,
            source_name="manual_addition",
            source_version="0.1.0",
        )
        | {"keywords": new_keywords}
    )

# one JSON object per line
with args.outfile.open("w") as json_file:
    for item in metadata_items:
        json.dump(item, json_file)
        json_file.write("\n")