-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate.py
81 lines (72 loc) · 2.65 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from pathlib import Path
import semra
import biolexica
# Absolute path of the directory that contains this script.
HERE = Path(__file__).parent.resolve()
# Output file passed to biolexica.assemble_terms() as ``processed_path``.
TERMS_PATH = HERE / "terms.tsv.gz"
# Ordered list of prefixes. The same list is handed to the SeMRA
# configuration twice: as ``priority`` and as ``keep_prefixes``.
PRIORITY = ["doid", "mondo", "hp", "symp", "mesh", "efo"]
# Biolexica configuration: the resources whose terms are assembled into the
# lexical index, all loaded through the "pyobo" processor.
BIOLEXICA_CONFIG = biolexica.Configuration(
    inputs=[
        # Resources that are passed without any ``ancestors`` argument.
        *(
            biolexica.Input(source=prefix, processor="pyobo")
            for prefix in ("doid", "mondo", "hp", "symp")
        ),
        # MeSH receives the CURIEs of its "C" and "F" categories as ancestors.
        biolexica.Input(
            source="mesh",
            processor="pyobo",
            ancestors=[
                *biolexica.get_mesh_category_curies("C"),
                *biolexica.get_mesh_category_curies("F"),
                # TODO should there be others?
            ],
        ),
        # EFO and NCIT are each given a single ancestor term.
        # NOTE(review): the two CURIEs use different prefix casing
        # ("EFO:" vs "ncit:") -- confirm both forms are accepted.
        biolexica.Input(source="efo", processor="pyobo", ancestors=["EFO:0000408"]),
        # NOTE(review): "ncit" is an input here but does not appear in
        # PRIORITY -- confirm that is intended.
        biolexica.Input(source="ncit", processor="pyobo", ancestors=["ncit:C2991"]),
        # biolexica.Input(source="umls", processor="pyobo"),  # TODO find subset of UMLS
    ],
    excludes=["doid:4"],
)
# SeMRA configuration used to produce the mappings that _main() passes to
# biolexica.assemble_terms().
#
# The name and description previously described a cell / cell-line scenario
# (copied from another configuration); they now describe the disease,
# phenotype and symptom resources that this configuration actually lists.
SEMRA_CONFIG = semra.Configuration(
    name="Disease and Phenotype Mappings",
    description="This configuration imports several different disease, phenotype, and "
    "symptom resources and identifies mappings between them.",
    inputs=[
        semra.Input(source="biomappings"),
        semra.Input(source="gilda"),
        # One pyobo-backed input per prefix in PRIORITY, all at confidence 0.99.
        semra.Input(prefix="doid", source="pyobo", confidence=0.99),
        semra.Input(prefix="mondo", source="pyobo", confidence=0.99),
        semra.Input(prefix="hp", source="pyobo", confidence=0.99),
        semra.Input(prefix="symp", source="pyobo", confidence=0.99),
        semra.Input(prefix="mesh", source="pyobo", confidence=0.99),
        semra.Input(prefix="efo", source="pyobo", confidence=0.99),
    ],
    add_labels=False,
    # The same prefix list serves as both the priority order and the keep-list.
    priority=PRIORITY,
    keep_prefixes=PRIORITY,
    remove_imprecise=False,
    # Mutations are declared for four of the six prefixes (not mesh or efo),
    # each at confidence 0.7.
    mutations=[
        semra.Mutation(source="doid", confidence=0.7),
        semra.Mutation(source="mondo", confidence=0.7),
        semra.Mutation(source="hp", confidence=0.7),
        semra.Mutation(source="symp", confidence=0.7),
    ],
    # Pickle paths, stored next to this script.
    raw_pickle_path=HERE.joinpath("mappings_raw.pkl.gz"),
    processed_pickle_path=HERE.joinpath("mappings_processed.pkl.gz"),
    # NOTE(review): unlike the two paths above, this one has no ".gz" suffix --
    # confirm whether that is deliberate.
    priority_pickle_path=HERE.joinpath("mappings_prioritized.pkl"),
)
def _main() -> None:
    """Get the SeMRA mappings, then assemble the terms into ``TERMS_PATH``."""
    semantic_mappings = SEMRA_CONFIG.get_mappings()
    biolexica.assemble_terms(
        BIOLEXICA_CONFIG,
        mappings=semantic_mappings,
        processed_path=TERMS_PATH,
    )


# Run the generation only when this file is executed as a script.
if __name__ == "__main__":
    _main()