-
Notifications
You must be signed in to change notification settings - Fork 2
/
preprocessing.py
60 lines (45 loc) · 1.64 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# This file creates a new drug-protein association matrix based on shortest-path calculations.
import numpy as np
import csv
import networkx as nx
import joblib
from tqdm import tqdm
import pandas as pd
# Load the drug -> protein associations. Every non-blank line is split on
# whitespace; field 0 is used as the drug and field 1 as the protein.
with open('./data/DrugsToProteins.txt', "r") as f:
    # Blank lines (e.g. a trailing newline at end of file) are skipped so they
    # do not produce empty records that fail on the x[1] lookup below.
    # The `with` statement closes the file; no explicit close() is needed.
    R_DP = [fields for fields in (line.split() for line in f) if fields]

# Unique identifiers, sorted so that every index derived from these lists is
# reproducible from one run to the next (set iteration order is not).
PROTEINS = sorted({x[1] for x in R_DP})
DRUGS = sorted({x[0] for x in R_DP})
drug_set = set(DRUGS)
# Load the drug -> label associations. The first whitespace-separated token
# of a line is the drug; the remaining tokens, re-joined with single spaces,
# form the label (labels may therefore contain several words).
with open('./data/DrugsToLabels.txt', "r") as f:
    # Each line is split once, and blank lines are skipped so they cannot
    # raise IndexError on the fields[0] lookup.
    R_DL_all = [
        [fields[0], " ".join(fields[1:])]
        for fields in (line.split() for line in f)
        if fields
    ]

# Keep only the labels of drugs that also have a protein association.
R_DL = [pair for pair in R_DL_all if pair[0] in drug_set]

# Unique labels, sorted so the order is the same on every run.
LABELS = sorted({pair[1] for pair in R_DL})
# Load the protein-protein interactions. Only the first two whitespace-
# separated fields of each line (the two endpoints) are kept.
with open('./data/ProteinsToProteins.txt', "r") as f:
    # Blank lines are skipped: they would otherwise become empty edges that
    # cannot be reversed or inserted into the graph.
    # The `with` statement closes the file; no explicit close() is needed.
    R_PP = [fields[:2] for fields in (line.split() for line in f) if fields]
def revert_edge(x):
    """Return the edge ``x`` with its two endpoints swapped, as a list."""
    return [x[1], x[0]]


# Fix the row (drug) and column (protein) order used by the matrices.
PROTEINS.sort()
DRUGS.sort()

# Directed graph over proteins and drugs:
#   * every [drug, protein] record of R_DP contributes the edge protein -> drug;
#   * every protein-protein record of R_PP contributes edges in both directions.
G = nx.DiGraph()
G.add_nodes_from(PROTEINS + DRUGS)
G.add_edges_from(revert_edge(pair) for pair in R_DP)
G.add_edges_from(R_PP)
G.add_edges_from(revert_edge(pair) for pair in R_PP)
# Build the (drug x protein) matrix of shortest-path lengths in G.
# R23_new[i, j] is the number of edges on the shortest directed path from
# PROTEINS[j] to DRUGS[i]; the entry stays 0 when no such path exists.
n2, n3 = len(DRUGS), len(PROTEINS)
R23_new = np.zeros((n2, n3))

# Protein name -> column index, so reachable nodes map to their column in O(1).
protein_column = {protein: j for j, protein in enumerate(PROTEINS)}

for i in tqdm(range(n2)):
    # With only `target` given, shortest_path_length returns the distance to
    # the target from every node that can reach it. One traversal per drug
    # replaces a has_path + shortest_path_length search per (drug, protein).
    lengths_to_drug = dict(nx.shortest_path_length(G, target=DRUGS[i]))
    for node, length in lengths_to_drug.items():
        j = protein_column.get(node)
        # Nodes that are not in PROTEINS (e.g. the drug itself) have no column.
        if j is not None:
            R23_new[i, j] = length
# Discard long-range associations: any path longer than 3 edges is treated
# the same as "no path" and reset to 0 (in place, on the same array).
R23_new[R23_new > 3] = 0
# Decay-weighted copy of the path-length matrix: a non-zero hop count d is
# replaced by 0.2 ** (d - 1), so d = 1 -> 1.0, d = 2 -> 0.2, d = 3 -> 0.04.
# Zero entries (no association) are left at 0.
hop_counts = R23_new.astype('float32')
R23_new_1 = hop_counts.copy()
for hops in np.unique(hop_counts):
    if int(hops) != 0:
        # Masks are taken on the untouched snapshot so freshly written
        # weights are never mistaken for hop counts.
        R23_new_1[hop_counts == hops] = 0.2 ** int(hops - 1)

# NOTE(review): the matrix written to disk is R23_new (raw hop counts), not
# the decay-weighted R23_new_1 computed above, which is otherwise unused.
# Confirm which of the two the consumer of this file expects.
np.save('R23_enhanced_matrix.npy', R23_new)