query.py
#!/usr/bin/env python2.7
import sys
import os
import re
import pickle
#based on the example code at
#http://graus.nu/blog/pylucene-4-0-in-60-seconds-tutorial/
import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search import BooleanQuery
from org.apache.lucene.search import TermQuery
from org.apache.lucene.search import NumericRangeQuery
from org.apache.lucene.index import IndexReader
from org.apache.lucene.index import Term
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.queryparser.classic import MultiFieldQueryParser
from org.apache.lucene.search import BooleanClause
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
import parse_abstracts
from IdentityExtractor import IdentifierExtracter
PRINT_ADDITIONAL_IDS=False
RETRIEVE_FULL=False
#set of searchable PubMed fields, taken from the abstract parser's header definition
pubmed_field_set = set(x[0] for x in parse_abstracts.HEADER)
#display names keyed by searcher index (0=PubMed, 1=SRA) and by str() of a
#BooleanClause.Occur value ('+' is what Occur.MUST prints as)
sym2name={0:'PubMed',1:'SRA','+':'BooleanClause.Occur.MUST'}
sra_field_set = set(["all","raw","SAMPLE_alias","SAMPLE_DESCRIPTION",
    "SUBMISSION_TITLE","EXPERIMENT_INSTRUMENT_MODEL","STUDY_alias",
    "STUDY_STUDY_ABSTRACT","SAMPLE_accession","EXPERIMENT_LIBRARY_STRATEGY",
    "EXPERIMENT_alias","EXPERIMENT_TITLE","EXPERIMENT_LIBRARY_NAME",
    "EXPERIMENT_DESIGN_DESCRIPTION","EXPERIMENT_LIBRARY_SELECTION",
    "SAMPLE_TITLE","STUDY_accession","EXPERIMENT_LIBRARY_SOURCE",
    "EXPERIMENT_LIBRARY_CONSTRUCTION_PROTOCOL","STUDY_STUDY_TITLE",
    "EXPERIMENT_accession"])
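#'raw' appears to be a catch-all field present in both indexes: it holds the
#full flattened record (retrieved when full=1) and serves as the fallback
#search field when a query names a field the other index doesn't have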
lucene.initVM()
analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
pubmed = IndexReader.open(SimpleFSDirectory(File("./lucene_pubmed_index")))
sra = IndexReader.open(SimpleFSDirectory(File("./lucene_sra_index")))
psearcher = IndexSearcher(pubmed)
ssearcher = IndexSearcher(sra)
NUM_TO_RETRIEVE = 100
#gene annotation file used by IdentifierExtracter to recognize gene symbols
hugo_genenamesF = 'refFlat.hg38.txt.sorted'
#adapted from previous projects
def search_lucene(fields_,terms_,requirements_,searcher,index=0):
    #pick out the per-index (0=PubMed, 1=SRA) field/term/requirement from each pair
    terms = []
    fields = []
    requirements = []
    for (i,x) in enumerate(terms_):
        terms.append(x[index])
        fields.append(fields_[i][index])
        requirements.append(requirements_[i][index])
    sys.stdout.write("Running query %s: (\"%s\") in fields (%s) with requirements (%s)\n" % (sym2name[index],"\",\"".join(terms),",".join(fields),",".join([sym2name[str(x)] for x in requirements])))
    query = MultiFieldQueryParser.parse(Version.LUCENE_4_10_1,terms,fields,requirements,analyzer2)
    return (terms,fields,requirements,searcher.search(query, NUM_TO_RETRIEVE))
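#e.g. for the SRA index (index=1) this prints something like (hypothetical terms):
#  Running query SRA: ("lung cancer") in fields (EXPERIMENT_TITLE) with requirements (BooleanClause.Occur.MUST)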
#sample queries:
#1) human lung cancer => field=raw,value = human lung cancer
#2) human AND lung AND cancer => field=raw,value = human AND lung AND cancer
#3) EXPERIMENT_TITLE::lung cancer;;SAMPLE_SAMPLE_ABSTRACT::adenocarcinoma => fields=[EXPERIMENT_TITLE,SAMPLE_SAMPLE_ABSTRACT],values=[lung cancer,adenocarcinoma]
#3b) all::tag1:value1;;all::tag2:value2 => fields=[all,all],values=[tag1:value1,tag2:value2]
#returns fields and values to search on for both pubmed (index 0) and sra (index 1)
field_patt = re.compile(r'::')
FIELD_DELIM=';;'
FIELD_VAL_DELIM='::'
def parse_query(ie,query):
    terms = query.split(FIELD_DELIM)
    fields = []
    values = []
    requirements = []
    #no field prefix at all: search the catch-all 'raw' field in both indexes
    if len(terms) == 1 and field_patt.search(terms[0]) is None:
        fields.append(['raw','raw'])
        values.append([terms[0],terms[0]])
        requirements.append([BooleanClause.Occur.MUST,BooleanClause.Occur.MUST])
    else:
        for term in terms:
            (field,val) = term.split(FIELD_VAL_DELIM)
            #going to be the same for both searchers usually
            vals = [val,val]
            requirements.append([BooleanClause.Occur.MUST,BooleanClause.Occur.MUST])
            if field in pubmed_field_set:
                fields.append([field,'raw'])
            elif field in sra_field_set:
                fields.append(['raw',field])
                if field == 'all':
                    #this is special to the SRA case, so we'll split by the tag:value delimiter ":"
                    #and then add them into the pubmed one with a space (OR) in between
                    (val1,val2) = val.split(':')
                    vals = ["%s %s" % (val1,val2),val]
            #if we don't know what the field is
            else:
                fields.append(['raw','raw'])
            values.append(vals)
    #pool all query values and extract any gene names/accessions/PMIDs they contain
    raw_text = ' '.join(set([z for k in values for z in k]))
    (genes,accessions,pmids) = ie.extract_identifiers("NA",0,raw_text)
    return (fields,values,requirements,genes,accessions,pmids)
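#A worked example (hypothetical values; the field names are real entries in
#sra_field_set, and we assume neither is also a PubMed header field): the query
#  EXPERIMENT_TITLE::lung cancer;;all::tissue:lung
#parses to
#  fields = [['raw','EXPERIMENT_TITLE'], ['raw','all']]
#  values = [['lung cancer','lung cancer'], ['tissue lung','tissue:lung']]
#i.e. the PubMed side falls back to its catch-all 'raw' field, and the SRA
#'all' tag:value pair is re-joined with a space for the PubMed side.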
def get_ids_by_genenames(genes2ids,genes):
    pm_ids = set()
    sra_ids = set()
    for gene in genes:
        try:
            (pids,sids) = genes2ids[gene]
            pm_ids.update(pids)
            sra_ids.update(sids)
        except KeyError:
            continue
    #map SRX ids back to their full id for matching
    #pm_ids = set(map(str,pm_ids))
    #sra_ids = set(map(lambda z: "SRX%s" % z,sra_ids))
    return (pm_ids,sra_ids)
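#genes2ids (built by load_gene2id_map below) maps a gene symbol to a pair of
#id sets, [pubmed ids, sra ids]; e.g. (hypothetical ids)
#  genes2ids['TP53'] -> [set(['12345678']), set(['SRX000001'])]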
def process_query(ie,genes2ids,id2additional_ids,query):
    #get the query parsed into its fields, their values, and the boolean requirements (MUST or SHOULD)
    #also extract any gene names and/or SRA accessions and/or PMIDs, though we're only interested in gene names at this time
    (fields,values,requirements,genes,accessions,pmids) = parse_query(ie,query)
    (pterms,pfields,preqs,presults) = search_lucene(fields,values,requirements,psearcher,index=0)
    (sterms,sfields,sreqs,sresults) = search_lucene(fields,values,requirements,ssearcher,index=1)
    sys.stdout.write("results: %d in pubmed; %d in sra\n" % (presults.totalHits,sresults.totalHits))
    #find the ids of the pubmed and/or SRA entries which contain references to one or more
    #of the gene names in the query
    (pubmed_ids,sra_ids) = get_ids_by_genenames(genes2ids,genes)
    sra_ids.update(accessions)
    parse_results(['PMID','EXPERIMENT_accession'],[presults,sresults],[psearcher,ssearcher],id2additional_ids,id_filters=[pubmed_ids,sra_ids])
#score boost applied when a hit's primary id also matched a query gene/accession
ID_BOOST=1000
def score_results_for_id(result,searcher,primary_id_field,id_filter,final_results,idx):
    for scoreDoc in result.scoreDocs:
        pid = searcher.doc(scoreDoc.doc).get(primary_id_field)
        if pid in id_filter:
            scoreDoc.score += ID_BOOST
        #add both the scoreDoc AND the index of its source (pubmed or sra)
        final_results.append([idx,scoreDoc])
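#parse_results (below) emits tab-separated rows: primary id field name, id,
#(truncated) score, any additional ids, and, when full=1, the stored 'raw'
#document; rows from both indexes are merged and sorted by score, so
#ID_BOOST-ed hits float to the top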
def parse_results(primary_id_fields,results,searchers,id2additional_ids,id_filters=[set(),set()]):
    sys.stdout.write("filter set: %d %d\n" % (len(id_filters[0]),len(id_filters[1])))
    #boost and pool hits from both searchers, then print one score-sorted list
    merged_results = []
    for (idx, result) in enumerate(results):
        score_results_for_id(result,searchers[idx],primary_id_fields[idx],id_filters[idx],merged_results,idx)
    for sdoc in sorted(merged_results,key=lambda x: x[1].score,reverse=True):
        idx = sdoc[0]
        scoreDoc = sdoc[1]
        pfield = primary_id_fields[idx]
        pid = searchers[idx].doc(scoreDoc.doc).get(pfield)
        full_doc = None
        if RETRIEVE_FULL:
            full_doc = searchers[idx].doc(scoreDoc.doc).get('raw')
        have_gene = False
        #if scoreDoc.score >= 1000:
        #    have_gene = True
        additional_ids = ""
        if pid in id2additional_ids:
            additional_ids = id2additional_ids[pid]
        if RETRIEVE_FULL:
            #encode after formatting so any non-ASCII in the raw doc is handled
            sys.stdout.write(("%s\t%s\t%d\t%s\t%s\n" % (pfield,pid,scoreDoc.score,additional_ids,full_doc)).encode('utf-8'))
        else:
            sys.stdout.write("%s\t%s\t%d\t%s\n" % (pfield,pid,scoreDoc.score,additional_ids))
        #for f in fields:
        #    f_ = docu.get(f)
        #    sys.stderr.write("%s\t%s\n" % (f,f_))
#column layout of the two gene-map files, indexed as [pubmed,sra]
PUBMED_IDX=0
SRA_IDX=1
ID_COL=[1,1]
GENE_COL=[2,3]
ADD_IDS_START_COL=[2,2]
def load_gene2id_map(files):
    genes2ids = {}
    id2additional_ids = {}
    #for faster loading (serialized binary) look for a pickled file
    #UPDATE: not faster, pickling/loading takes ~4-5 seconds longer
    for (idx,file_) in enumerate(files):
        with open(file_,"r") as fin:
            for line in fin:
                fields = line.rstrip('\n').split("\t")
                #print("%d %s" % (idx,line))
                id_ = fields[ID_COL[idx]]
                if PRINT_ADDITIONAL_IDS and len(fields[ADD_IDS_START_COL[idx]]) > 0:
                    id2additional_ids[str(id_)] = "\t".join(fields[ADD_IDS_START_COL[idx]:])
                genes = fields[GENE_COL[idx]].split(";")
                #if idx == SRA_IDX and len(id_) > 3:
                #    #avoid the redundancy of storing the full "SRX" prefix
                #    id_ = id_[3:]
                for gene in genes:
                    if gene not in genes2ids:
                        #[pubmed ids, sra ids]
                        genes2ids[gene] = [set(),set()]
                    #genes2ids[gene][idx].add(int(id_))
                    genes2ids[gene][idx].add(id_)
    return (genes2ids,id2additional_ids)
PARAM_DELIM='&'
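#Usage sketch (parameter names are the ones handled in main() below):
#  python2.7 query.py "query=EXPERIMENT_TITLE::lung cancer&limit=50&full=1&add_ids=1"
#query is required; limit overrides NUM_TO_RETRIEVE; full=1 also prints the
#stored 'raw' document; add_ids=1 prints any additional ids mapped to a hit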
def main():
    global PRINT_ADDITIONAL_IDS
    global RETRIEVE_FULL
    global NUM_TO_RETRIEVE
    if len(sys.argv) < 2:
        sys.stderr.write("need query\n")
        sys.exit(-1)
    query = sys.argv[1]
    parameters = {}
    #split on '=' only once so parameter values may themselves contain '='
    for param in query.split(PARAM_DELIM):
        (x1,x2) = param.split('=',1)
        parameters[x1] = x2
    if 'query' not in parameters or len(parameters['query']) < 1:
        sys.stderr.write("MUST SUBMIT A QUERY, e.g. query=TP53\n")
        sys.exit(-1)
    if 'add_ids' in parameters and parameters['add_ids'] == "1":
        PRINT_ADDITIONAL_IDS = True
    if 'full' in parameters and parameters['full'] == "1":
        RETRIEVE_FULL = True
    if 'limit' in parameters:
        NUM_TO_RETRIEVE = int(parameters['limit'])
    ie = IdentifierExtracter(hugo_genenamesF,gene_filter=re.compile(r'[\-\d]'),filter_stopwords=True)
    (genes2ids,id2additional_ids) = load_gene2id_map(["pubmed_map.tsv","sra_map.tsv"])
    process_query(ie,genes2ids,id2additional_ids,parameters['query'])

if __name__ == '__main__':
    main()