-
Notifications
You must be signed in to change notification settings - Fork 0
/
op_search.py
159 lines (117 loc) · 5.44 KB
/
op_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# -*- coding: utf-8 -*-
'''
Custom theano class to query the search engine.
'''
import numpy as np
import theano
from theano import gof
from theano import tensor
import parameters as prm
import utils
import average_precision
import random
class Search(theano.Op):
    '''
    Theano Op that queries the search engine with the currently selected
    query words and scores the returned candidates against the ground truth.

    The Op is stateful: it reads ``options['current_queries']`` and
    ``options['engine']`` and writes ``options['reformulated_queries']``;
    ``perform`` also extends ``options['current_queries']`` in place.
    '''

    # NOTE(review): with empty __props__ every Search instance compares equal
    # regardless of ``options`` -- confirm that Theano merging equal nodes is
    # acceptable for this stateful Op.
    __props__ = ()

    def __init__(self, options):
        '''
        Store the shared ``options`` dict and reset the record of
        reformulated queries (a dict keyed by iteration number).
        '''
        self.options = options
        self.options['reformulated_queries'] = {}

    def make_node(self, x1, x2, x3, x4):
        '''
        Build the Apply node.

        Inputs (in order): word-selection mask, ground-truth doc ids,
        iteration number, training flag.
        Outputs (in order): fmatrix of metrics, itensor3 of feedback word
        indices, imatrix of candidate ids, fmatrix of ground-truth flags.
        '''
        assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
        x1 = tensor.as_tensor_variable(x1)
        x2 = tensor.as_tensor_variable(x2)
        x3 = tensor.as_tensor_variable(x3)
        x4 = tensor.as_tensor_variable(x4)
        out = [
            tensor.fmatrix().type(),
            tensor.itensor3().type(),
            tensor.imatrix().type(),
            tensor.fmatrix().type(),
        ]
        return theano.Apply(self, [x1, x2, x3, x4], out)

    def perform(self, node, inputs, output_storage):
        '''
        Run one retrieval step for every query in the batch.

        inputs[0] (q_m): mask indexed as [query, word]; a word is kept in
            the query when its entry equals 1.
        inputs[1] (D_truth): per-query ground-truth doc ids; entries that
            are not greater than -1 are ignored.
        inputs[2]: iteration number (cast to int).
        inputs[3]: training flag (cast to int).

        output_storage[0]: float32 metrics, one row per query, columns
            given by ``prm.metrics_map``.
        output_storage[1]: int32 word indices of the feedback docs,
            initialised to -2.
        output_storage[2]: int32 ids of the returned candidates.
        output_storage[3]: float32 flags, 1 where the candidate at that
            rank is a ground-truth doc.
        '''
        q_m = inputs[0]
        D_truth = inputs[1]
        n_iter = int(inputs[2])
        is_train = int(inputs[3])

        n_queries = len(q_m)
        if is_train:
            max_feedback_docs = prm.max_feedback_docs_train
        else:
            max_feedback_docs = prm.max_feedback_docs

        # outputs
        metrics = np.zeros((n_queries, len(prm.metrics_map)), np.float32)
        D_i = -2 * np.ones((n_queries, max_feedback_docs, prm.max_words_input), np.int32)
        D_gt_m = np.zeros((n_queries, prm.max_candidates), np.float32)
        D_id = np.zeros((n_queries, prm.max_candidates), np.int32)

        # no need to retrieve extra terms in the last iteration
        extra_terms = n_iter != prm.n_iterations - 1
        # candidate words are only collected before the last iteration.
        add_feedback = n_iter < prm.n_iterations - 1

        # allow the search engine to cache queries only in the first iteration.
        save_cache = prm.use_cache if n_iter == 0 else False

        max_cand = prm.max_candidates

        # Build one query string per sample from the words whose mask is 1.
        qs = []
        for i, q_lst in enumerate(self.options['current_queries']):
            q = ' '.join(str(word) for j, word in enumerate(q_lst) if q_m[i, j] == 1)
            if not q:
                # placeholder so an empty query string is never sent.
                q = 'dummy'
            qs.append(q)

        # only used to print the reformulated queries.
        self.options['reformulated_queries'][n_iter] = qs

        # NOTE(review): the original comment says "always return one more
        # candidate because one of them might be the input doc", but
        # max_cand is passed unchanged -- confirm which is intended.
        candss = self.options['engine'].get_candidates(
            qs, max_cand, prm.max_feedback_docs, save_cache, extra_terms)

        for i, cands in enumerate(candss):
            # ground-truth ids of this query (dict used as a set).
            D_truth_dic = {d_truth: 0 for d_truth in D_truth[i] if d_truth > -1}

            # Materialise the keys once: a dict view (Python 3) cannot be
            # assigned into a numpy slice, a list works on both versions.
            cand_ids = list(cands.keys())
            D_id[i, :len(cand_ids)] = cand_ids

            selected_docs = np.arange(prm.max_feedback_docs)
            if is_train:
                # sample the feedback docs; drawn unconditionally so the
                # random stream does not depend on the iteration number.
                selected_docs = np.random.choice(
                    selected_docs, size=prm.max_feedback_docs_train, replace=False)
            # set membership instead of a linear scan per candidate.
            selected = set(selected_docs.tolist())

            m = 0  # next free feedback slot in D_i[i]
            for k, (cand_id, (words_idx, words)) in enumerate(cands.items()):
                # only add docs selected by sampling (if training).
                if add_feedback and k in selected:
                    words = words[:prm.max_terms_per_doc]
                    words_idx = words_idx[:prm.max_terms_per_doc]

                    D_i[i, m, :len(words_idx)] = words_idx

                    # append empty strings, so the list size becomes <dim>.
                    words = words + max(0, prm.max_words_input - len(words)) * ['']

                    # append new words to the list of current queries.
                    self.options['current_queries'][i] += words

                    m += 1

                if cand_id in D_truth_dic:
                    D_gt_m[i, k] = 1.

            cands_set = set(cand_ids)

            title = qs[i].lower()
            title_id_map = self.options['engine'].title_id_map
            if title in title_id_map:
                # Remove input doc from returned docs.
                # discard() does not raise if the element is not there.
                cands_set.discard(title_id_map[title])

            intersec = len(set(D_truth_dic) & cands_set)
            recall = intersec / max(1., float(len(D_truth_dic)))
            precision = intersec / max(1., float(prm.max_candidates))
            avg_precision = average_precision.compute(list(D_truth_dic), cand_ids)

            metrics[i, prm.metrics_map['RECALL']] = recall
            metrics[i, prm.metrics_map['PRECISION']] = precision
            # the floor on the denominator avoids dividing by zero.
            metrics[i, prm.metrics_map['F1']] = 2 * recall * precision / max(0.01, recall + precision)
            metrics[i, prm.metrics_map['MAP']] = avg_precision
            # small epsilon keeps the log finite when avg_precision is 0.
            metrics[i, prm.metrics_map['LOG-GMAP']] = np.log(avg_precision + 1e-5)

        output_storage[0][0] = metrics
        output_storage[1][0] = D_i
        output_storage[2][0] = D_id
        output_storage[3][0] = D_gt_m

    def grad(self, inputs, output_grads):
        '''
        Return an all-zero gradient (dtype floatX) for every input.
        '''
        return [tensor.zeros_like(ii, dtype=theano.config.floatX) for ii in inputs]