-
Notifications
You must be signed in to change notification settings - Fork 1
/
distancebased.jl
238 lines (190 loc) · 9.17 KB
/
distancebased.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# function to compute the distance between a readout and column entries
"""
    QuadraticDistance(; λ=1, standardize=true)

Return an anonymous function `(x, col; prior) -> λ * (x .- col).^2 / σ` that scores every
entry of `col` against the readout `x`.

When `standardize` is `true`, `σ` is the variance of `col` weighted by `prior` (computed
with `corrected = false`); otherwise `σ` equals one. `prior` defaults to uniform weights,
`ones(length(col))`.
"""
function QuadraticDistance(; λ = 1, standardize = true)
    return function (x, col; prior = ones(length(col)))
        # Scale by the prior-weighted variance only when standardization is requested.
        σ = if standardize
            var(col, Weights(prior); corrected = false)
        else
            1
        end
        squared_gap = (x .- col) .^ 2
        return λ * squared_gap / σ
    end
end
"""
    DiscreteDistance(; λ=1)

Return an anonymous function `(x, col; kwargs...) -> λ * (x .!= col)`.

The distance is zero for every entry of `col` that equals the readout `x` and `λ` for
every other entry, so rows that agree with the readout are the closest ones. Keyword
arguments (such as `prior`) are accepted and ignored. The entries of the result are
floating-point numbers regardless of the type of `λ`.
"""
function DiscreteDistance(; λ = 1)
    # Promote once so that both branches below yield the same element type.
    penalty = float(λ)
    return function (x, col; _...)
        # A mismatch costs `λ`; a match costs nothing.
        return map(y -> y == x ? zero(penalty) : penalty, col)
    end
end
# default similarity functional
"""
    Exponential(; λ=1/2)

Return an anonymous function `x -> exp(-λ * x)`, which maps a distance `x` to a
similarity. The decay rate `λ` defaults to `1/2`.
"""
function Exponential(; λ = 1 / 2)
    return distance -> exp(-λ * distance)
end
# default uncertainty functionals
# Weighted variance of a single vector of observations.
function compute_variance(data::AbstractVector; weights)
    return var(data, Weights(weights))
end

# Any other input is converted to a matrix; the weighted variance is taken along the
# first dimension and the resulting values are summed.
function compute_variance(data; weights)
    columnwise = var(Matrix(data), Weights(weights), 1)
    return sum(columnwise)
end
"""
    Variance(data; prior)

Return a function of `weights` that computes the variance of `data` under `weights`,
expressed as a fraction of the variance of `data` under `prior`.
"""
function Variance(data; prior)
    # Reference value, computed once when the functional is constructed.
    baseline = compute_variance(data; weights = prior)
    return function (weights)
        return compute_variance(data; weights = weights) / baseline
    end
end
# Entropy of the label distribution obtained by summing `weights` per distinct label
# and normalising the per-label totals so that they sum to one.
function compute_entropy(labels; weights)
    label_mass = collect(values(countmap(labels, Weights(weights))))
    probabilities = label_mass ./ sum(label_mass)
    return entropy(probabilities)
end
"""
    Entropy(labels; prior)

Return a function of `weights` that computes the information entropy of `labels` under
`weights`, expressed as a fraction of the entropy of `labels` under `prior`.

`labels` must have element scitype `Multiclass`; otherwise the assertion fails.
"""
function Entropy(labels; prior)
    @assert elscitype(labels) <: Multiclass "labels must be of `Multiclass` scitype, but `elscitype(labels)=$(elscitype(labels))`"
    # Reference value, computed once when the functional is constructed.
    baseline = compute_entropy(labels; weights = prior)
    return function (weights)
        return compute_entropy(labels; weights = weights) / baseline
    end
end
# Build a closure over `data` that maps an `Evidence` to one number per row of `data`:
# the sum, over the evidence's columns, of `distances[colname]` evaluated between the
# readout `evidence[colname]` and the column `data[!, colname]`. `prior` is forwarded
# to every distance functional as a keyword argument.
function sum_of_distances(data::DataFrame, targets::Vector, distances; prior::Weights)
    return function (evidence::Evidence)
        # With no readouts, every row is at distance zero.
        isempty(evidence) && return zeros(nrow(data))

        # One column per evidence entry; entries for target columns stay at zero.
        per_column = zeros((nrow(data), length(evidence)))
        for (j, feature) in enumerate(keys(evidence))
            feature ∈ targets && continue
            per_column[:, j] .=
                distances[feature](evidence[feature], data[!, feature]; prior)
        end

        return vec(sum(per_column; init = 0.0, dims = 2))
    end
end
"""
    MahalanobisDistance(; diagonal=0)

Return a function that computes the squared [Mahalanobis distance](https://en.wikipedia.org/wiki/Mahalanobis_distance) between each row of `data` and the evidence.

For a singular covariance matrix, consider adding entries to the matrix's diagonal via the `diagonal` keyword.

# Arguments

  - `diagonal`: A scalar or vector to be added to the diagonal entries of the covariance matrix.

# Returns

A higher-order function of `(data, targets, prior)`. When called, that function computes
the `prior`-weighted covariance of the non-target columns of `data`, inverts it, and
returns an internal function `compute_distances` that takes an `Evidence` and returns one
value `z' * Λ * z` per row of `data`. Here `Λ` is the inverse covariance and `z` is the
difference between the evidence and the row, with entries set to zero for columns that
are absent from the evidence.
"""
function MahalanobisDistance(; diagonal = 0)
    return function (data, targets, prior)
        non_targets = setdiff(names(data), targets)
        if !all(t -> t <: Real, eltype.(eachcol(data[!, non_targets])))
            @warn "Not all column types in the predictor matrix are numeric ($(eltype.(eachcol(data)))). This may cause errors."
        end

        Σ = cov(Matrix(data[!, non_targets]), Weights(prior))

        # Expand a scalar into one entry per predictor. The result is bound to a fresh
        # local: assigning to `diagonal` here would overwrite the keyword captured from
        # the enclosing call, so a later call on data with a different number of
        # predictors would reuse a vector of the wrong length.
        diagonal_entries = diagonal isa Number ? fill(diagonal, size(Σ, 1)) : diagonal
        for i in axes(Σ, 1)
            Σ[i, i] += diagonal_entries[i]
        end

        # get the inverse of Σ
        Λ = inv(Σ)

        compute_distances = function (evidence::Evidence)
            if isempty(evidence)
                return zeros(nrow(data))
            else
                # Columns without a readout are zero in both vectors, so they do not
                # contribute to the difference `z`.
                vec_evidence = map(colname -> get(evidence, colname, 0), non_targets)
                distances = map(eachrow(data)) do row
                    vec_row = map(
                        colname -> haskey(evidence, colname) ? row[colname] : 0,
                        non_targets,
                    )
                    z = vec_evidence - vec_row
                    dot(z, Λ * z)
                end
                return distances
            end
        end

        return compute_distances
    end
end
"""
    DistanceBased(data; target, uncertainty=Variance, similarity=Exponential(), distance=Dict(), prior=ones(nrow(data)))

Compute distances between experimental evidence and historical readouts, and apply a 'similarity' functional to obtain probability mass for each row.

Consider using [`QuadraticDistance`](@ref), [`DiscreteDistance`](@ref), and [`MahalanobisDistance`](@ref).

# Return value

A named tuple with the following fields:

  - `sampler`: a function of `(evidence, features, rng)`, in which `evidence` denotes the current experimental evidence, `features` represent the set of features we want to sample from, and `rng` is a random number generator; it returns a dictionary mapping the features to outcomes.
  - `uncertainty`: a function of `evidence`; it returns the measure of variance or uncertainty about the target variable, conditioned on the experimental evidence acquired so far.
  - `weights`: a function of `evidence`; it returns probabilities (posterior) across the rows in `data`.

# Arguments

  - `data`: a dataframe with historical data.

# Keyword Arguments

  - `target`: target column name or a vector of target columns names.
  - `uncertainty`: a function that takes the target column(s) of `data` along with the keyword `prior`, and returns an anonymous function taking a single argument (a probability vector over observations) and returning an uncertainty measure over targets. Defaults to [`Variance`](@ref).
  - `similarity`: a function that, for each row, takes the distance between the row and the evidence, and returns a non-negative probability mass for the row.
  - `distance`: either a dictionary of pairs `colname => distance functional`, where a distance functional must implement the signature `(readout, col; prior)`, or a callable accepting `(data, targets, prior)` and returning a function of the evidence (such as `MahalanobisDistance()`). Columns missing from the dictionary default to [`QuadraticDistance`](@ref) and [`DiscreteDistance`](@ref) for `Continuous` and `Multiclass` scitypes, respectively; any other scitype raises an error.
  - `prior`: prior across rows, uniform by default.

# Example

```julia
(; sampler, uncertainty, weights) = DistanceBased(
    data;
    target = "HeartDisease",
    uncertainty = Entropy,
    similarity = Exponential(; λ = 5),
);
```
"""
function DistanceBased(
    data::DataFrame;
    target,
    uncertainty = Variance,
    similarity = Exponential(),
    distance = Dict(),
    prior = ones(nrow(data)),
)
    # Bound to a new name (rather than reassigning `prior`) so that the closures below
    # capture a variable that is assigned exactly once.
    prior_weights = Weights(prior)
    targets = target isa AbstractVector ? target : [target]

    compute_distances = if distance isa Dict
        distances = Dict(
            string(colname) => _column_distance(data, colname, distance) for
            colname in names(data[!, Not(target)])
        )
        sum_of_distances(data, targets, distances; prior = prior_weights)
    elseif applicable(distance, data, targets, prior_weights)
        distance(data, targets, prior_weights)
    else
        error("distance $distance does not accept `(data, targets, prior)`")
    end

    # convert distances into probabilistic weights
    compute_weights = function (evidence::Evidence)
        similarities = prior_weights .* map(similarity, compute_distances(evidence))
        # hard match on target columns
        for colname in collect(keys(evidence)) ∩ targets
            similarities .*= data[!, colname] .== evidence[colname]
        end
        return Weights(similarities ./ sum(similarities))
    end

    # Draw one historical row according to the posterior weights and read off `columns`.
    sampler = function (evidence::Evidence, columns, rng = default_rng())
        observed = data[sample(rng, compute_weights(evidence)), :]
        return Dict(c => observed[c] for c in columns)
    end

    f_uncertainty = uncertainty(data[!, target]; prior = prior_weights)
    compute_uncertainty = function (evidence::Evidence)
        return f_uncertainty(compute_weights(evidence))
    end

    return (; sampler, uncertainty = compute_uncertainty, weights = compute_weights)
end

# Select the distance functional for `colname`: an entry supplied in `distance` takes
# precedence; otherwise a default is chosen from the column's element scitype.
function _column_distance(data, colname, distance)
    haskey(distance, colname) && return distance[colname]
    column_scitype = elscitype(data[!, colname])
    column_scitype <: Continuous && return QuadraticDistance()
    column_scitype <: Multiclass && return DiscreteDistance()
    return error(
        "column $colname has scitype $(column_scitype), which is not supported by default.\nPlease provide a custom readout-column distances functional of the signature `(x, col; prior)`.",
    )
end