Skip to content

Commit

Permalink
feat: add z-normalisation option
Browse files Browse the repository at this point in the history
  • Loading branch information
fantes authored and mergify[bot] committed Jun 28, 2021
1 parent e26ed77 commit 82d7cc5
Show file tree
Hide file tree
Showing 10 changed files with 784 additions and 239 deletions.
6 changes: 4 additions & 2 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,8 @@ label_offset | int | yes | 0 | Negative offset (e.g. -1)
separator | string | yes | ',' | Column separator character
quote | string | yes | '"' | Quote character in CSV file
id | string | yes | empty | Column name of the training examples identifier field, if any
scale | bool | yes | false | Whether to scale all values into [0,1]
scale | bool | yes | false | Whether to scale all values internally into a uniform range (method selected by scale_type)
scale_type | string | yes | "minmax" | Scaling type, either "minmax" or "znorm" (subtracts the mean and divides by the standard deviation)
categoricals | array | yes | empty | List of categorical variables
db | bool | yes | false | whether to gather data into a database, useful for very large datasets, allows treatment in constant-size memory

Expand All @@ -219,7 +220,8 @@ ignore | array of string | yes | empty | Array of column names to igno
separator | string | yes | ',' | Column separator character
quote | string | yes | '"' | Quote character in CSV file
id | string | yes | empty | Column name of the training examples identifier field, if any
scale | bool | yes | false | Whether to scale all values into [0,1]
scale | bool | yes | false | Whether to scale all values
scale_type | string | yes | "minmax" | Scaling type, either "minmax" (scales into [-0.5,0.5]) or "znorm" (subtracts the mean and divides by the standard deviation)
db | bool | yes | false | whether to gather data into a database, useful for very large datasets, allows treatment in constant-size memory


Expand Down
106 changes: 82 additions & 24 deletions src/backends/caffe/caffelib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2659,15 +2659,31 @@ namespace dd
if (cic->_scale && !cic->_dont_scale_labels)
{
int label_index = k % _ntargets;
double max
= cic->_max_vals
[cic->_label_pos[label_index]];
double min
= cic->_min_vals
[cic->_label_pos[label_index]];
if (cic->_scale_between_minus_half_and_half)
res += 0.5;
res = res * (max - min) + min;
if (cic->_scale_type == MINMAX)
{
double max
= cic->_max_vals
[cic->_label_pos[label_index]];
double min
= cic->_min_vals
[cic->_label_pos[label_index]];
if (cic->_scale_between_minus_half_and_half)
res += 0.5;
res = res * (max - min) + min;
}
else if (cic->_scale_type == ZNORM)
{
double mean
= cic->_mean_vals
[cic->_label_pos[label_index]];
double variance
= cic->_variance_vals
[cic->_label_pos[label_index]];
res = res * (sqrt(variance)) + mean;
}
else
throw MLLibInternalException(
"unknown scale type");
}
target_unscaled.push_back(res);
}
Expand All @@ -2684,13 +2700,28 @@ namespace dd
predictions.push_back(res);
if (cic->_scale && !cic->_dont_scale_labels)
{
double max
= cic->_max_vals[cic->_label_pos[k]];
double min
= cic->_min_vals[cic->_label_pos[k]];
if (cic->_scale_between_minus_half_and_half)
res += 0.5;
res = res * (max - min) + min;
if (cic->_scale_type == MINMAX)
{
double max = cic->_max_vals
[cic->_label_pos[k]];
double min = cic->_min_vals
[cic->_label_pos[k]];
if (cic->_scale_between_minus_half_and_half)
res += 0.5;
res = res * (max - min) + min;
}
else if (cic->_scale_type == ZNORM)
{
double mean = cic->_mean_vals
[cic->_label_pos[k]];
double variance
= cic->_variance_vals
[cic->_label_pos[k]];
res = res * (sqrt(variance)) + mean;
}
else
throw MLLibInternalException(
"unknown scale type");
}
pred_unscaled.push_back(res);
}
Expand Down Expand Up @@ -3510,7 +3541,19 @@ namespace dd
for (int k = 0; k < nout; ++k)
{
std::vector<int> loc = { t, j, k };
if (ic->_min_vals.empty() || ic->_max_vals.empty())
if (ic->_scale_type == MINMAX
&& (ic->_min_vals.empty()
|| ic->_max_vals.empty()))
{
this->_logger->info(
"not unscaling output because no bounds "
"data found");
predictions.push_back(
results[slot]->data_at(loc));
}
else if (ic->_scale_type == ZNORM
&& (ic->_mean_vals.empty()
|| ic->_variance_vals.empty()))
{
this->_logger->info(
"not unscaling output because no bounds "
Expand All @@ -3523,13 +3566,28 @@ namespace dd
double res = results[slot]->data_at(loc);
if (ic->_scale && !ic->_dont_scale_labels)
{
double max
= ic->_max_vals[ic->_label_pos[k]];
double min
= ic->_min_vals[ic->_label_pos[k]];
if (ic->_scale_between_minus_half_and_half)
res += 0.5;
res = res * (max - min) + min;
if (ic->_scale_type == MINMAX)
{
double max
= ic->_max_vals[ic->_label_pos[k]];
double min
= ic->_min_vals[ic->_label_pos[k]];
if (ic->_scale_between_minus_half_and_half)
res += 0.5;
res = res * (max - min) + min;
}
else if (ic->_scale_type == ZNORM)
{
double mean = ic->_mean_vals
[ic->_label_pos[k]];
double variance
= ic->_variance_vals
[ic->_label_pos[k]];
res = res * (sqrt(variance)) + mean;
}
else
throw MLLibInternalException(
"unknown scale type");
}
predictions.push_back(res);
}
Expand Down
24 changes: 19 additions & 5 deletions src/backends/ncnn/ncnninputconns.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,13 +254,27 @@ namespace dd
{
if (_dont_scale_labels)
return res;
if (_min_vals.empty() || _max_vals.empty())
if (_scale_type == MINMAX && (_min_vals.empty() || _max_vals.empty()))
return res;
double min = _min_vals[_label_pos[k]];
if (_scale_between_minus_half_and_half)
return (res + 0.5) * (_max_vals[_label_pos[k]] - min) + min;
if (_scale_type == ZNORM
&& (_mean_vals.empty() || _variance_vals.empty()))
return res;

if (_scale_type == MINMAX)
{
double min = _min_vals[_label_pos[k]];
if (_scale_between_minus_half_and_half)
return (res + 0.5) * (_max_vals[_label_pos[k]] - min) + min;
else
return res * (_max_vals[_label_pos[k]] - min) + min;
}
else if (_scale_type == ZNORM)
{
return res * (sqrt(_variance_vals[_label_pos[k]]))
+ _mean_vals[_label_pos[k]];
}
else
return res * (_max_vals[_label_pos[k]] - min) + min;
throw InputConnectorBadParamException("unknwon scale type");
}

public:
Expand Down
51 changes: 40 additions & 11 deletions src/backends/torch/torchlib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,15 @@ namespace dd
{
if (!inputc._scale)
return val;
if (inputc._min_vals.empty() || inputc._max_vals.empty())
if (inputc._scale_type == MINMAX
&& (inputc._min_vals.empty() || inputc._max_vals.empty()))
{
this->_logger->info("not unscaling output because no bounds "
"data found");
return val;
}
else if (inputc._scale_type == ZNORM
&& (inputc._mean_vals.empty() || inputc._variance_vals.empty()))
{
this->_logger->info("not unscaling output because no bounds "
"data found");
Expand All @@ -124,21 +132,42 @@ namespace dd
{
if (!inputc._dont_scale_labels)
{
double max, min;
if (inputc._label_pos.size() > 0) // labels case
if (inputc._scale_type == MINMAX)
{
max = inputc._max_vals[inputc._label_pos[k]];
min = inputc._min_vals[inputc._label_pos[k]];
double max, min;
if (inputc._label_pos.size() > 0) // labels case
{
max = inputc._max_vals[inputc._label_pos[k]];
min = inputc._min_vals[inputc._label_pos[k]];
}
else // forecast case
{
max = inputc._max_vals[k];
min = inputc._min_vals[k];
}
if (inputc._scale_between_minus_half_and_half)
val += 0.5;
val = val * (max - min) + min;
}
else // forecast case
else if (inputc._scale_type == ZNORM)
{
max = inputc._max_vals[k];
min = inputc._min_vals[k];
double mean, variance;
if (inputc._label_pos.size() > 0) // labels case
{
mean = inputc._mean_vals[inputc._label_pos[k]];
variance = inputc._variance_vals[inputc._label_pos[k]];
}
else // forecast case
{
mean = inputc._mean_vals[k];
variance = inputc._variance_vals[k];
}
val = val * (sqrt(variance)) + mean;
}
if (inputc._scale_between_minus_half_and_half)
val += 0.5;
val = val * (max - min) + min;
else
throw MLLibInternalException("unknwon scale type");
}

return val;
}
}
Expand Down
Loading

0 comments on commit 82d7cc5

Please sign in to comment.