Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Protect cbindWithNames against nulls, duplicates in names. #85

Merged
merged 2 commits into from
Sep 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 58 additions & 51 deletions js/cbind.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import * as utils from "./utils.js";
import { ScranMatrix } from "./ScranMatrix.js";
import * as gc from "./gc.js";
import * as subset from "./subset.js";

function harvest_matrices(x) {
let output = utils.createBigUint64WasmArray(x.length);
Expand Down Expand Up @@ -67,87 +68,93 @@ export function rbind(inputs) {
return output;
}


/**
* Combine matrices by column, after subsetting each matrix to the intersection of common features.
*
* @param {Array} inputs - Array of one or more {@linkplain ScranMatrix} objects.
* @param {Array} names - Array of length equal to `inputs`.
* Each entry should be an Array containing the row names of the corresponding entry of `inputs`.
* Names should correspond to the rows of that entry of `inputs`.
* Any `null` names are ignored.
* If names are duplicated within each array, only the first occurrence is considered in the intersection.
*
* @return {object} An object containing:
* - `matrix`, a {@linkplain ScranMatrix} containing the combined matrices.
* - `indices`, an Int32WasmArray of length equal to the number of rows in `matrix`.
* - `indices`, an Int32Array of length equal to the number of rows in `matrix`.
* This contains the index of the row in the first entry of `inputs` corresponding to each row of `matrix`,
* i.e., the gene at the `i`-th row of `matrix` is the same as the gene at the `indices[i]`-th row of `inputs[0]`.
* This is guaranteed to be sorted.
* - `names`, an array of names identifying the rows of `matrix`.
* This is constructed by indexing the first entry of `names` with `indices`.
*/
export function cbindWithNames(x, names) {
let mat_ptrs;
let renamed = [];
let name_ptrs;
let indices;
let output = {};

try {
// Building a common set of rownames.
if (names.length !== x.length) {
throw new Error("length of 'names' should be equal to length of 'x'");
}

let common = {};
let universe = [];
for (var i = 0; i < names.length; i++) {
if (x[i].numberOfRows() !== names[i].length) {
throw new Error("length of each 'names' must equal number of rows of its corresponding 'x'");
}
names[i].forEach(x => {
if (!(x in common)) {
common[x] = universe.length;
universe.push(x);
// Find the intersection of names, following the order of the first entry.
// We do so to try to improve the chance of getting an ordered subset for efficient extraction.
let ordered_intersection = [];
let remapping = new Map;

if (names.length > 0) {
let intersection = new Set;
for (var n = 0; n < names.length; ++n) {
let current = new Set();
for (const name of names[n]) {
if (name !== null) {
if (n == 0 || intersection.has(name)) {
current.add(name);
}
}
});
}
intersection = current;
}

name_ptrs = utils.createBigUint64WasmArray(x.length);
{
let names_arr = name_ptrs.array();
for (var i = 0; i < names.length; i++) {
let current = names[i];
let replacement = utils.createInt32WasmArray(current.length);
let replacement_arr = replacement.array();
current.forEach((x, i) => {
replacement_arr[i] = common[x];
});
renamed.push(replacement);
names_arr[i] = BigInt(replacement.offset);
for (const name of names[0]) {
if (name !== null && intersection.has(name)) {
let candidate = remapping.get(name);
if (candidate == undefined) { // only consider the first occurence.
remapping.set(name, ordered_intersection.length);
ordered_intersection.push(name);
}
}
}
}

mat_ptrs = harvest_matrices(x);
indices = utils.createInt32WasmArray(x[0].numberOfRows());
output.matrix = gc.call(
module => module.cbind_with_rownames(x.length, mat_ptrs.offset, name_ptrs.offset, indices.offset),
ScranMatrix
);
// Actually generating the combined matrix.
let output = {};
let tmp_subset = []
let tmp_sliced = []

try {
for (var n = 0; n < names.length; ++n) {
let survivors = utils.createInt32WasmArray(ordered_intersection.length);
survivors.fill(-1);
let sarray = survivors.array();
names[n].forEach((x, i) => {
let candidate = remapping.get(x);
if (candidate !== undefined) {
if (sarray[candidate] < 0) { // only consider the first occurrence.
sarray[candidate] = i;
}
}
});

output.indices = indices.slice(0, output.matrix.numberOfRows());
let internames = [];
for (const i of output.indices) {
internames.push(names[0][i]);
tmp_subset.push(survivors);
tmp_sliced.push(subset.subsetRows(x[n], survivors));
}
output.names = internames;

output.matrix = cbind(tmp_sliced);
output.indices = tmp_subset[0].slice();
output.names = ordered_intersection;

} catch (e) {
utils.free(output.matrix);
throw e;

} finally {
utils.free(mat_ptrs);
utils.free(name_ptrs);
utils.free(indices);
for (const x of renamed) {
for (const x of tmp_subset) {
utils.free(x);
}
for (const x of tmp_sliced) {
utils.free(x);
}
}
Expand Down
26 changes: 0 additions & 26 deletions src/cbind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,34 +60,8 @@ NumericMatrix rbind(int n, uintptr_t mats) {
return NumericMatrix(tatami::make_DelayedBind<0>(std::move(collected)));
}

NumericMatrix cbind_with_rownames(int n, uintptr_t mats, uintptr_t names, uintptr_t indices) {
if (n == 0) {
throw std::runtime_error("need at least one matrix to cbind");
}

auto mat_ptrs = convert_array_of_offsets<const NumericMatrix*>(n, mats);
const auto& first = *(mat_ptrs[0]);
std::vector<std::remove_reference<decltype(first.ptr)>::type> inputs;
inputs.reserve(n);
for (int i = 0; i < n; ++i) {
inputs.push_back(mat_ptrs[i]->ptr);
}

auto name_ptrs = convert_array_of_offsets<const int32_t*>(n, names);
auto out = tatami::bind_intersection<1>(inputs, name_ptrs);

// Save the direct row indices for the first matrix.
const auto& idx = out.second;
auto idptr = reinterpret_cast<int*>(indices);
std::copy(idx.begin(), idx.end(), idptr);

return NumericMatrix(std::move(out.first));
}

EMSCRIPTEN_BINDINGS(cbind) {
emscripten::function("cbind", &cbind);

emscripten::function("rbind", &rbind);

emscripten::function("cbind_with_rownames", &cbind_with_rownames);
}
115 changes: 96 additions & 19 deletions tests/cbind.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -60,43 +60,98 @@ test("cbindWithNames works correctly (simple)", () => {
expect(combined.matrix.numberOfColumns()).toBe(60);

expect(compare.equalArrays(combined.names, ["E", "F", "G", "H", "I", "J"])).toBe(true);
expect(compare.equalArrays(combined.indices, [4, 5, 6, 7, 8, 9])).toBe(true);

expect(compare.equalArrays(combined.matrix.column(0), mat1.column(0).slice(4))).toBe(true);
expect(compare.equalArrays(combined.matrix.column(10), mat2.column(0).slice(2, 8))).toBe(true);
expect(compare.equalArrays(combined.matrix.column(30), mat3.column(0).slice(0, 6))).toBe(true);

expect(
compare.equalArrays(
combined.matrix.row(1),
[ ...mat1.row(5), ...mat2.row(3), ...mat3.row(1) ]
)
).toBe(true);

// Freeing all the bits and pieces.
combined.matrix.free();
mat1.free();
mat2.free();
mat3.free();
})

function quick_check(mat1, mat2, mat3, combined) {
test("cbindWithNames works correctly (complex)", () => {
var mat1 = simulate.simulateDenseMatrix(10, 10);
var names1 = ["Z", "X", "V", "T", "R", "P", "N", "L", "J", "H"]; // every second letter, from the end.
var mat2 = simulate.simulateDenseMatrix(8, 20);
var names2 = ["I", "J", "K", "L", "M", "N", "O", "P"]; // consecutive letters.
var mat3 = simulate.simulateDenseMatrix(4, 30);
var names3 = ["L", "J", "N", "P"]; // random order.

let combined = scran.cbindWithNames([mat1, mat2, mat3], [names1, names2, names3]);
expect(combined.matrix.numberOfRows()).toBe(4);
expect(combined.matrix.numberOfColumns()).toBe(60);

expect(compare.equalArrays(combined.names, ["P", "N", "L", "J",])).toBe(true);
expect(compare.equalArrays(combined.matrix.column(0), mat1.column(0).slice(5, 9))).toBe(true);
expect(compare.equalArrays(combined.indices, [5, 6, 7, 8])).toBe(true);

let y1 = mat1.column(0);
let expected1 = [5,6,7,8].map(i => y1[i]);
expect(compare.equalArrays(combined.matrix.column(0), expected1)).toBe(true);

let y2 = mat2.column(0);
let expected2 = [y2[7], y2[5], y2[3], y2[1]];
let expected2 = [7, 5, 3, 1].map(i => y2[i]);
expect(compare.equalArrays(combined.matrix.column(10), expected2)).toBe(true);

let y3 = mat3.column(0);
let expected3 = [y3[3], y3[2], y3[0], y3[1]];
let expected3 = [3, 2, 0, 1].map(i => y3[i]);
expect(compare.equalArrays(combined.matrix.column(30), expected3)).toBe(true);
}

test("cbindWithNames works correctly (complex)", () => {
expect(
compare.equalArrays(
combined.matrix.row(2),
[ ...mat1.row(7), ...mat2.row(3), ...mat3.row(0) ]
)
).toBe(true);

// Freeing all the bits and pieces.
combined.matrix.free();
mat1.free();
mat2.free();
mat3.free();
})

test("cbindWithNames works correctly (duplicates)", () => {
var mat1 = simulate.simulateDenseMatrix(10, 10);
var names1 = ["Z", "X", "V", "T", "R", "P", "N", "L", "J", "H"]; // every second letter, from the end.
var names1 = ["Z", "A", "Z", "A", "R", "P", "N", "L", "N", "J"]; // preceding duplicates that might mess with the indexing, plus duplicated 'N'.
var mat2 = simulate.simulateDenseMatrix(8, 20);
var names2 = ["I", "J", "K", "L", "M", "N", "O", "P"]; // consecutive letters.
var mat3 = simulate.simulateDenseMatrix(4, 30);
var names3 = ["L", "J", "N", "P"]; // random order.
var names2 = ["I", "J", "K", "L", "M", "N", "P", "P"]; // duplicate in 'P', the first occurrence is used.
var mat3 = simulate.simulateDenseMatrix(5, 30);
var names3 = ["L", "J", "N", "P", "L"]; // duplicate in 'L', first occurrence is used again.

let combined = scran.cbindWithNames([mat1, mat2, mat3], [names1, names2, names3]);
quick_check(mat1, mat2, mat3, combined);
expect(combined.matrix.numberOfRows()).toBe(4);
expect(combined.matrix.numberOfColumns()).toBe(60);
expect(compare.equalArrays(combined.names, ["P", "N", "L", "J",])).toBe(true);
expect(compare.equalArrays(combined.indices, [5, 6, 7, 9])).toBe(true);

let y1 = mat1.column(0);
let expected1 = [5, 6, 7, 9].map(i => y1[i]);
expect(compare.equalArrays(combined.matrix.column(0), expected1)).toBe(true);

let y2 = mat2.column(0);
let expected2 = [6, 5, 3, 1].map(i => y2[i]);
expect(compare.equalArrays(combined.matrix.column(10), expected2)).toBe(true);

let y3 = mat3.column(0);
let expected3 = [3, 2, 0, 1].map(i => y3[i]);
expect(compare.equalArrays(combined.matrix.column(30), expected3)).toBe(true);

expect(
compare.equalArrays(
combined.matrix.row(3),
[ ...mat1.row(9), ...mat2.row(1), ...mat3.row(1) ]
)
).toBe(true);

// Freeing all the bits and pieces.
combined.matrix.free();
Expand All @@ -105,16 +160,38 @@ test("cbindWithNames works correctly (complex)", () => {
mat3.free();
})

test("cbindWithNames works correctly (duplicates)", () => {
test("cbindWithNames works correctly (nulls)", () => {
var mat1 = simulate.simulateDenseMatrix(10, 10);
var names1 = ["Z", "A", "Z", "A", "R", "P", "N", "L", "J", "H"]; // preceding duplicates that might mess with the indexing.
var mat2 = simulate.simulateDenseMatrix(8, 20);
var names2 = ["I", "J", "K", "L", "M", "N", "P", "P"]; // duplicate in 'P', the last occurrence is used.
var mat3 = simulate.simulateDenseMatrix(4, 30);
var names3 = ["L", "J", "N", "P"];
var names1 = ["A", "B", null, "D", "E", "F", null, "H", "I", "J"];
var mat2 = simulate.simulateDenseMatrix(10, 20);
var names2 = ["C", "D", "E", null, "G", "H", "I", "J", "K", "L"];
var mat3 = simulate.simulateDenseMatrix(10, 30);
var names3 = ["E", "F", null, "H", null, "J", "K", "L", "M", "N"];

let combined = scran.cbindWithNames([mat1, mat2, mat3], [names1, names2, names3]);
quick_check(mat1, mat2, mat3, combined);
expect(combined.matrix.numberOfRows()).toBe(3);
expect(combined.matrix.numberOfColumns()).toBe(60);
expect(compare.equalArrays(combined.names, ["E", "H", "J"])).toBe(true);
expect(compare.equalArrays(combined.indices, [4, 7, 9])).toBe(true);

let y1 = mat1.column(0);
let expected1 = [4, 7, 9].map(i => y1[i]);
expect(compare.equalArrays(combined.matrix.column(0), expected1)).toBe(true);

let y2 = mat2.column(0);
let expected2 = [2, 5, 7].map(i => y2[i]);
expect(compare.equalArrays(combined.matrix.column(10), expected2)).toBe(true);

let y3 = mat3.column(0);
let expected3 = [0, 3, 5].map(i => y3[i]);
expect(compare.equalArrays(combined.matrix.column(30), expected3)).toBe(true);

expect(
compare.equalArrays(
combined.matrix.row(0),
[ ...mat1.row(4), ...mat2.row(2), ...mat3.row(0) ]
)
).toBe(true);

// Freeing all the bits and pieces.
combined.matrix.free();
Expand Down