diff --git a/js/cbind.js b/js/cbind.js index b4d57248..8750e870 100644 --- a/js/cbind.js +++ b/js/cbind.js @@ -1,6 +1,7 @@ import * as utils from "./utils.js"; import { ScranMatrix } from "./ScranMatrix.js"; import * as gc from "./gc.js"; +import * as subset from "./subset.js"; function harvest_matrices(x) { let output = utils.createBigUint64WasmArray(x.length); @@ -67,6 +68,7 @@ export function rbind(inputs) { return output; } + /** * Combine matrices by column, after subsetting each matrix to the intersection of common features. * @@ -74,80 +76,85 @@ export function rbind(inputs) { * @param {Array} names - Array of length equal to `inputs`. * Each entry should be an Array containing the row names of the corresponding entry of `inputs`. * Names should correspond to the rows of that entry of `inputs`. + * Any `null` names are ignored. + * If names are duplicated within each array, only the first occurrence is considered in the intersection. * * @return {object} An object containing: * - `matrix`, a {@linkplain ScranMatrix} containing the combined matrices. - * - `indices`, an Int32WasmArray of length equal to the number of rows in `matrix`. + * - `indices`, an Int32Array of length equal to the number of rows in `matrix`. * This contains the index of the row in the first entry of `inputs` corresponding to each row of `matrix`, * i.e., the gene at the `i`-th row of `matrix` is the same as the gene at the `indices[i]`-th row of `inputs[0]`. + * This is guaranteed to be sorted. * - `names`, an array of names identifying the rows of `matrix`. * This is constructed by indexing the first entry of `names` with `indices`. */ export function cbindWithNames(x, names) { - let mat_ptrs; - let renamed = []; - let name_ptrs; - let indices; - let output = {}; - - try { - // Building a common set of rownames. - if (names.length !== x.length) { - throw new Error("length of 'names' should be equal to length of 'x'"); - } - - let common = {}; - let universe = []; - for (var i = 0; i < names.length; i++) { - if (x[i].numberOfRows() !== names[i].length) { - throw new Error("length of each 'names' must equal number of rows of its corresponding 'x'"); - } - names[i].forEach(x => { - if (!(x in common)) { - common[x] = universe.length; - universe.push(x); + // Find the intersection of names, following the order of the first entry. + // We do so to try to improve the chance of getting an ordered subset for efficient extraction. + let ordered_intersection = []; + let remapping = new Map; + + if (names.length > 0) { + let intersection = new Set; + for (var n = 0; n < names.length; ++n) { + let current = new Set(); + for (const name of names[n]) { + if (name !== null) { + if (n == 0 || intersection.has(name)) { + current.add(name); + } } - }); + } + intersection = current; } - name_ptrs = utils.createBigUint64WasmArray(x.length); - { - let names_arr = name_ptrs.array(); - for (var i = 0; i < names.length; i++) { - let current = names[i]; - let replacement = utils.createInt32WasmArray(current.length); - let replacement_arr = replacement.array(); - current.forEach((x, i) => { - replacement_arr[i] = common[x]; - }); - renamed.push(replacement); - names_arr[i] = BigInt(replacement.offset); + for (const name of names[0]) { + if (name !== null && intersection.has(name)) { + let candidate = remapping.get(name); + if (candidate == undefined) { // only consider the first occurence. + remapping.set(name, ordered_intersection.length); + ordered_intersection.push(name); + } } } + } - mat_ptrs = harvest_matrices(x); - indices = utils.createInt32WasmArray(x[0].numberOfRows()); - output.matrix = gc.call( - module => module.cbind_with_rownames(x.length, mat_ptrs.offset, name_ptrs.offset, indices.offset), - ScranMatrix - ); + // Actually generating the combined matrix. + let output = {}; + let tmp_subset = [] + let tmp_sliced = [] + + try { + for (var n = 0; n < names.length; ++n) { + let survivors = utils.createInt32WasmArray(ordered_intersection.length); + survivors.fill(-1); + let sarray = survivors.array(); + names[n].forEach((x, i) => { + let candidate = remapping.get(x); + if (candidate !== undefined) { + if (sarray[candidate] < 0) { // only consider the first occurrence. + sarray[candidate] = i; + } + } + }); - output.indices = indices.slice(0, output.matrix.numberOfRows()); - let internames = []; - for (const i of output.indices) { - internames.push(names[0][i]); + tmp_subset.push(survivors); + tmp_sliced.push(subset.subsetRows(x[n], survivors)); } - output.names = internames; + + output.matrix = cbind(tmp_sliced); + output.indices = tmp_subset[0].slice(); + output.names = ordered_intersection; } catch (e) { utils.free(output.matrix); throw e; } finally { - utils.free(mat_ptrs); - utils.free(name_ptrs); - utils.free(indices); - for (const x of renamed) { + for (const x of tmp_subset) { + utils.free(x); + } + for (const x of tmp_sliced) { utils.free(x); } } diff --git a/src/cbind.cpp b/src/cbind.cpp index b5126ef6..b64c378b 100644 --- a/src/cbind.cpp +++ b/src/cbind.cpp @@ -60,34 +60,8 @@ NumericMatrix rbind(int n, uintptr_t mats) { return NumericMatrix(tatami::make_DelayedBind<0>(std::move(collected))); } -NumericMatrix cbind_with_rownames(int n, uintptr_t mats, uintptr_t names, uintptr_t indices) { - if (n == 0) { - throw std::runtime_error("need at least one matrix to cbind"); - } - - auto mat_ptrs = convert_array_of_offsets(n, mats); - const auto& first = *(mat_ptrs[0]); - std::vector::type> inputs; - inputs.reserve(n); - for (int i = 0; i < n; ++i) { - inputs.push_back(mat_ptrs[i]->ptr); - } - - auto name_ptrs = convert_array_of_offsets(n, names); - auto out = tatami::bind_intersection<1>(inputs, name_ptrs); - - // Save the direct row indices for the first matrix. - const auto& idx = out.second; - auto idptr = reinterpret_cast(indices); - std::copy(idx.begin(), idx.end(), idptr); - - return NumericMatrix(std::move(out.first)); -} - EMSCRIPTEN_BINDINGS(cbind) { emscripten::function("cbind", &cbind); emscripten::function("rbind", &rbind); - - emscripten::function("cbind_with_rownames", &cbind_with_rownames); } diff --git a/tests/cbind.test.js b/tests/cbind.test.js index 00fd80b0..b0710a37 100644 --- a/tests/cbind.test.js +++ b/tests/cbind.test.js @@ -60,10 +60,19 @@ test("cbindWithNames works correctly (simple)", () => { expect(combined.matrix.numberOfColumns()).toBe(60); expect(compare.equalArrays(combined.names, ["E", "F", "G", "H", "I", "J"])).toBe(true); + expect(compare.equalArrays(combined.indices, [4, 5, 6, 7, 8, 9])).toBe(true); + expect(compare.equalArrays(combined.matrix.column(0), mat1.column(0).slice(4))).toBe(true); expect(compare.equalArrays(combined.matrix.column(10), mat2.column(0).slice(2, 8))).toBe(true); expect(compare.equalArrays(combined.matrix.column(30), mat3.column(0).slice(0, 6))).toBe(true); + expect( + compare.equalArrays( + combined.matrix.row(1), + [ ...mat1.row(5), ...mat2.row(3), ...mat3.row(1) ] + ) + ).toBe(true); + // Freeing all the bits and pieces. combined.matrix.free(); mat1.free(); @@ -71,32 +80,78 @@ test("cbindWithNames works correctly (simple)", () => { mat3.free(); }) -function quick_check(mat1, mat2, mat3, combined) { +test("cbindWithNames works correctly (complex)", () => { + var mat1 = simulate.simulateDenseMatrix(10, 10); + var names1 = ["Z", "X", "V", "T", "R", "P", "N", "L", "J", "H"]; // every second letter, from the end. + var mat2 = simulate.simulateDenseMatrix(8, 20); + var names2 = ["I", "J", "K", "L", "M", "N", "O", "P"]; // consecutive letters. + var mat3 = simulate.simulateDenseMatrix(4, 30); + var names3 = ["L", "J", "N", "P"]; // random order. + + let combined = scran.cbindWithNames([mat1, mat2, mat3], [names1, names2, names3]); expect(combined.matrix.numberOfRows()).toBe(4); expect(combined.matrix.numberOfColumns()).toBe(60); - expect(compare.equalArrays(combined.names, ["P", "N", "L", "J",])).toBe(true); - expect(compare.equalArrays(combined.matrix.column(0), mat1.column(0).slice(5, 9))).toBe(true); + expect(compare.equalArrays(combined.indices, [5, 6, 7, 8])).toBe(true); + + let y1 = mat1.column(0); + let expected1 = [5,6,7,8].map(i => y1[i]); + expect(compare.equalArrays(combined.matrix.column(0), expected1)).toBe(true); let y2 = mat2.column(0); - let expected2 = [y2[7], y2[5], y2[3], y2[1]]; + let expected2 = [7, 5, 3, 1].map(i => y2[i]); expect(compare.equalArrays(combined.matrix.column(10), expected2)).toBe(true); let y3 = mat3.column(0); - let expected3 = [y3[3], y3[2], y3[0], y3[1]]; + let expected3 = [3, 2, 0, 1].map(i => y3[i]); expect(compare.equalArrays(combined.matrix.column(30), expected3)).toBe(true); -} -test("cbindWithNames works correctly (complex)", () => { + expect( + compare.equalArrays( + combined.matrix.row(2), + [ ...mat1.row(7), ...mat2.row(3), ...mat3.row(0) ] + ) + ).toBe(true); + + // Freeing all the bits and pieces. + combined.matrix.free(); + mat1.free(); + mat2.free(); + mat3.free(); +}) + +test("cbindWithNames works correctly (duplicates)", () => { var mat1 = simulate.simulateDenseMatrix(10, 10); - var names1 = ["Z", "X", "V", "T", "R", "P", "N", "L", "J", "H"]; // every second letter, from the end. + var names1 = ["Z", "A", "Z", "A", "R", "P", "N", "L", "N", "J"]; // preceding duplicates that might mess with the indexing, plus duplicated 'N'. var mat2 = simulate.simulateDenseMatrix(8, 20); - var names2 = ["I", "J", "K", "L", "M", "N", "O", "P"]; // consecutive letters. - var mat3 = simulate.simulateDenseMatrix(4, 30); - var names3 = ["L", "J", "N", "P"]; // random order. + var names2 = ["I", "J", "K", "L", "M", "N", "P", "P"]; // duplicate in 'P', the first occurrence is used. + var mat3 = simulate.simulateDenseMatrix(5, 30); + var names3 = ["L", "J", "N", "P", "L"]; // duplicate in 'L', first occurrence is used again. let combined = scran.cbindWithNames([mat1, mat2, mat3], [names1, names2, names3]); - quick_check(mat1, mat2, mat3, combined); + expect(combined.matrix.numberOfRows()).toBe(4); + expect(combined.matrix.numberOfColumns()).toBe(60); + expect(compare.equalArrays(combined.names, ["P", "N", "L", "J",])).toBe(true); + expect(compare.equalArrays(combined.indices, [5, 6, 7, 9])).toBe(true); + + let y1 = mat1.column(0); + let expected1 = [5, 6, 7, 9].map(i => y1[i]); + expect(compare.equalArrays(combined.matrix.column(0), expected1)).toBe(true); + + let y2 = mat2.column(0); + let expected2 = [6, 5, 3, 1].map(i => y2[i]); + expect(compare.equalArrays(combined.matrix.column(10), expected2)).toBe(true); + + let y3 = mat3.column(0); + let expected3 = [3, 2, 0, 1].map(i => y3[i]); + expect(compare.equalArrays(combined.matrix.column(30), expected3)).toBe(true); + + expect( + compare.equalArrays( + combined.matrix.row(3), + [ ...mat1.row(9), ...mat2.row(1), ...mat3.row(1) ] + ) + ).toBe(true); // Freeing all the bits and pieces. combined.matrix.free(); @@ -105,16 +160,38 @@ test("cbindWithNames works correctly (complex)", () => { mat3.free(); }) -test("cbindWithNames works correctly (duplicates)", () => { +test("cbindWithNames works correctly (nulls)", () => { var mat1 = simulate.simulateDenseMatrix(10, 10); - var names1 = ["Z", "A", "Z", "A", "R", "P", "N", "L", "J", "H"]; // preceding duplicates that might mess with the indexing. - var mat2 = simulate.simulateDenseMatrix(8, 20); - var names2 = ["I", "J", "K", "L", "M", "N", "P", "P"]; // duplicate in 'P', the last occurrence is used. - var mat3 = simulate.simulateDenseMatrix(4, 30); - var names3 = ["L", "J", "N", "P"]; + var names1 = ["A", "B", null, "D", "E", "F", null, "H", "I", "J"]; + var mat2 = simulate.simulateDenseMatrix(10, 20); + var names2 = ["C", "D", "E", null, "G", "H", "I", "J", "K", "L"]; + var mat3 = simulate.simulateDenseMatrix(10, 30); + var names3 = ["E", "F", null, "H", null, "J", "K", "L", "M", "N"]; let combined = scran.cbindWithNames([mat1, mat2, mat3], [names1, names2, names3]); - quick_check(mat1, mat2, mat3, combined); + expect(combined.matrix.numberOfRows()).toBe(3); + expect(combined.matrix.numberOfColumns()).toBe(60); + expect(compare.equalArrays(combined.names, ["E", "H", "J"])).toBe(true); + expect(compare.equalArrays(combined.indices, [4, 7, 9])).toBe(true); + + let y1 = mat1.column(0); + let expected1 = [4, 7, 9].map(i => y1[i]); + expect(compare.equalArrays(combined.matrix.column(0), expected1)).toBe(true); + + let y2 = mat2.column(0); + let expected2 = [2, 5, 7].map(i => y2[i]); + expect(compare.equalArrays(combined.matrix.column(10), expected2)).toBe(true); + + let y3 = mat3.column(0); + let expected3 = [0, 3, 5].map(i => y3[i]); + expect(compare.equalArrays(combined.matrix.column(30), expected3)).toBe(true); + + expect( + compare.equalArrays( + combined.matrix.row(0), + [ ...mat1.row(4), ...mat2.row(2), ...mat3.row(0) ] + ) + ).toBe(true); // Freeing all the bits and pieces. combined.matrix.free();