/**
 * Levenshtein Distance operation
 *
 * Computes the weighted edit distance between exactly two samples taken
 * from the input, using separately configurable insertion, deletion and
 * substitution costs.
 */
class LevenshteinDistance extends Operation {

    /**
     * LevenshteinDistance constructor
     */
    constructor() {
        super();

        this.name = "Levenshtein Distance";
        this.module = "Default";
        this.description = "Levenshtein Distance (also known as Edit Distance) is a string metric to measure a difference between two strings that counts operations (insertions, deletions, and substitutions) on single character that are required to change one string to another.";
        this.infoURL = "https://wikipedia.org/wiki/Levenshtein_distance";
        this.inputType = "string";
        this.outputType = "number";
        this.args = [
            {
                name: "Sample delimiter",
                type: "binaryString",
                value: "\\n"
            },
            {
                name: "Insertion cost",
                type: "number",
                value: 1
            },
            {
                name: "Deletion cost",
                type: "number",
                value: 1
            },
            {
                name: "Substitution cost",
                type: "number",
                value: 1
            },
        ];
    }

    /**
     * Splits the input into two samples and returns the minimum total cost of
     * the insertions, deletions and substitutions needed to turn the first
     * sample into the second.
     *
     * Samples are compared by Unicode code point, so a character outside the
     * Basic Multilingual Plane (stored as a surrogate pair) counts as one
     * character rather than two.
     *
     * @param {string} input - Two samples separated by the delimiter
     * @param {Object[]} args - [delimiter, insertion cost, deletion cost, substitution cost]
     * @returns {number} The weighted edit distance from the first sample to the second
     * @throws {OperationError} If the input does not split into exactly two
     *     samples, or if any cost is negative
     */
    run(input, args) {
        const [delim, insCost, delCost, subCost] = args;
        const samples = input.split(delim);
        if (samples.length !== 2) {
            throw new OperationError("Incorrect number of samples. Check your input and/or delimiter.");
        }
        if (insCost < 0 || delCost < 0 || subCost < 0) {
            throw new OperationError("Negative costs are not allowed.");
        }

        // Array.from() iterates a string by code point; indexing the string
        // directly would walk UTF-16 code units and split surrogate pairs.
        const src = Array.from(samples[0]);
        const dest = Array.from(samples[1]);

        // Only two rows of the DP table are kept. currentCost[j] is the cost
        // of turning the first j characters of src into the part of dest
        // consumed so far; row 0 (empty dest) is therefore pure deletions.
        let currentCost = new Array(src.length + 1);
        let nextCost = new Array(src.length + 1);
        for (let j = 0; j <= src.length; j++) {
            currentCost[j] = delCost * j;
        }

        for (let i = 0; i < dest.length; i++) {
            const destChar = dest[i];
            // Empty src prefix: every dest character so far was inserted.
            nextCost[0] = currentCost[0] + insCost;
            for (let j = 0; j < src.length; j++) {
                const insertion = currentCost[j + 1] + insCost;
                const deletion = nextCost[j] + delCost;
                // A matching character carries over the diagonal cost for free.
                const substitution = currentCost[j] + (src[j] === destChar ? 0 : subCost);
                nextCost[j + 1] = Math.min(insertion, deletion, substitution);
            }
            // The freshly filled row becomes the reference row for the next
            // dest character; the old row is recycled as scratch space.
            [currentCost, nextCost] = [nextCost, currentCost];
        }

        return currentCost[src.length];
    }

}
/**
 * Builds one test case that runs the "Levenshtein Distance" operation as a
 * single-step recipe.
 *
 * @param {string} label - Text appended to the "Levenshtein Distance: " name prefix
 * @param {string} input - Recipe input
 * @param {string} expectedOutput - Expected recipe output (result or error message)
 * @param {Array} args - [delimiter, insertion cost, deletion cost, substitution cost]
 * @returns {Object} Test definition in the shape passed to TestRegister.addTests
 */
const levenshteinCase = (label, input, expectedOutput, args) => ({
    name: `Levenshtein Distance: ${label}`,
    input,
    expectedOutput,
    recipeConfig: [
        {
            op: "Levenshtein Distance",
            args,
        },
    ],
});

// Fixtures shared by several cases.
const KITTEN_SITTING = "kitten\nsitting";
const NEWLINE_DELIM = "\\n";
const SAMPLE_COUNT_ERROR = "Incorrect number of samples. Check your input and/or delimiter.";
const NEGATIVE_COST_ERROR = "Negative costs are not allowed.";

TestRegister.addTests([
    // Unit costs: the classic Levenshtein distance.
    levenshteinCase("Wikipedia example 1", KITTEN_SITTING, "3", [NEWLINE_DELIM, 1, 1, 1]),
    levenshteinCase("Wikipedia example 2", "saturday\nsunday", "3", [NEWLINE_DELIM, 1, 1, 1]),

    // Weighted costs.
    levenshteinCase("Wikipedia example 1 with substitution cost 2", KITTEN_SITTING, "5", [NEWLINE_DELIM, 1, 1, 2]),
    levenshteinCase("varied costs 1", KITTEN_SITTING, "230", [NEWLINE_DELIM, 10, 100, 1000]),
    levenshteinCase("varied costs 2", KITTEN_SITTING, "1020", [NEWLINE_DELIM, 1000, 100, 10]),

    // Delimiter handling and sample-count validation.
    levenshteinCase("another delimiter", "kitten sitting", "3", [" ", 1, 1, 1]),
    levenshteinCase("too few samples", "kitten", SAMPLE_COUNT_ERROR, [NEWLINE_DELIM, 1, 1, 1]),
    levenshteinCase("too many samples", "kitten\nsitting\nkitchen", SAMPLE_COUNT_ERROR, [NEWLINE_DELIM, 1, 1, 1]),

    // Cost validation.
    levenshteinCase("negative insertion cost", KITTEN_SITTING, NEGATIVE_COST_ERROR, [NEWLINE_DELIM, -1, 1, 1]),
    levenshteinCase("negative deletion cost", KITTEN_SITTING, NEGATIVE_COST_ERROR, [NEWLINE_DELIM, 1, -1, 1]),
    levenshteinCase("negative substitution cost", KITTEN_SITTING, NEGATIVE_COST_ERROR, [NEWLINE_DELIM, 1, 1, -1]),
    levenshteinCase("cost zero", KITTEN_SITTING, "0", [NEWLINE_DELIM, 0, 0, 0]),
]);