diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json
index bebdd6a5e..9a577c52f 100644
--- a/src/core/config/Categories.json
+++ b/src/core/config/Categories.json
@@ -320,7 +320,8 @@
             "Unescape string",
             "Pseudo-Random Number Generator",
             "Sleep",
-            "File Tree"
+            "File Tree",
+            "Word Count"
         ]
     },
     {
diff --git a/src/core/operations/WordCount.mjs b/src/core/operations/WordCount.mjs
new file mode 100644
index 000000000..a94eb0ac9
--- /dev/null
+++ b/src/core/operations/WordCount.mjs
@@ -0,0 +1,117 @@
+/**
+ * @author sw5678
+ * @copyright Crown Copyright 2016
+ * @license Apache-2.0
+ */
+
+import Operation from "../Operation.mjs";
+import Utils from "../Utils.mjs";
+import {LETTER_DELIM_OPTIONS} from "../lib/Delim.mjs";
+import {caseInsensitiveSort} from "../lib/Sort.mjs";
+
+/**
+ * Punctuation characters stripped from each token before it is counted.
+ * The apostrophe is not listed, so contractions such as "don't" stay intact.
+ */
+const PUNCTUATION_REGEX = /[!"#$%&()*+,\-./:;<=>?@[\\\]^_`{|}~£]/g;
+
+/**
+ * Matches tokens made up solely of apostrophes, which are not counted as words.
+ */
+const ONLY_APOSTROPHES_REGEX = /^'+$/;
+
+/**
+ * Word Count operation
+ */
+class WordCount extends Operation {
+
+    /**
+     * Word Count constructor
+     */
+    constructor() {
+        super();
+
+        this.name = "Word Count";
+        this.module = "Default";
+        this.description = "Provides a count of each word in a given text";
+        this.inputType = "string";
+        this.outputType = "string";
+        this.args = [
+            {
+                name: "Delimiter",
+                type: "option",
+                value: LETTER_DELIM_OPTIONS
+            },
+            {
+                name: "Include Total",
+                type: "boolean",
+                value: true
+            },
+            {
+                name: "Order",
+                type: "option",
+                value: ["Alphabetical", "Count"]
+            }
+        ];
+    }
+
+    /**
+     * Counts the occurrences of each word in the input. The input is lower-cased and split
+     * on the chosen delimiter; line breaks are always treated as delimiters as well.
+     *
+     * @param {string} input
+     * @param {Object[]} args - [delimiter name, include total, order]
+     * @returns {string} "WORD,COUNT" header, one "word,count" row per word, optional "TOTAL,n"
+     */
+    run(input, args) {
+        const [delimiterName, includeTotal, order] = args;
+        const delimiter = Utils.charRep(delimiterName);
+
+        // Turn line breaks into delimiters, then lower case and split
+        const tokens = input.replace(/\r\n|\r|\n/g, delimiter).toLowerCase().split(delimiter);
+
+        // A Map rather than a plain object: words that are also names of inherited object
+        // properties (e.g. "constructor") must be counted like any other word
+        const counter = new Map();
+        let total = 0;
+        for (const token of tokens) {
+            // Remove punctuation and trim whitespace
+            const word = token.replace(PUNCTUATION_REGEX, "").trim();
+
+            // Skip tokens with no word content: empty strings and runs of apostrophes
+            if (word === "" || ONLY_APOSTROPHES_REGEX.test(word)) {
+                continue;
+            }
+
+            counter.set(word, (counter.get(word) || 0) + 1);
+            total++;
+        }
+
+        // Sort results
+        let words;
+        if (order === "Count") {
+            // Highest count first
+            words = [...counter.entries()]
+                .sort((first, second) => second[1] - first[1])
+                .map(([word]) => word);
+        } else {
+            // Alphabetical; also the fallback if the order value is not recognised
+            words = [...counter.keys()].sort(caseInsensitiveSort);
+        }
+
+        // Process output to string
+        let output = "WORD,COUNT\n";
+        for (const word of words) {
+            output += `${word},${counter.get(word)}\n`;
+        }
+
+        // Add total counter at the bottom
+        if (includeTotal) {
+            output += `TOTAL,${total}`;
+        }
+
+        return output;
+    }
+}
+
+export default WordCount;
diff --git a/tests/operations/index.mjs b/tests/operations/index.mjs
index 40ce7a2ee..5eb9b9ff8 100644
--- a/tests/operations/index.mjs
+++ b/tests/operations/index.mjs
@@ -151,6 +151,7 @@ import "./tests/TranslateDateTimeFormat.mjs";
 import "./tests/Typex.mjs";
 import "./tests/UnescapeString.mjs";
 import "./tests/Unicode.mjs";
+import "./tests/WordCount.mjs";
 import "./tests/YARA.mjs";
 import "./tests/ParseCSR.mjs";
 import "./tests/XXTEA.mjs";
diff --git a/tests/operations/tests/WordCount.mjs b/tests/operations/tests/WordCount.mjs
new file mode 100644
index 000000000..bc518de2d
--- /dev/null
+++ b/tests/operations/tests/WordCount.mjs
@@ -0,0 +1,141 @@
+/**
+ * @author sw5678
+ * @copyright Crown Copyright 2023
+ * @license Apache-2.0
+ */
+import TestRegister from "../../lib/TestRegister.mjs";
+
+TestRegister.addTests([
+    {
+        "name": "Word Count: Empty test 1",
+        "input": "",
+        "expectedOutput": "WORD,COUNT\nTOTAL,0",
+
+        "recipeConfig": [
+            {
+                "op": "Word Count",
+                "args": ["Space", true, "Alphabetical"],
+            },
+        ],
+    },
+    {
+        "name": "Word Count: Empty test 2",
+        "input": "",
+        "expectedOutput": "WORD,COUNT\nTOTAL,0",
+
+        "recipeConfig": [
+            {
+                "op": "Word Count",
+                "args": ["Space", true, "Count"],
+            },
+        ],
+    },
+    {
+        "name": "Word Count: Empty test 3",
+        "input": "",
+        "expectedOutput": "WORD,COUNT\n",
+
+        "recipeConfig": [
+            {
+                "op": "Word Count",
+                "args": ["Space", false, "Alphabetical"],
+            },
+        ],
+    },
+    {
+        "name": "Word Count: Empty test 4",
+        "input": "",
+        "expectedOutput": "WORD,COUNT\n",
+
+        "recipeConfig": [
+            {
+                "op": "Word Count",
+                "args": ["Space", false, "Count"],
+            },
+        ],
+    },
+    {
+        "name": "Word Count: Count test 1",
+        "input": "Hello world. Hello. \n\n World, ''!@£$%^&*()_+=-[]{};'|:/.,<>? world",
+        "expectedOutput": "WORD,COUNT\nhello,2\nworld,3\nTOTAL,5",
+
+        "recipeConfig": [
+            {
+                "op": "Word Count",
+                "args": ["Space", true, "Alphabetical"],
+            },
+        ],
+    },
+    {
+        "name": "Word Count: Count test 2",
+        "input": "Hello world. Hello. \n\n World, ''!@£$%^&*()_+=-[]{};'|:/.,<>? world",
+        "expectedOutput": "WORD,COUNT\nworld,3\nhello,2\nTOTAL,5",
+
+        "recipeConfig": [
+            {
+                "op": "Word Count",
+                "args": ["Space", true, "Count"],
+            },
+        ],
+    },
+    {
+        "name": "Word Count: Count test 3",
+        "input": "Hello world. Hello. \n\n World, ''!@£$%^&*()_+=-[]{};'|:/.,<>? world",
+        "expectedOutput": "WORD,COUNT\nhello,2\nworld,3\n",
+
+        "recipeConfig": [
+            {
+                "op": "Word Count",
+                "args": ["Space", false, "Alphabetical"],
+            },
+        ],
+    },
+    {
+        "name": "Word Count: Count test 4",
+        "input": "Hello world. Hello. \n\n World, ''!@£$%^&*()_+=-[]{};'|:/.,<>? world",
+        "expectedOutput": "WORD,COUNT\nworld,3\nhello,2\n",
+
+        "recipeConfig": [
+            {
+                "op": "Word Count",
+                "args": ["Space", false, "Count"],
+            },
+        ],
+    },
+    {
+        "name": "Word Count: Different delimiter test",
+        "input": "Hello, World\nhello, world \n''!@£$%^&*()_+=-[]{};'|:/.,<>? world",
+        "expectedOutput": "WORD,COUNT\nworld,3\nhello,2\n",
+
+        "recipeConfig": [
+            {
+                "op": "Word Count",
+                "args": ["Comma", false, "Count"],
+            },
+        ],
+    },
+    {
+        "name": "Word Count: Inherited property name test",
+        "input": "constructor Constructor toString",
+        "expectedOutput": "WORD,COUNT\nconstructor,2\ntostring,1\nTOTAL,3",
+
+        "recipeConfig": [
+            {
+                "op": "Word Count",
+                "args": ["Space", true, "Alphabetical"],
+            },
+        ],
+    },
+    {
+        "name": "Word Count: Apostrophe test",
+        "input": "Don't stop. don't",
+        "expectedOutput": "WORD,COUNT\ndon't,2\nstop,1\nTOTAL,3",
+
+        "recipeConfig": [
+            {
+                "op": "Word Count",
+                "args": ["Space", true, "Count"],
+            },
+        ],
+    }
+]);