Skip to content

Commit

Permalink
Add operation to normalise unicode
Browse files Browse the repository at this point in the history
  • Loading branch information
matthieuxyz committed Nov 25, 2019
1 parent 610d46a commit a6fa062
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 0 deletions.
5 changes: 5 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@
"ssdeep.js": "0.0.2",
"tesseract.js": "^2.0.0-alpha.15",
"ua-parser-js": "^0.7.20",
"unorm": "^1.6.0",
"utf8": "^3.0.0",
"vkbeautify": "^0.99.3",
"xmldom": "^0.1.27",
Expand Down
1 change: 1 addition & 0 deletions src/core/config/Categories.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
"URL Decode",
"Escape Unicode Characters",
"Unescape Unicode Characters",
"Normalise Unicode",
"To Quoted Printable",
"From Quoted Printable",
"To Punycode",
Expand Down
12 changes: 12 additions & 0 deletions src/core/lib/ChrEnc.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,15 @@ export const IO_FORMAT = {
"Simplified Chinese GB18030 (54936)": 54936,
};

/**
* Unicode Normalisation Forms
*
* @author Matthieu [m@tthieu.xyz]
* @copyright Crown Copyright 2016
* @license Apache-2.0
*/

/**
* Identifiers of the four Unicode Normalisation Forms (NFD, NFC, NFKD, NFKC).
* Used as the list of selectable values for the "Normal Form" option of the
* Normalise Unicode operation.
*/
export const UNICODE_NORMALISATION_FORMS = ["NFD", "NFC", "NFKD", "NFKC"];
60 changes: 60 additions & 0 deletions src/core/operations/NormaliseUnicode.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/**
* @author Matthieu [m@tthieu.xyz]
* @copyright Crown Copyright 2019
* @license Apache-2.0
*/

import Operation from "../Operation.mjs";
import OperationError from "../errors/OperationError.mjs";
import unorm from "unorm";
import {UNICODE_NORMALISATION_FORMS} from "../lib/ChrEnc";

/**
* Normalise Unicode operation
*/
/**
 * Normalise Unicode operation
 *
 * Rewrites the input string into the Unicode Normalisation Form selected by
 * the user (NFD, NFC, NFKD or NFKC), delegating the work to the unorm library.
 */
class NormaliseUnicode extends Operation {

    /**
     * NormaliseUnicode constructor
     */
    constructor() {
        super();

        this.name = "Normalise Unicode";
        this.module = "UnicodeNormalisation";
        this.description = "Transform Unicode to one of the Normalisation Form";
        this.infoURL = "http://www.unicode.org/reports/tr15/";
        this.inputType = "string";
        this.outputType = "string";
        this.args = [
            {
                name: "Normal Form",
                type: "option",
                value: UNICODE_NORMALISATION_FORMS
            }
        ];
    }

    /**
     * Normalises the input to the requested form.
     *
     * @param {string} input - text to normalise
     * @param {Object[]} args - single-element list holding the form name
     *     ("NFD", "NFC", "NFKD" or "NFKC")
     * @returns {string} the normalised text
     * @throws {OperationError} if the form name is not one of the four above
     */
    run(input, args) {
        const [normalForm] = args;

        switch (normalForm) {
            case "NFD":
                return unorm.nfd(input);
            case "NFC":
                return unorm.nfc(input);
            case "NFKD":
                return unorm.nfkd(input);
            case "NFKC":
                // Must be the compatibility composition (nfkc), not the
                // canonical one (nfc): only nfkc folds compatibility
                // characters such as U+2160 into their plain equivalents.
                return unorm.nfkc(input);
            default:
                throw new OperationError("Unknown Normalisation Form");
        }
    }

}

export default NormaliseUnicode;
1 change: 1 addition & 0 deletions tests/operations/index.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ import "./tests/MS.mjs";
import "./tests/Magic.mjs";
import "./tests/MorseCode.mjs";
import "./tests/NetBIOS.mjs";
import "./tests/NormaliseUnicode.mjs";
import "./tests/OTP.mjs";
import "./tests/PGP.mjs";
import "./tests/PHP.mjs";
Expand Down
54 changes: 54 additions & 0 deletions tests/operations/tests/NormaliseUnicode.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/**
* Normalise Unicode tests.
*
* @author Matthieu [m@tthieux.xyz]
*
* @copyright Crown Copyright 2018
* @license Apache-2.0
*/
import TestRegister from "../../lib/TestRegister.mjs";

// Every case feeds the same input: precomposed U+00C7, the decomposed pair
// U+0043 U+0327, and U+2160 (ROMAN NUMERAL ONE). The canonical forms (NFD/NFC)
// must leave U+2160 untouched, while the compatibility forms (NFKD/NFKC) must
// replace it with ASCII "I".
TestRegister.addTests([
    {
        name: "Normalise Unicode - NFD",
        input: "\u00c7\u0043\u0327\u2160",
        expectedMatch: /C\u0327C\u0327\u2160/,
        recipeConfig: [
            {
                op: "Normalise Unicode",
                args: ["NFD"],
            },
        ],
    }, {
        name: "Normalise Unicode - NFC",
        input: "\u00c7\u0043\u0327\u2160",
        expectedMatch: /\u00C7\u00C7\u2160/,
        recipeConfig: [
            {
                op: "Normalise Unicode",
                args: ["NFC"],
            },
        ],
    }, {
        name: "Normalise Unicode - NFKD",
        input: "\u00c7\u0043\u0327\u2160",
        expectedMatch: /C\u0327C\u0327I/,
        recipeConfig: [
            {
                op: "Normalise Unicode",
                args: ["NFKD"],
            },
        ],
    }, {
        name: "Normalise Unicode - NFKC",
        input: "\u00c7\u0043\u0327\u2160",
        // NFKC = compatibility decomposition followed by canonical
        // composition, so U+2160 becomes "I" (unlike the NFC case).
        expectedMatch: /\u00C7\u00C7I/,
        recipeConfig: [
            {
                op: "Normalise Unicode",
                args: ["NFKC"],
            },
        ],
    },
]);

0 comments on commit a6fa062

Please sign in to comment.