Skip to content

Commit

Permalink
Automatically detect UTF8 character encoding in output
Browse files Browse the repository at this point in the history
  • Loading branch information
n1474335 committed Mar 26, 2024
1 parent 16dfb3f commit 65ffd8d
Show file tree
Hide file tree
Showing 7 changed files with 270 additions and 147 deletions.
81 changes: 79 additions & 2 deletions src/core/lib/ChrEnc.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,85 @@ export function chrEncWidth(page) {
* @copyright Crown Copyright 2019
* @license Apache-2.0
*/
export const UNICODE_NORMALISATION_FORMS = ["NFD", "NFC", "NFKD", "NFKC"];


/**
* Character encoding format mappings.
* Detects whether the input buffer is valid UTF8.
*
* @param {ArrayBuffer} data
* @returns {number} - 0 = not UTF8, 1 = ASCII, 2 = UTF8
*/
export const UNICODE_NORMALISATION_FORMS = ["NFD", "NFC", "NFKD", "NFKC"];
/**
 * Classifies a byte buffer as not UTF8, plain ASCII, or multi-byte UTF8.
 *
 * Only tab, line feed, carriage return and 0x20-0x7E are treated as ASCII.
 * Any other byte has to start a well-formed multi-byte sequence: overlong
 * forms, surrogate code points and values above U+10FFFF are all rejected,
 * as are sequences truncated by the end of the buffer.
 *
 * @param {ArrayBuffer} data
 * @returns {number} - 0 = not UTF8, 1 = ASCII, 2 = UTF8
 */
export function isUTF8(data) {
    const buf = new Uint8Array(data);
    // Reads past the end of the buffer give undefined, which fails both comparisons
    const between = (value, lo, hi) => lo <= value && value <= hi;
    const isTrail = value => between(value, 0x80, 0xBF);

    let multibyteSeen = false;
    let pos = 0;

    while (pos < buf.length) {
        const lead = buf[pos];

        // Single byte: tab, LF, CR or printable ASCII
        if (lead === 0x09 || lead === 0x0A || lead === 0x0D || between(lead, 0x20, 0x7E)) {
            pos += 1;
            continue;
        }

        multibyteSeen = true;
        const second = buf[pos + 1];

        // Two bytes: leads 0xC0 and 0xC1 would be overlong, so start at 0xC2
        if (between(lead, 0xC2, 0xDF)) {
            if (!isTrail(second)) return 0;
            pos += 2;
            continue;
        }

        // Three bytes: the allowed range of the second byte depends on the lead
        if (between(lead, 0xE0, 0xEF)) {
            let lo = 0x80;
            let hi = 0xBF;
            if (lead === 0xE0) {
                lo = 0xA0; // excluding overlongs
            } else if (lead === 0xED) {
                hi = 0x9F; // excluding surrogates
            }

            if (!(between(second, lo, hi) && isTrail(buf[pos + 2]))) return 0;
            pos += 3;
            continue;
        }

        // Four bytes: planes 1-3 (0xF0), planes 4-15 (0xF1-0xF3), plane 16 (0xF4)
        if (between(lead, 0xF0, 0xF4)) {
            const lo = lead === 0xF0 ? 0x90 : 0x80;
            const hi = lead === 0xF4 ? 0x8F : 0xBF;

            if (!(between(second, lo, hi) && isTrail(buf[pos + 2]) && isTrail(buf[pos + 3]))) return 0;
            pos += 4;
            continue;
        }

        // Anything else cannot start a valid sequence
        return 0;
    }

    return multibyteSeen ? 2 : 1;
}
79 changes: 2 additions & 77 deletions src/core/lib/Magic.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import Utils, { isWorkerEnvironment } from "../Utils.mjs";
import Recipe from "../Recipe.mjs";
import Dish from "../Dish.mjs";
import {detectFileType, isType} from "./FileType.mjs";
import {isUTF8} from "./ChrEnc.mjs";
import chiSquared from "chi-squared";

/**
Expand Down Expand Up @@ -111,82 +112,6 @@ class Magic {
};
}

/**
* Detects whether the input buffer is valid UTF8.
*
* @returns {boolean}
*/
isUTF8() {
const bytes = new Uint8Array(this.inputBuffer);
let i = 0;
while (i < bytes.length) {
if (( // ASCII
bytes[i] === 0x09 ||
bytes[i] === 0x0A ||
bytes[i] === 0x0D ||
(0x20 <= bytes[i] && bytes[i] <= 0x7E)
)) {
i += 1;
continue;
}

if (( // non-overlong 2-byte
(0xC2 <= bytes[i] && bytes[i] <= 0xDF) &&
(0x80 <= bytes[i+1] && bytes[i+1] <= 0xBF)
)) {
i += 2;
continue;
}

if (( // excluding overlongs
bytes[i] === 0xE0 &&
(0xA0 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF)
) ||
( // straight 3-byte
((0xE1 <= bytes[i] && bytes[i] <= 0xEC) ||
bytes[i] === 0xEE ||
bytes[i] === 0xEF) &&
(0x80 <= bytes[i + 1] && bytes[i+1] <= 0xBF) &&
(0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
) ||
( // excluding surrogates
bytes[i] === 0xED &&
(0x80 <= bytes[i+1] && bytes[i+1] <= 0x9F) &&
(0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
)) {
i += 3;
continue;
}

if (( // planes 1-3
bytes[i] === 0xF0 &&
(0x90 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
) ||
( // planes 4-15
(0xF1 <= bytes[i] && bytes[i] <= 0xF3) &&
(0x80 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
) ||
( // plane 16
bytes[i] === 0xF4 &&
(0x80 <= bytes[i + 1] && bytes[i + 1] <= 0x8F) &&
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
)) {
i += 4;
continue;
}

return false;
}

return true;
}

/**
* Calculates the Shannon entropy of the input data.
*
Expand Down Expand Up @@ -336,7 +261,7 @@ class Magic {
data: this.inputStr.slice(0, 100),
languageScores: this.detectLanguage(extLang),
fileType: this.detectFileType(),
isUTF8: this.isUTF8(),
isUTF8: !!isUTF8(this.inputBuffer),
entropy: this.calcEntropy(),
matchingOps: matchingOps,
useful: useful,
Expand Down
8 changes: 4 additions & 4 deletions src/web/App.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -500,22 +500,22 @@ class App {
// Input Character Encoding
// Must be set before the input is loaded
if (this.uriParams.ienc) {
this.manager.input.chrEncChange(parseInt(this.uriParams.ienc, 10));
this.manager.input.chrEncChange(parseInt(this.uriParams.ienc, 10), true);
}

// Output Character Encoding
if (this.uriParams.oenc) {
this.manager.output.chrEncChange(parseInt(this.uriParams.oenc, 10));
this.manager.output.chrEncChange(parseInt(this.uriParams.oenc, 10), true);
}

// Input EOL sequence
if (this.uriParams.ieol) {
this.manager.input.eolChange(this.uriParams.ieol);
this.manager.input.eolChange(this.uriParams.ieol, true);
}

// Output EOL sequence
if (this.uriParams.oeol) {
this.manager.output.eolChange(this.uriParams.oeol);
this.manager.output.eolChange(this.uriParams.oeol, true);
}

// Read in input data from URI params
Expand Down
6 changes: 5 additions & 1 deletion src/web/stylesheets/components/_operation.css
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ select.arg {
min-width: 100px;
}

select.arg.form-control:not([size]):not([multiple]), select.custom-file-control:not([size]):not([multiple]) {
height: 100% !important;
}

textarea.arg {
min-height: 74px;
resize: vertical;
Expand All @@ -80,7 +84,7 @@ div.toggle-string {

input.toggle-string {
border-top-right-radius: 0 !important;
height: 42px !important;
height: 100%;
}

.operation [class^='bmd-label'],
Expand Down
51 changes: 45 additions & 6 deletions src/web/utils/statusBar.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ class StatusBarPanel {
this.eolHandler = opts.eolHandler;
this.chrEncHandler = opts.chrEncHandler;
this.chrEncGetter = opts.chrEncGetter;
this.getEncodingState = opts.getEncodingState;
this.getEOLState = opts.getEOLState;
this.htmlOutput = opts.htmlOutput;

this.eolVal = null;
Expand Down Expand Up @@ -115,7 +117,7 @@ class StatusBarPanel {

if (isNaN(chrEncVal)) return;

this.chrEncHandler(chrEncVal);
this.chrEncHandler(chrEncVal, true);
this.updateCharEnc(chrEncVal);
hideElement(e.target.closest(".cm-status-bar-select-content"));
}
Expand Down Expand Up @@ -212,12 +214,31 @@ class StatusBarPanel {
* @param {EditorState} state
*/
updateEOL(state) {
if (state.lineBreak === this.eolVal) return;
if (this.getEOLState() < 2 && state.lineBreak === this.eolVal) return;

const val = this.dom.querySelector(".eol-value");
const button = val.closest(".cm-status-bar-select-btn");
const eolCode = eolSeqToCode[state.lineBreak];
const eolName = eolCodeToName[eolCode];
let eolCode = eolSeqToCode[state.lineBreak];
let eolName = eolCodeToName[eolCode];

switch (this.getEOLState()) {
case 1: // Detected
val.classList.add("font-italic");
eolCode += " (detected)";
eolName += " (detected)";
// Pulse
val.classList.add("pulse");
setTimeout(() => {
val.classList.remove("pulse");
}, 2000);
break;
case 0: // Unset
case 2: // Manually set
default:
val.classList.remove("font-italic");
break;
}

val.textContent = eolCode;
button.setAttribute("title", `End of line sequence:<br>${eolName}`);
button.setAttribute("data-original-title", `End of line sequence:<br>${eolName}`);
Expand All @@ -230,12 +251,30 @@ class StatusBarPanel {
*/
updateCharEnc() {
const chrEncVal = this.chrEncGetter();
if (chrEncVal === this.chrEncVal) return;
if (this.getEncodingState() < 2 && chrEncVal === this.chrEncVal) return;

const name = CHR_ENC_SIMPLE_REVERSE_LOOKUP[chrEncVal] ? CHR_ENC_SIMPLE_REVERSE_LOOKUP[chrEncVal] : "Raw Bytes";
let name = CHR_ENC_SIMPLE_REVERSE_LOOKUP[chrEncVal] ? CHR_ENC_SIMPLE_REVERSE_LOOKUP[chrEncVal] : "Raw Bytes";

const val = this.dom.querySelector(".chr-enc-value");
const button = val.closest(".cm-status-bar-select-btn");

switch (this.getEncodingState()) {
case 1: // Detected
val.classList.add("font-italic");
name += " (detected)";
// Pulse
val.classList.add("pulse");
setTimeout(() => {
val.classList.remove("pulse");
}, 2000);
break;
case 0: // Unset
case 2: // Manually set
default:
val.classList.remove("font-italic");
break;
}

val.textContent = name;
button.setAttribute("title", `${this.label} character encoding:<br>${name}`);
button.setAttribute("data-original-title", `${this.label} character encoding:<br>${name}`);
Expand Down
Loading

0 comments on commit 65ffd8d

Please sign in to comment.