From 2a2259a2608ef7e9420d9d3fa59f55530d662ae9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Edenstr=C3=B6m?= Date: Wed, 6 Oct 2021 21:40:33 +0200 Subject: [PATCH] fix: failing html parsing --- libs/api-skolplattformen/lib/parseHtml.ts | 49 ++++++++++++----------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/libs/api-skolplattformen/lib/parseHtml.ts b/libs/api-skolplattformen/lib/parseHtml.ts index a7847ed3f..62592412f 100644 --- a/libs/api-skolplattformen/lib/parseHtml.ts +++ b/libs/api-skolplattformen/lib/parseHtml.ts @@ -1,7 +1,7 @@ -import * as h2m from 'h2m' -import { htmlDecode } from 'js-htmlencode' +import h2m from 'h2m' import { decode } from 'he' -import { parse, HTMLElement, TextNode } from 'node-html-parser' +import { htmlDecode } from 'js-htmlencode' +import { HTMLElement, parse, TextNode } from 'node-html-parser' const noChildren = ['strong', 'b', 'em', 'i', 'u', 's'] const trimNodes = [...noChildren, 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'a'] @@ -40,45 +40,48 @@ const deepClean = (node: HTMLElement): HTMLElement => { return cleaned } -const rearrangeWhitespace = (html: string = ''): string => { +const rearrangeWhitespace = (html = ''): string => { let content = html - .replace(/]*>/gm, '') - .split('').join('') - .replace(/]*>/gm, '') - .split('').join('') - .split(' ').join('&nbsp;') - + .replace(/]*>/gm, '') + .split('') + .join('') + .replace(/]*>/gm, '') + .split('') + .join('') + .split(' ') + .join('&nbsp;') + // FIXME: Make a loop that doesn't break linting trimNodes.forEach((trimNode) => { content = content.split(`<${trimNode}> `).join(` <${trimNode}>`) content = content.split(` `).join(` `) content = content.split(`<${trimNode}>&nbsp;`).join(` <${trimNode}>`) - content = content.split(`&nbsp;`).join(` `) + content = content.split(`&nbsp;`).join(` `) }) - + trimNodes.forEach((trimNode) => { content = content.split(`<${trimNode}> `).join(` <${trimNode}>`) content = content.split(` `).join(` `) content = content.split(`<${trimNode}>&nbsp;`).join(` <${trimNode}>`) - content = content.split(`&nbsp;`).join(` `) + content = content.split(`&nbsp;`).join(` `) }) trimNodes.forEach((trimNode) => { content = content.split(`<${trimNode}> `).join(` <${trimNode}>`) content = content.split(` `).join(` `) content = content.split(`<${trimNode}>&nbsp;`).join(` <${trimNode}>`) - content = content.split(`&nbsp;`).join(` `) + content = content.split(`&nbsp;`).join(` `) }) trimNodes.forEach((trimNode) => { content = content.split(`<${trimNode}> `).join(` <${trimNode}>`) content = content.split(` `).join(` `) content = content.split(`<${trimNode}>&nbsp;`).join(` <${trimNode}>`) - content = content.split(`&nbsp;`).join(` `) + content = content.split(`&nbsp;`).join(` `) }) return content } -export const clean = (html: string = ''): string => +export const clean = (html = ''): string => deepClean(parse(decode(html))).outerHTML interface Node { @@ -93,15 +96,15 @@ const overides = { img: (node: Node) => `![${node.attrs.title || ''}](${node.attrs.src})`, i: (node: Node) => `*${node.md}*`, b: (node: Node) => `**${node.md}**`, - 'h1': (node: Node) => `# ${node.md}\n`, - 'h2': (node: Node) => `## ${node.md}\n`, - 'h3': (node: Node) => `### ${node.md}\n`, - 'h4': (node: Node) => `#### ${node.md}\n`, - 'h5': (node: Node) => `##### ${node.md}\n`, - 'h6': (node: Node) => `###### ${node.md}\n`, + h1: (node: Node) => `# ${node.md}\n`, + h2: (node: Node) => `## ${node.md}\n`, + h3: (node: Node) => `### ${node.md}\n`, + h4: (node: Node) => `#### ${node.md}\n`, + h5: (node: Node) => `##### ${node.md}\n`, + h6: (node: Node) => `###### ${node.md}\n`, } -export const toMarkdown = (html: string): string => { +export const toMarkdown = (html?: string): string => { const rearranged = rearrangeWhitespace(html) const trimmed = clean(rearranged) const markdown = h2m(trimmed, { overides, converter })