Skip to content
This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Commit

Permalink
Fix links being parsed as markdown links improperly
Browse files Browse the repository at this point in the history
Fixes #4674
  • Loading branch information
Dariusz Niemczyk committed Nov 25, 2021
1 parent ea97c41 commit e07ecd5
Show file tree
Hide file tree
Showing 2 changed files with 240 additions and 4 deletions.
135 changes: 131 additions & 4 deletions src/Markdown.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.

import * as commonmark from 'commonmark';
import { escape } from "lodash";
import { logger } from 'matrix-js-sdk/src/logger';
import * as linkify from 'linkifyjs';

const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u'];

Expand All @@ -27,8 +29,11 @@ const TEXT_NODES = ['text', 'softbreak', 'linebreak', 'paragraph', 'document'];
interface CommonmarkHtmlRendererInternal extends commonmark.HtmlRenderer {
paragraph: (node: commonmark.Node, entering: boolean) => void;
link: (node: commonmark.Node, entering: boolean) => void;
html_inline: (node: commonmark.Node) => void; // eslint-disable-line camelcase
html_inline: (node: commonmark.Node) => void; // eslint-disable-line camelcazse
html_block: (node: commonmark.Node) => void; // eslint-disable-line camelcase
text: (node: commonmark.Node) => void;
out: (text: string) => void;
emph: (node: commonmark.Node) => void;
}

function isAllowedHtmlTag(node: commonmark.Node): boolean {
Expand Down Expand Up @@ -61,6 +66,33 @@ function isMultiLine(node: commonmark.Node): boolean {
return par.firstChild != par.lastChild;
}

function getTextUntilEndOrLinebreak(node: commonmark.Node) {
let currentNode = node;
let text = '';
while (currentNode !== null && currentNode.type !== 'softbreak' && currentNode.type !== 'linebreak') {
const { literal, type } = currentNode;
if (type === 'text' && literal) {
let n = 0;
let char = literal[n];
while (char !== ' ' && char !== null && n <= literal.length) {
if (char === ' ') {
break;
}
if (char) {
text += char;
}
n += 1;
char = literal[n];
}
if (char === ' ') {
break;
}
}
currentNode = currentNode.next;
}
return text;
}

/**
* Class that wraps commonmark, adding the ability to see whether
* a given message actually uses any markdown syntax or whether
Expand All @@ -70,11 +102,87 @@ export default class Markdown {
private input: string;
private parsed: commonmark.Node;

constructor(input) {
constructor(input: string) {
this.input = input;

const parser = new commonmark.Parser();
this.parsed = parser.parse(this.input);
this.parsed = this.escapeLinks(this.parsed);
}

private escapeLinks(parsed: commonmark.Node) {
const walker = parsed.walker();
let event: commonmark.NodeWalkingStep = null;
let text = '';
let isInPara = false;
let previousNode: commonmark.Node | null = null;
while ((event = walker.next())) {
const { node } = event;
if (node.type === 'paragraph') {
if (event.entering) {
isInPara = true;
} else {
isInPara = false;
}
}
if (isInPara) {
// Clear saved string when line ends
if (
node.type === 'softbreak' ||
node.type === 'linebreak' ||
// Also start calculating the text from the beginning on any spaces
(node.type === 'text' && node.literal === ' ')
) {
text = '';
}
if (node.type === 'text') {
text += node.literal;
}
// We should not do this if previous node was not a textnode, as we can't combine it then.
if (node.type === 'emph' && previousNode.type === 'text') {
if (event.entering) {
const foundLinks = linkify.find(text);
for (const { value } of foundLinks) {
if (node.firstChild.literal) {
/**
* NOTE: This technically should unlink the emph node and create LINK nodes instead, adding all the next elements as siblings
* but this solution seems to work well and is hopefully slightly easier to understand too
*/
const nonEmphasizedText = `_${node.firstChild.literal}_`;
const f = getTextUntilEndOrLinebreak(node);
const newText = value + nonEmphasizedText + f;
const newLinks = linkify.find(newText);
// Should always find only one link here, if it finds more it means that the algorithm is broken
if (newLinks.length === 1) {
const emphasisTextNode = new commonmark.Node('text');
emphasisTextNode.literal = nonEmphasizedText;
previousNode.insertAfter(emphasisTextNode);
node.firstChild.literal = '';
// Empty string is a sign that we should ignore it in HTML rendering
node.literal = '';
} else {
logger.error(
"Markdown links escaping found too many links for following text: ",
text,
);
logger.error(
"Markdown links escaping found too many links for modified text: ",
newText,
);
}
}
}
} else {
node.firstChild.literal = '';
// Empty string is a sign that we should ignore it in HTML rendering
node.literal = '';
// node.unlink();
}
}
}
previousNode = node;
}
return parsed;
}

isPlainText(): boolean {
Expand Down Expand Up @@ -120,9 +228,7 @@ export default class Markdown {
// you can nest them.
//
// Let's try sending with <p/>s anyway for now, though.

const realParagraph = renderer.paragraph;

renderer.paragraph = function(node: commonmark.Node, entering: boolean) {
// If there is only one top level node, just return the
// bare text: it's a single line of text and so should be
Expand Down Expand Up @@ -175,6 +281,17 @@ export default class Markdown {
*/
};

const realEmph = renderer.emph;

renderer.emph = function(node: commonmark.Node) {
// We're escaping links with emphasis in the middle of it to act like a real link
// This empty string check here is to verify that we have modified the emphasis node properly
if (node.type === 'emph' && node.literal === "") {
return;
}
return realEmph(node);
};

return renderer.render(this.parsed);
}

Expand Down Expand Up @@ -205,6 +322,16 @@ export default class Markdown {
if (isMultiLine(node) && node.next) this.lit('\n\n');
};

const realEmph = renderer.emph;
renderer.emph = function(node: commonmark.Node) {
// We're escaping links with emphasis in the middle of it to act like a real link
// This empty string check here is to verify that we have modified the emphasis node properly
if (node.type === 'emph' && node.literal === "") {
return;
}
return realEmph(node);
};

return renderer.render(this.parsed);
}
}
109 changes: 109 additions & 0 deletions test/Markdown-test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/**
* @jest-environment jsdom
*/
import Markdown from "../src/Markdown";

describe("Markdown parser test", () => {
describe("autolinks", () => {
const testString = [
"Test1:",
"#_foonetic_xkcd:matrix.org",
"http://google.com/_thing_",
"https://matrix.org/_matrix/client/foo/123_",
"#_foonetic_xkcd:matrix.org",
"",
"Test1A:",
"#_foonetic_xkcd:matrix.org",
"http://google.com/_thing_",
"https://matrix.org/_matrix/client/foo/123_",
"#_foonetic_xkcd:matrix.org",
"",
"Test2:",
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
"",
"Test3:",
"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org",
"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org",
].join("\n");

it('tests that links are getting properly HTML formatted', () => {
/* eslint-disable max-len */
const expectedResult = [
"<p>Test1:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>",
"<p>Test1A:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>",
"<p>Test2:<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</p>",
"<p>Test3:<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</p>",
"",
].join("\n");
/* eslint-enable max-len */
const md = new Markdown(testString);
expect(md.toHTML()).toEqual(expectedResult);
});
it('tests that links with autolinks are not touched at all and are still properly formatted', () => {
const test = [
"Test1:",
"<#_foonetic_xkcd:matrix.org>",
"<http://google.com/_thing_>",
"<https://matrix.org/_matrix/client/foo/123_>",
"<#_foonetic_xkcd:matrix.org>",
"",
"Test1A:",
"<#_foonetic_xkcd:matrix.org>",
"<http://google.com/_thing_>",
"<https://matrix.org/_matrix/client/foo/123_>",
"<#_foonetic_xkcd:matrix.org>",
"",
"Test2:",
"<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>",
"<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>",
"",
"Test3:",
"<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>",
"<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>",
].join("\n");
/* eslint-disable max-len */
/**
* NOTE: I'm not entirely sure if those "<"" and ">" should be visible in here for #_foonetic_xkcd:matrix.org
* but it seems to be actually working properly
*/
const expectedResult = [
"<p>Test1:<br />&lt;#_foonetic_xkcd:matrix.org&gt;<br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br />&lt;#_foonetic_xkcd:matrix.org&gt;</p>",
"<p>Test1A:<br />&lt;#_foonetic_xkcd:matrix.org&gt;<br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br />&lt;#_foonetic_xkcd:matrix.org&gt;</p>",
"<p>Test2:<br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a><br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a></p>",
"<p>Test3:<br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a><br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a></p>",
"",
].join("\n");
/* eslint-enable max-len */
const md = new Markdown(test);
expect(md.toHTML()).toEqual(expectedResult);
});

it('expects that links in codeblock are not modified', () => {
const expectedResult = [
'<pre><code class="language-Test1:">#_foonetic_xkcd:matrix.org',
'http://google.com/_thing_',
'https://matrix.org/_matrix/client/foo/123_',
'#_foonetic_xkcd:matrix.org',
'',
'Test1A:',
'#_foonetic_xkcd:matrix.org',
'http://google.com/_thing_',
'https://matrix.org/_matrix/client/foo/123_',
'#_foonetic_xkcd:matrix.org',
'',
'Test2:',
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
'',
'Test3:',
'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org',
'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org```',
'</code></pre>',
'',
].join('\n');
const md = new Markdown("```" + testString + "```");
expect(md.toHTML()).toEqual(expectedResult);
});
});
});

0 comments on commit e07ecd5

Please sign in to comment.