Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix HTML-> MD Conversion #399

Merged
merged 4 commits into from
Jul 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 55 additions & 5 deletions __tests__/ExpensiMark-Markdown-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ const parser = new ExpensiMark();

test('Test bold HTML replacement', () => {
const boldTestStartString = 'This is a <strong>sentence,</strong> and it has some <strong>punctuation, words, and spaces</strong>. '
+ '<strong>test</strong> * testing* test*test*test. * testing * *testing * '
+ 'This is a <b>sentence,</b> and it has some <b>punctuation, words, and spaces</b>. '
+ '<b>test</b> * testing* test*test*test. * testing * *testing *';
+ '<strong>test</strong> * testing* test*test*test. * testing * *testing * '
+ 'This is a <b>sentence,</b> and it has some <b>punctuation, words, and spaces</b>. '
+ '<b>test</b> * testing* test*test*test. * testing * *testing *';
const boldTestReplacedString = 'This is a *sentence,* and it has some *punctuation, words, and spaces*. '
+ '*test* * testing* test*test*test. * testing * *testing * '
+ 'This is a *sentence,* and it has some *punctuation, words, and spaces*. '
Expand All @@ -18,9 +18,9 @@ test('Test bold HTML replacement', () => {

// Checks that htmlToMarkdown() rewrites <em>…</em> and <i>…</i> as _…_ while the
// underscores that are already present in the plain text come through unchanged.
test('Test italic HTML replacement', () => {
    // First half exercises <em>, second half exercises <i>.
    const italicTestStartString = 'This is a <em>sentence,</em> and it has some <em>punctuation, words, and spaces</em>. <em>test</em> _ testing_ test_test_test. _ test _ _test _ '
        + 'This is a <i>sentence,</i> and it has some <i>punctuation, words, and spaces</i>. <i>test</i> _ testing_ test_test_test. _ test _ _test _';

    // Both halves are expected to produce the same underscore-delimited markdown.
    const italicTestReplacedString = 'This is a _sentence,_ and it has some _punctuation, words, and spaces_. _test_ _ testing_ test_test_test. _ test _ _test _ '
        + 'This is a _sentence,_ and it has some _punctuation, words, and spaces_. _test_ _ testing_ test_test_test. _ test _ _test _';

    expect(parser.htmlToMarkdown(italicTestStartString)).toBe(italicTestReplacedString);
});

Expand Down Expand Up @@ -64,3 +64,53 @@ test('Test HTML string with attributes', () => {

expect(parser.htmlToMarkdown(testString)).toBe(resultString);
});

// Clipboard-style HTML: the payload is wrapped in <html>/<body> tags, StartFragment/EndFragment
// comments and a heavily styled <span>. Only the inner text is expected in the markdown output.
test('Test HTML string with special Tags', () => {
    const testString = '<html>\n<body>\n<!--StartFragment--><span style="color: rgb(0, 0, 0); font-family: &quot;Times New Roman&quot;; font-size: medium; font-style: normal; font-variant-ligatures: normal; font-variant-caps: normal; font-weight: 400; letter-spacing: normal; orphans: 2; text-align: start; text-indent: 0px; text-transform: none; white-space: pre-wrap; widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px; text-decoration-thickness: initial; text-decoration-style: initial; text-decoration-color: initial; display: inline !important; float: none;">test message</span><!--EndFragment-->\n</body>\n</html>\n';
    const resultString = 'test message';

    expect(parser.htmlToMarkdown(testString)).toBe(resultString);
});


// The input mixes a <style> element, a <script> element and a paragraph.
// Only the paragraph text is expected to remain after conversion.
test('Test HTML string with Internal Tags', () => {
    const htmlWithInternalTags = `<style>
span {
color: rgb(0, 0, 0);
font-family: "Times New Roman";
font-size: medium;
font-style: normal;
font-variant-ligatures: normal;
font-variant-caps: normal;
font-weight: 400;
letter-spacing: normal;
orphans: 2;
text-align: start;
text-indent: 0px;
text-transform: none;
white-space: pre-wrap;
widows: 2;
word-spacing: 0px;
-webkit-text-stroke-width: 0px;
text-decoration-thickness: initial;
text-decoration-style: initial;
text-decoration-color: initial;
display: inline !important;
float: none;
}
</style>
<script type="text/javascript">
document.write('Hacked');
</script>
<p>test message</p>`;

    const markdown = parser.htmlToMarkdown(htmlWithInternalTags);

    expect(markdown).toBe('test message');
});

// Encoded entities in the input (&amp; and &quot;) are expected to come back
// as their literal characters in the markdown output.
test('Test HTML string with encoded entities', () => {
    const encodedInput = 'Text Entity &amp; &quot;';
    const markdown = parser.htmlToMarkdown(encodedInput);

    expect(markdown).toBe('Text Entity & "');
});
26 changes: 23 additions & 3 deletions lib/ExpensiMark.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import _ from 'underscore';
import Str from './str';
import TLD_REGEX from './tlds';

Expand Down Expand Up @@ -33,7 +34,7 @@ export default class ExpensiMark {
// with the new lines here since they need to be converted into <br>. And we don't
// want to do this anywhere else since that would break HTML.
// &nbsp; will create styling issues so use &#32;
replacement: (match, _, textWithinFences) => {
replacement: (match, __, textWithinFences) => {
const group = textWithinFences.replace(/(?:(?![\n\r])\s)/g, '&#32;');
return `<pre>${group}</pre>`;
},
Expand Down Expand Up @@ -170,14 +171,32 @@ export default class ExpensiMark {
* @type {Object[]}
*/
this.htmlToMarkdownRules = [
{
name: 'Strip Special Tags',

// Matches an opening or closing <html> / <body> tag (attributes allowed, quoted values may
// contain '>'), together with at most one newline (\n or \r\n) directly before and after it.
// The negative lookahead skips a tag when the text up to the next '<' is immediately
// followed by </pre> or </code>. Every match is replaced with an empty string.
regex: /(\n|\r\n)?<\/?(html|body)(?:"[^"]*"|'[^']*'|[^'"><])*>(?![^<]*(<\/pre>|<\/code>))(\n|\r\n)?/gim,
replacement: ''
},

// Drops <script> and <style> elements entirely: the opening tag, everything up to the
// matching closing tag (the \1 backreference ties the closing name to the opening one),
// and at most one newline that directly follows.
{
name: 'exclude',

// Built from string pieces, so regex escapes are doubled (\\s, \\/, \\1). The plain \n and
// \r\n in the second piece are real newline characters by the time RegExp sees them.
// The negative lookahead skips a match when the text up to the next '<' is immediately
// followed by </pre> or </code>.
regex: new RegExp(
[
'<(script|style)(?:"[^"]*"|\'[^\']*\'|[^\'">])*>([\\s\\S]*?)<\\/\\1>',
'(?![^<]*(<\\/pre>|<\\/code>))(\n|\r\n)?'
].join(''),
'gim'
),
replacement: '',
},
{
name: 'newline',

// Pre-processing step: collapses '<br></br>' and '<br><br/>' into a single '<br/>'.
// NOTE(review): String.prototype.replace with a string pattern only replaces the first
// occurrence, so later '<br></br>' / '<br><br/>' pairs are left as-is — confirm that is intended.
pre: inputString => inputString.replace('<br></br>', '<br/>').replace('<br><br/>', '<br/>'),

// Include the immediately followed newline as `<br>\n` should be equal to one \n.
// NOTE(review): `regex` is declared twice in this literal; the later declaration is the one
// that takes effect. It differs from the earlier one only by also excluding '<' from the
// attribute run. The negative lookahead skips a <br> when the text up to the next '<' is
// immediately followed by </pre> or </code>.
regex: /<br(?:"[^"]*"|'[^']*'|[^'">])*>(?![^<]*(<\/pre>|<\/code>))\n?/gi,
regex: /<br(?:"[^"]*"|'[^']*'|[^'"><])*>(?![^<]*(<\/pre>|<\/code>))\n?/gi,
replacement: '\n'
},
{
Expand Down Expand Up @@ -298,7 +317,8 @@ export default class ExpensiMark {
* @returns {String}
*/
htmlToMarkdown(htmlString) {
let generatedMarkdown = htmlString;
let generatedMarkdown = _.unescape(htmlString);

this.htmlToMarkdownRules.forEach((rule) => {
// Pre-processes input HTML before applying regex
if (rule.pre) {
Expand Down