Merge pull request #675 from brandonhenry/add-markdown-trunctation-util

ExpensiMark: Add New Markdown Truncation Utility
Expensify · Jul 15, 2024 · cd42f4e · cd42f4e
2 parents 629f703 + 01311ad
commit cd42f4e
Show file tree

Hide file tree

Showing 2 changed files with 307 additions and 0 deletions.
diff --git a/__tests__/ExpensiMark-Markdown-Truncate-test.js b/__tests__/ExpensiMark-Markdown-Truncate-test.js
@@ -0,0 +1,108 @@
+import ExpensiMark from '../lib/ExpensiMark';
+
+const parser = new ExpensiMark(); // shared instance: replace() builds the HTML input, truncateHTML() is the method under test
+
+describe('truncateHTML', () => {
+    test('should return original text as HTML if it does not exceed the limit', () => {
+        const markdown = 'This is a *short* text that does not exceed the character limit.';
+        const limit = 100; // larger than the text, so the HTML must come back unchanged
+        const html = parser.replace(markdown);
+        const result = parser.truncateHTML(html, limit, {ellipsis: true}); // a truthy ellipsis enables the trailing '...'
+        expect(result).toBe(html);
+    });
+
+    test('should truncate HTML and add ellipsis if it exceeds the limit', () => {
+        const markdown =
+            'This is a *long* text that exceeds the character limit. It contains multiple sentences to test the truncation functionality. The truncation should occur at the specified character count.';
+        const limit = 80; // counts text characters only; the <strong> tags in the expectation are not counted
+        const html = parser.replace(markdown);
+        const result = parser.truncateHTML(html, limit, {ellipsis: true});
+        expect(result).toBe('This is a <strong>long</strong> text that exceeds the character limit. It contains multiple sente...');
+    });
+
+    test('should handle HTML with multiple markdown elements', () => {
+        const markdown =
+            'This is a *long* text with _multiple_ markdown ~elements~. It includes *bold*, _italic_, and ~strikethrough~ formatting. The truncation should preserve the markdown syntax.';
+        const limit = 150;
+        const html = parser.replace(markdown);
+        const result = parser.truncateHTML(html, limit, {ellipsis: true});
+        expect(result).toBe(
+            'This is a <strong>long</strong> text with <em>multiple</em> markdown <del>elements</del>. It includes <strong>bold</strong>, <em>italic</em>, and <del>strikethrough</del> formatting. The truncation should preserve the markdo...',
+        );
+    });
+
+    test('should handle HTML with nested markdown elements', () => {
+        const markdown = 'This is a *long _nested_ markdown* text. It contains *_bold italic_* and ~_strikethrough italic_~ formatting. The truncation should handle the nesting correctly.';
+        const limit = 120;
+        const html = parser.replace(markdown);
+        const result = parser.truncateHTML(html, limit, {ellipsis: true});
+        expect(result).toBe(
+            'This is a <strong>long <em>nested</em> markdown</strong> text. It contains <strong><em>bold italic</em></strong> and <del><em>strikethrough italic</em></del> formatting. The truncation should ...',
+        );
+    });
+
+    test('should handle HTML with links', () => {
+        const markdown =
+            'This is a text with [a link](https://example.com) that exceeds the limit. The link should be preserved in the truncated text. Additionally, it includes [another link](https://example.org) for testing purposes.';
+        const limit = 141;
+        const html = parser.replace(markdown);
+        const result = parser.truncateHTML(html, limit, {ellipsis: true});
+        expect(result).toBe(
+            `This is a text with <a href=\"https://example.com\" target=\"_blank\" rel=\"noreferrer noopener\">a link</a> that exceeds the limit. The link should be preserved in the truncated text. Additionally, it includes <a href=\"https://example.org\" target=\"_blank\" rel=\"noreferrer noopener\">another link</a>...`,
+        );
+    });
+
+    test('should handle HTML with inline code', () => {
+        const markdown = 'This is a text with `inline code`. The inline code should be preserved in the truncated text.';
+        const limit = 25; // 20 characters of leading text plus 5 inside the <code> element
+        const html = parser.replace(markdown);
+        const result = parser.truncateHTML(html, limit, {ellipsis: true});
+        expect(result).toBe('This is a text with <code>inlin...</code>'); // ellipsis is emitted before the open <code> is closed
+    });
+
+    test('should handle HTML with headings', () => {
+        const markdown =
+            '# Heading 1\n\nThis is a text with headings that exceeds the limit. The headings should be preserved in the truncated text. Here is an example of a heading:\n\n## Heading 2\n\nThe truncation should handle headings correctly.';
+        const limit = 100;
+        const html = parser.replace(markdown);
+        const result = parser.truncateHTML(html, limit, {ellipsis: true});
+        expect(result).toBe('<h1>Heading 1</h1><br />This is a text with headings that exceeds the limit. The headings should be preserved in th...');
+    });
+
+    test('should handle HTML with lists', () => {
+        const markdown = `
+    This is a text with lists. Here are examples:
+    
+    Unordered list:
+    - Item 1
+    - Item 2
+    - Item 3
+    
+    Ordered list:
+    1. First item
+    2. Second item
+    3. Third item
+    `;
+        const limit = 250; // Increased to accommodate full lists
+        const html = parser.replace(markdown);
+        const result = parser.truncateHTML(html, limit, {ellipsis: false}); // ellipsis disabled; nothing is appended
+        expect(result).toBe(`<br />    This is a text with lists. Here are examples:<br />    <br />    Unordered list:<br />    - Item 1<br />    - Item 2<br />    - Item 3<br />    <br />    Ordered list:<br />    1. First item<br />    2. Second item<br />    3. Third item<br />    `);
+    });
+
+    test('should handle HTML with horizontal rules', () => {
+        const markdown = 'This is a text with horizontal rules. Here is an example of a horizontal rule:\n\n---\n\nThe truncation should handle horizontal rules correctly.';
+        const limit = 100;
+        const html = parser.replace(markdown);
+        const result = parser.truncateHTML(html, limit, {ellipsis: true});
+        expect(result).toBe('This is a text with horizontal rules. Here is an example of a horizontal rule:<br /><br />---<br /><br />The truncation shou...');
+    });
+
+    test('should handle HTML with images', () => {
+        const markdown =
+            'This is a text with an image ![alt text](https://example.com/image.jpg) that exceeds the limit. The image should be preserved in the truncated text. Additionally, it includes another image ![alt text 2](https://example.org/image2.jpg) for testing purposes.';
+        const limit = 80; // the self-closing <img /> tag is kept and does not count towards the limit
+        const html = parser.replace(markdown);
+        const result = parser.truncateHTML(html, limit, {ellipsis: true});
+        expect(result).toBe('This is a text with an image <img src="https://example.com/image.jpg" alt="alt text" /> that exceeds the limit. The image should be preser...');
+    });
+});
diff --git a/lib/ExpensiMark.ts b/lib/ExpensiMark.ts
@@ -42,6 +42,13 @@ type ReplaceOptions = {
     shouldKeepRawInput?: boolean;
 };
 
+type TruncateOptions = {
+    ellipsis?: string; // truthy: truncateHTML appends an ellipsis when content is cut off; it defaults this to '...'
+    truncateLastWord?: boolean; // falsy: getEndPosition moves the cut to a word boundary; truncateHTML defaults this to true
+    slop?: number; // size of the window around the cut that getEndPosition scans; truncateHTML defaults it to min(10, maxLength)
+    removeImageTag?: boolean; // truthy: truncateHTML strips the first <img> tag from its result
+};
+
 const MARKDOWN_LINK_REGEX = new RegExp(`\\[([^\\][]*(?:\\[[^\\][]*][^\\][]*)*)]\\(${UrlPatterns.MARKDOWN_URL_REGEX}\\)(?![^<]*(<\\/pre>|<\\/code>))`, 'gi');
 const MARKDOWN_IMAGE_REGEX = new RegExp(`\\!(?:\\[([^\\][]*(?:\\[[^\\][]*][^\\][]*)*)])?\\(${UrlPatterns.MARKDOWN_URL_REGEX}\\)(?![^<]*(<\\/pre>|<\\/code>))`, 'gi');
 
@@ -1259,6 +1266,198 @@ export default class ExpensiMark {
         return Utils.escape(originalContent);
     }
 
+    /**
+     * Computes the index at which the remaining content should be cut so that the output stays close to
+     * `maxLength`, optionally moving the cut to a nearby word boundary.
+     *
+     * The hard cut (`maxLength - totalLength`) is returned unchanged when `opts.slop` is falsy or
+     * `opts.truncateLastWord` is truthy. Otherwise a window of roughly `slop` characters on each side of
+     * the hard cut (ending at `tailPosition` when one is given) is scanned for runs of non-word
+     * characters: the first run at or after the hard cut wins, and if there is none the last run before
+     * it is used. A whitespace character directly in front of the chosen index is dropped as well.
+     *
+     * @param {string} content - The HTML content to be truncated.
+     * @param {number} [tailPosition] - The position up to which the content should be considered.
+     * @param {number} maxLength - The maximum length of the truncated content.
+     * @param {number} totalLength - The length of the content processed so far.
+     * @param {Object} opts - Options to customize the truncation (`slop` and `truncateLastWord` are read).
+     * @returns {number} The calculated position to truncate the content.
+     */
+    getEndPosition(content: string, tailPosition: number | undefined, maxLength: number, totalLength: number, opts: TruncateOptions) {
+        // Hard cut: the number of characters still available before maxLength is reached
+        const defaultPosition = maxLength - totalLength;
+
+        // Without a slop window, or when cutting inside a word is allowed, the hard cut is used as is
+        const slop = opts.slop;
+        if (!slop || opts.truncateLastWord) {
+            return defaultPosition;
+        }
+
+        // Runs of non-word characters are the candidate cut points
+        const WORD_BREAK_REGEX = /\W+/g;
+        // Start from the hard cut and adjust it below
+        let position = defaultPosition;
+
+        // When fewer than `slop` characters are available, the scanned window starts at index 0
+        const isShort = defaultPosition < slop;
+
+        // Offset inside the window that word boundaries are compared against
+        const slopPos = isShort ? defaultPosition : slop - 1;
+
+        // Window to scan: around the hard cut, ending at tailPosition when one is given
+        const windowStart = isShort ? 0 : defaultPosition - slop;
+        const windowEnd = tailPosition !== undefined ? tailPosition : defaultPosition + slop;
+        const substr = content.slice(windowStart, windowEnd);
+
+        if (tailPosition && substr.length <= tailPosition) {
+            // A tail position is given and the window is no longer than it:
+            // the window's length becomes the cut position
+            position = substr.length;
+        } else {
+            // Each boundary is fetched with its own exec() call. Looping on a single, never refreshed
+            // match object would spin forever as soon as a boundary lies before the hard cut.
+            for (let wordBreak = WORD_BREAK_REGEX.exec(substr); wordBreak !== null; wordBreak = WORD_BREAK_REGEX.exec(substr)) {
+                // Shift the cut by the distance between this boundary and the hard cut (negative = earlier)
+                position = defaultPosition + (wordBreak.index - slopPos);
+
+                // Stop at the first boundary at or after the hard cut. Before it, keep looking for a closer
+                // one unless the boundary is at the very start and at most one character is available.
+                if (wordBreak.index >= slopPos || (wordBreak.index === 0 && defaultPosition <= 1)) {
+                    break;
+                }
+            }
+        }
+
+        // Do not leave a whitespace character dangling at the end of the kept text
+        if (/\s$/.test(content.charAt(position - 1))) {
+            position--;
+        }
+
+        return position;
+    }
+
+    /**
+     * Truncate HTML string and keep tag safe.
+     * pulled from https://github.com/huang47/nodejs-html-truncate/blob/master/lib/truncate.js
+     *
+     * Text in front of and between tags counts towards `maxLength`; the tags themselves are copied without
+     * being counted, and tags still open at the cut are closed at the end of the result.
+     *
+     * @param {string} html - The string that needs to be truncated
+     * @param {number} maxLength - Length of truncated string
+     * @param {Object} [options] - Optional configuration options
+     * @param {string} [options.ellipsis='...'] - Symbol appended when content is cut off; a falsy value disables it
+     * @param {boolean} [options.truncateLastWord=true] - Whether the cut may fall in the middle of a word
+     * @param {number} [options.slop] - Word-boundary search window, defaults to `Math.min(10, maxLength)`
+     * @param {boolean} [options.removeImageTag] - Whether the first `<img>` tag is stripped from the result
+     * @returns {string} The truncated string
+     */
+    truncateHTML(html: string, maxLength: number, options?: TruncateOptions) {
+        const DEFAULT_TRUNCATE_SYMBOL = '...';
+        const DEFAULT_SLOP = Math.min(10, maxLength);
+        const KEY_VALUE_REGEX = '((?:\\s+(?:\\w+|-)+(?:\\s*=\\s*(?:"(?:\\\\.|[^"\\\\])*"|\'(?:\\\\.|[^\'\\\\])*\'|[^\'">\\s]+))?)*)';
+        const IS_CLOSE_REGEX = '\\s*\\/?\\s*';
+        const CLOSE_REGEX = '\\s*\\/\\s*';
+        const SELF_CLOSE_REGEX = new RegExp(`<\\/?(\\w+)${KEY_VALUE_REGEX}${CLOSE_REGEX}>`);
+        const HTML_TAG_REGEX = new RegExp(`<\\/?(\\w+)${KEY_VALUE_REGEX}${IS_CLOSE_REGEX}>`);
+        const URL_REGEX = /(((ftp|https?):\/\/)[\\-\w@:%_\\+.~#?,&\\/\\/=]+)|((mailto:)?[_.\w\\-]+@([\w][\w\\-]+\.)+[a-zA-Z]{2,3})/g;
+        const IMAGE_TAG_REGEX = new RegExp(`<img\\s*${KEY_VALUE_REGEX}${CLOSE_REGEX}>`);
+        const tagsStack: string[] = [];
+        let truncatedContent = '';
+        let totalLength = 0;
+        let htmlString = html;
+        let matches: RegExpExecArray | null;
+
+        const opts = {
+            ellipsis: DEFAULT_TRUNCATE_SYMBOL,
+            truncateLastWord: true,
+            slop: DEFAULT_SLOP,
+            ...options,
+        };
+
+        // Removes the first <img> tag found in the given content
+        function removeImageTag(content: string): string {
+            const match = IMAGE_TAG_REGEX.exec(content);
+            if (!match) {
+                return content;
+            }
+
+            return content.substring(0, match.index) + content.substring(match.index + match[0].length);
+        }
+
+        // Builds the closing tags for everything still open, innermost first, without mutating the stack
+        function closeTags(tags: string[]): string {
+            return [...tags].reverse().map((mappedTag) => `</${mappedTag}>`).join('');
+        }
+
+        // do/while on purpose: the body has to run at least once, otherwise input that contains no tag
+        // at all would skip the loop entirely and its text would be dropped from the result
+        do {
+            matches = HTML_TAG_REGEX.exec(htmlString);
+
+            if (!matches) {
+                // No tag left: only plain text (possibly containing URLs) remains
+                if (totalLength >= maxLength) {
+                    break;
+                }
+
+                matches = URL_REGEX.exec(htmlString);
+                if (!matches || matches.index >= maxLength) {
+                    truncatedContent += htmlString.substring(0, this.getEndPosition(htmlString, undefined, maxLength, totalLength, opts));
+                    break;
+                }
+
+                // Emit the text up to the end of each URL match (behaviour inherited from upstream).
+                // NOTE(review): URL_REGEX is global, so exec() resumes at its lastIndex after htmlString is sliced
+                while (matches) {
+                    const url = matches[0];
+                    truncatedContent += htmlString.substring(0, matches.index + url.length - totalLength);
+                    htmlString = htmlString.substring(matches.index + url.length);
+                    matches = URL_REGEX.exec(htmlString);
+                }
+                break;
+            }
+
+            const tagText = matches[0];
+            const tagIndex = matches.index;
+
+            if (totalLength + tagIndex > maxLength) {
+                // The limit falls inside the text in front of this tag
+                truncatedContent += htmlString.substring(0, this.getEndPosition(htmlString, tagIndex, maxLength, totalLength, opts));
+                break;
+            }
+            totalLength += tagIndex;
+            truncatedContent += htmlString.substring(0, tagIndex);
+
+            // Track open tags: "</" pops, a self-closing tag leaves the stack alone, anything else pushes
+            let selfClose: RegExpExecArray | null = null;
+            if (tagText[1] === '/') {
+                tagsStack.pop();
+            } else {
+                selfClose = SELF_CLOSE_REGEX.exec(tagText);
+                if (!selfClose) {
+                    tagsStack.push(matches[1]);
+                }
+            }
+
+            // The tag is copied to the output without adding to totalLength
+            truncatedContent += selfClose ? selfClose[0] : tagText;
+            htmlString = htmlString.substring(tagIndex + tagText.length);
+        } while (matches);
+
+        if (htmlString.length > maxLength - totalLength && opts.ellipsis) {
+            // Honour a caller supplied symbol; any other truthy value (e.g. `true`) selects the default one
+            truncatedContent += typeof opts.ellipsis === 'string' ? opts.ellipsis : DEFAULT_TRUNCATE_SYMBOL;
+        }
+        truncatedContent += closeTags(tagsStack);
+
+        if (opts.removeImageTag) {
+            truncatedContent = removeImageTag(truncatedContent);
+        }
+
+        return truncatedContent;
+    }
+
     /**
      * Replaces text with a replacement based on a regex
      * @param text - The text to replace