diff --git a/docs/extensibility.md b/docs/extensibility.md index 60d9b07337330..26d69f4118a96 100644 --- a/docs/extensibility.md +++ b/docs/extensibility.md @@ -74,3 +74,9 @@ There are some advanced block features which require opt-in support in the theme ## Autocomplete Autocompleters within blocks may be extended and overridden. See [autocomplete](../docs/extensibility/autocomplete.md). + +## Block Parsing and Serialization + +Posts in the editor move through a couple of different stages between being stored in `post_content` and appearing in the editor. Since the blocks themselves are data structures that live in memory it takes a parsing and serialization step to transform out from and into the stored format in the database. + +Customizing the parser is an advanced topic that you can learn more about in the [Extending the Parser](../docs/extensibility/parser.md) section. diff --git a/docs/extensibility/parser.md b/docs/extensibility/parser.md new file mode 100644 index 0000000000000..7c1e5bc1be7c2 --- /dev/null +++ b/docs/extensibility/parser.md @@ -0,0 +1,36 @@ +# Extending the Parser + +When the editor is interacting with blocks, these are stored in memory as data structures comprising a few basic properties and attributes. Upon saving a working post we serialize these data structures into a specific HTML structure and save the resultant string into the `post_content` property of the post in the WordPress database. When we load that post back into the editor we have to make the reverse transformation to build those data structures from the serialized format in HTML. + +The process of loading the serialized HTML into the editor is performed by the _block parser_. The formal specification for this transformation is encoded in the parsing expression grammar (PEG) inside the `@wordpress/block-serialization-spec-parser` package. 
The editor provides a default parser implementation of this grammar but there may be various reasons for replacing that implementation with a custom implementation. We can inject our own custom parser implementation through the appropriate filter. + +## Server-side parser + +Plugins have access to the parser if they want to process posts in their structured form instead of a plain HTML-as-string representation. + +## Client-side parser + +The editor uses the client-side parser while interactively working in a post. The plain HTML-as-string representation is sent to the browser by the backend and then the editor performs the first parse to initialize itself. + +## Filters + +To replace the server-side parser, use the `block_parser_class` filter. The filter transforms the string class name of a parser class. This class is expected to expose a `parse` method. + +_Example:_ + +```php +class EmptyParser { + public function parse( $post_content ) { + // return an empty document + return array(); + } +} + +function my_plugin_select_empty_parser( $prev_parser_class ) { + return 'EmptyParser'; +} + +add_filter( 'block_parser_class', 'my_plugin_select_empty_parser', 10, 1 ); +``` + +> **Note**: At the present time it's not possible to replace the client-side parser. 
diff --git a/docs/manifest.json b/docs/manifest.json index fccd97003d15e..19b6171d9ee41 100644 --- a/docs/manifest.json +++ b/docs/manifest.json @@ -287,6 +287,12 @@ "markdown_source": "https://raw.githubusercontent.com/WordPress/gutenberg/master/packages/block-library/README.md", "parent": "packages" }, + { + "title": "@wordpress/block-serialization-default-parser", + "slug": "packages-block-serialization-default-parser", + "markdown_source": "https://raw.githubusercontent.com/WordPress/gutenberg/master/packages/block-serialization-default-parser/README.md", + "parent": "packages" + }, { "title": "@wordpress/block-serialization-spec-parser", "slug": "packages-block-serialization-spec-parser", diff --git a/lib/blocks.php b/lib/blocks.php index 30c7059dc265b..92e89abddd646 100644 --- a/lib/blocks.php +++ b/lib/blocks.php @@ -66,8 +66,20 @@ function gutenberg_parse_blocks( $content ) { ); } - $parser = new Gutenberg_PEG_Parser; - return $parser->parse( _gutenberg_utf8_split( $content ) ); + /** + * Filter to allow plugins to replace the server-side block parser + * + * @since 3.8.0 + * + * @param string $parser_class Name of block parser class + */ + $parser_class = apply_filters( 'block_parser_class', 'WP_Block_Parser' ); + // Load default block parser for server-side parsing if the default parser class is being used. + if ( 'WP_Block_Parser' === $parser_class ) { + require_once dirname( __FILE__ ) . '/../packages/block-serialization-default-parser/parser.php'; + } + $parser = new $parser_class(); + return $parser->parse( $content ); } /** diff --git a/lib/client-assets.php b/lib/client-assets.php index 2df9bb3b85bfe..1eb93abffbc5a 100644 --- a/lib/client-assets.php +++ b/lib/client-assets.php @@ -275,6 +275,13 @@ function gutenberg_register_scripts_and_styles() { filemtime( gutenberg_dir_path() . 
'build/dom/index.js' ), true ); + wp_register_script( + 'wp-block-serialization-default-parser', + gutenberg_url( 'build/block-serialization-default-parser/index.js' ), + array(), + filemtime( gutenberg_dir_path() . 'build/block-serialization-default-parser/index.js' ), + true + ); wp_register_script( 'wp-block-serialization-spec-parser', gutenberg_url( 'build/block-serialization-spec-parser/index.js' ), @@ -386,7 +393,7 @@ function gutenberg_register_scripts_and_styles() { array( 'wp-autop', 'wp-blob', - 'wp-block-serialization-spec-parser', + 'wp-block-serialization-default-parser', 'wp-data', 'wp-deprecated', 'wp-dom', diff --git a/lib/load.php b/lib/load.php index 0f18f6e4d15f9..d049d5bb8ad43 100644 --- a/lib/load.php +++ b/lib/load.php @@ -29,7 +29,6 @@ require dirname( __FILE__ ) . '/compat.php'; require dirname( __FILE__ ) . '/plugin-compat.php'; require dirname( __FILE__ ) . '/i18n.php'; -require dirname( __FILE__ ) . '/parser.php'; require dirname( __FILE__ ) . '/register.php'; diff --git a/package-lock.json b/package-lock.json index 558a91a0374ac..07fc1bace1d38 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2080,6 +2080,12 @@ "url": "^0.11.0" } }, + "@wordpress/block-serialization-default-parser": { + "version": "file:packages/block-serialization-default-parser", + "requires": { + "@babel/runtime": "^7.0.0" + } + }, "@wordpress/block-serialization-spec-parser": { "version": "file:packages/block-serialization-spec-parser" }, @@ -2089,6 +2095,7 @@ "@babel/runtime": "^7.0.0", "@wordpress/autop": "file:packages/autop", "@wordpress/blob": "file:packages/blob", + "@wordpress/block-serialization-default-parser": "file:packages/block-serialization-default-parser", "@wordpress/block-serialization-spec-parser": "file:packages/block-serialization-spec-parser", "@wordpress/data": "file:packages/data", "@wordpress/deprecated": "file:packages/deprecated", diff --git a/package.json b/package.json index 0b78cbcabfe64..3948ca1f76016 100644 --- a/package.json 
+++ b/package.json @@ -20,6 +20,7 @@ "@wordpress/autop": "file:packages/autop", "@wordpress/blob": "file:packages/blob", "@wordpress/block-library": "file:packages/block-library", + "@wordpress/block-serialization-default-parser": "file:packages/block-serialization-default-parser", "@wordpress/block-serialization-spec-parser": "file:packages/block-serialization-spec-parser", "@wordpress/blocks": "file:packages/blocks", "@wordpress/components": "file:packages/components", diff --git a/packages/block-serialization-default-parser/.npmrc b/packages/block-serialization-default-parser/.npmrc new file mode 100644 index 0000000000000..43c97e719a5a8 --- /dev/null +++ b/packages/block-serialization-default-parser/.npmrc @@ -0,0 +1 @@ +package-lock=false diff --git a/packages/block-serialization-default-parser/CHANGELOG.md b/packages/block-serialization-default-parser/CHANGELOG.md new file mode 100644 index 0000000000000..21e1ef9aa757e --- /dev/null +++ b/packages/block-serialization-default-parser/CHANGELOG.md @@ -0,0 +1,3 @@ +## 1.0.0 + +- Initial release. diff --git a/packages/block-serialization-default-parser/README.md b/packages/block-serialization-default-parser/README.md new file mode 100644 index 0000000000000..bdf933f6f4fa9 --- /dev/null +++ b/packages/block-serialization-default-parser/README.md @@ -0,0 +1,126 @@ +# Block Serialization Default Parser + +This library contains the default block serialization parser implementations for WordPress documents. It provides native PHP and JavaScript parsers that implement the specification from `@wordpress/block-serialization-spec-parser` and which normally operates on the document stored in `post_content`. + +## Installation + +Install the module + +```bash +npm install @wordpress/block-serialization-default-parser --save +``` + +_This package assumes that your code will run in an **ES2015+** environment. 
If you're using an environment that has limited or no support for ES2015+ such as lower versions of IE then using [core-js](https://github.com/zloirock/core-js) or [@babel/polyfill](https://babeljs.io/docs/en/next/babel-polyfill) will add support for these methods. Learn more about it in [Babel docs](https://babeljs.io/docs/en/next/caveats)._ + +## Usage + +Input post: +```html + +
+
+

Left

+
+ + + +
+

Middle

+
+ + + +
+
+ +``` + +Parsing code: +```js +import { parse } from '@wordpress/block-serialization-default-parser'; + +parse( post ) === [ + { + blockName: "core/columns", + attrs: { + columns: 3 + }, + innerBlocks: [ + { + blockName: "core/column", + attrs: null, + innerBlocks: [ + { + blockName: "core/paragraph", + attrs: null, + innerBlocks: [], + innerHTML: "\n

Left

\n" + } + ], + innerHTML: '\n
\n' + }, + { + blockName: "core/column", + attrs: null, + innerBlocks: [ + { + blockName: "core/paragraph", + attrs: null, + innerBlocks: [], + innerHTML: "\n

Middle

\n" + } + ], + innerHTML: '\n
\n' + }, + { + blockName: "core/column", + attrs: null, + innerBlocks: [], + innerHTML: '\n
\n' + } + ], + innerHTML: '\n
\n\n\n\n
\n' + } +]; +``` + +## Theory + +### What is different about this one from the spec-parser? + +This is a recursive-descent parser that scans linearly once through the input document. Instead of directly recursing it utilizes a trampoline mechanism to prevent stack overflow. It minimizes data copying and passing through the use of globals for tracking state through the parse. Between every token (a block comment delimiter) we can instrument the parser and intervene should we want to; for example we might put a hard limit on how long we can be parsing a document or provide additional debugging diagnostics for a document. + +The spec parser is defined via a _Parsing Expression Grammar_ (PEG) which answers many questions inherently that we must answer explicitly in this parser. The goal for this implementation is to match the characteristics of the PEG so that it can be directly swapped out and so that the only changes are better runtime performance and memory usage. + +### How does it work? + +Every serialized Gutenberg document is nominally an HTML document which, in addition to normal HTML, may also contain specially designed HTML comments -- the block comment delimiters -- which separate and isolate the blocks serialized in the document. + +This parser attempts to create a state-machine around the transitions triggered from those delimiters -- the "tokens" of the grammar. Every time we find one we should be doing exactly one of two things: + + - enter a new block; + - exit out of a block. + +Those actions have different effects depending on the context; for instance, when we exit a block we either need to add it to the output block list _or_ we need to append it as the next `innerBlock` on the parent block below it in the block stack (the place where we track open blocks). The details are documented below. 
+ +The biggest challenge in this parser is making the right accounting of indices required to construct the `innerHTML` values for each block at every level of nesting depth. We take a simple approach: + + - Start each newly opened block with an empty `innerHTML`. + - Whenever we push a first block into the `innerBlocks` list, add the content from where the content of the parent block started to where this inner block starts. + - Whenever we push another block into the `innerBlocks` list, add the content from where the previous inner block ended to where this inner block starts. + - When we close out an open block, add the content from where the last inner block ended to where the closing block delimiter starts. + - If there are no inner blocks then we take the entire content between the opening and closing block comment delimiters as the `innerHTML`. + +### I meant, how does it perform? + +This parser operates much faster than the generated parser from the specification. Because we know more about the parsing than the PEG does we can take advantage of several tricks to improve our speed and memory usage: + + - We only have one or two distinct tokens, depending on how you look at it, and they are all readily matched via a regular expression. Instead of parsing on a character-per-character basis we can allow the PCRE RegExp engine to skip over large swaths of the document for us in order to find those tokens. + - Since `preg_match()` takes an `offset` parameter we can crawl through the input without passing copies of the input text on every step. We can track our position in the string and only pass a number instead. + - Not copying all those strings means that we'll also skip many memory allocations. + +Further, tokenizing with a RegExp brings an additional advantage. 
The parser generated by the PEG provides predictable performance characteristics in exchange for control over tokenization rules -- it doesn't allow us to define RegExp patterns in the rules so as to guard against _e.g._ catastrophic backtracking that would break the PEG guarantees. + +However, since our "token language" of the block comment delimiters is _regular_ and _can_ be trivially matched with RegExp patterns, we can do that here and then something magical happens: we jump out of PHP or JavaScript and into a highly-optimized RegExp engine written in C or C++ on the host system. We thereby leave the virtual machine and its overhead. + +

Code is Poetry.

diff --git a/packages/block-serialization-default-parser/package.json b/packages/block-serialization-default-parser/package.json new file mode 100644 index 0000000000000..4daf27eb065f9 --- /dev/null +++ b/packages/block-serialization-default-parser/package.json @@ -0,0 +1,29 @@ +{ + "name": "@wordpress/block-serialization-default-parser", + "version": "1.0.0-rc.0", + "description": "Block serialization specification parser for WordPress posts.", + "author": "The WordPress Contributors", + "license": "GPL-2.0-or-later", + "keywords": [ + "wordpress", + "block", + "parser" + ], + "homepage": "https://github.com/WordPress/gutenberg/tree/master/packages/block-serialization-default-parser/README.md", + "repository": { + "type": "git", + "url": "https://github.com/WordPress/gutenberg.git" + }, + "bugs": { + "url": "https://github.com/WordPress/gutenberg/issues" + }, + "main": "build/index.js", + "module": "build-module/index.js", + "react-native": "src/index", + "dependencies": { + "@babel/runtime": "^7.0.0" + }, + "publishConfig": { + "access": "public" + } +} diff --git a/packages/block-serialization-default-parser/parser.php b/packages/block-serialization-default-parser/parser.php new file mode 100644 index 0000000000000..ba124340d7064 --- /dev/null +++ b/packages/block-serialization-default-parser/parser.php @@ -0,0 +1,449 @@ + 3 ) + * + * @since 3.8.0 + * @var array|null + */ + public $attrs; + + /** + * List of inner blocks (of this same class) + * + * @since 3.8.0 + * @var WP_Block_Parser_Block[] + */ + public $innerBlocks; + + /** + * Resultant HTML from inside block comment delimieters + * after removing inner blocks + * + * @example "...Just testing..." -> "Just testing..." 
+ * + * @since 3.8.0 + * @var string + */ + public $innerHTML; + + function __construct( $name, $attrs, $innerBlocks, $innerHTML ) { + $this->blockName = $name; + $this->attrs = $attrs; + $this->innerBlocks = $innerBlocks; + $this->innerHTML = $innerHTML; + } +} + +/** + * Class WP_Block_Parser_Frame + * + * Holds partial blocks in memory while parsing + * + * @internal + * @since 3.8.0 + */ +class WP_Block_Parser_Frame { + /** + * Full or partial block + * + * @since 3.8.0 + * @var WP_Block_Parser_Block + */ + public $block; + + /** + * Byte offset into document for start of parse token + * + * @since 3.8.0 + * @var int + */ + public $token_start; + + /** + * Byte length of entire parse token string + * + * @since 3.8.0 + * @var int + */ + public $token_length; + + /** + * Byte offset into document for after parse token ends + * (used during reconstruction of stack into parse production) + * + * @since 3.8.0 + * @var int + */ + public $prev_offset; + + /** + * Byte offset into document where leading HTML before token starts + * + * @since 3.8.0 + * @var int + */ + public $leading_html_start; + + function __construct( $block, $token_start, $token_length, $prev_offset = null, $leading_html_start = null ) { + $this->block = $block; + $this->token_start = $token_start; + $this->token_length = $token_length; + $this->prev_offset = isset($prev_offset) ? $prev_offset : $token_start + $token_length; + $this->leading_html_start = $leading_html_start; + } +} + +/** + * Class WP_Block_Parser + * + * Parses a document and constructs a list of parsed block objects + * + * @since 3.8.0 + */ +class WP_Block_Parser { + /** + * Input document being parsed + * + * @example "Pre-text\nThis is inside a block!" 
+ * + * @since 3.8.0 + * @var string + */ + public $document; + + /** + * Tracks parsing progress through document + * + * @since 3.8.0 + * @var int + */ + public $offset; + + /** + * List of parsed blocks + * + * @since 3.8.0 + * @var WP_Block_Parser_Block[] + */ + public $output; + + /** + * Stack of partially-parsed structures in memory during parse + * + * @since 3.8.0 + * @var WP_Block_Parser_Frame[] + */ + public $stack; + + /** + * Parses a document and returns a list of block structures + * + * When encountering an invalid parse will return a best-effort + * parse. In contrast to the specification parser this does not + * return an error on invalid inputs. + * + * @since 3.8.0 + * + * @param string $document + * @return WP_Block_Parser_Block[] + */ + function parse( $document ) { + $this->document = $document; + $this->offset = 0; + $this->output = array(); + $this->stack = array(); + + do { + // twiddle our thumbs + } while ( $this->proceed() ); + + return $this->output; + } + + /** + * Processes the next token from the input document + * and returns whether to proceed eating more tokens + * + * This is the "next step" function that essentially + * takes a token as its input and decides what to do + * with that token before descending deeper into a + * nested block tree or continuing along the document + * or breaking out of a level of nesting. 
+ * + * @internal + * @since 3.8.0 + * @return bool + */ + function proceed() { + list( $token_type, $block_name, $attrs, $start_offset, $token_length ) = $this->next_token(); + $stack_depth = count( $this->stack ); + + switch ( $token_type ) { + case 'no-more-tokens': + // if not in a block then flush output + if ( 0 === $stack_depth ) { + $this->add_freeform(); + return false; + } + + /* + * Otherwise we have a problem + * This is an error + * + * we have options + * - treat it all as freeform text + * - assume an implicit closer (easiest when not nesting) + */ + + // for the easy case we'll assume an implicit closer + if ( 1 === $stack_depth ) { + $this->add_block_from_stack(); + return false; + } + + /* + * for the nested case where it's more difficult we'll + * have to assume that multiple closers are missing + * and so we'll collapse the whole stack piecewise + */ + while ( 0 < count( $this->stack ) ) { + $this->add_block_from_stack(); + } + return false; + + case 'void-block': + /* + * easy case is if we stumbled upon a void block + * in the top-level of the document + */ + if ( 0 === $stack_depth ) { + $this->output[] = new WP_Block_Parser_Block( $block_name, $attrs, array(), '' ); + $this->offset = $start_offset + $token_length; + return true; + } + + // otherwise we found an inner block + $this->add_inner_block( + new WP_Block_Parser_Block( $block_name, $attrs, array(), '' ), + $start_offset, + $token_length + ); + $this->offset = $start_offset + $token_length; + return true; + + case 'block-opener': + // we may have some HTML soup before the next block + $leading_html_start = $start_offset > $this->offset ? 
$this->offset : null; + + // track all newly-opened blocks on the stack + array_push( $this->stack, new WP_Block_Parser_Frame( + new WP_Block_Parser_Block( $block_name, $attrs, array(), '' ), + $start_offset, + $token_length, + $start_offset + $token_length, + $leading_html_start + ) ); + $this->offset = $start_offset + $token_length; + return true; + + case 'block-closer': + /* + * if we're missing an opener we're in trouble + * This is an error + */ + if ( 0 === $stack_depth ) { + /* + * we have options + * - assume an implicit opener + * - assume _this_ is the opener + * - give up and close out the document + */ + $this->add_freeform(); + return false; + } + + // if we're not nesting then this is easy - close the block + if ( 1 === $stack_depth ) { + $this->add_block_from_stack( $start_offset ); + $this->offset = $start_offset + $token_length; + return true; + } + + /* + * otherwise we're nested and we have to close out the current + * block and add it as a new innerBlock to the parent + */ + $stack_top = array_pop( $this->stack ); + $stack_top->block->innerHTML .= substr( $this->document, $stack_top->prev_offset, $start_offset - $stack_top->prev_offset ); + $stack_top->prev_offset = $start_offset + $token_length; + + $this->add_inner_block( + $stack_top->block, + $stack_top->token_start, + $stack_top->token_length, + $start_offset + $token_length + ); + $this->offset = $start_offset + $token_length; + return true; + + default: + // This is an error + $this->add_freeform(); + return false; + } + } + + /** + * Scans the document from where we last left off + * and finds the next valid token to parse if it exists + * + * Returns the type of the find: kind of find, block information, attributes + * + * @internal + * @since 3.8.0 + * @return array + */ + function next_token() { + $matches = null; + + /* + * aye the magic + * we're using a single RegExp to tokenize the block comment delimiters + * we're also using a trick here because the only difference between a + 
* block opener and a block closer is the leading `/` before `wp:` (and + * a closer has no attributes). we can trap them both and process the + * match back in PHP to see which one it was. + */ + $has_match = preg_match( + '/).)+?}\s+)?(?\/)?-->/s', + $this->document, + $matches, + PREG_OFFSET_CAPTURE, + $this->offset + ); + + // we have no more tokens + if ( 0 === $has_match ) { + return array( 'no-more-tokens', null, null, null, null ); + } + + list( $match, $started_at ) = $matches[ 0 ]; + + $length = strlen( $match ); + $is_closer = isset( $matches[ 'closer' ] ) && -1 !== $matches[ 'closer' ][ 1 ]; + $is_void = isset( $matches[ 'void' ] ) && -1 !== $matches[ 'void' ][ 1 ]; + $namespace = $matches[ 'namespace' ]; + $namespace = ( isset( $namespace ) && -1 !== $namespace[ 1 ] ) ? $namespace[ 0 ] : 'core/'; + $name = $namespace . $matches[ 'name' ][ 0 ]; + $has_attrs = isset( $matches[ 'attrs' ] ) && -1 !== $matches[ 'attrs' ][ 1 ]; + $attrs = $has_attrs ? json_decode( $matches[ 'attrs' ][ 0 ] ) : null; + + /* + * This state isn't allowed + * This is an error + */ + if ( $is_closer && ( $is_void || $has_attrs ) ) { + // we can ignore them since they don't hurt anything + } + + if ( $is_void ) { + return array( 'void-block', $name, $attrs, $started_at, $length ); + } + + if ( $is_closer ) { + return array( 'block-closer', $name, null, $started_at, $length ); + } + + return array( 'block-opener', $name, $attrs, $started_at, $length ); + } + + /** + * Pushes a length of text from the input document + * to the output list as a freeform block + * + * @internal + * @since 3.8.0 + * @param null $length how many bytes of document text to output + */ + function add_freeform( $length = null ) { + $length = $length ? 
$length : strlen( $this->document ) - $this->offset; + + if ( 0 === $length ) { + return; + } + + $this->output[] = array( + 'attrs' => new stdClass(), + 'innerHTML' => substr( $this->document, $this->offset, $length ), + ); + } + + /** + * Given a block structure from memory pushes + * a new block to the output list + * + * @internal + * @since 3.8.0 + * @param WP_Block_Parser_Block $block the block to add to the output + * @param int $token_start byte offset into the document where the first token for the block starts + * @param int $token_length byte length of entire block from start of opening token to end of closing token + * @param int|null $last_offset last byte offset into document if continuing form earlier output + */ + function add_inner_block(WP_Block_Parser_Block $block, $token_start, $token_length, $last_offset = null ) { + $parent = $this->stack[ count( $this->stack ) - 1 ]; + $parent->block->innerBlocks[] = $block; + $parent->block->innerHTML .= substr( $this->document, $parent->prev_offset, $token_start - $parent->prev_offset ); + $parent->prev_offset = $last_offset ? $last_offset : $token_start + $token_length; + } + + /** + * Pushes the top block from the parsing stack to the output list + * + * @internal + * @since 3.8.0 + * @param int|null $end_offset byte offset into document for where we should stop sending text output as HTML + */ + function add_block_from_stack( $end_offset = null ) { + $stack_top = array_pop( $this->stack ); + $prev_offset = $stack_top->prev_offset; + + $stack_top->block->innerHTML .= isset( $end_offset ) + ? 
substr( $this->document, $prev_offset, $end_offset - $prev_offset ) + : substr( $this->document, $prev_offset ); + + if ( isset( $stack_top->leading_html_start ) ) { + $this->output[] = array( + 'attrs' => array(), + 'innerHTML' => substr( + $this->document, + $stack_top->leading_html_start, + $stack_top->token_start - $stack_top->leading_html_start + ), + ); + } + + $this->output[] = $stack_top->block; + } +} diff --git a/packages/block-serialization-default-parser/src/index.js b/packages/block-serialization-default-parser/src/index.js new file mode 100644 index 0000000000000..4f4fdb38cd19e --- /dev/null +++ b/packages/block-serialization-default-parser/src/index.js @@ -0,0 +1,257 @@ +let document; +let offset; +let output; +let stack; +const tokenizer = /)[^])+?}\s+)?(\/)?-->/g; + +function Block( blockName, attrs, innerBlocks, innerHTML ) { + return { + blockName, + attrs, + innerBlocks, + innerHTML, + }; +} + +function Frame( block, tokenStart, tokenLength, prevOffset, leadingHtmlStart ) { + return { + block, + tokenStart, + tokenLength, + prevOffset: prevOffset || tokenStart + tokenLength, + leadingHtmlStart, + }; +} + +export const parse = ( doc ) => { + document = doc; + offset = 0; + output = []; + stack = []; + tokenizer.lastIndex = 0; + + do { + // twiddle our thumbs + } while ( proceed() ); + + return output; +}; + +function proceed() { + const next = nextToken(); + const [ tokenType, blockName, attrs, startOffset, tokenLength ] = next; + const stackDepth = stack.length; + + switch ( tokenType ) { + case 'no-more-tokens': + // if not in a block then flush output + if ( 0 === stackDepth ) { + addFreeform(); + return false; + } + + // Otherwise we have a problem + // This is an error + // we have options + // - treat it all as freeform text + // - assume an implicit closer (easiest when not nesting) + + // for the easy case we'll assume an implicit closer + if ( 1 === stackDepth ) { + addBlockFromStack(); + return false; + } + + // for the nested case 
where it's more difficult we'll + // have to assume that multiple closers are missing + // and so we'll collapse the whole stack piecewise + while ( 0 < stack.length ) { + addBlockFromStack(); + } + return false; + + case 'void-block': + // easy case is if we stumbled upon a void block + // in the top-level of the document + if ( 0 === stackDepth ) { + output.push( Block( blockName, attrs, [], '' ) ); + offset = startOffset + tokenLength; + return true; + } + + // otherwise we found an inner block + addInnerBlock( + Block( blockName, attrs, [], '' ), + startOffset, + tokenLength, + ); + offset = startOffset + tokenLength; + return true; + + case 'block-opener': + // we may have some HTML soup before the next block + const leadingHtmlStart = ( startOffset > offset ) ? offset : null; + + // track all newly-opened blocks on the stack + stack.push( + Frame( + Block( blockName, attrs, [], '' ), + startOffset, + tokenLength, + startOffset + tokenLength, + leadingHtmlStart, + ), + ); + offset = startOffset + tokenLength; + return true; + + case 'block-closer': + // if we're missing an opener we're in trouble + // This is an error + if ( 0 === stackDepth ) { + // we have options + // - assume an implicit opener + // - assume _this_ is the opener + // - give up and close out the document + addFreeform(); + return false; + } + + // if we're not nesting then this is easy - close the block + if ( 1 === stackDepth ) { + addBlockFromStack( startOffset ); + offset = startOffset + tokenLength; + return true; + } + + // otherwise we're nested and we have to close out the current + // block and add it as a innerBlock to the parent + const stackTop = stack.pop(); + stackTop.block.innerHTML += document.substr( + stackTop.prevOffset, + startOffset - stackTop.prevOffset, + ); + stackTop.prevOffset = startOffset + tokenLength; + + addInnerBlock( + stackTop.block, + stackTop.tokenStart, + stackTop.tokenLength, + startOffset + tokenLength, + ); + offset = startOffset + tokenLength; + 
return true;
+
+		default:
+			// This is an error
+			addFreeform();
+			return false;
+	}
+}
+
+/**
+ * Parse JSON if valid, otherwise return null
+ *
+ * Note that JSON coming from the block comment
+ * delimiters is constrained to be an object
+ * and cannot be things like `true` or `null`
+ *
+ * @param {string} input JSON input string to parse
+ * @return {Object|null} parsed JSON if valid
+ */
+function parseJSON( input ) {
+	try {
+		return JSON.parse( input );
+	} catch ( e ) {
+		return null;
+	}
+}
+
+/**
+ * Finds the next block comment delimiter in `document`.
+ *
+ * Every result is an array whose first entry names the token type. When a
+ * delimiter was matched the remaining entries are, in order: the block name
+ * (prefixed with `core/` when no namespace was captured), the parsed
+ * attributes or null, the index at which the match starts and the length
+ * of the matched text.
+ *
+ * @return {Array} `[ 'no-more-tokens' ]` when the tokenizer finds nothing,
+ *                 otherwise `[ type, name, attrs, startedAt, length ]` where
+ *                 type is 'void-block', 'block-closer' or 'block-opener'.
+ */
+function nextToken() {
+	// aye the magic
+	// we're using a single RegExp to tokenize the block comment delimiters
+	// we're also using a trick here because the only difference between a
+	// block opener and a block closer is the leading `/` before `wp:` (and
+	// a closer has no attributes). we can trap them both and process the
+	// match back in JavaScript to see which one it was.
+	//
+	// NOTE(review): `tokenizer` is defined outside this excerpt; repeated
+	// calls only move forward if it carries the global or sticky flag - confirm.
+	const matches = tokenizer.exec( document );
+
+	// we have no more tokens
+	if ( null === matches ) {
+		return [ 'no-more-tokens' ];
+	}
+
+	const startedAt = matches.index;
+	const [ match, closerMatch, namespaceMatch, nameMatch, attrsMatch, voidMatch ] = matches;
+
+	const length = match.length;
+	const isCloser = !! closerMatch;
+	const isVoid = !! voidMatch;
+	const namespace = namespaceMatch || 'core/';
+	const name = namespace + nameMatch;
+	const hasAttrs = !! attrsMatch;
+	// attributes that are not valid JSON end up as null, just like absent ones
+	const attrs = hasAttrs ? parseJSON( attrsMatch ) : null;
+
+	// This state isn't allowed
+	// This is an error
+	if ( isCloser && ( isVoid || hasAttrs ) ) {
+		// we can ignore them since they don't hurt anything
+		// we may warn against this at some point or reject it
+	}
+
+	if ( isVoid ) {
+		return [ 'void-block', name, attrs, startedAt, length ];
+	}
+
+	// a closer never reports attributes, whatever was captured
+	if ( isCloser ) {
+		return [ 'block-closer', name, null, startedAt, length ];
+	}
+
+	return [ 'block-opener', name, attrs, startedAt, length ];
+}
+
+/**
+ * Appends the document text starting at `offset` to the output as a
+ * freeform chunk (an object carrying only `attrs` and `innerHTML`).
+ * Nothing is added when the chunk would be empty.
+ *
+ * @param {number} [rawLength] Number of characters to take; when omitted
+ *                             (or 0) everything up to the end of the
+ *                             document is taken.
+ */
+function addFreeform( rawLength ) {
+	const length = rawLength ? rawLength : document.length - offset;
+
+	if ( 0 === length ) {
+		return;
+	}
+
+	// why is this not a Frame? it's because the current grammar
+	// specifies an object that's different. we can update the
+	// specification and change here if we want to but for now we
+	// want this parser to be spec-compliant
+	output.push( {
+		attrs: {},
+		innerHTML: document.substr( offset, length ),
+	} );
+}
+
+/**
+ * Attaches a block to the frame on top of the stack as an inner block.
+ *
+ * The HTML between the parent's previous offset and the start of the
+ * child's token is appended to the parent's innerHTML, then the parent's
+ * previous offset is moved past the child.
+ *
+ * @param {Object} block        Block to add to the parent's innerBlocks.
+ * @param {number} tokenStart   Offset in the document where the child's token starts.
+ * @param {number} tokenLength  Length of that token.
+ * @param {number} [lastOffset] Offset to continue from after the child; when
+ *                              omitted `tokenStart + tokenLength` is used.
+ */
+function addInnerBlock( block, tokenStart, tokenLength, lastOffset ) {
+	const parent = stack[ stack.length - 1 ];
+	parent.block.innerBlocks.push( block );
+	parent.block.innerHTML += document.substr(
+		parent.prevOffset,
+		tokenStart - parent.prevOffset,
+	);
+	parent.prevOffset = lastOffset ? lastOffset : tokenStart + tokenLength;
+}
+
+/**
+ * Pops the frame on top of the stack and pushes its block to the output.
+ *
+ * When the frame recorded HTML ahead of its opener (`leadingHtmlStart` is
+ * not null) that HTML is pushed first, as a freeform chunk, ahead of the block.
+ *
+ * @param {number} [endOffset] Offset at which the block's inner HTML stops;
+ *                             when omitted the rest of the document is used.
+ */
+function addBlockFromStack( endOffset ) {
+	const { block, leadingHtmlStart, prevOffset, tokenStart } = stack.pop();
+
+	if ( endOffset ) {
+		block.innerHTML += document.substr( prevOffset, endOffset - prevOffset );
+	} else {
+		block.innerHTML += document.substr( prevOffset );
+	}
+
+	if ( null !== leadingHtmlStart ) {
+		output.push( {
+			attrs: {},
+			innerHTML: document.substr( leadingHtmlStart, tokenStart - leadingHtmlStart ),
+		} );
+	}
+
+	output.push( block );
+}
diff --git a/packages/block-serialization-default-parser/test/index.js b/packages/block-serialization-default-parser/test/index.js
new file mode 100644
index 0000000000000..68a46b39c97c4
--- /dev/null
+++ b/packages/block-serialization-default-parser/test/index.js
@@ -0,0 +1,27 @@
+/**
+ * Internal dependencies
+ */
+import { parse } from '../';
+
+describe( 'block-serialization-default-parser', () => {
+	test( 'parse() accepts inputs with multiple Reusable blocks', () => {
+		const result = parse(
+			'<!-- wp:block {"ref":313} /--><!-- wp:block {"ref":482} /-->'
+		);
+
+		expect( result ).toEqual( [
+			{
+				blockName: 'core/block',
+				attrs: { ref: 313 },
+				innerBlocks: [],
+				innerHTML: '',
+			},
+			{
+				blockName: 'core/block',
+				attrs: { ref: 482 },
+				innerBlocks: [],
+				innerHTML: '',
+			},
+		] );
+	} );
+} );
diff --git
a/packages/blocks/package.json b/packages/blocks/package.json index 049637270fbaa..c3018a510f15b 100644 --- a/packages/blocks/package.json +++ b/packages/blocks/package.json @@ -23,6 +23,7 @@ "@babel/runtime": "^7.0.0", "@wordpress/autop": "file:../autop", "@wordpress/blob": "file:../blob", + "@wordpress/block-serialization-default-parser": "file:../block-serialization-default-parser", "@wordpress/block-serialization-spec-parser": "file:../block-serialization-spec-parser", "@wordpress/data": "file:../data", "@wordpress/deprecated": "file:../deprecated", diff --git a/packages/blocks/src/api/parser.js b/packages/blocks/src/api/parser.js index 1635355281001..6390d3a8852a3 100644 --- a/packages/blocks/src/api/parser.js +++ b/packages/blocks/src/api/parser.js @@ -9,7 +9,7 @@ import { flow, castArray, mapValues, omit, stubFalse } from 'lodash'; */ import { autop } from '@wordpress/autop'; import { applyFilters } from '@wordpress/hooks'; -import { parse as grammarParse } from '@wordpress/block-serialization-spec-parser'; +import { parse as defaultParse } from '@wordpress/block-serialization-default-parser'; /** * Internal dependencies @@ -378,6 +378,6 @@ const createParse = ( parseImplementation ) => * * @return {Array} Block list. */ -export const parseWithGrammar = createParse( grammarParse ); +export const parseWithGrammar = createParse( defaultParse ); export default parseWithGrammar; diff --git a/phpunit/class-parsing-test.php b/phpunit/class-parsing-test.php index 9bf05fcdf34e0..b854f8e306a2b 100644 --- a/phpunit/class-parsing-test.php +++ b/phpunit/class-parsing-test.php @@ -52,7 +52,7 @@ function strip_r( $input ) { /** * @dataProvider parsing_test_filenames */ - function test_parser_output( $html_filename, $parsed_json_filename ) { + function test_spec_parser_output( $html_filename, $parsed_json_filename ) { $html_path = self::$fixtures_dir . '/' . $html_filename; $parsed_json_path = self::$fixtures_dir . '/' . 
$parsed_json_filename;
@@ -74,4 +74,41 @@ function test_parser_output( $html_filename, $parsed_json_filename ) {
 			"File '$parsed_json_filename' does not match expected value"
 		);
 	}
+
+	/**
+	 * Feeds a fixture's HTML through WP_Block_Parser and compares the outcome
+	 * with the parsed JSON stored alongside it.
+	 *
+	 * @dataProvider parsing_test_filenames
+	 */
+	function test_default_parser_output( $html_filename, $parsed_json_filename ) {
+		// parser.php is expected to provide WP_Block_Parser; pull it in on demand.
+		require_once dirname( __FILE__ ) . '/../packages/block-serialization-default-parser/parser.php';
+
+		$fixture_html = self::$fixtures_dir . '/' . $html_filename;
+		$fixture_json = self::$fixtures_dir . '/' . $parsed_json_filename;
+
+		// Stop right away when either half of the fixture pair is absent.
+		foreach ( array( $fixture_html, $fixture_json ) as $filename ) {
+			if ( file_exists( $filename ) ) {
+				continue;
+			}
+
+			throw new Exception( "Missing fixture file: '$filename'" );
+		}
+
+		$source   = self::strip_r( file_get_contents( $fixture_html ) );
+		$expected = json_decode( self::strip_r( file_get_contents( $fixture_json ) ), true );
+
+		// Passing the parse through JSON gives it the same array form as $expected.
+		$parser = new WP_Block_Parser();
+		$parsed = $parser->parse( $source );
+		$actual = json_decode( json_encode( $parsed ), true );
+
+		$this->assertEquals(
+			$expected,
+			$actual,
+			"File '$parsed_json_filename' does not match expected value"
+		);
+	}
 }
diff --git a/test/integration/full-content/full-content.spec.js b/test/integration/full-content/full-content.spec.js
index 0aa2740831a9d..5c169185accae 100644
--- a/test/integration/full-content/full-content.spec.js
+++ b/test/integration/full-content/full-content.spec.js
@@ -15,7 +15,7 @@ import {
 	serialize,
 	unstable__bootstrapServerSideBlockDefinitions, // eslint-disable-line camelcase
 } from '@wordpress/blocks';
-import { parse as grammarParse } from '@wordpress/block-serialization-spec-parser';
+import { parse as grammarParse } from '@wordpress/block-serialization-default-parser';
 import { registerCoreBlocks } from '@wordpress/block-library';
 
 const fixturesDir = path.join( __dirname, 'fixtures' );
diff --git a/webpack.config.js b/webpack.config.js
index 748a721735c1e..2a926a952255a 100644
---
a/webpack.config.js +++ b/webpack.config.js @@ -87,6 +87,7 @@ const gutenbergPackages = [ 'autop', 'blob', 'blocks', + 'block-serialization-default-parser', 'block-serialization-spec-parser', 'compose', 'core-data',