From 461c047365a63f1cae41fdba1f2ec6dfbeba5b9a Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Wed, 6 Sep 2023 12:43:36 -0400 Subject: [PATCH] Introduce owned constants Before this commit, constants in the constant pool were assumed to be slices of the source string. This works in _almost_ all cases. There are times, however, when a string needs to be synthesized. This can occur when passing in locals that need to be scoped through eval, or when generating method names like `foo=`. After this commit, there is a single bit `owned` boolean on constants in the pool that indicates whether or not it is a slice of the source string. If it is not, it is assumed to be allocated memory that should be freed by the constant pool when the constant pool is freed. When serializing, the most significant bit in the location of the contents of the constant indicates whether or not it is owned. When it is, instead of 4 bytes for the source offset and 4 bytes for the length it is instead 4 bytes for the buffer offset and 4 bytes the length. The contents of the owned constants are embedded into the buffer after the constant pool itself. --- .gitignore | 2 +- Gemfile.lock | 1 + docs/serialization.md | 42 +++++++++++---------- include/yarp/util/yp_constant_pool.h | 15 ++++++-- rust/yarp-sys/src/lib.rs | 14 ++----- src/util/yp_constant_pool.c | 49 +++++++++++++++++++++---- src/yarp.c | 47 ++++++++++++++++++++---- templates/ext/yarp/api_node.c.erb | 11 ++++++ templates/java/org/yarp/Loader.java.erb | 25 ++++++------- templates/lib/yarp/serialize.rb.erb | 9 ++++- templates/src/serialize.c.erb | 27 ++++++++++++-- test/yarp/parse_serialize_test.rb | 10 +++++ 12 files changed, 182 insertions(+), 70 deletions(-) diff --git a/.gitignore b/.gitignore index 3c059f27c10..fe75cf59478 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,7 @@ /doc/ /pkg/ /spec/reports/ -/test/serialized/ +/test/yarp/serialized/ /top-100-gems/ /tmp/ /vendor/bundle diff --git a/Gemfile.lock b/Gemfile.lock index 7a1b804f03a..f35f7bbac7b 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -24,6 +24,7 @@ GEM PLATFORMS ruby universal-java-17 + universal-java-20 DEPENDENCIES ffi diff --git a/docs/serialization.md b/docs/serialization.md index e6001930dfd..4b245e0e8db 100644 --- a/docs/serialization.md +++ b/docs/serialization.md @@ -81,31 +81,35 @@ Each node is structured like the following table: | `1` | node type | | location | node location | -Each node's child is then appended to the serialized string. -The child node types can be determined by referencing `config.yml`. -Depending on the type of child node, it could take a couple of different forms, described below: - -* `node` - A child node that is a node itself. This is structured just as like parent node. -* `node?` - A child node that is optionally present. If the node is not present, then a single `0` byte will be written in its place. If it is present, then it will be structured just as like parent node. -* `node[]` - A child node that is an array of nodes. This is structured as a variable-length integer length, followed by the child nodes themselves. -* `string` - A child node that is a string. For example, this is used as the name of the method in a call node, since it cannot directly reference the source string (as in `@-` or `foo=`). This is structured as a variable-length integer byte length, followed by the string itself (_without_ a trailing null byte). +Every field on the node is then appended to the serialized string. The fields can be determined by referencing `config.yml`. Depending on the type of field, it could take a couple of different forms, described below: + +* `node` - A field that is a node. This is structured just as like parent node. +* `node?` - A field that is a node that is optionally present. If the node is not present, then a single `0` byte will be written in its place. If it is present, then it will be structured just as like parent node. +* `node[]` - A field that is an array of nodes. This is structured as a variable-length integer length, followed by the child nodes themselves. +* `string` - A field that is a string. For example, this is used as the name of the method in a call node, since it cannot directly reference the source string (as in `@-` or `foo=`). This is structured as a variable-length integer byte length, followed by the string itself (_without_ a trailing null byte). * `constant` - A variable-length integer that represents an index in the constant pool. -* `constant[]` - A child node that is an array of constants. This is structured as a variable-length integer length, followed by the child constants themselves. -* `location` - A child node that is a location. This is structured as a variable-length integer start followed by a variable-length integer length. -* `location?` - A child node that is a location that is optionally present. If the location is not present, then a single `0` byte will be written in its place. If it is present, then it will be structured just like the `location` child node. -* `uint32` - A child node that is a 32-bit unsigned integer. This is structured as a variable-length integer. +* `constant?` - An optional variable-length integer that represents an index in the constant pool. If it's not present, then a single `0` byte will be written in its place. +* `location` - A field that is a location. This is structured as a variable-length integer start followed by a variable-length integer length. +* `location?` - A field that is a location that is optionally present. If the location is not present, then a single `0` byte will be written in its place. If it is present, then it will be structured just like the `location` child node. +* `uint32` - A field that is a 32-bit unsigned integer. This is structured as a variable-length integer. + +After the syntax tree, the content pool is serialized. This is a list of constants that were referenced from within the tree. The content pool begins at the offset specified in the header. Constants can be either "owned" (in which case their contents are embedded in the serialization) or "shared" (in which case their contents represent a slice of the source string). The most significant bit of the constant indicates whether it is owned or shared. + +In the case that it is owned, the constant is structured as follows: + +| # bytes | field | +| --- | --- | +| `4` | the byte offset in the serialization for the contents of the constant | +| `4` | the byte length in the serialization | -After the syntax tree, the content pool is serialized. -This is a list of constants that were referenced from within the tree. -The content pool begins at the offset specified in the header. -Each constant is structured as: +Note that you will need to mask off the most significant bit for the byte offset in the serialization. In the case that it is shared, the constant is structured as follows: | # bytes | field | | --- | --- | -| `4` | the byte offset in the source | -| `4` | the byte length in the source | +| `4` | the byte offset in the source string for the contents of the constant | +| `4` | the byte length in the source string | -At the end of the serialization, the buffer is null terminated. +After the constant pool, the contents of the owned constants are serialized. This is just a sequence of bytes that represent the contents of the constants. At the end of the serialization, the buffer is null terminated. ## APIs diff --git a/include/yarp/util/yp_constant_pool.h b/include/yarp/util/yp_constant_pool.h index 1ac23cf88bb..ecd3ff619e3 100644 --- a/include/yarp/util/yp_constant_pool.h +++ b/include/yarp/util/yp_constant_pool.h @@ -8,6 +8,7 @@ #include "yarp/defines.h" +#include #include #include #include @@ -39,7 +40,8 @@ size_t yp_constant_id_list_memsize(yp_constant_id_list_t *list); void yp_constant_id_list_free(yp_constant_id_list_t *list); typedef struct { - yp_constant_id_t id; + unsigned int id: 31; + bool owned: 1; const uint8_t *start; size_t length; size_t hash; @@ -57,9 +59,14 @@ typedef struct { // Initialize a new constant pool with a given capacity. bool yp_constant_pool_init(yp_constant_pool_t *pool, size_t capacity); -// Insert a constant into a constant pool. Returns the id of the constant, or 0 -// if any potential calls to resize fail. -yp_constant_id_t yp_constant_pool_insert(yp_constant_pool_t *pool, const uint8_t *start, size_t length); +// Insert a constant into a constant pool that is a slice of a source string. +// Returns the id of the constant, or 0 if any potential calls to resize fail. +yp_constant_id_t yp_constant_pool_insert_shared(yp_constant_pool_t *pool, const uint8_t *start, size_t length); + +// Insert a constant into a constant pool from memory that is now owned by the +// constant pool. Returns the id of the constant, or 0 if any potential calls to +// resize fail. +yp_constant_id_t yp_constant_pool_insert_owned(yp_constant_pool_t *pool, const uint8_t *start, size_t length); // Free the memory associated with a constant pool. void yp_constant_pool_free(yp_constant_pool_t *pool); diff --git a/rust/yarp-sys/src/lib.rs b/rust/yarp-sys/src/lib.rs index 9648a3a0b8e..321749d7511 100644 --- a/rust/yarp-sys/src/lib.rs +++ b/rust/yarp-sys/src/lib.rs @@ -19,20 +19,12 @@ unused_qualifications )] -// Allowing because we're not manually defining anything that would cause this, and -// the bindgen-generated `bindgen_test_layout_yp_parser()` triggers this. -#[allow(clippy::cognitive_complexity)] -// Allowing because we're not manually defining anything that would cause this, and -// the following bindgen-generated functions triggers this: -// - `bindgen_test_layout_yp_call_node()` -// - `bindgen_test_layout_yp_def_node()` -// - `bindgen_test_layout_yp_parser()` -#[allow(clippy::too_many_lines)] +#[allow(clippy::all, clippy::pedantic, clippy::cognitive_complexity)] #[allow(missing_copy_implementations)] -#[allow(non_upper_case_globals)] +#[allow(missing_docs)] #[allow(non_camel_case_types)] #[allow(non_snake_case)] -#[allow(missing_docs)] +#[allow(non_upper_case_globals)] mod bindings { // In `build.rs`, we use `bindgen` to generate bindings based on C headers // and `librubyparser`. Here is where we pull in those bindings and make diff --git a/src/util/yp_constant_pool.c b/src/util/yp_constant_pool.c index 3ad241a9d1b..8be96138a13 100644 --- a/src/util/yp_constant_pool.c +++ b/src/util/yp_constant_pool.c @@ -106,12 +106,11 @@ yp_constant_pool_init(yp_constant_pool_t *pool, size_t capacity) { return true; } -// Insert a constant into a constant pool. Returns the id of the constant, or 0 -// if any potential calls to resize fail. -yp_constant_id_t +// Insert a constant into a constant pool and return its index in the pool. +static size_t yp_constant_pool_insert(yp_constant_pool_t *pool, const uint8_t *start, size_t length) { if (pool->size >= (pool->capacity / 4 * 3)) { - if (!yp_constant_pool_resize(pool)) return 0; + if (!yp_constant_pool_resize(pool)) return pool->capacity; } size_t hash = yp_constant_pool_hash(start, length); @@ -123,25 +122,59 @@ yp_constant_pool_insert(yp_constant_pool_t *pool, const uint8_t *start, size_t l // same as the content we are trying to insert. If it is, then we can // return the id of the existing constant. if ((constant->length == length) && memcmp(constant->start, start, length) == 0) { - return pool->constants[index].id; + return index; } index = (index + 1) % pool->capacity; } - yp_constant_id_t id = (yp_constant_id_t)++pool->size; + pool->size++; + assert(pool->size < ((size_t) (1 << 31))); + pool->constants[index] = (yp_constant_t) { - .id = id, + .id = (unsigned int) (pool->size & 0x7FFFFFFF), .start = start, .length = length, .hash = hash }; - return id; + return index; +} + +// Insert a constant into a constant pool. Returns the id of the constant, or 0 +// if any potential calls to resize fail. +yp_constant_id_t +yp_constant_pool_insert_shared(yp_constant_pool_t *pool, const uint8_t *start, size_t length) { + size_t index = yp_constant_pool_insert(pool, start, length); + return index == pool->capacity ? 0 : ((yp_constant_id_t) pool->constants[index].id); +} + +// Insert a constant into a constant pool from memory that is now owned by the +// constant pool. Returns the id of the constant, or 0 if any potential calls to +// resize fail. +yp_constant_id_t +yp_constant_pool_insert_owned(yp_constant_pool_t *pool, const uint8_t *start, size_t length) { + size_t index = yp_constant_pool_insert(pool, start, length); + if (index == pool->capacity) return 0; + + yp_constant_t *constant = &pool->constants[index]; + constant->owned = true; + return ((yp_constant_id_t) constant->id); } // Free the memory associated with a constant pool. void yp_constant_pool_free(yp_constant_pool_t *pool) { + // For each constant in the current constant pool, free the contents if the + // contents are owned. + for (uint32_t index = 0; index < pool->capacity; index++) { + yp_constant_t *constant = &pool->constants[index]; + + // If an id is set on this constant, then we know we have content here. + if (constant->id != 0 && constant->owned) { + free((void *) constant->start); + } + } + free(pool->constants); } diff --git a/src/yarp.c b/src/yarp.c index 2bce80abade..0091ca1b2e2 100644 --- a/src/yarp.c +++ b/src/yarp.c @@ -428,7 +428,13 @@ debug_lex_state_set(yp_parser_t *parser, yp_lex_state_t state, char const * call // Retrieve the constant pool id for the given location. static inline yp_constant_id_t yp_parser_constant_id_location(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) { - return yp_constant_pool_insert(&parser->constant_pool, start, (size_t) (end - start)); + return yp_constant_pool_insert_shared(&parser->constant_pool, start, (size_t) (end - start)); +} + +// Retrieve the constant pool id for the given string. +static inline yp_constant_id_t +yp_parser_constant_id_owned(yp_parser_t *parser, const uint8_t *start, size_t length) { + return yp_constant_pool_insert_owned(&parser->constant_pool, start, length); } // Retrieve the constant pool id for the given token. @@ -4610,15 +4616,19 @@ yp_parser_local_depth(yp_parser_t *parser, yp_token_t *token) { return -1; } -// Add a local variable from a location to the current scope. -static yp_constant_id_t -yp_parser_local_add_location(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) { - yp_constant_id_t constant_id = yp_parser_constant_id_location(parser, start, end); - +// Add a constant id to the local table of the current scope. +static inline void +yp_parser_local_add(yp_parser_t *parser, yp_constant_id_t constant_id) { if (!yp_constant_id_list_includes(&parser->current_scope->locals, constant_id)) { yp_constant_id_list_append(&parser->current_scope->locals, constant_id); } +} +// Add a local variable from a location to the current scope. +static yp_constant_id_t +yp_parser_local_add_location(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) { + yp_constant_id_t constant_id = yp_parser_constant_id_location(parser, start, end); + if (constant_id != 0) yp_parser_local_add(parser, constant_id); return constant_id; } @@ -4628,6 +4638,13 @@ yp_parser_local_add_token(yp_parser_t *parser, yp_token_t *token) { yp_parser_local_add_location(parser, token->start, token->end); } +// Add a local variable from an owned string to the current scope. +static inline void +yp_parser_local_add_owned(yp_parser_t *parser, const uint8_t *start, size_t length) { + yp_constant_id_t constant_id = yp_parser_constant_id_owned(parser, start, length); + if (constant_id != 0) yp_parser_local_add(parser, constant_id); +} + // Add a parameter name to the current scope and check whether the name of the // parameter is unique or not. static void @@ -4644,7 +4661,9 @@ yp_parser_parameter_name_check(yp_parser_t *parser, yp_token_t *name) { } } -// Pop the current scope off the scope stack. +// Pop the current scope off the scope stack. Note that we specifically do not +// free the associated constant list because we assume that we have already +// transferred ownership of the list to the AST somewhere. static void yp_parser_scope_pop(yp_parser_t *parser) { yp_scope_t *scope = parser->current_scope; @@ -13757,7 +13776,10 @@ yp_parser_metadata(yp_parser_t *parser, const char *metadata) { uint32_t local_size = yp_metadata_read_u32(metadata); metadata += 4; - yp_parser_local_add_location(parser, (const uint8_t *) metadata, (const uint8_t *) (metadata + local_size)); + uint8_t *constant = malloc(local_size); + memcpy(constant, metadata, local_size); + + yp_parser_local_add_owned(parser, constant, (size_t) local_size); metadata += local_size; } } @@ -13896,6 +13918,15 @@ yp_parser_free(yp_parser_t *parser) { yp_constant_pool_free(&parser->constant_pool); yp_newline_list_free(&parser->newline_list); + while (parser->current_scope != NULL) { + // Normally, popping the scope doesn't free the locals since it is + // assumed that ownership has transferred to the AST. However if we have + // scopes while we're freeing the parser, it's likely they came from + // eval scopes and we need to free them explicitly here. + yp_constant_id_list_free(&parser->current_scope->locals); + yp_parser_scope_pop(parser); + } + while (parser->lex_modes.index >= YP_LEX_STACK_SIZE) { lex_mode_pop(parser); } diff --git a/templates/ext/yarp/api_node.c.erb b/templates/ext/yarp/api_node.c.erb index 8fb2d2e507c..3a93f9133d1 100644 --- a/templates/ext/yarp/api_node.c.erb +++ b/templates/ext/yarp/api_node.c.erb @@ -144,30 +144,41 @@ yp_ast_new(yp_parser_t *parser, yp_node_t *node, rb_encoding *encoding) { // <%= field.name %> <%- case field -%> <%- when YARP::NodeField, YARP::OptionalNodeField -%> +#line <%= __LINE__ + 1 %> "<%= File.basename(__FILE__) %>" argv[<%= index %>] = rb_ary_pop(value_stack); <%- when YARP::NodeListField -%> +#line <%= __LINE__ + 1 %> "<%= File.basename(__FILE__) %>" argv[<%= index %>] = rb_ary_new_capa(cast-><%= field.name %>.size); for (size_t index = 0; index < cast-><%= field.name %>.size; index++) { rb_ary_push(argv[<%= index %>], rb_ary_pop(value_stack)); } <%- when YARP::StringField -%> +#line <%= __LINE__ + 1 %> "<%= File.basename(__FILE__) %>" argv[<%= index %>] = yp_string_new(&cast-><%= field.name %>, encoding); <%- when YARP::ConstantField -%> +#line <%= __LINE__ + 1 %> "<%= File.basename(__FILE__) %>" + assert(cast-><%= field.name %> != 0); argv[<%= index %>] = rb_id2sym(constants[cast-><%= field.name %> - 1]); <%- when YARP::OptionalConstantField -%> argv[<%= index %>] = cast-><%= field.name %> == 0 ? Qnil : rb_id2sym(constants[cast-><%= field.name %> - 1]); <%- when YARP::ConstantListField -%> +#line <%= __LINE__ + 1 %> "<%= File.basename(__FILE__) %>" argv[<%= index %>] = rb_ary_new_capa(cast-><%= field.name %>.size); for (size_t index = 0; index < cast-><%= field.name %>.size; index++) { + assert(cast-><%= field.name %>.ids[index] != 0); rb_ary_push(argv[<%= index %>], rb_id2sym(constants[cast-><%= field.name %>.ids[index] - 1])); } <%- when YARP::LocationField -%> +#line <%= __LINE__ + 1 %> "<%= File.basename(__FILE__) %>" argv[<%= index %>] = yp_location_new(parser, cast-><%= field.name %>.start, cast-><%= field.name %>.end, source); <%- when YARP::OptionalLocationField -%> +#line <%= __LINE__ + 1 %> "<%= File.basename(__FILE__) %>" argv[<%= index %>] = cast-><%= field.name %>.start == NULL ? Qnil : yp_location_new(parser, cast-><%= field.name %>.start, cast-><%= field.name %>.end, source); <%- when YARP::UInt32Field -%> +#line <%= __LINE__ + 1 %> "<%= File.basename(__FILE__) %>" argv[<%= index %>] = ULONG2NUM(cast-><%= field.name %>); <%- when YARP::FlagsField -%> +#line <%= __LINE__ + 1 %> "<%= File.basename(__FILE__) %>" argv[<%= index %>] = ULONG2NUM(node->flags >> <%= YARP::COMMON_FLAGS %>); <%- else -%> <%- raise -%> diff --git a/templates/java/org/yarp/Loader.java.erb b/templates/java/org/yarp/Loader.java.erb index c014d5da440..ed1e93dfa61 100644 --- a/templates/java/org/yarp/Loader.java.erb +++ b/templates/java/org/yarp/Loader.java.erb @@ -30,15 +30,26 @@ public class Loader { byte[] get(ByteBuffer buffer, int oneBasedIndex) { int index = oneBasedIndex - 1; byte[] constant = cache[index]; + if (constant == null) { int offset = bufferOffset + index * 8; int start = buffer.getInt(offset); int length = buffer.getInt(offset + 4); constant = new byte[length]; - System.arraycopy(source, start, constant, 0, length); + + if (Integer.compareUnsigned(start, 1 << 31) <= 0) { + System.arraycopy(source, start, constant, 0, length); + } else { + int position = buffer.position(); + buffer.position(Math.abs(start)); + buffer.get(constant, 0, length); + buffer.position(position); + } + cache[index] = constant; } + return constant; } @@ -171,18 +182,6 @@ public class Loader { } } - private Nodes.Location[] loadLocations() { - int length = loadVarInt(); - if (length == 0) { - return Nodes.Location.EMPTY_ARRAY; - } - Nodes.Location[] locations = new Nodes.Location[length]; - for (int i = 0; i < length; i++) { - locations[i] = loadLocation(); - } - return locations; - } - private byte[] loadConstant() { return constantPool.get(buffer, loadVarInt()); } diff --git a/templates/lib/yarp/serialize.rb.erb b/templates/lib/yarp/serialize.rb.erb index 2f6d6421a9f..d6e6d38b5ba 100644 --- a/templates/lib/yarp/serialize.rb.erb +++ b/templates/lib/yarp/serialize.rb.erb @@ -157,11 +157,16 @@ module YARP unless constant offset = constant_pool_offset + index * 8 - start = serialized.unpack1("L", offset: offset) length = serialized.unpack1("L", offset: offset + 4) - constant = input.byteslice(start, length).to_sym + constant = + if start.nobits?(1 << 31) + input.byteslice(start, length).to_sym + else + serialized.byteslice(start & ((1 << 31) - 1), length).to_sym + end + constant_pool[index] = constant end diff --git a/templates/src/serialize.c.erb b/templates/src/serialize.c.erb index 8e0b0905dcc..b60bce21138 100644 --- a/templates/src/serialize.c.erb +++ b/templates/src/serialize.c.erb @@ -206,12 +206,31 @@ yp_serialize_content(yp_parser_t *parser, yp_node_t *node, yp_buffer_t *buffer) // If we find a constant at this index, serialize it at the correct // index in the buffer. if (constant->id != 0) { - size_t buffer_offset = offset + ((constant->id - 1) * 8); + size_t buffer_offset = offset + ((((size_t) constant->id) - 1) * 8); + + if (constant->owned) { + // Since this is an owned constant, we are going to write its + // contents into the buffer after the constant pool. So + // effectively in place of the source offset, we have a buffer + // offset. We will add a leading 1 to indicate that this is a + // buffer offset. + uint32_t content_offset = yp_sizet_to_u32(buffer->length); + uint32_t owned_mask = (uint32_t) (1 << 31); + + assert(content_offset < owned_mask); + content_offset |= owned_mask; + + memcpy(buffer->value + buffer_offset, &content_offset, 4); + yp_buffer_append_bytes(buffer, constant->start, constant->length); + } else { + // Since this is a shared constant, we are going to write its + // source offset directly into the buffer. + uint32_t source_offset = yp_ptrdifft_to_u32(constant->start - parser->start); + memcpy(buffer->value + buffer_offset, &source_offset, 4); + } - uint32_t source_offset = yp_ptrdifft_to_u32(constant->start - parser->start); + // Now we can write the length of the constant into the buffer. uint32_t constant_length = yp_sizet_to_u32(constant->length); - - memcpy(buffer->value + buffer_offset, &source_offset, 4); memcpy(buffer->value + buffer_offset + 4, &constant_length, 4); } } diff --git a/test/yarp/parse_serialize_test.rb b/test/yarp/parse_serialize_test.rb index 82a1c29d48a..d3474f7104d 100644 --- a/test/yarp/parse_serialize_test.rb +++ b/test/yarp/parse_serialize_test.rb @@ -14,6 +14,16 @@ def test_parse_serialize assert_equal __FILE__, find_file_node(result)&.filepath, "Expected the filepath to be set correctly" end + def test_parse_serialize_with_locals + filepath = __FILE__ + metadata = [filepath.bytesize, filepath.b, 1, 1, 1, "foo".b].pack("LA*LLLA*") + + dumped = Debug.parse_serialize_file_metadata(filepath, metadata) + result = YARP.load(File.read(__FILE__), dumped) + + assert_kind_of ParseResult, result, "Expected the return value to be a ParseResult" + end + private def find_file_node(result)