From f441b6fd2c695cc2b457402c0758cf35db7e6f2a Mon Sep 17 00:00:00 2001 From: "Thomas E. Enebo" Date: Thu, 27 Jul 2023 14:46:19 -0400 Subject: [PATCH 1/5] WIP - Introduce contextually parsing programs vs evals This is more or less the code I used in my POC in JRuby to parse evals. Evals depend on parent variable scopes and will produce a different syntax tree. Questions: 1. How does MRI compile evals currently? I cannot find anything. 2. This passes in a char * of data. It does not encode the variables we pass in because the system calling this already knows. Is this adequate though? 3. Can I get guidance on how best to test this? --- ext/yarp/extension.c | 10 +++++----- include/yarp.h | 4 ++-- src/yarp.c | 42 ++++++++++++++++++++++++++++++++++++------ 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/ext/yarp/extension.c b/ext/yarp/extension.c index 4e801b3b05f..7ede50bb0f9 100644 --- a/ext/yarp/extension.c +++ b/ext/yarp/extension.c @@ -194,7 +194,7 @@ dump_input(input_t *input, const char *filepath) { yp_parser_t parser; yp_parser_init(&parser, input->source, input->size, filepath); - yp_node_t *node = yp_parse(&parser); + yp_node_t *node = yp_parse(&parser, false); yp_serialize(&parser, node, &buffer); VALUE result = rb_str_new(buffer.value, buffer.length); @@ -378,7 +378,7 @@ lex_input(input_t *input, const char *filepath) { }; parser.lex_callback = &lex_callback; - yp_node_t *node = yp_parse(&parser); + yp_node_t *node = yp_parse(&parser, false); // Here we need to update the source range to have the correct newline // offsets. We do it here because we've already created the object and given @@ -439,7 +439,7 @@ parse_input(input_t *input, const char *filepath) { yp_parser_t parser; yp_parser_init(&parser, input->source, input->size, filepath); - yp_node_t *node = yp_parse(&parser); + yp_node_t *node = yp_parse(&parser, false); rb_encoding *encoding = rb_enc_find(parser.encoding.name); VALUE source = yp_source_new(&parser); @@ -582,7 +582,7 @@ memsize(VALUE self, VALUE string) { size_t length = RSTRING_LEN(string); yp_parser_init(&parser, RSTRING_PTR(string), length, NULL); - yp_node_t *node = yp_parse(&parser); + yp_node_t *node = yp_parse(&parser, false); yp_memsize_t memsize; yp_node_memsize(node, &memsize); @@ -608,7 +608,7 @@ profile_file(VALUE self, VALUE filepath) { yp_parser_t parser; yp_parser_init(&parser, input.source, input.size, checked); - yp_node_t *node = yp_parse(&parser); + yp_node_t *node = yp_parse(&parser, false); yp_node_destroy(&parser, node); yp_parser_free(&parser); diff --git a/include/yarp.h b/include/yarp.h index 4bbffdbb106..492038a6e28 100644 --- a/include/yarp.h +++ b/include/yarp.h @@ -51,7 +51,7 @@ YP_EXPORTED_FUNCTION void yp_parser_register_encoding_decode_callback(yp_parser_ YP_EXPORTED_FUNCTION void yp_parser_free(yp_parser_t *parser); // Parse the Ruby source associated with the given parser and return the tree. -YP_EXPORTED_FUNCTION yp_node_t * yp_parse(yp_parser_t *parser); +YP_EXPORTED_FUNCTION yp_node_t * yp_parse(yp_parser_t *parser, bool eval); // Pretty-prints the AST represented by the given node to the given buffer. YP_EXPORTED_FUNCTION void yp_prettyprint(yp_parser_t *parser, yp_node_t *node, yp_buffer_t *buffer); @@ -61,7 +61,7 @@ YP_EXPORTED_FUNCTION void yp_serialize(yp_parser_t *parser, yp_node_t *node, yp_ // Parse and serialize the AST represented by the given source to the given // buffer. -YP_EXPORTED_FUNCTION void yp_parse_serialize(const char *source, size_t size, yp_buffer_t *buffer); +YP_EXPORTED_FUNCTION void yp_parse_serialize(const char *source, size_t size, yp_buffer_t *buffer, const char *parent_scopes); // Returns a string representation of the given token type. YP_EXPORTED_FUNCTION const char * yp_token_type_to_str(yp_token_type_t token_type); diff --git a/src/yarp.c b/src/yarp.c index b3fb271d451..0a7b2c53350 100644 --- a/src/yarp.c +++ b/src/yarp.c @@ -12773,8 +12773,8 @@ parse_expression(yp_parser_t *parser, yp_binding_power_t binding_power, const ch } static yp_node_t * -parse_program(yp_parser_t *parser) { - yp_parser_scope_push(parser, true); +parse_program(yp_parser_t *parser, bool eval) { + yp_parser_scope_push(parser, !eval); parser_lex(parser); yp_statements_node_t *statements = parse_statements(parser, YP_CONTEXT_MAIN); @@ -12794,6 +12794,34 @@ parse_program(yp_parser_t *parser) { return (yp_node_t *) yp_program_node_create(parser, &locals, statements); } +// Assume always a valid string since it is from trusted source (Ruby impl internals). +// Format: [num_scopes, (num_vars1, (var_char1*, 0)*)*] +static void +yp_populate_eval_scopes(yp_parser_t *parser, const char *data) { + const char *p = data; + size_t number_of_scopes = (size_t) *p; + + p++; + for (size_t scope_index = 0; scope_index < number_of_scopes; scope_index++) { + size_t number_of_variables = (size_t) *p++; + + yp_parser_scope_push(parser, scope_index == 0); + + for (size_t variable_index = 0; variable_index < number_of_variables; variable_index++) { + char *eos = strchr(p, 0); + + yp_token_t lvar = (yp_token_t) { + .type = YP_TOKEN_IDENTIFIER, + .start = p, + .end = eos + }; + yp_parser_local_add_token(parser, &lvar); + + p = ++eos; + } + } +} + /******************************************************************************/ /* External functions */ /******************************************************************************/ @@ -12930,8 +12958,8 @@ yp_parser_free(yp_parser_t *parser) { // Parse the Ruby source associated with the given parser and return the tree. YP_EXPORTED_FUNCTION yp_node_t * -yp_parse(yp_parser_t *parser) { - return parse_program(parser); +yp_parse(yp_parser_t *parser, bool eval) { + return parse_program(parser, eval); } YP_EXPORTED_FUNCTION void @@ -12948,11 +12976,13 @@ yp_serialize(yp_parser_t *parser, yp_node_t *node, yp_buffer_t *buffer) { // Parse and serialize the AST represented by the given source to the given // buffer. YP_EXPORTED_FUNCTION void -yp_parse_serialize(const char *source, size_t size, yp_buffer_t *buffer) { +yp_parse_serialize(const char *source, size_t size, yp_buffer_t *buffer, const char *parent_scopes) { + bool eval = parent_scopes != NULL; yp_parser_t parser; yp_parser_init(&parser, source, size, NULL); + if (eval) yp_populate_eval_scopes(&parser, parent_scopes); - yp_node_t *node = yp_parse(&parser); + yp_node_t *node = yp_parse(&parser, eval); yp_serialize(&parser, node, buffer); yp_node_destroy(&parser, node); From 04de272383e8d01fca06ffa7f99ec3ac5da5dd4a Mon Sep 17 00:00:00 2001 From: "Thomas E. Enebo" Date: Thu, 27 Jul 2023 14:58:11 -0400 Subject: [PATCH 2/5] Remove trailing whitespace --- src/yarp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/yarp.c b/src/yarp.c index 0a7b2c53350..adf5c1ae399 100644 --- a/src/yarp.c +++ b/src/yarp.c @@ -12982,7 +12982,7 @@ yp_parse_serialize(const char *source, size_t size, yp_buffer_t *buffer, const c yp_parser_init(&parser, source, size, NULL); if (eval) yp_populate_eval_scopes(&parser, parent_scopes); - yp_node_t *node = yp_parse(&parser, eval); + yp_node_t *node = yp_parse(&parser, eval); yp_serialize(&parser, node, buffer); yp_node_destroy(&parser, node); From 5411abd6516d82cde95c00f3c539fbae34b8b0f7 Mon Sep 17 00:00:00 2001 From: "Thomas E. Enebo" Date: Thu, 3 Aug 2023 14:15:26 -0400 Subject: [PATCH 3/5] Addressed review comments. Missing any tests and in fact this is untested so still a WIP. --- ext/yarp/extension.c | 10 ++++---- include/yarp.h | 2 +- src/yarp.c | 60 +++++++++++++++++++++++++++----------------- 3 files changed, 43 insertions(+), 29 deletions(-) diff --git a/ext/yarp/extension.c b/ext/yarp/extension.c index 7ede50bb0f9..4e801b3b05f 100644 --- a/ext/yarp/extension.c +++ b/ext/yarp/extension.c @@ -194,7 +194,7 @@ dump_input(input_t *input, const char *filepath) { yp_parser_t parser; yp_parser_init(&parser, input->source, input->size, filepath); - yp_node_t *node = yp_parse(&parser, false); + yp_node_t *node = yp_parse(&parser); yp_serialize(&parser, node, &buffer); VALUE result = rb_str_new(buffer.value, buffer.length); @@ -378,7 +378,7 @@ lex_input(input_t *input, const char *filepath) { }; parser.lex_callback = &lex_callback; - yp_node_t *node = yp_parse(&parser, false); + yp_node_t *node = yp_parse(&parser); // Here we need to update the source range to have the correct newline // offsets. We do it here because we've already created the object and given @@ -439,7 +439,7 @@ parse_input(input_t *input, const char *filepath) { yp_parser_t parser; yp_parser_init(&parser, input->source, input->size, filepath); - yp_node_t *node = yp_parse(&parser, false); + yp_node_t *node = yp_parse(&parser); rb_encoding *encoding = rb_enc_find(parser.encoding.name); VALUE source = yp_source_new(&parser); @@ -582,7 +582,7 @@ memsize(VALUE self, VALUE string) { size_t length = RSTRING_LEN(string); yp_parser_init(&parser, RSTRING_PTR(string), length, NULL); - yp_node_t *node = yp_parse(&parser, false); + yp_node_t *node = yp_parse(&parser); yp_memsize_t memsize; yp_node_memsize(node, &memsize); @@ -608,7 +608,7 @@ profile_file(VALUE self, VALUE filepath) { yp_parser_t parser; yp_parser_init(&parser, input.source, input.size, checked); - yp_node_t *node = yp_parse(&parser, false); + yp_node_t *node = yp_parse(&parser); yp_node_destroy(&parser, node); yp_parser_free(&parser); diff --git a/include/yarp.h b/include/yarp.h index 492038a6e28..410f739fb25 100644 --- a/include/yarp.h +++ b/include/yarp.h @@ -51,7 +51,7 @@ YP_EXPORTED_FUNCTION void yp_parser_register_encoding_decode_callback(yp_parser_ YP_EXPORTED_FUNCTION void yp_parser_free(yp_parser_t *parser); // Parse the Ruby source associated with the given parser and return the tree. -YP_EXPORTED_FUNCTION yp_node_t * yp_parse(yp_parser_t *parser, bool eval); +YP_EXPORTED_FUNCTION yp_node_t * yp_parse(yp_parser_t *parser); // Pretty-prints the AST represented by the given node to the given buffer. YP_EXPORTED_FUNCTION void yp_prettyprint(yp_parser_t *parser, yp_node_t *node, yp_buffer_t *buffer); diff --git a/src/yarp.c b/src/yarp.c index adf5c1ae399..61c8b0f845b 100644 --- a/src/yarp.c +++ b/src/yarp.c @@ -12773,8 +12773,8 @@ parse_expression(yp_parser_t *parser, yp_binding_power_t binding_power, const ch } static yp_node_t * -parse_program(yp_parser_t *parser, bool eval) { - yp_parser_scope_push(parser, !eval); +parse_program(yp_parser_t *parser) { + yp_parser_scope_push(parser, !parser->current_scope); parser_lex(parser); yp_statements_node_t *statements = parse_statements(parser, YP_CONTEXT_MAIN); @@ -12794,30 +12794,45 @@ parse_program(yp_parser_t *parser, bool eval) { return (yp_node_t *) yp_program_node_create(parser, &locals, statements); } -// Assume always a valid string since it is from trusted source (Ruby impl internals). -// Format: [num_scopes, (num_vars1, (var_char1*, 0)*)*] +// Process any additional metadata being passed into a parse. Since the source +// of these calls will be from Ruby implementation internals we assume it is from +// a trusted source. +// +// Currently, this is only passing in variable scoping surrounding an eval, but +// eventually it will be extended to hold any additional metadata. This data +// is serialized to reduce the calling complexity for a foreign function call +// vs a foreign runtime making a bindable in-memory version of a C structure. +// +// *Format* +// +// No metadata should just be NULL. For variable scopes it should be: +// +// ```text +// [number_of_variable_scopes: uint32_t, +// [number_of_variables: uint32_t, +// [data_length: uint32_t, data: char*]* +// ]* +// ] +// ``` static void -yp_populate_eval_scopes(yp_parser_t *parser, const char *data) { - const char *p = data; - size_t number_of_scopes = (size_t) *p; +yp_process_metadata(yp_parser_t *parser, const char *metadata) { + const char *p = metadata; + uint32_t number_of_scopes = (uint32_t) *p; + p += 4; - p++; for (size_t scope_index = 0; scope_index < number_of_scopes; scope_index++) { - size_t number_of_variables = (size_t) *p++; + uint32_t number_of_variables = (uint32_t) *p; + p += 4; yp_parser_scope_push(parser, scope_index == 0); for (size_t variable_index = 0; variable_index < number_of_variables; variable_index++) { - char *eos = strchr(p, 0); + int32_t length = (uint32_t) *p; + p += 4; - yp_token_t lvar = (yp_token_t) { - .type = YP_TOKEN_IDENTIFIER, - .start = p, - .end = eos - }; - yp_parser_local_add_token(parser, &lvar); + yp_parser_local_add_location(parser, p, p + length); - p = ++eos; + p += length; } } } @@ -12958,8 +12973,8 @@ yp_parser_free(yp_parser_t *parser) { // Parse the Ruby source associated with the given parser and return the tree. YP_EXPORTED_FUNCTION yp_node_t * -yp_parse(yp_parser_t *parser, bool eval) { - return parse_program(parser, eval); +yp_parse(yp_parser_t *parser) { + return parse_program(parser); } YP_EXPORTED_FUNCTION void @@ -12976,13 +12991,12 @@ yp_serialize(yp_parser_t *parser, yp_node_t *node, yp_buffer_t *buffer) { // Parse and serialize the AST represented by the given source to the given // buffer. YP_EXPORTED_FUNCTION void -yp_parse_serialize(const char *source, size_t size, yp_buffer_t *buffer, const char *parent_scopes) { - bool eval = parent_scopes != NULL; +yp_parse_serialize(const char *source, size_t size, yp_buffer_t *buffer, const char *metadata) { yp_parser_t parser; yp_parser_init(&parser, source, size, NULL); - if (eval) yp_populate_eval_scopes(&parser, parent_scopes); + if (metadata) yp_process_metadata(&parser, metadata); - yp_node_t *node = yp_parse(&parser, eval); + yp_node_t *node = yp_parse(&parser); yp_serialize(&parser, node, buffer); yp_node_destroy(&parser, node); From 658152079315e6dd0c3052203e344369d5be1a49 Mon Sep 17 00:00:00 2001 From: "Thomas E. Enebo" Date: Thu, 3 Aug 2023 14:19:49 -0400 Subject: [PATCH 4/5] Clang beats gcc in pedantry Accidentally declared int32_t and not uint32_t and it was caught on macos. --- src/yarp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/yarp.c b/src/yarp.c index 61c8b0f845b..007a1a9d5e3 100644 --- a/src/yarp.c +++ b/src/yarp.c @@ -12827,7 +12827,7 @@ yp_process_metadata(yp_parser_t *parser, const char *metadata) { yp_parser_scope_push(parser, scope_index == 0); for (size_t variable_index = 0; variable_index < number_of_variables; variable_index++) { - int32_t length = (uint32_t) *p; + uint32_t length = (uint32_t) *p; p += 4; yp_parser_local_add_location(parser, p, p + length); From 3208ee39837d911d21bec5ceda9e85fec9ea143c Mon Sep 17 00:00:00 2001 From: "Thomas E. Enebo" Date: Fri, 4 Aug 2023 11:02:29 -0400 Subject: [PATCH 5/5] Address PR comments - odd whitespace - a couple of name changes - properly read uint32_t when not properly aligned --- include/yarp.h | 2 +- src/yarp.c | 23 +++++++++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/include/yarp.h b/include/yarp.h index 410f739fb25..1c6652a3021 100644 --- a/include/yarp.h +++ b/include/yarp.h @@ -61,7 +61,7 @@ YP_EXPORTED_FUNCTION void yp_serialize(yp_parser_t *parser, yp_node_t *node, yp_ // Parse and serialize the AST represented by the given source to the given // buffer. -YP_EXPORTED_FUNCTION void yp_parse_serialize(const char *source, size_t size, yp_buffer_t *buffer, const char *parent_scopes); +YP_EXPORTED_FUNCTION void yp_parse_serialize(const char *source, size_t size, yp_buffer_t *buffer, const char *metadata); // Returns a string representation of the given token type. YP_EXPORTED_FUNCTION const char * yp_token_type_to_str(yp_token_type_t token_type); diff --git a/src/yarp.c b/src/yarp.c index 007a1a9d5e3..dd2ebab6efa 100644 --- a/src/yarp.c +++ b/src/yarp.c @@ -12794,6 +12794,17 @@ parse_program(yp_parser_t *parser) { return (yp_node_t *) yp_program_node_create(parser, &locals, statements); } +static uint32_t +yp_read_u32(const char *ptr) { + if (((uintptr_t) ptr) % sizeof(uint32_t) == 0) { + return *((uint32_t *) ptr); + } else { + uint32_t value; + memcpy(&value, ptr, sizeof(uint32_t)); + return value; + } +} + // Process any additional metadata being passed into a parse. Since the source // of these calls will be from Ruby implementation internals we assume it is from // a trusted source. @@ -12815,19 +12826,19 @@ parse_program(yp_parser_t *parser) { // ] // ``` static void -yp_process_metadata(yp_parser_t *parser, const char *metadata) { +yp_parser_metadata(yp_parser_t *parser, const char *metadata) { const char *p = metadata; - uint32_t number_of_scopes = (uint32_t) *p; + uint32_t number_of_scopes = yp_read_u32(p); p += 4; for (size_t scope_index = 0; scope_index < number_of_scopes; scope_index++) { - uint32_t number_of_variables = (uint32_t) *p; + uint32_t number_of_variables = yp_read_u32(p); p += 4; yp_parser_scope_push(parser, scope_index == 0); for (size_t variable_index = 0; variable_index < number_of_variables; variable_index++) { - uint32_t length = (uint32_t) *p; + uint32_t length = yp_read_u32(p); p += 4; yp_parser_local_add_location(parser, p, p + length); @@ -12974,7 +12985,7 @@ yp_parser_free(yp_parser_t *parser) { // Parse the Ruby source associated with the given parser and return the tree. YP_EXPORTED_FUNCTION yp_node_t * yp_parse(yp_parser_t *parser) { - return parse_program(parser); + return parse_program(parser); } YP_EXPORTED_FUNCTION void @@ -12994,7 +13005,7 @@ YP_EXPORTED_FUNCTION void yp_parse_serialize(const char *source, size_t size, yp_buffer_t *buffer, const char *metadata) { yp_parser_t parser; yp_parser_init(&parser, source, size, NULL); - if (metadata) yp_process_metadata(&parser, metadata); + if (metadata) yp_parser_metadata(&parser, metadata); yp_node_t *node = yp_parse(&parser); yp_serialize(&parser, node, buffer);