diff --git a/include/prism/diagnostic.h b/include/prism/diagnostic.h index 273cab22e5e..7d78a160002 100644 --- a/include/prism/diagnostic.h +++ b/include/prism/diagnostic.h @@ -20,6 +20,10 @@ typedef struct { const char *message; } pm_diagnostic_t; +/** + * The diagnostic IDs of all of the diagnostics, used to communicate the types + * of errors between the parser and the user. + */ typedef enum { PM_ERR_ALIAS_ARGUMENT, PM_ERR_AMPAMPEQ_MULTI_ASSIGN, @@ -223,14 +227,27 @@ typedef enum { PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS, PM_WARN_AMBIGUOUS_PREFIX_STAR, PM_WARN_AMBIGUOUS_SLASH, + /* This must be the last member. */ PM_DIAGNOSTIC_ID_LEN, } pm_diagnostic_id_t; -// Append a diagnostic to the given list of diagnostics. +/** + * Append a diagnostic to the given list of diagnostics. + * + * @param list The list to append to. + * @param start The start of the diagnostic. + * @param end The end of the diagnostic. + * @param diag_id The diagnostic ID. + * @return Whether the diagnostic was successfully appended. + */ bool pm_diagnostic_list_append(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id); -// Deallocate the internal state of the given diagnostic list. +/** + * Deallocate the internal state of the given diagnostic list. + * + * @param list The list to deallocate. + */ void pm_diagnostic_list_free(pm_list_t *list); #endif diff --git a/include/prism/regexp.h b/include/prism/regexp.h index 5745512dee7..9eae245d1e1 100644 --- a/include/prism/regexp.h +++ b/include/prism/regexp.h @@ -12,8 +12,17 @@ #include #include -// Parse a regular expression and extract the names of all of the named capture -// groups. +/** + * Parse a regular expression and extract the names of all of the named capture + * groups. + * + * @param source The source code to parse. + * @param size The size of the source code. + * @param named_captures The list to add the names of the named capture groups. 
+ * @param encoding_changed Whether or not the encoding changed from the default. + * @param encoding The encoding of the source code. + * @return Whether or not the parsing was successful. + */ PRISM_EXPORTED_FUNCTION bool pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding); #endif diff --git a/src/diagnostic.c b/src/diagnostic.c index b1067080993..fdeb9cab127 100644 --- a/src/diagnostic.c +++ b/src/diagnostic.c @@ -1,56 +1,55 @@ #include "prism/diagnostic.h" -/* - ## Message composition - - When composing an error message, use sentence fragments. - - Try describing the property of the code that caused the error, rather than the rule that is being - violated. It may help to use a fragment that completes a sentence beginning, "The parser - encountered (a) ...". If appropriate, add a description of the rule violation (or other helpful - context) after a semicolon. - - For example:, instead of "Control escape sequence cannot be doubled", prefer: - - > "Invalid control escape sequence; control cannot be repeated" - - In some cases, where the failure is more general or syntax expectations are violated, it may make - more sense to use a fragment that completes a sentence beginning, "The parser ...". - - For example: - - > "Expected an expression after `(`" - > "Cannot parse the expression" - - - ## Message style guide - - - Use articles like "a", "an", and "the" when appropriate. - - e.g., prefer "Cannot parse the expression" to "Cannot parse expression". - - Use the common name for tokens and nodes. - - e.g., prefer "keyword splat" to "assoc splat" - - e.g., prefer "embedded document" to "embdoc" - - Capitalize the initial word of the message. - - Use back ticks around token literals - - e.g., "Expected a `=>` between the hash key and value" - - Do not use `.` or other punctuation at the end of the message. - - Do not use contractions like "can't". 
Prefer "cannot" to "can not". - - For tokens that can have multiple meanings, reference the token and its meaning. - - e.g., "`*` splat argument" is clearer and more complete than "splat argument" or "`*` argument" - - - ## Error names (PM_ERR_*) - - - When appropriate, prefer node name to token name. - - e.g., prefer "SPLAT" to "STAR" in the context of argument parsing. - - Prefer token name to common name. - - e.g., prefer "STAR" to "ASTERISK". - - Try to order the words in the name from more general to more specific, - - e.g., "INVALID_NUMBER_DECIMAL" is better than "DECIMAL_INVALID_NUMBER". - - When in doubt, look for similar patterns and name them so that they are grouped when lexically - sorted. See PM_ERR_ARGUMENT_NO_FORWARDING_* for an example. -*/ - +/** + * ## Message composition + * + * When composing an error message, use sentence fragments. + * + * Try describing the property of the code that caused the error, rather than the rule that is being + * violated. It may help to use a fragment that completes a sentence beginning, "The parser + * encountered (a) ...". If appropriate, add a description of the rule violation (or other helpful + * context) after a semicolon. + * + * For example, instead of "Control escape sequence cannot be doubled", prefer: + * + * > "Invalid control escape sequence; control cannot be repeated" + * + * In some cases, where the failure is more general or syntax expectations are violated, it may make + * more sense to use a fragment that completes a sentence beginning, "The parser ...". + * + * For example: + * + * > "Expected an expression after `(`" + * > "Cannot parse the expression" + * + * + * ## Message style guide + * + * - Use articles like "a", "an", and "the" when appropriate. + * - e.g., prefer "Cannot parse the expression" to "Cannot parse expression". + * - Use the common name for tokens and nodes. 
+ * - e.g., prefer "keyword splat" to "assoc splat" + * - e.g., prefer "embedded document" to "embdoc" + * - Capitalize the initial word of the message. + * - Use back ticks around token literals + * - e.g., "Expected a `=>` between the hash key and value" + * - Do not use `.` or other punctuation at the end of the message. + * - Do not use contractions like "can't". Prefer "cannot" to "can not". + * - For tokens that can have multiple meanings, reference the token and its meaning. + * - e.g., "`*` splat argument" is clearer and more complete than "splat argument" or "`*` argument" + * + * + * ## Error names (PM_ERR_*) + * + * - When appropriate, prefer node name to token name. + * - e.g., prefer "SPLAT" to "STAR" in the context of argument parsing. + * - Prefer token name to common name. + * - e.g., prefer "STAR" to "ASTERISK". + * - Try to order the words in the name from more general to more specific, + * - e.g., "INVALID_NUMBER_DECIMAL" is better than "DECIMAL_INVALID_NUMBER". + * - When in doubt, look for similar patterns and name them so that they are grouped when lexically + * sorted. See PM_ERR_ARGUMENT_NO_FORWARDING_* for an example. + */ static const char* const diagnostic_messages[PM_DIAGNOSTIC_ID_LEN] = { [PM_ERR_ALIAS_ARGUMENT] = "Invalid argument being passed to `alias`; expected a bare word, symbol, constant, or global variable", [PM_ERR_AMPAMPEQ_MULTI_ASSIGN] = "Unexpected `&&=` in a multiple assignment", @@ -263,7 +262,9 @@ pm_diagnostic_message(pm_diagnostic_id_t diag_id) { return message; } -// Append an error to the given list of diagnostic. +/** + * Append an error to the given list of diagnostics. 
+ */ bool pm_diagnostic_list_append(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) { pm_diagnostic_t *diagnostic = (pm_diagnostic_t *) calloc(sizeof(pm_diagnostic_t), 1); @@ -274,7 +275,9 @@ pm_diagnostic_list_append(pm_list_t *list, const uint8_t *start, const uint8_t * return true; } -// Deallocate the internal state of the given diagnostic list. +/** + * Deallocate the internal state of the given diagnostic list. + */ void pm_diagnostic_list_free(pm_list_t *list) { pm_list_node_t *node, *next; diff --git a/src/regexp.c b/src/regexp.c index 3462c846ce6..fa2ea5cd20a 100644 --- a/src/regexp.c +++ b/src/regexp.c @@ -1,6 +1,8 @@ #include "prism/regexp.h" -// This is the parser that is going to handle parsing regular expressions. +/** + * This is the parser that is going to handle parsing regular expressions. + */ typedef struct { const uint8_t *start; const uint8_t *cursor; @@ -10,7 +12,9 @@ typedef struct { pm_encoding_t *encoding; } pm_regexp_parser_t; -// This initializes a new parser with the given source. +/** + * This initializes a new parser with the given source. + */ static void pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) { *parser = (pm_regexp_parser_t) { @@ -23,7 +27,9 @@ pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const ui }; } -// This appends a new string to the list of named captures. +/** + * This appends a new string to the list of named captures. + */ static void pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) { pm_string_t string; @@ -32,13 +38,17 @@ pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, pm_string_free(&string); } -// Returns true if the next character is the end of the source. +/** + * Returns true if the next character is the end of the source. 
+ */ static inline bool pm_regexp_char_is_eof(pm_regexp_parser_t *parser) { return parser->cursor >= parser->end; } -// Optionally accept a char and consume it if it exists. +/** + * Optionally accept a char and consume it if it exists. + */ static inline bool pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) { if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) { @@ -48,7 +58,9 @@ pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) { return false; } -// Expect a character to be present and consume it. +/** + * Expect a character to be present and consume it. + */ static inline bool pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) { if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) { @@ -58,7 +70,9 @@ pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) { return false; } -// This advances the current token to the next instance of the given character. +/** + * This advances the current token to the next instance of the given character. + */ static bool pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) { if (pm_regexp_char_is_eof(parser)) { @@ -74,37 +88,39 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) { return true; } -// Range quantifiers are a special class of quantifiers that look like -// -// * {digit} -// * {digit,} -// * {digit,digit} -// * {,digit} -// -// Unfortunately, if there are any spaces in between, then this just becomes a -// regular character match expression and we have to backtrack. So when this -// function first starts running, we'll create a "save" point and then attempt -// to parse the quantifier. If it fails, we'll restore the save point and -// return. -// -// The properly track everything, we're going to build a little state machine. 
-// It looks something like the following: -// -// ┌───────┐ ┌─────────┐ ────────────┐ -// ──── lbrace ───> │ start │ ──── digit ───> │ minimum │ │ -// └───────┘ └─────────┘ <─── digit ─┘ -// │ │ │ -// ┌───────┐ │ │ rbrace -// │ comma │ <───── comma ┌──── comma ───────┘ │ -// └───────┘ V V -// │ ┌─────────┐ ┌─────────┐ -// └── digit ──> │ maximum │ ── rbrace ──> │| final |│ -// └─────────┘ └─────────┘ -// │ ^ -// └─ digit ─┘ -// -// Note that by the time we've hit this function, the lbrace has already been -// consumed so we're in the start state. +/** + * Range quantifiers are a special class of quantifiers that look like + * + * * {digit} + * * {digit,} + * * {digit,digit} + * * {,digit} + * + * Unfortunately, if there are any spaces in between, then this just becomes a + * regular character match expression and we have to backtrack. So when this + * function first starts running, we'll create a "save" point and then attempt + * to parse the quantifier. If it fails, we'll restore the save point and + * return. + * + * To properly track everything, we're going to build a little state machine. + * It looks something like the following: + * + * ┌───────┐ ┌─────────┐ ────────────┐ + * ──── lbrace ───> │ start │ ──── digit ───> │ minimum │ │ + * └───────┘ └─────────┘ <─── digit ─┘ + * │ │ │ + * ┌───────┐ │ │ rbrace + * │ comma │ <───── comma ┌──── comma ───────┘ │ + * └───────┘ V V + * │ ┌─────────┐ ┌─────────┐ + * └── digit ──> │ maximum │ ── rbrace ──> │| final |│ + * └─────────┘ └─────────┘ + * │ ^ + * └─ digit ─┘ + * + * Note that by the time we've hit this function, the lbrace has already been + * consumed so we're in the start state. 
+ */ static bool pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) { const uint8_t *savepoint = parser->cursor; @@ -180,12 +196,14 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) { return true; } -// quantifier : star-quantifier -// | plus-quantifier -// | optional-quantifier -// | range-quantifier -// | -// ; +/** + * quantifier : star-quantifier + * | plus-quantifier + * | optional-quantifier + * | range-quantifier + * | + * ; + */ static bool pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) { if (pm_regexp_char_is_eof(parser)) return true; @@ -205,8 +223,10 @@ pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) { } } -// match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']' -// ; +/** + * match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']' + * ; + */ static bool pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) { if (!pm_regexp_char_expect(parser, ':')) { @@ -226,8 +246,10 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) { static bool pm_regexp_parse_lbracket(pm_regexp_parser_t *parser); -// match-char-set : '[' '^'? (match-range | match-char)* ']' -// ; +/** + * match-char-set : '[' '^'? (match-range | match-char)* ']' + * ; + */ static bool pm_regexp_parse_character_set(pm_regexp_parser_t *parser) { pm_regexp_char_accept(parser, '^'); @@ -251,7 +273,9 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) { return pm_regexp_char_expect(parser, ']'); } -// A left bracket can either mean a POSIX class or a character set. +/** + * A left bracket can either mean a POSIX class or a character set. + */ static bool pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) { const uint8_t *reset = parser->cursor; @@ -271,8 +295,10 @@ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) { static bool pm_regexp_parse_expression(pm_regexp_parser_t *parser); -// These are the states of the options that are configurable on the regular -// expression (or from within a group). 
+/** + * These are the states of the options that are configurable on the regular + * expression (or from within a group). + */ typedef enum { PM_REGEXP_OPTION_STATE_INVALID, PM_REGEXP_OPTION_STATE_TOGGLEABLE, @@ -283,16 +309,21 @@ typedef enum { // These are the options that are configurable on the regular expression (or // from within a group). + #define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a' #define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x' #define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1) -// This is the set of options that are configurable on the regular expression. +/** + * This is the set of options that are configurable on the regular expression. + */ typedef struct { uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS]; } pm_regexp_options_t; -// Initialize a new set of options to their default values. +/** + * Initialize a new set of options to their default values. + */ static void pm_regexp_options_init(pm_regexp_options_t *options) { memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS); @@ -304,8 +335,10 @@ pm_regexp_options_init(pm_regexp_options_t *options) { options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE; } -// Attempt to add the given option to the set of options. Returns true if it was -// added, false if it was already present. +/** + * Attempt to add the given option to the set of options. Returns true if it was + * added, false if it was already present. + */ static bool pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) { if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) { @@ -327,8 +360,10 @@ pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) { return false; } -// Attempt to remove the given option from the set of options. Returns true if -// it was removed, false if it was already absent. 
+/** + * Attempt to remove the given option from the set of options. Returns true if + * it was removed, false if it was already absent. + */ static bool pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) { if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) { @@ -349,26 +384,27 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) { return false; } -// Groups can have quite a few different patterns for syntax. They basically -// just wrap a set of expressions, but they can potentially have options after a -// question mark. If there _isn't_ a question mark, then it's just a set of -// expressions. If there _is_, then here are the options: -// -// * (?#...) - inline comments -// * (?:subexp) - non-capturing group -// * (?=subexp) - positive lookahead -// * (?!subexp) - negative lookahead -// * (?>subexp) - atomic group -// * (?~subexp) - absence operator -// * (?<=subexp) - positive lookbehind -// * (?<!subexp) - negative lookbehind -// * (?<name>subexp) - named capturing group -// * (?'name'subexp) - named capturing group -// * (?(cond)yes-subexp) - conditional expression -// * (?(cond)yes-subexp|no-subexp) - conditional expression -// * (?imxdau-imx) - turn on and off configuration -// * (?imxdau-imx:subexp) - turn on and off configuration for an expression -// +/** + * Groups can have quite a few different patterns for syntax. They basically + * just wrap a set of expressions, but they can potentially have options after a + * question mark. If there _isn't_ a question mark, then it's just a set of + * expressions. If there _is_, then here are the options: + * + * * (?#...) 
- inline comments + * * (?:subexp) - non-capturing group + * * (?=subexp) - positive lookahead + * * (?!subexp) - negative lookahead + * * (?>subexp) - atomic group + * * (?~subexp) - absence operator + * * (?<=subexp) - positive lookbehind + * * (?<!subexp) - negative lookbehind + * * (?<name>subexp) - named capturing group + * * (?'name'subexp) - named capturing group + * * (?(cond)yes-subexp) - conditional expression + * * (?(cond)yes-subexp|no-subexp) - conditional expression + * * (?imxdau-imx) - turn on and off configuration + * * (?imxdau-imx:subexp) - turn on and off configuration for an expression + */ static bool pm_regexp_parse_group(pm_regexp_parser_t *parser) { // First, parse any options for the group. @@ -503,16 +539,18 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) { return pm_regexp_char_expect(parser, ')'); } -// item : anchor -// | match-posix-class -// | match-char-set -// | match-char-class -// | match-char-prop -// | match-char -// | match-any -// | group -// | quantified -// ; +/** + * item : anchor + * | match-posix-class + * | match-char-set + * | match-char-class + * | match-char-prop + * | match-char + * | match-any + * | group + * | quantified + * ; + */ static bool pm_regexp_parse_item(pm_regexp_parser_t *parser) { switch (*parser->cursor++) { @@ -533,8 +571,10 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) { } } -// expression : item+ -// ; +/** + * expression : item+ + * ; + */ static bool pm_regexp_parse_expression(pm_regexp_parser_t *parser) { if (!pm_regexp_parse_item(parser)) { @@ -550,10 +590,12 @@ pm_regexp_parse_expression(pm_regexp_parser_t *parser) { return true; } -// pattern : EOF -// | expression EOF -// | expression '|' pattern -// ; +/** + * pattern : EOF + * | expression EOF + * | expression '|' pattern + * ; + */ static bool pm_regexp_parse_pattern(pm_regexp_parser_t *parser) { return ( @@ -572,8 +614,10 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) { ); } -// Parse a regular expression and extract the names of all of the named capture -// 
groups. +/** + * Parse a regular expression and extract the names of all of the named capture + * groups. + */ PRISM_EXPORTED_FUNCTION bool pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) { pm_regexp_parser_t parser;