Skip to content

Commit

Permalink
Documentation for pm_newline_list_t
Browse files Browse the repository at this point in the history
  • Loading branch information
kddnewton committed Nov 1, 2023
1 parent ff1d2ec commit 97b3cc0
Show file tree
Hide file tree
Showing 5 changed files with 129 additions and 68 deletions.
76 changes: 57 additions & 19 deletions include/prism/util/pm_newline_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,46 +16,84 @@
#include <stddef.h>
#include <stdlib.h>

// A list of offsets of newlines in a string. The offsets are assumed to be
// sorted/inserted in ascending order.
/**
* A list of offsets of newlines in a string. The offsets are assumed to be
* sorted/inserted in ascending order.
*/
typedef struct {
    /** A pointer to the start of the source string. */
    const uint8_t *start;

    /** The number of offsets in the list. */
    size_t size;

    /** The capacity of the list that has been allocated. */
    size_t capacity;

    /**
     * Set to 0 by pm_newline_list_init and by PM_NEWLINE_LIST_EMPTY.
     * NOTE(review): no reader of this field is visible here; presumably a
     * cache of the most recently looked-up offset - confirm before relying
     * on it.
     */
    size_t last_offset;

    /**
     * Set to 0 by pm_newline_list_init and by PM_NEWLINE_LIST_EMPTY.
     * NOTE(review): no reader of this field is visible here; presumably the
     * index that pairs with last_offset - confirm before relying on it.
     */
    size_t last_index;

    /**
     * The list of offsets. Allocated by pm_newline_list_init and released by
     * pm_newline_list_free.
     */
    size_t *offsets;
} pm_newline_list_t;

// A line and column in a string.
/**
* A line and column in a string. This is the value type returned by
* pm_newline_list_line_column.
*/
typedef struct {
/**
* The line number.
* NOTE(review): pm_newline_list_line_column fills this with an index into
* the list's offsets array, which suggests zero-based numbering - confirm.
*/
size_t line;

/**
* The column number.
* NOTE(review): pm_newline_list_line_column fills this with the distance
* between the requested offset and the stored offset for that line, so it
* looks like a zero-based byte count rather than a character count - confirm.
*/
size_t column;
} pm_line_column_t;

/**
* An empty newline list: a compound literal of type pm_newline_list_t whose
* start and offsets pointers are NULL and whose other fields are all zero.
* No memory is allocated by this value.
*/
#define PM_NEWLINE_LIST_EMPTY ((pm_newline_list_t) { \
.start = NULL, .offsets = NULL, .size = 0, .capacity = 0, .last_offset = 0, .last_index = 0 \
})

// Initialize a new newline list with the given capacity. Returns true if the
// allocation of the offsets succeeds, otherwise returns false.
/**
* Initialize a new newline list with the given capacity. Returns true if the
* allocation of the offsets succeeds, otherwise returns false.
*
* @param list The list to initialize.
* @param start A pointer to the start of the source string.
* @param capacity The initial capacity of the list.
* @return True if the allocation of the offsets succeeds, otherwise false.
*/
bool pm_newline_list_init(pm_newline_list_t *list, const uint8_t *start, size_t capacity);

// Append a new offset to the newline list. Returns true if the reallocation of
// the offsets succeeds (if one was necessary), otherwise returns false.
/**
* Append a new offset to the newline list. Returns true if the reallocation of
* the offsets succeeds (if one was necessary), otherwise returns false.
*
* @param list The list to append to.
* @param cursor A pointer into the source string at the position whose offset
*     is to be appended.
* @return True if the reallocation of the offsets succeeds (if one was
* necessary), otherwise false.
*/
bool pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor);

// Conditionally append a new offset to the newline list, if the value passed in is a newline.
/**
* Conditionally append a new offset to the newline list, if the value passed in
* is a newline.
*
* @param list The list to append to.
* @param cursor A pointer to the byte in the source string to check; its
*     offset is appended only if that byte is a newline.
* @return True if the reallocation of the offsets succeeds (if one was
* necessary), otherwise false.
*/
bool pm_newline_list_check_append(pm_newline_list_t *list, const uint8_t *cursor);

// Returns the line and column of the given offset. If the offset is not in the
// list, the line and column of the closest offset less than the given offset
// are returned.
/**
* Returns the line and column of the given offset. If the offset is not in the
* list, the line and column of the closest offset less than the given offset
* are returned.
*
* @param list The list to search.
* @param cursor A pointer into the source string whose line and column are
*     wanted. It must not point before the start of the source string.
* @return The line and column of the given offset.
*/
pm_line_column_t pm_newline_list_line_column(const pm_newline_list_t *list, const uint8_t *cursor);

// Free the internal memory allocated for the newline list.
/**
* Free the internal memory allocated for the newline list.
*
* @param list The list to free.
*/
void pm_newline_list_free(pm_newline_list_t *list);

#endif
43 changes: 26 additions & 17 deletions include/prism/util/pm_strpbrk.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,32 @@
#include <stddef.h>
#include <string.h>

// Here we have rolled our own version of strpbrk. The standard library strpbrk
// has undefined behavior when the source string is not null-terminated. We want
// to support strings that are not null-terminated because pm_parse does not
// have the contract that the string is null-terminated. (This is desirable
// because it means the extension can call pm_parse with the result of a call to
// mmap).
//
// The standard library strpbrk also does not support passing a maximum length
// to search. We want to support this for the reason mentioned above, but we
// also don't want it to stop on null bytes. Ruby actually allows null bytes
// within strings, comments, regular expressions, etc. So we need to be able to
// skip past them.
//
// Finally, we want to support encodings wherein the charset could contain
// characters that are trailing bytes of multi-byte characters. For example, in
// Shift-JIS, the backslash character can be a trailing byte. In that case we
// need to take a slower path and iterate one multi-byte character at a time.
/**
* Here we have rolled our own version of strpbrk. The standard library strpbrk
* has undefined behavior when the source string is not null-terminated. We want
* to support strings that are not null-terminated because pm_parse does not
* have the contract that the string is null-terminated. (This is desirable
* because it means the extension can call pm_parse with the result of a call to
* mmap).
*
* The standard library strpbrk also does not support passing a maximum length
* to search. We want to support this for the reason mentioned above, but we
* also don't want it to stop on null bytes. Ruby actually allows null bytes
* within strings, comments, regular expressions, etc. So we need to be able to
* skip past them.
*
* Finally, we want to support encodings wherein the charset could contain
* characters that are trailing bytes of multi-byte characters. For example, in
* Shift-JIS, the backslash character can be a trailing byte. In that case we
* need to take a slower path and iterate one multi-byte character at a time.
*
* @param parser The parser.
* @param source The source string.
* @param charset The charset to search for.
* @param length The maximum number of bytes of the source string to search.
* @return A pointer to the first character in the source string that is in the
* charset, or NULL if no such character exists.
*/
const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length);

#endif
2 changes: 1 addition & 1 deletion src/prism.c
Original file line number Diff line number Diff line change
Expand Up @@ -15691,7 +15691,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const ch
.lex_callback = NULL,
.filepath_string = filepath_string,
.constant_pool = PM_CONSTANT_POOL_EMPTY,
.newline_list = PM_NEWLINE_LIST_EMPTY,
.newline_list = { 0 },
.integer_base = 0,
.current_string = PM_STRING_EMPTY,
.command_start = true,
Expand Down
32 changes: 20 additions & 12 deletions src/util/pm_newline_list.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#include "prism/util/pm_newline_list.h"

// Initialize a new newline list with the given capacity. Returns true if the
// allocation of the offsets succeeds, otherwise returns false.
/**
* Initialize a new newline list with the given capacity. Returns true if the
* allocation of the offsets succeeds, otherwise returns false.
*/
bool
pm_newline_list_init(pm_newline_list_t *list, const uint8_t *start, size_t capacity) {
list->offsets = (size_t *) calloc(capacity, sizeof(size_t));
Expand All @@ -14,14 +16,13 @@ pm_newline_list_init(pm_newline_list_t *list, const uint8_t *start, size_t capac
list->size = 1;
list->capacity = capacity;

list->last_index = 0;
list->last_offset = 0;

return true;
}

// Append a new offset to the newline list. Returns true if the reallocation of
// the offsets succeeds (if one was necessary), otherwise returns false.
/**
* Append a new offset to the newline list. Returns true if the reallocation of
* the offsets succeeds (if one was necessary), otherwise returns false.
*/
bool
pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor) {
if (list->size == list->capacity) {
Expand All @@ -44,7 +45,10 @@ pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor) {
return true;
}

// Conditionally append a new offset to the newline list, if the value passed in is a newline.
/**
* Conditionally append a new offset to the newline list, if the value passed in
* is a newline.
*/
bool
pm_newline_list_check_append(pm_newline_list_t *list, const uint8_t *cursor) {
if (*cursor != '\n') {
Expand All @@ -53,9 +57,11 @@ pm_newline_list_check_append(pm_newline_list_t *list, const uint8_t *cursor) {
return pm_newline_list_append(list, cursor);
}

// Returns the line and column of the given offset. If the offset is not in the
// list, the line and column of the closest offset less than the given offset
// are returned.
/**
* Returns the line and column of the given offset. If the offset is not in the
* list, the line and column of the closest offset less than the given offset
* are returned.
*/
pm_line_column_t
pm_newline_list_line_column(const pm_newline_list_t *list, const uint8_t *cursor) {
assert(cursor >= list->start);
Expand All @@ -81,7 +87,9 @@ pm_newline_list_line_column(const pm_newline_list_t *list, const uint8_t *cursor
return ((pm_line_column_t) { left - 1, offset - list->offsets[left - 1] });
}

// Free the internal memory allocated for the newline list.
/**
* Free the internal memory allocated for the newline list.
*/
void
pm_newline_list_free(pm_newline_list_t *list) {
free(list->offsets);
Expand Down
44 changes: 25 additions & 19 deletions src/util/pm_strpbrk.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include "prism/util/pm_strpbrk.h"

// This is the slow path that does care about the encoding.
/**
* This is the slow path that does care about the encoding.
*/
static inline const uint8_t *
pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
size_t index = 0;
Expand All @@ -21,7 +23,9 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
return NULL;
}

// This is the fast path that does not care about the encoding.
/**
* This is the fast path that does not care about the encoding.
*/
static inline const uint8_t *
pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t maximum) {
size_t index = 0;
Expand All @@ -37,23 +41,25 @@ pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t max
return NULL;
}

// Here we have rolled our own version of strpbrk. The standard library strpbrk
// has undefined behavior when the source string is not null-terminated. We want
// to support strings that are not null-terminated because pm_parse does not
// have the contract that the string is null-terminated. (This is desirable
// because it means the extension can call pm_parse with the result of a call to
// mmap).
//
// The standard library strpbrk also does not support passing a maximum length
// to search. We want to support this for the reason mentioned above, but we
// also don't want it to stop on null bytes. Ruby actually allows null bytes
// within strings, comments, regular expressions, etc. So we need to be able to
// skip past them.
//
// Finally, we want to support encodings wherein the charset could contain
// characters that are trailing bytes of multi-byte characters. For example, in
// Shift-JIS, the backslash character can be a trailing byte. In that case we
// need to take a slower path and iterate one multi-byte character at a time.
/**
* Here we have rolled our own version of strpbrk. The standard library strpbrk
* has undefined behavior when the source string is not null-terminated. We want
* to support strings that are not null-terminated because pm_parse does not
* have the contract that the string is null-terminated. (This is desirable
* because it means the extension can call pm_parse with the result of a call to
* mmap).
*
* The standard library strpbrk also does not support passing a maximum length
* to search. We want to support this for the reason mentioned above, but we
* also don't want it to stop on null bytes. Ruby actually allows null bytes
* within strings, comments, regular expressions, etc. So we need to be able to
* skip past them.
*
* Finally, we want to support encodings wherein the charset could contain
* characters that are trailing bytes of multi-byte characters. For example, in
* Shift-JIS, the backslash character can be a trailing byte. In that case we
* need to take a slower path and iterate one multi-byte character at a time.
*/
const uint8_t *
pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
if (length <= 0) {
Expand Down

0 comments on commit 97b3cc0

Please sign in to comment.