Skip to content

Commit

Permalink
Documentation for the encodings
Browse files Browse the repository at this point in the history
  • Loading branch information
kddnewton committed Nov 1, 2023
1 parent 2693426 commit 52a0d80
Show file tree
Hide file tree
Showing 8 changed files with 513 additions and 154 deletions.
179 changes: 126 additions & 53 deletions include/prism/enc/pm_encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,36 +8,50 @@
#include <stddef.h>
#include <stdint.h>

// This struct defines the functions necessary to implement the encoding
// interface so we can determine how many bytes the subsequent character takes.
// Each callback should return the number of bytes, or 0 if the next bytes are
// invalid for the encoding and type.
/**
* This struct defines the functions necessary to implement the encoding
* interface so we can determine how many bytes the subsequent character takes.
* The char_width, alpha_char, and alnum_char callbacks should return the number
* of bytes, or 0 if the next bytes are invalid for the encoding and type. The
* isupper_char callback returns a boolean instead of a byte count.
*/
typedef struct {
// Return the number of bytes that the next character takes if it is valid
// in the encoding. Does not read more than n bytes. It is assumed that n is
// at least 1.
/**
* Return the number of bytes that the next character takes if it is valid
* in the encoding. Does not read more than n bytes. It is assumed that n is
* at least 1.
*/
size_t (*char_width)(const uint8_t *b, ptrdiff_t n);

// Return the number of bytes that the next character takes if it is valid
// in the encoding and is alphabetical. Does not read more than n bytes. It
// is assumed that n is at least 1.
/**
* Return the number of bytes that the next character takes if it is valid
* in the encoding and is alphabetical. Does not read more than n bytes. It
* is assumed that n is at least 1.
*/
size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);

// Return the number of bytes that the next character takes if it is valid
// in the encoding and is alphanumeric. Does not read more than n bytes. It
// is assumed that n is at least 1.
/**
* Return the number of bytes that the next character takes if it is valid
* in the encoding and is alphanumeric. Does not read more than n bytes. It
* is assumed that n is at least 1.
*/
size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);

// Return true if the next character is valid in the encoding and is an
// uppercase character. Does not read more than n bytes. It is assumed that
// n is at least 1.
/**
* Return true if the next character is valid in the encoding and is an
* uppercase character. Does not read more than n bytes. It is assumed that
* n is at least 1.
*/
bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);

// The name of the encoding. This should correspond to a value that can be
// passed to Encoding.find in Ruby.
/**
* The name of the encoding. This should correspond to a value that can be
* passed to Encoding.find in Ruby.
*/
const char *name;

// Return true if the encoding is a multibyte encoding.
/**
* True if the encoding is a multibyte encoding. This is a plain flag, not a
* callback.
*/
bool multibyte;
} pm_encoding_t;

Expand All @@ -47,50 +61,109 @@ typedef struct {
// Bit masks for the alphanumeric and uppercase character classes. The
// replacement lists are parenthesized so that each macro acts as a single
// operand: without the parentheses, an adjacent operator that binds tighter
// than << (such as + or *) would change the value of the expression.
#define PRISM_ENCODING_ALPHANUMERIC_BIT (1 << 1)
#define PRISM_ENCODING_UPPERCASE_BIT (1 << 2)

// These functions are reused by some other encodings, so they are defined here
// so they can be shared.
/**
* Return the size of the next character in the ASCII encoding if it is an
* alphabetical character.
*
* @param b The bytes to read.
* @param n The number of bytes that can be read.
* @returns The number of bytes that the next character takes if it is valid in
* the encoding, or 0 if it is not.
*/
size_t pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);

/**
* Return the size of the next character in the ASCII encoding if it is an
* alphanumeric character.
*
* @param b The bytes to read.
* @param n The number of bytes that can be read.
* @returns The number of bytes that the next character takes if it is valid in
* the encoding, or 0 if it is not.
*/
size_t pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);

/**
* Return true if the next character in the ASCII encoding is an uppercase
* character.
*
* @param b The bytes to read.
* @param n The number of bytes that can be read.
* @returns True if the next character is valid in the encoding and is an
* uppercase character, or false if it is not.
*/
bool pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);

// These functions are shared between the actual encoding and the fast path in
// the parser so they need to be internally visible.
/**
* Return the size of the next character in the UTF-8 encoding if it is an
* alphabetical character.
*
* @param b The bytes to read.
* @param n The number of bytes that can be read.
* @returns The number of bytes that the next character takes if it is valid in
* the encoding, or 0 if it is not.
*/
size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);

/**
* Return the size of the next character in the UTF-8 encoding if it is an
* alphanumeric character.
*
* @param b The bytes to read.
* @param n The number of bytes that can be read.
* @returns The number of bytes that the next character takes if it is valid in
* the encoding, or 0 if it is not.
*/
size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);

/**
* Return true if the next character in the UTF-8 encoding is an uppercase
* character.
*
* @param b The bytes to read.
* @param n The number of bytes that can be read.
* @returns True if the next character is valid in the encoding and is an
* uppercase character, or false if it is not.
*/
bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);

// This lookup table is referenced in both the UTF-8 encoding file and the
// parser directly in order to speed up the default encoding processing.
/**
* This lookup table is referenced in both the UTF-8 encoding file and the
* parser directly in order to speed up the default encoding processing. It is
* used to indicate whether a character is alphabetical, alphanumeric, or
* uppercase in unicode mappings.
*/
extern const uint8_t pm_encoding_unicode_table[256];

// These are the encodings that are supported by the parser. They are defined in
// Below are the encodings that are supported by the parser. They are defined in
// their own files in the src/enc directory.
extern pm_encoding_t pm_encoding_ascii;
extern pm_encoding_t pm_encoding_ascii_8bit;
extern pm_encoding_t pm_encoding_big5;
extern pm_encoding_t pm_encoding_euc_jp;
extern pm_encoding_t pm_encoding_gbk;
extern pm_encoding_t pm_encoding_iso_8859_1;
extern pm_encoding_t pm_encoding_iso_8859_2;
extern pm_encoding_t pm_encoding_iso_8859_3;
extern pm_encoding_t pm_encoding_iso_8859_4;
extern pm_encoding_t pm_encoding_iso_8859_5;
extern pm_encoding_t pm_encoding_iso_8859_6;
extern pm_encoding_t pm_encoding_iso_8859_7;
extern pm_encoding_t pm_encoding_iso_8859_8;
extern pm_encoding_t pm_encoding_iso_8859_9;
extern pm_encoding_t pm_encoding_iso_8859_10;
extern pm_encoding_t pm_encoding_iso_8859_11;
extern pm_encoding_t pm_encoding_iso_8859_13;
extern pm_encoding_t pm_encoding_iso_8859_14;
extern pm_encoding_t pm_encoding_iso_8859_15;
extern pm_encoding_t pm_encoding_iso_8859_16;
extern pm_encoding_t pm_encoding_koi8_r;
extern pm_encoding_t pm_encoding_shift_jis;
extern pm_encoding_t pm_encoding_utf_8;
extern pm_encoding_t pm_encoding_utf8_mac;
extern pm_encoding_t pm_encoding_windows_31j;
extern pm_encoding_t pm_encoding_windows_1251;
extern pm_encoding_t pm_encoding_windows_1252;

// Read-only encoding objects. The storage-class specifier is written first
// (extern const, not const extern) because placing it anywhere else in the
// declaration specifiers is an obsolescent feature of the C standard.
extern const pm_encoding_t pm_encoding_ascii;
extern const pm_encoding_t pm_encoding_ascii_8bit;
extern const pm_encoding_t pm_encoding_big5;
extern const pm_encoding_t pm_encoding_euc_jp;
extern const pm_encoding_t pm_encoding_gbk;
extern const pm_encoding_t pm_encoding_iso_8859_1;
extern const pm_encoding_t pm_encoding_iso_8859_2;
extern const pm_encoding_t pm_encoding_iso_8859_3;
extern const pm_encoding_t pm_encoding_iso_8859_4;
extern const pm_encoding_t pm_encoding_iso_8859_5;
extern const pm_encoding_t pm_encoding_iso_8859_6;
extern const pm_encoding_t pm_encoding_iso_8859_7;
extern const pm_encoding_t pm_encoding_iso_8859_8;
extern const pm_encoding_t pm_encoding_iso_8859_9;
extern const pm_encoding_t pm_encoding_iso_8859_10;
extern const pm_encoding_t pm_encoding_iso_8859_11;
extern const pm_encoding_t pm_encoding_iso_8859_13;
extern const pm_encoding_t pm_encoding_iso_8859_14;
extern const pm_encoding_t pm_encoding_iso_8859_15;
extern const pm_encoding_t pm_encoding_iso_8859_16;
extern const pm_encoding_t pm_encoding_koi8_r;
extern const pm_encoding_t pm_encoding_shift_jis;
extern const pm_encoding_t pm_encoding_utf_8;
extern const pm_encoding_t pm_encoding_utf8_mac;
extern const pm_encoding_t pm_encoding_windows_31j;
extern const pm_encoding_t pm_encoding_windows_1251;
extern const pm_encoding_t pm_encoding_windows_1252;

#endif
3 changes: 2 additions & 1 deletion src/enc/pm_big5.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ pm_encoding_big5_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}

pm_encoding_t pm_encoding_big5 = {
/** Big5 encoding */
const pm_encoding_t pm_encoding_big5 = {
.name = "big5",
.char_width = pm_encoding_big5_char_width,
.alnum_char = pm_encoding_big5_alnum_char,
Expand Down
3 changes: 2 additions & 1 deletion src/enc/pm_euc_jp.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}

pm_encoding_t pm_encoding_euc_jp = {
/** EUC-JP encoding */
const pm_encoding_t pm_encoding_euc_jp = {
.name = "euc-jp",
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_euc_jp_alnum_char,
Expand Down
3 changes: 2 additions & 1 deletion src/enc/pm_gbk.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ pm_encoding_gbk_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}

pm_encoding_t pm_encoding_gbk = {
/** GBK encoding */
const pm_encoding_t pm_encoding_gbk = {
.name = "gbk",
.char_width = pm_encoding_gbk_char_width,
.alnum_char = pm_encoding_gbk_alnum_char,
Expand Down
3 changes: 2 additions & 1 deletion src/enc/pm_shift_jis.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}

pm_encoding_t pm_encoding_shift_jis = {
/** Shift_JIS encoding */
const pm_encoding_t pm_encoding_shift_jis = {
.name = "shift_jis",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_shift_jis_alnum_char,
Expand Down
Loading

0 comments on commit 52a0d80

Please sign in to comment.