Documentation for the encodings

ruby · Nov 1, 2023 · 52a0d80 · 52a0d80
1 parent 2693426
commit 52a0d80
Show file tree

Hide file tree

Showing 8 changed files with 513 additions and 154 deletions.
diff --git a/include/prism/enc/pm_encoding.h b/include/prism/enc/pm_encoding.h
@@ -8,36 +8,50 @@
 #include <stddef.h>
 #include <stdint.h>
 
-// This struct defines the functions necessary to implement the encoding
-// interface so we can determine how many bytes the subsequent character takes.
-// Each callback should return the number of bytes, or 0 if the next bytes are
-// invalid for the encoding and type.
+/**
+ * This struct defines the functions necessary to implement the encoding
+ * interface so we can determine how many bytes the subsequent character takes.
+ * Each size_t callback should return the number of bytes, or 0 if the next
+ * bytes are invalid for the encoding and type.
+ */
 typedef struct {
-    // Return the number of bytes that the next character takes if it is valid
-    // in the encoding. Does not read more than n bytes. It is assumed that n is
-    // at least 1.
+    /**
+     * Return the number of bytes that the next character takes if it is valid
+     * in the encoding. Does not read more than n bytes. It is assumed that n is
+     * at least 1.
+     */
     size_t (*char_width)(const uint8_t *b, ptrdiff_t n);
 
-    // Return the number of bytes that the next character takes if it is valid
-    // in the encoding and is alphabetical. Does not read more than n bytes. It
-    // is assumed that n is at least 1.
+    /**
+     * Return the number of bytes that the next character takes if it is valid
+     * in the encoding and is alphabetical. Does not read more than n bytes. It
+     * is assumed that n is at least 1.
+     */
     size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);
 
-    // Return the number of bytes that the next character takes if it is valid
-    // in the encoding and is alphanumeric. Does not read more than n bytes. It
-    // is assumed that n is at least 1.
+    /**
+     * Return the number of bytes that the next character takes if it is valid
+     * in the encoding and is alphanumeric. Does not read more than n bytes. It
+     * is assumed that n is at least 1.
+     */
     size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);
 
-    // Return true if the next character is valid in the encoding and is an
-    // uppercase character. Does not read more than n bytes. It is assumed that
-    // n is at least 1.
+    /**
+     * Return true if the next character is valid in the encoding and is an
+     * uppercase character. Does not read more than n bytes. It is assumed that
+     * n is at least 1.
+     */
     bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);
 
-    // The name of the encoding. This should correspond to a value that can be
-    // passed to Encoding.find in Ruby.
+    /**
+     * The name of the encoding. This should correspond to a value that can be
+     * passed to Encoding.find in Ruby.
+     */
     const char *name;
 
-    // Return true if the encoding is a multibyte encoding.
+    /**
+     * True if the encoding is a multibyte encoding, false otherwise.
+     */
     bool multibyte;
 } pm_encoding_t;
 
@@ -47,50 +61,109 @@ typedef struct {
 #define PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1
 #define PRISM_ENCODING_UPPERCASE_BIT 1 << 2
 
-// These functions are reused by some other encodings, so they are defined here
-// so they can be shared.
+/**
+ * Return the size of the next character in the ASCII encoding if it is an
+ * alphabetical character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns The number of bytes that the next character takes if it is a valid
+ *     alphabetical character in the encoding, or 0 if it is not.
+ */
 size_t pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
+
+/**
+ * Return the size of the next character in the ASCII encoding if it is an
+ * alphanumeric character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns The number of bytes that the next character takes if it is a valid
+ *     alphanumeric character in the encoding, or 0 if it is not.
+ */
 size_t pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
+
+/**
+ * Return true if the next character in the ASCII encoding is an uppercase
+ * character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns True if the next character is valid in the encoding and is an
+ *     uppercase character, or false if it is not.
+ */
 bool pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
 
-// These functions are shared between the actual encoding and the fast path in
-// the parser so they need to be internally visible.
+/**
+ * Return the size of the next character in the UTF-8 encoding if it is an
+ * alphabetical character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns The number of bytes that the next character takes if it is a valid
+ *     alphabetical character in the encoding, or 0 if it is not.
+ */
 size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
+
+/**
+ * Return the size of the next character in the UTF-8 encoding if it is an
+ * alphanumeric character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns The number of bytes that the next character takes if it is a valid
+ *     alphanumeric character in the encoding, or 0 if it is not.
+ */
 size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
+
+/**
+ * Return true if the next character in the UTF-8 encoding is an uppercase
+ * character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns True if the next character is valid in the encoding and is an
+ *     uppercase character, or false if it is not.
+ */
 bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
 
-// This lookup table is referenced in both the UTF-8 encoding file and the
-// parser directly in order to speed up the default encoding processing.
+/**
+ * This lookup table is referenced in both the UTF-8 encoding file and the
+ * parser directly in order to speed up the default encoding processing. It is
+ * used to indicate whether a character is alphabetical, alphanumeric, or
+ * uppercase in Unicode mappings.
+ */
 extern const uint8_t pm_encoding_unicode_table[256];
 
-// These are the encodings that are supported by the parser. They are defined in
+// Below are the encodings that are supported by the parser. They are defined in
 // their own files in the src/enc directory.
-extern pm_encoding_t pm_encoding_ascii;
-extern pm_encoding_t pm_encoding_ascii_8bit;
-extern pm_encoding_t pm_encoding_big5;
-extern pm_encoding_t pm_encoding_euc_jp;
-extern pm_encoding_t pm_encoding_gbk;
-extern pm_encoding_t pm_encoding_iso_8859_1;
-extern pm_encoding_t pm_encoding_iso_8859_2;
-extern pm_encoding_t pm_encoding_iso_8859_3;
-extern pm_encoding_t pm_encoding_iso_8859_4;
-extern pm_encoding_t pm_encoding_iso_8859_5;
-extern pm_encoding_t pm_encoding_iso_8859_6;
-extern pm_encoding_t pm_encoding_iso_8859_7;
-extern pm_encoding_t pm_encoding_iso_8859_8;
-extern pm_encoding_t pm_encoding_iso_8859_9;
-extern pm_encoding_t pm_encoding_iso_8859_10;
-extern pm_encoding_t pm_encoding_iso_8859_11;
-extern pm_encoding_t pm_encoding_iso_8859_13;
-extern pm_encoding_t pm_encoding_iso_8859_14;
-extern pm_encoding_t pm_encoding_iso_8859_15;
-extern pm_encoding_t pm_encoding_iso_8859_16;
-extern pm_encoding_t pm_encoding_koi8_r;
-extern pm_encoding_t pm_encoding_shift_jis;
-extern pm_encoding_t pm_encoding_utf_8;
-extern pm_encoding_t pm_encoding_utf8_mac;
-extern pm_encoding_t pm_encoding_windows_31j;
-extern pm_encoding_t pm_encoding_windows_1251;
-extern pm_encoding_t pm_encoding_windows_1252;
+
+const extern pm_encoding_t pm_encoding_ascii;
+const extern pm_encoding_t pm_encoding_ascii_8bit;
+const extern pm_encoding_t pm_encoding_big5;
+const extern pm_encoding_t pm_encoding_euc_jp;
+const extern pm_encoding_t pm_encoding_gbk;
+const extern pm_encoding_t pm_encoding_iso_8859_1;
+const extern pm_encoding_t pm_encoding_iso_8859_2;
+const extern pm_encoding_t pm_encoding_iso_8859_3;
+const extern pm_encoding_t pm_encoding_iso_8859_4;
+const extern pm_encoding_t pm_encoding_iso_8859_5;
+const extern pm_encoding_t pm_encoding_iso_8859_6;
+const extern pm_encoding_t pm_encoding_iso_8859_7;
+const extern pm_encoding_t pm_encoding_iso_8859_8;
+const extern pm_encoding_t pm_encoding_iso_8859_9;
+const extern pm_encoding_t pm_encoding_iso_8859_10;
+const extern pm_encoding_t pm_encoding_iso_8859_11;
+const extern pm_encoding_t pm_encoding_iso_8859_13;
+const extern pm_encoding_t pm_encoding_iso_8859_14;
+const extern pm_encoding_t pm_encoding_iso_8859_15;
+const extern pm_encoding_t pm_encoding_iso_8859_16;
+const extern pm_encoding_t pm_encoding_koi8_r;
+const extern pm_encoding_t pm_encoding_shift_jis;
+const extern pm_encoding_t pm_encoding_utf_8;
+const extern pm_encoding_t pm_encoding_utf8_mac;
+const extern pm_encoding_t pm_encoding_windows_31j;
+const extern pm_encoding_t pm_encoding_windows_1251;
+const extern pm_encoding_t pm_encoding_windows_1252;
 
 #endif
diff --git a/src/enc/pm_big5.c b/src/enc/pm_big5.c
@@ -42,7 +42,8 @@ pm_encoding_big5_isupper_char(const uint8_t *b, ptrdiff_t n) {
     }
 }
 
-pm_encoding_t pm_encoding_big5 = {
+/** Big5 encoding */
+const pm_encoding_t pm_encoding_big5 = {
     .name = "big5",
     .char_width = pm_encoding_big5_char_width,
     .alnum_char = pm_encoding_big5_alnum_char,

diff --git a/src/enc/pm_euc_jp.c b/src/enc/pm_euc_jp.c
@@ -48,7 +48,8 @@ pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
     }
 }
 
-pm_encoding_t pm_encoding_euc_jp = {
+/** EUC-JP encoding */
+const pm_encoding_t pm_encoding_euc_jp = {
     .name = "euc-jp",
     .char_width = pm_encoding_euc_jp_char_width,
     .alnum_char = pm_encoding_euc_jp_alnum_char,

diff --git a/src/enc/pm_gbk.c b/src/enc/pm_gbk.c
@@ -51,7 +51,8 @@ pm_encoding_gbk_isupper_char(const uint8_t *b, ptrdiff_t n) {
     }
 }
 
-pm_encoding_t pm_encoding_gbk = {
+/** GBK encoding */
+const pm_encoding_t pm_encoding_gbk = {
     .name = "gbk",
     .char_width = pm_encoding_gbk_char_width,
     .alnum_char = pm_encoding_gbk_alnum_char,

diff --git a/src/enc/pm_shift_jis.c b/src/enc/pm_shift_jis.c
@@ -46,7 +46,8 @@ pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
     }
 }
 
-pm_encoding_t pm_encoding_shift_jis = {
+/** Shift_JIS encoding */
+const pm_encoding_t pm_encoding_shift_jis = {
     .name = "shift_jis",
     .char_width = pm_encoding_shift_jis_char_width,
     .alnum_char = pm_encoding_shift_jis_alnum_char,