Ruby 3.3.5p100 (2024-09-03 revision ef084cc8f4958c1b6e4ead99136631bef6d8ddba)
encoding.h File Reference

The encoding interface and implementations used by the parser. More...

#include "prism/defines.h"
#include "prism/util/pm_strncasecmp.h"
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

Go to the source code of this file.

Data Structures

struct  pm_encoding_t
 This struct defines the functions necessary to implement the encoding interface so we can determine how many bytes the subsequent character takes. More...
 

Macros

#define PRISM_ENCODING_ALPHABETIC_BIT   1 << 0
 All of the lookup tables use the first bit of each embedded byte to indicate whether the codepoint is alphabetical.
 
#define PRISM_ENCODING_ALPHANUMERIC_BIT   1 << 1
 All of the lookup tables use the second bit of each embedded byte to indicate whether the codepoint is alphanumeric.
 
#define PRISM_ENCODING_UPPERCASE_BIT   1 << 2
 All of the lookup tables use the third bit of each embedded byte to indicate whether the codepoint is uppercase.
 
#define PM_ENCODING_UTF_8_ENTRY   (&pm_encodings[PM_ENCODING_UTF_8])
 This is the default UTF-8 encoding.
 
#define PM_ENCODING_US_ASCII_ENTRY   (&pm_encodings[PM_ENCODING_US_ASCII])
 This is the US-ASCII encoding.
 

Enumerations

enum  pm_encoding_type_t {
  PM_ENCODING_UTF_8 = 0 , PM_ENCODING_ASCII_8BIT , PM_ENCODING_BIG5 , PM_ENCODING_BIG5_HKSCS ,
  PM_ENCODING_BIG5_UAO , PM_ENCODING_CESU_8 , PM_ENCODING_CP51932 , PM_ENCODING_CP850 ,
  PM_ENCODING_CP852 , PM_ENCODING_CP855 , PM_ENCODING_CP949 , PM_ENCODING_CP950 ,
  PM_ENCODING_CP951 , PM_ENCODING_EMACS_MULE , PM_ENCODING_EUC_JP , PM_ENCODING_EUC_JP_MS ,
  PM_ENCODING_EUC_JIS_2004 , PM_ENCODING_EUC_KR , PM_ENCODING_EUC_TW , PM_ENCODING_GB12345 ,
  PM_ENCODING_GB18030 , PM_ENCODING_GB1988 , PM_ENCODING_GB2312 , PM_ENCODING_GBK ,
  PM_ENCODING_IBM437 , PM_ENCODING_IBM720 , PM_ENCODING_IBM737 , PM_ENCODING_IBM775 ,
  PM_ENCODING_IBM852 , PM_ENCODING_IBM855 , PM_ENCODING_IBM857 , PM_ENCODING_IBM860 ,
  PM_ENCODING_IBM861 , PM_ENCODING_IBM862 , PM_ENCODING_IBM863 , PM_ENCODING_IBM864 ,
  PM_ENCODING_IBM865 , PM_ENCODING_IBM866 , PM_ENCODING_IBM869 , PM_ENCODING_ISO_8859_1 ,
  PM_ENCODING_ISO_8859_2 , PM_ENCODING_ISO_8859_3 , PM_ENCODING_ISO_8859_4 , PM_ENCODING_ISO_8859_5 ,
  PM_ENCODING_ISO_8859_6 , PM_ENCODING_ISO_8859_7 , PM_ENCODING_ISO_8859_8 , PM_ENCODING_ISO_8859_9 ,
  PM_ENCODING_ISO_8859_10 , PM_ENCODING_ISO_8859_11 , PM_ENCODING_ISO_8859_13 , PM_ENCODING_ISO_8859_14 ,
  PM_ENCODING_ISO_8859_15 , PM_ENCODING_ISO_8859_16 , PM_ENCODING_KOI8_R , PM_ENCODING_KOI8_U ,
  PM_ENCODING_MAC_CENT_EURO , PM_ENCODING_MAC_CROATIAN , PM_ENCODING_MAC_CYRILLIC , PM_ENCODING_MAC_GREEK ,
  PM_ENCODING_MAC_ICELAND , PM_ENCODING_MAC_JAPANESE , PM_ENCODING_MAC_ROMAN , PM_ENCODING_MAC_ROMANIA ,
  PM_ENCODING_MAC_THAI , PM_ENCODING_MAC_TURKISH , PM_ENCODING_MAC_UKRAINE , PM_ENCODING_SHIFT_JIS ,
  PM_ENCODING_SJIS_DOCOMO , PM_ENCODING_SJIS_KDDI , PM_ENCODING_SJIS_SOFTBANK , PM_ENCODING_STATELESS_ISO_2022_JP ,
  PM_ENCODING_STATELESS_ISO_2022_JP_KDDI , PM_ENCODING_TIS_620 , PM_ENCODING_US_ASCII , PM_ENCODING_UTF8_MAC ,
  PM_ENCODING_UTF8_DOCOMO , PM_ENCODING_UTF8_KDDI , PM_ENCODING_UTF8_SOFTBANK , PM_ENCODING_WINDOWS_1250 ,
  PM_ENCODING_WINDOWS_1251 , PM_ENCODING_WINDOWS_1252 , PM_ENCODING_WINDOWS_1253 , PM_ENCODING_WINDOWS_1254 ,
  PM_ENCODING_WINDOWS_1255 , PM_ENCODING_WINDOWS_1256 , PM_ENCODING_WINDOWS_1257 , PM_ENCODING_WINDOWS_1258 ,
  PM_ENCODING_WINDOWS_31J , PM_ENCODING_WINDOWS_874 , PM_ENCODING_MAXIMUM
}
 These are all of the encodings that prism supports. More...
 

Functions

size_t pm_encoding_utf_8_alpha_char (const uint8_t *b, ptrdiff_t n)
 Return the size of the next character in the UTF-8 encoding if it is an alphabetical character.
 
size_t pm_encoding_utf_8_alnum_char (const uint8_t *b, ptrdiff_t n)
 Return the size of the next character in the UTF-8 encoding if it is an alphanumeric character.
 
bool pm_encoding_utf_8_isupper_char (const uint8_t *b, ptrdiff_t n)
 Return true if the next character in the UTF-8 encoding if it is an uppercase character.
 
const pm_encoding_tpm_encoding_find (const uint8_t *start, const uint8_t *end)
 Parse the given name of an encoding and return a pointer to the corresponding encoding struct if one can be found, otherwise return NULL.
 

Variables

const uint8_t pm_encoding_unicode_table [256]
 This lookup table is referenced in both the UTF-8 encoding file and the parser directly in order to speed up the default encoding processing.
 
const pm_encoding_t pm_encodings [PM_ENCODING_MAXIMUM]
 This is the table of all of the encodings that prism supports.
 

Detailed Description

The encoding interface and implementations used by the parser.

Definition in file encoding.h.

Macro Definition Documentation

◆ PM_ENCODING_US_ASCII_ENTRY

#define PM_ENCODING_US_ASCII_ENTRY   (&pm_encodings[PM_ENCODING_US_ASCII])

This is the US-ASCII encoding.

We need a reference to it to be able to compare against it when a string is being created because it could possibly need to fall back to ASCII-8BIT.

Definition at line 236 of file encoding.h.

◆ PM_ENCODING_UTF_8_ENTRY

#define PM_ENCODING_UTF_8_ENTRY   (&pm_encodings[PM_ENCODING_UTF_8])

This is the default UTF-8 encoding.

We need a reference to it to quickly create parsers.

Definition at line 229 of file encoding.h.

Referenced by pm_encoding_find(), and pm_parser_init().

◆ PRISM_ENCODING_ALPHABETIC_BIT

#define PRISM_ENCODING_ALPHABETIC_BIT   1 << 0

All of the lookup tables use the first bit of each embedded byte to indicate whether the codepoint is alphabetical.

Definition at line 68 of file encoding.h.

Referenced by pm_encoding_utf_8_alpha_char().

◆ PRISM_ENCODING_ALPHANUMERIC_BIT

#define PRISM_ENCODING_ALPHANUMERIC_BIT   1 << 1

All of the lookup tables use the second bit of each embedded byte to indicate whether the codepoint is alphanumeric.

Definition at line 74 of file encoding.h.

Referenced by pm_encoding_utf_8_alnum_char().

◆ PRISM_ENCODING_UPPERCASE_BIT

#define PRISM_ENCODING_UPPERCASE_BIT   1 << 2

All of the lookup tables use the third bit of each embedded byte to indicate whether the codepoint is uppercase.

Definition at line 80 of file encoding.h.

Referenced by pm_encoding_utf_8_isupper_char().

Enumeration Type Documentation

◆ pm_encoding_type_t

These are all of the encodings that prism supports.

Definition at line 126 of file encoding.h.

Function Documentation

◆ pm_encoding_find()

const pm_encoding_t * pm_encoding_find ( const uint8_t * start,
const uint8_t * end )

Parse the given name of an encoding and return a pointer to the corresponding encoding struct if one can be found, otherwise return NULL.

Parameters
startA pointer to the first byte of the name.
endA pointer to the last byte of the name.
Returns
A pointer to the encoding struct if one is found, otherwise NULL.

Definition at line 4945 of file encoding.c.

◆ pm_encoding_utf_8_alnum_char()

size_t pm_encoding_utf_8_alnum_char ( const uint8_t * b,
ptrdiff_t n )

Return the size of the next character in the UTF-8 encoding if it is an alphanumeric character.

Parameters
bThe bytes to read.
nThe number of bytes that can be read.
Returns
The number of bytes that the next character takes if it is valid in the encoding, or 0 if it is not.

Definition at line 2312 of file encoding.c.

◆ pm_encoding_utf_8_alpha_char()

size_t pm_encoding_utf_8_alpha_char ( const uint8_t * b,
ptrdiff_t n )

Return the size of the next character in the UTF-8 encoding if it is an alphabetical character.

Parameters
bThe bytes to read.
nThe number of bytes that can be read.
Returns
The number of bytes that the next character takes if it is valid in the encoding, or 0 if it is not.

Definition at line 2292 of file encoding.c.

◆ pm_encoding_utf_8_isupper_char()

bool pm_encoding_utf_8_isupper_char ( const uint8_t * b,
ptrdiff_t n )

Return true if the next character in the UTF-8 encoding if it is an uppercase character.

Parameters
bThe bytes to read.
nThe number of bytes that can be read.
Returns
True if the next character is valid in the encoding and is an uppercase character, or false if it is not.

Definition at line 2332 of file encoding.c.

Variable Documentation

◆ pm_encoding_unicode_table

const uint8_t pm_encoding_unicode_table[256]
extern

This lookup table is referenced in both the UTF-8 encoding file and the parser directly in order to speed up the default encoding processing.

It is used to indicate whether a character is alphabetical, alphanumeric, or uppercase in unicode mappings.

This lookup table is referenced in both the UTF-8 encoding file and the parser directly in order to speed up the default encoding processing.

Note that this table is different from other encodings where we used a lookup table because the indices of those tables are the byte representations, not the codepoints themselves.

Definition at line 2161 of file encoding.c.

◆ pm_encodings

const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM]
extern

This is the table of all of the encodings that prism supports.

Definition at line 4217 of file encoding.c.