encoding

Name

encoding -- 

Synopsis



enum        xmlCharEncoding;
int         (*xmlCharEncodingInputFunc)     (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen);
int         (*xmlCharEncodingOutputFunc)    (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen);
struct      xmlCharEncodingHandler;
typedef     xmlCharEncodingHandlerPtr;
void        xmlInitCharEncodingHandlers     (void);
void        xmlCleanupCharEncodingHandlers  (void);
void        xmlRegisterCharEncodingHandler  (xmlCharEncodingHandlerPtr handler);
xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler
                                            (xmlCharEncoding enc);
xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler
                                            (const char *name);
int         xmlAddEncodingAlias             (const char *name,
                                             const char *alias);
int         xmlDelEncodingAlias             (const char *alias);
const char* xmlGetEncodingAlias             (const char *alias);
void        xmlCleanupEncodingAliases       (void);
xmlCharEncoding xmlParseCharEncoding        (const char *name);
const char* xmlGetCharEncodingName          (xmlCharEncoding enc);
xmlCharEncoding xmlDetectCharEncoding       (unsigned char *in,
                                             int len);
int         xmlCharEncOutFunc               (xmlCharEncodingHandler *handler,
                                             xmlBufferPtr out,
                                             xmlBufferPtr in);
int         xmlCharEncInFunc                (xmlCharEncodingHandler *handler,
                                             xmlBufferPtr out,
                                             xmlBufferPtr in);
int         xmlCharEncFirstLine             (xmlCharEncodingHandler *handler,
                                             xmlBufferPtr out,
                                             xmlBufferPtr in);
int         xmlCharEncCloseFunc             (xmlCharEncodingHandler *handler);
int         UTF8Toisolat1                   (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen);
int         isolat1ToUTF8                   (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen);
int         xmlCheckUTF8                    (unsigned char *utf);
int         xmlUTF8Strsize                  (const xmlChar *utf,
                                             int len);
xmlChar*    xmlUTF8Strndup                  (const xmlChar *utf,
                                             int len);
xmlChar*    xmlUTF8Strpos                   (const xmlChar *utf,
                                             int pos);
int         xmlUTF8Strloc                   (const xmlChar *utf,
                                             const xmlChar *utfchar);
xmlChar*    xmlUTF8Strsub                   (const xmlChar *utf,
                                             int start,
                                             int len);
int         xmlUTF8Strlen                   (const xmlChar *utf);

Description

Details

enum xmlCharEncoding

typedef enum {
    XML_CHAR_ENCODING_ERROR=   -1, /* No char encoding detected */
    XML_CHAR_ENCODING_NONE=	0, /* No char encoding detected */
    XML_CHAR_ENCODING_UTF8=	1, /* UTF-8 */
    XML_CHAR_ENCODING_UTF16LE=	2, /* UTF-16 little endian */
    XML_CHAR_ENCODING_UTF16BE=	3, /* UTF-16 big endian */
    XML_CHAR_ENCODING_UCS4LE=	4, /* UCS-4 little endian */
    XML_CHAR_ENCODING_UCS4BE=	5, /* UCS-4 big endian */
    XML_CHAR_ENCODING_EBCDIC=	6, /* EBCDIC uh! */
    XML_CHAR_ENCODING_UCS4_2143=7, /* UCS-4 unusual ordering */
    XML_CHAR_ENCODING_UCS4_3412=8, /* UCS-4 unusual ordering */
    XML_CHAR_ENCODING_UCS2=	9, /* UCS-2 */
    XML_CHAR_ENCODING_8859_1=	10,/* ISO-8859-1 ISO Latin 1 */
    XML_CHAR_ENCODING_8859_2=	11,/* ISO-8859-2 ISO Latin 2 */
    XML_CHAR_ENCODING_8859_3=	12,/* ISO-8859-3 */
    XML_CHAR_ENCODING_8859_4=	13,/* ISO-8859-4 */
    XML_CHAR_ENCODING_8859_5=	14,/* ISO-8859-5 */
    XML_CHAR_ENCODING_8859_6=	15,/* ISO-8859-6 */
    XML_CHAR_ENCODING_8859_7=	16,/* ISO-8859-7 */
    XML_CHAR_ENCODING_8859_8=	17,/* ISO-8859-8 */
    XML_CHAR_ENCODING_8859_9=	18,/* ISO-8859-9 */
    XML_CHAR_ENCODING_2022_JP=  19,/* ISO-2022-JP */
    XML_CHAR_ENCODING_SHIFT_JIS=20,/* Shift_JIS */
    XML_CHAR_ENCODING_EUC_JP=   21,/* EUC-JP */
    XML_CHAR_ENCODING_ASCII=    22 /* pure ASCII */
} xmlCharEncoding;

Predefined values for some standard encodings. Libxml doesn't do beforehand translation on UTF8, ISOLatinX. It also supports UTF16 (LE and BE) by default.

Anything else would have to be translated to UTF8 before being given to the parser itself. The BOM for UTF16 and the encoding declaration are looked at and a converter is looked for at that point. If not found the parser stops here as asked by the XML REC. Converters can be registered by the user using xmlRegisterCharEncodingHandler but the current form doesn't allow stateful transcoding (a serious problem agreed!). If iconv has been found it will be used automatically and allow stateful transcoding; the simplest is then to be sure to enable iconv and to provide iconv libs for the encoding support needed.


xmlCharEncodingInputFunc ()

int         (*xmlCharEncodingInputFunc)     (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen);

Take a block of chars in the original encoding and try to convert it to an UTF-8 block of chars out.

out : a pointer to an array of bytes to store the UTF-8 result
outlen : the length of out
in : a pointer to an array of chars in the original encoding
inlen : the length of in
Returns :the number of bytes written, or -1 for lack of space, or -2 if the transcoding failed. The value of inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of outlen after return is the number of octets produced.


xmlCharEncodingOutputFunc ()

int         (*xmlCharEncodingOutputFunc)    (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen);

Take a block of UTF-8 chars in and try to convert it to another encoding. Note: a first call designed to produce heading info is called with in = NULL. If stateful this should also initialize the encoder state.

out : a pointer to an array of bytes to store the result
outlen : the length of out
in : a pointer to an array of UTF-8 chars
inlen : the length of in
Returns :the number of bytes written, or -1 for lack of space, or -2 if the transcoding failed. The value of inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of outlen after return is the number of octets produced.


struct xmlCharEncodingHandler

struct xmlCharEncodingHandler {
    char                       *name;
    xmlCharEncodingInputFunc   input;
    xmlCharEncodingOutputFunc  output;
#ifdef LIBXML_ICONV_ENABLED
    iconv_t                    iconv_in;
    iconv_t                    iconv_out;
#endif /* LIBXML_ICONV_ENABLED */
};


xmlCharEncodingHandlerPtr

typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr;


xmlInitCharEncodingHandlers ()

void        xmlInitCharEncodingHandlers     (void);

Initialize the char encoding support; it registers the default supported encodings. NOTE: while public, this function usually doesn't need to be called in normal processing.


xmlCleanupCharEncodingHandlers ()

void        xmlCleanupCharEncodingHandlers  (void);

Cleanup the memory allocated for the char encoding support, it unregisters all the encoding handlers and the aliases.


xmlRegisterCharEncodingHandler ()

void        xmlRegisterCharEncodingHandler  (xmlCharEncodingHandlerPtr handler);

Register the char encoding handler, surprising, isn't it ?

handler : the xmlCharEncodingHandlerPtr handler block


xmlGetCharEncodingHandler ()

xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler
                                            (xmlCharEncoding enc);

Search in the registered set the handler able to read/write that encoding.

enc : an xmlCharEncoding value.
Returns :the handler or NULL if not found


xmlFindCharEncodingHandler ()

xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler
                                            (const char *name);

Search in the registered set the handler able to read/write that encoding.

name : a string describing the char encoding.
Returns :the handler or NULL if not found


xmlAddEncodingAlias ()

int         xmlAddEncodingAlias             (const char *name,
                                             const char *alias);

Registers an alias alias for an encoding named name. An existing alias will be overwritten.

name : the encoding name as parsed, in UTF-8 format (ASCII actually)
alias : the alias name as parsed, in UTF-8 format (ASCII actually)
Returns :0 in case of success, -1 in case of error


xmlDelEncodingAlias ()

int         xmlDelEncodingAlias             (const char *alias);

Unregisters an encoding alias alias

alias : the alias name as parsed, in UTF-8 format (ASCII actually)
Returns :0 in case of success, -1 in case of error


xmlGetEncodingAlias ()

const char* xmlGetEncodingAlias             (const char *alias);

Lookup an encoding name for the given alias.

alias : the alias name as parsed, in UTF-8 format (ASCII actually)
Returns :NULL if not found, the original name otherwise


xmlCleanupEncodingAliases ()

void        xmlCleanupEncodingAliases       (void);

Unregisters all aliases


xmlParseCharEncoding ()

xmlCharEncoding xmlParseCharEncoding        (const char *name);

Compare the string to the encoding schemes already known. Note that the comparison is case insensitive, according to section [XML] 4.3.3 Character Encoding in Entities.

name : the encoding name as parsed, in UTF-8 format (ASCII actually)
Returns :one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE if not recognized.


xmlGetCharEncodingName ()

const char* xmlGetCharEncodingName          (xmlCharEncoding enc);

The "canonical" name for XML encoding. C.f. http://www.w3.org/TR/REC-xml#charencoding Section 4.3.3 Character Encoding in Entities

enc : the encoding
Returns :the canonical name for the given encoding


xmlDetectCharEncoding ()

xmlCharEncoding xmlDetectCharEncoding       (unsigned char *in,
                                             int len);

Guess the encoding of the entity using the first bytes of the entity content according to the non-normative appendix F of the XML-1.0 recommendation.

in : a pointer to the first bytes of the XML entity, must be at least 4 bytes long.
len : the length of the buffer
Returns :one of the XML_CHAR_ENCODING_... values.


xmlCharEncOutFunc ()

int         xmlCharEncOutFunc               (xmlCharEncodingHandler *handler,
                                             xmlBufferPtr out,
                                             xmlBufferPtr in);

Generic front-end for the encoding handler output function. A first call with in == NULL has to be made first to initiate the output in case of non-stateless encodings needing to initiate their state or the output (like the BOM in UTF16). In case of UTF8 sequence conversion errors for the given encoder, the content will be automatically remapped to a CharRef sequence.

handler : char encoding transformation data structure
out : an xmlBuffer for the output.
in : an xmlBuffer for the input
Returns :the number of bytes written if success, or -1 in case of general error, or -2 if the transcoding fails (because *in is not a valid UTF-8 string or the result of the transformation can't fit into the encoding we want)


xmlCharEncInFunc ()

int         xmlCharEncInFunc                (xmlCharEncodingHandler *handler,
                                             xmlBufferPtr out,
                                             xmlBufferPtr in);

Generic front-end for the encoding handler input function

handler : char encoding transformation data structure
out : an xmlBuffer for the output.
in : an xmlBuffer for the input
Returns :the number of bytes written if success, or -1 in case of general error, or -2 if the transcoding fails (because *in is not a valid UTF-8 string or the result of the transformation can't fit into the encoding we want)


xmlCharEncFirstLine ()

int         xmlCharEncFirstLine             (xmlCharEncodingHandler *handler,
                                             xmlBufferPtr out,
                                             xmlBufferPtr in);

Front-end for the encoding handler input function, but handles only the very first line, i.e. limits itself to 45 chars.

handler : char encoding transformation data structure
out : an xmlBuffer for the output.
in : an xmlBuffer for the input
Returns :the number of bytes written if success, or -1 in case of general error, or -2 if the transcoding fails (because *in is not a valid UTF-8 string or the result of the transformation can't fit into the encoding we want)


xmlCharEncCloseFunc ()

int         xmlCharEncCloseFunc             (xmlCharEncodingHandler *handler);

Generic front-end for encoding handler close function

handler : char encoding transformation data structure
Returns :0 if success, or -1 in case of error


UTF8Toisolat1 ()

int         UTF8Toisolat1                   (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen);

Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1 block of chars out.

out : a pointer to an array of bytes to store the result
outlen : the length of out
in : a pointer to an array of UTF-8 chars
inlen : the length of in
Returns :0 if success, -2 if the transcoding fails, or -1 otherwise. The value of inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of outlen after return is the number of octets produced.


isolat1ToUTF8 ()

int         isolat1ToUTF8                   (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen);

Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8 block of chars out.

out : a pointer to an array of bytes to store the result
outlen : the length of out
in : a pointer to an array of ISO Latin 1 chars
inlen : the length of in
Returns :0 if success, or -1 otherwise. The value of inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of outlen after return is the number of octets produced.


xmlCheckUTF8 ()

int         xmlCheckUTF8                    (unsigned char *utf);

Checks utf for being valid utf-8. utf is assumed to be null-terminated. This function is not super-strict, as it will allow longer utf-8 sequences than necessary. Note that Java is capable of producing these sequences if provoked. Also note, this routine checks for the 4-byte maximum size, but does not check for 0x10ffff maximum value.

utf : Pointer to putative utf-8 encoded string.
Returns : true if utf is valid.


xmlUTF8Strsize ()

int         xmlUTF8Strsize                  (const xmlChar *utf,
                                             int len);

storage size of an UTF8 string

utf : a sequence of UTF-8 encoded bytes
len : the number of characters in the array
Returns :the storage size of the first 'len' characters of utf


xmlUTF8Strndup ()

xmlChar*    xmlUTF8Strndup                  (const xmlChar *utf,
                                             int len);

a strndup for array of UTF8's

utf : the input UTF8 *
len : the len of utf (in chars)
Returns :a new UTF8 * or NULL


xmlUTF8Strpos ()

xmlChar*    xmlUTF8Strpos                   (const xmlChar *utf,
                                             int pos);

a function to provide the equivalent of fetching a character from a string array

utf : the input UTF8 *
pos : the position of the desired UTF8 char (in chars)
Returns :a pointer to the UTF8 character or NULL


xmlUTF8Strloc ()

int         xmlUTF8Strloc                   (const xmlChar *utf,
                                             const xmlChar *utfchar);

a function to provide relative location of a UTF8 char

utf : the input UTF8 *
utfchar : the UTF8 character to be found
Returns :the relative character position of the desired char or -1 if not found


xmlUTF8Strsub ()

xmlChar*    xmlUTF8Strsub                   (const xmlChar *utf,
                                             int start,
                                             int len);

Note: positions are given in units of UTF-8 chars

utf : a sequence of UTF-8 encoded bytes
start : relative pos of first char
len : total number to copy
Returns :a pointer to a newly created string or NULL if any problem


xmlUTF8Strlen ()

int         xmlUTF8Strlen                   (const xmlChar *utf);

compute the length of an UTF8 string, it doesn't do a full UTF8 checking of the content of the string.

utf : a sequence of UTF-8 encoded bytes
Returns :the number of characters in the string or -1 in case of error