encoding.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175

/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs: 
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 character codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code from "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * $Id$
 *
 * Daniel.Veillard@w3.org
 */

#include "encoding.h"

/*
 * isolat1ToUTF8:
 * Transcode a block of ISO Latin 1 bytes into UTF-8.
 *
 *   out    - destination buffer receiving the UTF-8 bytes
 *   outlen - capacity of out, in bytes
 *   in     - source buffer holding ISO Latin 1 bytes
 *   inlen  - number of bytes to read from in
 *
 * Bytes below 0x80 are copied unchanged; every other byte becomes a
 * two-byte UTF-8 sequence.
 *
 * Returns the number of bytes written to out, or -1 when out is too
 * small.  On -1 the bytes produced so far stay in out, which may include
 * the lead byte of a two-byte sequence whose second byte did not fit.
 */
int isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* const first = out;
    unsigned char* const limit = out + outlen;
    unsigned char* const stop = in + inlen;

    for ( ; in < stop; in++) {
        unsigned char ch = *in;

        /* Both branches emit at least one byte, so check room up front. */
        if (out >= limit)
            return -1;

        if (ch < 0x80) {
            *out++ = ch;
            continue;
        }

        /* 0x80..0xFF: lead byte carries the top 2 bits, trail byte the low 6. */
        *out++ = 0xC0 | (ch >> 6);
        if (out >= limit)
            return -1;
        *out++ = 0x80 | (ch & 0x3F);
    }
    return out - first;
}


/*
 * UTF8Toisolat1:
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.
 *
 *   out    - destination buffer receiving the ISO Latin 1 bytes
 *   outlen - capacity of out, in bytes
 *   in     - source buffer holding UTF-8 bytes
 *   inlen  - number of bytes to read from in
 *
 * Only single-byte sequences (below 0x80) and two-byte sequences whose
 * lead byte is 0xC2 or 0xC3 are accepted.
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed: lead byte outside the accepted set,
 *     input ending in the middle of a sequence, or a second byte that
 *     is not of the form 10xxxxxx.
 *
 * TODO: need a fallback mechanism ...
 */
int UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned char c, d;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
            if (out >= outend)  return -1;
            *out++= c;
        }
        else if (((c & 0xFE) == 0xC2) && (in < inend)) {
            d= *in++;
            /* reject a malformed continuation byte instead of decoding garbage */
            if ((d & 0xC0) != 0x80)  return -2;
            if (out >= outend)  return -1;
            *out++= ((c & 0x03) << 6) | (d & 0x3F);
        }
        else  return -2;
    }
    return out-outstart;
}

/*
 * UTF16ToUTF8:
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out.
 *
 *   out    - destination buffer receiving the UTF-8 bytes
 *   outlen - capacity of out, in bytes
 *   in     - source buffer holding UTF-16 code units
 *   inlen  - number of code units (not bytes) to read from in
 *
 * A high surrogate (0xD800..0xDBFF) must be followed by a low surrogate
 * (0xDC00..0xDFFF); the pair is combined into one code point.  A low
 * surrogate that is not preceded by a high surrogate is not rejected and
 * is encoded as an ordinary three-byte sequence.
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if a high surrogate is not followed by a low surrogate.
 */
int UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned short* inend= in+inlen;
    unsigned int c, d;
    int bits;

    while (in < inend) {
        c= *in++;
        if ((c & 0xFC00) == 0xD800) {    /* high surrogate */
            if (in >= inend)  return -2;
            d= *in++;
            if ((d & 0xFC00) != 0xDC00)  return -2;
            c= (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */

        /*
         * Emit the lead byte; bits is the shift of the first continuation
         * byte (negative means there is none).
         */
        if (out >= outend)  return -1;
        if      (c <    0x80) {  *out++=  c;                bits= -6; }
        else if (c <   0x800) {  *out++= (c >>  6) | 0xC0;  bits=  0; }
        else if (c < 0x10000) {  *out++= (c >> 12) | 0xE0;  bits=  6; }
        else                  {  *out++= (c >> 18) | 0xF0;  bits= 12; }

        /* continuation bytes: 10xxxxxx, most significant group first */
        for ( ; bits >= 0; bits-= 6) {
            if (out >= outend)  return -1;
            *out++= ((c >> bits) & 0x3F) | 0x80;
        }
    }
    return out-outstart;
}

/*
 * UTF8ToUTF16:
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of ushorts out.
 *
 *   out    - destination buffer receiving the UTF-16 code units
 *   outlen - capacity of out, in code units (unsigned shorts), not bytes
 *   in     - source buffer holding UTF-8 bytes
 *   inlen  - number of bytes to read from in
 *
 * Code points below 0x10000 are stored as one code unit; code points
 * from 0x10000 up to 0x10FFFF are stored as a surrogate pair.
 *
 * Returns the number of code units written, or -1 by lack of space, or -2
 *     if the transcoding failed: stray trailing byte in lead position,
 *     lead byte 0xF8 or above, input ending in the middle of a sequence,
 *     trailing byte not of the form 10xxxxxx, or a decoded value of
 *     0x110000 or more.
 *
 * TODO: need a fallback mechanism ...
 */
int UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart= out;
    unsigned short* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  return -2;    /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else return -2;    /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            /* truncated or malformed sequence is bad input, not lack of space */
            if (in >= inend)  return -2;
            d= *in++;
            if ((d & 0xC0) != 0x80)  return -2;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend)  return -1;
            *out++ = c;
        }
        else if (c < 0x110000) {
            /* a surrogate pair needs two free code units */
            if (outend - out < 2)  return -1;
            c -= 0x10000;
            *out++ = 0xD800 | (c >> 10);
            *out++ = 0xDC00 | (c & 0x03FF);
        }
        else  return -2;    /* not representable in UTF-16 */
    }
    return out-outstart;
}