From ac379c197217b32954792294269d3ec4fb4c48db Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Sun, 7 Aug 2011 04:54:14 +0200 Subject: Upgrade the GB18030 converter to the version from 2005. --- ChangeLog | 17 +++++++++++ NEWS | 3 ++ lib/gb18030ext.h | 66 ++++++++++++++++++++++++++++++------------ tests/GB18030-BMP.TXT | 50 ++++++++++++++++---------------- tests/GB18030.IRREVERSIBLE.TXT | 50 ++++++++++++++++++++++++++++++++ 5 files changed, 142 insertions(+), 44 deletions(-) create mode 100644 tests/GB18030.IRREVERSIBLE.TXT diff --git a/ChangeLog b/ChangeLog index 0f07076..c6cdcec 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +2011-08-06 Bruno Haible + + Upgrade the GB18030 converter to the version from 2005. + * lib/gb18030ext.h (gb18030ext_2uni_pagefe): Change element type to + 'unsigned int'. Change values for 0xFE51..0xFE53, 0xFE59, 0xFE61, + 0xFE66, 0xFE67, 0xFE6C, 0xFE6D, 0xFE76, 0xFE7E, 0xFE90, 0xFE91, 0xFEA0. + (gb18030ext_mbtowc): Change type of wc to 'unsigned int'. Change values + for 0xA6D9..0xA6DF, 0xA6EC..0xA6ED, 0xA6F3, 0xA8BC. + (gb18030ext_page9f, gb18030ext_pagefe): New constant arrays. + (gb18030ext_wctomb): Change values for U+1E3F, U+9FB4..U+9FBB, + U+FE10..U+FE19, U+20087, U+20089, U+200CC, U+215D7, U+2298F, U+241FE. + * tests/GB18030-BMP.TXT: Change values for 0xA6D9..0xA6DF, + 0xA6EC..0xA6ED, 0xA6F3, 0xA8BC, 0xFE51..0xFE53, 0xFE59, 0xFE61, 0xFE66, + 0xFE67, 0xFE6C, 0xFE6D, 0xFE76, 0xFE7E, 0xFE90, 0xFE91, 0xFEA0, to map + to now-assigned Unicode codepoints. + * tests/GB18030.IRREVERSIBLE.TXT: New file. + 2011-08-06 Bruno Haible Fix conversion bug in CP1258 converter. diff --git a/NEWS b/NEWS index c0f5573..3fc8dda 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,8 @@ * The 'iconv' program now produces its output as soon as it can. It no longer unnecessarily waits for more input. +* Updated the GB18030 converter to map 25 characters to code points that have + been added to Unicode since 2000, rather than to code points in the Private Use + Area. 
* Updated the BIG5-HKSCS converter. The old BIG5-HKSCS converter is renamed to BIG5-HKSCS:2004. A new converter BIG5-HKSCS:2008 is added. BIG5-HKSCS is now an alias for BIG5-HKSCS:2008. diff --git a/lib/gb18030ext.h b/lib/gb18030ext.h index 14b0e45..5e59419 100644 --- a/lib/gb18030ext.h +++ b/lib/gb18030ext.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 1999-2001, 2005 Free Software Foundation, Inc. + * Copyright (C) 1999-2001, 2005, 2011 Free Software Foundation, Inc. * This file is part of the GNU LIBICONV Library. * * The GNU LIBICONV Library is free software; you can redistribute it @@ -27,20 +27,20 @@ static const unsigned short gb18030ext_2uni_pagea9[13] = { 0x303e, 0x2ff0, 0x2ff1, 0x2ff2, 0x2ff3, 0x2ff4, 0x2ff5, 0x2ff6, 0x2ff7, 0x2ff8, 0x2ff9, 0x2ffa, 0x2ffb, }; -static const unsigned short gb18030ext_2uni_pagefe[96] = { +static const unsigned int gb18030ext_2uni_pagefe[96] = { /* 0xfe */ - 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, - 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, - 0x2e81, 0xe816, 0xe817, 0xe818, 0x2e84, 0x3473, 0x3447, 0x2e88, - 0x2e8b, 0xe81e, 0x359e, 0x361a, 0x360e, 0x2e8c, 0x2e97, 0x396e, - 0x3918, 0xe826, 0x39cf, 0x39df, 0x3a73, 0x39d0, 0xe82b, 0xe82c, - 0x3b4e, 0x3c6e, 0x3ce0, 0x2ea7, 0xe831, 0xe832, 0x2eaa, 0x4056, - 0x415f, 0x2eae, 0x4337, 0x2eb3, 0x2eb6, 0x2eb7, 0xe83b, 0x43b1, - 0x43ac, 0x2ebb, 0x43dd, 0x44d6, 0x4661, 0x464c, 0xe843, 0x4723, - 0x4729, 0x477c, 0x478d, 0x2eca, 0x4947, 0x497a, 0x497d, 0x4982, - 0x4983, 0x4985, 0x4986, 0x499f, 0x499b, 0x49b7, 0x49b6, 0xe854, - 0xe855, 0x4ca3, 0x4c9f, 0x4ca0, 0x4ca1, 0x4c77, 0x4ca2, 0x4d13, - 0x4d14, 0x4d15, 0x4d16, 0x4d17, 0x4d18, 0x4d19, 0x4dae, 0xe864, + 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, + 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, + 0x2e81, 0x20087, 0x20089, 0x200cc, 0x2e84, 0x3473, 0x3447, 0x2e88, + 0x2e8b, 0x9fb4, 0x359e, 0x361a, 0x360e, 0x2e8c, 0x2e97, 0x396e, + 0x3918, 0x9fb5, 0x39cf, 0x39df, 
0x3a73, 0x39d0, 0x9fb6, 0x9fb7, + 0x3b4e, 0x3c6e, 0x3ce0, 0x2ea7, 0x215d7, 0x9fb8, 0x2eaa, 0x4056, + 0x415f, 0x2eae, 0x4337, 0x2eb3, 0x2eb6, 0x2eb7, 0x2298f, 0x43b1, + 0x43ac, 0x2ebb, 0x43dd, 0x44d6, 0x4661, 0x464c, 0x9fb9, 0x4723, + 0x4729, 0x477c, 0x478d, 0x2eca, 0x4947, 0x497a, 0x497d, 0x4982, + 0x4983, 0x4985, 0x4986, 0x499f, 0x499b, 0x49b7, 0x49b6, 0x9fba, + 0x241fe, 0x4ca3, 0x4c9f, 0x4ca0, 0x4ca1, 0x4c77, 0x4ca2, 0x4d13, + 0x4d14, 0x4d15, 0x4d16, 0x4d17, 0x4d18, 0x4d19, 0x4dae, 0x9fbb, }; static int @@ -52,7 +52,7 @@ gb18030ext_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) unsigned char c2 = s[1]; if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff)) { unsigned int i = 190 * (c1 - 0x81) + (c2 - (c2 >= 0x80 ? 0x41 : 0x40)); - unsigned short wc = 0xfffd; + unsigned int wc = 0xfffd; switch (c1) { case 0xa2: if (i >= 6376 && i <= 6381) /* 0xA2AB..0xA2B0 */ @@ -77,12 +77,14 @@ gb18030ext_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) case 0xa6: if (i >= 7150 && i <= 7157) /* 0xA6B9..0xA6C0 */ wc = 0xe785 + (i - 7150); + else if (i >= 7183 && i <= 7184) /* 0xA6DA..0xA6DB */ + wc = 0xfe12 - (i - 7183); else if (i >= 7182 && i <= 7188) /* 0xA6D9..0xA6DF */ - wc = 0xe78d + (i - 7182); + wc = 0xfe10 + (i - 7182); else if (i >= 7201 && i <= 7202) /* 0xA6EC..0xA6ED */ - wc = 0xe794 + (i - 7201); + wc = 0xfe17 + (i - 7201); else if (i == 7208) /* 0xA6F3 */ - wc = 0xe796; + wc = 0xfe19; else if (i >= 7211 && i <= 7219) /* 0xA6F6..0xA6FE */ wc = 0xe797 + (i - 7211); break; @@ -96,7 +98,7 @@ gb18030ext_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) if (i >= 7495 && i <= 7505) /* 0xA896..0xA8A0 */ wc = 0xe7bc + (i - 7495); else if (i == 7533) /* 0xA8BC */ - wc = 0xe7c7; + wc = 0x1e3f; else if (i == 7536) /* 0xA8BF */ wc = 0x01f9; else if (i >= 7538 && i <= 7541) /* 0xA8C1..0xA8C4 */ @@ -230,6 +232,14 @@ static const unsigned short gb18030ext_page4d[16] = { 0x0000, 0x0000, 0x0000, 0xfe98, 0xfe99, 0xfe9a, 0xfe9b, 
0xfe9c, /*0x10-0x17*/ 0xfe9d, 0xfe9e, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /*0x18-0x1f*/ }; +static const unsigned short gb18030ext_page9f[16] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0xfe59, 0xfe61, 0xfe66, 0xfe67, /*0xb0-0xb7*/ + 0xfe6d, 0xfe7e, 0xfe90, 0xfea0, 0x0000, 0x0000, 0x0000, 0x0000, /*0xb8-0xbf*/ +}; +static const unsigned short gb18030ext_pagefe[16] = { + 0xa6d9, 0xa6db, 0xa6da, 0xa6dc, 0xa6dd, 0xa6de, 0xa6df, 0xa6ec, /*0x10-0x17*/ + 0xa6ed, 0xa6f3, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /*0x18-0x1f*/ +}; static int gb18030ext_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) @@ -238,6 +248,8 @@ gb18030ext_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) unsigned short c = 0; if (wc == 0x01f9) c = 0xa8bf; + else if (wc == 0x1e3f) + c = 0xa8bc; else if (wc == 0x20ac) c = 0xa2e3; else if (wc >= 0x2e80 && wc < 0x2ed0) @@ -290,6 +302,22 @@ gb18030ext_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) c = gb18030ext_page4d[wc-0x4d10]; else if (wc == 0x4dae) c = 0xfe9f; + else if (wc >= 0x9fb4 && wc < 0x9fbc) + c = gb18030ext_page9f[wc-0x9fb0]; + else if (wc >= 0xfe10 && wc < 0xfe1a) + c = gb18030ext_pagefe[wc-0xfe10]; + else if (wc == 0x20087) + c = 0xfe51; + else if (wc == 0x20089) + c = 0xfe52; + else if (wc == 0x200cc) + c = 0xfe53; + else if (wc == 0x215d7) + c = 0xfe6c; + else if (wc == 0x2298f) + c = 0xfe76; + else if (wc == 0x241fe) + c = 0xfe91; if (c != 0) { r[0] = (c >> 8); r[1] = (c & 0xff); return 2; diff --git a/tests/GB18030-BMP.TXT b/tests/GB18030-BMP.TXT index 4767d3e..0006e4b 100644 --- a/tests/GB18030-BMP.TXT +++ b/tests/GB18030-BMP.TXT @@ -46728,13 +46728,13 @@ 0xA6D6 0x03C7 0xA6D7 0x03C8 0xA6D8 0x03C9 -0xA6D9 0xE78D -0xA6DA 0xE78E -0xA6DB 0xE78F -0xA6DC 0xE790 -0xA6DD 0xE791 -0xA6DE 0xE792 -0xA6DF 0xE793 +0xA6D9 0xFE10 +0xA6DA 0xFE12 +0xA6DB 0xFE11 +0xA6DC 0xFE13 +0xA6DD 0xFE14 +0xA6DE 0xFE15 +0xA6DF 0xFE16 0xA6E0 0xFE35 0xA6E1 0xFE36 0xA6E2 0xFE39 @@ -46747,14 +46747,14 @@ 0xA6E9 0xFE42 0xA6EA 
0xFE43 0xA6EB 0xFE44 -0xA6EC 0xE794 -0xA6ED 0xE795 +0xA6EC 0xFE17 +0xA6ED 0xFE18 0xA6EE 0xFE3B 0xA6EF 0xFE3C 0xA6F0 0xFE37 0xA6F1 0xFE38 0xA6F2 0xFE31 -0xA6F3 0xE796 +0xA6F3 0xFE19 0xA6F4 0xFE33 0xA6F5 0xFE34 0xA6F6 0xE797 @@ -47079,7 +47079,7 @@ 0xA8B9 0x00FC 0xA8BA 0x00EA 0xA8BB 0x0251 -0xA8BC 0xE7C7 +0xA8BC 0x1E3F 0xA8BD 0x0144 0xA8BE 0x0148 0xA8BF 0x01F9 @@ -63313,15 +63313,15 @@ 0xFE4E 0xFA28 0xFE4F 0xFA29 0xFE50 0x2E81 -0xFE51 0xE816 -0xFE52 0xE817 -0xFE53 0xE818 +0xFE51 0x20087 +0xFE52 0x20089 +0xFE53 0x200CC 0xFE54 0x2E84 0xFE55 0x3473 0xFE56 0x3447 0xFE57 0x2E88 0xFE58 0x2E8B -0xFE59 0xE81E +0xFE59 0x9FB4 0xFE5A 0x359E 0xFE5B 0x361A 0xFE5C 0x360E @@ -63329,19 +63329,19 @@ 0xFE5E 0x2E97 0xFE5F 0x396E 0xFE60 0x3918 -0xFE61 0xE826 +0xFE61 0x9FB5 0xFE62 0x39CF 0xFE63 0x39DF 0xFE64 0x3A73 0xFE65 0x39D0 -0xFE66 0xE82B -0xFE67 0xE82C +0xFE66 0x9FB6 +0xFE67 0x9FB7 0xFE68 0x3B4E 0xFE69 0x3C6E 0xFE6A 0x3CE0 0xFE6B 0x2EA7 -0xFE6C 0xE831 -0xFE6D 0xE832 +0xFE6C 0x215D7 +0xFE6D 0x9FB8 0xFE6E 0x2EAA 0xFE6F 0x4056 0xFE70 0x415F @@ -63350,7 +63350,7 @@ 0xFE73 0x2EB3 0xFE74 0x2EB6 0xFE75 0x2EB7 -0xFE76 0xE83B +0xFE76 0x2298F 0xFE77 0x43B1 0xFE78 0x43AC 0xFE79 0x2EBB @@ -63358,7 +63358,7 @@ 0xFE7B 0x44D6 0xFE7C 0x4661 0xFE7D 0x464C -0xFE7E 0xE843 +0xFE7E 0x9FB9 0xFE80 0x4723 0xFE81 0x4729 0xFE82 0x477C @@ -63375,8 +63375,8 @@ 0xFE8D 0x499B 0xFE8E 0x49B7 0xFE8F 0x49B6 -0xFE90 0xE854 -0xFE91 0xE855 +0xFE90 0x9FBA +0xFE91 0x241FE 0xFE92 0x4CA3 0xFE93 0x4C9F 0xFE94 0x4CA0 @@ -63391,7 +63391,7 @@ 0xFE9D 0x4D18 0xFE9E 0x4D19 0xFE9F 0x4DAE -0xFEA0 0xE864 +0xFEA0 0x9FBB 0xFEA1 0xE468 0xFEA2 0xE469 0xFEA3 0xE46A diff --git a/tests/GB18030.IRREVERSIBLE.TXT b/tests/GB18030.IRREVERSIBLE.TXT new file mode 100644 index 0000000..1dd1904 --- /dev/null +++ b/tests/GB18030.IRREVERSIBLE.TXT @@ -0,0 +1,50 @@ +0x8135F437 0x1E3F +0x82359037 0x9FB4 +0x82359038 0x9FB5 +0x82359039 0x9FB6 +0x82359130 0x9FB7 +0x82359131 0x9FB8 +0x82359132 0x9FB9 +0x82359133 0x9FBA +0x82359134 0x9FBB +0x84318236 0xFE10 
+0x84318237 0xFE11 +0x84318238 0xFE12 +0x84318239 0xFE13 +0x84318330 0xFE14 +0x84318331 0xFE15 +0x84318332 0xFE16 +0x84318333 0xFE17 +0x84318334 0xFE18 +0x84318335 0xFE19 +0x95329031 0x20087 +0x95329033 0x20089 +0x95329730 0x200CC +0x9536B937 0x215D7 +0x9630BA35 0x2298F +0x9635B630 0x241FE +0xA6D9 0xE78D +0xA6DA 0xE78E +0xA6DB 0xE78F +0xA6DC 0xE790 +0xA6DD 0xE791 +0xA6DE 0xE792 +0xA6DF 0xE793 +0xA6EC 0xE794 +0xA6ED 0xE795 +0xA6F3 0xE796 +0xA8BC 0xE7C7 +0xFE51 0xE816 +0xFE52 0xE817 +0xFE53 0xE818 +0xFE59 0xE81E +0xFE61 0xE826 +0xFE66 0xE82B +0xFE67 0xE82C +0xFE6C 0xE831 +0xFE6D 0xE832 +0xFE76 0xE83B +0xFE7E 0xE843 +0xFE90 0xE854 +0xFE91 0xE855 +0xFEA0 0xE864 -- cgit v1.2.3