#!/usr/bin/python -u
"""Generate xmlunicode.h and xmlunicode.c from Unicode Character Database files.

The script reads ``Blocks-4.txt`` and ``UnicodeData-3.1.0.txt`` from the
current directory and writes ``xmlunicode.h`` and ``xmlunicode.c`` there.
For every block and every category it emits one C predicate
(``xmlUCSIs<Block>`` / ``xmlUCSIsCat<Category>``) plus the two name-based
dispatchers ``xmlUCSIsBlock`` and ``xmlUCSIsCat``.

The code runs unchanged under Python 2.6+ and Python 3.
"""
import sys
import string  # no longer used below; kept so the file's import list is unchanged
import time

BLOCKS_FILE = "Blocks-4.txt"
UNICODE_DATA_FILE = "UnicodeData-3.1.0.txt"

# Written verbatim into the banner comment of both generated files.
sources = "Blocks-4.txt UnicodeData-3.1.0.txt"

# NOTE: the single-space indentation inside the generated-C string literals
# below is reproduced as found in the previous revision of this script.

# Banner + include guard of xmlunicode.h; takes (generation date, sources).
_HEADER_PROLOGUE = """/*
 * xmlunicode.h: this header exports interfaces for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#ifdef __cplusplus
extern "C" {
#endif

"""

# Banner + includes of xmlunicode.c; takes (generation date, sources).
# NOTE(review): the three #include directives had lost their targets (they
# were bare "#include" lines, which no C compiler accepts).  <string.h> is
# required by the strcmp() calls generated below; the two libxml headers are
# presumed from the libxml2 tree layout -- confirm against the real tree.
_SOURCE_PROLOGUE = """/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>

"""

_HEADER_EPILOGUE = """
#ifdef __cplusplus
}
#endif
#endif /* __XML_UNICODE_H__ */
"""

_SOURCE_EPILOGUE = """
#endif /* LIBXML_UNICODE_ENABLED */
"""


def _open_or_exit(path, mode, message):
    """Open *path* in *mode*; on failure print *message* and exit with status 1."""
    try:
        return open(path, mode)
    except IOError:
        print(message)
        sys.exit(1)


def _parse_blocks(lines):
    """Parse block description lines into ``{name: ("0x<start>", "0x<end>")}``.

    Each useful line looks like ``<start>..<end>; <Block Name>``.  Lines
    starting with ``#`` and blank lines are skipped; spaces are removed from
    the block name.  Lines that cannot be split that way are reported on
    stdout and ignored.
    """
    block_names = {}
    for raw in lines:
        if raw.startswith('#'):
            continue
        line = raw.strip()
        if not line:
            continue
        try:
            fields = line.split(';')
            first, last = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            # IndexError: no ';' separator; ValueError: not exactly one "..".
            print("Failed to process line: %s" % (line))
            continue
        block_names[name] = ("0x" + first, "0x" + last)
    return block_names


def _parse_categories(lines):
    """Parse character data lines into ``(count, {category: [code points]})``.

    Field 0 of each ``;``-separated line is the hexadecimal code point and
    field 2 the category.  Every code point is recorded twice: under its full
    category and under the first letter of that category.  *count* is the
    number of lines successfully processed.  Malformed lines (missing fields,
    non-hexadecimal code point, empty category) are reported on stdout and
    ignored.
    """
    categories = {}
    nbchar = 0
    for raw in lines:
        if raw.startswith('#'):
            continue
        line = raw.strip()
        if not line:
            continue
        try:
            fields = line.split(';')
            value = int(fields[0].strip(), 16)
            category = fields[2]
            if not category:
                # An empty name would generate a second "xmlUCSIsCat" symbol.
                raise ValueError("empty category field")
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue
        nbchar += 1
        categories.setdefault(category, []).append(value)
        categories.setdefault(category[0], []).append(value)
    return nbchar, categories


def _reduce_to_ranges(values):
    """Collapse *values* into a list of inclusive ``(first, last)`` runs.

    A run is extended only while each value is exactly one more than the
    previous one, so the input is expected in ascending order; any other
    value starts a new run.
    """
    ranges = []
    first = last = None
    for value in values:
        if first is None:
            first = last = value
        elif value == last + 1:
            last = value
        else:
            ranges.append((first, last))
            first = last = value
    if first is not None:
        ranges.append((first, last))
    return ranges


def _write_block_functions(header, output, block_names):
    """Emit one predicate per block plus the ``xmlUCSIsBlock`` dispatcher."""
    ordered = sorted(block_names)
    for block in ordered:
        first, last = block_names[block]
        # '-' is not valid in a C identifier, so drop it from the function name.
        name = block.replace('-', '')
        header.write("int\txmlUCSIs%s\t(int code);\n" % name)
        output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
        output.write(" *\n * Check whether the character is part of %s UCS Block\n"% (block))
        output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
        output.write("int\nxmlUCSIs%s(int code) {\n" % name)
        output.write(" return((code >= %s) && (code <= %s));\n" % (first, last))
        output.write("}\n\n")

    header.write("\nint\txmlUCSIsBlock\t(int code,\n\t\t\t const char *block);\n\n")
    output.write("/**\n * xmlUCSIsBlock:\n * @code: UCS code point\n")
    output.write(" * @block: UCS block name\n")
    output.write(" *\n * Check whether the caracter is part of the UCS Block\n")
    output.write(" *\n * Returns 1 if true, 0 if false and -1 on unknown block\n */\n")
    output.write("int\nxmlUCSIsBlock(int code, const char *block) {\n")
    for block in ordered:
        name = block.replace('-', '')
        output.write(" if (!strcmp(block, \"%s\"))\n return(xmlUCSIs%s(code));\n" % (block, name))
    output.write(" return(-1);\n}\n\n")


def _write_category_functions(header, output, categories):
    """Emit one predicate per category plus the ``xmlUCSIsCat`` dispatcher.

    *categories* maps each category name to its list of inclusive
    ``(first, last)`` code point ranges.
    """
    ordered = sorted(categories)
    for name in ordered:
        header.write("int\txmlUCSIsCat%s\t(int code);\n" % name)
        output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
        output.write(" *\n * Check whether the character is part of %s UCS Category\n"% (name))
        output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
        output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
        tests = []
        for begin, end in categories[name]:
            if begin == end:
                tests.append("(code == %s)" % (hex(begin)))
            else:
                tests.append("((code >= %s) && (code <= %s))" % (hex(begin), hex(end)))
        if tests:
            output.write(" return(" + " ||\n ".join(tests))
        output.write(");\n}\n\n")

    header.write("\nint\txmlUCSIsCat\t(int code,\n\t\t\t const char *cat);\n")
    output.write("/**\n * xmlUCSIsCat:\n * @code: UCS code point\n")
    output.write(" * @cat: UCS Category name\n")
    output.write(" *\n * Check whether the caracter is part of the UCS Category\n")
    output.write(" *\n * Returns 1 if true, 0 if false and -1 on unknown category\n */\n")
    output.write("int\nxmlUCSIsCat(int code, const char *cat) {\n")
    for name in ordered:
        output.write(" if (!strcmp(cat, \"%s\"))\n return(xmlUCSIsCat%s(code));\n" % (name, name))
    output.write(" return(-1);\n}\n\n")


def main():
    """Read the two database files and regenerate xmlunicode.h / xmlunicode.c.

    Exits with status 1 (after printing a message) when an input file is
    missing or an output file cannot be opened.
    """
    blocks = _open_or_exit(BLOCKS_FILE, "r",
                           "Missing %s, aborting ..." % BLOCKS_FILE)
    with blocks:
        block_names = _parse_blocks(blocks)
    print("Parsed %d blocks descriptions" % (len(block_names)))

    data = _open_or_exit(UNICODE_DATA_FILE, "r",
                         "Missing %s, aborting ..." % UNICODE_DATA_FILE)
    # The previous revision closed `blocks` a second time here and never
    # closed `data`; the with-block closes the right file.
    with data:
        nbchar, categories = _parse_categories(data)
    print("Parsed %d char generating %d categories" % (nbchar, len(categories)))

    # Reduce each list of code points to a list of inclusive ranges.
    for cat in categories:
        categories[cat] = _reduce_to_ranges(categories[cat])

    #
    # Generate the resulting files
    #
    date = time.asctime()
    header = _open_or_exit("xmlunicode.h", "w", "Failed to open xmlunicode.h")
    with header:
        output = _open_or_exit("xmlunicode.c", "w", "Failed to open xmlunicode.c")
        with output:
            header.write(_HEADER_PROLOGUE % (date, sources))
            output.write(_SOURCE_PROLOGUE % (date, sources))
            _write_block_functions(header, output, block_names)
            _write_category_functions(header, output, categories)
            header.write(_HEADER_EPILOGUE)
            output.write(_SOURCE_EPILOGUE)


if __name__ == "__main__":
    main()