/*************************************************
*     libucp - Unicode Property Table handler    *
*************************************************/

/* Internal header file defining the layout of compact nodes in the tree. */

/* Include guard: a second inclusion would otherwise redefine struct cnode. */

#ifndef LIBUCP_INTERNAL_H
#define LIBUCP_INTERNAL_H

/* One compact tree node. Each field is treated as a 16-bit quantity; the bit
assignments within f0 and f2 are given by the masks below and are described in
full in the long comment that follows them. */

typedef struct cnode {
  unsigned short int f0;   /* left-child flag, type, top 8 bits of codepoint */
  unsigned short int f1;   /* bottom 16 bits of codepoint */
  unsigned short int f2;   /* right-child offset exponent, case offset */
} cnode;

/* Things for the f0 field */

#define f0_leftexists   0x8000    /* Left child exists */
#define f0_typemask     0x3f00    /* Type bits */
#define f0_typeshift         8    /* Type shift */
#define f0_chhmask      0x00ff    /* Character high bits */

/* Things for the f2 field */

#define f2_rightmask    0xf000    /* Mask for right offset bits */
#define f2_rightshift       12    /* Shift for right offset */
#define f2_casemask     0x0fff    /* Mask for case offset */

/* The tree consists of a vector of structures of type cnode, with the root
node as the first element. The three short ints (16-bits) are used as follows:

(f0) (1) The 0x8000 bit of f0 is set if a left child exists. The child's node
         is the next node in the vector.
     (2) The 0x4000 bit of f0 is spare.
     (3) The 0x3f00 bits of f0 contain the character type; this is a number
         defined by the enumeration in ucp.h (e.g. ucp_Lu).
     (4) The bottom 8 bits of f0 contain the most significant byte of the
         character's 24-bit codepoint.

(f1) (1) The f1 field contains the two least significant bytes of the
         codepoint.

(f2) (1) The 0xf000 bits of f2 contain zero if there is no right child of this
         node. Otherwise, they contain one plus the exponent of the power of
         two of the offset to the right node (e.g. a value of 3 means an
         offset of 4, that is, 1 << (3 - 1)). The units of the offset are node
         items.

     (2) The 0x0fff bits of f2 contain the signed offset from this character to
         its alternate cased value. They are zero if there is no such
         character.


-----------------------------------------------------------------------------
||.|.| type (6) | ms char (8) ||  ls char (16)  ||....|  case offset (12)  ||
-----------------------------------------------------------------------------
  | |                                              |
  | |-> spare                                      |
  |                                        exponent of right
  |-> left child exists                       child offset


The upper/lower casing information is set only for characters that come in
pairs. There are (at present) four non-one-to-one mappings in the Unicode data.
These are ignored. They are:

  1FBE Greek Prosgegrammeni (lower, with upper -> capital iota)
  2126 Ohm
  212A Kelvin
  212B Angstrom

Certainly for the last three, having an alternate case would seem to be a
mistake. I don't know any Greek, so cannot comment on the first one.


When searching the tree, proceed as follows:

(1) Start at the first node.

(2) Extract the character value from f1 and the bottom 8 bits of f0.

(3) Compare with the character being sought. If equal, we are done.

(4) If the test character is smaller, inspect the f0_leftexists flag. If it is
    not set, the character is not in the tree. If it is set, move to the next
    node, and go to (2).

(5) If the test character is bigger, extract the f2_rightmask bits from f2, and
    shift them right by f2_rightshift. If the result is zero, the character is
    not in the tree. Otherwise, calculate the number of nodes to skip by
    shifting the value 1 left by this number minus one, move forward by that
    many nodes, and go to (2).
*/

#endif /* LIBUCP_INTERNAL_H */

/* End of internal.h */