typedef unsigned short cu2; /* we use a void pointer because left can either point to other nodes, or */ /* can point to the unsigned short array */ typedef struct { /* if right is null, left points to the actual buffer */ void* left; void* right; /* when right is NULL, negative count tells us that the block */ /* contains surrogate pairs */ /* negative count shouldn't happen any other time */ long count; } node; long mlog2(long i) { long j = 5; i >>= 5; while (i) { j++; i >>= 1; } return j; } #define ABS(v) (((v) < 0) ? -(v) : (v)) void* make_surrogate_tree(cu2* ustr, long length) { long cur = 0, minl, fs = 0, ns = 0; long count = 0, cur_node = 0, last_node = 0; node* ret_ptr = NULL; minl = mlog2(length); while (cur < length) { if ((ustr[cur] >= 0xd800) && (ustr[cur] < 0xe000)) { /* start of surrogate pair */ if ((ustr[cur] >= 0xdc00) || ((cur+1) == length) || (ustr[cur+1] < 0xdc00) ||(ustr[cur+1] >= 0xe000)) { /* bad surrogate */ return (void*) (-1); } ns = 1; } if (count >= minl && (fs || ns)) { /* We have enough characters in the 'current' block and either we had or now have a surrogate pair. This gives us longer sequences of non-surrogate blocks, with surrogate blocks of at most log(length) characters. The non-surrogate blocks we can address in O(1) time, so it doesn't matter how long they are, but surrogate blocks we need to linearly scan, so they need to be limited to O(log n) in length. */ cur_node++; count = 0; fs = 0; } count++; cur += 1 + ns; fs = fs | ns; ns = 0; } if (count) cur_node++; if (cur_node == 1) { /* one block with no surrogates of any length, no tree needed, */ /* make one anyways, */ /* or one short block, at least one surrogate, one node needed */ ret_ptr = (node*) malloc(sizeof(node)); ret_ptr->left = (void*) ustr; ret_ptr->right = NULL; ret_ptr->count = fs ? -count : count; return (void*) end; } last_node = cur_node*2 - 1; ret_ptr = (node*) malloc(sizeof(node)*last_node); if (ret_ptr == NULL) { /* ran out of memory */ return (void*) (-2); } cur = 0; while (cur < length) { if ((ustr[cur] >= 0xd800) && (ustr[cur] < 0xe000)) { /* start of surrogate pair */ /* data previously verified... */ ns = 1; } if (count >= minl && (fs || ns)) { /* if we get here, we need at least one more block */ /* so even after we increment cur_node, cur_node <= last_node */ ret_ptr[cur_node]->count = fs ? -count : count; cur_node++; ret_ptr[cur_node]->left = (void*) (ustr + cur); ret_ptr[cur_node]->right = NULL; count = 0; fs = 0; } count++; cur += 1 + ns; fs = fs | ns; ns = 0; } if (count) { /* last block has no surrogates and is of any length */ /* or last block has surrogates and has a length < minl */ ret_ptr[cur_node]->count = fs ? -count : count; } if (cur_node != last_node) { /* shouldn't ever happen */ } /* ret_ptr[last_node//2+1] to ret_ptr[last_node] inclusive now have */ /* blocks that they point to; use array-based heap parent addressing to */ /* set up child pointers to make this a complete tree */ /* That is to say, given a node in position k, it's parent is at (k-1)//2 */ for (;cur_node > 0;cur_node--) { fs = (cur_node-1)>>1; /* fs is our parent node */ ret_ptr[fs]->count += ABS(ret_ptr[cur_node]); if (cur_node & 1) { ret_ptr[fs]->left = (void*) (ustr + cur_node); } else { ret_ptr[fs]->right = (void*) (ustr + cur_node); } } return ret_ptr; } cu2* find_offset(node* root, long offset) { node* z; long y; cu2* x; if (root == NULL) return NULL; if (offset >= root->count) return NULL; /* traverse down the tree until we have a 'leaf' node */ while (root->right) { z = (node*) root->left; y = z->count; if (y < offset) { offset -= y; root = root->right; } else { root = root->left; } } /* discern between surrogate or non-surrogate blocks */ x = (cu2*) root->left; if (root->count > 0) { x += offset; return x; } while (offset) { /* this is OK because we have pre-verified surrogate pairs */ if ((x[0] >= 0xd800) && (x[0] < 0xe000)) x++; x++; offset--; } return x; }