/// stdio.h only needed in debugging #include "stdio.h" // #include "dataTypes.h" #include "IDptByteEncode.hxx" int IDptByteEncode::Encode(unsigned char *b, u32 val) { // Encode 'val' according to a scheme which is inspired by UTF-8 encoding // Places the 1 to 5 byte sequence at *b and returns the length // of the sequence. // Compared to real utf-8, this sacrifices the feature that you can detect // if a string starts in the middle of a utf-8 encoded character (in real // utf-8, any byte of the form 10xxxxxx is a continuing sequence) but // packs more tightly if (val < 0x00000080) { // Starts 0xxxxxxx (7 bits) b[0] = val; return 1; } else if (val < 0x00004000) { // Starts 10xxxxxx (6+8=14 bits) b[1] = (val & 0x000000FF); b[0] = (val >> 8) | 0x80; // printf("E2: %d %x %x\n",val,(u32)b[0],(u32)b[1]); return 2; } else if (val < 0x00200000) { // Starts 110xxxxx (5+8+8=21 bits) b[2] = (val & 0x000000FF); b[1] = (val & 0x0000FF00) >> 8; b[0] = (val >> 16) | 0xC0; return 3; } else if (val < 0x10000000) { // Starts 1110xxxx (4+8+8+8=28 bits) b[3] = (val & 0x000000FF); b[2] = (val & 0x0000FF00) >> 8; b[1] = (val & 0x00FF0000) >> 16; b[0] = (val >> 24) | 0xE0; return 4; } else { // Starts 11110xxx (3+8+8+8+8=35 bits) b[4] = (val & 0x000000FF); b[3] = (val & 0x0000FF00) >> 8; b[2] = (val & 0x00FF0000) >> 16; b[1] = (val & 0xFF000000) >> 24; b[0] = 0xF0; // We don't have bits 32,33,34 return 5; } } int IDptByteEncode::Decode(const unsigned char *b, u32 &val) { // Decode the next sequence of bytes from the PseudoUtf-8 stream // into a character and place it in val. reeturn the length of // the sequence which has been removed. // Returns 0 if an invalid sequence is found (a byte 0xF[FEDC]) if ((b[0] & 0x80)== 0x00) { // Starts 0xxxxxxx (7 bits) val = b[0]; //printf("Returning 1 val %d\n",val); return 1; } else if ((b[0] & 0xC0)== 0x80) { // Starts 10xxxxxx (6+8=14 bits) u32 v0 = b[0]; val = b[1] | ((v0&0x3F)<<8); return 2; } else if ((b[0] & 0xE0)== 0xC0) { // Starts 110xxxxx (5+8+8=21 bits) u32 v0 = b[0], v1 = b[1]; val = b[2] | (v1<<8) | ((v0&0x1F)<<16); return 3; } else if ((b[0] & 0xF0)== 0xE0) { // Starts 1110xxxx (4+8+8+8=28 bits) u32 v0 = b[0], v1 = b[1], v2 = b[2]; val = b[3] | (v2<<8) | (v1<<16) | ((v0&0x0F)<<24); return 4; } else if ((b[0] & 0xF8)== 0xF0) { // Starts 11110xxx (3+8+8+8+8=35 bits) // unused v0: u32 v0 = b[0]; u32 v1 = b[1], v2 = b[2], v3 = b[3]; val = b[4] | (v3<<8) | (v2<<16) | (v1<<24); // No bits 32,33,34 return 5; } else return 0; } void IDptByteEncodeItr::Insert(u32 val) { if (val != 0) { // Normal insert fEndPtr += e.Encode(fEndPtr,val); fZeros = 0; } else { // Insert a zero fZeros++; if (fZeros == 1) { // First zero, just put a zero in, but prime fZerpPtr fZeroPtr = fEndPtr; fEndPtr += e.Encode(fEndPtr,val); } else { // More than one zero, OK to runlength encode *fZeroPtr = 0xFF; fEndPtr = fZeroPtr+1 + e.Encode(fZeroPtr+1,fZeros); } } } u32 IDptByteDecodeItr::GetNextValue() { if (fState == 1) fState = 2; // We have overrun buffer if (fState) return 0; // We have completed reading if (fZeros) { // We have a zero from a runlength fZeros--; return 0; } u32 val; int s = e.Decode(fEndPtr,val); if (s) { //printf("s = %d val = %d\n",s,val); fEndPtr += s; return val; // We have a normal value } else if (*fEndPtr == 0xFF) { // We have a new zero runlength fEndPtr++; // Jump over runlength marker int s = e.Decode(fEndPtr,val); if (!s) { fState = 3; return 0; } // Corruption in decoding runlength fEndPtr += s; fZeros = val-1; return 0; // Return the first zero of runlength } else if (*fEndPtr == 0xFE) { fEndPtr++; fState = 1; return 0; // Encountered end of the runlength } else { // Some unexpected value in runlength fEndPtr++; fState = 3; return 0; } } // This is stored here - it is the routine to do real UTF-8 encoding. // Not used #if 0 int DptByteEncode::RealUtf8Encode(unsigned char *b, u32 val) { // Encode 'val' according to real utf-8 // Places the 1 to 7 byte sequence at *b and returns the length // of the sequence. if (val < 0x00000080) { // Starts 0xxxxxxx (7 bits) b[0] = val; return 1; } else if (val < 0x00000800) { // Starts 110xxxxx (5+6=11 bits) b[1] = (val & 0x0000003F) | 0x80; b[0] = (val >> 6) | 0xC0; return 2; } else if (val < 0x00010000) { // Starts 1110xxxx (4+6+6=16 bits) b[2] = (val & 0x0000003F) | 0x80; b[1] = ((val & 0x00000FC0) >> 6) | 0x80; b[0] = (val >> 12) | 0xE0; return 3; } else if (val < 0x00200000) { // Starts 11110xxx (3+6+6+6=21 bits) b[3] = (val & 0x0000003F) | 0x80; b[2] = ((val & 0x00000FC0) >> 6) | 0x80; b[1] = ((val & 0x0003F000) >> 12) | 0x80; b[0] = (val >> 18) | 0xF0; return 4; } else if (val < 0x04000000) { // Starts 111110xx (2+6+6+6+6=26 bits) b[4] = (val & 0x0000003F) | 0x80; b[3] = ((val & 0x00000FC0) >> 6) | 0x80; b[2] = ((val & 0x0003F000) >> 12) | 0x80; b[1] = ((val & 0x00FC0000) >> 18) | 0x80; b[0] = (val >> 24) | 0xF8; return 5; } else if (val < 0x80000000) { // Starts 1111110x (1+6+6+6+6+6=31 bits) b[5] = (val & 0x0000003F) | 0x80; b[4] = ((val & 0x00000FC0) >> 6) | 0x80; b[3] = ((val & 0x0003F000) >> 12) | 0x80; b[2] = ((val & 0x00FC0000) >> 18) | 0x80; b[1] = ((val & 0x3F000000) >> 24) | 0x80; b[0] = (val >> 30) | 0xFC; return 6; } else { // Starts 11111110 (0+6+6+6+6+6+6=36 bits) b[6] = (val & 0x0000003F) | 0x80; b[5] = ((val & 0x00000FC0) >> 6) | 0x80; b[4] = ((val & 0x0003F000) >> 12) | 0x80; b[3] = ((val & 0x00FC0000) >> 18) | 0x80; b[2] = ((val & 0x3F000000) >> 24) | 0x80; b[1] = ((val & 0xC0000000) >> 30) | 0x80; b[0] = 0xFE; return 7; } } int DptByteEncode::RealUtf8Decode(unsigned char *b, u32 &val) { // Decodes utf-8 bytestream at b[] and places 32 bit character into // val. Returns the number of bytes from b[] which were consumed. if ((b[0] & 0x80)== 0x00) { // Starts 0xxxxxxx (7 bits) val = b[0]; return 1; } else if ((b[0] & 0xE0)== 0xC0) { // Starts 110xxxxx (5+6=11 bits) val = (b[1]&0x3F) | ((b[0]&0x1F)<<6); return 2; } else if ((b[0] & 0xF0)== 0xE0) { // Starts 1110xxxx (4+6+6=16 bits) val = (b[2]&0x3F) | ((b[1]&0x3F)<<6 | ((b[0]&0x0F)<<12); return 3; } else if ((b[0] & 0xF8)== 0xF0) { // Starts 11110xxx (3+6+6+6=21 bits) val = (b[3]&0x3F) | ((b[2]&0x3F)<<6 | ((b[1]&0x3F)<<12 | ((b[0]&0x07)<<18); return 4; } else if ((b[0] & 0xFC)== 0xF8) { // Starts 111110xx (2+6+6+6+6=26 bits) val = (b[4]&0x3F) | ((b[3]&0x3F)<<6 | ((b[2]&0x3F)<<12 | ((b[1]&0x3F)<<18 | ((b[0]&0x03)<<24); return 5; } else if ((b[0] & 0xFE)== 0xFC) { // Starts 1111110x (1+6+6+6+6+6=31 bits) val = (b[5]&0x3F) | ((b[4]&0x3F)<<6 | ((b[3]&0x3F)<<12 | ((b[2]&0x3F)<<18 | ((b[1]&0x3F)<<24 | ((b[0]&0x01)<<30); return 6; } else if ((b[0] == 0xFE) { // Starts 11111110 (0+6+6+6+6+6+6=36 bits) val = (b[6]&0x3F) | ((b[5]&0x3F)<<6 | ((b[4]&0x3F)<<12 // no bits 35-32 | ((b[3]&0x3F)<<18 | ((b[2]&0x3F)<<24 | ((b[1]&0x3F)<<30; return 7; } else return 0; // b[0]==0xFF only } #endif