diff options
| author | Alexander Gavrilov | 2012-04-05 18:10:16 +0400 |
|---|---|---|
| committer | Alexander Gavrilov | 2012-04-05 18:10:16 +0400 |
| commit | 28a741082f8b0981806b8a63589279627bd8e39e (patch) | |
| tree | 1377cede68f392e756dcfa16bc76edc0b59d8714 /library/MiscUtils.cpp | |
| parent | 59f411e4016405ebef0a928c002098586be77ebb (diff) | |
| download | dfhack-28a741082f8b0981806b8a63589279627bd8e39e.tar.gz dfhack-28a741082f8b0981806b8a63589279627bd8e39e.tar.bz2 dfhack-28a741082f8b0981806b8a63589279627bd8e39e.tar.xz | |
Encode & decode names in utf-8 for transfer in remote messages.
That's the encoding required by the protobuf spec.
Diffstat (limited to 'library/MiscUtils.cpp')
| -rw-r--r-- | library/MiscUtils.cpp | 161 |
1 files changed, 160 insertions, 1 deletions
diff --git a/library/MiscUtils.cpp b/library/MiscUtils.cpp index 8247cd00..9b26e2a6 100644 --- a/library/MiscUtils.cpp +++ b/library/MiscUtils.cpp @@ -37,6 +37,7 @@ distribution. #include <stdarg.h> #include <sstream> +#include <map> std::string stl_sprintf(const char *fmt, ...) { va_list lst; @@ -149,4 +150,162 @@ uint64_t GetTimeMs64() return ret; } -#endif
// NOTE(review): in the diff this excerpt was taken from, an "#endif" line
// precedes this section; it closes a conditional opened outside the visible
// text and is therefore not reproduced here.

/* Character decoding */

// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

static const uint8_t utf8d[] = {
    // The first part of the table maps bytes to character classes, to
    // reduce the size of the transition table and create bitmasks.
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

    // The second part is a transition table that maps a combination
    // of a state of the automaton and a character class to a state.
    0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
    12,36,12,12,12,12,12,12,12,12,12,12,
};

// Feeds one byte into the UTF-8 decoding automaton.
//
//   state - in/out automaton state; start from UTF8_ACCEPT.
//   codep - in/out accumulator for the code point being assembled.
//   byte  - next input byte.
//
// Returns the new state: UTF8_ACCEPT when *codep holds a complete code
// point, UTF8_REJECT when the input is malformed, and any other value
// when more bytes are needed.
static inline uint32_t
decode(uint32_t* state, uint32_t* codep, uint8_t byte) {
    uint32_t type = utf8d[byte];

    // A continuation byte contributes its low 6 bits; a leading byte
    // contributes whatever payload bits its character class leaves.
    *codep = (*state != UTF8_ACCEPT) ?
        (byte & 0x3fu) | (*codep << 6) :
        (0xff >> type) & (byte);

    *state = utf8d[256 + *state + type];
    return *state;
}

/* Character encoding */

// Writes the UTF-8 encoding of the 16-bit code point c into out, which
// must have room for at least 3 bytes. Returns the number of bytes written.
static inline int encode(uint8_t *out, uint16_t c) {
    if (c <= 0x7F)
    {
        out[0] = c;
        return 1;
    }
    else if (c <= 0x7FF)
    {
        out[0] = (0xC0 | (c >> 6));
        out[1] = (0x80 | (c & 0x3F));
        return 2;
    }
    else /*if (c <= 0xFFFF)*/
    {
        out[0] = (0xE0 | (c >> 12));
        out[1] = (0x80 | ((c >> 6) & 0x3F));
        out[2] = (0x80 | (c & 0x3F));
        return 3;
    }
}

/* CP437 */

// Maps each CP437 byte value (the array index) to a Unicode code point.
static const uint16_t character_table[256] = {
    0,      0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022, // 0x00
    0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
    0x25BA, 0x25C4, 0x2195, 0x203C, 0xB6,   0xA7,   0x25AC, 0x21A8, // 0x10
    0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,
    0x20,   0x21,   0x22,   0x23,   0x24,   0x25,   0x26,   0x27,   // 0x20
    0x28,   0x29,   0x2A,   0x2B,   0x2C,   0x2D,   0x2E,   0x2F,
    0x30,   0x31,   0x32,   0x33,   0x34,   0x35,   0x36,   0x37,   // 0x30
    0x38,   0x39,   0x3A,   0x3B,   0x3C,   0x3D,   0x3E,   0x3F,
    0x40,   0x41,   0x42,   0x43,   0x44,   0x45,   0x46,   0x47,   // 0x40
    0x48,   0x49,   0x4A,   0x4B,   0x4C,   0x4D,   0x4E,   0x4F,
    0x50,   0x51,   0x52,   0x53,   0x54,   0x55,   0x56,   0x57,   // 0x50
    0x58,   0x59,   0x5A,   0x5B,   0x5C,   0x5D,   0x5E,   0x5F,
    0x60,   0x61,   0x62,   0x63,   0x64,   0x65,   0x66,   0x67,   // 0x60
    0x68,   0x69,   0x6A,   0x6B,   0x6C,   0x6D,   0x6E,   0x6F,
    0x70,   0x71,   0x72,   0x73,   0x74,   0x75,   0x76,   0x77,   // 0x70
    0x78,   0x79,   0x7A,   0x7B,   0x7C,   0x7D,   0x7E,   0x2302,
    0xC7,   0xFC,   0xE9,   0xE2,   0xE4,   0xE0,   0xE5,   0xE7,   // 0x80
    0xEA,   0xEB,   0xE8,   0xEF,   0xEE,   0xEC,   0xC4,   0xC5,
    0xC9,   0xE6,   0xC6,   0xF4,   0xF6,   0xF2,   0xFB,   0xF9,   // 0x90
    0xFF,   0xD6,   0xDC,   0xA2,   0xA3,   0xA5,   0x20A7, 0x192,
    0xE1,   0xED,   0xF3,   0xFA,   0xF1,   0xD1,   0xAA,   0xBA,   // 0xA0
    0xBF,   0x2310, 0xAC,   0xBD,   0xBC,   0xA1,   0xAB,   0xBB,
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, // 0xB0
    0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
    0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, // 0xC0
    0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
    0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, // 0xD0
    0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
    0x3B1,  0xDF,   0x393,  0x3C0,  0x3A3,  0x3C3,  0xB5,   0x3C4,  // 0xE0
    0x3A6,  0x398,  0x3A9,  0x3B4,  0x221E, 0x3C6,  0x3B5,  0x2229,
    0x2261, 0xB1,   0x2265, 0x2264, 0x2320, 0x2321, 0xF7,   0x2248, // 0xF0
    0xB0,   0x2219, 0xB7,   0x221A, 0x207F, 0xB2,   0x25A0, 0xA0
};

// Converts a string of CP437 bytes to UTF-8.
//
// Every input byte is looked up in character_table and the resulting code
// point is appended in UTF-8 form, so each input byte yields 1 to 3 output
// bytes.
std::string DF2UTF(const std::string &in)
{
    std::string out;
    out.reserve(in.size());

    uint8_t buf[4];
    for (size_t i = 0; i < in.size(); i++)
    {
        const int cnt = encode(buf, character_table[(uint8_t)in[i]]);
        out.append(reinterpret_cast<const char*>(buf), cnt);
    }

    return out;
}

// Converts a UTF-8 string to CP437 bytes.
//
// Code points that have no entry in character_table, and malformed or
// truncated UTF-8 sequences, are each replaced by a single '?'.
std::string UTF2DF(const std::string &in)
{
    // Unicode to CP437 lookup table, filled on the first call. Only
    // entries where the code point differs from the byte value are
    // stored; identity entries are tested against character_table directly.
    // NOTE(review): the lazy fill is not synchronized - confirm callers
    // do not race on the first call.
    static std::map<uint32_t, char> ctable;

    if (ctable.empty())
    {
        for (uint16_t i = 0; i < 256; i++)
            if (character_table[i] != i)
                ctable[character_table[i]] = char(i);
    }

    // Actual conversion loop
    const size_t size = in.size();
    std::string out;
    out.reserve(size); // output never has more bytes than the input

    uint32_t codepoint = 0;
    uint32_t state = UTF8_ACCEPT, prev = UTF8_ACCEPT;

    for (size_t i = 0; i < size; prev = state, i++) {
        switch (decode(&state, &codepoint, uint8_t(in[i]))) {
        case UTF8_ACCEPT:
            if (codepoint < 256 && character_table[codepoint] == codepoint) {
                out.push_back(char(codepoint));
            } else {
                // find() rather than operator[]: a miss must not insert
                // a new entry into the shared table.
                std::map<uint32_t, char>::const_iterator it = ctable.find(codepoint);
                out.push_back(it != ctable.end() ? it->second : '?');
            }
            break;

        case UTF8_REJECT:
            out.push_back('?');
            // The offending byte ended a partial sequence; it may itself
            // start a valid character, so process it again from scratch.
            if (prev != UTF8_ACCEPT) --i;
            state = UTF8_ACCEPT;
            break;
        }
    }

    // Input ended in the middle of a multi-byte sequence.
    if (state != UTF8_ACCEPT)
        out.push_back('?');

    return out;
}
