summaryrefslogtreecommitdiff
path: root/library/MiscUtils.cpp
diff options
context:
space:
mode:
author Alexander Gavrilov 2012-04-05 18:10:16 +0400
committer Alexander Gavrilov 2012-04-05 18:10:16 +0400
commit 28a741082f8b0981806b8a63589279627bd8e39e (patch)
tree 1377cede68f392e756dcfa16bc76edc0b59d8714 /library/MiscUtils.cpp
parent 59f411e4016405ebef0a928c002098586be77ebb (diff)
download dfhack-28a741082f8b0981806b8a63589279627bd8e39e.tar.gz
dfhack-28a741082f8b0981806b8a63589279627bd8e39e.tar.bz2
dfhack-28a741082f8b0981806b8a63589279627bd8e39e.tar.xz
Encode & decode names in utf-8 for transfer in remote messages.
That's the encoding required by the protobuf spec.
Diffstat (limited to 'library/MiscUtils.cpp')
-rw-r--r--library/MiscUtils.cpp161
1 files changed, 160 insertions, 1 deletions
diff --git a/library/MiscUtils.cpp b/library/MiscUtils.cpp
index 8247cd00..9b26e2a6 100644
--- a/library/MiscUtils.cpp
+++ b/library/MiscUtils.cpp
@@ -37,6 +37,7 @@ distribution.
#include <stdarg.h>
#include <sstream>
+#include <map>
std::string stl_sprintf(const char *fmt, ...) {
va_list lst;
@@ -149,4 +150,162 @@ uint64_t GetTimeMs64()
return ret;
}
-#endif \ No newline at end of file
+#endif
+
+/* Character decoding */
+
+// Table-driven UTF-8 decoder; see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
+static const uint8_t utf8d[] = {
+ // The first part of the table (entries 0..255, indexed by byte value) maps bytes
+ // to character classes, to reduce the size of the transition table and create bitmasks.
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+ // The second part is a transition table that maps a combination of a state of the
+ // automaton and a character class to a state; it is read as utf8d[256 + state + class].
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+ 12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+// Advance the UTF-8 automaton by one input byte.
+//   state - in/out automaton state; UTF8_ACCEPT before the first byte of a sequence.
+//   codep - in/out accumulator for the code point being assembled.
+//   byte  - next input byte.
+// Returns the new state (also stored in *state). When it equals UTF8_ACCEPT,
+// *codep holds a complete code point.
+static inline uint32_t
+decode(uint32_t* state, uint32_t* codep, uint8_t byte) {
+    const uint32_t cls = utf8d[byte];
+
+    if (*state == UTF8_ACCEPT)
+    {
+        // First byte of a sequence: the class doubles as a shift count that
+        // masks off the length-marker bits of the lead byte.
+        *codep = (0xff >> cls) & (byte);
+    }
+    else
+    {
+        // Inside a sequence: append the low six payload bits.
+        *codep = (*codep << 6) | (byte & 0x3fu);
+    }
+
+    const uint32_t next = utf8d[256 + *state + cls];
+    *state = next;
+    return next;
+}
+
+/* Character encoding */
+
+// Write the UTF-8 encoding of the 16-bit code point c into out.
+// Returns the number of bytes written (1, 2 or 3); out must have room for 3.
+static inline int encode(uint8_t *out, uint16_t c) {
+    if (c > 0x7FF)
+    {
+        // 0x0800..0xFFFF: 1110xxxx 10xxxxxx 10xxxxxx
+        out[0] = uint8_t(0xE0 | (c >> 12));
+        out[1] = uint8_t(0x80 | ((c >> 6) & 0x3F));
+        out[2] = uint8_t(0x80 | (c & 0x3F));
+        return 3;
+    }
+
+    if (c > 0x7F)
+    {
+        // 0x0080..0x07FF: 110xxxxx 10xxxxxx
+        out[0] = uint8_t(0xC0 | (c >> 6));
+        out[1] = uint8_t(0x80 | (c & 0x3F));
+        return 2;
+    }
+
+    // 0x0000..0x007F: the value is stored as a single byte.
+    out[0] = uint8_t(c);
+    return 1;
+}
+
+/* CP437: character_table[b] is the code point that DF2UTF emits (as UTF-8) for input byte b */
+
+static uint16_t character_table[256] = {
+ 0, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022, // 0x00
+ 0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
+ 0x25BA, 0x25C4, 0x2195, 0x203C, 0xB6, 0xA7, 0x25AC, 0x21A8, // 0x10
+ 0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, // 0x20
+ 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, // 0x30
+ 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, // 0x40
+ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, // 0x50
+ 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, // 0x60
+ 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, // 0x70
+ 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x2302,
+ 0xC7, 0xFC, 0xE9, 0xE2, 0xE4, 0xE0, 0xE5, 0xE7, // 0x80
+ 0xEA, 0xEB, 0xE8, 0xEF, 0xEE, 0xEC, 0xC4, 0xC5,
+ 0xC9, 0xE6, 0xC6, 0xF4, 0xF6, 0xF2, 0xFB, 0xF9, // 0x90
+ 0xFF, 0xD6, 0xDC, 0xA2, 0xA3, 0xA5, 0x20A7, 0x192,
+ 0xE1, 0xED, 0xF3, 0xFA, 0xF1, 0xD1, 0xAA, 0xBA, // 0xA0
+ 0xBF, 0x2310, 0xAC, 0xBD, 0xBC, 0xA1, 0xAB, 0xBB,
+ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, // 0xB0
+ 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
+ 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, // 0xC0
+ 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
+ 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, // 0xD0
+ 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
+ 0x3B1, 0xDF, 0x393, 0x3C0, 0x3A3, 0x3C3, 0xB5, 0x3C4, // 0xE0
+ 0x3A6, 0x398, 0x3A9, 0x3B4, 0x221E, 0x3C6, 0x3B5, 0x2229,
+ 0x2261, 0xB1, 0x2265, 0x2264, 0x2320, 0x2321, 0xF7, 0x2248, // 0xF0
+ 0xB0, 0x2219, 0xB7, 0x221A, 0x207F, 0xB2, 0x25A0, 0xA0
+};
+
+// Convert a string in the game's single-byte encoding to UTF-8.
+// Each input byte is looked up in character_table and the resulting
+// code point is appended to the output in UTF-8 form.
+std::string DF2UTF(const std::string &in)
+{
+    std::string result;
+    result.reserve(in.size());
+
+    for (std::string::const_iterator it = in.begin(); it != in.end(); ++it)
+    {
+        uint8_t utf8[4];
+        const int len = encode(utf8, character_table[(uint8_t)*it]);
+        result.append(utf8, utf8 + len);
+    }
+
+    return result;
+}
+
+// Convert a UTF-8 string to the game's single-byte encoding (inverse of DF2UTF).
+//
+// Each decoded code point is mapped back to the byte whose character_table
+// entry equals it. Code points with no such byte become '?'. Malformed UTF-8
+// produces one '?' per rejected sequence, and a multi-byte sequence that is
+// cut off by the end of the input also produces a single '?'.
+std::string UTF2DF(const std::string &in)
+{
+    // Reverse lookup (code point -> byte), filled on first call. Only bytes whose
+    // table entry differs from the byte value are stored; identity entries are
+    // answered directly from character_table in the loop below.
+    static std::map<uint32_t, char> ctable;
+
+    if (ctable.empty())
+    {
+        for (uint16_t i = 0; i < 256; i++)
+            if (character_table[i] != i)
+                ctable[character_table[i]] = char(i);
+    }
+
+    // Actual conversion loop. Every output character consumes at least one
+    // input byte, so the result never exceeds the input length.
+    const size_t size = in.size();
+    std::string out;
+    out.reserve(size);
+
+    uint32_t codepoint = 0;
+    uint32_t state = UTF8_ACCEPT, prev = UTF8_ACCEPT;
+
+    for (size_t i = 0; i < size; prev = state, i++) {
+        switch (decode(&state, &codepoint, uint8_t(in[i]))) {
+        case UTF8_ACCEPT:
+            if (codepoint < 256 && character_table[codepoint] == codepoint) {
+                out.push_back(char(codepoint));
+            } else {
+                // Use find(), not operator[]: operator[] would insert an entry
+                // for every unknown code point, growing the static map without
+                // bound and modifying shared state on every miss.
+                std::map<uint32_t, char>::const_iterator hit = ctable.find(codepoint);
+                out.push_back(hit != ctable.end() ? hit->second : '?');
+            }
+            break;
+
+        case UTF8_REJECT:
+            out.push_back('?');
+            // The offending byte may itself start a valid sequence: when it
+            // broke an unfinished one, feed it again from the initial state.
+            // (prev != UTF8_ACCEPT implies i >= 1, so --i cannot wrap.)
+            if (prev != UTF8_ACCEPT) --i;
+            state = UTF8_ACCEPT;
+            break;
+        }
+    }
+
+    // Input ended in the middle of a multi-byte sequence: report it like any
+    // other malformed input instead of silently dropping the bytes.
+    if (state != UTF8_ACCEPT)
+        out.push_back('?');
+
+    return out;
+}