diff --git a/simgear/misc/strutils.cxx b/simgear/misc/strutils.cxx
index 27fae3df..e8117e6b 100644
--- a/simgear/misc/strutils.cxx
+++ b/simgear/misc/strutils.cxx
@@ -29,9 +29,8 @@
 #include <string.h>             // strerror_r() and strerror_s()
 #include <cctype>
 #include <cerrno>
-#include <locale>
-#include <codecvt>
-
+#include <cassert>
+
 #include "strutils.hxx"
 
 #include <simgear/debug/logstream.hxx>
@@ -42,6 +41,8 @@
 
 #if defined(SG_WINDOWS)
     #include <windows.h>
+    #include <codecvt>
+    #include <locale>
 #endif
 
 using std::string;
@@ -654,14 +655,103 @@ static std::string convertWStringToMultiByte(DWORD encoding, const std::wstring&
 
+// Decode the UTF-8 bytes in 'a' into a wide string. When SG_WINDOWS is not
+// defined the decoding is done by hand, one code point per wchar_t, and
+// malformed or truncated input raises sg_format_exception.
 std::wstring convertUtf8ToWString(const std::string& a)
 {
+#if defined(SG_WINDOWS)
     std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> ucs2conv;
     return ucs2conv.from_bytes(a);
+#else
+    // one whole code point is stored per wchar_t, so it must be 32 bits wide
+    assert(sizeof(wchar_t) == 4);
+    std::wstring result;
+    int expectedContinuationCount = 0;
+    wchar_t wc = 0;
+
+    for (uint8_t utf8CodePoint : a) {
+        if (utf8CodePoint <= 0x7f) {
+            // ASCII 7-bit range; not allowed in the middle of a sequence
+            if (expectedContinuationCount != 0) {
+                throw sg_format_exception();
+            }
+
+            result.push_back(static_cast<wchar_t>(utf8CodePoint));
+        } else if (expectedContinuationCount > 0) {
+            // continuation bytes always look like 10xxxxxx
+            if ((utf8CodePoint & 0xC0) != 0x80) {
+                throw sg_format_exception();
+            }
+
+            wc = (wc << 6) | (utf8CodePoint & 0x3F);
+            if (--expectedContinuationCount == 0) {
+                result.push_back(wc);
+            }
+        } else {
+            // lead byte: its high bits give the length of the sequence
+            if ((utf8CodePoint & 0xE0) == 0xC0) {
+                expectedContinuationCount = 1;
+                wc = utf8CodePoint & 0x1f;
+            } else if ((utf8CodePoint & 0xF0) == 0xE0) {
+                expectedContinuationCount = 2;
+                wc = utf8CodePoint & 0x0f;
+            } else if ((utf8CodePoint & 0xF8) == 0xF0) {
+                expectedContinuationCount = 3;
+                wc = utf8CodePoint & 0x07;
+            } else {
+                // illegal UTF-8 encoding
+                throw sg_format_exception();
+            }
+        }
+    } // of UTF-8 code point iteration
+
+    // the input stopped part-way through a multi-byte sequence: report it
+    // rather than silently dropping the unfinished character
+    if (expectedContinuationCount != 0) {
+        throw sg_format_exception();
+    }
+
+    return result;
+#endif
 }
 
+// Encode the wide string 'w' as UTF-8. When SG_WINDOWS is not defined each
+// wchar_t is treated as one code point; values above U+10FFFF raise
+// sg_format_exception.
 std::string convertWStringToUtf8(const std::wstring& w)
 {
+#if defined(SG_WINDOWS)
     std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> ucs2conv;
     return ucs2conv.to_bytes(w);
+#else
+    assert(sizeof(wchar_t) == 4);
+    std::string result;
+
+    for (wchar_t wc : w) {
+        // wchar_t can be a signed type; test the unsigned value so that a
+        // negative input is rejected below instead of passing as ASCII
+        const char32_t cp = static_cast<char32_t>(wc);
+        if (cp <= 0x7f) {
+            result.push_back(static_cast<char>(cp));
+        } else if (cp <= 0x07ff) {
+            result.push_back(static_cast<char>(0xC0 | ((cp >> 6) & 0x1f)));
+            result.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
+        } else if (cp <= 0xffff) {
+            result.push_back(static_cast<char>(0xE0 | ((cp >> 12) & 0x0f)));
+            result.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3f)));
+            result.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
+        } else if (cp <= 0x10ffff) {
+            // inclusive bound: U+10FFFF is itself a valid code point
+            result.push_back(static_cast<char>(0xF0 | ((cp >> 18) & 0x07)));
+            result.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3f)));
+            result.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3f)));
+            result.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
+        } else {
+            throw sg_format_exception();
+        }
+    }
+
+    return result;
+#endif
 }
 
 std::string convertWindowsLocal8BitToUtf8(const std::string& a)
diff --git a/simgear/misc/strutils_test.cxx b/simgear/misc/strutils_test.cxx
index 84978da5..3a5f2e72 100644
--- a/simgear/misc/strutils_test.cxx
+++ b/simgear/misc/strutils_test.cxx
@@ -605,6 +605,24 @@ void test_readTime()
     SG_CHECK_EQUAL_EP(strutils::readTime("-0:0:28"), -28 * seconds);
 }
 
+void test_utf8Convert()
+{
+    // F, smiley emoticon, Maths summation symbol, section sign
+    std::wstring a(L"\u0046\U0001F600\u2211\u00A7");
+
+    std::string utf8A = strutils::convertWStringToUtf8(a);
+    SG_VERIFY(utf8A == std::string("F\xF0\x9F\x98\x80\xE2\x88\x91\xC2\xA7"));
+
+    std::wstring aRoundTrip = strutils::convertUtf8ToWString(utf8A);
+    SG_VERIFY(a == aRoundTrip);
+
+    // U+10FFFF is the highest valid code point and must round-trip too
+    std::wstring last(L"\U0010FFFF");
+    std::string utf8Last = strutils::convertWStringToUtf8(last);
+    SG_VERIFY(utf8Last == std::string("\xF4\x8F\xBF\xBF"));
+    SG_VERIFY(strutils::convertUtf8ToWString(utf8Last) == last);
+}
+
 int main(int argc, char* argv[])
 {
     test_strip();
@@ -624,6 +642,7 @@ int main(int argc, char* argv[])
     test_error_string();
     test_propPathMatch();
     test_readTime();
+    test_utf8Convert();
 
     return EXIT_SUCCESS;
 }