Text.cpp

Go to the documentation of this file.
00001 /*
00002  * Copyright (C) 2001-2010 Jacek Sieka, arnetheduck on gmail point com
00003  *
00004  * This program is free software; you can redistribute it and/or modify
00005  * it under the terms of the GNU General Public License as published by
00006  * the Free Software Foundation; either version 2 of the License, or
00007  * (at your option) any later version.
00008  *
00009  * This program is distributed in the hope that it will be useful,
00010  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  * GNU General Public License for more details.
00013  *
00014  * You should have received a copy of the GNU General Public License
00015  * along with this program; if not, write to the Free Software
00016  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
00017  */
00018 
00019 #include "adchpp.h"
00020 
00021 #include "Text.h"
00022 
00023 #include "Util.h"
00024 
00025 using namespace std;
00026 
00027 namespace adchpp {
00028 
00029 int Text::utf8ToWc(const char* str, wchar_t& c) {
00030     uint8_t c0 = (uint8_t)str[0];
00031     if(c0 & 0x80) {                                 // 1xxx xxxx
00032         if(c0 & 0x40) {                             // 11xx xxxx
00033             if(c0 & 0x20) {                         // 111x xxxx
00034                 if(c0 & 0x10) {                     // 1111 xxxx
00035                     int n = -4;
00036                     if(c0 & 0x08) {                 // 1111 1xxx
00037                         n = -5;
00038                         if(c0 & 0x04) {             // 1111 11xx
00039                             if(c0 & 0x02) {         // 1111 111x
00040                                 return -1;
00041                             }
00042                             n = -6;
00043                         }
00044                     }
00045                     int i = -1;
00046                     while(i > n && (str[abs(i)] & 0x80) == 0x80)
00047                         --i;
00048                     return i;
00049                 } else {        // 1110xxxx
00050                     uint8_t c1 = (uint8_t)str[1];
00051                     if((c1 & (0x80 | 0x40)) != 0x80)
00052                         return -1;
00053 
00054                     uint8_t c2 = (uint8_t)str[2];
00055                     if((c2 & (0x80 | 0x40)) != 0x80)
00056                         return -2;
00057 
00058                     // Ugly utf-16 surrogate catch
00059                     if((c0 & 0x0f) == 0x0d && (c1 & 0x3c) >= (0x08 << 2))
00060                         return -3;
00061 
00062                     // Overlong encoding
00063                     if(c0 == (0x80 | 0x40 | 0x20) && (c1 & (0x80 | 0x40 | 0x20)) == 0x80)
00064                         return -3;
00065 
00066                     c = (((wchar_t)c0 & 0x0f) << 12) |
00067                         (((wchar_t)c1 & 0x3f) << 6) |
00068                         ((wchar_t)c2 & 0x3f);
00069 
00070                     return 3;
00071                 }
00072             } else {                // 110xxxxx
00073                 uint8_t c1 = (uint8_t)str[1];
00074                 if((c1 & (0x80 | 0x40)) != 0x80)
00075                     return -1;
00076 
00077                 // Overlong encoding
00078                 if((c0 & ~1) == (0x80 | 0x40))
00079                     return -2;
00080 
00081                 c = (((wchar_t)c0 & 0x1f) << 6) |
00082                     ((wchar_t)c1 & 0x3f);
00083                 return 2;
00084             }
00085         } else {                    // 10xxxxxx
00086             return -1;
00087         }
00088     } else {                        // 0xxxxxxx
00089         c = (unsigned char)str[0];
00090         return 1;
00091     }
00092     dcassert(0);
00093 }
00094 
00095 void Text::wcToUtf8(wchar_t c, string& str) {
00096     if(c >= 0x0800) {
00097         str += (char)(0x80 | 0x40 | 0x20 | (c >> 12));
00098         str += (char)(0x80 | ((c >> 6) & 0x3f));
00099         str += (char)(0x80 | (c & 0x3f));
00100     } else if(c >= 0x0080) {
00101         str += (char)(0x80 | 0x40 | (c >> 6));
00102         str += (char)(0x80 | (c & 0x3f));
00103     } else {
00104         str += (char)c;
00105     }
00106 }
00107 
00108 const string& Text::acpToUtf8(const string& str, string& tmp) throw() {
00109     wstring wtmp;
00110     return wideToUtf8(acpToWide(str, wtmp), tmp);
00111 }
00112 
00113 const wstring& Text::acpToWide(const string& str, wstring& tmp) throw() {
00114     if(str.empty())
00115         return Util::emptyStringW;
00116 #ifdef _WIN32
00117     int n = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, str.c_str(), (int)str.length(), NULL, 0);
00118     if(n == 0) {
00119         return Util::emptyStringW;
00120     }
00121 
00122     tmp.resize(n);
00123     n = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, str.c_str(), (int)str.length(), &tmp[0], n);
00124     if(n == 0) {
00125         return Util::emptyStringW;
00126     }
00127     return tmp;
00128 #else
00129     size_t rv;
00130     wchar_t wc;
00131     const char *src = str.c_str();
00132     size_t n = str.length() + 1;
00133 
00134     tmp.clear();
00135     tmp.reserve(n);
00136 
00137     while(n > 0) {
00138         rv = mbrtowc(&wc, src, n, NULL);
00139         if(rv == 0 || rv == (size_t)-2) {
00140             break;
00141         } else if(rv == (size_t)-1) {
00142             tmp.push_back(L'_');
00143             ++src;
00144             --n;
00145         } else {
00146             tmp.push_back(wc);
00147             src += rv;
00148             n -= rv;
00149         }
00150     }
00151     return tmp;
00152 #endif
00153 }
00154 
00155 const string& Text::wideToUtf8(const wstring& str, string& tgt) throw() {
00156     if(str.empty()) {
00157         return Util::emptyString;
00158     }
00159 
00160     string::size_type n = str.length();
00161     tgt.clear();
00162     for(string::size_type i = 0; i < n; ++i) {
00163         wcToUtf8(str[i], tgt);
00164     }
00165     return tgt;
00166 }
00167 
00168 const string& Text::wideToAcp(const wstring& str, string& tmp) throw() {
00169     if(str.empty())
00170         return Util::emptyString;
00171 #ifdef _WIN32
00172     int n = WideCharToMultiByte(CP_ACP, 0, str.c_str(), (int)str.length(), NULL, 0, NULL, NULL);
00173     if(n == 0) {
00174         return Util::emptyString;
00175     }
00176 
00177     tmp.resize(n);
00178     n = WideCharToMultiByte(CP_ACP, 0, str.c_str(), (int)str.length(), &tmp[0], n, NULL, NULL);
00179     if(n == 0) {
00180         return Util::emptyString;
00181     }
00182     return tmp;
00183 #else
00184     const wchar_t* src = str.c_str();
00185     int n = wcsrtombs(NULL, &src, 0, NULL);
00186     if(n < 1) {
00187         return Util::emptyString;
00188     }
00189     src = str.c_str();
00190     tmp.resize(n);
00191     n = wcsrtombs(&tmp[0], &src, n, NULL);
00192     if(n < 1) {
00193         return Util::emptyString;
00194     }
00195     return tmp;
00196 #endif
00197 }
00198 
00199 bool Text::validateUtf8(const string& str) throw() {
00200     string::size_type i = 0;
00201     while(i < str.length()) {
00202         wchar_t dummy = 0;
00203         int j = utf8ToWc(&str[i], dummy);
00204         if(j < 0)
00205             return false;
00206         i += j;
00207     }
00208     return true;
00209 }
00210 
00211 const string& Text::utf8ToAcp(const string& str, string& tmp) throw() {
00212     wstring wtmp;
00213     return wideToAcp(utf8ToWide(str, wtmp), tmp);
00214 }
00215 
00216 const wstring& Text::utf8ToWide(const string& str, wstring& tgt) throw() {
00217     tgt.reserve(str.length());
00218     string::size_type n = str.length();
00219     for(string::size_type i = 0; i < n; ) {
00220         wchar_t c = 0;
00221         int x = utf8ToWc(str.c_str() + i, c);
00222         if(x < 0) {
00223             tgt += '_';
00224             i += abs(x);
00225         } else {
00226             i += x;
00227             tgt += c;
00228         }
00229     }
00230     return tgt;
00231 }
00232 
00233 string Text::acpToUtf8(const string& str) throw() {
00234     string tmp;
00235     return acpToUtf8(str, tmp);
00236 }
00237 
00238 wstring Text::acpToWide(const string& str) throw() {
00239     wstring tmp;
00240     return acpToWide(str, tmp);
00241 }
00242 
00243 string Text::utf8ToAcp(const string& str) throw() {
00244     string tmp;
00245     return utf8ToAcp(str, tmp);
00246 }
00247 
00248 wstring Text::utf8ToWide(const string& str) throw() {
00249     wstring tmp;
00250     return utf8ToWide(str, tmp);
00251 }
00252 
00253 string Text::wideToAcp(const wstring& str) throw() {
00254     string tmp;
00255     return wideToAcp(str, tmp);
00256 }
00257 
00258 string Text::wideToUtf8(const wstring& str) throw() {
00259     string tmp;
00260     return wideToUtf8(str, tmp);
00261 }
00262 
00263 } // namespace adchpp