• UTF-8字符串操作方法


    1. /*
    2. UTF-8(Unicode Transformation Format-8)是一种可变长度的字符编码,它可以表示 Unicode 字符集中的所有字符。下面是 UTF-8 编码的基本规范:
    3. 分为四个区间:
    4. 0x0000 0000 至 0x0000 007F:0xxxxxxx
    5. 0x0000 0080 至 0x0000 07FF:110xxxxx 10xxxxxx
    6. 0x0000 0800 至 0x0000 FFFF:1110xxxx 10xxxxxx 10xxxxxx
    7. 0x0001 0000 至 0x0010 FFFF:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    8. 第一个字节前面有几个连续的1,表示该字符需要几个字节表示。
    9. 例如:110xxxxx 10xxxxxx 前面几位110 表示该字符需要2个字节
    10. 例如:1110xxxx 10xxxxxx 10xxxxxx 前面几位1110 表示该字符需要3个字节
    11. 同时第一个字节为标识位,后面每个字节的前2位都是10。
    12. */
    #include <algorithm>
    #include <cctype>
    #include <codecvt>
    #include <cstddef>
    #include <cwchar>
    #include <cwctype>
    #include <locale>
    #include <stdexcept>
    #include <string>
    #include "boost/locale.hpp"
    #include <windows.h>
    19. using namespace std;
    // File-local helpers that convert between UTF-8, GBK and wide strings.
    // Every helper returns true on success and false when the conversion fails;
    // on failure the output argument is left unmodified.
    namespace
    {
        using CODECVR_BYNAME_TYPE = std::codecvt_byname<wchar_t, char, std::mbstate_t>;

        // Locale name used to select the GBK code page (Windows code page 936).
        const char* GBK_LOCALE_NAME = ".936";

        // std::codecvt_byname declares a protected destructor, but
        // std::wstring_convert must be able to destroy the facet it owns.
        // This thin wrapper only makes the destructor accessible.
        struct DeletableGbkFacet : CODECVR_BYNAME_TYPE
        {
            explicit DeletableGbkFacet(const char* locale_name)
                : CODECVR_BYNAME_TYPE(locale_name)
            {
            }
            ~DeletableGbkFacet() {}
        };

        // NOTE(review): codecvt_utf8<wchar_t> is limited to the BMP where wchar_t
        // is 16 bits wide -- confirm whether supplementary characters must be supported.
        using Utf8Converter = std::wstring_convert<std::codecvt_utf8<wchar_t>>;
        using GbkConverter = std::wstring_convert<DeletableGbkFacet>;

        // Decodes UTF-8 bytes into a wide string.
        bool UTF8ToWide(std::wstring& wstr, const std::string& utf8)
        {
            try
            {
                Utf8Converter converter;
                wstr = converter.from_bytes(utf8);
                return true;
            }
            catch (const std::runtime_error&)
            {
                // from_bytes reports malformed input via std::range_error.
                return false;
            }
        }

        // Decodes GBK bytes into a wide string.
        bool GBKToWide(std::wstring& wstr, const std::string& gbk)
        {
            try
            {
                // The facet constructor throws std::runtime_error when the
                // named locale is not available on this system.
                GbkConverter convert(new DeletableGbkFacet(GBK_LOCALE_NAME));
                wstr = convert.from_bytes(gbk);
                return true;
            }
            catch (const std::runtime_error&)
            {
                return false;
            }
        }

        // Encodes a wide string as UTF-8 bytes.
        bool WideToUTF8(std::string& utf8, const std::wstring& wstr)
        {
            try
            {
                Utf8Converter converter;
                utf8 = converter.to_bytes(wstr);
                return true;
            }
            catch (const std::runtime_error&)
            {
                return false;
            }
        }

        // Encodes a wide string as GBK bytes.
        bool WideToGBK(std::string& gbk, const std::wstring& wstr)
        {
            try
            {
                GbkConverter convert(new DeletableGbkFacet(GBK_LOCALE_NAME));
                gbk = convert.to_bytes(wstr);
                return true;
            }
            catch (const std::runtime_error&)
            {
                return false;
            }
        }

        // Re-encodes GBK bytes as UTF-8 by going through a wide string.
        bool GBKToUTF8(std::string& utf8, const std::string& gbk)
        {
            std::wstring wstr;
            return GBKToWide(wstr, gbk) && WideToUTF8(utf8, wstr);
        }

        // Re-encodes UTF-8 bytes as GBK by going through a wide string.
        bool UTF8ToGBK(std::string& gbk, const std::string& utf8)
        {
            std::wstring wstr;
            return UTF8ToWide(wstr, utf8) && WideToGBK(gbk, wstr);
        }
    }
    // Returns true when `str` is a well-formed UTF-8 byte sequence.
    //
    // Validation follows RFC 3629: sequences are 1 to 4 bytes long, and overlong
    // encodings, UTF-16 surrogates (U+D800..U+DFFF) and code points above U+10FFFF
    // are rejected, as are truncated sequences. The obsolete 5- and 6-byte forms
    // are not accepted. An empty or pure-ASCII string is valid UTF-8.
    bool IsStringUTF8(const std::string& str)
    {
        const std::size_t size = str.size();
        std::size_t pos = 0;
        while (pos < size)
        {
            const unsigned char lead = static_cast<unsigned char>(str[pos]);
            if (lead < 0x80)
            {
                ++pos; // single-byte (ASCII) character
                continue;
            }

            // Number of continuation bytes, plus the range allowed for the FIRST
            // continuation byte. Narrowing that range for some lead bytes is what
            // excludes overlong forms, surrogates and values above U+10FFFF.
            std::size_t trailing = 0;
            unsigned char first_min = 0x80;
            unsigned char first_max = 0xBF;
            if (lead >= 0xC2 && lead <= 0xDF)
            {
                trailing = 1;
            }
            else if (lead >= 0xE0 && lead <= 0xEF)
            {
                trailing = 2;
                if (lead == 0xE0)
                    first_min = 0xA0; // below this would be an overlong encoding
                else if (lead == 0xED)
                    first_max = 0x9F; // above this would encode a surrogate
            }
            else if (lead >= 0xF0 && lead <= 0xF4)
            {
                trailing = 3;
                if (lead == 0xF0)
                    first_min = 0x90; // below this would be an overlong encoding
                else if (lead == 0xF4)
                    first_max = 0x8F; // above this would exceed U+10FFFF
            }
            else
            {
                // Stray continuation byte, overlong lead (0xC0/0xC1) or 0xF5..0xFF.
                return false;
            }

            // The sequence must not run past the end of the string.
            if (size - pos <= trailing)
                return false;

            const unsigned char first = static_cast<unsigned char>(str[pos + 1]);
            if (first < first_min || first > first_max)
                return false;

            // Remaining continuation bytes must match 10xxxxxx.
            for (std::size_t k = 2; k <= trailing; ++k)
            {
                const unsigned char next = static_cast<unsigned char>(str[pos + k]);
                if ((next & 0xC0) != 0x80)
                    return false;
            }
            pos += trailing + 1;
        }
        return true;
    }
    // Returns the number of characters (Unicode code points) in a UTF-8 string.
    //
    // Every byte that is not a continuation byte (10xxxxxx) starts a new
    // character, so counting those bytes yields the character count. This is
    // portable and counts a 4-byte character once. The input is not validated:
    // for malformed UTF-8 the result is simply the number of non-continuation
    // bytes.
    int UTF8StringSize(const std::string& str)
    {
        int count = 0;
        for (const char ch : str)
        {
            if ((static_cast<unsigned char>(ch) & 0xC0) != 0x80)
                ++count;
        }
        return count;
    }
    // Returns the total number of bytes of the UTF-8 character that starts with
    // the lead byte `chr`, classified by its leading bit pattern:
    //   0xxxxxxx -> 1, 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4.
    // Returns 0 when `chr` cannot start a character (a continuation byte
    // 10xxxxxx, or a byte of the form 11111xxx).
    //
    // The byte is examined as unsigned char: where plain char is signed, bytes
    // >= 0x80 are negative and would never match comparisons such as >= 0xC0.
    int GetUTF8Bytes(const char chr)
    {
        const unsigned char lead = static_cast<unsigned char>(chr);
        if ((lead & 0x80) == 0x00)
            return 1;
        if ((lead & 0xE0) == 0xC0)
            return 2;
        if ((lead & 0xF0) == 0xE0)
            return 3;
        if ((lead & 0xF8) == 0xF0)
            return 4;
        return 0;
    }
    // Returns the prefix of `name` holding at most `need_size` UTF-8 characters,
    // never cutting a multi-byte character in half.
    //
    // Characters are counted by their lead bytes (any byte that is not of the
    // form 10xxxxxx). After the requested number of lead bytes has been consumed,
    // the continuation bytes that follow are kept as well so the last character
    // stays complete.
    std::string UTF8StringSafeTruncate(const std::string& name, const size_t need_size)
    {
        const auto is_continuation = [](const char ch) {
            return (static_cast<unsigned char>(ch) & 0xC0) == 0x80;
        };

        const size_t total = name.length();
        size_t end = 0;

        // Consume bytes until `need_size` lead bytes have been passed.
        for (size_t taken = 0; taken < need_size && end < total; ++end)
        {
            if (!is_continuation(name[end]))
                ++taken;
        }

        // Keep the continuation bytes belonging to the last consumed character.
        while (end < total && is_continuation(name[end]))
            ++end;

        return name.substr(0, end);
    }
    // Returns a copy of `str` with every byte lower-cased via std::tolower.
    //
    // Each byte is converted to unsigned char before the call: passing a negative
    // char value (any byte >= 0x80 where char is signed, e.g. UTF-8 multi-byte
    // data) to std::tolower is undefined behaviour. The conversion works byte by
    // byte and depends on the current C locale.
    std::string ToLower_transform1(const std::string& str)
    {
        std::string str_tmp = str;
        std::transform(str_tmp.begin(), str_tmp.end(), str_tmp.begin(),
            [](unsigned char ch) { return static_cast<char>(std::tolower(ch)); });
        return str_tmp;
    }
    // Returns a copy of `str` with every byte upper-cased via std::toupper.
    //
    // Each byte is converted to unsigned char before the call: passing a negative
    // char value (any byte >= 0x80 where char is signed, e.g. UTF-8 multi-byte
    // data) to std::toupper is undefined behaviour. The conversion works byte by
    // byte and depends on the current C locale.
    std::string ToUpper_transform1(const std::string& str)
    {
        std::string str_tmp = str;
        std::transform(str_tmp.begin(), str_tmp.end(), str_tmp.begin(),
            [](unsigned char ch) { return static_cast<char>(std::toupper(ch)); });
        return str_tmp;
    }
    163. std::string ToLower_transform2(const std::string& str)
    164. {
    165. std::wstring wstr_tmp;
    166. UTF8ToWide(wstr_tmp, str);;
    167. std::transform(wstr_tmp.begin(), wstr_tmp.end(), wstr_tmp.begin(), ::tolower);
    168. std::string str_tmp;
    169. WideToUTF8(str_tmp, wstr_tmp);;
    170. return str_tmp;
    171. }
    172. std::string ToUpper_transform2(const std::string& str)
    173. {
    174. std::wstring wstr_tmp;
    175. UTF8ToWide(wstr_tmp, str);;
    176. std::transform(wstr_tmp.begin(), wstr_tmp.end(), wstr_tmp.begin(), ::toupper);
    177. std::string str_tmp;
    178. WideToUTF8(str_tmp, wstr_tmp);;
    179. return str_tmp;
    180. }
    181. std::string ToLower_boost(const std::string& str)
    182. {
    183. static boost::locale::generator generator;
    184. static std::locale locale = generator("en_US.UTF-8");
    185. std::wstring wstr = boost::locale::conv::to_utf<wchar_t>(str, "utf-8");
    186. wstr = boost::locale::to_lower(wstr, locale);
    187. return boost::locale::conv::from_utf(wstr, "utf-8");
    188. }
    189. std::string ToUpper_boost(const std::string& str)
    190. {
    191. static boost::locale::generator generator;
    192. static std::locale locale = generator("en_US.UTF-8");
    193. std::wstring wstr = boost::locale::conv::to_utf<wchar_t>(str, "utf-8");
    194. wstr = boost::locale::to_upper(wstr, locale);
    195. return boost::locale::conv::from_utf(wstr, "utf-8");
    196. }
    197. int main()
    198. {
    199. std::string str_gb2312 = "中国";
    200. std::string str_utf8;
    201. GBKToUTF8(str_utf8, str_gb2312);
    202. bool utf8 = IsStringUTF8(str_gb2312);
    203. bool utf8_2 = IsStringUTF8(str_utf8);
    204. int num = UTF8StringSize(str_utf8);
    205. std::string str_utf8_truncate = UTF8StringSafeTruncate(str_utf8, 1);
    206. std::string str_gb2312_truncate;
    207. UTF8ToGBK(str_gb2312_truncate, str_utf8_truncate);
    208. std::string str_lower = "ABC";
    209. std::string str_upper = "def";
    210. std::string str_lower1 = ToLower_transform1(str_lower);
    211. std::string str_upper1 = ToUpper_transform1(str_upper);
    212. std::string str_lower2 = ToLower_transform2(str_lower);
    213. std::string str_upper2 = ToUpper_transform2(str_upper);
    214. std::string str_lower3 = ToLower_boost(str_lower);
    215. std::string str_upper3 = ToUpper_boost(str_upper);
    216. return 0;
    217. }

  • 相关阅读:
    OdeInt与GPU
    5款让人惊艳的黑科技软件,只要用过一次就会爱上
    MDM主数据平台使用总结
    <数据结构>停车场管理系统,利用栈和队列实现,包含纯c语言版和C++版的全注释源码
    Elasticsearch如何设置密码
    直流无刷电机开发应用
    Eyeshot Ultimate参数化建模升级
    跑出了几个明星厂商,DaaS赛道要火?
    学生HTML网页作业作品:HTML+CSS网站设计与实现【红色喜庆邀请函 3页】
    Promise异步async&await
  • 原文地址:https://blog.csdn.net/zhaodongdong2012/article/details/134470717