• 【POSIX】使用iconv库将UTF-8字符串转换为UTF-16字符串


     使用 <iconv.h> 提供的 iconv_open / iconv / iconv_close 来进行字符串编码的转换

    #include <errno.h>
    #include <fcntl.h>
    #include <iconv.h>
    #include <stddef.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <sys/types.h>
    #include <unistd.h>

    #include <iostream>
    #include <memory>
    7. // 需要链接iconv库
    8. // iconv -l 命令可列出所有支持的格式
    9. // example: iconv将UTF-16转换为UTF-8
    10. // iconv -f UTF-16 -t UTF-8 myfile
    11. // 注意: 不带字节序后缀的 "UTF-16" 会在输出开头写入 BOM; GNU libiconv 默认使用 UTF-16 BE, 而 glibc 使用本机字节序(x86 上为 LE), 需要固定字节序时请显式指定 UTF-16BE / UTF-16LE
    // Dumps `len` bytes starting at `str` to std::cout, one byte per line, in the
    // form "<index>: <value>", where <value> is the byte as an unsigned number
    // in the range 0..255.
    //
    // str: first byte to print; must be readable for `len` bytes.
    // len: number of bytes to print (embedded '\0' bytes are printed as well).
    void print_str_bytes(const char* str, size_t len) {
        // The index is a size_t so that it is compared with `len` in the same
        // signedness and cannot overflow for buffers larger than INT_MAX.
        for (size_t i = 0; i < len; ++i) {
            // Go through unsigned char so bytes >= 0x80 are not sign-extended.
            const unsigned int value = static_cast<unsigned char>(str[i]);
            std::cout << i << ": " << value << '\n';
        }
        // One flush for the whole dump instead of one per line.
        std::cout.flush();
    }
    // Converts `src_len` bytes at `src` from `from_encoding` to `to_encoding`
    // using iconv.
    //
    // dst:           receives a buffer allocated with new char[]; the caller owns
    //                it and must release it with delete[].
    // dst_len:       receives the number of converted bytes stored in *dst.
    // src, src_len:  input bytes (not required to be NUL-terminated).
    // to_encoding /
    // from_encoding: encoding names as accepted by iconv_open (see `iconv -l`).
    //
    // Returns 0 on success. Returns -1 on failure after printing the reason to
    // std::cerr; in that case *dst and *dst_len are left untouched and nothing
    // needs to be freed by the caller.
    int convert_encoding(char** dst, size_t* dst_len, const char* src, size_t src_len, const char* to_encoding, const char* from_encoding) {
        if (dst == nullptr || dst_len == nullptr || (src == nullptr && src_len != 0)) {
            std::cerr << "convert_encoding error: " << strerror(EINVAL) << std::endl;
            return -1;
        }

        iconv_t cd = iconv_open(to_encoding, from_encoding);
        if (cd == reinterpret_cast<iconv_t>(-1)) {
            std::cerr << "iconv_open error: " << strerror(errno) << std::endl;
            return -1;
        }

        // Closes the descriptor on every early exit (including exceptions thrown
        // by new[]); disarmed before the explicit, checked iconv_close below.
        struct DescriptorGuard {
            iconv_t handle;
            bool armed;
            ~DescriptorGuard() {
                if (armed) {
                    iconv_close(handle);
                }
            }
        } guard{cd, true};

        // Initial guess only: 2 bytes per input byte plus room for a BOM. The
        // loop below grows the buffer whenever iconv reports E2BIG, so the guess
        // does not have to be an upper bound.
        size_t capacity = 2 * src_len + 4;
        std::unique_ptr<char[]> buffer(new char[capacity]);

        const char* in = src;      // advanced by iconv as input is consumed
        size_t in_left = src_len;  // input bytes not yet converted
        size_t used = 0;           // output bytes produced so far
        bool flushing = false;     // true once all input has been consumed

        for (;;) {
            // Always hand iconv the unused tail of the buffer, never the start:
            // output produced by earlier calls must be preserved.
            char* out = buffer.get() + used;
            size_t out_left = capacity - used;

            // A call with a null input buffer writes any pending reset sequence
            // required by stateful target encodings.
            const size_t rc = flushing
                ? iconv(cd, nullptr, nullptr, &out, &out_left)
                : iconv(cd, const_cast<char**>(&in), &in_left, &out, &out_left);
            const int err = errno;  // capture before any other call can clobber it
            used = capacity - out_left;

            if (rc != static_cast<size_t>(-1)) {
                if (flushing) {
                    break;
                }
                flushing = true;
                continue;
            }

            if (err != E2BIG) {
                // Invalid or truncated input (EILSEQ / EINVAL) or another hard error.
                std::cerr << "iconv error: " << strerror(err) << std::endl;
                return -1;
            }

            // Output buffer full: double it, keep what was already converted and
            // resume from the position iconv stopped at.
            const size_t grown_capacity = capacity * 2;
            std::unique_ptr<char[]> grown(new char[grown_capacity]);
            memcpy(grown.get(), buffer.get(), used);
            buffer = std::move(grown);
            capacity = grown_capacity;
        }

        std::cout << "use bytes: " << used << std::endl;

        guard.armed = false;
        if (iconv_close(cd) == -1) {
            std::cerr << "iconv_close error: " << strerror(errno) << std::endl;
            return -1;
        }

        // Publish the result only after every step succeeded, so a -1 return
        // never leaves the caller holding a buffer it does not know about.
        *dst_len = used;
        *dst = buffer.release();
        return 0;
    }
    60. int main(int argc, char* argv[]) {
    61. if (argc != 2) {
    62. std::cout << "./iconv " << std::endl;
    63. return 0;
    64. }
    65. char* str = argv[1];
    66. print_str_bytes(str, strlen(str));
    67. char* dst = nullptr;
    68. size_t dst_len = 0;
    69. int res = convert_encoding(&dst, &dst_len, str, strlen(str), "UTF-16", "UTF-8");
    70. if (res == -1) {
    71. std::cerr << "oops..." << std::endl;
    72. exit(-1);
    73. }
    74. std::cout << "dst_len: " << dst_len << std::endl;
    75. print_str_bytes(dst, dst_len);
    76. // 写入到文件
    77. int fd = open("out.txt", O_RDWR| O_CREAT | O_TRUNC, S_IRWXU);
    78. if (fd == -1) {
    79. std::cerr << "open out.txt error: " << strerror(errno) << std::endl;
    80. exit(-1);
    81. }
    82. write(fd, dst, dst_len);
    83. return 0;
    84. }

    编译:

    c++ -std=c++14 iconv.cpp -o iconv -liconv

    (库要放在源文件之后, 否则部分链接器会找不到 iconv 符号; 在使用 glibc 的 Linux 上 iconv 已包含在 libc 中, 可以省略 -liconv)

    输出:

    1. ./iconv 你hao,世界
    2. 0: 228
    3. 1: 189
    4. 2: 160
    5. 3: 104
    6. 4: 97
    7. 5: 111
    8. 6: 239
    9. 7: 188
    10. 8: 140
    11. 9: 228
    12. 10: 184
    13. 11: 150
    14. 12: 231
    15. 13: 149
    16. 14: 140
    17. use bytes: 16
    18. dst_len: 16
    19. 0: 254
    20. 1: 255
    21. 2: 79
    22. 3: 96
    23. 4: 0
    24. 5: 104
    25. 6: 0
    26. 7: 97
    27. 8: 0
    28. 9: 111
    29. 10: 255
    30. 11: 12
    31. 12: 78
    32. 13: 22
    33. 14: 117
    34. 15: 76

    使用iconv -l命令

    1. ANSI_X3.4-1968 ANSI_X3.4-1986 ASCII CP367 IBM367 ISO-IR-6 ISO646-US ISO_646.IRV:1991 US US-ASCII CSASCII
    2. UTF-8 UTF8
    3. UTF-8-MAC UTF8-MAC
    4. ISO-10646-UCS-2 UCS-2 CSUNICODE
    5. UCS-2BE UNICODE-1-1 UNICODEBIG CSUNICODE11
    6. UCS-2LE UNICODELITTLE
    7. ISO-10646-UCS-4 UCS-4 CSUCS4
    8. UCS-4BE
    9. UCS-4LE
    10. UTF-16
    11. UTF-16BE
    12. UTF-16LE
    13. UTF-32
    14. UTF-32BE
    15. UTF-32LE
    16. UNICODE-1-1-UTF-7 UTF-7 CSUNICODE11UTF7
    17. UCS-2-INTERNAL
    18. UCS-2-SWAPPED
    19. UCS-4-INTERNAL
    20. UCS-4-SWAPPED
    21. C99
    22. JAVA
    23. CP819 IBM819 ISO-8859-1 ISO-IR-100 ISO8859-1 ISO_8859-1 ISO_8859-1:1987 L1 LATIN1 CSISOLATIN1
    24. ISO-8859-2 ISO-IR-101 ISO8859-2 ISO_8859-2 ISO_8859-2:1987 L2 LATIN2 CSISOLATIN2
    25. ISO-8859-3 ISO-IR-109 ISO8859-3 ISO_8859-3 ISO_8859-3:1988 L3 LATIN3 CSISOLATIN3
    26. ISO-8859-4 ISO-IR-110 ISO8859-4 ISO_8859-4 ISO_8859-4:1988 L4 LATIN4 CSISOLATIN4
    27. CYRILLIC ISO-8859-5 ISO-IR-144 ISO8859-5 ISO_8859-5 ISO_8859-5:1988 CSISOLATINCYRILLIC
    28. ARABIC ASMO-708 ECMA-114 ISO-8859-6 ISO-IR-127 ISO8859-6 ISO_8859-6 ISO_8859-6:1987 CSISOLATINARABIC
    29. ECMA-118 ELOT_928 GREEK GREEK8 ISO-8859-7 ISO-IR-126 ISO8859-7 ISO_8859-7 ISO_8859-7:1987 ISO_8859-7:2003 CSISOLATINGREEK
    30. HEBREW ISO-8859-8 ISO-IR-138 ISO8859-8 ISO_8859-8 ISO_8859-8:1988 CSISOLATINHEBREW
    31. ISO-8859-9 ISO-IR-148 ISO8859-9 ISO_8859-9 ISO_8859-9:1989 L5 LATIN5 CSISOLATIN5
    32. ISO-8859-10 ISO-IR-157 ISO8859-10 ISO_8859-10 ISO_8859-10:1992 L6 LATIN6 CSISOLATIN6
    33. ISO-8859-11 ISO8859-11 ISO_8859-11
    34. ISO-8859-13 ISO-IR-179 ISO8859-13 ISO_8859-13 L7 LATIN7
    35. ISO-8859-14 ISO-CELTIC ISO-IR-199 ISO8859-14 ISO_8859-14 ISO_8859-14:1998 L8 LATIN8
    36. ISO-8859-15 ISO-IR-203 ISO8859-15 ISO_8859-15 ISO_8859-15:1998 LATIN-9
    37. ISO-8859-16 ISO-IR-226 ISO8859-16 ISO_8859-16 ISO_8859-16:2001 L10 LATIN10
    38. KOI8-R CSKOI8R
    39. KOI8-U
    40. KOI8-RU
    41. CP1250 MS-EE WINDOWS-1250
    42. CP1251 MS-CYRL WINDOWS-1251
    43. CP1252 MS-ANSI WINDOWS-1252
    44. CP1253 MS-GREEK WINDOWS-1253
    45. CP1254 MS-TURK WINDOWS-1254
    46. CP1255 MS-HEBR WINDOWS-1255
    47. CP1256 MS-ARAB WINDOWS-1256
    48. CP1257 WINBALTRIM WINDOWS-1257
    49. CP1258 WINDOWS-1258
    50. 850 CP850 IBM850 CSPC850MULTILINGUAL
    51. 862 CP862 IBM862 CSPC862LATINHEBREW
    52. 866 CP866 IBM866 CSIBM866
    53. MAC MACINTOSH MACROMAN CSMACINTOSH
    54. MACCENTRALEUROPE
    55. MACICELAND
    56. MACCROATIAN
    57. MACROMANIA
    58. MACCYRILLIC
    59. MACUKRAINE
    60. MACGREEK
    61. MACTURKISH
    62. MACHEBREW
    63. MACARABIC
    64. MACTHAI
    65. HP-ROMAN8 R8 ROMAN8 CSHPROMAN8
    66. NEXTSTEP
    67. ARMSCII-8
    68. GEORGIAN-ACADEMY
    69. GEORGIAN-PS
    70. KOI8-T
    71. CP154 CYRILLIC-ASIAN PT154 PTCP154 CSPTCP154
    72. MULELAO-1
    73. CP1133 IBM-CP1133
    74. ISO-IR-166 TIS-620 TIS620 TIS620-0 TIS620.2529-1 TIS620.2533-0 TIS620.2533-1
    75. CP874 WINDOWS-874
    76. VISCII VISCII1.1-1 CSVISCII
    77. TCVN TCVN-5712 TCVN5712-1 TCVN5712-1:1993
    78. ISO-IR-14 ISO646-JP JIS_C6220-1969-RO JP CSISO14JISC6220RO
    79. JISX0201-1976 JIS_X0201 X0201 CSHALFWIDTHKATAKANA
    80. ISO-IR-87 JIS0208 JIS_C6226-1983 JIS_X0208 JIS_X0208-1983 JIS_X0208-1990 X0208 CSISO87JISX0208
    81. ISO-IR-159 JIS_X0212 JIS_X0212-1990 JIS_X0212.1990-0 X0212 CSISO159JISX02121990
    82. CN GB_1988-80 ISO-IR-57 ISO646-CN CSISO57GB1988
    83. CHINESE GB_2312-80 ISO-IR-58 CSISO58GB231280
    84. CN-GB-ISOIR165 ISO-IR-165
    85. ISO-IR-149 KOREAN KSC_5601 KS_C_5601-1987 KS_C_5601-1989 CSKSC56011987
    86. EUC-JP EUCJP EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE CSEUCPKDFMTJAPANESE
    87. MS_KANJI SHIFT-JIS SHIFT_JIS SJIS CSSHIFTJIS
    88. CP932
    89. ISO-2022-JP CSISO2022JP
    90. ISO-2022-JP-1
    91. ISO-2022-JP-2 CSISO2022JP2
    92. CN-GB EUC-CN EUCCN GB2312 CSGB2312
    93. GBK
    94. CP936 MS936 WINDOWS-936
    95. GB18030
    96. ISO-2022-CN CSISO2022CN
    97. ISO-2022-CN-EXT
    98. HZ HZ-GB-2312
    99. EUC-TW EUCTW CSEUCTW
    100. BIG-5 BIG-FIVE BIG5 BIGFIVE CN-BIG5 CSBIG5
    101. CP950
    102. BIG5-HKSCS:1999
    103. BIG5-HKSCS:2001
    104. BIG5-HKSCS BIG5-HKSCS:2004 BIG5HKSCS
    105. EUC-KR EUCKR CSEUCKR
    106. CP949 UHC
    107. CP1361 JOHAB
    108. ISO-2022-KR CSISO2022KR
    109. CP856
    110. CP922
    111. CP943
    112. CP1046
    113. CP1124
    114. CP1129
    115. CP1161 IBM-1161 IBM1161 CSIBM1161
    116. CP1162 IBM-1162 IBM1162 CSIBM1162
    117. CP1163 IBM-1163 IBM1163 CSIBM1163
    118. DEC-KANJI
    119. DEC-HANYU
    120. 437 CP437 IBM437 CSPC8CODEPAGE437
    121. CP737
    122. CP775 IBM775 CSPC775BALTIC
    123. 852 CP852 IBM852 CSPCP852
    124. CP853
    125. 855 CP855 IBM855 CSIBM855
    126. 857 CP857 IBM857 CSIBM857
    127. CP858
    128. 860 CP860 IBM860 CSIBM860
    129. 861 CP-IS CP861 IBM861 CSIBM861
    130. 863 CP863 IBM863 CSIBM863
    131. CP864 IBM864 CSIBM864
    132. 865 CP865 IBM865 CSIBM865
    133. 869 CP-GR CP869 IBM869 CSIBM869
    134. CP1125
    135. EUC-JISX0213
    136. SHIFT_JISX0213
    137. ISO-2022-JP-3
    138. BIG5-2003
    139. ISO-IR-230 TDS565
    140. ATARI ATARIST
    141. RISCOS-LATIN1

  • 相关阅读:
    自己动手从零写桌面操作系统GrapeOS系列教程——1.1 GrapeOS介绍
    2-39 JSP之EL表达式
    基于Keilv5新建STM32F030工程
    C#,入门教程——关于函数参数ref的一点知识与源程序
    (附源码)springboot投票系统 毕业设计 261136
    C++11新特性之智能指针|内存泄漏
    Effective C++改善程序与设计的55个具体做法 1. 让自己习惯 c++
    Bitmap 的基本原理
    【数据结构】哈希应用——位图、布隆过滤器
    Pytorch入门实例的分解写法
  • 原文地址:https://blog.csdn.net/Taozi825232603/article/details/139382284