使用
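- #include <cerrno>
- #include <cstdlib>
- #include <cstring>
- #include <iostream>
- #include <memory>
- #include <fcntl.h>
- #include <iconv.h>
- #include <unistd.h>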
-
- // 需要链接iconv库
-
- // iconv -l 命令可列出所有支持的格式
- // example: iconv将UTF-16转换为UTF-8
- // iconv -f UTF-16 -t UTF-8 myfile
-
- // 注意: 未指定字节序的 "UTF-16" 会输出 BOM, 其字节序取决于实现: GNU libiconv 使用大端 (UTF-16 BE), glibc 使用本机字节序; 需要确定的字节序时请显式指定 UTF-16BE 或 UTF-16LE
-
/**
 * Dumps a byte buffer to std::cout, one byte per line, formatted as
 * "<index>: <value>" where <value> is the byte as an unsigned decimal (0-255).
 *
 * @param str Buffer to dump. It is not treated as NUL-terminated; exactly
 *            `len` bytes are read, and nothing is read when `len` is 0.
 * @param len Number of bytes of `str` to print.
 */
void print_str_bytes(const char* str, size_t len) {
    // The index has the same type as `len`, so the comparison is not mixed
    // signed/unsigned and cannot overflow for buffers larger than INT_MAX.
    for (size_t i = 0; i < len; ++i) {
        // Go through unsigned char so bytes >= 0x80 print as 128..255 even
        // where plain char is signed.
        const int value = static_cast<int>(static_cast<unsigned char>(str[i]));
        std::cout << i << ": " << value << '\n';
    }
    // Flush once at the end instead of once per byte.
    std::cout.flush();
}
-
/**
 * Converts `src_len` bytes at `src` from `from_encoding` to `to_encoding`
 * using iconv, returning the result in a newly allocated buffer.
 *
 * The output buffer starts at twice the input size (minimum 16 bytes) and is
 * doubled whenever iconv reports E2BIG. Bytes converted so far are kept and
 * the conversion resumes from the unconsumed input, so nothing is converted
 * twice. After all input is consumed, iconv is called once more with a null
 * input to emit any trailing shift sequence required by stateful encodings.
 *
 * Progress ("use bytes: N") is printed to std::cout and errors to std::cerr.
 *
 * @param dst           Out: receives the buffer, allocated with new char[].
 *                      The caller owns it and must release it with delete[].
 *                      Written only on success.
 * @param dst_len       Out: receives the number of converted bytes in *dst.
 *                      Written only on success.
 * @param src           Input bytes; not required to be NUL-terminated.
 * @param src_len       Number of input bytes.
 * @param to_encoding   Target encoding name as understood by iconv_open().
 * @param from_encoding Source encoding name as understood by iconv_open().
 * @return 0 on success; -1 if an argument is null, the conversion cannot be
 *         opened, the input is invalid or incomplete in `from_encoding`, or
 *         the descriptor cannot be closed.
 */
int convert_encoding(char** dst, size_t* dst_len, const char* src, size_t src_len,
                     const char* to_encoding, const char* from_encoding) {
    if (dst == nullptr || dst_len == nullptr || (src == nullptr && src_len != 0)) {
        std::cerr << "convert_encoding error: invalid argument" << std::endl;
        return -1;
    }

    iconv_t cd = iconv_open(to_encoding, from_encoding);
    if (cd == reinterpret_cast<iconv_t>(-1)) {
        std::cerr << "iconv_open error: " << strerror(errno) << std::endl;
        return -1;
    }

    // Never start at zero capacity: doubling zero would never make room.
    size_t capacity = (src_len > 8) ? 2 * src_len : 16;
    std::unique_ptr<char[]> buffer(new char[capacity]);

    // iconv advances these cursors itself; the buffer base stays in `buffer`
    // so the allocation is always released through its original pointer.
    char* in_ptr = const_cast<char*>(src);  // iconv's API is not const-correct
    size_t in_left = src_len;
    size_t used = 0;        // bytes of valid output currently in `buffer`
    bool flushing = false;  // true once all input has been consumed

    for (;;) {
        char* out_ptr = buffer.get() + used;
        size_t out_left = capacity - used;

        // iconv returns size_t; (size_t)-1 signals failure.
        const size_t rc = flushing
            ? iconv(cd, nullptr, nullptr, &out_ptr, &out_left)
            : iconv(cd, &in_ptr, &in_left, &out_ptr, &out_left);
        const int err = errno;
        used = capacity - out_left;

        if (rc != static_cast<size_t>(-1)) {
            if (flushing) {
                break;  // input consumed and final state written
            }
            flushing = true;
            continue;
        }

        if (err != E2BIG) {
            // EILSEQ / EINVAL: invalid or truncated input sequence.
            std::cerr << "iconv error: " << strerror(err) << std::endl;
            iconv_close(cd);
            return -1;
        }

        // Output buffer full: grow it, keep what was already converted and
        // let the next call resume from the remaining input.
        const size_t grown_capacity = capacity * 2;
        std::unique_ptr<char[]> grown(new char[grown_capacity]);
        memcpy(grown.get(), buffer.get(), used);
        buffer = std::move(grown);
        capacity = grown_capacity;
    }

    std::cout << "use bytes: " << used << std::endl;

    // Close before handing the buffer over, so a failure here cannot leak it.
    if (iconv_close(cd) == -1) {
        std::cerr << "iconv_close error: " << strerror(errno) << std::endl;
        return -1;
    }

    *dst_len = used;
    *dst = buffer.release();  // ownership passes to the caller
    return 0;
}
-
- int main(int argc, char* argv[]) {
- if (argc != 2) {
- std::cout << "./iconv
" << std::endl; - return 0;
- }
-
- char* str = argv[1];
-
- print_str_bytes(str, strlen(str));
-
- char* dst = nullptr;
- size_t dst_len = 0;
-
- int res = convert_encoding(&dst, &dst_len, str, strlen(str), "UTF-16", "UTF-8");
- if (res == -1) {
- std::cerr << "oops..." << std::endl;
- exit(-1);
- }
- std::cout << "dst_len: " << dst_len << std::endl;
-
- print_str_bytes(dst, dst_len);
-
- // 写入到文件
- int fd = open("out.txt", O_RDWR| O_CREAT | O_TRUNC, S_IRWXU);
- if (fd == -1) {
- std::cerr << "open out.txt error: " << strerror(errno) << std::endl;
- exit(-1);
- }
-
- write(fd, dst, dst_len);
-
- return 0;
- }
编译:
c++ -std=c++14 iconv.cpp -o iconv -liconv
输出:
- ./iconv 你hao,世界
- 0: 228
- 1: 189
- 2: 160
- 3: 104
- 4: 97
- 5: 111
- 6: 239
- 7: 188
- 8: 140
- 9: 228
- 10: 184
- 11: 150
- 12: 231
- 13: 149
- 14: 140
- use bytes: 16
- dst_len: 16
- 0: 254
- 1: 255
- 2: 79
- 3: 96
- 4: 0
- 5: 104
- 6: 0
- 7: 97
- 8: 0
- 9: 111
- 10: 255
- 11: 12
- 12: 78
- 13: 22
- 14: 117
- 15: 76
使用iconv -l命令
- ANSI_X3.4-1968 ANSI_X3.4-1986 ASCII CP367 IBM367 ISO-IR-6 ISO646-US ISO_646.IRV:1991 US US-ASCII CSASCII
- UTF-8 UTF8
- UTF-8-MAC UTF8-MAC
- ISO-10646-UCS-2 UCS-2 CSUNICODE
- UCS-2BE UNICODE-1-1 UNICODEBIG CSUNICODE11
- UCS-2LE UNICODELITTLE
- ISO-10646-UCS-4 UCS-4 CSUCS4
- UCS-4BE
- UCS-4LE
- UTF-16
- UTF-16BE
- UTF-16LE
- UTF-32
- UTF-32BE
- UTF-32LE
- UNICODE-1-1-UTF-7 UTF-7 CSUNICODE11UTF7
- UCS-2-INTERNAL
- UCS-2-SWAPPED
- UCS-4-INTERNAL
- UCS-4-SWAPPED
- C99
- JAVA
- CP819 IBM819 ISO-8859-1 ISO-IR-100 ISO8859-1 ISO_8859-1 ISO_8859-1:1987 L1 LATIN1 CSISOLATIN1
- ISO-8859-2 ISO-IR-101 ISO8859-2 ISO_8859-2 ISO_8859-2:1987 L2 LATIN2 CSISOLATIN2
- ISO-8859-3 ISO-IR-109 ISO8859-3 ISO_8859-3 ISO_8859-3:1988 L3 LATIN3 CSISOLATIN3
- ISO-8859-4 ISO-IR-110 ISO8859-4 ISO_8859-4 ISO_8859-4:1988 L4 LATIN4 CSISOLATIN4
- CYRILLIC ISO-8859-5 ISO-IR-144 ISO8859-5 ISO_8859-5 ISO_8859-5:1988 CSISOLATINCYRILLIC
- ARABIC ASMO-708 ECMA-114 ISO-8859-6 ISO-IR-127 ISO8859-6 ISO_8859-6 ISO_8859-6:1987 CSISOLATINARABIC
- ECMA-118 ELOT_928 GREEK GREEK8 ISO-8859-7 ISO-IR-126 ISO8859-7 ISO_8859-7 ISO_8859-7:1987 ISO_8859-7:2003 CSISOLATINGREEK
- HEBREW ISO-8859-8 ISO-IR-138 ISO8859-8 ISO_8859-8 ISO_8859-8:1988 CSISOLATINHEBREW
- ISO-8859-9 ISO-IR-148 ISO8859-9 ISO_8859-9 ISO_8859-9:1989 L5 LATIN5 CSISOLATIN5
- ISO-8859-10 ISO-IR-157 ISO8859-10 ISO_8859-10 ISO_8859-10:1992 L6 LATIN6 CSISOLATIN6
- ISO-8859-11 ISO8859-11 ISO_8859-11
- ISO-8859-13 ISO-IR-179 ISO8859-13 ISO_8859-13 L7 LATIN7
- ISO-8859-14 ISO-CELTIC ISO-IR-199 ISO8859-14 ISO_8859-14 ISO_8859-14:1998 L8 LATIN8
- ISO-8859-15 ISO-IR-203 ISO8859-15 ISO_8859-15 ISO_8859-15:1998 LATIN-9
- ISO-8859-16 ISO-IR-226 ISO8859-16 ISO_8859-16 ISO_8859-16:2001 L10 LATIN10
- KOI8-R CSKOI8R
- KOI8-U
- KOI8-RU
- CP1250 MS-EE WINDOWS-1250
- CP1251 MS-CYRL WINDOWS-1251
- CP1252 MS-ANSI WINDOWS-1252
- CP1253 MS-GREEK WINDOWS-1253
- CP1254 MS-TURK WINDOWS-1254
- CP1255 MS-HEBR WINDOWS-1255
- CP1256 MS-ARAB WINDOWS-1256
- CP1257 WINBALTRIM WINDOWS-1257
- CP1258 WINDOWS-1258
- 850 CP850 IBM850 CSPC850MULTILINGUAL
- 862 CP862 IBM862 CSPC862LATINHEBREW
- 866 CP866 IBM866 CSIBM866
- MAC MACINTOSH MACROMAN CSMACINTOSH
- MACCENTRALEUROPE
- MACICELAND
- MACCROATIAN
- MACROMANIA
- MACCYRILLIC
- MACUKRAINE
- MACGREEK
- MACTURKISH
- MACHEBREW
- MACARABIC
- MACTHAI
- HP-ROMAN8 R8 ROMAN8 CSHPROMAN8
- NEXTSTEP
- ARMSCII-8
- GEORGIAN-ACADEMY
- GEORGIAN-PS
- KOI8-T
- CP154 CYRILLIC-ASIAN PT154 PTCP154 CSPTCP154
- MULELAO-1
- CP1133 IBM-CP1133
- ISO-IR-166 TIS-620 TIS620 TIS620-0 TIS620.2529-1 TIS620.2533-0 TIS620.2533-1
- CP874 WINDOWS-874
- VISCII VISCII1.1-1 CSVISCII
- TCVN TCVN-5712 TCVN5712-1 TCVN5712-1:1993
- ISO-IR-14 ISO646-JP JIS_C6220-1969-RO JP CSISO14JISC6220RO
- JISX0201-1976 JIS_X0201 X0201 CSHALFWIDTHKATAKANA
- ISO-IR-87 JIS0208 JIS_C6226-1983 JIS_X0208 JIS_X0208-1983 JIS_X0208-1990 X0208 CSISO87JISX0208
- ISO-IR-159 JIS_X0212 JIS_X0212-1990 JIS_X0212.1990-0 X0212 CSISO159JISX02121990
- CN GB_1988-80 ISO-IR-57 ISO646-CN CSISO57GB1988
- CHINESE GB_2312-80 ISO-IR-58 CSISO58GB231280
- CN-GB-ISOIR165 ISO-IR-165
- ISO-IR-149 KOREAN KSC_5601 KS_C_5601-1987 KS_C_5601-1989 CSKSC56011987
- EUC-JP EUCJP EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE CSEUCPKDFMTJAPANESE
- MS_KANJI SHIFT-JIS SHIFT_JIS SJIS CSSHIFTJIS
- CP932
- ISO-2022-JP CSISO2022JP
- ISO-2022-JP-1
- ISO-2022-JP-2 CSISO2022JP2
- CN-GB EUC-CN EUCCN GB2312 CSGB2312
- GBK
- CP936 MS936 WINDOWS-936
- GB18030
- ISO-2022-CN CSISO2022CN
- ISO-2022-CN-EXT
- HZ HZ-GB-2312
- EUC-TW EUCTW CSEUCTW
- BIG-5 BIG-FIVE BIG5 BIGFIVE CN-BIG5 CSBIG5
- CP950
- BIG5-HKSCS:1999
- BIG5-HKSCS:2001
- BIG5-HKSCS BIG5-HKSCS:2004 BIG5HKSCS
- EUC-KR EUCKR CSEUCKR
- CP949 UHC
- CP1361 JOHAB
- ISO-2022-KR CSISO2022KR
- CP856
- CP922
- CP943
- CP1046
- CP1124
- CP1129
- CP1161 IBM-1161 IBM1161 CSIBM1161
- CP1162 IBM-1162 IBM1162 CSIBM1162
- CP1163 IBM-1163 IBM1163 CSIBM1163
- DEC-KANJI
- DEC-HANYU
- 437 CP437 IBM437 CSPC8CODEPAGE437
- CP737
- CP775 IBM775 CSPC775BALTIC
- 852 CP852 IBM852 CSPCP852
- CP853
- 855 CP855 IBM855 CSIBM855
- 857 CP857 IBM857 CSIBM857
- CP858
- 860 CP860 IBM860 CSIBM860
- 861 CP-IS CP861 IBM861 CSIBM861
- 863 CP863 IBM863 CSIBM863
- CP864 IBM864 CSIBM864
- 865 CP865 IBM865 CSIBM865
- 869 CP-GR CP869 IBM869 CSIBM869
- CP1125
- EUC-JISX0213
- SHIFT_JISX0213
- ISO-2022-JP-3
- BIG5-2003
- ISO-IR-230 TDS565
- ATARI ATARIST
- RISCOS-LATIN1