Implement <iconv.h>.
Bug: http://b/32978596 Test: ran tests Change-Id: I56b6ae3d9c5a3a56d2b4afba33fb8f9e964bf7b9
This commit is contained in:
parent
571c888236
commit
a648733cb7
|
@ -1448,6 +1448,7 @@ cc_library_static {
|
|||
"bionic/getpriority.cpp",
|
||||
"bionic/gettid.cpp",
|
||||
"bionic/grp_pwd.cpp",
|
||||
"bionic/iconv.cpp",
|
||||
"bionic/icu_wrappers.cpp",
|
||||
"bionic/ifaddrs.cpp",
|
||||
"bionic/inotify_init.cpp",
|
||||
|
|
|
@ -0,0 +1,368 @@
|
|||
/*
|
||||
* Copyright (C) 2017 The Android Open Source Project
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
||||
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <iconv.h>
|
||||
|
||||
#include <ctype.h>
|
||||
#include <endian.h>
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
#include <uchar.h>
|
||||
|
||||
#include "private/bionic_mbstate.h"
|
||||
|
||||
#define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
|
||||
|
||||
// Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
|
||||
// equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
|
||||
// here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.
|
||||
// The character encodings the converter understands, for use as either the
// source or the destination of a conversion.
enum Encoding {
  US_ASCII = 0,
  UTF_8 = 1,
  UTF_16_LE = 2,
  UTF_16_BE = 3,
  UTF_32_LE = 4,
  UTF_32_BE = 5,
  WCHAR_T = 6,
};
|
||||
|
||||
// How a conversion reacts to a character it cannot decode or encode.
enum Mode {
  ERROR = 0,     // Fail the conversion.
  IGNORE = 1,    // Selected by a "//IGNORE" suffix: drop the character.
  TRANSLIT = 2,  // Selected by a "//TRANSLIT" suffix: substitute a replacement.
};
|
||||
|
||||
// Returns true if the caller-supplied encoding name `lhs` refers to the same
// charset as `rhs`. `rhs` must already be in normalized form: lowercase
// letters and digits only, with no leading zeros (for example "utf8").
//
// This matching is strange but true.
// See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
//  * Non-alphanumeric characters in `lhs` are ignored, so "UTF-8", "UTF_8",
//    "UTF8" and "UTF 8" are all equivalent.
//  * Case is ignored.
//  * A '0' that is not preceded by a digit but is followed by one (a leading
//    zero) is ignored, so "u.t.f-008" matches "utf8" but "utf-108" does not
//    match "utf18".
// As a special case, a GNU "//" suffix in `lhs` is treated as end of string.
static bool __match_encoding(const char* lhs, const char* rhs) {
  // True while the most recently matched character of lhs was a digit; a '0'
  // seen in that state is significant rather than a leading zero.
  bool after_digit = false;
  while (*lhs && *rhs) {
    // Advance lhs to its next significant character (or to the terminator).
    for (; *lhs; ++lhs) {
      // <ctype.h> functions require a value representable as unsigned char.
      const unsigned char ch = *lhs;
      if (!isalnum(ch)) {
        after_digit = false;  // Punctuation ends a run of digits.
        continue;
      }
      const unsigned char next = lhs[1];
      if (ch == '0' && !after_digit && isdigit(next)) continue;  // Leading zero.
      break;
    }
    const unsigned char l = *lhs;
    const unsigned char r = *rhs;
    // Case doesn't matter either.
    if (tolower(l) != tolower(r)) break;
    after_digit = (isdigit(l) != 0);
    ++lhs;
    ++rhs;
  }
  // Both names must be exhausted, except that lhs may end in a "//" suffix.
  return (*lhs == '\0' || strstr(lhs, "//") == lhs) && *rhs == '\0';
}
|
||||
|
||||
static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) {
|
||||
const char* suffix = strstr(s, "//");
|
||||
if (suffix) {
|
||||
if (!mode) return false;
|
||||
if (strcmp(suffix, "//IGNORE") == 0) {
|
||||
*mode = IGNORE;
|
||||
} else if (strcmp(suffix, "//TRANSLIT") == 0) {
|
||||
*mode = TRANSLIT;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (__match_encoding(s, "utf8")) {
|
||||
*encoding = UTF_8;
|
||||
} else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) {
|
||||
*encoding = US_ASCII;
|
||||
} else if (__match_encoding(s, "utf16le")) {
|
||||
*encoding = UTF_16_LE;
|
||||
} else if (__match_encoding(s, "utf16be")) {
|
||||
*encoding = UTF_16_BE;
|
||||
} else if (__match_encoding(s, "utf32le")) {
|
||||
*encoding = UTF_32_LE;
|
||||
} else if (__match_encoding(s, "utf32be")) {
|
||||
*encoding = UTF_32_BE;
|
||||
} else if (__match_encoding(s, "wchart")) {
|
||||
*encoding = WCHAR_T;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
struct __iconv_t {
|
||||
Encoding src_encoding;
|
||||
Encoding dst_encoding;
|
||||
Mode mode;
|
||||
|
||||
__iconv_t() : mode(ERROR) {
|
||||
}
|
||||
|
||||
int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) {
|
||||
// Reset state.
|
||||
wc = 0;
|
||||
memset(&ps, 0, sizeof(ps));
|
||||
replacement_count = 0;
|
||||
ignored = false;
|
||||
src_buf = src_buf0;
|
||||
src_bytes_left = src_bytes_left0;
|
||||
dst_buf = dst_buf0;
|
||||
dst_bytes_left = dst_bytes_left0;
|
||||
|
||||
while (*src_bytes_left > 0) {
|
||||
if (!GetNext() || !Convert()) return -1;
|
||||
}
|
||||
return Done();
|
||||
}
|
||||
|
||||
private:
|
||||
char32_t wc;
|
||||
char buf[16];
|
||||
size_t src_bytes_used;
|
||||
size_t dst_bytes_used;
|
||||
mbstate_t ps;
|
||||
|
||||
size_t replacement_count;
|
||||
bool ignored;
|
||||
|
||||
char** src_buf;
|
||||
size_t* src_bytes_left;
|
||||
char** dst_buf;
|
||||
size_t* dst_bytes_left;
|
||||
|
||||
bool GetNext() {
|
||||
errno = 0;
|
||||
switch (src_encoding) {
|
||||
case US_ASCII:
|
||||
wc = **src_buf;
|
||||
src_bytes_used = 1;
|
||||
if (wc > 0x7f) errno = EILSEQ;
|
||||
break;
|
||||
|
||||
case UTF_8:
|
||||
src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps);
|
||||
if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
|
||||
break; // EILSEQ already set.
|
||||
} else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
|
||||
errno = EINVAL;
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
|
||||
case UTF_16_BE:
|
||||
case UTF_16_LE: {
|
||||
if (*src_bytes_left < 2) {
|
||||
errno = EINVAL;
|
||||
return false;
|
||||
}
|
||||
bool swap = (src_encoding == UTF_16_BE);
|
||||
wc = In16(*src_buf, swap);
|
||||
// 0xd800-0xdbff: high surrogates
|
||||
// 0xdc00-0xdfff: low surrogates
|
||||
if (wc >= 0xd800 && wc <= 0xdfff) {
|
||||
if (wc >= 0xdc00) { // Low surrogate before high surrogate.
|
||||
errno = EILSEQ;
|
||||
return false;
|
||||
}
|
||||
if (*src_bytes_left < 4) {
|
||||
errno = EINVAL;
|
||||
return false;
|
||||
}
|
||||
uint16_t hi = wc;
|
||||
uint16_t lo = In16(*src_buf + 2, swap);
|
||||
wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00);
|
||||
src_bytes_used = 4;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case UTF_32_BE:
|
||||
case UTF_32_LE:
|
||||
case WCHAR_T:
|
||||
if (*src_bytes_left < 4) {
|
||||
errno = EINVAL;
|
||||
return false;
|
||||
}
|
||||
wc = In32(*src_buf, (src_encoding == UTF_32_BE));
|
||||
break;
|
||||
}
|
||||
|
||||
if (errno == EILSEQ) {
|
||||
switch (mode) {
|
||||
case ERROR:
|
||||
return false;
|
||||
case IGNORE:
|
||||
*src_buf += src_bytes_used;
|
||||
*src_bytes_left -= src_bytes_used;
|
||||
ignored = true;
|
||||
return GetNext();
|
||||
case TRANSLIT:
|
||||
wc = '?';
|
||||
++replacement_count;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Convert() {
|
||||
errno = 0;
|
||||
switch (dst_encoding) {
|
||||
case US_ASCII:
|
||||
buf[0] = wc;
|
||||
dst_bytes_used = 1;
|
||||
if (wc > 0x7f) errno = EILSEQ;
|
||||
break;
|
||||
|
||||
case UTF_8:
|
||||
dst_bytes_used = c32rtomb(buf, wc, &ps);
|
||||
if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
|
||||
break; // EILSEQ already set.
|
||||
} else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
|
||||
errno = EINVAL;
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
|
||||
case UTF_16_BE:
|
||||
case UTF_16_LE: {
|
||||
bool swap = (dst_encoding == UTF_16_BE);
|
||||
if (wc < 0x10000) { // BMP.
|
||||
Out16(buf, wc, swap);
|
||||
} else { // Supplementary plane; output surrogate pair.
|
||||
wc -= 0x10000;
|
||||
char16_t hi = 0xd800 | (wc >> 10);
|
||||
char16_t lo = 0xdc00 | (wc & 0x3ff);
|
||||
Out16(buf + 0, hi, swap);
|
||||
Out16(buf + 2, lo, swap);
|
||||
dst_bytes_used = 4;
|
||||
}
|
||||
} break;
|
||||
|
||||
case UTF_32_BE:
|
||||
case UTF_32_LE:
|
||||
case WCHAR_T:
|
||||
Out32(wc, (dst_encoding == UTF_32_BE));
|
||||
break;
|
||||
}
|
||||
|
||||
if (errno == EILSEQ) {
|
||||
if (mode == IGNORE) {
|
||||
*src_buf += src_bytes_used;
|
||||
*src_bytes_left -= src_bytes_used;
|
||||
ignored = true;
|
||||
return true;
|
||||
} else if (mode == TRANSLIT) {
|
||||
wc = '?';
|
||||
++replacement_count;
|
||||
return Convert();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
return Emit();
|
||||
}
|
||||
|
||||
uint16_t In16(const char* buf, bool swap) {
|
||||
const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
|
||||
uint16_t wc = (src[0]) | (src[1] << 8);
|
||||
if (swap) wc = __swap16(wc);
|
||||
src_bytes_used = 2;
|
||||
return wc;
|
||||
}
|
||||
|
||||
uint32_t In32(const char* buf, bool swap) {
|
||||
const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
|
||||
uint32_t wc = (src[0]) | (src[1] << 8) | (src[2] << 16) | (src[3] << 24);
|
||||
if (swap) wc = __swap32(wc);
|
||||
src_bytes_used = 4;
|
||||
return wc;
|
||||
}
|
||||
|
||||
void Out16(char* dst, char16_t ch, bool swap) {
|
||||
if (swap) ch = __swap16(ch);
|
||||
dst[0] = ch;
|
||||
dst[1] = ch >> 8;
|
||||
dst_bytes_used = 2;
|
||||
}
|
||||
|
||||
void Out32(char32_t ch, bool swap) {
|
||||
if (swap) ch = __swap32(ch);
|
||||
buf[0] = ch;
|
||||
buf[1] = ch >> 8;
|
||||
buf[2] = ch >> 16;
|
||||
buf[3] = ch >> 24;
|
||||
dst_bytes_used = 4;
|
||||
}
|
||||
|
||||
bool Emit() {
|
||||
if (dst_bytes_used > *dst_bytes_left) {
|
||||
errno = E2BIG;
|
||||
return false;
|
||||
}
|
||||
|
||||
memcpy(*dst_buf, buf, dst_bytes_used);
|
||||
*src_buf += src_bytes_used;
|
||||
*src_bytes_left -= src_bytes_used;
|
||||
*dst_buf += dst_bytes_used;
|
||||
*dst_bytes_left -= dst_bytes_used;
|
||||
return true;
|
||||
}
|
||||
|
||||
int Done() {
|
||||
if (mode == TRANSLIT) return replacement_count;
|
||||
if (ignored) {
|
||||
errno = EILSEQ;
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
// Creates a converter from __src_encoding to __dst_encoding. Only the
// destination name may carry a "//IGNORE" or "//TRANSLIT" suffix. Returns
// INVALID_ICONV_T with errno set to EINVAL if either name is rejected.
iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) {
  iconv_t converter = new __iconv_t;

  // The source is parsed first, and with no mode pointer so that a suffix on
  // it is refused; the destination is only parsed if the source was accepted.
  const bool parsed =
      __parse_encoding(__src_encoding, &converter->src_encoding, nullptr) &&
      __parse_encoding(__dst_encoding, &converter->dst_encoding, &converter->mode);
  if (parsed) return converter;

  delete converter;
  errno = EINVAL;
  return INVALID_ICONV_T;
}
|
||||
|
||||
// Converts characters from *__src_buf into *__dst_buf using __converter,
// advancing both buffer pointers and decrementing both byte counts as it
// goes. Returns (size_t)-1 with errno set on failure (EBADF for an invalid
// converter, otherwise whatever the conversion reports).
size_t iconv(iconv_t __converter,
             char** __src_buf, size_t* __src_bytes_left,
             char** __dst_buf, size_t* __dst_bytes_left) {
  if (__converter == INVALID_ICONV_T) {
    errno = EBADF;
    return -1;
  }

  // POSIX: a null __src_buf (or a null *__src_buf) is a request to return the
  // converter to its initial state rather than to convert anything. The
  // converter already resets its state at the start of every conversion, so
  // there is nothing to do, and the other arguments must not be dereferenced.
  if (__src_buf == nullptr || *__src_buf == nullptr) return 0;

  return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left);
}
|
||||
|
||||
// Destroys a converter created by iconv_open(). Returns 0 on success, or -1
// with errno set to EBADF if __converter is the invalid-converter sentinel.
int iconv_close(iconv_t __converter) {
  const bool valid = (__converter != INVALID_ICONV_T);
  if (valid) {
    delete __converter;
    return 0;
  }
  errno = EBADF;
  return -1;
}
|
|
@ -0,0 +1,46 @@
|
|||
/*
|
||||
* Copyright (C) 2017 The Android Open Source Project
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
||||
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _ICONV_H_
|
||||
#define _ICONV_H_
|
||||
|
||||
#include <sys/cdefs.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
struct __iconv_t;
|
||||
typedef struct __iconv_t* iconv_t;
|
||||
|
||||
iconv_t iconv_open(const char* __src_encoding, const char* __dst_encoding) __INTRODUCED_IN_FUTURE;
|
||||
size_t iconv(iconv_t __converter, char** __src_buf, size_t* __src_bytes_left, char** __dst_buf, size_t* __dst_bytes_left) __INTRODUCED_IN_FUTURE;
|
||||
int iconv_close(iconv_t __converter) __INTRODUCED_IN_FUTURE;
|
||||
|
||||
__END_DECLS
|
||||
|
||||
#endif
|
|
@ -1323,6 +1323,9 @@ LIBC_P {
|
|||
__freading; # future
|
||||
__fwriting; # future
|
||||
getlogin_r; # future
|
||||
iconv; # future
|
||||
iconv_close; # future
|
||||
iconv_open; # future
|
||||
syncfs; # future
|
||||
} LIBC_O;
|
||||
|
||||
|
|
|
@ -1243,6 +1243,9 @@ LIBC_P {
|
|||
__freading; # future
|
||||
__fwriting; # future
|
||||
getlogin_r; # future
|
||||
iconv; # future
|
||||
iconv_close; # future
|
||||
iconv_open; # future
|
||||
syncfs; # future
|
||||
} LIBC_O;
|
||||
|
||||
|
|
|
@ -1348,6 +1348,9 @@ LIBC_P {
|
|||
__freading; # future
|
||||
__fwriting; # future
|
||||
getlogin_r; # future
|
||||
iconv; # future
|
||||
iconv_close; # future
|
||||
iconv_open; # future
|
||||
syncfs; # future
|
||||
} LIBC_O;
|
||||
|
||||
|
|
|
@ -1307,6 +1307,9 @@ LIBC_P {
|
|||
__freading; # future
|
||||
__fwriting; # future
|
||||
getlogin_r; # future
|
||||
iconv; # future
|
||||
iconv_close; # future
|
||||
iconv_open; # future
|
||||
syncfs; # future
|
||||
} LIBC_O;
|
||||
|
||||
|
|
|
@ -1243,6 +1243,9 @@ LIBC_P {
|
|||
__freading; # future
|
||||
__fwriting; # future
|
||||
getlogin_r; # future
|
||||
iconv; # future
|
||||
iconv_close; # future
|
||||
iconv_open; # future
|
||||
syncfs; # future
|
||||
} LIBC_O;
|
||||
|
||||
|
|
|
@ -1305,6 +1305,9 @@ LIBC_P {
|
|||
__freading; # future
|
||||
__fwriting; # future
|
||||
getlogin_r; # future
|
||||
iconv; # future
|
||||
iconv_close; # future
|
||||
iconv_open; # future
|
||||
syncfs; # future
|
||||
} LIBC_O;
|
||||
|
||||
|
|
|
@ -1243,6 +1243,9 @@ LIBC_P {
|
|||
__freading; # future
|
||||
__fwriting; # future
|
||||
getlogin_r; # future
|
||||
iconv; # future
|
||||
iconv_close; # future
|
||||
iconv_open; # future
|
||||
syncfs; # future
|
||||
} LIBC_O;
|
||||
|
||||
|
|
|
@ -33,6 +33,9 @@
|
|||
#include <wchar.h>
|
||||
|
||||
typedef int8_t UBool;
|
||||
#define FALSE 0
|
||||
#define TRUE 1
|
||||
|
||||
typedef int32_t UChar32;
|
||||
|
||||
enum UProperty {
|
||||
|
|
|
@ -70,6 +70,7 @@ cc_test_library {
|
|||
"getauxval_test.cpp",
|
||||
"getcwd_test.cpp",
|
||||
"grp_pwd_test.cpp",
|
||||
"iconv_test.cpp",
|
||||
"ifaddrs_test.cpp",
|
||||
"inttypes_test.cpp",
|
||||
"langinfo_test.cpp",
|
||||
|
|
|
@ -0,0 +1,429 @@
|
|||
/*
|
||||
* Copyright (C) 2017 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <iconv.h>
|
||||
|
||||
#define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
|
||||
|
||||
TEST(iconv, iconv_open_EINVAL) {
  // Each {dst, src} pair names at least one encoding that doesn't exist.
  static const char* const kBadPairs[][2] = {
    {"silly", "silly"},
    {"silly", "UTF-8"},
    {"UTF-8", "silly"},
  };
  for (const auto& pair : kBadPairs) {
    errno = 0;
    ASSERT_EQ(INVALID_ICONV_T, iconv_open(pair[0], pair[1]));
    ASSERT_EQ(EINVAL, errno);
  }
}
|
||||
|
||||
TEST(iconv, iconv_open_comparator) {
  // Examples from http://www.unicode.org/reports/tr22/#Charset_Alias_Matching:
  // "For example, the following names should match: "UTF-8", "utf8", "u.t.f-008", ..."
  iconv_t c;
  ASSERT_NE(INVALID_ICONV_T, c = iconv_open("UTF-8", "utf8"));
  ASSERT_EQ(0, iconv_close(c));
  ASSERT_NE(INVALID_ICONV_T, c = iconv_open("UTF-8", "u.t.f-008"));
  ASSERT_EQ(0, iconv_close(c));

  // "...but not "utf-80" or "ut8"."
  errno = 0;
  ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "utf-80"));
  ASSERT_EQ(EINVAL, errno);
  errno = 0;
  ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "ut8"));
  ASSERT_EQ(EINVAL, errno);

  // Not one of the examples quoted above, but it must not match either.
  errno = 0;
  ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "ut80"));
  ASSERT_EQ(EINVAL, errno);
}
|
||||
|
||||
TEST(iconv, iconv_smoke) {
  const char* utf8 = "a٦ᄀ"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80
  char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("UTF-32LE", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  EXPECT_EQ(0U, iconv(c, &src, &src_left, &dst, &dst_left));

  // The output was requested as UTF-32LE; inspect it one wchar_t at a time.
  // The zero-filled buffer supplies the terminator.
  const wchar_t* decoded = reinterpret_cast<wchar_t*>(buf);
  const wchar_t expected[] = { L'a', L'٦', L'ᄀ', L'\0' };
  for (size_t i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
    EXPECT_EQ(expected[i], decoded[i]);
  }
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(buf) - (3 /* chars */ * 4 /* bytes each */), dst_left);

  ASSERT_EQ(0, iconv_close(c));
}
|
||||
|
||||
TEST(iconv, iconv_lossy_TRANSLIT) {
  const char* utf8 = "a٦ᄀz"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80
  char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("ASCII//TRANSLIT", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // Two of the input characters (5 input bytes) have no ASCII equivalent.
  // "//TRANSLIT" substitutes a replacement character for each and reports
  // how many substitutions were made.
  EXPECT_EQ(2U, iconv(c, &src, &src_left, &dst, &dst_left));

  // The buffer started zero-filled, so this also checks the byte after 'z'.
  EXPECT_STREQ("a??z", buf);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(buf) - 4, dst_left);

  ASSERT_EQ(0, iconv_close(c));
}
|
||||
|
||||
TEST(iconv, iconv_lossy_IGNORE) {
  const char* utf8 = "a٦ᄀz"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80
  char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("ASCII//IGNORE", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // Two of the input characters (5 input bytes) have no ASCII equivalent.
  // "//IGNORE" drops them, but the call still reports failure.
  errno = 0;
  EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(EILSEQ, errno);

  // The buffer started zero-filled, so this also checks the byte after 'z'.
  EXPECT_STREQ("az", buf);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(buf) - 2, dst_left);

  ASSERT_EQ(0, iconv_close(c));
}
|
||||
|
||||
TEST(iconv, iconv_lossy) {
  const char* utf8 = "a٦ᄀz"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80
  char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("ASCII", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // With no "//" suffix, conversion stops at the second input character,
  // which has no ASCII equivalent.
  errno = 0;
  EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(EILSEQ, errno);

  // Only the 'a' made it out; the buffer started zero-filled.
  EXPECT_STREQ("a", buf);
  EXPECT_EQ(6U, src_left); // Two bytes for ٦, three bytes for ᄀ, and one byte for z.
  EXPECT_EQ(sizeof(buf) - 1, dst_left);

  ASSERT_EQ(0, iconv_close(c));
}
|
||||
|
||||
TEST(iconv, iconv_malformed_sequence_EILSEQ) {
  const char* utf8 = "a\xd9z"; // 0xd9 is the first byte of the two-byte U+0666 ٦.
  char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // The second input byte starts a malformed character, so conversion stops
  // there...
  errno = 0;
  EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(EILSEQ, errno);
  EXPECT_EQ('\xd9', *src); // ...leaving the input pointing at the invalid sequence.

  // Step over the bad byte by hand; the remainder then converts cleanly.
  ++src;
  --src_left;
  errno = 0;
  EXPECT_EQ(0U, iconv(c, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(0, errno);

  // The buffer started zero-filled, so this also checks the byte after 'z'.
  EXPECT_STREQ("az", buf);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(buf) - 2, dst_left);

  ASSERT_EQ(0, iconv_close(c));
}
|
||||
|
||||
TEST(iconv, iconv_incomplete_sequence_EINVAL) {
  const char* utf8 = "a\xd9"; // 0xd9 is the first byte of the two-byte U+0666 ٦.
  char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // The second input byte only starts a character, and the input ends before
  // the character does.
  errno = 0;
  EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(EINVAL, errno);
  EXPECT_EQ('\xd9', *src); // The input is left pointing at the incomplete sequence.

  // Only the 'a' made it out; the buffer started zero-filled.
  EXPECT_STREQ("a", buf);
  EXPECT_EQ(1U, src_left);
  EXPECT_EQ(sizeof(buf) - 1, dst_left);

  ASSERT_EQ(0, iconv_close(c));
}
|
||||
|
||||
TEST(iconv, iconv_E2BIG) {
  const char* utf8 = "abc";
  char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = 1;

  // Runs one conversion step with exactly `capacity` bytes of output space,
  // clearing errno first so the caller can inspect it afterwards.
  auto convert_with_capacity = [&](size_t capacity) {
    dst_left = capacity;
    errno = 0;
    return iconv(c, &src, &src_left, &dst, &dst_left);
  };

  // Three bytes are needed, so one isn't enough (but progress is made).
  EXPECT_EQ(static_cast<size_t>(-1), convert_with_capacity(1));
  EXPECT_EQ(E2BIG, errno);
  EXPECT_EQ(2U, src_left);
  EXPECT_EQ(0U, dst_left);

  // Two bytes remain, so zero isn't enough (and no progress is possible).
  EXPECT_EQ(static_cast<size_t>(-1), convert_with_capacity(0));
  EXPECT_EQ(E2BIG, errno);
  EXPECT_EQ(2U, src_left);
  EXPECT_EQ(0U, dst_left);

  // Two bytes remain, so one isn't enough (but progress is made).
  EXPECT_EQ(static_cast<size_t>(-1), convert_with_capacity(1));
  EXPECT_EQ(E2BIG, errno);
  EXPECT_EQ(1U, src_left);
  EXPECT_EQ(0U, dst_left);

  // One byte remains, so one byte of space now suffices.
  EXPECT_EQ(0U, convert_with_capacity(1));
  EXPECT_EQ(0, errno);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(0U, dst_left);

  // The buffer started zero-filled, so this also checks the byte after 'c'.
  EXPECT_STREQ("abc", buf);

  ASSERT_EQ(0, iconv_close(c));
}
|
||||
|
||||
TEST(iconv, iconv_invalid_converter_EBADF) {
  // Empty buffers: this test only exercises the converter check.
  char* src = nullptr;
  size_t src_left = 0;
  char* dst = nullptr;
  size_t dst_left = 0;

  errno = 0;
  const size_t result = iconv(INVALID_ICONV_T, &src, &src_left, &dst, &dst_left);
  ASSERT_EQ(static_cast<size_t>(-1), result);
  ASSERT_EQ(EBADF, errno);
}
|
||||
|
||||
TEST(iconv, iconv_close_invalid_converter_EBADF) {
  // Closing the invalid-converter sentinel must fail with EBADF.
  errno = 0;
  const int result = iconv_close(INVALID_ICONV_T);
  ASSERT_EQ(-1, result);
  ASSERT_EQ(EBADF, errno);
}
|
||||
|
||||
static void RoundTrip(const char* dst_enc, const char* expected_bytes, size_t n) {
|
||||
// Examples from https://en.wikipedia.org/wiki/UTF-16.
|
||||
const char* utf8 = "$€𐐷"; // U+0024, U+20AC, U+10437.
|
||||
|
||||
iconv_t c = iconv_open(dst_enc, "UTF-8");
|
||||
ASSERT_NE(INVALID_ICONV_T, c) << dst_enc;
|
||||
|
||||
char* in = const_cast<char*>(utf8);
|
||||
size_t in_bytes = strlen(utf8);
|
||||
char buf[BUFSIZ] = {};
|
||||
char* out = buf;
|
||||
size_t out_bytes = sizeof(buf);
|
||||
size_t replacement_count = iconv(c, &in, &in_bytes, &out, &out_bytes);
|
||||
|
||||
// Check we got the bytes we were expecting.
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
EXPECT_EQ(expected_bytes[i], buf[i]) << i << ' '<< dst_enc;
|
||||
}
|
||||
|
||||
ASSERT_EQ(0, iconv_close(c));
|
||||
|
||||
// We can't round-trip if there were replacements.
|
||||
if (strstr(dst_enc, "ascii")) {
|
||||
GTEST_LOG_(INFO) << "can't round-trip " << dst_enc << "\n";
|
||||
return;
|
||||
}
|
||||
ASSERT_EQ(0U, replacement_count);
|
||||
|
||||
c = iconv_open("UTF-8", dst_enc);
|
||||
ASSERT_NE(INVALID_ICONV_T, c) << dst_enc;
|
||||
|
||||
in = buf;
|
||||
in_bytes = n;
|
||||
char buf2[BUFSIZ] = {};
|
||||
out = buf2;
|
||||
out_bytes = sizeof(buf2);
|
||||
iconv(c, &in, &in_bytes, &out, &out_bytes);
|
||||
|
||||
ASSERT_STREQ(utf8, buf2) << dst_enc;
|
||||
|
||||
ASSERT_EQ(0, iconv_close(c));
|
||||
}
|
||||
|
||||
TEST(iconv, iconv_round_trip_ascii) {
  // Only '$' is ASCII; the other two characters each become a '?'.
  static const char kExpected[] = "$??";
  RoundTrip("ascii//TRANSLIT", kExpected, sizeof(kExpected) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_round_trip_utf8) {
  // One-, three- and four-byte UTF-8 sequences.
  static const char kExpected[] = "\x24\xe2\x82\xac\xf0\x90\x90\xb7";
  RoundTrip("utf8", kExpected, sizeof(kExpected) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_round_trip_utf16be) {
  // Two BMP code units followed by a surrogate pair, high byte first.
  static const char kExpected[] = "\x00\x24" "\x20\xac" "\xd8\x01\xdc\x37";
  RoundTrip("utf16be", kExpected, sizeof(kExpected) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_round_trip_utf16le) {
  // Two BMP code units followed by a surrogate pair, low byte first.
  static const char kExpected[] = "\x24\x00" "\xac\x20" "\x01\xd8\x37\xdc";
  RoundTrip("utf16le", kExpected, sizeof(kExpected) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_round_trip_utf32be) {
  // Three four-byte code points, high byte first.
  static const char kExpected[] = "\x00\x00\x00\x24" "\x00\x00\x20\xac" "\x00\x01\x04\x37";
  RoundTrip("utf32be", kExpected, sizeof(kExpected) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_round_trip_utf32le) {
  // Three four-byte code points, low byte first.
  static const char kExpected[] = "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00";
  RoundTrip("utf32le", kExpected, sizeof(kExpected) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_round_trip_wchar_t) {
  // The expected bytes are the same as for "utf32le".
  static const char kExpected[] = "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00";
  RoundTrip("wchar_t", kExpected, sizeof(kExpected) - 1);
}
|
||||
|
||||
static void Check(int expected_errno, const char* src_enc, const char* src, size_t n) {
|
||||
iconv_t c = iconv_open("wchar_t", src_enc);
|
||||
char* in = const_cast<char*>(src);
|
||||
size_t in_bytes = n;
|
||||
wchar_t out_buf[16];
|
||||
size_t out_bytes = sizeof(out_buf);
|
||||
char* out = reinterpret_cast<char*>(out_buf);
|
||||
errno = 0;
|
||||
ASSERT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes));
|
||||
EXPECT_EQ(expected_errno, errno);
|
||||
EXPECT_EQ(0, iconv_close(c));
|
||||
}
|
||||
|
||||
TEST(iconv, iconv_EILSEQ_ascii) {
  // 0xac is greater than 0x7f, so it isn't ASCII.
  static const char kInput[] = "\xac";
  Check(EILSEQ, "ASCII", kInput, sizeof(kInput) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_EILSEQ_utf8_initial) {
  // 0x82 is not valid as the first byte of a UTF-8 sequence.
  static const char kInput[] = "\x82";
  Check(EILSEQ, "utf8", kInput, sizeof(kInput) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_EILSEQ_utf8_non_initial) {
  // The second byte is invalid: it is another lead byte, not a continuation.
  static const char kInput[] = "\xe2\xe2\x82";
  Check(EILSEQ, "utf8", kInput, sizeof(kInput) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_EILSEQ_utf16be_low_surrogate_first) {
  // A surrogate pair in the wrong order: low (0xdc37) then high (0xd801).
  static const char kInput[] = "\xdc\x37" "\xd8\x01";
  Check(EILSEQ, "utf16be", kInput, sizeof(kInput) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_EILSEQ_utf16le_low_surrogate_first) {
  // A surrogate pair in the wrong order: low (0xdc37) then high (0xd801).
  static const char kInput[] = "\x37\xdc" "\x01\xd8";
  Check(EILSEQ, "utf16le", kInput, sizeof(kInput) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_EINVAL_utf8_short) {
  // A three-byte sequence with its final byte missing.
  static const char kInput[] = "\xe2\x82";
  Check(EINVAL, "utf8", kInput, sizeof(kInput) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_EINVAL_utf16be_short) {
  // A 16-bit unit with its second byte missing.
  static const char kInput[] = "\x00";
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_EINVAL_utf16be_missing_low_surrogate) {
  // A high surrogate (0xd801) with nothing after it.
  static const char kInput[] = "\xd8\x01";
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_EINVAL_utf16be_half_low_surrogate) {
  // A high surrogate (0xd801) followed by only one byte of the low surrogate.
  static const char kInput[] = "\xd8\x01\xdc";
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_EINVAL_utf16le_short) {
  // A 16-bit unit with its second byte missing.
  static const char kInput[] = "\x24";
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_EINVAL_utf16le_missing_low_surrogate) {
  // A high surrogate (0xd801) with nothing after it.
  static const char kInput[] = "\x01\xd8";
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_EINVAL_utf16le_half_low_surrogate) {
  // A high surrogate (0xd801) followed by only one byte of the low surrogate.
  static const char kInput[] = "\x01\xd8\x37";
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_EINVAL_utf32be_short) {
  // A 32-bit unit with its final byte missing.
  static const char kInput[] = "\x00\x00\x00";
  Check(EINVAL, "utf32be", kInput, sizeof(kInput) - 1);
}
|
||||
|
||||
TEST(iconv, iconv_EINVAL_utf32le_short) {
  // A 32-bit unit with its final byte missing.
  static const char kInput[] = "\x24\x00\x00";
  Check(EINVAL, "utf32le", kInput, sizeof(kInput) - 1);
}
|
Loading…
Reference in New Issue