/* test-tounicode.c --- Self tests for Libidn2 _to_unicode_ functions.
Copyright (C) 2017 Tim Rühsen
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
/* Based on GNU Libidn tst_punycode.c */
#include
#include
#include
#include
#include
#include
#include /* u8_strconv_from_locale */
#include
#include
typedef struct
{
const char *name;
const char *punycode;
const uint32_t u32_expected[48];
int rc_expected;
} test_t;
const test_t test[] = {
{
"(A) Arabic (Egyptian)",
"xn--egbpdaj6bu4bxfgehfvwxn",
{
0x0644, 0x064A, 0x0647, 0x0645, 0x0627, 0x0628, 0x062A, 0x0643,
0x0644, 0x0645, 0x0648, 0x0634, 0x0639, 0x0631, 0x0628, 0x064A,
0x061F, 0,
},
IDN2_OK
},
{
"(B) Chinese (simplified)",
"xn--ihqwcrb4cv8a8dqg056pqjye",
{
0x4ED6, 0x4EEC, 0x4E3A, 0x4EC0, 0x4E48, 0x4E0D, 0x8BF4, 0x4E2D,
0x6587, 0
},
IDN2_OK
},
{
"(C) Chinese (traditional)",
"xn--ihqwctvzc91f659drss3x8bo0yb",
{
0x4ED6, 0x5011, 0x7232, 0x4EC0, 0x9EBD, 0x4E0D, 0x8AAA, 0x4E2D,
0x6587, 0
},
IDN2_OK
},
{
"(D) Czech: Proprostnemluvesky",
"xn--Proprostnemluvesky-uyb24dma41a",
{
0x0050, 0x0072, 0x006F, 0x010D, 0x0070, 0x0072, 0x006F, 0x0073,
0x0074, 0x011B, 0x006E, 0x0065, 0x006D, 0x006C, 0x0075, 0x0076,
0x00ED, 0x010D, 0x0065, 0x0073, 0x006B, 0x0079, 0
},
IDN2_OK
},
{
"(E) Hebrew:",
"xn--4dbcagdahymbxekheh6e0a7fei0b",
{
0x05DC, 0x05DE, 0x05D4, 0x05D4, 0x05DD, 0x05E4, 0x05E9, 0x05D5,
0x05D8, 0x05DC, 0x05D0, 0x05DE, 0x05D3, 0x05D1, 0x05E8, 0x05D9,
0x05DD, 0x05E2, 0x05D1, 0x05E8, 0x05D9, 0x05EA, 0
},
IDN2_OK
},
{
"(F) Hindi (Devanagari):",
"xn--i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd",
{
0x092F, 0x0939, 0x0932, 0x094B, 0x0917, 0x0939, 0x093F, 0x0928,
0x094D, 0x0926, 0x0940, 0x0915, 0x094D, 0x092F, 0x094B, 0x0902,
0x0928, 0x0939, 0x0940, 0x0902, 0x092C, 0x094B, 0x0932, 0x0938,
0x0915, 0x0924, 0x0947, 0x0939, 0x0948, 0x0902, 0
},
IDN2_OK
},
{
"(G) Japanese (kanji and hiragana):",
"xn--n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa",
{
0x306A, 0x305C, 0x307F, 0x3093, 0x306A, 0x65E5, 0x672C, 0x8A9E,
0x3092, 0x8A71, 0x3057, 0x3066, 0x304F, 0x308C, 0x306A, 0x3044,
0x306E, 0x304B, 0
},
IDN2_OK
},
{
"(H) Korean (Hangul syllables):",
"xn--989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c",
{
0xC138, 0xACC4, 0xC758, 0xBAA8, 0xB4E0, 0xC0AC, 0xB78C, 0xB4E4,
0xC774, 0xD55C, 0xAD6D, 0xC5B4, 0xB97C, 0xC774, 0xD574, 0xD55C,
0xB2E4, 0xBA74, 0xC5BC, 0xB9C8, 0xB098, 0xC88B, 0xC744, 0xAE4C, 0
},
IDN2_OK
},
{
"(I) Russian (Cyrillic):",
"xn--b1abfaaepdrnnbgefbadotcwatmq2g4l",
{
0x043F, 0x043E, 0x0447, 0x0435, 0x043C, 0x0443, 0x0436, 0x0435,
0x043E, 0x043D, 0x0438, 0x043D, 0x0435, 0x0433, 0x043E, 0x0432,
0x043E, 0x0440, 0x044F, 0x0442, 0x043F, 0x043E, 0x0440, 0x0443,
0x0441, 0x0441, 0x043A, 0x0438, 0
},
IDN2_OK
},
{
"(J) Spanish: PorqunopuedensimplementehablarenEspaol",
"xn--PorqunopuedensimplementehablarenEspaol-fmd56a",
{
0x0050, 0x006F, 0x0072, 0x0071, 0x0075, 0x00E9, 0x006E, 0x006F,
0x0070, 0x0075, 0x0065, 0x0064, 0x0065, 0x006E, 0x0073, 0x0069,
0x006D, 0x0070, 0x006C, 0x0065, 0x006D, 0x0065, 0x006E, 0x0074,
0x0065, 0x0068, 0x0061, 0x0062, 0x006C, 0x0061, 0x0072, 0x0065,
0x006E, 0x0045, 0x0073, 0x0070, 0x0061, 0x00F1, 0x006F, 0x006C, 0
},
IDN2_OK
},
{
"(K) Vietnamese:",
"xn--TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g",
{
0x0054, 0x1EA1, 0x0069, 0x0073, 0x0061, 0x006F, 0x0068, 0x1ECD,
0x006B, 0x0068, 0x00F4, 0x006E, 0x0067, 0x0074, 0x0068, 0x1EC3,
0x0063, 0x0068, 0x1EC9, 0x006E, 0x00F3, 0x0069, 0x0074, 0x0069,
0x1EBF, 0x006E, 0x0067, 0x0056, 0x0069, 0x1EC7, 0x0074, 0
},
IDN2_OK
},
{
"(L) 3B",
"xn--3B-ww4c5e180e575a65lsy2b",
{
0x0033, 0x5E74, 0x0042, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F, 0
},
IDN2_OK
},
{
"(M) -with-SUPER-MONKEYS",
"xn---with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n",
{
0x5B89, 0x5BA4, 0x5948, 0x7F8E, 0x6075, 0x002D, 0x0077, 0x0069,
0x0074, 0x0068, 0x002D, 0x0053, 0x0055, 0x0050, 0x0045, 0x0052,
0x002D, 0x004D, 0x004F, 0x004E, 0x004B, 0x0045, 0x0059, 0x0053, 0
},
IDN2_OK
},
{
"(N) Hello-Another-Way-",
"xn--Hello-Another-Way--fc4qua05auwb3674vfr0b",
{
0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002D, 0x0041, 0x006E,
0x006F, 0x0074, 0x0068, 0x0065, 0x0072, 0x002D, 0x0057, 0x0061,
0x0079, 0x002D, 0x305D, 0x308C, 0x305E, 0x308C, 0x306E, 0x5834,
0x6240, 0
},
IDN2_OK
},
{
"(O) 2",
"xn--2-u9tlzr9756bt3uc0v",
{
0x3072, 0x3068, 0x3064, 0x5C4B, 0x6839, 0x306E, 0x4E0B, 0x0032, 0
},
IDN2_OK
},
{
"(P) MajiKoi5",
"xn--MajiKoi5-783gue6qz075azm5e",
{
0x004D, 0x0061, 0x006A, 0x0069, 0x3067, 0x004B, 0x006F, 0x0069,
0x3059, 0x308B, 0x0035, 0x79D2, 0x524D, 0
},
IDN2_OK
},
{
"(Q) de",
"xn--de-jg4avhby1noc0d",
{
0x30D1, 0x30D5, 0x30A3, 0x30FC, 0x0064, 0x0065, 0x30EB, 0x30F3, 0x30D0
},
IDN2_OK
},
{
"(R) ",
"xn--d9juau41awczczp",
{
0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067, 0
},
IDN2_OK
},
{
"(S) -> $1",
"xn---> $1-",
{
0x002D, 0x003E, 0x0020, 0x0024, 0x0031, 0
},
IDN2_PUNYCODE_BAD_INPUT
},
{
"(T) -> $1",
"xn---> $1",
{
0x002D, 0x003E, 0x0020, 0x0024, 0x0031, 0
},
IDN2_PUNYCODE_BAD_INPUT
},
{
"(U) -> $1.00 <-",
"xn---> $1.00 <--",
{
0x002D, 0x003E, 0x0020, 0x0024, 0x0031, 0x002E, 0x0030, 0x0030,
0x0020, 0x003C, 0x002D, 0
},
IDN2_PUNYCODE_BAD_INPUT
},
{
"(V) ",
"xn--d9juau41awczczp.xn--bel-goa.DE",
{
0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067, 0x002E,
0x00FC, 0x0062, 0x0065, 0x006c, 0x002e, 0x0044, 0x0045, 0
},
IDN2_OK
},
{
"(W) ",
"XN--d9juau41awczczp",
{
0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067, 0
},
IDN2_OK
},
{
"(X) utf-8 crash ?",
"\200bad.com",
{
0
},
IDN2_ENCODING_ERROR /* or IDN2_ICONV_FAIL with idn2_to_unicode_lzlz() due to bad UTF-8 input */
},
/* Test vectors from https://bugs.debian.org/610617 */
{
"Debian test #1",
"XN----7SBAABF4DLDYSIEHP4NTB.XN--P1AI",
{
0x0441, 0x0430, 0x043c, 0x0430, 0x0440, 0x0441, 0x043a, 0x0430,
0x044f, 0x002d, 0x043e, 0x0431, 0x043b, 0x0430, 0x0441, 0x0442,
0x044c, 0x002e, 0x0440, 0x0444, 0
},
IDN2_OK
},
{
"Debian test #2",
"xn----7SBAABF4DLDYSIEHP4NTB.XN--P1AI",
{
0x0441, 0x0430, 0x043c, 0x0430, 0x0440, 0x0441, 0x043a, 0x0430,
0x044f, 0x002d, 0x043e, 0x0431, 0x043b, 0x0430, 0x0441, 0x0442,
0x044c, 0x002e, 0x0440, 0x0444, 0
},
IDN2_OK
},
{
"Debian test #3",
"xn----7SBAABF4DLDYSIEHP4NTB.xn--P1AI",
{
0x0441, 0x0430, 0x043c, 0x0430, 0x0440, 0x0441, 0x043a, 0x0430,
0x044f, 0x002d, 0x043e, 0x0431, 0x043b, 0x0430, 0x0441, 0x0442,
0x044c, 0x002e, 0x0440, 0x0444, 0
},
IDN2_OK
},
/* Test vectors copied from gnutls */
{
"GnuTLS test #1",
"xn--nxasmm1c.com",
{
0x03b2, 0x03cc, 0x03bb, 0x03bf, 0x03c2, 0x002e, 0x0063, 0x006f,
0x006d, 0
},
IDN2_OK
},
{
"GnuTLS test #2",
"xn--nxasmq6b.com",
{
0x03b2, 0x03cc, 0x03bb, 0x03bf, 0x03c3, 0x002e, 0x0063, 0x006f,
0x006d, 0
},
IDN2_OK
},
{
"GnuTLS test #3",
"xn--fa-hia.de",
{
0x0066, 0x0061, 0x00df, 0x002e, 0x0064, 0x0065, 0
},
IDN2_OK
},
{
"GnuTLS test #4",
"xn--bcher-kva.de",
{
0x0062, 0x00fc, 0x0063, 0x0068, 0x0065, 0x0072, 0x002e, 0x0064,
0x0065, 0
},
IDN2_OK
},
{
"GnuTLS test #5",
"xn--wgv71a119e.jp",
{
0x65e5, 0x672c, 0x8a9e, 0x002e, 0x006a, 0x0070, 0
},
IDN2_OK
},
{
"GnuTLS test #6",
"xn--fiqu1az03c18t.xn--mxah1amo.com",
{
0x7b80, 0x4f53, 0x4e2d, 0x6587, 0x002e, 0x03b5, 0x03be, 0x03c4,
0x03c1, 0x03b1, 0x002e, 0x0063, 0x006f, 0x006d, 0
},
IDN2_OK
},
{
"Empty label",
"xn---.de",
{
0x2e, 0x64, 0x65, 0
},
IDN2_PUNYCODE_BAD_INPUT
},
{
"No ASCII char but delimiter",
"xn---tda.de",
{
0x2e, 0x64, 0x65, 0
},
IDN2_PUNYCODE_BAD_INPUT
},
};
static int debug = 1;
static int error_count = 0;
static int break_on_error = 0;
_GL_ATTRIBUTE_FORMAT_PRINTF (1, 2) static void
fail (const char *format, ...)
{
va_list arg_ptr;
va_start (arg_ptr, format);
vprintf (format, arg_ptr);
va_end (arg_ptr);
error_count++;
if (break_on_error)
exit (EXIT_FAILURE);
}
static void
ucs4print (const uint32_t * str, size_t len)
{
size_t i;
for (i = 0; i < len; i++)
printf ("U+%04x%s", str[i], (i + 1) % 8 ? " " : "\n");
if (len % 8)
printf ("\n");
}
static size_t G_GNUC_IDN2_ATTRIBUTE_PURE
_u32_strlen(const uint32_t *s)
{
const uint32_t *e;
for (e = s; *e; e++)
;
return e - s;
}
static size_t G_GNUC_IDN2_ATTRIBUTE_PURE
_u32_strcmp(const uint32_t *s1, const uint32_t *s2)
{
while (*s1 && *s2 && *s1 == *s2)
s1++, s2++;
return *s1 - *s2;
}
static void
_check_4z(const test_t *t, int rc, uint32_t *ucs4, const char *funcname)
{
if (rc != t->rc_expected && !(rc == IDN2_ICONV_FAIL && t->rc_expected == IDN2_ENCODING_ERROR))
{
printf ("Test[%u] '%s' failed (got %d, expected %d):\n",
(unsigned) (t - test), t->name, rc, t->rc_expected);
fail (" %s(): %s\n", funcname, idn2_strerror (rc));
}
else if (rc == IDN2_OK)
{
if (_u32_strcmp (t->u32_expected, ucs4) != 0)
{
if (debug)
{
printf ("got:\n");
ucs4print (ucs4, _u32_strlen (ucs4));
printf ("expected:\n");
ucs4print (t->u32_expected, _u32_strlen (t->u32_expected));
}
fail ("%s() entry %u mismatch\n", funcname, (unsigned) (t - test));
}
}
else if (debug)
printf ("returned %d expected %d (%s)\n",
rc, t->rc_expected, idn2_strerror (t->rc_expected));
free (ucs4);
}
int
main (void)
{
static uint32_t abc_u32[] = { 'a', 'b', 'c', 0 };
static uint32_t xntda_u32[] = { 'x', 'n', '-', '-', 't', 'd', 'a', 0 };
uint32_t q[128];
uint32_t *ucs4, *punycode_u32;
uint8_t *utf8;
char *utf8_lz;
const char *encoding;
size_t outlen, outlen2;
int rc, skip_lz = 0;
unsigned i;
/* Need to set UTF-8 for u8_strconv_from_locale / u8_strconv_to_locale to work.
* At least on Debian with libunistring 0.9.6+really0.9.3-0.1 and LC_ALL=C valgrind
* reports Conditional jump or move depends on uninitialised value */
setlocale (LC_ALL, "C.UTF-8");
encoding = locale_charset();
if (debug)
printf("charset=%s\n", encoding);
if (strcmp(encoding, "UTF-8") != 0)
skip_lz = 1;
for (i = 0; i < sizeof (test) / sizeof (test[0]); i++)
{
const test_t *t = test + i;
if (debug)
printf ("\nPUNYCODE entry %u: %s\n", i, t->name);
ucs4 = NULL; /* freed by _check_4z */
rc = idn2_to_unicode_8z4z (t->punycode, &ucs4, 0);
_check_4z (t, rc, ucs4, "idn2_to_unicode_8z4z");
punycode_u32 = u8_to_u32 (
(uint8_t *) t->punycode, strlen (t->punycode) + 1, NULL, &outlen);
if (punycode_u32)
{
ucs4 = NULL; /* freed by _check_4z */
rc = idn2_to_unicode_4z4z(punycode_u32, &ucs4, 0);
_check_4z(t, rc, ucs4, "idn2_to_unicode_4z4z");
outlen2 = sizeof (q) / sizeof (q[0]) - 1;
rc = idn2_to_unicode_44i(punycode_u32, outlen - 1, q, &outlen2, 0);
ucs4 = u32_cpy_alloc(q, outlen2 + 1);
ucs4[outlen2] = 0;
_check_4z(t, rc, ucs4, "idn2_to_unicode_44i");
free(punycode_u32);
}
ucs4 = NULL;
rc = idn2_to_unicode_8z8z (t->punycode, (char **) &utf8, 0);
if (rc == IDN2_OK)
{
ucs4 = u8_to_u32 (utf8, u8_strlen (utf8) + 1, NULL, &outlen);
free (utf8);
}
_check_4z (t, rc, ucs4, "idn2_to_unicode_8z8z");
if (skip_lz)
continue;
ucs4 = NULL;
rc = idn2_to_unicode_8zlz (t->punycode, &utf8_lz, 0);
if (rc == IDN2_OK)
{
utf8 = u8_strconv_from_encoding (utf8_lz, encoding, iconveh_error);
free (utf8_lz);
ucs4 = u8_to_u32 (utf8, u8_strlen (utf8) + 1, NULL, &outlen);
free (utf8);
}
_check_4z (t, rc, ucs4, "idn2_to_unicode_8zlz");
/* Since the test punycodes are completely ASCII,
idn2_to_unicode_8zlz and idn2_to_unicode_lzlz should
have the same results */
ucs4 = NULL;
rc = idn2_to_unicode_lzlz (t->punycode, (char **) &utf8_lz, 0);
if (rc == IDN2_OK)
{
utf8 = u8_strconv_from_encoding (utf8_lz, encoding, iconveh_error);
free (utf8_lz);
ucs4 = u8_to_u32 (utf8, u8_strlen (utf8) + 1, NULL, &outlen);
free (utf8);
}
_check_4z (t, rc, ucs4, "idn2_to_unicode_lzlz");
}
/* special checks with NULL values */
idn2_to_unicode_8z4z (NULL, NULL, 0);
idn2_to_unicode_8z4z (NULL, &ucs4, 0);
idn2_to_unicode_8z4z ("abc", NULL, 0);
idn2_to_unicode_8z4z ("xn--tda", NULL, 0);
idn2_to_unicode_4z4z (NULL, NULL, 0);
idn2_to_unicode_4z4z (NULL, &ucs4, 0);
idn2_to_unicode_4z4z (abc_u32, NULL, 0);
idn2_to_unicode_4z4z (xntda_u32, NULL, 0);
idn2_to_unicode_44i (NULL, 0, NULL, NULL, 0);
idn2_to_unicode_44i (NULL, 0, NULL, &outlen, 0);
idn2_to_unicode_44i (NULL, 0, q, NULL, 0);
outlen = 32; idn2_to_unicode_44i (NULL, 0, q, &outlen, 0);
outlen = 0; idn2_to_unicode_44i (NULL, 0, (uint32_t *) 123, &outlen, 0);
idn2_to_unicode_44i (abc_u32, 0, NULL, NULL, 0);
idn2_to_unicode_44i (abc_u32, 0, NULL, &outlen, 0);
idn2_to_unicode_44i (abc_u32, 0, q, NULL, 0);
outlen = 32; idn2_to_unicode_44i (abc_u32, 0, q, &outlen, 0);
outlen = 0; idn2_to_unicode_44i (abc_u32, 0, (uint32_t *) 123, &outlen, 0);
outlen = 0; idn2_to_unicode_44i (abc_u32, 3, (uint32_t *) 123, &outlen, 0);
outlen = 0; idn2_to_unicode_44i (abc_u32, 3, (uint32_t *) 123, NULL, 0);
idn2_to_unicode_8z8z (NULL, NULL, 0);
idn2_to_unicode_8z8z (NULL, (char **) &utf8, 0);
idn2_to_unicode_8z8z ("abc", NULL, 0);
idn2_to_unicode_8z8z ("xn--tda", NULL, 0);
idn2_to_unicode_8zlz (NULL, NULL, 0);
idn2_to_unicode_8zlz (NULL, (char **) &utf8, 0);
idn2_to_unicode_8zlz ("abc", NULL, 0);
idn2_to_unicode_8zlz ("xn--tda", NULL, 0);
idn2_to_unicode_lzlz (NULL, NULL, 0);
idn2_to_unicode_lzlz (NULL, (char **) &utf8, 0);
idn2_to_unicode_lzlz ("abc", NULL, 0);
idn2_to_unicode_lzlz ("xn--tda", NULL, 0);
if (debug && error_count)
printf("error_count: %d\n", error_count);
return !!error_count;
}