Add optional ICU support (--with-icu): ICU collation for text comparison and
sorting, and ICU case mapping for lower/upper/initcap, in UTF-8 databases.

Reviewer notes:
  * Line breaks and indentation were reconstructed from a whitespace-collapsed
    copy of this patch; refresh context-line whitespace against the target
    tree before applying.  The "index" hashes were not regenerated.
  * The ICU header names on the "#include" lines had been lost in transit; the
    ones used here were chosen from the ICU APIs each file calls -- TODO
    confirm against the original patch.

diff --git a/.gitignore b/.gitignore
index cbf8d79..8218549 100644
--- .gitignore
+++ .gitignore
@@ -38,3 +38,5 @@ lib*.pc
 /Debug/
 /Release/
 /tmp_install/
+
+/configure
diff --git a/configure.in b/configure.in
index 598fbd8..b83545c 100644
--- configure.in
+++ configure.in
@@ -730,6 +730,16 @@ AC_SUBST(with_systemd)
 AC_MSG_RESULT([$with_systemd])
 
 #
+# ICU
+#
+AC_MSG_CHECKING([whether to build with ICU support])
+PGAC_ARG_BOOL(with, icu, no, [  --with-icu              build with ICU support],
+              [AC_DEFINE([USE_ICU], 1, [Define to build with ICU support. (--with-icu)])])
+AC_MSG_RESULT([$with_icu])
+AC_SUBST(with_icu)
+
+
+#
 # Readline
 #
 PGAC_ARG_BOOL(with, readline, yes,
@@ -1120,6 +1130,63 @@ if test "$with_openssl" = yes ; then
   AC_CHECK_FUNCS([SSL_get_current_compression])
 fi
 
+if test "$with_icu" = yes ; then
+  AC_CHECK_LIB(icui18n, ucol_open_57, [], [
+    AC_CHECK_LIB(icui18n, ucol_open_56, [], [
+      AC_CHECK_LIB(icui18n, ucol_open_55, [], [
+        AC_CHECK_LIB(icui18n, ucol_open_54, [], [
+          AC_CHECK_LIB(icui18n, ucol_open_53, [], [
+            AC_CHECK_LIB(icui18n, ucol_open_52, [], [
+              AC_CHECK_LIB(icui18n, ucol_open_50, [], [
+                AC_CHECK_LIB(icui18n, ucol_open_48, [], [
+                  AC_CHECK_LIB(icui18n, ucol_open_46, [], [
+                    AC_CHECK_LIB(icui18n, ucol_open_44, [], [
+                      AC_CHECK_LIB(icui18n, ucol_open_43, [], [
+                        AC_CHECK_LIB(icui18n, ucol_open_3_8, [], [
+                          AC_CHECK_LIB(icui18n, ucol_open_3_6, [], [
+                            AC_CHECK_LIB(icui18n, ucol_open, [], [AC_MSG_ERROR([library 'icui18n' is required for ICU])])
+                          ])
+                        ])
+                      ])
+                    ])
+                  ])
+                ])
+              ])
+            ])
+          ])
+        ])
+      ])
+    ])
+  ])
+  AC_CHECK_LIB(icuuc, ucnv_fromUChars_57, [], [
+    AC_CHECK_LIB(icuuc, ucnv_fromUChars_56, [], [
+      AC_CHECK_LIB(icuuc, ucnv_fromUChars_55, [], [
+        AC_CHECK_LIB(icuuc, ucnv_fromUChars_54, [], [
+          AC_CHECK_LIB(icuuc, ucnv_fromUChars_53, [], [
+            AC_CHECK_LIB(icuuc, ucnv_fromUChars_52, [], [
+              AC_CHECK_LIB(icuuc, ucnv_fromUChars_50, [], [
+                AC_CHECK_LIB(icuuc, ucnv_fromUChars_48, [], [
+                  AC_CHECK_LIB(icuuc, ucnv_fromUChars_46, [], [
+                    AC_CHECK_LIB(icuuc, ucnv_fromUChars_44, [], [
+                      AC_CHECK_LIB(icuuc, ucnv_fromUChars_43, [], [
+                        AC_CHECK_LIB(icuuc, ucnv_fromUChars_3_8, [], [
+                          AC_CHECK_LIB(icuuc, ucnv_fromUChars_3_6, [], [
+                            AC_CHECK_LIB(icuuc, ucnv_fromUChars, [], [AC_MSG_ERROR([library 'icuuc' is required for ICU])])
+                          ])
+                        ])
+                      ])
+                    ])
+                  ])
+                ])
+              ])
+            ])
+          ])
+        ])
+      ])
+    ])
+  ])
+fi
+
 if test "$with_pam" = yes ; then
   AC_CHECK_LIB(pam,    pam_start, [], [AC_MSG_ERROR([library 'pam' is required for PAM])])
 fi
@@ -1273,6 +1340,10 @@ if test "$with_openssl" = yes ; then
   AC_CHECK_HEADER(openssl/err.h, [], [AC_MSG_ERROR([header file <openssl/err.h> is required for OpenSSL])])
 fi
 
+if test "$with_icu" = yes ; then
+  AC_CHECK_HEADER(unicode/utypes.h, [], [AC_MSG_ERROR([header file <unicode/utypes.h> is required for ICU])])
+fi
+
 if test "$with_pam" = yes ; then
   AC_CHECK_HEADERS(security/pam_appl.h, [],
                    [AC_CHECK_HEADERS(pam/pam_appl.h, [],
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index bbd97dc..6d8886e 100644
--- src/backend/utils/adt/formatting.c
+++ src/backend/utils/adt/formatting.c
@@ -92,6 +92,12 @@
 #include "utils/numeric.h"
 #include "utils/pg_locale.h"
 
+#ifdef USE_ICU
+#define U_CHARSET_IS_UTF8 1
+#include <unicode/utypes.h>
+#include <unicode/ucasemap.h>
+#endif   /* USE_ICU */
+
 /* ----------
  * Routines type
  * ----------
@@ -940,6 +946,11 @@ typedef struct NUMProc
 } NUMProc;
 
 
+#ifdef USE_ICU
+static UCaseMap *default_casemap = NULL; /* used for UTF-8 transcriptions */
+#endif   /* USE_ICU */
+
+
 /* ----------
  * Functions
  * ----------
@@ -1491,6 +1502,70 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
 	{
 		result = asc_tolower(buff, nbytes);
 	}
+#ifdef USE_ICU
+	else if (GetDatabaseEncoding() == PG_UTF8) {
+		/*
+		 * optimized and much simpler version for UTF-8
+		 */
+		uint32_t	buflen;
+		UErrorCode	status = U_ZERO_ERROR;
+		UCaseMap   *casemap;
+
+		if (default_casemap == NULL)
+		{
+			default_casemap = ucasemap_open(NULL, U_FOLD_CASE_DEFAULT, &status);
+			if (U_FAILURE(status))
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("ICU error: formatting.c, could not get UCaseMap.")));
+			}
+		}
+
+		if (collid != DEFAULT_COLLATION_OID)
+		{
+			if (!OidIsValid(collid))
+			{
+				/*
+				 * This typically means that the parser could not resolve a
+				 * conflict of implicit collations, so report it that way.
+				 */
+				ereport(ERROR,
+						(errcode(ERRCODE_INDETERMINATE_COLLATION),
+						 errmsg("could not determine which collation to use for string comparison"),
+						 errhint("Use the COLLATE clause to set the collation explicitly.")));
+			}
+			casemap = pg_icu_casemap_from_collation(collid);
+			if (casemap == NULL)	/* ICU could not open this collation's locale */
+				casemap = default_casemap;
+		}
+		else
+		{
+			casemap = default_casemap;
+		}
+
+		result = palloc(nbytes + 1); /* add a byte for null termination */
+		/* run desired function */
+		buflen = ucasemap_utf8ToLower(casemap, result, nbytes + 1, buff, nbytes, &status);
+		/*
+		 * In some corner cases like Turkic `I', resulting char* can be longer than source.
+		 * Accept that we run the transcription twice in these rare cases rather than wasting
+		 * memory or clock cycles trying to figure out the correct size.
+		 */
+		if (buflen > nbytes) {
+			pfree(result);
+			result = palloc(buflen + 1);
+			status = U_ZERO_ERROR;
+			buflen = ucasemap_utf8ToLower(casemap, result, buflen + 1, buff, nbytes, &status);
+		}
+		if (U_FAILURE(status))
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("ICU error: Could not modify case")));
+		}
+	}
+#endif   /* USE_ICU */
 #ifdef USE_WIDE_UPPER_LOWER
 	else if (pg_database_encoding_max_length() > 1)
 	{
@@ -1611,6 +1686,70 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
 	{
 		result = asc_toupper(buff, nbytes);
 	}
+#ifdef USE_ICU
+	else if (GetDatabaseEncoding() == PG_UTF8) {
+		/*
+		 * optimized and much simpler version for UTF-8
+		 */
+		uint32_t	buflen;
+		UErrorCode	status = U_ZERO_ERROR;
+		UCaseMap   *casemap;
+
+		if (default_casemap == NULL)
+		{
+			default_casemap = ucasemap_open(NULL, U_FOLD_CASE_DEFAULT, &status);
+			if (U_FAILURE(status))
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("ICU error: formatting.c, could not get UCaseMap.")));
+			}
+		}
+
+		if (collid != DEFAULT_COLLATION_OID)
+		{
+			if (!OidIsValid(collid))
+			{
+				/*
+				 * This typically means that the parser could not resolve a
+				 * conflict of implicit collations, so report it that way.
+				 */
+				ereport(ERROR,
+						(errcode(ERRCODE_INDETERMINATE_COLLATION),
+						 errmsg("could not determine which collation to use for string comparison"),
+						 errhint("Use the COLLATE clause to set the collation explicitly.")));
+			}
+			casemap = pg_icu_casemap_from_collation(collid);
+			if (casemap == NULL)	/* ICU could not open this collation's locale */
+				casemap = default_casemap;
+		}
+		else
+		{
+			casemap = default_casemap;
+		}
+
+		result = palloc(nbytes + 1); /* add a byte for null termination */
+		/* run desired function */
+		buflen = ucasemap_utf8ToUpper(casemap, result, nbytes + 1, buff, nbytes, &status);
+		/*
+		 * In some corner cases like Turkic `I', resulting char* can be longer than source.
+		 * Accept that we run the transcription twice in these rare cases rather than wasting
+		 * memory or clock cycles trying to figure out the correct size.
+		 */
+		if (buflen > nbytes) {
+			pfree(result);
+			result = palloc(buflen + 1);
+			status = U_ZERO_ERROR;
+			buflen = ucasemap_utf8ToUpper(casemap, result, buflen + 1, buff, nbytes, &status);
+		}
+		if (U_FAILURE(status))
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("ICU error: Could not modify case")));
+		}
+	}
+#endif   /* USE_ICU */
 #ifdef USE_WIDE_UPPER_LOWER
 	else if (pg_database_encoding_max_length() > 1)
 	{
@@ -1732,6 +1871,71 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 	{
 		result = asc_initcap(buff, nbytes);
 	}
+#ifdef USE_ICU
+	else if (GetDatabaseEncoding() == PG_UTF8)
+	{
+		/*
+		 * optimized and much simpler version for UTF-8
+		 */
+		uint32_t	buflen;
+		UErrorCode	status = U_ZERO_ERROR;
+		UCaseMap   *casemap;
+
+		if (default_casemap == NULL)
+		{
+			default_casemap = ucasemap_open(NULL, U_FOLD_CASE_DEFAULT, &status);
+			if (U_FAILURE(status))
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("ICU error: formatting.c, could not get UCaseMap.")));
+			}
+		}
+
+		if (collid != DEFAULT_COLLATION_OID)
+		{
+			if (!OidIsValid(collid))
+			{
+				/*
+				 * This typically means that the parser could not resolve a
+				 * conflict of implicit collations, so report it that way.
+				 */
+				ereport(ERROR,
+						(errcode(ERRCODE_INDETERMINATE_COLLATION),
+						 errmsg("could not determine which collation to use for string comparison"),
+						 errhint("Use the COLLATE clause to set the collation explicitly.")));
+			}
+			casemap = pg_icu_casemap_from_collation(collid);
+			if (casemap == NULL)	/* ICU could not open this collation's locale */
+				casemap = default_casemap;
+		}
+		else
+		{
+			casemap = default_casemap;
+		}
+
+		result = palloc(nbytes + 1); /* add a byte for null termination */
+		/* run desired function */
+		buflen = ucasemap_utf8ToTitle(casemap, result, nbytes + 1, buff, nbytes, &status);
+		/*
+		 * In some corner cases like Turkic `I', resulting char* can be longer than source.
+		 * Accept that we run the transcription twice in these rare cases rather than wasting
+		 * memory or clock cycles trying to figure out the correct size.
+		 */
+		if (buflen > nbytes) {
+			pfree(result);
+			result = palloc(buflen + 1);
+			status = U_ZERO_ERROR;
+			buflen = ucasemap_utf8ToTitle(casemap, result, buflen + 1, buff, nbytes, &status);
+		}
+		if (U_FAILURE(status))
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("ICU error: Could not modify case")));
+		}
+	}
+#endif   /* USE_ICU */
 #ifdef USE_WIDE_UPPER_LOWER
 	else if (pg_database_encoding_max_length() > 1)
 	{
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index a818023..2c6e822 100644
--- src/backend/utils/adt/pg_locale.c
+++ src/backend/utils/adt/pg_locale.c
@@ -63,6 +63,10 @@
 #include "utils/pg_locale.h"
 #include "utils/syscache.h"
 
+#ifdef USE_ICU
+#include <unicode/ucol.h>
+#endif
+
 #ifdef WIN32
 /*
  * This Windows file defines StrNCpy. We don't need it here, so we undefine
@@ -118,6 +122,10 @@ typedef struct
 	bool		ctype_is_c;		/* is collation's LC_CTYPE C? */
 	bool		flags_valid;	/* true if above flags are valid */
 	pg_locale_t locale;			/* locale_t struct, or 0 if not valid */
+#ifdef USE_ICU
+	UCollator  *icu_collator;
+	UCaseMap   *icu_casemap;
+#endif
 } collation_cache_entry;
 
 static HTAB *collation_cache = NULL;
@@ -1127,6 +1135,50 @@ report_newlocale_failure(const char *localename)
 }
 #endif   /* HAVE_LOCALE_T */
 
+#ifdef USE_ICU
+UCollator *
+pg_icu_collator_from_collation(Oid collid)
+{
+	collation_cache_entry *cache_entry;
+
+	/* Callers must pass a valid OID */
+	Assert(OidIsValid(collid));
+
+	/* Return 0 for "default" collation, just in case caller forgets */
+	if (collid == DEFAULT_COLLATION_OID)
+		return NULL;
+
+	cache_entry = lookup_collation_cache(collid, false);
+
+	if (cache_entry->locale == 0)
+	{
+		pg_newlocale_from_collation(collid);
+		cache_entry = lookup_collation_cache(collid, false);
+	}
+	return cache_entry->icu_collator;
+}
+
+UCaseMap *pg_icu_casemap_from_collation(Oid collid)
+{
+	collation_cache_entry *cache_entry;
+
+	/* Callers must pass a valid OID */
+	Assert(OidIsValid(collid));
+
+	/* Return 0 for "default" collation, just in case caller forgets */
+	if (collid == DEFAULT_COLLATION_OID)
+		return NULL;
+
+	cache_entry = lookup_collation_cache(collid, false);
+
+	if (cache_entry->locale == 0)
+	{
+		pg_newlocale_from_collation(collid);
+		cache_entry = lookup_collation_cache(collid, false);
+	}
+	return cache_entry->icu_casemap;
+}
+#endif
 
 /*
  * Create a locale_t from a collation OID.  Results are cached for the
@@ -1176,6 +1228,32 @@ pg_newlocale_from_collation(Oid collid)
 		collcollate = NameStr(collform->collcollate);
 		collctype = NameStr(collform->collctype);
 
+#ifdef USE_ICU
+		{
+			UErrorCode	status = U_ZERO_ERROR;
+			UCollator  *icu_collator = ucol_open(collcollate, &status);
+			UCaseMap   *icu_casemap;
+
+			if (U_FAILURE(status))
+			{
+				ereport(WARNING,
+						(errcode(ERRCODE_WARNING),
+						 errmsg("ICU Error: pg_locale.c, could not open collator %s", collcollate)));
+			}
+			cache_entry->icu_collator = icu_collator;
+
+			/* start clean: ICU calls return at once if status already holds a failure */
+			status = U_ZERO_ERROR;
+			icu_casemap = ucasemap_open(collcollate, U_FOLD_CASE_DEFAULT, &status);
+			if (U_FAILURE(status))
+			{
+				ereport(WARNING,
+						(errcode(ERRCODE_WARNING),
+						 errmsg("ICU Error: pg_locale.c, could not open casemap %s", collcollate)));
+			}
+			cache_entry->icu_casemap = icu_casemap;
+		}
+#endif
 		if (strcmp(collcollate, collctype) == 0)
 		{
 			/* Normal case where they're the same */
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index bf7c0cd..c67240d 100644
--- src/backend/utils/adt/varlena.c
+++ src/backend/utils/adt/varlena.c
@@ -35,6 +35,18 @@
 #include "utils/pg_locale.h"
 #include "utils/sortsupport.h"
 
+#ifdef USE_ICU
+#define U_CHARSET_IS_UTF8 1
+#include <unicode/ucol.h>
+#include <unicode/uloc.h>
+#include <unicode/utypes.h>   /* Basic ICU data types */
+#include <unicode/ucnv.h>     /* C   Converter API    */
+#include <unicode/ustring.h>
+#include <unicode/uchar.h>
+#include "unicode/uiter.h"
+static UCollator *default_collator = NULL;
+#endif   /* USE_ICU */
+
 
 /* GUC variable */
 int			bytea_output = BYTEA_OUTPUT_HEX;
@@ -75,6 +87,9 @@ typedef struct
 #ifdef HAVE_LOCALE_T
 	pg_locale_t locale;
 #endif
+#ifdef USE_ICU
+	UCollator  *icu_collator;
+#endif
 } VarStringSortSupport;
 
 /*
@@ -1396,6 +1411,95 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
 		if ((result == 0) && (len1 != len2))
 			result = (len1 < len2) ? -1 : 1;
 	}
+
+	else if (collid != DEFAULT_COLLATION_OID && !OidIsValid(collid))
+	{
+		/*
+		 * This typically means that the parser could not resolve a
+		 * conflict of implicit collations, so report it that way.
+		 */
+		ereport(ERROR,
+				(errcode(ERRCODE_INDETERMINATE_COLLATION),
+				 errmsg("could not determine which collation to use for string comparison"),
+				 errhint("Use the COLLATE clause to set the collation explicitly.")));
+	}
+	/*
+	 * memcmp() can't tell us which of two unequal strings sorts first,
+	 * but it's a cheap way to tell if they're equal.  Testing shows that
+	 * memcmp() followed by strcoll() is only trivially slower than
+	 * strcoll() by itself, so we don't lose much if this doesn't work out
+	 * very often, and if it does - for example, because there are many
+	 * equal strings in the input - then we win big by avoiding expensive
+	 * collation-aware comparisons.
+	 */
+	else if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
+		result = 0;
+
+#ifdef USE_ICU
+
+	else if (GetDatabaseEncoding() == PG_UTF8)
+	{
+		UCollator  *collator;
+		UErrorCode	status = U_ZERO_ERROR;
+		UCharIterator sIter, tIter;
+
+		/* We keep a static default collator "forever" per session,
+		 * since it is hard coded into the database cluster at initdb
+		 * time anyway. We create it first time we get here. */
+		if (default_collator == NULL)
+		{
+			/* Expect LC_COLLATE to be set to something that ICU
+			 * will understand. This is quite probable, since ICU
+			 * does a lot of heuristics with this argument. I'd
+			 * rather set this in xlog.c, but it seems ICU forgets
+			 * it??? */
+			uloc_setDefault(setlocale(LC_COLLATE, NULL), &status);
+			if(U_FAILURE(status))
+			{
+				ereport(WARNING,
+						(errcode(ERRCODE_WARNING),
+						 errmsg("ICU Error: varlena.c, could not set default lc_collate")));
+				status = U_ZERO_ERROR;	/* else ucol_open() below does nothing */
+			}
+			default_collator = ucol_open(NULL, &status);
+			if (U_FAILURE(status))
+			{
+				ereport(WARNING,
+						(errcode(ERRCODE_WARNING),
+						 errmsg("ICU Error: varlena.c, could not open collator")));
+			}
+		}
+
+		if (collid != DEFAULT_COLLATION_OID)
+			collator = pg_icu_collator_from_collation(collid);
+		else
+			collator = default_collator;
+
+		uiter_setUTF8(&sIter, arg1, len1);
+		uiter_setUTF8(&tIter, arg2, len2);
+		result = ucol_strcollIter(collator, &sIter, &tIter, &status);
+		if (U_FAILURE(status))
+		{
+			ereport(WARNING,
+					(errcode(ERRCODE_WARNING),
+					 errmsg("ICU Error: varlena.c, could not collate")));
+		}
+		/*
+		 * In some locales wcscoll() can claim that nonidentical strings
+		 * are equal. Believing that this might be so also for ICU, and
+		 * believing that would be bad news for a number of
+		 * reasons, we follow Perl's lead and sort "equal" strings
+		 * according to strcmp (on the byte representation).
+		 */
+		if (result == 0)
+		{
+			result = strncmp(arg1, arg2, Min(len1, len2));
+			if ((result == 0) && (len1 != len2))
+				result = (len1 < len2) ? -1 : 1;
+		}
+	}
+#endif   /* USE_ICU */
+
 	else
 	{
 		char		a1buf[TEXTBUFLEN];
@@ -1409,34 +1513,11 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
 
 		if (collid != DEFAULT_COLLATION_OID)
 		{
-			if (!OidIsValid(collid))
-			{
-				/*
-				 * This typically means that the parser could not resolve a
-				 * conflict of implicit collations, so report it that way.
-				 */
-				ereport(ERROR,
-						(errcode(ERRCODE_INDETERMINATE_COLLATION),
-						 errmsg("could not determine which collation to use for string comparison"),
-						 errhint("Use the COLLATE clause to set the collation explicitly.")));
-			}
 #ifdef HAVE_LOCALE_T
 			mylocale = pg_newlocale_from_collation(collid);
 #endif
 		}
 
-		/*
-		 * memcmp() can't tell us which of two unequal strings sorts first,
-		 * but it's a cheap way to tell if they're equal.  Testing shows that
-		 * memcmp() followed by strcoll() is only trivially slower than
-		 * strcoll() by itself, so we don't lose much if this doesn't work out
-		 * very often, and if it does - for example, because there are many
-		 * equal strings in the input - then we win big by avoiding expensive
-		 * collation-aware comparisons.
-		 */
-		if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
-			return 0;
-
 #ifdef WIN32
 		/* Win32 does not have UTF-8, so we need to map to UTF-16 */
 		if (GetDatabaseEncoding() == PG_UTF8)
@@ -1771,6 +1852,9 @@ varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar)
 #ifdef HAVE_LOCALE_T
 	pg_locale_t locale = 0;
 #endif
+#ifdef USE_ICU
+	UCollator  *icu_collator = NULL;
+#endif
 
 	/*
 	 * If possible, set ssup->comparator to a function which can be used to
@@ -1828,6 +1912,38 @@ varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar)
 #ifdef HAVE_LOCALE_T
 			locale = pg_newlocale_from_collation(collid);
 #endif
+#ifdef USE_ICU
+			if (GetDatabaseEncoding() == PG_UTF8)
+			{
+				icu_collator = pg_icu_collator_from_collation(collid);
+			}
+		}
+		else if (GetDatabaseEncoding() == PG_UTF8)
+		{
+			/* We keep a static default collator "forever" per session,
+			 * as per discussion in varstr_cmp(). */
+			if (default_collator == NULL)
+			{
+				UErrorCode	status = U_ZERO_ERROR;
+
+				uloc_setDefault(setlocale(LC_COLLATE, NULL), &status);
+				if(U_FAILURE(status))
+				{
+					ereport(WARNING,
+							(errcode(ERRCODE_WARNING),
+							 errmsg("ICU Error: varlena.c, could not set default lc_collate")));
+					status = U_ZERO_ERROR;	/* else ucol_open() below does nothing */
+				}
+				default_collator = ucol_open(NULL, &status);
+				if (U_FAILURE(status))
+				{
+					ereport(WARNING,
+							(errcode(ERRCODE_WARNING),
+							 errmsg("ICU Error: varlena.c, could not open collator")));
+				}
+			}
+			icu_collator = default_collator;
+#endif
 		}
 	}
 
@@ -1879,6 +1995,9 @@ varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar)
 #ifdef HAVE_LOCALE_T
 		sss->locale = locale;
 #endif
+#ifdef USE_ICU
+		sss->icu_collator = icu_collator;
+#endif
 
 		/*
 		 * To avoid somehow confusing a strxfrm() blob and an original string,
@@ -2089,6 +2208,23 @@ varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup)
 		goto done;
 	}
 
+#ifdef USE_ICU
+	if (GetDatabaseEncoding() == PG_UTF8 && sss->icu_collator)
+	{
+		UErrorCode	status = U_ZERO_ERROR;
+		UCharIterator sIter, tIter;
+		uiter_setUTF8(&sIter, a1p, len1);
+		uiter_setUTF8(&tIter, a2p, len2);
+		result = ucol_strcollIter(sss->icu_collator, &sIter, &tIter, &status);
+		if (U_FAILURE(status))
+		{
+			ereport(WARNING,
+					(errcode(ERRCODE_WARNING),
+					 errmsg("ICU Error: varlena.c, could not collate")));
+		}
+	}
+	else
+#endif
 #ifdef HAVE_LOCALE_T
 	if (sss->locale)
 		result = strcoll_l(sss->buf1, sss->buf2, sss->locale);
diff --git a/src/backend/utils/mb/encnames.c b/src/backend/utils/mb/encnames.c
index 11099b8..d411f45 100644
--- src/backend/utils/mb/encnames.c
+++ src/backend/utils/mb/encnames.c
@@ -403,6 +403,124 @@ const pg_enc2gettext pg_enc2gettext_tbl[] =
 };
 
 
+#ifdef USE_ICU
+/*
+ * Try to map most internal character encodings to the proper and
+ * preferred IANA string. Use this in mbutils.c to feed ICU info about
+ * the database's character encoding.
+ *
+ * The entries are not in pg_enc order and not every encoding is listed,
+ * so look entries up by their encoding field; a NULL name ends the table.
+ *
+ * Palle Girgensohn, 2005
+ */
+
+pg_enc2name pg_enc2iananame_tbl[] =
+{
+	{
+		"US-ASCII", PG_SQL_ASCII
+	},
+	{
+		"EUC-JP", PG_EUC_JP
+	},
+	{
+		"GB2312", PG_EUC_CN
+	},
+	{
+		"EUC-KR", PG_EUC_KR
+	},
+	{
+		"ISO-2022-CN", PG_EUC_TW
+	},
+	{
+		"KS_C_5601-1987", PG_JOHAB	/* either KS_C_5601-1987 or ISO-2022-KR ??? */
+	},
+	{
+		"UTF-8", PG_UTF8
+	},
+	{
+		"MULE_INTERNAL", PG_MULE_INTERNAL	/* is not for real */
+	},
+	{
+		"ISO-8859-1", PG_LATIN1
+	},
+	{
+		"ISO-8859-2", PG_LATIN2
+	},
+	{
+		"ISO-8859-3", PG_LATIN3
+	},
+	{
+		"ISO-8859-4", PG_LATIN4
+	},
+	{
+		"ISO-8859-9", PG_LATIN5
+	},
+	{
+		"ISO-8859-10", PG_LATIN6
+	},
+	{
+		"ISO-8859-13", PG_LATIN7
+	},
+	{
+		"ISO-8859-14", PG_LATIN8
+	},
+	{
+		"ISO-8859-15", PG_LATIN9
+	},
+	{
+		"ISO-8859-16", PG_LATIN10
+	},
+	{
+		"windows-1256", PG_WIN1256
+	},
+	{
+		"windows-874", PG_WIN874
+	},
+	{
+		"KOI8-R", PG_KOI8R
+	},
+	{
+		"windows-1251", PG_WIN1251
+	},
+	{
+		"ISO-8859-5", PG_ISO_8859_5
+	},
+	{
+		"ISO-8859-6", PG_ISO_8859_6
+	},
+	{
+		"ISO-8859-7", PG_ISO_8859_7
+	},
+	{
+		"ISO-8859-8", PG_ISO_8859_8
+	},
+	{
+		"windows-1250", PG_WIN1250
+	},
+	{
+		"Shift_JIS", PG_SJIS
+	},
+	{
+		"Big5", PG_BIG5
+	},
+	{
+		"GBK", PG_GBK
+	},
+	{
+		"cp949", PG_UHC
+	},
+	{
+		"GB18030", PG_GB18030
+	},
+	{
+		NULL, _PG_LAST_ENCODING_
+	}
+};
+#endif   /* USE_ICU */
+
+
+
 /* ----------
  * Encoding checks, for error returns -1 else encoding id
  * ----------
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index 7f1c881..7b7bc01 100644
--- src/backend/utils/mb/mbutils.c
+++ src/backend/utils/mb/mbutils.c
@@ -40,6 +40,10 @@
 #include "utils/builtins.h"
 #include "utils/memutils.h"
 #include "utils/syscache.h"
+#ifdef USE_ICU
+#define U_CHARSET_IS_UTF8 1
+#include <unicode/ucnv.h>
+#endif   /* USE_ICU */
 
 /*
  * When converting strings between different encodings, we assume that space
@@ -913,6 +917,21 @@ SetDatabaseEncoding(int encoding)
 
 	DatabaseEncoding = &pg_enc2name_tbl[encoding];
 	Assert(DatabaseEncoding->encoding == encoding);
+#ifdef USE_ICU
+	{
+		const pg_enc2name *iana;
+
+		/* the IANA table is not ordered by pg_enc, so search it by id */
+		for (iana = pg_enc2iananame_tbl; iana->name != NULL; iana++)
+		{
+			if (iana->encoding == encoding)
+			{
+				ucnv_setDefaultName(iana->name);
+				break;
+			}
+		}
+	}
+#endif
 }
 
 void
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 24e8d0d..11da225 100644
--- src/include/mb/pg_wchar.h
+++ src/include/mb/pg_wchar.h
@@ -321,6 +321,10 @@ typedef struct pg_enc2name
 
 extern const pg_enc2name pg_enc2name_tbl[];
 
+#ifdef USE_ICU
+extern pg_enc2name pg_enc2iananame_tbl[];
+#endif
+
 /*
  * Encoding names for gettext
  */
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index b621ff2..de58917 100644
--- src/include/pg_config.h.in
+++ src/include/pg_config.h.in
@@ -288,6 +288,12 @@
 /* Define to 1 if you have the `crypto' library (-lcrypto). */
 #undef HAVE_LIBCRYPTO
 
+/* Define to 1 if you have the `icui18n' library (-licui18n). */
+#undef HAVE_LIBICUI18N
+
+/* Define to 1 if you have the `icuuc' library (-licuuc). */
+#undef HAVE_LIBICUUC
+
 /* Define to 1 if you have the `ldap' library (-lldap). */
 #undef HAVE_LIBLDAP
 
@@ -796,6 +802,9 @@
 /* Define to 1 to build with BSD Authentication support. (--with-bsd-auth) */
 #undef USE_BSD_AUTH
 
+/* Define to build with ICU support. (--with-icu) */
+#undef USE_ICU
+
 /* Define to 1 if you want float4 values to be passed by value.
    (--enable-float4-byval) */
 #undef USE_FLOAT4_BYVAL
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 0a4b9f7..d750536 100644
--- src/include/utils/pg_locale.h
+++ src/include/utils/pg_locale.h
@@ -19,6 +19,12 @@
 
 #include "utils/guc.h"
 
+#ifdef USE_ICU
+#define U_CHARSET_IS_UTF8 1
+#include <unicode/utypes.h>
+#include <unicode/ucol.h>
+#include <unicode/ucasemap.h>
+#endif
 
 /* GUC settings */
 extern char *locale_messages;
@@ -71,6 +77,10 @@ typedef locale_t pg_locale_t;
 typedef int pg_locale_t;
 #endif
 
+#ifdef USE_ICU
+extern UCollator * pg_icu_collator_from_collation(Oid collid);
+extern UCaseMap * pg_icu_casemap_from_collation(Oid collid);
+#endif
 extern pg_locale_t pg_newlocale_from_collation(Oid collid);
 
 /* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */