mirror of
https://github.com/beard7n/bsdports.git
synced 2026-04-12 11:31:18 +02:00
911 lines
24 KiB
Diff
911 lines
24 KiB
Diff
diff --git a/.gitignore b/.gitignore
|
|
index cbf8d79..8218549 100644
|
|
--- .gitignore
|
|
+++ .gitignore
|
|
@@ -38,3 +38,5 @@ lib*.pc
|
|
/Debug/
|
|
/Release/
|
|
/tmp_install/
|
|
+
|
|
+/configure
|
|
diff --git a/configure.in b/configure.in
|
|
index 598fbd8..b83545c 100644
|
|
--- configure.in
|
|
+++ configure.in
|
|
@@ -730,6 +730,16 @@ AC_SUBST(with_systemd)
|
|
AC_MSG_RESULT([$with_systemd])
|
|
|
|
#
|
|
+# ICU
|
|
+#
|
|
+AC_MSG_CHECKING([whether to build with ICU support])
|
|
+PGAC_ARG_BOOL(with, icu, no, [ --with-icu build with ICU support],
|
|
+ [AC_DEFINE([USE_ICU], 1, [Define to build with ICU support. (--with-icu)])])
|
|
+AC_MSG_RESULT([$with_icu])
|
|
+AC_SUBST(with_icu)
|
|
+
|
|
+
|
|
+#
|
|
# Readline
|
|
#
|
|
PGAC_ARG_BOOL(with, readline, yes,
|
|
@@ -1120,6 +1130,63 @@ if test "$with_openssl" = yes ; then
|
|
AC_CHECK_FUNCS([SSL_get_current_compression])
|
|
fi
|
|
|
|
+if test "$with_icu" = yes ; then
|
|
+ AC_CHECK_LIB(icui18n, ucol_open_57, [], [
|
|
+ AC_CHECK_LIB(icui18n, ucol_open_56, [], [
|
|
+ AC_CHECK_LIB(icui18n, ucol_open_55, [], [
|
|
+ AC_CHECK_LIB(icui18n, ucol_open_54, [], [
|
|
+ AC_CHECK_LIB(icui18n, ucol_open_53, [], [
|
|
+ AC_CHECK_LIB(icui18n, ucol_open_52, [], [
|
|
+ AC_CHECK_LIB(icui18n, ucol_open_50, [], [
|
|
+ AC_CHECK_LIB(icui18n, ucol_open_48, [], [
|
|
+ AC_CHECK_LIB(icui18n, ucol_open_46, [], [
|
|
+ AC_CHECK_LIB(icui18n, ucol_open_44, [], [
|
|
+ AC_CHECK_LIB(icui18n, ucol_open_43, [], [
|
|
+ AC_CHECK_LIB(icui18n, ucol_open_3_8, [], [
|
|
+ AC_CHECK_LIB(icui18n, ucol_open_3_6, [], [
|
|
+ AC_CHECK_LIB(icui18n, ucol_open, [], [AC_MSG_ERROR([library 'icui18n' is required for ICU])])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ AC_CHECK_LIB(icuuc, ucnv_fromUChars_57, [], [
|
|
+ AC_CHECK_LIB(icuuc, ucnv_fromUChars_56, [], [
|
|
+ AC_CHECK_LIB(icuuc, ucnv_fromUChars_55, [], [
|
|
+ AC_CHECK_LIB(icuuc, ucnv_fromUChars_54, [], [
|
|
+ AC_CHECK_LIB(icuuc, ucnv_fromUChars_53, [], [
|
|
+ AC_CHECK_LIB(icuuc, ucnv_fromUChars_52, [], [
|
|
+ AC_CHECK_LIB(icuuc, ucnv_fromUChars_50, [], [
|
|
+ AC_CHECK_LIB(icuuc, ucnv_fromUChars_48, [], [
|
|
+ AC_CHECK_LIB(icuuc, ucnv_fromUChars_46, [], [
|
|
+ AC_CHECK_LIB(icuuc, ucnv_fromUChars_44, [], [
|
|
+ AC_CHECK_LIB(icuuc, ucnv_fromUChars_43, [], [
|
|
+ AC_CHECK_LIB(icuuc, ucnv_fromUChars_3_8, [], [
|
|
+ AC_CHECK_LIB(icuuc, ucnv_fromUChars_3_6, [], [
|
|
+ AC_CHECK_LIB(icuuc, ucnv_fromUChars, [], [AC_MSG_ERROR([library 'icuuc' is required for ICU])])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+ ])
|
|
+fi
|
|
+
|
|
if test "$with_pam" = yes ; then
|
|
AC_CHECK_LIB(pam, pam_start, [], [AC_MSG_ERROR([library 'pam' is required for PAM])])
|
|
fi
|
|
@@ -1273,6 +1340,10 @@ if test "$with_openssl" = yes ; then
|
|
AC_CHECK_HEADER(openssl/err.h, [], [AC_MSG_ERROR([header file <openssl/err.h> is required for OpenSSL])])
|
|
fi
|
|
|
|
+if test "$with_icu" = yes ; then
|
|
+ AC_CHECK_HEADER(unicode/utypes.h, [], [AC_MSG_ERROR([header file <unicode/utypes.h> is required for ICU])])
|
|
+fi
|
|
+
|
|
if test "$with_pam" = yes ; then
|
|
AC_CHECK_HEADERS(security/pam_appl.h, [],
|
|
[AC_CHECK_HEADERS(pam/pam_appl.h, [],
|
|
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
|
|
index bbd97dc..6d8886e 100644
|
|
--- src/backend/utils/adt/formatting.c
|
|
+++ src/backend/utils/adt/formatting.c
|
|
@@ -92,6 +92,12 @@
|
|
#include "utils/numeric.h"
|
|
#include "utils/pg_locale.h"
|
|
|
|
+#ifdef USE_ICU
|
|
+#define U_CHARSET_IS_UTF8 1
|
|
+#include <unicode/uchar.h>
|
|
+#include <unicode/ucasemap.h>
|
|
+#endif /* USE_ICU */
|
|
+
|
|
/* ----------
|
|
* Routines type
|
|
* ----------
|
|
@@ -940,6 +946,11 @@ typedef struct NUMProc
|
|
} NUMProc;
|
|
|
|
|
|
+#ifdef USE_ICU
|
|
+static UCaseMap *default_casemap = NULL; /* used for UTF-8 transcriptions */
|
|
+#endif /* USE_ICU */
|
|
+
|
|
+
|
|
/* ----------
|
|
* Functions
|
|
* ----------
|
|
@@ -1491,6 +1502,68 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
|
|
{
|
|
result = asc_tolower(buff, nbytes);
|
|
}
|
|
+#ifdef USE_ICU
|
|
+ else if (GetDatabaseEncoding() == PG_UTF8) {
|
|
+ /*
|
|
+ * ICU path for UTF-8 databases: map case directly on the UTF-8 bytes
|
|
+ */
|
|
+ int32_t buflen; /* ICU reports lengths as int32_t */
|
|
+ UErrorCode status = U_ZERO_ERROR;
|
|
+ UCaseMap *casemap;
|
|
+
|
|
+ if (default_casemap == NULL)
|
|
+ {
|
|
+ default_casemap = ucasemap_open(NULL, U_FOLD_CASE_DEFAULT, &status);
|
|
+ if (U_FAILURE(status))
|
|
+ {
|
|
+ ereport(ERROR,
|
|
+ (errcode(ERRCODE_INTERNAL_ERROR), /* a UErrorCode is not a SQLSTATE */
|
|
+ errmsg("ICU error: formatting.c, could not get UCaseMap: %s", u_errorName(status))));
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (collid != DEFAULT_COLLATION_OID)
|
|
+ {
|
|
+ if (!OidIsValid(collid))
|
|
+ {
|
|
+ /*
|
|
+ * This typically means that the parser could not resolve a
|
|
+ * conflict of implicit collations, so report it that way.
|
|
+ */
|
|
+ ereport(ERROR,
|
|
+ (errcode(ERRCODE_INDETERMINATE_COLLATION),
|
|
+ errmsg("could not determine which collation to use for lower() function"),
|
|
+ errhint("Use the COLLATE clause to set the collation explicitly.")));
|
|
+ }
|
|
+ casemap = pg_icu_casemap_from_collation(collid);
|
|
+ }
|
|
+ else
|
|
+ casemap = default_casemap;
|
|
+ if (casemap == NULL) /* no casemap cached for this collation: don't hand ICU a NULL */
|
|
+ casemap = default_casemap;
|
|
+
|
|
+ result = palloc(nbytes + 1); /* add a byte for null termination */
|
|
+ /* first attempt: assume the result is no longer than the source */
|
|
+ buflen = ucasemap_utf8ToLower(casemap, result, nbytes + 1, buff, nbytes, &status);
|
|
+ /*
|
|
+ * In some corner cases like Turkic `I', the result can be longer than the source.
|
|
+ * ICU then returns the length it needs, so we simply run the conversion a second
|
|
+ * time with a big enough buffer rather than guessing the size up front.
|
|
+ */
|
|
+ if (buflen > 0 && (size_t) buflen > nbytes) {
|
|
+ pfree(result);
|
|
+ result = palloc(buflen + 1);
|
|
+ status = U_ZERO_ERROR; /* clear the overflow status of the first attempt */
|
|
+ buflen = ucasemap_utf8ToLower(casemap, result, buflen + 1, buff, nbytes, &status);
|
|
+ }
|
|
+ if (U_FAILURE(status))
|
|
+ {
|
|
+ ereport(ERROR,
|
|
+ (errcode(ERRCODE_INTERNAL_ERROR),
|
|
+ errmsg("ICU error: Could not modify case: %s", u_errorName(status))));
|
|
+ }
|
|
+ }
|
|
+#endif /* USE_ICU */
|
|
#ifdef USE_WIDE_UPPER_LOWER
|
|
else if (pg_database_encoding_max_length() > 1)
|
|
{
|
|
@@ -1611,6 +1684,68 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
|
|
{
|
|
result = asc_toupper(buff, nbytes);
|
|
}
|
|
+#ifdef USE_ICU
|
|
+ else if (GetDatabaseEncoding() == PG_UTF8) {
|
|
+ /*
|
|
+ * ICU path for UTF-8 databases: map case directly on the UTF-8 bytes
|
|
+ */
|
|
+ int32_t buflen; /* ICU reports lengths as int32_t */
|
|
+ UErrorCode status = U_ZERO_ERROR;
|
|
+ UCaseMap *casemap;
|
|
+
|
|
+ if (default_casemap == NULL)
|
|
+ {
|
|
+ default_casemap = ucasemap_open(NULL, U_FOLD_CASE_DEFAULT, &status);
|
|
+ if (U_FAILURE(status))
|
|
+ {
|
|
+ ereport(ERROR,
|
|
+ (errcode(ERRCODE_INTERNAL_ERROR), /* a UErrorCode is not a SQLSTATE */
|
|
+ errmsg("ICU error: formatting.c, could not get UCaseMap: %s", u_errorName(status))));
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (collid != DEFAULT_COLLATION_OID)
|
|
+ {
|
|
+ if (!OidIsValid(collid))
|
|
+ {
|
|
+ /*
|
|
+ * This typically means that the parser could not resolve a
|
|
+ * conflict of implicit collations, so report it that way.
|
|
+ */
|
|
+ ereport(ERROR,
|
|
+ (errcode(ERRCODE_INDETERMINATE_COLLATION),
|
|
+ errmsg("could not determine which collation to use for upper() function"),
|
|
+ errhint("Use the COLLATE clause to set the collation explicitly.")));
|
|
+ }
|
|
+ casemap = pg_icu_casemap_from_collation(collid);
|
|
+ }
|
|
+ else
|
|
+ casemap = default_casemap;
|
|
+ if (casemap == NULL) /* no casemap cached for this collation: don't hand ICU a NULL */
|
|
+ casemap = default_casemap;
|
|
+
|
|
+ result = palloc(nbytes + 1); /* add a byte for null termination */
|
|
+ /* first attempt: assume the result is no longer than the source */
|
|
+ buflen = ucasemap_utf8ToUpper(casemap, result, nbytes + 1, buff, nbytes, &status);
|
|
+ /*
|
|
+ * In some corner cases like Turkic `I', the result can be longer than the source.
|
|
+ * ICU then returns the length it needs, so we simply run the conversion a second
|
|
+ * time with a big enough buffer rather than guessing the size up front.
|
|
+ */
|
|
+ if (buflen > 0 && (size_t) buflen > nbytes) {
|
|
+ pfree(result);
|
|
+ result = palloc(buflen + 1);
|
|
+ status = U_ZERO_ERROR; /* clear the overflow status of the first attempt */
|
|
+ buflen = ucasemap_utf8ToUpper(casemap, result, buflen + 1, buff, nbytes, &status);
|
|
+ }
|
|
+ if (U_FAILURE(status))
|
|
+ {
|
|
+ ereport(ERROR,
|
|
+ (errcode(ERRCODE_INTERNAL_ERROR),
|
|
+ errmsg("ICU error: Could not modify case: %s", u_errorName(status))));
|
|
+ }
|
|
+ }
|
|
+#endif /* USE_ICU */
|
|
#ifdef USE_WIDE_UPPER_LOWER
|
|
else if (pg_database_encoding_max_length() > 1)
|
|
{
|
|
@@ -1732,6 +1867,69 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
|
|
{
|
|
result = asc_initcap(buff, nbytes);
|
|
}
|
|
+#ifdef USE_ICU
|
|
+ else if (GetDatabaseEncoding() == PG_UTF8)
|
|
+ {
|
|
+ /*
|
|
+ * ICU path for UTF-8 databases: title-case directly on the UTF-8 bytes
|
|
+ */
|
|
+ int32_t buflen; /* ICU reports lengths as int32_t */
|
|
+ UErrorCode status = U_ZERO_ERROR;
|
|
+ UCaseMap *casemap;
|
|
+
|
|
+ if (default_casemap == NULL)
|
|
+ {
|
|
+ default_casemap = ucasemap_open(NULL, U_FOLD_CASE_DEFAULT, &status);
|
|
+ if (U_FAILURE(status))
|
|
+ {
|
|
+ ereport(ERROR,
|
|
+ (errcode(ERRCODE_INTERNAL_ERROR), /* a UErrorCode is not a SQLSTATE */
|
|
+ errmsg("ICU error: formatting.c, could not get UCaseMap: %s", u_errorName(status))));
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (collid != DEFAULT_COLLATION_OID)
|
|
+ {
|
|
+ if (!OidIsValid(collid))
|
|
+ {
|
|
+ /*
|
|
+ * This typically means that the parser could not resolve a
|
|
+ * conflict of implicit collations, so report it that way.
|
|
+ */
|
|
+ ereport(ERROR,
|
|
+ (errcode(ERRCODE_INDETERMINATE_COLLATION),
|
|
+ errmsg("could not determine which collation to use for initcap() function"),
|
|
+ errhint("Use the COLLATE clause to set the collation explicitly.")));
|
|
+ }
|
|
+ casemap = pg_icu_casemap_from_collation(collid);
|
|
+ }
|
|
+ else
|
|
+ casemap = default_casemap;
|
|
+ if (casemap == NULL) /* no casemap cached for this collation: don't hand ICU a NULL */
|
|
+ casemap = default_casemap;
|
|
+
|
|
+ result = palloc(nbytes + 1); /* add a byte for null termination */
|
|
+ /* first attempt: assume the result is no longer than the source */
|
|
+ buflen = ucasemap_utf8ToTitle(casemap, result, nbytes + 1, buff, nbytes, &status);
|
|
+ /*
|
|
+ * In some corner cases like Turkic `I', the result can be longer than the source.
|
|
+ * ICU then returns the length it needs, so we simply run the conversion a second
|
|
+ * time with a big enough buffer rather than guessing the size up front.
|
|
+ */
|
|
+ if (buflen > 0 && (size_t) buflen > nbytes) {
|
|
+ pfree(result);
|
|
+ result = palloc(buflen + 1);
|
|
+ status = U_ZERO_ERROR; /* clear the overflow status of the first attempt */
|
|
+ buflen = ucasemap_utf8ToTitle(casemap, result, buflen + 1, buff, nbytes, &status);
|
|
+ }
|
|
+ if (U_FAILURE(status))
|
|
+ {
|
|
+ ereport(ERROR,
|
|
+ (errcode(ERRCODE_INTERNAL_ERROR),
|
|
+ errmsg("ICU error: Could not modify case: %s", u_errorName(status))));
|
|
+ }
|
|
+ }
|
|
+#endif /* USE_ICU */
|
|
#ifdef USE_WIDE_UPPER_LOWER
|
|
else if (pg_database_encoding_max_length() > 1)
|
|
{
|
|
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
|
|
index a818023..2c6e822 100644
|
|
--- src/backend/utils/adt/pg_locale.c
|
|
+++ src/backend/utils/adt/pg_locale.c
|
|
@@ -63,6 +63,10 @@
|
|
#include "utils/pg_locale.h"
|
|
#include "utils/syscache.h"
|
|
|
|
+#ifdef USE_ICU
|
|
+#include <unicode/ucol.h>
|
|
+#endif
|
|
+
|
|
#ifdef WIN32
|
|
/*
|
|
* This Windows file defines StrNCpy. We don't need it here, so we undefine
|
|
@@ -118,6 +122,10 @@ typedef struct
|
|
bool ctype_is_c; /* is collation's LC_CTYPE C? */
|
|
bool flags_valid; /* true if above flags are valid */
|
|
pg_locale_t locale; /* locale_t struct, or 0 if not valid */
|
|
+#ifdef USE_ICU
|
|
+ UCollator *icu_collator;
|
|
+ UCaseMap *icu_casemap;
|
|
+#endif
|
|
} collation_cache_entry;
|
|
|
|
static HTAB *collation_cache = NULL;
|
|
@@ -1127,6 +1135,50 @@ report_newlocale_failure(const char *localename)
|
|
}
|
|
#endif /* HAVE_LOCALE_T */
|
|
|
|
+#ifdef USE_ICU
|
|
+UCollator *
|
|
+pg_icu_collator_from_collation(Oid collid)
|
|
+{
|
|
+ collation_cache_entry *cache_entry;
|
|
+ /* Returns the ICU collator stored in the collation cache entry for collid. */
|
|
+ /* Callers must pass a valid OID */
|
|
+ Assert(OidIsValid(collid));
|
|
+
|
|
+ /* Return NULL for the "default" collation, just in case caller forgets */
|
|
+ if (collid == DEFAULT_COLLATION_OID)
|
|
+ return NULL;
|
|
+
|
|
+ cache_entry = lookup_collation_cache(collid, false);
|
|
+ /* locale == 0: entry not valid yet; pg_newlocale_from_collation() is what stores icu_collator */
|
|
+ if (cache_entry->locale == 0)
|
|
+ {
|
|
+ pg_newlocale_from_collation(collid);
|
|
+ cache_entry = lookup_collation_cache(collid, false);
|
|
+ }
|
|
+ return cache_entry->icu_collator; /* NOTE(review): may be NULL if opening the collator only warned */
|
|
+}
|
|
+
|
|
+UCaseMap *pg_icu_casemap_from_collation(Oid collid)
|
|
+{
|
|
+ collation_cache_entry *cache_entry;
|
|
+ /* Returns the ICU case map stored in the collation cache entry for collid. */
|
|
+ /* Callers must pass a valid OID */
|
|
+ Assert(OidIsValid(collid));
|
|
+
|
|
+ /* Return NULL for the "default" collation, just in case caller forgets */
|
|
+ if (collid == DEFAULT_COLLATION_OID)
|
|
+ return NULL;
|
|
+
|
|
+ cache_entry = lookup_collation_cache(collid, false);
|
|
+ /* locale == 0: entry not valid yet; pg_newlocale_from_collation() is what stores icu_casemap */
|
|
+ if (cache_entry->locale == 0)
|
|
+ {
|
|
+ pg_newlocale_from_collation(collid);
|
|
+ cache_entry = lookup_collation_cache(collid, false);
|
|
+ }
|
|
+ return cache_entry->icu_casemap; /* NOTE(review): may be NULL if opening the case map only warned */
|
|
+}
|
|
+#endif
|
|
|
|
/*
|
|
* Create a locale_t from a collation OID. Results are cached for the
|
|
@@ -1176,6 +1228,26 @@ pg_newlocale_from_collation(Oid collid)
|
|
collcollate = NameStr(collform->collcollate);
|
|
collctype = NameStr(collform->collctype);
|
|
|
|
+#ifdef USE_ICU
|
|
+ UErrorCode status = U_ZERO_ERROR;
|
|
+ UCollator *icu_collator = ucol_open(collcollate, &status); /* cached alongside the libc locale */
|
|
+ if (U_FAILURE(status))
|
|
+ {
|
|
+ ereport(WARNING,
|
|
+ (errcode(ERRCODE_WARNING), /* a UErrorCode is not a SQLSTATE */
|
|
+ errmsg("ICU Error: pg_locale.c, could not open collator %s: %s", collcollate, u_errorName(status))));
|
|
+ }
|
|
+ cache_entry->icu_collator = icu_collator;
|
|
+ status = U_ZERO_ERROR; /* ICU calls return at once on a failed status; don't let the collator's fate decide the case map's */
|
|
+ UCaseMap *icu_casemap = ucasemap_open(collcollate, U_FOLD_CASE_DEFAULT, &status);
|
|
+ if (U_FAILURE(status))
|
|
+ {
|
|
+ ereport(WARNING,
|
|
+ (errcode(ERRCODE_WARNING),
|
|
+ errmsg("ICU Error: pg_locale.c, could not open casemap %s: %s", collcollate, u_errorName(status))));
|
|
+ }
|
|
+ cache_entry->icu_casemap = icu_casemap;
|
|
+#endif
|
|
if (strcmp(collcollate, collctype) == 0)
|
|
{
|
|
/* Normal case where they're the same */
|
|
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
|
|
index bf7c0cd..c67240d 100644
|
|
--- src/backend/utils/adt/varlena.c
|
|
+++ src/backend/utils/adt/varlena.c
|
|
@@ -35,6 +35,18 @@
|
|
#include "utils/pg_locale.h"
|
|
#include "utils/sortsupport.h"
|
|
|
|
+#ifdef USE_ICU
|
|
+#define U_CHARSET_IS_UTF8 1
|
|
+#include <unicode/uchar.h>
|
|
+#include <unicode/ucasemap.h>
|
|
+#include <unicode/utypes.h> /* Basic ICU data types */
|
|
+#include <unicode/ucnv.h> /* C Converter API */
|
|
+#include <unicode/ucol.h>
|
|
+#include <unicode/uloc.h>
|
|
+#include "unicode/uiter.h"
|
|
+static UCollator *default_collator = NULL;
|
|
+#endif /* USE_ICU */
|
|
+
|
|
|
|
/* GUC variable */
|
|
int bytea_output = BYTEA_OUTPUT_HEX;
|
|
@@ -75,6 +87,9 @@ typedef struct
|
|
#ifdef HAVE_LOCALE_T
|
|
pg_locale_t locale;
|
|
#endif
|
|
+#ifdef USE_ICU
|
|
+ UCollator *icu_collator;
|
|
+#endif
|
|
} VarStringSortSupport;
|
|
|
|
/*
|
|
@@ -1396,6 +1411,94 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
|
|
if ((result == 0) && (len1 != len2))
|
|
result = (len1 < len2) ? -1 : 1;
|
|
}
|
|
+
|
|
+ else if (collid != DEFAULT_COLLATION_OID && !OidIsValid(collid))
|
|
+ {
|
|
+ /*
|
|
+ * This typically means that the parser could not resolve a
|
|
+ * conflict of implicit collations, so report it that way.
|
|
+ */
|
|
+ ereport(ERROR,
|
|
+ (errcode(ERRCODE_INDETERMINATE_COLLATION),
|
|
+ errmsg("could not determine which collation to use for string comparison"),
|
|
+ errhint("Use the COLLATE clause to set the collation explicitly.")));
|
|
+ }
|
|
+ /*
|
|
+ * memcmp() can't tell us which of two unequal strings sorts first,
|
|
+ * but it's a cheap way to tell if they're equal. Testing shows that
|
|
+ * memcmp() followed by strcoll() is only trivially slower than
|
|
+ * strcoll() by itself, so we don't lose much if this doesn't work out
|
|
+ * very often, and if it does - for example, because there are many
|
|
+ * equal strings in the input - then we win big by avoiding expensive
|
|
+ * collation-aware comparisons.
|
|
+ */
|
|
+ else if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
|
|
+ result = 0;
|
|
+
|
|
+#ifdef USE_ICU
|
|
+
|
|
+ else if (GetDatabaseEncoding() == PG_UTF8)
|
|
+ {
|
|
+ UCollator *collator;
|
|
+ UErrorCode status = U_ZERO_ERROR;
|
|
+
|
|
+ /* We keep a static default collator "forever" per session,
|
|
+ * since it is hard coded into the database cluster at initdb
|
|
+ * time anyway. We create it first time we get here. */
|
|
+ if (default_collator == NULL)
|
|
+ {
|
|
+ /* Expect LC_COLLATE to be set to something that ICU
|
|
+ * will understand. This is quite probable, since ICU
|
|
+ * does a lot of heuristics with this argument. I'd
|
|
+ * rather set this in xlog.c, but it seems ICU forgets
|
|
+ * it??? */
|
|
+ uloc_setDefault(setlocale(LC_COLLATE, NULL), &status);
|
|
+ if(U_FAILURE(status))
|
|
+ {
|
|
+ ereport(WARNING,
|
|
+ (errcode(ERRCODE_WARNING), errmsg("ICU Error: varlena.c, could not set default lc_collate: %s", u_errorName(status))));
|
|
+ status = U_ZERO_ERROR; /* otherwise ucol_open() below returns immediately */
|
|
+ }
|
|
+ default_collator = ucol_open(NULL, &status);
|
|
+ if (U_FAILURE(status))
|
|
+ {
|
|
+ ereport(WARNING,
|
|
+ (errcode(ERRCODE_WARNING), /* a UErrorCode is not a SQLSTATE */
|
|
+ errmsg("ICU Error: varlena.c, could not open collator: %s", u_errorName(status))));
|
|
+ }
|
|
+ }
|
|
+ status = U_ZERO_ERROR; /* a setup failure above must not make the comparison a silent no-op */
|
|
+ if (collid != DEFAULT_COLLATION_OID)
|
|
+ collator = pg_icu_collator_from_collation(collid);
|
|
+ else
|
|
+ collator = default_collator;
|
|
+
|
|
+ UCharIterator sIter, tIter; /* iterate over the UTF-8 bytes in place */
|
|
+ uiter_setUTF8(&sIter, arg1, len1);
|
|
+ uiter_setUTF8(&tIter, arg2, len2);
|
|
+ result = ucol_strcollIter(collator, &sIter, &tIter, &status);
|
|
+ if (U_FAILURE(status))
|
|
+ {
|
|
+ ereport(WARNING,
|
|
+ (errcode(ERRCODE_WARNING),
|
|
+ errmsg("ICU Error: varlena.c, could not collate: %s", u_errorName(status))));
|
|
+ }
|
|
+ /*
|
|
+ * In some locales wcscoll() can claim that nonidentical strings
|
|
+ * are equal. Believing that this might be so also for ICU, and
|
|
+ * believing that would be bad news for a number of
|
|
+ * reasons, we follow Perl's lead and sort "equal" strings
|
|
+ * according to strcmp (on the byte representation).
|
|
+ */
|
|
+ if (result == 0)
|
|
+ {
|
|
+ result = strncmp(arg1, arg2, Min(len1, len2));
|
|
+ if ((result == 0) && (len1 != len2))
|
|
+ result = (len1 < len2) ? -1 : 1;
|
|
+ }
|
|
+ }
|
|
+#endif /* USE_ICU */
|
|
+
|
|
else
|
|
{
|
|
char a1buf[TEXTBUFLEN];
|
|
@@ -1409,34 +1512,11 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
|
|
|
|
if (collid != DEFAULT_COLLATION_OID)
|
|
{
|
|
- if (!OidIsValid(collid))
|
|
- {
|
|
- /*
|
|
- * This typically means that the parser could not resolve a
|
|
- * conflict of implicit collations, so report it that way.
|
|
- */
|
|
- ereport(ERROR,
|
|
- (errcode(ERRCODE_INDETERMINATE_COLLATION),
|
|
- errmsg("could not determine which collation to use for string comparison"),
|
|
- errhint("Use the COLLATE clause to set the collation explicitly.")));
|
|
- }
|
|
#ifdef HAVE_LOCALE_T
|
|
mylocale = pg_newlocale_from_collation(collid);
|
|
#endif
|
|
}
|
|
|
|
- /*
|
|
- * memcmp() can't tell us which of two unequal strings sorts first,
|
|
- * but it's a cheap way to tell if they're equal. Testing shows that
|
|
- * memcmp() followed by strcoll() is only trivially slower than
|
|
- * strcoll() by itself, so we don't lose much if this doesn't work out
|
|
- * very often, and if it does - for example, because there are many
|
|
- * equal strings in the input - then we win big by avoiding expensive
|
|
- * collation-aware comparisons.
|
|
- */
|
|
- if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
|
|
- return 0;
|
|
-
|
|
#ifdef WIN32
|
|
/* Win32 does not have UTF-8, so we need to map to UTF-16 */
|
|
if (GetDatabaseEncoding() == PG_UTF8)
|
|
@@ -1771,6 +1851,9 @@ varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar)
|
|
#ifdef HAVE_LOCALE_T
|
|
pg_locale_t locale = 0;
|
|
#endif
|
|
+#ifdef USE_ICU
|
|
+ UCollator *icu_collator = NULL;
|
|
+#endif
|
|
|
|
/*
|
|
* If possible, set ssup->comparator to a function which can be used to
|
|
@@ -1828,6 +1911,37 @@ varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar)
|
|
#ifdef HAVE_LOCALE_T
|
|
locale = pg_newlocale_from_collation(collid);
|
|
#endif
|
|
+#ifdef USE_ICU
|
|
+ if (GetDatabaseEncoding() == PG_UTF8)
|
|
+ {
|
|
+ icu_collator = pg_icu_collator_from_collation(collid);
|
|
+ }
|
|
+ }
|
|
+ else if (GetDatabaseEncoding() == PG_UTF8)
|
|
+ {
|
|
+ /* We keep a static default collator "forever" per session,
|
|
+ * as per discussion in varstr_cmp(). */
|
|
+ if (default_collator == NULL)
|
|
+ {
|
|
+ UErrorCode status = U_ZERO_ERROR;
|
|
+
|
|
+ uloc_setDefault(setlocale(LC_COLLATE, NULL), &status);
|
|
+ if(U_FAILURE(status))
|
|
+ {
|
|
+ ereport(WARNING,
|
|
+ (errcode(ERRCODE_WARNING), errmsg("ICU Error: varlena.c, could not set default lc_collate: %s", u_errorName(status))));
|
|
+ status = U_ZERO_ERROR; /* otherwise ucol_open() below returns immediately */
|
|
+ }
|
|
+ default_collator = ucol_open(NULL, &status);
|
|
+ if (U_FAILURE(status))
|
|
+ {
|
|
+ ereport(WARNING,
|
|
+ (errcode(ERRCODE_WARNING), /* a UErrorCode is not a SQLSTATE */
|
|
+ errmsg("ICU Error: varlena.c, could not open collator: %s", u_errorName(status))));
|
|
+ }
|
|
+ }
|
|
+ icu_collator = default_collator;
|
|
+#endif
|
|
}
|
|
}
|
|
|
|
@@ -1879,6 +1993,9 @@ varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar)
|
|
#ifdef HAVE_LOCALE_T
|
|
sss->locale = locale;
|
|
#endif
|
|
+#ifdef USE_ICU
|
|
+ sss->icu_collator = icu_collator;
|
|
+#endif
|
|
|
|
/*
|
|
* To avoid somehow confusing a strxfrm() blob and an original string,
|
|
@@ -2089,6 +2206,23 @@ varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup)
|
|
goto done;
|
|
}
|
|
|
|
+#ifdef USE_ICU
|
|
+ if (GetDatabaseEncoding() == PG_UTF8 && sss->icu_collator)
|
|
+ {
|
|
+ UErrorCode status = U_ZERO_ERROR;
|
|
+ UCharIterator sIter, tIter; /* iterate over the UTF-8 bytes in place */
|
|
+ uiter_setUTF8(&sIter, a1p, len1);
|
|
+ uiter_setUTF8(&tIter, a2p, len2);
|
|
+ result = ucol_strcollIter(sss->icu_collator, &sIter, &tIter, &status);
|
|
+ if (U_FAILURE(status))
|
|
+ {
|
|
+ ereport(WARNING,
|
|
+ (errcode(ERRCODE_WARNING), /* a UErrorCode is not a SQLSTATE */
|
|
+ errmsg("ICU Error: varlena.c, could not collate: %s", u_errorName(status))));
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+#endif
|
|
#ifdef HAVE_LOCALE_T
|
|
if (sss->locale)
|
|
result = strcoll_l(sss->buf1, sss->buf2, sss->locale);
|
|
diff --git a/src/backend/utils/mb/encnames.c b/src/backend/utils/mb/encnames.c
|
|
index 11099b8..d411f45 100644
|
|
--- src/backend/utils/mb/encnames.c
|
|
+++ src/backend/utils/mb/encnames.c
|
|
@@ -403,6 +403,118 @@ const pg_enc2gettext pg_enc2gettext_tbl[] =
|
|
};
|
|
|
|
|
|
+#ifdef USE_ICU
|
|
+/*
|
|
+ * Map internal character encodings to their preferred IANA names.
|
|
+ * SetDatabaseEncoding() in mbutils.c indexes this table by encoding id to tell
|
|
+ * ICU the database's encoding, so entry N must describe encoding id N (NOTE(review): verify against the pg_enc enum order).
|
|
+ *
|
|
+ * Palle Girgensohn, 2005
|
|
+ */
|
|
+
|
|
+pg_enc2name pg_enc2iananame_tbl[] =
|
|
+{
|
|
+ {
|
|
+ "US-ASCII", PG_SQL_ASCII
|
|
+ },
|
|
+ {
|
|
+ "EUC-JP", PG_EUC_JP
|
|
+ },
|
|
+ {
|
|
+ "GB2312", PG_EUC_CN
|
|
+ },
|
|
+ {
|
|
+ "EUC-KR", PG_EUC_KR
|
|
+ },
|
|
+ {
|
|
+ "ISO-2022-CN", PG_EUC_TW
|
|
+ },
|
|
+ {
|
|
+ "KS_C_5601-1987", PG_JOHAB /* either KS_C_5601-1987 or ISO-2022-KR ??? */
|
|
+ },
|
|
+ {
|
|
+ "UTF-8", PG_UTF8
|
|
+ },
|
|
+ {
|
|
+ "MULE_INTERNAL", PG_MULE_INTERNAL /* is not for real */
|
|
+ },
|
|
+ {
|
|
+ "ISO-8859-1", PG_LATIN1
|
|
+ },
|
|
+ {
|
|
+ "ISO-8859-2", PG_LATIN2
|
|
+ },
|
|
+ {
|
|
+ "ISO-8859-3", PG_LATIN3
|
|
+ },
|
|
+ {
|
|
+ "ISO-8859-4", PG_LATIN4
|
|
+ },
|
|
+ {
|
|
+ "ISO-8859-9", PG_LATIN5
|
|
+ },
|
|
+ {
|
|
+ "ISO-8859-10", PG_LATIN6
|
|
+ },
|
|
+ {
|
|
+ "ISO-8859-13", PG_LATIN7
|
|
+ },
|
|
+ {
|
|
+ "ISO-8859-14", PG_LATIN8
|
|
+ },
|
|
+ {
|
|
+ "ISO-8859-15", PG_LATIN9
|
|
+ },
|
|
+ {
|
|
+ "ISO-8859-16", PG_LATIN10
|
|
+ },
|
|
+ {
|
|
+ "windows-1256", PG_WIN1256
|
|
+ },
|
|
+ {
|
|
+ "windows-874", PG_WIN874
|
|
+ },
|
|
+ {
|
|
+ "KOI8-R", PG_KOI8R
|
|
+ },
|
|
+ {
|
|
+ "windows-1251", PG_WIN1251
|
|
+ },
|
|
+ {
|
|
+ "ISO-8859-5", PG_ISO_8859_5
|
|
+ },
|
|
+ {
|
|
+ "ISO-8859-6", PG_ISO_8859_6
|
|
+ },
|
|
+ {
|
|
+ "ISO-8859-7", PG_ISO_8859_7
|
|
+ },
|
|
+ {
|
|
+ "ISO-8859-8", PG_ISO_8859_8
|
|
+ },
|
|
+ {
|
|
+ "windows-1250", PG_WIN1250
|
|
+ },
|
|
+ {
|
|
+ "Shift_JIS", PG_SJIS
|
|
+ },
|
|
+ {
|
|
+ "Big5", PG_BIG5
|
|
+ },
|
|
+ {
|
|
+ "GBK", PG_GBK
|
|
+ },
|
|
+ {
|
|
+ "cp949", PG_UHC
|
|
+ },
|
|
+ {
|
|
+ "GB18030", PG_GB18030
|
|
+ }
|
|
+};
|
|
+#endif /* USE_ICU */
|
|
+
|
|
+
|
|
+
|
|
/* ----------
|
|
* Encoding checks, for error returns -1 else encoding id
|
|
* ----------
|
|
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
|
|
index 7f1c881..7b7bc01 100644
|
|
--- src/backend/utils/mb/mbutils.c
|
|
+++ src/backend/utils/mb/mbutils.c
|
|
@@ -40,6 +40,10 @@
|
|
#include "utils/builtins.h"
|
|
#include "utils/memutils.h"
|
|
#include "utils/syscache.h"
|
|
+#ifdef USE_ICU
|
|
+#define U_CHARSET_IS_UTF8 1
|
|
+#include <unicode/ucnv.h>
|
|
+#endif /* USE_ICU */
|
|
|
|
/*
|
|
* When converting strings between different encodings, we assume that space
|
|
@@ -913,6 +917,9 @@ SetDatabaseEncoding(int encoding)
|
|
|
|
DatabaseEncoding = &pg_enc2name_tbl[encoding];
|
|
Assert(DatabaseEncoding->encoding == encoding);
|
|
+#ifdef USE_ICU
|
|
+ ucnv_setDefaultName((&pg_enc2iananame_tbl[encoding])->name);
|
|
+#endif
|
|
}
|
|
|
|
void
|
|
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
|
|
index 24e8d0d..11da225 100644
|
|
--- src/include/mb/pg_wchar.h
|
|
+++ src/include/mb/pg_wchar.h
|
|
@@ -321,6 +321,10 @@ typedef struct pg_enc2name
|
|
|
|
extern const pg_enc2name pg_enc2name_tbl[];
|
|
|
|
+#ifdef USE_ICU
|
|
+extern pg_enc2name pg_enc2iananame_tbl[];
|
|
+#endif
|
|
+
|
|
/*
|
|
* Encoding names for gettext
|
|
*/
|
|
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
|
|
index b621ff2..de58917 100644
|
|
--- src/include/pg_config.h.in
|
|
+++ src/include/pg_config.h.in
|
|
@@ -288,6 +288,12 @@
|
|
/* Define to 1 if you have the `crypto' library (-lcrypto). */
|
|
#undef HAVE_LIBCRYPTO
|
|
|
|
+/* Define to 1 if you have the `icui18n' library (-licui18n). */
|
|
+#undef HAVE_LIBICUI18N
|
|
+
|
|
+/* Define to 1 if you have the `icuuc' library (-licuuc). */
|
|
+#undef HAVE_LIBICUUC
|
|
+
|
|
/* Define to 1 if you have the `ldap' library (-lldap). */
|
|
#undef HAVE_LIBLDAP
|
|
|
|
@@ -796,6 +802,9 @@
|
|
/* Define to 1 to build with BSD Authentication support. (--with-bsd-auth) */
|
|
#undef USE_BSD_AUTH
|
|
|
|
+/* Define to build with ICU support. (--with-icu) */
|
|
+#undef USE_ICU
|
|
+
|
|
/* Define to 1 if you want float4 values to be passed by value.
|
|
(--enable-float4-byval) */
|
|
#undef USE_FLOAT4_BYVAL
|
|
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
|
|
index 0a4b9f7..d750536 100644
|
|
--- src/include/utils/pg_locale.h
|
|
+++ src/include/utils/pg_locale.h
|
|
@@ -19,6 +19,12 @@
|
|
|
|
#include "utils/guc.h"
|
|
|
|
+#ifdef USE_ICU
|
|
+#define U_CHARSET_IS_UTF8 1
|
|
+#include <unicode/uchar.h>
|
|
+#include <unicode/ucasemap.h>
|
|
+#include <unicode/ucol.h>
|
|
+#endif
|
|
|
|
/* GUC settings */
|
|
extern char *locale_messages;
|
|
@@ -71,6 +77,10 @@ typedef locale_t pg_locale_t;
|
|
typedef int pg_locale_t;
|
|
#endif
|
|
|
|
+#ifdef USE_ICU
|
|
+extern UCollator * pg_icu_collator_from_collation(Oid collid);
|
|
+extern UCaseMap * pg_icu_casemap_from_collation(Oid collid);
|
|
+#endif
|
|
extern pg_locale_t pg_newlocale_from_collation(Oid collid);
|
|
|
|
/* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */
|