SNAP Library, User Reference
2012-10-15 15:06:59
SNAP, a general purpose network analysis and graph mining library
//#ifndef unicode_h
//#define unicode_h

// Includes
//#include "base.h"
#include <new>

typedef int TUniVecIdx;

//-----------------------------------------------------------------------------
// TUniCodec -- a UTF-8 and UTF-16 Encoder/Decoder
//-----------------------------------------------------------------------------

// Error handling modes for the TUniCodec class.
typedef enum TUnicodeErrorHandling_
{
  // What happens when an error occurs:
  uehIgnore = 0,  // - it is silently ignored (nothing is added to the output vector)
  uehThrow = 1,   // - an exception is thrown (TUnicodeException)
  uehReplace = 2, // - the replacement character is added to the output vector
  uehAbort = 3    // - the encoding/decoding process stops immediately
}
TUnicodeErrorHandling;

class TUnicodeException
{
public:
  TStr message;  // error message
  size_t srcIdx; // the position in the source vector where the error occurred
  int srcChar;   // the source character at the position srcIdx
  TUnicodeException(size_t srcIdx_, int srcChar_, const TStr& message_) :
    message(message_), srcIdx(srcIdx_), srcChar(srcChar_) { }
};

typedef enum TUniByteOrder_
{
  boMachineEndian = 0,
  boLittleEndian = 1,
  boBigEndian = 2
}
TUniByteOrder;

typedef enum TUtf16BomHandling_
{
  bomAllowed = 0,  // if a BOM is present, it is used to determine the byte order; otherwise, the default byte order is used
  bomRequired = 1, // if a BOM is present, it is used to determine the byte order; otherwise, an error is reported
  bomIgnored = 2   // the default byte order is used; if a BOM is present, it is treated like any other character
}
TUtf16BomHandling;

class TUniCodec
{
public:
  // 0xfffd is defined as the replacement character by the Unicode standard.
  // By default, it is rendered as a question mark inside a diamond: "<?>".
  enum { DefaultReplacementChar = 0xfffd };

  // The replacement character is inserted into the destination vector
  // if an error occurs in the source vector. By default, this is set
  // to DefaultReplacementChar.
  int replacementChar;
  // The error handling mode.
  TUnicodeErrorHandling errorHandling;
  // There are a number of situations where there is strictly speaking an error in
  // the source data although it can still be decoded in a reasonably meaningful way.
  // If strict == true, these situations are treated as errors. Examples:
  // - when decoding UTF-8:
  //   - a codepoint represented by more bytes than necessary (e.g. one of the characters 0..127
  //     encoded as a two-byte sequence)
  //   - a codepoint > 0x10ffff
  // - when decoding UTF-16:
  //   - a codepoint from the range reserved for the second character of a surrogate pair
  //     is not preceded by a codepoint from the range reserved for the first character of a surrogate pair
  // - when encoding UTF-8:
  //   - a codepoint > 0x10ffff
  // - when encoding UTF-16:
  //   - a codepoint from the range reserved for the second character of a surrogate pair
  //     [note that a codepoint > 0x10ffff, or from the range reserved for the first character of a
  //     surrogate pair, is always an error, even with strict == false]
  bool strict;
  // skipBom == true means: If a byte-order-mark (0xfffe or 0xfeff) occurs at the beginning
  // of the source vector, it is skipped (when decoding).
  // - Note: a BOM is not really useful in UTF-8 encoded data. However, the .NET UTF8Encoding
  //   emits 0xfeff by default as a kind of preamble. It gets encoded as 3 bytes, ef bb bf,
  //   and can be helpful to make the data easier to recognize as UTF-8 encoded data.
  bool skipBom;

  TUniCodec() : replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true)
  {
  }

  TUniCodec(TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_) :
    replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
  {
  }

protected:
  enum {
#define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
    DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
    DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
#undef DefineByte
  };

  typedef TUniVecIdx TVecIdx;
  //friend class TUniChDb;
  friend class TUniCaseFolding;

public:

  //-----------------------------------------------------------------------
  // UTF-8
  //-----------------------------------------------------------------------

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf8(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest = true) const;
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }
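
  // A minimal usage sketch (not part of the original header; the variable names are
  // illustrative only). It assumes the UTF-8 input is stored one byte per element of a
  // TIntV and that decoded codepoints are collected into another TIntV:
  //
  //   TUniCodec codec(uehThrow, /*strict=*/false, TUniCodec::DefaultReplacementChar, /*skipBom=*/true);
  //   TIntV utf8Bytes;     // filled elsewhere, one byte per element
  //   TIntV codePoints;
  //   size_t nDecoded = codec.DecodeUtf8(utf8Bytes, codePoints);
  //   // ... and back again, into a vector of UTF-8 bytes:
  //   TIntV utf8Out;
  //   size_t nEncoded = codec.EncodeUtf8(codePoints, utf8Out);
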
  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf8(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest = true) const;
  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }

  // The following wrappers around the UTF-8 encoder return a TStr containing
  // the UTF-8-encoded version of the input string.
  template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src, size_t srcIdx, const size_t srcCount) const { TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); TStr retVal = &(temp[0]); return retVal; }
  template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src) const { TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }

  //-----------------------------------------------------------------------
  // UTF-16 Decoder
  //-----------------------------------------------------------------------

protected:
  enum {
    Utf16FirstSurrogate = 0xd800,
    Utf16SecondSurrogate = 0xdc00
  };

  static bool IsMachineLittleEndian();

public:

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  // Each element of 'src' is assumed to contain one byte of data.
  // srcCount must be even (though srcIdx doesn't need to be).
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf16FromBytes(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest,
    const TUtf16BomHandling bomHandling = bomAllowed,
    const TUniByteOrder defaultByteOrder = boMachineEndian) const;

  // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings
  // are used to determine if the two bytes of each word should be swapped before further
  // processing. For example, if a BOM is present, it must have the value 0xfeff; if it
  // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
  // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
  // beginning of the source data is used to determine the "original" byte order of the data;
  // if this doesn't match the byte order of the local machine, the two bytes of each word will
  // be swapped during the decoding process.
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf16FromWords(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, bool clrDest,
    const TUtf16BomHandling bomHandling = bomAllowed,
    const TUniByteOrder defaultByteOrder = boMachineEndian) const;

  //-----------------------------------------------------------------------
  // UTF-16 Encoder
  //-----------------------------------------------------------------------

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  //
  // Notes:
  // - UTF-16 *cannot* encode characters above 0x10ffff, so their presence is always
  //   treated as an error, regardless of the value of 'strict'.
  // - Characters from the range Utf16FirstSurrogate through Utf16FirstSurrogate + 1023
  //   cannot be encoded by UTF-16 either, as they would be misinterpreted during decoding
  //   as the first character of a surrogate pair.
  // - Characters from the range Utf16SecondSurrogate through Utf16SecondSurrogate + 1023
  //   can be encoded in principle; however, if strict == true, they are treated as errors.
  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf16ToWords(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
    const TUniByteOrder destByteOrder = boMachineEndian) const;

  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf16ToBytes(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
    const TUniByteOrder destByteOrder = boMachineEndian) const;
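
  // A minimal usage sketch (not part of the original header; variable names are
  // illustrative only). Decoding a UTF-16 byte stream whose byte order is taken from the
  // BOM if one is present, then re-encoding the codepoints as little-endian 16-bit words:
  //
  //   TUniCodec codec;
  //   TIntV utf16Bytes;    // one byte per element
  //   TIntV codePoints;
  //   codec.DecodeUtf16FromBytes(utf16Bytes, 0, utf16Bytes.Len(), codePoints, true,
  //                              bomAllowed, boMachineEndian);
  //   TIntV utf16Words;    // one 16-bit word per element
  //   codec.EncodeUtf16ToWords(codePoints, 0, codePoints.Len(), utf16Words, true,
  //                            /*insertBom=*/true, boLittleEndian);
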
  //-----------------------------------------------------------------------
  // Helper declarations for the test drivers
  //-----------------------------------------------------------------------

protected:

  static uint GetRndUint(TRnd& rnd);
  static uint GetRndUint(TRnd& rnd, uint minVal, uint maxVal);

  //-----------------------------------------------------------------------
  // UTF-8 Test Driver
  //-----------------------------------------------------------------------

protected:
  void TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f);
  // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc',
  // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected.
  void TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc);
public:
  void TestUtf8();

  //-----------------------------------------------------------------------
  // UTF-16 Test Driver
  //-----------------------------------------------------------------------

protected:
  void WordsToBytes(const TIntV& src, TIntV& dest);
  void TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest,
    // Note: insertBom is only used with the encoder. When encoding, 'defaultByteOrder' is used as the destination byte order.
    const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom,
    FILE *f);
  static inline int SwapBytes(int x) {
    return ((x >> 8) & 0xff) | ((x & 0xff) << 8); }
  // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc',
  // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected.
  void TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc,
    const TUtf16BomHandling bomHandling,
    const TUniByteOrder defaultByteOrder,
    const bool insertBom);
public:
  void TestUtf16();

};

//-----------------------------------------------------------------------------
// Case folding
//-----------------------------------------------------------------------------
// Note: there's no need to access this class directly.
// Use TUniChDb::GetCaseFolded() instead.

typedef THash<TInt, TIntV> TIntIntVH;

class TUniCaseFolding
{
protected:
  TIntH cfCommon, cfSimple, cfTurkic;
  TIntIntVH cfFull;

  template<typename TSrcDat, typename TDestDat>
  inline static void AppendVector(const TVec<TSrcDat>& src, TVec<TDestDat>& dest) {
    for (int i = 0; i < src.Len(); i++) dest.Add(src[i]); }
  friend class TUniChDb;
  typedef TUniVecIdx TVecIdx;

public:
  TUniCaseFolding() { }
  explicit TUniCaseFolding(TSIn& SIn) : cfCommon(SIn), cfSimple(SIn), cfTurkic(SIn), cfFull(SIn) { SIn.LoadCs(); }
  void Load(TSIn& SIn) { cfCommon.Load(SIn); cfSimple.Load(SIn); cfFull.Load(SIn); cfTurkic.Load(SIn); SIn.LoadCs(); }
  void Save(TSOut& SOut) const { cfCommon.Save(SOut); cfSimple.Save(SOut); cfFull.Save(SOut); cfTurkic.Save(SOut); SOut.SaveCs(); }
  void Clr() { cfCommon.Clr(); cfSimple.Clr(); cfFull.Clr(); cfTurkic.Clr(); }
  void LoadTxt(const TStr& fileName);

  // Use 'turkic' when processing text in a Turkic language (tr, az). This only affects the uppercase I and I-with-dot-above.
  template<typename TSrcVec, typename TDestCh>
  void Fold(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic) const
  {
    for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
    {
      int c = src[TVecIdx(srcIdx)], i; srcIdx++;
      if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { dest.Add(cfTurkic[i]); continue; }
      if (full && ((i = cfFull.GetKeyId(c)) >= 0)) { AppendVector(cfFull[i], dest); continue; }
      if ((!full) && ((i = cfSimple.GetKeyId(c)) >= 0)) { dest.Add(cfSimple[i]); continue; }
      i = cfCommon.GetKeyId(c); if (i >= 0) dest.Add(cfCommon[i]); else dest.Add(c);
    }
  }

  template<typename TSrcVec>
  void FoldInPlace(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic) const
  {
    for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
    {
      int c = src[TVecIdx(srcIdx)], i;
      if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { src[TVecIdx(srcIdx)] = cfTurkic[i]; continue; }
      if ((i = cfSimple.GetKeyId(c)) >= 0) { src[TVecIdx(srcIdx)] = cfSimple[i]; continue; }
      i = cfCommon.GetKeyId(c); if (i >= 0) src[TVecIdx(srcIdx)] = cfCommon[i];
    }
  }

protected:
  void Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f);
public:
  void Test();
};
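
// A minimal usage sketch (not part of the original header; as noted above, application code
// would normally go through TUniChDb::GetCaseFolded() rather than using this class directly).
// Folding a vector of codepoints after loading the mappings from a CaseFolding.txt file
// (the file path is illustrative):
//
//   TUniCaseFolding folding;
//   folding.LoadTxt("CaseFolding.txt");
//   TIntV codePoints, folded;   // codePoints filled elsewhere
//   folding.Fold(codePoints, 0, codePoints.Len(), folded, /*clrDest=*/true,
//                /*full=*/true, /*turkic=*/false);
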
//-----------------------------------------------------------------------------
// TCodecBase -- an abstract base class for codecs
//-----------------------------------------------------------------------------

class TCodecBase;
typedef TPt<TCodecBase> PCodecBase;
typedef TVec<PCodecBase> TCodecBaseV;

class TCodecBase
{
protected:
  TCRef CRef;
  friend class TPt<TCodecBase>;
public:
  virtual ~TCodecBase() { }

  template<class TCodecImpl>
  static PCodecBase New(); /* {
    return new TCodecWrapper<TCodecImpl>(); } */

  virtual TStr GetName() const = 0;
  virtual void Test() const { }

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
  virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;

  size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const = 0;
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const = 0;

  size_t FromUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t FromUnicode(const TIntV& src, TChA& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t FromUnicode(const TIntV& src, TStr& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
};

//-----------------------------------------------------------------------------
// TCodecWrapper -- a descendant of TCodecBase; relies on a template
// parameter class for the actual implementation of the codec.
//-----------------------------------------------------------------------------
// Thus, if you know in advance that you'll need ISO-8859-2, just use
// T8BitCodec<TEncoding_ISO8859_2>. If you don't know the encoding
// in advance, use a PCodecBase pointing to a suitable specialization
// of TCodecWrapper<...>. You can use TUnicode::GetCodec(TStr& name)
// to obtain a suitable pointer.

template<class TCodecImpl_>
class TCodecWrapper : public TCodecBase
{
public:
  typedef TCodecImpl_ TCodecImpl;
  TCodecImpl impl;
public:

  virtual TStr GetName() const { return impl.GetName(); }

  virtual void Test() const { impl.Test(); }

  virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
    return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
  virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
    return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }

  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
    return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const {
    return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
    TChA buf; size_t retVal = impl.FromUnicode(src, srcIdx, srcCount, buf, false);
    if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
    return retVal; }
};

template<class TCodecImpl>
PCodecBase TCodecBase::New() {
  return new TCodecWrapper<TCodecImpl>();
}
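
// A minimal usage sketch (not part of the original header). Obtaining a codec behind the
// abstract TCodecBase interface when the concrete encoding is only known at run time, then
// converting a TStr to codepoints; TUnicode::GetCodec is the lookup-by-name route mentioned
// in the comment above:
//
//   PCodecBase codec = TCodecBase::New<TCodec_ISO8859_2>();  // or obtained via TUnicode::GetCodec("ISO-8859-2")
//   TIntV codePoints;
//   codec->ToUnicode(TStr("nekaj besedila"), codePoints);
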
//-----------------------------------------------------------------------------
// TVecElt -- a template for determining the type of a vector's elements
//-----------------------------------------------------------------------------

template<class TVector_>
class TVecElt
{
};

template<class TDat>
class TVecElt<TVec<TDat> >
{
public:
  typedef TVec<TDat> TVector;
  typedef TDat TElement;
  static inline void Add(TVector& vector, const TElement& element) { vector.Add(element); }
};

template<>
class TVecElt<TChA>
{
public:
  typedef TChA TVector;
  typedef char TElement;
  static inline void Add(TVector& vector, const TElement& element) { vector += element; }
};


//-----------------------------------------------------------------------------
// T8BitCodec -- a class for converting between 8-bit encodings and Unicode
//-----------------------------------------------------------------------------

class TEncoding_ISO8859_1
{
public:
  static inline TStr GetName() { return "ISO-8859-1"; }
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255); return c; }
  static int FromUnicode(int c) { if (0 <= c && c <= 255) return c; else return -1; }
};

class TEncoding_ISO8859_2 // ISO Latin 2
{
public:
  static inline TStr GetName() { return "ISO-8859-2"; }
  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0xa0) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else return -1; }
};

class TEncoding_ISO8859_3
{
public:
  static inline TStr GetName() { return "ISO-8859-3"; }
  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0xa0) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2d8 <= c && c < 0x2da) return fromUnicodeTable2[c - 0x2d8];
    else return -1; }
};

class TEncoding_ISO8859_4
{
public:
  static inline TStr GetName() { return "ISO-8859-4"; }
  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0xa0) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else return -1; }
};

class TEncoding_YuAscii
{
public:
  static const int uniChars[10], yuAsciiChars[10];
  static inline TStr GetName() { return "YU-ASCII"; }
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    for (int i = 0; i < int(sizeof(yuAsciiChars) / sizeof(yuAsciiChars[0])); i++)
      if (c == yuAsciiChars[i]) return uniChars[i];
    return c; }
  static int FromUnicode(int c) {
    for (int i = 0; i < int(sizeof(uniChars) / sizeof(uniChars[0])); i++)
      if (c == uniChars[i]) return yuAsciiChars[i];
      else if (c == yuAsciiChars[i]) return -1;
    if (0 <= c && c <= 255) return c; else return -1; }
};

class TEncoding_CP437 // DOS US
{
public:
  static inline TStr GetName() { return "CP437"; }
  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[6 * 16], fromUnicodeTable2[4 * 16], fromUnicodeTable3[6 * 16], fromUnicodeTable4[11 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0x80) return c;
    else if (0xa0 <= c && c < 0x100) return fromUnicodeTable1[c - 0xa0];
    else if (0x390 <= c && c < 0x3d0) return fromUnicodeTable2[c - 0x390];
    else if (0x2210 <= c && c < 0x2270) return fromUnicodeTable3[c - 0x2210];
    else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable4[c - 0x2500];
    else if (c == 0x192) return 0x9f;
    else if (c == 0x207f) return 0xfc;
    else if (c == 0x20a7) return 0x9e;
    else if (c == 0x2310) return 0xa9;
    else if (c == 0x2320) return 0xf4;
    else if (c == 0x2321) return 0xf5;
    else return -1; }
};

class TEncoding_CP852 // DOS Latin 2
{
public:
  static inline TStr GetName() { return "CP852"; }
  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[11 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0x80) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable3[c - 0x2500];
    else return -1; }
};

class TEncoding_CP1250 // Windows-1250, similar to ISO Latin 2
{
public:
  static inline TStr GetName() { return "CP1250"; }
  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[3 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0x80) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else if (0x2010 <= c && c < 0x2040) return fromUnicodeTable3[c - 0x2010];
    else if (c == 0x20ac) return 0x80;
    else if (c == 0x2122) return 0x99;
    else return -1; }
};

template<class TEncoding_>
class T8BitCodec
{
protected:
  typedef TUniVecIdx TVecIdx;
public:
  typedef TEncoding_ TEncoding;
  TUnicodeErrorHandling errorHandling;
  int replacementChar;

  T8BitCodec() : errorHandling(uehIgnore), replacementChar(TUniCodec::DefaultReplacementChar) { }
  T8BitCodec(TUnicodeErrorHandling errorHandling_, int replacementChar_ = TUniCodec::DefaultReplacementChar) :
    errorHandling(errorHandling_), replacementChar(replacementChar_) { }
  static TStr GetName() { return TEncoding::GetName(); }

  void Test() const
  {
    int nDecoded = 0;
    for (int c = 0; c <= 255; c++) {
      int cu = TEncoding::ToUnicode(c); if (cu == -1) continue;
      nDecoded++;
      IAssert(0 <= cu && cu < 0x110000);
      int c2 = TEncoding::FromUnicode(cu);
      IAssert(c2 == c); }
    int nEncoded = 0;
    for (int cu = 0; cu < 0x110000; cu++) {
      int c = TEncoding::FromUnicode(cu); if (c == -1) continue;
      nEncoded++;
      IAssert(0 <= c && c <= 255);
      int cu2 = TEncoding::ToUnicode(c);
      IAssert(cu2 == cu); }
    IAssert(nDecoded == nEncoded);
  }

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestCh>
  size_t ToUnicode(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest = true) const
  {
    if (clrDest) dest.Clr();
    size_t toDo = srcCount;
    while (toDo-- > 0) {
      int chSrc = ((int) src[TVecIdx(srcIdx)]) & 0xff; srcIdx++;
      int chDest = TEncoding::ToUnicode(chSrc);
      dest.Add(chDest); }
    return srcCount;
  }
  template<typename TSrcVec, typename TDestCh>
  size_t ToUnicode(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }

  size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestVec>
  size_t FromUnicode(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TDestVec& dest, const bool clrDest = true) const
  {
    typedef typename TVecElt<TDestVec>::TElement TDestCh;
    if (clrDest) dest.Clr();
    size_t toDo = srcCount, nEncoded = 0;
    while (toDo-- > 0) {
      int chSrc = (int) src[TVecIdx(srcIdx)]; srcIdx++;
      int chDest = TEncoding::FromUnicode(chSrc);
      if (chDest < 0) {
        switch (errorHandling) {
        case uehThrow: throw TUnicodeException(srcIdx - 1, chSrc, "Invalid character for encoding into " + GetName() + ".");
        case uehAbort: return nEncoded;
        case uehReplace: TVecElt<TDestVec>::Add(dest, TDestCh(replacementChar)); continue;
        case uehIgnore: continue;
        default: Fail; } }
      TVecElt<TDestVec>::Add(dest, TDestCh(chDest)); nEncoded++; }
    return nEncoded;
  }

  template<typename TSrcVec, typename TDestVec>
  size_t FromUnicode(const TSrcVec& src, TDestVec& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }

  size_t UniToStr(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
    TChA buf; size_t retVal = FromUnicode(src, srcIdx, srcCount, buf, false);
    if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
    return retVal; }
  size_t UniToStr(const TIntV& src, TStr& dest, const bool clrDest = true) const { return UniToStr(src, 0, src.Len(), dest, clrDest); }
};
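
// A minimal usage sketch (not part of the original header; the input string and variable
// names are illustrative only). Converting ISO Latin 2 bytes to codepoints and back to an
// 8-bit string, substituting '?' for anything Latin 2 cannot represent:
//
//   T8BitCodec<TEncoding_ISO8859_2> latin2(uehReplace, '?');
//   TIntV codePoints;
//   latin2.ToUnicode(TStr("some 8-bit input"), codePoints);
//   TStr roundTripped;
//   latin2.UniToStr(codePoints, roundTripped);
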
typedef T8BitCodec<TEncoding_ISO8859_1> TCodec_ISO8859_1;
typedef T8BitCodec<TEncoding_ISO8859_2> TCodec_ISO8859_2;
typedef T8BitCodec<TEncoding_ISO8859_3> TCodec_ISO8859_3;
typedef T8BitCodec<TEncoding_ISO8859_4> TCodec_ISO8859_4;
typedef T8BitCodec<TEncoding_CP852> TCodec_CP852;
typedef T8BitCodec<TEncoding_CP437> TCodec_CP437;
typedef T8BitCodec<TEncoding_CP1250> TCodec_CP1250;
typedef T8BitCodec<TEncoding_YuAscii> TCodec_YuAscii;

//-----------------------------------------------------------------------------
// Various declarations used by the Unicode Character Database
//-----------------------------------------------------------------------------

typedef enum TUniChCategory_
{
#define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff)
  DefineUniCat(Letter, 'L'),      // ucLetter
  DefineUniCat(Mark, 'M'),
  DefineUniCat(Number, 'N'),
  DefineUniCat(Punctuation, 'P'),
  DefineUniCat(Symbol, 'S'),
  DefineUniCat(Separator, 'Z'),
  DefineUniCat(Other, 'C')
#undef DefineUniCat
}
TUniChCategory;

typedef enum TUniChSubCategory_
{
#define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
  DefineUniSubCat(Letter, Uppercase, 'u'),   // ucLetterUppercase
  DefineUniSubCat(Letter, Lowercase, 'l'),
  DefineUniSubCat(Letter, Titlecase, 't'),
  DefineUniSubCat(Letter, Modifier, 'm'),
  DefineUniSubCat(Letter, Other, 'o'),
  DefineUniSubCat(Mark, Nonspacing, 'n'),
  DefineUniSubCat(Mark, SpacingCombining, 'c'),
  DefineUniSubCat(Mark, Enclosing, 'e'),
  DefineUniSubCat(Number, DecimalDigit, 'd'),
  DefineUniSubCat(Number, Letter, 'l'),
  DefineUniSubCat(Number, Other, 'o'),
  DefineUniSubCat(Punctuation, Connector, 'c'),
  DefineUniSubCat(Punctuation, Dash, 'd'),
  DefineUniSubCat(Punctuation, Open, 's'),
  DefineUniSubCat(Punctuation, Close, 'e'),
  DefineUniSubCat(Punctuation, InitialQuote, 'i'),
  DefineUniSubCat(Punctuation, FinalQuote, 'f'),
  DefineUniSubCat(Punctuation, Other, 'o'),
  DefineUniSubCat(Symbol, Math, 'm'),
  DefineUniSubCat(Symbol, Currency, 'c'),
  DefineUniSubCat(Symbol, Modifier, 'k'),
  DefineUniSubCat(Symbol, Other, 'o'),
  DefineUniSubCat(Separator, Space, 's'),
  DefineUniSubCat(Separator, Line, 'l'),
  DefineUniSubCat(Separator, Paragraph, 'p'),
  DefineUniSubCat(Other, Control, 'c'),
  DefineUniSubCat(Other, Format, 'f'),
  DefineUniSubCat(Other, Surrogate, 's'),
  DefineUniSubCat(Other, PrivateUse, 'o'),
  DefineUniSubCat(Other, NotAssigned, 'n')
}
TUniChSubCategory;

typedef enum TUniChFlags_
{
  ucfCompatibilityDecomposition = 1,  // if this flag is not set, the decomposition is canonical
  ucfCompositionExclusion = 1 << 1,   // from CompositionExclusions.txt
  // Flags used when searching for word boundaries. See UAX #29.
  ucfWbFormat = 1 << 2,
  ucfWbKatakana = 1 << 3,
  ucfWbALetter = 1 << 4,
  ucfWbMidLetter = 1 << 5,
  ucfWbMidNum = 1 << 6,
  ucfWbNumeric = 1 << 7,
  ucfWbExtendNumLet = 1 << 8,
  // Flags used with sentence boundaries (Sep is also used with word boundaries). See UAX #29.
  ucfSbSep = 1 << 9,
  ucfSbFormat = 1 << 10,
  ucfSbSp = 1 << 11,
  ucfSbLower = 1 << 12,
  ucfSbUpper = 1 << 13,
  ucfSbOLetter = 1 << 14,
  ucfSbNumeric = 1 << 15,
  ucfSbATerm = 1 << 16,
  ucfSbSTerm = 1 << 17,
  ucfSbClose = 1 << 18,
  ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose,
  ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep,
  // Flags from DerivedCoreProperties.txt.
  // [The comments are from UCD.html.]
  // - Characters with the Alphabetic property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
  ucfDcpAlphabetic = 1 << 19,
  // - For programmatic determination of default-ignorable code points.
  //   New characters that should be ignored in processing (unless explicitly supported)
  //   will be assigned in these ranges, permitting programs to correctly handle the default
  //   behavior of such characters when not otherwise supported. For more information, see
  //   UAX #29: Text Boundaries [Breaks].
  //   Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - annotation characters
  //   [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff, U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors]
  ucfDcpDefaultIgnorableCodePoint = 1 << 20,
  // - Characters with the Lowercase property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Other_Lowercase + Ll
  ucfDcpLowercase = 1 << 21,
  // - For programmatic determination of grapheme cluster boundaries.
  //   For more information, see UAX #29: Text Boundaries [Breaks].
  //   Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
  ucfDcpGraphemeBase = 1 << 22,
  // - For programmatic determination of grapheme cluster boundaries.
  //   For more information, see UAX #29: Text Boundaries [Breaks].
  //   Generated from: Other_Grapheme_Extend + Me + Mn
  //   Note: depending on an application's interpretation of Co (private use), they may be either
  //   in Grapheme_Base, or in Grapheme_Extend, or in neither.
  ucfDcpGraphemeExtend = 1 << 23,
  // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
  ucfDcpIdStart = 1 << 24,
  ucfDcpIdContinue = 1 << 25,
  // - Characters with the Math property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Sm + Other_Math
  ucfDcpMath = 1 << 26,
  // - Characters with the Uppercase property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Lu + Other_Uppercase
  ucfDcpUppercase = 1 << 27,
  // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
  ucfDcpXidStart = 1 << 28,
  ucfDcpXidContinue = 1 << 29,
  ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase | ucfDcpGraphemeBase | ucfDcpGraphemeExtend |
    ucfDcpIdStart | ucfDcpIdContinue | ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue,
}
TUniChFlags;

typedef enum TUniChProperties_
{
  // The flags from PropList.txt.
  // [The comments are from UCD.html.]
  // - ASCII characters commonly used for the representation of hexadecimal numbers.
  //   [= 0123456789abcdefABCDEF]
  ucfPrAsciiHexDigit = 1,
  // - Those format control characters which have specific functions in the Bidirectional Algorithm.
  ucfPrBidiControl = 2,
  // - Those punctuation characters explicitly called out as dashes in the Unicode Standard,
  //   plus compatibility equivalents to those. Most of these have the Pd General Category,
  //   but some have the Sm General Category because of their use in mathematics.
  //   U+0002d HYPHEN-MINUS
  //   U+0058a ARMENIAN HYPHEN
  //   U+005be HEBREW PUNCTUATION MAQAF
  //   U+01806 MONGOLIAN TODO SOFT HYPHEN
  //   U+02010 HYPHEN
  //   U+02011 NON-BREAKING HYPHEN
  //   U+02012 FIGURE DASH
  //   U+02013 EN DASH
  //   U+02014 EM DASH
  //   U+02015 HORIZONTAL BAR
  //   U+02053 SWUNG DASH
  //   U+0207b SUPERSCRIPT MINUS
  //   U+0208b SUBSCRIPT MINUS
  //   U+02212 MINUS SIGN
  //   U+02e17 DOUBLE OBLIQUE HYPHEN
  //   U+0301c WAVE DASH
  //   U+03030 WAVY DASH
  //   U+030a0 KATAKANA-HIRAGANA DOUBLE HYPHEN
  //   U+0fe31 PRESENTATION FORM FOR VERTICAL EM DASH
  //   U+0fe32 PRESENTATION FORM FOR VERTICAL EN DASH
  //   U+0fe58 SMALL EM DASH
  //   U+0fe63 SMALL HYPHEN-MINUS
  //   U+0ff0d FULLWIDTH HYPHEN-MINUS
  ucfPrDash = 4,
  // - For a machine-readable list of deprecated characters. No characters will ever be removed
  //   from the standard, but the usage of deprecated characters is strongly discouraged.
  ucfPrDeprecated = 8,
  // - Characters that linguistically modify the meaning of another character to which they apply.
  //   Some diacritics are not combining characters, and some combining characters are not diacritics.
  ucfPrDiacritic = 0x10,
  // - Characters whose principal function is to extend the value or shape of a preceding alphabetic
  //   character. Typical of these are length and iteration marks.
  ucfPrExtender = 0x20,
  // - Used in determining default grapheme cluster boundaries. For more information, see UAX #29: Text Boundaries.
  ucfPrGraphemeLink = 0x40,
  // - Characters commonly used for the representation of hexadecimal numbers, plus their compatibility equivalents.
  //   [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}]
  ucfPrHexDigit = 0x80,
  // - Those dashes used to mark connections between pieces of words, plus the Katakana middle dot.
  //   The Katakana middle dot functions like a hyphen, but is shaped like a dot rather than a dash.
  //   U+0002d HYPHEN-MINUS
  //   U+000ad SOFT HYPHEN
  //   U+0058a ARMENIAN HYPHEN
  //   U+01806 MONGOLIAN TODO SOFT HYPHEN
  //   U+02010 HYPHEN
  //   U+02011 NON-BREAKING HYPHEN
  //   U+02e17 DOUBLE OBLIQUE HYPHEN
  //   U+030fb KATAKANA MIDDLE DOT
  //   U+0fe63 SMALL HYPHEN-MINUS
  //   U+0ff0d FULLWIDTH HYPHEN-MINUS
  //   U+0ff65 HALFWIDTH KATAKANA MIDDLE DOT
  ucfPrHyphen = 0x100,
  // - Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) ideographs.
  ucfPrIdeographic = 0x200,
  // - Those format control characters which have specific functions for control of cursive joining and ligation.
  ucfPrJoinControl = 0x400,
  // - There are a small number of characters that do not use logical order.
  //   These characters require special handling in most processing.
  ucfPrLogicalOrderException = 0x800,
  // - Code points that are permanently reserved for internal use.
  ucfPrNoncharacterCodePoint = 0x1000,
  // - Used for pattern syntax as described in UAX #31: Identifier and Pattern Syntax.
  ucfPrPatternSyntax = 0x2000,
  ucfPrPatternWhiteSpace = 0x4000,
  // - Those punctuation characters that function as quotation marks.
  //   U+00022 QUOTATION MARK
  //   U+00027 APOSTROPHE
  //   U+000ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
  //   U+000bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
  //   U+02018 LEFT SINGLE QUOTATION MARK
  //   U+02019 RIGHT SINGLE QUOTATION MARK
  //   U+0201a SINGLE LOW-9 QUOTATION MARK
  //   U+0201b SINGLE HIGH-REVERSED-9 QUOTATION MARK
  //   U+0201c LEFT DOUBLE QUOTATION MARK
  //   U+0201d RIGHT DOUBLE QUOTATION MARK
  //   U+0201e DOUBLE LOW-9 QUOTATION MARK
  //   U+0201f DOUBLE HIGH-REVERSED-9 QUOTATION MARK
  //   U+02039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  //   U+0203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  //   U+0300c LEFT CORNER BRACKET
  //   U+0300d RIGHT CORNER BRACKET
  //   U+0300e LEFT WHITE CORNER BRACKET
  //   U+0300f RIGHT WHITE CORNER BRACKET
  //   U+0301d REVERSED DOUBLE PRIME QUOTATION MARK
  //   U+0301e DOUBLE PRIME QUOTATION MARK
  //   U+0301f LOW DOUBLE PRIME QUOTATION MARK
  //   U+0fe41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
  //   U+0fe42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
  //   U+0fe43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
  //   U+0fe44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
  //   U+0ff02 FULLWIDTH QUOTATION MARK
  //   U+0ff07 FULLWIDTH APOSTROPHE
  //   U+0ff62 HALFWIDTH LEFT CORNER BRACKET
  //   U+0ff63 HALFWIDTH RIGHT CORNER BRACKET
  ucfPrQuotationMark = 0x8000,
  // - Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear.
  //   An explicit _dot above_ can be added where required, such as in Lithuanian.
  ucfPrSoftDotted = 0x10000,
  // - Sentence Terminal. Used in UAX #29: Text Boundaries.
  //   U+00021 EXCLAMATION MARK
  //   U+0002e FULL STOP
  //   U+0003f QUESTION MARK
  //   U+0203c DOUBLE EXCLAMATION MARK
  //   U+0203d INTERROBANG
  //   U+02047 DOUBLE QUESTION MARK
  //   U+02048 QUESTION EXCLAMATION MARK
  //   U+02049 EXCLAMATION QUESTION MARK
  //   U+03002 IDEOGRAPHIC FULL STOP
  //   [plus many characters from other writing systems]
  ucfPrSTerm = 0x20000,
  // - Those punctuation characters that generally mark the end of textual units.
  //   [JB note: this set contains more characters than STerm. For example, it contains
  //   the comma, colon and semicolon, whereas STerm doesn't.]
  //   U+00021 EXCLAMATION MARK
  //   U+0002c COMMA
  //   U+0002e FULL STOP
  //   U+0003a COLON
  //   U+0003b SEMICOLON
  //   U+0003f QUESTION MARK
  //   U+0203c DOUBLE EXCLAMATION MARK
  //   U+0203d INTERROBANG
  //   U+02047 DOUBLE QUESTION MARK
  //   U+02048 QUESTION EXCLAMATION MARK
  //   U+02049 EXCLAMATION QUESTION MARK
  //   [plus *lots* of characters from other writing systems]
  ucfPrTerminalPunctuation = 0x40000,
  // - Indicates all those characters that qualify as Variation Selectors.
  //   For details on the behavior of these characters, see StandardizedVariants.html and
  //   Section 16.4, Variation Selectors in [Unicode].
  ucfPrVariationSelector = 0x80000,
  // - Those separator characters and control characters which should be treated by
  //   programming languages as "white space" for the purpose of parsing elements.
  //   Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included,
  //   since their functions are restricted to line-break control.
  //   Their names are unfortunately misleading in this respect.
  //   Note: There are other senses of "whitespace" that encompass a different set of characters.
  //   [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt.
  //   There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.]
  //   This includes the following characters:
  //   U+0009 <control>
  //   U+000a <control>
  //   U+000b <control>
  //   U+000c <control>
  //   U+000d <control>
  //   U+0020 SPACE
  //   U+0085 <control>
  //   U+00a0 NO-BREAK SPACE
  //   U+1680 OGHAM SPACE MARK
  //   U+180e MONGOLIAN VOWEL SEPARATOR
  //   U+2000 EN QUAD
  //   U+2001 EM QUAD
  //   U+2002 EN SPACE
  //   U+2003 EM SPACE
  //   U+2004 THREE-PER-EM SPACE
  //   U+2005 FOUR-PER-EM SPACE
  //   U+2006 SIX-PER-EM SPACE
  //   U+2007 FIGURE SPACE
  //   U+2008 PUNCTUATION SPACE
  //   U+2009 THIN SPACE
  //   U+200a HAIR SPACE
  //   U+2028 LINE SEPARATOR
  //   U+2029 PARAGRAPH SEPARATOR
  //   U+202f NARROW NO-BREAK SPACE
  //   U+205f MEDIUM MATHEMATICAL SPACE
  //   U+3000 IDEOGRAPHIC SPACE
  ucfPrWhiteSpace = 0x100000
}
TUniChProperties;

typedef enum TUniChPropertiesX_
{
  // More properties from PropList.txt.
  // - Used to derive the properties in DerivedCoreProperties.txt.
  ucfPxOtherAlphabetic = 1,
  ucfPxOtherDefaultIgnorableCodePoint = 2,
  ucfPxOtherGraphemeExtend = 4,
  ucfPxOtherIdContinue = 8,
  ucfPxOtherIdStart = 0x10,
  ucfPxOtherLowercase = 0x20,
  ucfPxOtherMath = 0x40,
  ucfPxOtherUppercase = 0x80,
  // - Used in ideographic description sequences.
  ucfPxIdsBinaryOperator = 0x100,
  ucfPxIdsTrinaryOperator = 0x200,
  ucfPxRadical = 0x400,
  ucfPxUnifiedIdeograph = 0x800
}
TUniChPropertiesX;

//-----------------------------------------------------------------------------
// TUniChInfo -- contains information about a single Unicode codepoint
//-----------------------------------------------------------------------------

class TUniChInfo
{
public:
  enum { // combining classes (for 'combClass'); from UnicodeData.txt
    ccStarter = 0,                 // 0: Spacing, split, enclosing, reordrant, and Tibetan subjoined
    ccOverlaysAndInterior = 1,
    ccNuktas = 7,
    ccHiraganaKatakanaVoicingMarks = 8,
    ccViramas = 9,
    ccFixedPositionStart = 10,     // Start of fixed position classes
    ccFixedPositionEnd = 199,      // End of fixed position classes
    ccBelowLeftAttached = 200,
    ccBelowAttached = 202,
    ccBelowRightAttached = 204,
    ccLeftAttached = 208,          // Left attached (reordrant around single base character)
    ccRightAttached = 210,
    ccAboveLeftAttached = 212,
    ccAboveAttached = 214,
    ccAboveRightAttached = 216,
    ccBelowLeft = 218,
    ccBelow = 220,
    ccBelowRight = 222,
    ccLeft = 224,                  // Left (reordrant around single base character)
    ccRight = 226,
    ccAboveLeft = 228,
    ccAbove = 230,
    ccAboveRight = 232,
    ccDoubleBelow = 233,
    ccDoubleAbove = 234,
    ccBelowIotaSubscript = 240,    // Below (iota subscript)
    ccInvalid = 255                // not defined by Unicode
  };
  char chCat, chSubCat;      // chCat + chSubCat together comprise the general category (from UnicodeData.txt)
  uchar combClass;           // canonical combining class
  TUniChCategory cat;        // = TUniChCategory(chCat)
  TUniChSubCategory subCat;  // = TUniChSubCategory(cat << 8 | subCat)
  signed char script;        // keyId into 'TUniChDb.scriptNames'; -1 if unknown
  int simpleUpperCaseMapping, simpleLowerCaseMapping, simpleTitleCaseMapping; // from UnicodeData.txt
  int decompOffset;          // offset into 'TUniChDb.decompositions'; or -1 if the character doesn't change during decomposition
  int nameOffset;            // offset into 'TUniChDb.charNames'
  int flags;                 // a combination of TUniChFlags
  int properties;            // a combination of TUniChProperties
  int propertiesX;           // a combination of TUniChPropertiesX
  ushort lineBreak;          // from LineBreak.txt

  // Converts a 2-letter linebreak code into a 16-bit integer.
  static inline ushort GetLineBreakCode(char c1, char c2) { return ((ushort(uchar(c1)) & 0xff) << 8) | ((ushort(uchar(c2)) & 0xff)); }
  static const ushort LineBreak_Unknown, LineBreak_ComplexContext, LineBreak_Numeric, LineBreak_InfixNumeric, LineBreak_Quotation;

public:
  void InitAfterLoad() {
    cat = (TUniChCategory) chCat;
    subCat = (TUniChSubCategory) (((int(uchar(chCat)) & 0xff) << 8) | (int(uchar(chSubCat)) & 0xff)); }
  void SetCatAndSubCat(const TUniChSubCategory catAndSubCat) {
    cat = (TUniChCategory) ((int(catAndSubCat) >> 8) & 0xff);
    subCat = catAndSubCat;
    chCat = (char) cat; chSubCat = (char) (int(subCat) & 0xff); }
  friend class TUniChDb;

  // Inexplicably missing from TSIn/TSOut...
  static inline void LoadUShort(TSIn& SIn, ushort& u) { SIn.LoadBf(&u, sizeof(u)); }
  static inline void LoadSChar(TSIn& SIn, signed char& u) { SIn.LoadBf(&u, sizeof(u)); }
  static inline void SaveUShort(TSOut& SOut, ushort u) { SOut.SaveBf(&u, sizeof(u)); }
  static inline void SaveSChar(TSOut& SOut, signed char u) { SOut.SaveBf(&u, sizeof(u)); }

public:
  void Save(TSOut& SOut) const {
    SOut.Save(chCat); SOut.Save(chSubCat); SOut.Save(combClass); SaveSChar(SOut, script);
    SOut.Save(simpleUpperCaseMapping); SOut.Save(simpleLowerCaseMapping); SOut.Save(simpleTitleCaseMapping);
    SOut.Save(decompOffset); SOut.Save(nameOffset);
    SOut.Save(flags); SOut.Save(properties); SOut.Save(propertiesX); SaveUShort(SOut, lineBreak); }
  void Load(TSIn& SIn) {
    SIn.Load(chCat); SIn.Load(chSubCat); SIn.Load(combClass); LoadSChar(SIn, script);
    SIn.Load(simpleUpperCaseMapping); SIn.Load(simpleLowerCaseMapping); SIn.Load(simpleTitleCaseMapping);
    SIn.Load(decompOffset); SIn.Load(nameOffset);
    SIn.Load(flags); SIn.Load(properties); SIn.Load(propertiesX); LoadUShort(SIn, lineBreak); InitAfterLoad(); }
  explicit TUniChInfo(TSIn& SIn) { Load(SIn); }
  TUniChInfo() : chCat(char(ucOther)), chSubCat(char(ucOtherNotAssigned & 0xff)), combClass(ccInvalid),
    script(-1), simpleUpperCaseMapping(-1), simpleLowerCaseMapping(-1), simpleTitleCaseMapping(-1),
    decompOffset(-1), nameOffset(-1), flags(0), properties(0), propertiesX(0), lineBreak(LineBreak_Unknown) {
    InitAfterLoad(); }

  // DerivedCoreProperties flags.
  bool IsDcpFlag(const TUniChFlags flag) const { Assert((flag & ucfDcpMask) == flag); return (flags & flag) == flag; }
  void ClrDcpFlags() { flags = flags & ~ucfDcpMask; }
  void SetDcpFlag(const TUniChFlags flag) { Assert((flag & ucfDcpMask) == flag); flags |= flag; }
  bool IsAlphabetic() const { return IsDcpFlag(ucfDcpAlphabetic); }
  bool IsUppercase() const { return IsDcpFlag(ucfDcpUppercase); }
  bool IsLowercase() const { return IsDcpFlag(ucfDcpLowercase); }
  bool IsMath() const { return IsDcpFlag(ucfDcpMath); }
  bool IsDefaultIgnorable() const { return IsDcpFlag(ucfDcpDefaultIgnorableCodePoint); }
  bool IsGraphemeBase() const { return IsDcpFlag(ucfDcpGraphemeBase); }
  bool IsGraphemeExtend() const { return IsDcpFlag(ucfDcpGraphemeExtend); }
  bool IsIdStart() const { return IsDcpFlag(ucfDcpIdStart); }
  bool IsIdContinue() const { return IsDcpFlag(ucfDcpIdContinue); }
  bool IsXidStart() const { return IsDcpFlag(ucfDcpXidStart); }
  bool IsXidContinue() const { return IsDcpFlag(ucfDcpXidContinue); }

  // PropList.txt flags.
  bool IsProperty(const TUniChProperties flag) const { return (properties & flag) == flag; }
  void SetProperty(const TUniChProperties flag) { properties |= flag; }
  bool IsAsciiHexDigit() const { return IsProperty(ucfPrAsciiHexDigit); }
  bool IsBidiControl() const { return IsProperty(ucfPrBidiControl); }
  bool IsDash() const { return IsProperty(ucfPrDash); }
  bool IsDeprecated() const { return IsProperty(ucfPrDeprecated); }
  bool IsDiacritic() const { return IsProperty(ucfPrDiacritic); }
  bool IsExtender() const { return IsProperty(ucfPrExtender); }
  bool IsGraphemeLink() const { return IsProperty(ucfPrGraphemeLink); }
  bool IsHexDigit() const { return IsProperty(ucfPrHexDigit); }
  bool IsHyphen() const { return IsProperty(ucfPrHyphen); }
  bool IsIdeographic() const { return IsProperty(ucfPrIdeographic); }
  bool IsJoinControl() const { return IsProperty(ucfPrJoinControl); }
  bool IsLogicalOrderException() const { return IsProperty(ucfPrLogicalOrderException); }
  bool IsNoncharacter() const { return IsProperty(ucfPrNoncharacterCodePoint); }
  bool IsQuotationMark() const { return IsProperty(ucfPrQuotationMark); }
  bool IsSoftDotted() const { return IsProperty(ucfPrSoftDotted); }
  bool IsSTerminal() const { return IsProperty(ucfPrSTerm); }
  bool IsTerminalPunctuation() const { return IsProperty(ucfPrTerminalPunctuation); }
  bool IsVariationSelector() const { return IsProperty(ucfPrVariationSelector); }
  bool IsWhiteSpace() const { return IsProperty(ucfPrWhiteSpace); }

  // Additional PropList.txt flags.
  bool IsPropertyX(const TUniChPropertiesX flag) const { return (propertiesX & flag) == flag; }
  void SetPropertyX(const TUniChPropertiesX flag) { propertiesX |= flag; }

  // Miscellaneous flags.
  bool IsCompositionExclusion() const { return (flags & ucfCompositionExclusion) == ucfCompositionExclusion; }
  bool IsCompatibilityDecomposition() const { return (flags & ucfCompatibilityDecomposition) == ucfCompatibilityDecomposition; }

  // Word-boundary flags.
  bool IsWbFlag(const TUniChFlags flag) const { Assert((flag & ucfWbMask) == flag); return (flags & flag) == flag; }
  void ClrWbAndSbFlags() { flags = flags & ~(ucfWbMask | ucfSbMask); }
  void SetWbFlag(const TUniChFlags flag) { Assert((flag & ucfWbMask) == flag); flags |= flag; }
  int GetWbFlags() const { return flags & ucfWbMask; }
  bool IsWbFormat() const { return IsWbFlag(ucfWbFormat); }
  TStr GetWbFlagsStr() const { return GetWbFlagsStr(GetWbFlags()); }
  static TStr GetWbFlagsStr(const int flags) { return TStr("") + (flags & ucfWbALetter ? "A" : "") +
    (flags & ucfWbFormat ? "F" : "") + (flags & ucfWbKatakana ? "K" : "") + (flags & ucfWbMidLetter ? "M" : "") +
    (flags & ucfWbMidNum ? "m" : "") + (flags & ucfWbNumeric ? "N" : "") + (flags & ucfWbExtendNumLet ? "E" : ""); }

  // Sentence-boundary flags.
  bool IsSbFlag(const TUniChFlags flag) const { Assert((flag & ucfSbMask) == flag); return (flags & flag) == flag; }
  void SetSbFlag(const TUniChFlags flag) { Assert((flag & ucfSbMask) == flag); flags |= flag; }
  int GetSbFlags() const { return flags & ucfSbMask; }
  bool IsSbFormat() const { return IsSbFlag(ucfSbFormat); }
  TStr GetSbFlagsStr() const { return GetSbFlagsStr(GetSbFlags()); }
  static TStr GetSbFlagsStr(const int flags) { return TStr("") + (flags & ucfSbSep ? "S" : "") +
    (flags & ucfSbFormat ? "F" : "") + (flags & ucfSbSp ? "_" : "") + (flags & ucfSbLower ? "L" : "") +
    (flags & ucfSbUpper ? "U" : "") + (flags & ucfSbOLetter ? "O" : "") + (flags & ucfSbNumeric ? "N" : "") +
    (flags & ucfSbATerm ? "A" : "") + (flags & ucfSbSTerm ? "T" : "") + (flags & ucfSbClose ? "C" : ""); }

  bool IsSbSep() const { return (flags & ucfSbSep) == ucfSbSep; }

  // Grapheme-boundary flags.
  bool IsGbExtend() const { return IsGraphemeExtend(); }

  // Sec. 3.13, D47: C is cased iff it is uppercase, lowercase, or general_category == titlecase_letter.
  bool IsCased() const { return IsUppercase() || IsLowercase() || (subCat == ucLetterTitlecase); }

  // Character categories.
  TUniChCategory GetCat() const { return (TUniChCategory) cat; }
  TUniChSubCategory GetSubCat() const { return (TUniChSubCategory) subCat; }
  // The following characters belong to the 'symbol/currency' subcategory:
  //   U+00024 DOLLAR SIGN
  //   U+000a2 CENT SIGN
  //   U+000a3 POUND SIGN
  //   U+000a4 CURRENCY SIGN
  //   U+000a5 YEN SIGN
  //   U+020a3 FRENCH FRANC SIGN
  //   U+020a4 LIRA SIGN
  //   U+020ac EURO SIGN
  //   [and plenty of others]
  bool IsCurrency() const { return subCat == ucSymbolCurrency; }
  // Note: most private-use and surrogate characters aren't listed explicitly in UnicodeData.txt.
  // Thus, it's better to call TUniChDb's versions of these methods, which are aware of
  // the full ranges of private-use and surrogate characters.
  bool IsPrivateUse() const { return subCat == ucOtherPrivateUse; }
  bool IsSurrogate() const { return subCat == ucOtherSurrogate; }

  inline static bool IsValidSubCat(const char chCat, const char chSubCat) {
    static const char s[] = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoSmScSkSoZsZlZpCcCfCsCoCn";
    for (const char *p = s; *p; p += 2)
      if (chCat == p[0] && chSubCat == p[1]) return true;
    return false; }
};
int keyId = roots.GetKeyId(TItemTr(last, butLast, butButLast)); 01205 if (keyId < 0) return 0; else return roots[keyId]; } 01206 int GetChild(const int parentIdx, const TItem& item) const { 01207 for (int childIdx = nodes[parentIdx].child; childIdx >= 0; ) { 01208 const TNode &node = nodes[childIdx]; 01209 if (node.item == item) return childIdx; 01210 childIdx = node.sib; } 01211 return -1; } 01212 bool IsNodeTerminal(const int nodeIdx) const { return nodes[nodeIdx].terminal; } 01213 01214 // Adds a new string to the trie. Note that the last characters appear 01215 // closer to the root of the trie. 01216 template<typename TSrcVec> 01217 void Add(const TSrcVec& src, const size_t srcIdx, const size_t srcCount) 01218 { 01219 IAssert(srcCount > 0); 01220 if (srcCount == 1) { singles.AddKey(TItem(src[TVecIdx(srcIdx)])); return; } 01221 if (srcCount == 2) { pairs.AddKey(TItemPr(TItem(src[TVecIdx(srcIdx + 1)]), TItem(src[TVecIdx(srcIdx)]))); return; } 01222 size_t srcLast = srcIdx + (srcCount - 1); 01223 TItemTr tr = TItemTr(TItem(src[TVecIdx(srcLast)]), TItem(src[TVecIdx(srcLast - 1)]), TItem(src[TVecIdx(srcLast - 2)])); 01224 int keyId = roots.GetKeyId(tr), curNodeIdx = -1; 01225 if (keyId >= 0) curNodeIdx = roots[keyId]; 01226 else { curNodeIdx = nodes.Add(TNode(TItem(0), -1, -1, false)); roots.AddDat(tr, curNodeIdx); } 01227 // 01228 if (srcCount > 3) for (size_t srcPos = srcLast - 3; ; ) 01229 { 01230 const TItem curItem = src[TVecIdx(srcPos)]; 01231 int childNodeIdx = nodes[curNodeIdx].child; 01232 while (childNodeIdx >= 0) { 01233 TNode &childNode = nodes[childNodeIdx]; 01234 if (childNode.item == curItem) break; 01235 childNodeIdx = childNode.sib; } 01236 if (childNodeIdx < 0) { 01237 childNodeIdx = nodes.Add(TNode(curItem, -1, nodes[curNodeIdx].child, false)); 01238 nodes[curNodeIdx].child = childNodeIdx; } 01239 curNodeIdx = childNodeIdx; 01240 if (srcPos == srcIdx) break; else srcPos--; 01241 } 01242 nodes[curNodeIdx].terminal = true; 01243 } 01244 01245 template<typename TSrcVec> 01246 void Add(const TSrcVec& src) { Add(src, 0, (size_t) src.Len()); } 01247 }; 01248 01249 //----------------------------------------------------------------------------- 01250 // TUniChDb -- provides access to the Unicode Character Database 01251 //----------------------------------------------------------------------------- 01252 01253 class TUniChDb 01254 { 01255 protected: 01256 void InitAfterLoad(); 01257 typedef TUniVecIdx TVecIdx; 01258 01259 public: 01260 THash<TInt, TUniChInfo> h; // key: codepoint 01261 TStrPool charNames; 01262 TStrIntH scripts; // keyID = used in TUniChInfo.script; key = script name; dat = number of characters (informative only) 01263 TIntV decompositions; 01264 THash<TIntPr, TInt> inverseDec; 01265 TUniCaseFolding caseFolding; 01266 // These hash tables contain only the unconditional mappings from SpecialCasing.txt. 01267 // The conditional mappings are hardcoded into GetCaseConverted(). 
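// Illustrative note (editorial, not from the original header): a typical conditional mapping in SpecialCasing.txt is GREEK CAPITAL LETTER SIGMA (U+03A3), which lowercases to the final form U+03C2 at the end of a word and to U+03C3 elsewhere; context-dependent rules of this kind cannot be stored as plain codepoint-to-codepoint table entries.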
01268 TIntIntVH specialCasingLower, specialCasingUpper, specialCasingTitle; 01269 int scriptUnknown; // = scripts.GetKey("Unknown") 01270 01271 TUniChDb() : scriptUnknown(-1) { } 01272 explicit TUniChDb(TSIn& SIn) { Load(SIn); } 01273 void Clr() { 01274 h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr(); 01275 specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr(); 01276 scripts.Clr(); } 01277 void Save(TSOut& SOut) const { 01278 h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut); 01279 inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut); 01280 specialCasingLower.Save(SOut); specialCasingUpper.Save(SOut); specialCasingTitle.Save(SOut); 01281 SOut.SaveCs(); } 01282 void Load(TSIn& SIn) { 01283 h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn); 01284 decompositions.Load(SIn); 01285 inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn); 01286 specialCasingLower.Load(SIn); specialCasingUpper.Load(SIn); specialCasingTitle.Load(SIn); 01287 SIn.LoadCs(); InitAfterLoad(); } 01288 void LoadBin(const TStr& fnBin) { 01289 PSIn SIn = TFIn::New(fnBin); Load(*SIn); } 01290 void Test(const TStr& basePath); 01291 01292 // File names used by LoadTxt() and its subroutines. 01293 static TStr GetCaseFoldingFn() { return "CaseFolding.txt"; } 01294 static TStr GetSpecialCasingFn() { return "SpecialCasing.txt"; } 01295 static TStr GetUnicodeDataFn() { return "UnicodeData.txt"; } 01296 static TStr GetCompositionExclusionsFn() { return "CompositionExclusions.txt"; } 01297 static TStr GetScriptsFn() { return "Scripts.txt"; } 01298 static TStr GetDerivedCorePropsFn() { return "DerivedCoreProperties.txt"; } 01299 static TStr GetLineBreakFn() { return "LineBreak.txt"; } 01300 static TStr GetPropListFn() { return "PropList.txt"; } 01301 static TStr GetAuxiliaryDir() { return "auxiliary"; } 01302 static TStr GetWordBreakTestFn() { return "WordBreakTest.txt"; } 01303 static TStr GetWordBreakPropertyFn() { return "WordBreakProperty.txt"; } 01304 static TStr GetSentenceBreakTestFn() { return "SentenceBreakTest.txt"; } 01305 static TStr GetSentenceBreakPropertyFn() { return "SentenceBreakProperty.txt"; } 01306 static TStr GetNormalizationTestFn() { return "NormalizationTest.txt"; } 01307 static TStr GetBinFn() { return "UniChDb.bin"; } // used only by Test() 01308 01309 //------------------------------------------------------------------------- 01310 // Script names 01311 //------------------------------------------------------------------------- 01312 01313 // These constants are used when initializing from the text files. 
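// Illustrative sketch (editorial; 'ucd' is a hypothetical, already loaded TUniChDb instance) of typical script lookups:
//   int katakanaId = ucd.GetScriptByName(TUniChDb::GetScriptNameKatakana());
//   int s = ucd.GetScript(0x30a2);            // U+30A2 KATAKANA LETTER A -> katakanaId
//   const TStr& name = ucd.GetScriptName(s);  // "Katakana"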
01314 static TStr GetScriptNameUnknown() { return "Unknown"; } 01315 static TStr GetScriptNameKatakana() { return "Katakana"; } 01316 static TStr GetScriptNameHiragana() { return "Hiragana"; } 01317 // 01318 const TStr& GetScriptName(const int scriptId) const { return scripts.GetKey(scriptId); } 01319 int GetScriptByName(const TStr& scriptName) const { return scripts.GetKeyId(scriptName); } 01320 int GetScript(const TUniChInfo& ci) const { int s = ci.script; if (s < 0) s = scriptUnknown; return s; } 01321 int GetScript(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return scriptUnknown; else return GetScript(h[i]); } 01322 01323 //------------------------------------------------------------------------- 01324 // Character names 01325 //------------------------------------------------------------------------- 01326 01327 // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234". 01328 const char *GetCharName(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; int ofs = h[i].nameOffset; return ofs < 0 ? 0 : charNames.GetCStr(ofs); } 01329 TStr GetCharNameS(const int cp) const { 01330 // ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16). 01331 const char *p = GetCharName(cp); if (p) return p; 01332 char buf[20]; sprintf(buf, "U+%04x", cp); return TStr(buf); } 01333 template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, size_t srcIdx, const size_t srcCount, const TStr& prefix) const { 01334 if (! f) f = stdout; 01335 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) { 01336 fprintf(f, "%s", prefix.CStr()); 01337 int cp = src[TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ? "U+%05x" : "U+%04x "), cp); 01338 fprintf(f, " %s\n", GetCharNameS(cp).CStr()); }} 01339 template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, const TStr& prefix) const { PrintCharNames(f, src, 0, src.Len(), prefix); } 01340 01341 //------------------------------------------------------------------------- 01342 // Character information 01343 //------------------------------------------------------------------------- 01344 // These methods provide access to a subset of the functionality 01345 // available in TUniChInfo.
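// Illustrative sketch (editorial; 'ucd' is a hypothetical, already loaded TUniChDb instance):
//   ucd.GetSubCat(0x20ac);     // ucSymbolCurrency (U+20AC EURO SIGN)
//   ucd.GetCharNameS(0x20ac);  // "EURO SIGN"
//   ucd.GetCharNameS(0xe000);  // "U+e000" -- private-use codepoints carry no name in UnicodeData.txt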
01346 01347 bool IsGetChInfo(const int cp, TUniChInfo& ChInfo) { 01348 int i = h.GetKeyId(cp); 01349 if (i < 0) return false; else { ChInfo=h[i]; return true; }} 01350 TUniChCategory GetCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOther; else return h[i].cat; } 01351 TUniChSubCategory GetSubCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOtherNotAssigned; else return h[i].subCat; } 01352 01353 bool IsWbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsWbFlag(flag); } 01354 int GetWbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetWbFlags(); } 01355 bool IsSbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsSbFlag(flag); } 01356 int GetSbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetSbFlags(); } 01357 01358 #define ___UniFwd1(name) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); } 01359 #define ___UniFwd2(name1, name2) ___UniFwd1(name1) ___UniFwd1(name2) 01360 #define ___UniFwd3(name1, name2, name3) ___UniFwd2(name1, name2) ___UniFwd1(name3) 01361 #define ___UniFwd4(name1, name2, name3, name4) ___UniFwd3(name1, name2, name3) ___UniFwd1(name4) 01362 #define ___UniFwd5(name1, name2, name3, name4, name5) ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5) 01363 01364 #define DECLARE_FORWARDED_PROPERTY_METHODS \ 01365 ___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \ 01366 ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic) \ 01367 ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted) \ 01368 ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace) \ 01369 ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable) \ 01370 ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue) \ 01371 ___UniFwd2(IsXidStart, IsXidContinue) \ 01372 ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep) \ 01373 ___UniFwd1(IsGbExtend) \ 01374 ___UniFwd2(IsCased, IsCurrency) 01375 01376 DECLARE_FORWARDED_PROPERTY_METHODS 01377 01378 #undef ___UniFwd1 01379 01380 bool IsPrivateUse(const int cp) const { 01381 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsPrivateUse(); 01382 return (0xe000 <= cp && cp <= 0xf8ff) || // plane 0 private-use area 01383 // Planes 15 and 16 are entirely for private use. 01384 (0xf0000 <= cp && cp <= 0xffffd) || (0x100000 <= cp && cp <= 0x10fffd); } 01385 // Note: d800..dbff are high surrogates, dc00..dfff are low surrogates. 01386 // For db80..dbff it is clear that the surrogate pair containing this high surrogate 01387 // will refer to a private-use codepoint, but IsPrivateUse nevertheless returns false 01388 // for db80..dbff. This is consistent with the category codes assigned in UnicodeData.txt. 01389 bool IsSurrogate(const int cp) const { 01390 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsSurrogate(); 01391 return 0xd800 <= cp && cp <= 0xdfff; } 01392 01393 // Note: in particular, all Hangul characters (HangulLBase..HangulLBase + HangulLCount - 1 01394 // and HangulSBase..HangulSBase + HangulSCount - 1) should be treated as starters 01395 // for composition to work correctly.
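// Illustrative note (editorial): for example, GetCombiningClass(0x0301) (COMBINING ACUTE ACCENT) yields the canonical combining class 230, whereas base letters such as 'A' and any codepoint missing from the hash table report the starter class.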
01396 int GetCombiningClass(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return TUniChInfo::ccStarter; else return h[i].combClass; } 01397 01398 //------------------------------------------------------------------------- 01399 // Hangul constants 01400 //------------------------------------------------------------------------- 01401 01402 enum { 01403 HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7, 01404 HangulLCount = 19, HangulVCount = 21, HangulTCount = 28, 01405 HangulNCount = HangulVCount * HangulTCount, // 588 01406 HangulSCount = HangulLCount * HangulNCount // 11172 01407 }; 01408 01409 //------------------------------------------------------------------------- 01410 // Word boundaries (UAX #29) 01411 //------------------------------------------------------------------------- 01412 01413 protected: 01414 // UAX #29, rule WB3: ignore Format and Extend characters. 01415 // [Note: rule SB5 for sentence boundaries is identical, and thus these methods will also be used for sentence-boundary detection.] 01416 static bool IsWbIgnored(const TUniChInfo& ci) { return ci.IsGbExtend() || ci.IsWbFormat(); } 01417 bool IsWbIgnored(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return IsWbIgnored(h[i]); } 01418 // Sets 'position' to the smallest index from 'position..srcEnd-1' that contains a non-ignored character. 01419 template<typename TSrcVec> void WbFindCurOrNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01420 while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01421 // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character. 01422 template<typename TSrcVec> void WbFindNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01423 if (position >= srcEnd) return; 01424 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01425 // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character. 01426 template<typename TSrcVec> void WbFindNextNonIgnoredS(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01427 if (position >= srcEnd) return; 01428 if (IsSbSep(src[TVecIdx(position)])) { position++; return; } 01429 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01430 // Sets 'position' to the largest index from 'srcStart..position-1' that contains a non-ignored character. 01431 template<typename TSrcVec> bool WbFindPrevNonIgnored(const TSrcVec& src, const size_t srcStart, size_t& position) const { 01432 if (position <= srcStart) return false; 01433 while (position > srcStart) { 01434 position--; if (! IsWbIgnored(src[TVecIdx(position)])) return true; } 01435 return false; } 01436 // Test driver for WbFind*NonIgnored. 01437 void TestWbFindNonIgnored(const TIntV& src) const; 01438 void TestWbFindNonIgnored() const; 01439 public: 01440 // Finds the next word boundary strictly after 'position'. 01441 // Note that there is a valid word boundary at 'srcIdx + srcCount'. 01442 // If there is no such word boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'. 
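// Illustrative sketch (editorial; 'ucd' and 'src' are hypothetical) of a loop that visits every word boundary after 'srcIdx':
//   size_t pos = srcIdx;
//   while (ucd.FindNextWordBoundary(src, srcIdx, srcCount, pos)) {
//     /* a word boundary lies immediately before src[pos]; the boundary at
//        srcIdx + srcCount is reported last, after which the call returns false */ }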
01443 template<typename TSrcVec> 01444 bool FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const; 01445 // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a word 01446 // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'. Note that 'dest[0]' and 'dest[srcCount]' are 01447 // always set to 'true'. 01448 template<typename TSrcVec> 01449 void FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const; 01450 protected: 01451 void TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence); 01452 01453 //------------------------------------------------------------------------- 01454 // Sentence boundaries (UAX #29) 01455 //------------------------------------------------------------------------- 01456 01457 protected: 01458 TUniTrie<TInt> sbExTrie; 01459 01460 // Checks whether a sentence that ended at src[position - 1] 01461 // would end in one of the suffixes from sbExTrie. 01462 template<typename TSrcVec> 01463 bool CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const; 01464 01465 public: 01466 // Finds the next sentence boundary strictly after 'position'. 01467 // Note that there is a valid sentence boundary at 'srcIdx + srcCount'. 01468 // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'. 01469 template<typename TSrcVec> 01470 bool FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const; 01471 // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a sentence 01472 // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'. Note that 'dest[0]' and 'dest[srcCount]' are 01473 // always set to 'true'. 01474 template<typename TSrcVec> 01475 void FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const; 01476 01477 // These methods allow the user to define a set of sentence boundary exceptions. 01478 // This is a set of strings, stored in 'sbExTrie'. If the Unicode rules require 01479 // a sentence boundary in a position that would cause the sentence to end with 01480 // 'x (STerm | ATerm) Close* Sp* Sep?', where 'x' is a word from 'sbExTrie', 01481 // we will *not* place a sentence boundary there. 01482 // 01483 // NOTE: sbExTrie is not saved or loaded by the Save() and Load() methods. 01484 // By default, it is empty. Use SbEx_Clr() to clear it, and SbEx_SetStdEnglish() to obtain 01485 // a standard set of English-language exceptions. 
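// Illustrative note (editorial): for instance, after SbEx_SetStdEnglish() the trie contains "Dr", so the full stop in "Dr. Smith arrived." no longer introduces a sentence boundary after the abbreviation, while the one after "arrived." still does.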
01486 void SbEx_Clr() { sbExTrie.Clr(); } 01487 template<class TSrcVec> void SbEx_Add(const TSrcVec& v) { sbExTrie.Add(v); } 01488 // template<> void SbEx_Add(const TStr& s) { 01489 void SbEx_Add(const TStr& s) { 01490 TIntV v; int n = s.Len(); v.Gen(n); for (int i = 0; i < n; i++) v[i] = int(uchar(s[i])); SbEx_Add(v); } 01491 void SbEx_AddUtf8(const TStr& s) { TUniCodec codec; TIntV v; codec.DecodeUtf8(s, v); SbEx_Add(v); } 01492 int SbEx_AddMulti(const TStr& words, const bool wordsAreUtf8 = true) { TStrV vec; words.SplitOnAllCh('|', vec); 01493 for (int i = 0; i < vec.Len(); i++) if (wordsAreUtf8) SbEx_AddUtf8(vec[i]); else SbEx_Add(vec[i]); 01494 return vec.Len(); } 01495 void SbEx_Set(const TUniTrie<TInt>& newTrie) { sbExTrie = newTrie; } 01496 int SbEx_SetStdEnglish() { 01497 static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g.|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv"; 01498 SbEx_Clr(); return SbEx_AddMulti(data, false); } 01499 01500 //------------------------------------------------------------------------- 01501 // Normalization, decomposition, etc. (UAX #15) 01502 //------------------------------------------------------------------------- 01503 01504 protected: 01505 // Adds, to 'dest', the decomposition of 'codePoint' (calling itself recursively if necessary). 01506 // If 'compatibility == false', only canonical decompositions are used. 01507 template<typename TDestCh> 01508 void AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const; 01509 public: 01510 // This appends, to 'dest', the decomposed form of the source string. 01511 // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false; 01512 // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true. 01513 template<typename TSrcVec, typename TDestCh> 01514 void Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01515 TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const; 01516 template<typename TSrcVec, typename TDestCh> 01517 void Decompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const { 01518 Decompose(src, 0, src.Len(), dest, compatibility, clrDest); } 01519 // This performs canonical composition on the source string, and appends 01520 // the result to the destination string. The source string should be the 01521 // result of a (canonical or compatibility) decomposition; if this is the 01522 // case, the composition will lead to a normalization form C (NFC) or 01523 // normalization form KC (NFKC), depending on whether canonical or compatibility 01524 // decomposition was used. 01525 template<typename TSrcVec, typename TDestCh> 01526 void Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01527 TVec<TDestCh>& dest, bool clrDest = true) const; 01528 template<typename TSrcVec, typename TDestCh> 01529 void Compose(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const { 01530 Compose(src, 0, src.Len(), dest, clrDest); } 01531 // Calls Decompose, followed by Compose; thus the result is the NFC (if 01532 // compatibility == false) or NFKC (if compatibility == true) of the source string. 01533 // A temporary TIntV is used to contain the intermediate NF(K)D form of the 01534 // source string. 
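// Illustrative sketch (editorial; 'ucd' is a hypothetical, already loaded TUniChDb instance):
//   TIntV src; src.Add(0x0041); src.Add(0x030a);   // 'A' + COMBINING RING ABOVE
//   TIntV nfc; ucd.DecomposeAndCompose(src, nfc, false);
//   // nfc now holds the single codepoint 0x00c5 (LATIN CAPITAL LETTER A WITH RING ABOVE).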
01535 template<typename TSrcVec, typename TDestCh> 01536 void DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01537 TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const; 01538 template<typename TSrcVec, typename TDestCh> 01539 void DecomposeAndCompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const { 01540 DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); } 01541 // Copies the starter characters from 'src' to 'dest'; the other 01542 // characters are skipped. 'src' should already have been decomposed. 01543 // Returns the number of characters extracted. 01544 template<typename TSrcVec, typename TDestCh> 01545 size_t ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01546 TVec<TDestCh>& dest, bool clrDest = true) const; 01547 template<typename TSrcVec, typename TDestCh> 01548 size_t ExtractStarters(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const { 01549 return ExtractStarters(src, 0, src.Len(), dest, clrDest); } 01550 // Extracts the starters into a temporary vector and then copies it into 'src'. 01551 template<typename TSrcVec> 01552 size_t ExtractStarters(TSrcVec& src) const { 01553 TIntV temp; size_t retVal = ExtractStarters(src, temp); 01554 src.Clr(); for (int i = 0; i < temp.Len(); i++) src.Add(temp[i]); 01555 return retVal; } 01556 01557 protected: 01558 void TestComposition(const TStr& basePath); 01559 01560 //------------------------------------------------------------------------- 01561 // Initialization from the text files 01562 //------------------------------------------------------------------------- 01563 01564 protected: 01565 void InitWordAndSentenceBoundaryFlags(const TStr& basePath); 01566 void InitScripts(const TStr& basePath); 01567 void InitLineBreaks(const TStr& basePath); 01568 void InitDerivedCoreProperties(const TStr& basePath); 01569 void InitPropList(const TStr& basePath); 01570 void InitSpecialCasing(const TStr& basePath); 01571 void LoadTxt_ProcessDecomposition(TUniChInfo& ci, TStr s); 01572 public: 01573 void LoadTxt(const TStr& basePath); 01574 void SaveBin(const TStr& fnBinUcd); 01575 01576 //------------------------------------------------------------------------- 01577 // Case conversions 01578 //------------------------------------------------------------------------- 01579 01580 public: 01581 typedef enum TCaseConversion_ { ccLower = 0, ccUpper = 1, ccTitle = 2, ccMax = 3 } TCaseConversion; 01582 // Appends the case-converted form of 'src' to 'dest'. 01583 // 'how' defines what kind of case conversion is required. 01584 // 'turkic' should be set to true iff the text is in Turkish ('tr') or Azeri ('az'). 01585 // 'lithuanian' should be set to true iff the text is in Lithuanian ('lt').
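// Illustrative note (editorial): with turkic == true the dotted and dotless i are kept apart: lowercasing U+0049 (I) gives U+0131 (dotless i) and uppercasing U+0069 (i) gives U+0130 (I with dot above), rather than the plain i/I pair.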
01586 template<typename TSrcVec, typename TDestCh> void GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const; 01587 template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); } 01588 template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); } 01589 template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); } 01590 template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01591 template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01592 template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01593 01594 // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt). 01595 // This is simpler and faster. Since each character now maps into exactly one 01596 // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.). 
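// Illustrative note (editorial): the difference shows up e.g. for U+00DF LATIN SMALL LETTER SHARP S: its full uppercase mapping is the two-character string "SS", which only the full GetCaseConverted/GetUpperCase methods can produce, while the simple mapping leaves the character unchanged and can therefore be applied in place.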
01597 template<typename TSrcVec, typename TDestCh> void GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const; 01598 template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); } 01599 template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); } 01600 template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); } 01601 template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); } 01602 template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); } 01603 template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); } 01604 01605 template<typename TSrcVec> void ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const; 01606 template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); } 01607 template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); } 01608 template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); } 01609 template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src) const { ToSimpleUpperCase(src, 0, src.Len()); } 01610 template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src) const { ToSimpleLowerCase(src, 0, src.Len()); } 01611 template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src) const { ToSimpleTitleCase(src, 0, src.Len()); } 01612 01613 public: 01614 friend class TUniCaseFolding; 01615 01616 // Case folding is an alternative to the above functions. It is intended primarily 01617 // to produce strings that are suitable for comparisons. For example, 01618 // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma; 01619 // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma. 01620 // - 'turkic' enables special processing for Turkic languages (I-dot and i-dotless). 01621 // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped 01622 // into a string of two or more characters. 01623 // - Note: For best results, perform NFD(CaseFold(NFD(x)) or NFKD(CaseFold(NFKD(x)) on 01624 // each string before comparing them (see sec. 3.13 of the standard).
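// Illustrative sketch (editorial; 'FoldedEq' is a hypothetical helper) of a case-insensitive comparison built on GetCaseFolded:
//   bool FoldedEq(const TUniChDb& ucd, const TIntV& a, const TIntV& b) {
//     TIntV fa, fb; ucd.GetCaseFolded(a, fa); ucd.GetCaseFolded(b, fb);
//     return fa == fb; }
// For best results the inputs should additionally be normalized as described above.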
01625 template<typename TSrcVec, typename TDestCh> 01626 void GetCaseFolded(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01627 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic = false) const { caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); } 01628 template<typename TSrcVec, typename TDestCh> 01629 void GetCaseFolded(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool full = true, const bool turkic = false) const { 01630 GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); } 01631 // ToCaseFolded folds the string in place. However, this means that only the simple 01632 // case foldings can be used (the full ones could increase the length of the string). 01633 template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic = false) const { caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); } 01634 template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, const bool turkic = false) const { ToCaseFolded(src, 0, src.Len(), turkic); } 01635 01636 protected: 01637 void TestCaseConversion(const TStr& source, const TStr& trueLc, const TStr& trueTc, const TStr& trueUc, bool turkic, bool lithuanian); 01638 void TestCaseConversions(); 01639 01640 //------------------------------------------------------------------------- 01641 // Text file reader for the Unicode character database 01642 //------------------------------------------------------------------------- 01643 01644 protected: 01645 01646 class TUcdFileReader 01647 { 01648 protected: 01649 TChA buf; 01650 public: 01651 TChA comment; // contains '#' and everything after it 01652 protected: 01653 FILE *f; 01654 int putBackCh; 01655 int GetCh() { 01656 if (putBackCh >= 0) { int c = putBackCh; putBackCh = EOF; return c; } 01657 return fgetc(f); } 01658 void PutBack(int c) { Assert(putBackCh == EOF); putBackCh = c; } 01659 // Returns 'false' iff the EOF was encountered before anything was read. 01660 bool ReadNextLine() { 01661 buf.Clr(); comment.Clr(); 01662 bool inComment = false, first = true; 01663 while (true) { 01664 int c = GetCh(); 01665 if (c == EOF) return ! first; 01666 else if (c == 13) { 01667 c = GetCh(); if (c != 10) PutBack(c); 01668 return true; } 01669 else if (c == 10) return true; 01670 else if (c == '#') inComment = true; 01671 if (! inComment) buf += char(c); 01672 else comment += char(c); } 01673 /*first = false;*/} 01674 private: 01675 TUcdFileReader& operator = (const TUcdFileReader& r) { Fail; return *((TUcdFileReader *) 0); } 01676 TUcdFileReader(const TUcdFileReader& r) { Fail; } 01677 public: 01678 TUcdFileReader() : f(0) { } 01679 TUcdFileReader(const TStr& fileName) : f(0), putBackCh(EOF) { Open(fileName); } 01680 void Open(const TStr& fileName) { Close(); f = fopen(fileName.CStr(), "rt"); IAssertR(f, fileName); putBackCh = EOF; } 01681 void Close() { putBackCh = EOF; if (f) { fclose(f); f = 0; }} 01682 ~TUcdFileReader() { Close(); } 01683 bool GetNextLine(TStrV& dest) { 01684 dest.Clr(); 01685 while (true) { 01686 if (! 
ReadNextLine()) return false; 01687 TStr line = buf; line.ToTrunc(); 01688 if (line.Len() <= 0) continue; 01689 line.SplitOnAllCh(';', dest, false); 01690 for (int i = 0; i < dest.Len(); i++) dest[i].ToTrunc(); 01691 return true; }} 01692 static int ParseCodePoint(const TStr& s) { 01693 int c; bool ok = s.IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); return c; } 01694 static void ParseCodePointList(const TStr& s, TIntV& dest, bool ClrDestP = true) { // space-separated list 01695 if (ClrDestP) dest.Clr(); 01696 TStrV parts; s.SplitOnWs(parts); 01697 for (int i = 0; i < parts.Len(); i++) { 01698 int c; bool ok = parts[i].IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); 01699 dest.Add(c); } } 01700 static void ParseCodePointRange(const TStr& s, int& from, int &to) { // xxxx or xxxx..yyyy 01701 int i = s.SearchStr(".."); if (i < 0) { from = ParseCodePoint(s); to = from; return; } 01702 from = ParseCodePoint(s.GetSubStr(0, i - 1)); 01703 to = ParseCodePoint(s.GetSubStr(i + 2, s.Len() - 1)); } 01704 }; 01705 01706 //------------------------------------------------------------------------- 01707 // Helper class for processing the text files 01708 //------------------------------------------------------------------------- 01709 // Files such as DerivedCoreProps.txt often refer to ranges of codepoints, 01710 // and not all codepoints from the range have also been listed in 01711 // UnicodeData.txt. Thus, new TUniChInfo instances will be created 01712 // when processing DerivedCoreProps.txt and similar files. 01713 // To assign the correct (sub)categories to these new codepoints, 01714 // the following class will extract the subcategory info from the 01715 // comments in DerivedCoreProps.txt and similar files. 01716 01717 class TSubcatHelper 01718 { 01719 public: 01720 bool hasCat; TUniChSubCategory subCat; 01721 TStrH invalidCatCodes; 01722 TUniChDb &owner; 01723 01724 TSubcatHelper(TUniChDb &owner_) : owner(owner_) { } 01725 01726 void ProcessComment(TUniChDb::TUcdFileReader &reader) 01727 { 01728 hasCat = false; subCat = ucOtherNotAssigned; 01729 if (reader.comment.Len() > 3) 01730 { 01731 IAssert(reader.comment[0] == '#'); 01732 IAssert(reader.comment[1] == ' '); 01733 char chCat = reader.comment[2], chSubCat = reader.comment[3]; 01734 if (reader.comment.Len() > 4) IAssert(isspace(uchar(reader.comment[4]))); 01735 if (TUniChInfo::IsValidSubCat(chCat, chSubCat)) { 01736 hasCat = true; subCat = (TUniChSubCategory) ((int(uchar(chCat)) << 8) | (int(uchar(chSubCat)))); } 01737 else invalidCatCodes.AddKey(TStr(chCat) + TStr(chSubCat)); 01738 } 01739 } 01740 01741 void SetCat(const int cp) { 01742 int i = owner.h.GetKeyId(cp); IAssert(i >= 0); 01743 IAssert(owner.h[i].subCat == ucOtherNotAssigned); 01744 IAssert(hasCat); 01745 owner.h[i].SetCatAndSubCat(subCat); } 01746 void TestCat(const int cp) { 01747 if (! hasCat) return; 01748 int i = owner.h.GetKeyId(cp); IAssert(i >= 0); 01749 IAssert(owner.h[i].subCat == subCat); } 01750 01751 ~TSubcatHelper() 01752 { 01753 if (invalidCatCodes.IsKey("L&")) invalidCatCodes.DelKey("L&"); 01754 // Output any unexpected ones (there shouldn't be any). 01755 if (! 
invalidCatCodes.Empty()) { 01756 printf("Invalid cat code(s) in the comments: "); 01757 for (int i = invalidCatCodes.FFirstKeyId(); invalidCatCodes.FNextKeyId(i); ) 01758 printf(" \"%s\"", invalidCatCodes.GetKey(i).CStr()); 01759 printf("\n"); } 01760 } 01761 }; 01762 }; 01763 01764 //----------------------------------------------------------------------------- 01765 // TUnicode -- a sadly emasculated wrapper around TUniCodec and TUniChDb 01766 //----------------------------------------------------------------------------- 01767 01768 class TUnicode 01769 { 01770 public: 01771 TUniCodec codec; 01772 TUniChDb ucd; 01773 01774 TUnicode() { Init(); } 01775 explicit TUnicode(const TStr& fnBinUcd) { ucd.LoadBin(fnBinUcd); Init(); } 01776 void Init() { InitCodecs(); } 01777 01778 //----------------------------------------------------------------------- 01779 // UTF-8 01780 //----------------------------------------------------------------------- 01781 01782 // Returns the number of characters that have been successfully decoded. 01783 // This does not include any replacement characters that may have been inserted into 'dest'. 01784 int DecodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); } 01785 int DecodeUtf8(const TStr& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); } 01786 01787 // Returns the number of characters that have been successfully encoded. 01788 // This does not include any replacement characters that may have been inserted into 'dest'. 01789 int EncodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.EncodeUtf8(src, dest); } 01790 01791 // The following wrapper around the UTF-8 encoder returns a TStr containing 01792 // the UTF-8-encoded version of the input string. 01793 TStr EncodeUtf8Str(const TIntV& src) const { return codec.EncodeUtf8Str(src); } 01794 01795 //----------------------------------------------------------------------- 01796 // UTF-16 Decoder 01797 //----------------------------------------------------------------------- 01798 01799 // Returns the number of characters that have been successfully decoded. 01800 // This does not include any replacement characters that may have been inserted into 'dest'. 01801 // Each element of 'src' is assumed to contain one byte of data. 01802 // srcCount must be even (though srcIdx doesn't need to be). 01803 int DecodeUtf16FromBytes(const TIntV& src, TIntV& dest, 01804 const TUtf16BomHandling bomHandling = bomAllowed, 01805 const TUniByteOrder defaultByteOrder = boMachineEndian) const { 01806 return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); } 01807 01808 // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings 01809 // are used to determine if the two bytes of each word should be swapped before further 01810 // processing. For example, if a BOM is present, it must have the value 0xfeff; if it 01811 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped. 01812 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the 01813 // beginning of the source data is used to determine the "original" byte order of the data; 01814 // if this doesn't match the byte order of the local machine, the two bytes of each word will 01815 // be swapped during the decoding process. 
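// Illustrative sketch (editorial; 'uni' is a hypothetical TUnicode instance) of a round trip through the word-oriented UTF-16 interface:
//   TIntV cps, words, back;
//   cps.Add(0x20ac); cps.Add(0x1d11e);          // EURO SIGN, MUSICAL SYMBOL G CLEF
//   uni.EncodeUtf16ToWords(cps, words, true);   // BOM, 0x20ac, surrogate pair 0xd834 0xdd1e
//   uni.DecodeUtf16FromWords(words, back);      // back should again hold 0x20ac, 0x1d11e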
01816 int DecodeUtf16FromWords(const TIntV& src, TIntV& dest, 01817 const TUtf16BomHandling bomHandling = bomAllowed, 01818 const TUniByteOrder defaultByteOrder = boMachineEndian) const { 01819 return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); } 01820 01821 //----------------------------------------------------------------------- 01822 // UTF-16 Encoder 01823 //----------------------------------------------------------------------- 01824 01825 // Returns the number of characters that have been successfully encoded. 01826 // This does not include any replacement characters that may have been inserted into 'dest'. 01827 int EncodeUtf16ToWords(const TIntV& src, TIntV& dest, const bool insertBom, 01828 const TUniByteOrder destByteOrder = boMachineEndian) const { 01829 return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); } 01830 01831 int EncodeUtf16ToBytes(const TIntV& src, TIntV& dest, const bool insertBom, 01832 const TUniByteOrder destByteOrder = boMachineEndian) const { 01833 return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); } 01834 01835 //----------------------------------------------------------------------- 01836 // 8-bit codecs 01837 //----------------------------------------------------------------------- 01838 01839 T8BitCodec<TEncoding_ISO8859_1> iso8859_1; 01840 T8BitCodec<TEncoding_ISO8859_2> iso8859_2; 01841 T8BitCodec<TEncoding_ISO8859_3> iso8859_3; 01842 T8BitCodec<TEncoding_ISO8859_4> iso8859_4; 01843 T8BitCodec<TEncoding_YuAscii> yuAscii; 01844 T8BitCodec<TEncoding_CP1250> cp1250; 01845 T8BitCodec<TEncoding_CP852> cp852; 01846 T8BitCodec<TEncoding_CP437> cp437; 01847 01848 //----------------------------------------------------------------------- 01849 // Codec registry 01850 //----------------------------------------------------------------------- 01851 // If you know you'll need ISO-8859-2, just use 01852 // TUnicode unicode; 01853 // unicode.iso8859_2.Encode(...); 01854 // If you don't know what you'll need, use: 01855 // TUnicode unicode; 01856 // PCodecBase myCodec = unicode.GetCodec(myCodecName); 01857 // myCodec->Encode(...); 01858 // Note that the first approach is slightly more efficient because there 01859 // aren't any virtual method calls involved. 01860 01861 protected: 01862 THash<TStr, PCodecBase> codecs; 01863 static inline TStr NormalizeCodecName(const TStr& name) { 01864 TStr s = name.GetLc(); s.ChangeStrAll("_", ""); s.ChangeStrAll("-", ""); return s; } 01865 public: 01866 void RegisterCodec(const TStr& nameList, const PCodecBase& codec) { 01867 TStrV names; nameList.SplitOnWs(names); 01868 for (int i = 0; i < names.Len(); i++) 01869 codecs.AddDat(NormalizeCodecName(names[i]), codec); } 01870 void UnregisterCodec(const TStr& nameList) { 01871 TStrV names; nameList.SplitOnWs(names); 01872 for (int i = 0; i < names.Len(); i++) 01873 codecs.DelKey(NormalizeCodecName(names[i])); } 01874 void ClrCodecs() { codecs.Clr(); } 01875 void InitCodecs(); 01876 PCodecBase GetCodec(const TStr& name) const { 01877 TStr s = NormalizeCodecName(name); 01878 PCodecBase p; if (! codecs.IsKeyGetDat(s, p)) p.Clr(); 01879 return p; } 01880 void GetAllCodecs(TCodecBaseV& dest) const { 01881 dest.Clr(); 01882 for (int i = codecs.FFirstKeyId(); codecs.FNextKeyId(i); ) { 01883 PCodecBase codec = codecs[i]; bool found = false; 01884 for (int j = 0; j < dest.Len(); j++) if (dest[j]() == codec()) { found = true; break; } 01885 if (! 
found) dest.Add(codec); }} 01886 01887 //------------------------------------------------------------------------- 01888 // Word boundaries (UAX #29) 01889 //------------------------------------------------------------------------- 01890 01891 // Finds the next word boundary strictly after 'position'. 01892 // Note that there are valid word boundaries at 0 and at 'src.Len()'. 01893 // If there is no such word boundary, it returns 'false' and sets 'position' to 'src.Len()'. 01894 bool FindNextWordBoundary(const TIntV& src, int &position) const { 01895 if (position < 0) { position = 0; return true; } 01896 size_t position_; bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; } 01897 // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a word 01898 // boundary between 'src[i - 1]' and 'src[i]'. Note that 'dest[0]' and 'dest[src.Len()]' are 01899 // always set to 'true'. 01900 void FindWordBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindWordBoundaries(src, 0, src.Len(), dest); } 01901 01902 //------------------------------------------------------------------------- 01903 // Sentence boundaries (UAX #29) 01904 //------------------------------------------------------------------------- 01905 01906 // Finds the next sentence boundary strictly after 'position'. 01907 // Note that there are valid sentence boundaries at 0 and at 'src.Len()'. 01908 // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'src.Len()'. 01909 bool FindNextSentenceBoundary(const TIntV& src, int &position) const { 01910 if (position < 0) { position = 0; return true; } 01911 size_t position_; bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; } 01912 // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a sentence 01913 // boundary between 'src[i - 1]' and 'src[i]'. Note that 'dest[0]' and 'dest[src.Len()]' are 01914 // always set to 'true'. 01915 void FindSentenceBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); } 01916 01917 void ClrSentenceBoundaryExceptions() { ucd.SbEx_Clr(); } 01918 void UseEnglishSentenceBoundaryExceptions() { ucd.SbEx_SetStdEnglish(); } 01919 01920 //------------------------------------------------------------------------- 01921 // Normalization, decomposition, etc. (UAX #15) 01922 //------------------------------------------------------------------------- 01923 01924 // This sets 'dest' to the decomposed form of the source string. 01925 // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false; 01926 // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true. 01927 void Decompose(const TIntV& src, TIntV& dest, bool compatibility) const { ucd.Decompose(src, dest, compatibility, true); } 01928 // This performs canonical composition on the source string, and stores 01929 // the result in the destination vector. The source string should be the 01930 // result of a (canonical or compatibility) decomposition; if this is the 01931 // case, the composition will lead to a normalization form C (NFC) or 01932 // normalization form KC (NFKC), depending on whether canonical or compatibility 01933 // decomposition was used. 
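// Illustrative note (editorial): for example, Decompose of { 0x00e9 } (LATIN SMALL LETTER E WITH ACUTE) yields { 0x0065, 0x0301 }; passing that result to Compose yields { 0x00e9 } again, i.e. the NFC form.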
01934 void Compose(const TIntV& src, TIntV& dest) const { return ucd.Compose(src, dest, true); } 01935 // Calls Decompose, followed by Compose; thus the result is the NFC (if 01936 // compatibility == false) or NFKC (if compatibility == true) of the source string. 01937 // A temporary TIntV is used to contain the intermediate NF(K)D form of the 01938 // source string. 01939 void DecomposeAndCompose(const TIntV& src, TIntV& dest, bool compatibility) const { return ucd.DecomposeAndCompose(src, dest, compatibility); } 01940 // Copies the starter characters from 'src' to 'dest'; the other 01941 // characters are skipped. 'src' should already have been decomposed. 01942 // Returns the number of characters extracted. This function can be 01943 // used to remove diacritical marks from a string (after it has been decomposed!). 01944 int ExtractStarters(const TIntV& src, TIntV& dest) const { return (int) ucd.ExtractStarters(src, dest); } 01945 // Extracts the starters into a temporary vector and then copies it into 'src'. 01946 int ExtractStarters(TIntV& src) const { return (int) ucd.ExtractStarters(src); } 01947 01948 //------------------------------------------------------------------------- 01949 // Case conversions 01950 //------------------------------------------------------------------------- 01951 // NOTE: if you will be dealing with Turkish, Azeri or Lithuanian text, 01952 // use the case-conversion methods in TUniChDb, which allow the caller 01953 // to request language-specific case mappings for these languages. 01954 01955 public: 01956 typedef TUniChDb::TCaseConversion TCaseConversion; 01957 // Sets 'dest' to the case-converted form of 'src'. 01958 void GetLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetLowerCase(src, dest, true, false, false); } 01959 void GetUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetUpperCase(src, dest, true, false, false); } 01960 void GetTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetTitleCase(src, dest, true, false, false); } 01961 01962 // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt). 01963 // This is simpler and faster. Since each character now maps into exactly one 01964 // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.). 01965 void GetSimpleLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleLowerCase(src, dest, true); } 01966 void GetSimpleUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleUpperCase(src, dest, true); } 01967 void GetSimpleTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleTitleCase(src, dest, true); } 01968 01969 // These functions perform simple case-conversions in-place. 01970 void ToSimpleUpperCase(TIntV& src) const { ucd.ToSimpleUpperCase(src); } 01971 void ToSimpleLowerCase(TIntV& src) const { ucd.ToSimpleLowerCase(src); } 01972 void ToSimpleTitleCase(TIntV& src) const { ucd.ToSimpleTitleCase(src); } 01973 01974 // Case folding is an alternative to the above functions. It is intended primarily 01975 // to produce strings that are suitable for comparisons. For example, 01976 // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma; 01977 // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma. 01978 // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped 01979 // into a string of two or more characters. 01980 // - Note: For best results, perform NFD(CaseFold(NFD(x)) or NFKD(CaseFold(NFKD(x)) on 01981 // each string before comparing them (see sec. 
3.13 of the standard). 01982 void GetCaseFolded(const TIntV& src, TIntV& dest, const bool full = true) const { return ucd.GetCaseFolded(src, dest, true, full, false); } 01983 // ToCaseFolded folds the string in place. However, this means that only the simple 01984 // case foldings can be used (the full ones could increase the length of the string). 01985 void ToCaseFolded(TIntV& src) const { return ucd.ToCaseFolded(src, false); } 01986 01987 TStr GetUtf8CaseFolded(const TStr& s) const { 01988 bool isAscii = true; 01989 for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; } 01990 if (isAscii) return s.GetLc(); 01991 TIntV src; DecodeUtf8(s, src); 01992 TIntV dest; GetCaseFolded(src, dest); 01993 return EncodeUtf8Str(dest); } 01994 01995 //------------------------------------------------------------------------- 01996 // Character properties 01997 //------------------------------------------------------------------------- 01998 // These methods simply call the corresponding TUniChDb method 01999 // (which typically calls the corresponding method of TUniChInfo). 02000 // See the declaration for DECLARE_FORWARDED_PROPERTY_METHODS for a complete list. 02001 // They are all of the form bool IsXxxx(const int cp) const 02002 // Some of the more notable ones include: 02003 // - IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsAsciiHexDigit 02004 // IsDash, IsDeprecated, IsDiacritic, IsHexDigit, IsHyphen, IsIdeographic 02005 // IsNoncharacter, IsQuotationMark, IsSoftDotted, IsTerminalPunctuation, IsWhiteSpace 02006 02007 #define ___UniFwd1(name) bool name(const int cp) const { return ucd.name(cp); } 02008 DECLARE_FORWARDED_PROPERTY_METHODS 02009 #undef DECLARE_FORWARDED_PROPERTY_METHODS 02010 #undef __UniFwd1 02011 ___UniFwd2(IsPrivateUse, IsSurrogate) 02012 02013 TUniChCategory GetCat(const int cp) const { return ucd.GetCat(cp); } 02014 TUniChSubCategory GetSubCat(const int cp) const { return ucd.GetSubCat(cp); } 02015 02016 // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234". 02017 const char *GetCharName(const int cp) const { return ucd.GetCharName(cp); } 02018 TStr GetCharNameS(const int cp) const { return ucd.GetCharNameS(cp); } 02019 02020 }; 02021 02022 //----------------------------------------------------------------------------- 02023 // TUniCodec -- UTF-8 Decoder 02024 //----------------------------------------------------------------------------- 02025 02026 // Returns the number of characters that have been successfully decoded. 02027 // This does not include any replacement characters that may have been inserted into 'dest'. 02028 template<typename TSrcVec, typename TDestCh> 02029 size_t TUniCodec::DecodeUtf8( 02030 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02031 TVec<TDestCh>& dest, const bool clrDest) const 02032 { 02033 size_t nDecoded = 0; 02034 if (clrDest) dest.Clr(); 02035 const size_t origSrcIdx = srcIdx; 02036 const size_t srcEnd = srcIdx + srcCount; 02037 while (srcIdx < srcEnd) 02038 { 02039 const size_t charSrcIdx = srcIdx; 02040 uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++; 02041 if ((c & _1000_0000) == 0) { 02042 // c is one of the characters 0..0x7f, encoded as a single byte. 02043 dest.Add(TDestCh(c)); nDecoded++; continue; } 02044 else if ((c & _1100_0000) == _1000_0000) { 02045 // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx. 02046 // We must have been thrown into the middle of a multi-byte character. 
02047 switch (errorHandling) { 02048 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx."); 02049 case uehAbort: return nDecoded; 02050 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02051 case uehIgnore: continue; 02052 default: Fail; } } 02053 else 02054 { 02055 // c introduces a sequence of 2..6 bytes, depending on how many 02056 // of the most significant bits of c are set. 02057 uint nMoreBytes = 0, nBits = 0, minVal = 0; 02058 if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80; 02059 else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800; 02060 else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000; 02061 else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000; 02062 else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000; 02063 else { 02064 // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8 02065 // (which allowed the encoding of codepoints up to 2^31 - 1). However, in principle this 02066 // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh 02067 // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh. 02068 if (strict) { 02069 switch (errorHandling) { 02070 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x."); 02071 case uehAbort: return nDecoded; 02072 // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes 02073 // and try to decode the character. Then, since 'strict' is true and 02074 // the codepoint is clearly >= 2^31, we'll notice this as an error later 02075 // and (in the case of uehReplace) insert a replacement character then. 02076 // This is probably better than inserting a replacement character right 02077 // away and then trying to read the next byte as if a new character 02078 // was beginning there -- if the current byte is really followed by five 02079 // 10xxxxxx bytes, we'll just get six replacement characters in a row. 02080 case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue; 02081 case uehIgnore: break; // continue; 02082 default: Fail; } } 02083 nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; } 02084 // Decode this multi-byte sequence. 02085 uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c. 02086 bool cancel = false; 02087 for (uint i = 0; i < nMoreBytes && ! cancel; i++) { 02088 // See if there are enough bytes left in the source vector. 02089 if (! (srcIdx < srcEnd)) { 02090 switch (errorHandling) { 02091 case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available."); 02092 case uehAbort: return nDecoded; 02093 case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue; 02094 case uehIgnore: cancel = true; continue; 02095 default: Fail; } } 02096 // Read the next byte. 02097 c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++; 02098 if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx. 
02099 switch (errorHandling) { 02100 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx."); 02101 case uehAbort: return nDecoded; 02102 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue; 02103 case uehIgnore: srcIdx--; cancel = true; continue; 02104 default: Fail; } } 02105 cOut <<= 6; cOut |= (c & _0011_1111); } 02106 if (cancel) continue; 02107 if (strict) { 02108 // err1: This codepoint has been represented by more bytes than it should have been. 02109 // For example, cOut in the range 0..127 should be represented by a single byte, 02110 // not by two or more bytes. 02111 // - For example, this may happen in the "modified UTF-8" sometimes used for Java 02112 // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid 02113 // the appearance of null bytes in the encoded stream. 02114 bool err1 = (cOut < minVal); 02115 // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes. 02116 // However, later this was restricted to the codepoints 0..0x10ffff only, because only these 02117 // are valid Unicode codepoints. Thus, no more than 4 bytes are ever necessary. 02118 bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff)); 02119 if (err1 || err2) switch (errorHandling) { 02120 case uehThrow: 02121 if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ")."); 02122 else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid."); 02123 else { Fail; break; } 02124 case uehAbort: return nDecoded; 02125 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02126 case uehIgnore: continue; 02127 default: Fail; } } 02128 // Add the decoded codepoint to the destination vector. 02129 // If this is the first decoded character, and it's one of the byte-order marks 02130 // (0xfffe and 0xfeff), we will skip it (unless skipBom is false). 02131 if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) { 02132 dest.Add(cOut); nDecoded++; } 02133 } // else (multi-byte sequence) 02134 } // while 02135 return nDecoded; 02136 } 02137 02138 //----------------------------------------------------------------------- 02139 // TUniCodec -- UTF-8 Encoder 02140 //----------------------------------------------------------------------- 02141 02142 // Returns the number of characters that have been successfully encoded. 02143 // This does not include any replacement characters that may have been inserted into 'dest'. 
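// Usage sketch (an illustration for this reference; the local names 'codec', 's',
// 'codePoints' and 'utf8Bytes' are hypothetical, and only methods defined in this file
// are used):
//
//   TUniCodec codec;
//   TStr s = "na\xc3\xafve";   // UTF-8 input; \xc3 \xaf encode U+00EF
//   TIntV codePoints, utf8Bytes;
//   codec.DecodeUtf8(s, 0, s.Len(), codePoints, true);                   // bytes -> codepoints 6e 61 ef 76 65
//   codec.EncodeUtf8(codePoints, 0, codePoints.Len(), utf8Bytes, true);  // codepoints -> the original bytes
//
// The number of bytes emitted per codepoint follows from the ranges tested below:
// 1 byte for c < 0x80, 2 for c < 0x800, 3 for c < 0x10000, 4 for c < 0x200000,
// 5 for c < 0x4000000 and 6 otherwise; with strict == true, anything above 0x10ffff is
// treated as an error instead. For example, U+20AC falls into the three-byte range and
// is emitted as e2 82 ac.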
02144 template<typename TSrcVec, typename TDestCh> 02145 size_t TUniCodec::EncodeUtf8( 02146 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02147 TVec<TDestCh>& dest, const bool clrDest) const 02148 { 02149 size_t nEncoded = 0; 02150 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) 02151 { 02152 uint c = uint(src[TVecIdx(srcIdx)]); 02153 bool err = false; 02154 if (strict && c > 0x10ffff) { 02155 err = true; 02156 switch (errorHandling) { 02157 case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed)."); 02158 case uehAbort: return nEncoded; 02159 case uehReplace: c = replacementChar; break; 02160 case uehIgnore: continue; 02161 default: Fail; } } 02162 if (c < 0x80u) 02163 dest.Add(TDestCh(c & 0xffu)); 02164 else if (c < 0x800u) { 02165 dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111))); 02166 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02167 else if (c < 0x10000u) { 02168 dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111))); 02169 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); 02170 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02171 else if (c < 0x200000u) { 02172 dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111))); 02173 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111))); 02174 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); 02175 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02176 else if (c < 0x4000000u) { 02177 dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011))); 02178 dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111))); 02179 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111))); 02180 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); 02181 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02182 else { 02183 dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011))); 02184 dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111))); 02185 dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111))); 02186 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111))); 02187 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); 02188 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02189 if (! err) nEncoded++; 02190 } 02191 return nEncoded; 02192 } 02193 02194 //----------------------------------------------------------------------- 02195 // TUniCodec -- UTF-16 Decoder 02196 //----------------------------------------------------------------------- 02197 02198 // Returns the number of characters that have been successfully decoded. 02199 // This does not include any replacement characters that may have been inserted into 'dest'. 02200 // Each element of 'src' is assumed to contain one byte of data. 02201 // srcCount must be even (though srcIdx doesn't need to be).
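// For illustration (hypothetical local names): the byte sequence ff fe ac 20 is a
// little-endian BOM followed by U+20AC, so
//
//   TUniCodec codec;
//   TIntV bytes; bytes.Add(0xff); bytes.Add(0xfe); bytes.Add(0xac); bytes.Add(0x20);
//   TIntV cps;
//   codec.DecodeUtf16FromBytes(bytes, 0, bytes.Len(), cps, true, bomAllowed, boMachineEndian);
//
// detects the little-endian byte order from the BOM and (with skipBom set) leaves the
// single codepoint 0x20ac in 'cps'.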
02202 template<typename TSrcVec, typename TDestCh> 02203 size_t TUniCodec::DecodeUtf16FromBytes( 02204 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02205 TVec<TDestCh>& dest, const bool clrDest, 02206 const TUtf16BomHandling bomHandling, 02207 const TUniByteOrder defaultByteOrder) const 02208 { 02209 IAssert(srcCount % 2 == 0); 02210 IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored); 02211 IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian); 02212 if (clrDest) dest.Clr(); 02213 size_t nDecoded = 0; 02214 if (srcCount <= 0) return nDecoded; 02215 const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; 02216 bool littleEndian = false; 02217 bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian())); 02218 if (bomHandling == bomIgnored) littleEndian = leDefault; 02219 else if (bomHandling == bomAllowed || bomHandling == bomRequired) 02220 { 02221 int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; 02222 if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; } 02223 else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; } 02224 else if (bomHandling == bomAllowed) littleEndian = leDefault; 02225 else { // Report an error. 02226 switch (errorHandling) { 02227 case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead)."); 02228 case uehAbort: case uehReplace: case uehIgnore: return size_t(-1); 02229 default: Fail; } } 02230 } 02231 else Fail; 02232 while (srcIdx < srcEnd) 02233 { 02234 const size_t charSrcIdx = srcIdx; 02235 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2; 02236 uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8)); 02237 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) 02238 { 02239 // c is the first character in a surrogate pair. Read the next character. 02240 if (! (srcIdx + 2 <= srcEnd)) { 02241 switch (errorHandling) { 02242 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing."); 02243 case uehAbort: return nDecoded; 02244 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02245 case uehIgnore: continue; 02246 default: Fail; } } 02247 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2; 02248 uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8)); 02249 // c2 should be the second character of the surrogate pair. 02250 if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) { 02251 switch (errorHandling) { 02252 case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." 
+ TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + "."); 02253 case uehAbort: return nDecoded; 02254 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character 02255 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue; 02256 case uehIgnore: srcIdx -= 2; continue; 02257 default: Fail; } } 02258 // c and c2 each contain 10 bits of information. 02259 uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate); 02260 cc += 0x10000; 02261 dest.Add(TDestCh(cc)); nDecoded++; continue; 02262 } 02263 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) { 02264 switch (errorHandling) { 02265 case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair."); 02266 case uehAbort: return nDecoded; 02267 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02268 case uehIgnore: continue; 02269 default: Fail; } } 02270 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it. 02271 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue; 02272 // Otherwise, store 'c' to the destination vector. 02273 dest.Add(TDestCh(c)); nDecoded++; 02274 } 02275 return nDecoded; 02276 } 02277 02278 // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings 02279 // are used to determine if the two bytes of each word should be swapped before further 02280 // processing. For example, if a BOM is present, it must have the value 0xfeff; if it 02281 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped. 02282 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the 02283 // beginning of the source data is used to determine the "original" byte order of the data; 02284 // if this doesn't match the byte order of the local machine, the two bytes of each word will 02285 // be swapped during the decoding process. 02286 template<typename TSrcVec, typename TDestCh> 02287 size_t TUniCodec::DecodeUtf16FromWords( 02288 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02289 TVec<TDestCh>& dest, bool clrDest, 02290 const TUtf16BomHandling bomHandling, 02291 const TUniByteOrder defaultByteOrder) const 02292 { 02293 IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored); 02294 IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian); 02295 if (clrDest) dest.Clr(); 02296 size_t nDecoded = 0; 02297 if (srcCount <= 0) return nDecoded; 02298 const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; 02299 bool swap = false; 02300 bool isMachineLe = IsMachineLittleEndian(); 02301 bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe)); 02302 if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe); 02303 else if (bomHandling == bomAllowed || bomHandling == bomRequired) 02304 { 02305 int c = uint(src[TVecIdx(srcIdx)]) & 0xffff; 02306 if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; } 02307 else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; } 02308 else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe); 02309 else { // Report an error. 
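// (This is the bomRequired case with no BOM present -- the first word is neither
// 0xfeff nor 0xfffe. Note that, unlike most error paths, the non-throwing modes
// below report this by returning size_t(-1) rather than a count of decoded characters.)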
02310 switch (errorHandling) { 02311 case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead)."); 02312 case uehAbort: case uehReplace: case uehIgnore: return size_t(-1); 02313 default: Fail; } } 02314 } 02315 else Fail; 02316 while (srcIdx < srcEnd) 02317 { 02318 const size_t charSrcIdx = srcIdx; 02319 uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++; 02320 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8); 02321 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) 02322 { 02323 // c is the first character in a surrogate pair. Read the next character. 02324 if (! (srcIdx < srcEnd)) { 02325 switch (errorHandling) { 02326 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing."); 02327 case uehAbort: return nDecoded; 02328 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02329 case uehIgnore: continue; 02330 default: Fail; } } 02331 uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++; 02332 if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); 02333 // c2 should be the second character of the surrogate pair. 02334 if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) { 02335 switch (errorHandling) { 02336 case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + "."); 02337 case uehAbort: return nDecoded; 02338 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character 02339 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue; 02340 case uehIgnore: srcIdx -= 1; continue; 02341 default: Fail; } } 02342 // c and c2 each contain 10 bits of information. 02343 uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate); 02344 cc += 0x10000; 02345 dest.Add(TDestCh(cc)); nDecoded++; continue; 02346 } 02347 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) { 02348 switch (errorHandling) { 02349 case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair."); 02350 case uehAbort: return nDecoded; 02351 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02352 case uehIgnore: continue; 02353 default: Fail; } } 02354 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it. 02355 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue; 02356 // Otherwise, store 'c' to the destination vector. 02357 dest.Add(TDestCh(c)); nDecoded++; 02358 } 02359 return nDecoded; 02360 } 02361 02362 //----------------------------------------------------------------------- 02363 // TUniCodec -- UTF-16 Encoder 02364 //----------------------------------------------------------------------- 02365 02366 // Returns the number of characters that have been successfully encoded. 02367 // This does not include any replacement characters that may have been inserted into 'dest'. 
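// Worked example (assuming the usual surrogate bases, Utf16FirstSurrogate == 0xd800 and
// Utf16SecondSurrogate == 0xdc00, which is what the arithmetic below relies on): to encode
// U+1D11E, c -= 0x10000 gives 0xd11e; the high 10 bits (0x34) plus 0xd800 give the first
// word 0xd834, and the low 10 bits (0x11e) plus 0xdc00 give the second word 0xdd1e.
// Codepoints up to 0xffff are emitted as a single word, byte-swapped if the requested
// byte order differs from the machine's.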
02368 template<typename TSrcVec, typename TDestCh> 02369 size_t TUniCodec::EncodeUtf16ToWords( 02370 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02371 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom, 02372 const TUniByteOrder destByteOrder) const 02373 { 02374 bool isMachineLe = IsMachineLittleEndian(); 02375 bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe); 02376 size_t nEncoded = 0, srcEnd = srcIdx + srcCount; 02377 if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; } 02378 while (srcIdx < srcEnd) 02379 { 02380 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++; 02381 if (! (c <= 0x10ffffu)) { 02382 switch (errorHandling) { 02383 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ")."); 02384 case uehAbort: return nEncoded; 02385 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02386 case uehIgnore: continue; 02387 default: Fail; } } 02388 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) { 02389 switch (errorHandling) { 02390 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ")."); 02391 case uehAbort: return nEncoded; 02392 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02393 case uehIgnore: continue; 02394 default: Fail; } } 02395 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) { 02396 switch (errorHandling) { 02397 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true."); 02398 case uehAbort: return nEncoded; 02399 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02400 case uehIgnore: continue; 02401 default: Fail; } } 02402 // If c is <= 0xffff, it can be stored directly. 02403 if (c <= 0xffffu) { 02404 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8); 02405 dest.Add(TDestCh(c)); nEncoded++; continue; } 02406 // Otherwise, represent c by a pair of surrogate characters. 02407 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu); 02408 uint c1 = (c >> 10) & 1023, c2 = c & 1023; 02409 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate; 02410 if (swap) { 02411 c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8); 02412 c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); } 02413 dest.Add(TDestCh(c1)); 02414 dest.Add(TDestCh(c2)); 02415 nEncoded++; continue; 02416 } 02417 return nEncoded; 02418 } 02419 02420 template<typename TSrcVec, typename TDestCh> 02421 size_t TUniCodec::EncodeUtf16ToBytes( 02422 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02423 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom, 02424 const TUniByteOrder destByteOrder) const 02425 { 02426 bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian())); 02427 size_t nEncoded = 0, srcEnd = srcIdx + srcCount; 02428 if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 
0xfe : 0xff); nEncoded++; } 02429 while (srcIdx < srcEnd) 02430 { 02431 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++; 02432 if (! (c <= 0x10ffffu)) { 02433 switch (errorHandling) { 02434 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ")."); 02435 case uehAbort: return nEncoded; 02436 #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); } 02437 case uehReplace: ___OutRepl; continue; 02438 case uehIgnore: continue; 02439 default: Fail; } } 02440 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) { 02441 switch (errorHandling) { 02442 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ")."); 02443 case uehAbort: return nEncoded; 02444 case uehReplace: ___OutRepl; continue; 02445 case uehIgnore: continue; 02446 default: Fail; } } 02447 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) { 02448 switch (errorHandling) { 02449 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true."); 02450 case uehAbort: return nEncoded; 02451 case uehReplace: ___OutRepl; continue; 02452 case uehIgnore: continue; 02453 default: Fail; } } 02454 #undef ___OutRepl 02455 // If c is <= 0xffff, it can be stored directly. 02456 if (c <= 0xffffu) { 02457 if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); } 02458 else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } 02459 nEncoded++; continue; } 02460 // Otherwise, represent c by a pair of surrogate characters. 02461 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu); 02462 uint c1 = (c >> 10) & 1023, c2 = c & 1023; 02463 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate; 02464 if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); } 02465 else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); } 02466 nEncoded++; continue; 02467 } 02468 return nEncoded; 02469 } 02470 02471 //----------------------------------------------------------------------------- 02472 // TUniChDb -- word boundaries 02473 //----------------------------------------------------------------------------- 02474 02475 template<typename TSrcVec> 02476 bool TUniChDb::FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const 02477 { 02478 // WB1. Break at the start of text. 02479 if (position < srcIdx) { position = srcIdx; return true; } 02480 // If we are beyond the end of the text, there aren't any word breaks left. 02481 const size_t srcEnd = srcIdx + srcCount; 02482 if (position >= srcEnd) return false; 02483 // If 'position' is currently at an ignored character, move it back to the last nonignored character. 02484 size_t origPos = position; 02485 if (IsWbIgnored(src[TVecIdx(position)])) { 02486 if (! 
WbFindPrevNonIgnored(src, srcIdx, position)) 02487 position = origPos; 02488 } 02489 // Determine the previous nonignored character (before 'position'). 02490 size_t posPrev = position; 02491 if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position; 02492 // Sec 6.2. Allow a break between Sep and an ignored character. 02493 if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; } 02494 // Determine the next nonignored character (after 'position'). 02495 size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd); 02496 size_t posNext2; 02497 int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1); 02498 int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1); 02499 int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext); 02500 int cNext2, wbfNext2; 02501 // 02502 for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2, 02503 cPrev = cCur, cCur = cNext, cNext = cNext2, 02504 wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2) 02505 { 02506 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately, 02507 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position' 02508 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)? 02509 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd); 02510 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1); 02511 wbfNext2 = GetWbFlags(cNext2); 02512 #define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue 02513 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue 02514 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue 02515 // WB3. Do not break within CRLF. 02516 if (cCur == 13 && cNext == 10) continue; 02517 // WB5. Do not break between most letters. 02518 TestCurNext(ucfWbALetter, ucfWbALetter); 02519 // WB6. Do not break letters across certain punctuation. 02520 TestCurNext2(ucfWbALetter, ucfWbMidLetter, ucfWbALetter); 02521 // WB7. Do not break letters across certain punctuation. 02522 TestPrevCurNext(ucfWbALetter, ucfWbMidLetter, ucfWbALetter); 02523 // WB8. Do not break within sequences of digits, or digits adjacent to letters. 02524 TestCurNext(ucfWbNumeric, ucfWbNumeric); 02525 // WB9. Do not break within sequences of digits, or digits adjacent to letters. 02526 TestCurNext(ucfWbALetter, ucfWbNumeric); 02527 // WB10. Do not break within sequences of digits, or digits adjacent to letters. 02528 TestCurNext(ucfWbNumeric, ucfWbALetter); 02529 // WB11. Do not break within sequences, such as "3.2" or "3.456,789". 02530 TestPrevCurNext(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric); 02531 // WB12. Do not break within sequences, such as "3.2" or "3.456,789". 02532 TestCurNext2(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric); 02533 // WB13. Do not break between Katakana. 02534 TestCurNext(ucfWbKatakana, ucfWbKatakana); 02535 // WB13a. Do not break from extenders. 
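// (ExtendNumLet covers connector punctuation such as '_', so together with WB13b below
// no boundary is placed inside identifiers like "foo_bar" or "foo_123".)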
02536 if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 && 02537 (wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue; 02538 // WB13b. Do not break from extenders. 02539 if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet && 02540 (wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue; 02541 // WB14. Otherwise, break everywhere. 02542 position = posNext; return true; 02543 #undef TestCurNext 02544 #undef TestCurNext2 02545 #undef TestPrevCurNext 02546 } 02547 // WB2. Break at the end of text. 02548 IAssert(position == srcEnd); 02549 return true; 02550 } 02551 02552 // ToDo: provide a more efficient implementation of this. 02553 template<typename TSrcVec> 02554 void TUniChDb::FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const 02555 { 02556 if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1)); 02557 dest.PutAll(false); 02558 size_t position = srcIdx; 02559 dest[TVecIdx(position - srcIdx)] = true; 02560 while (position < srcIdx + srcCount) 02561 { 02562 size_t oldPos = position; 02563 FindNextWordBoundary(src, srcIdx, srcCount, position); 02564 Assert(oldPos < position); Assert(position <= srcIdx + srcCount); 02565 dest[TVecIdx(position - srcIdx)] = true; 02566 } 02567 Assert(dest[TVecIdx(srcCount)]); 02568 } 02569 02570 //----------------------------------------------------------------------------- 02571 // TUniChDb -- sentence boundaries 02572 //----------------------------------------------------------------------------- 02573 02574 template<typename TSrcVec> 02575 bool TUniChDb::CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const 02576 { 02577 if (sbExTrie.Empty()) return true; 02578 // We'll move back from the position where a sentence-boundary is being considered. 02579 size_t pos = position; 02580 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02581 int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c); 02582 // - Skip the Sep, if there is one. 02583 if ((c & ucfSbSep) == ucfSbSep) { 02584 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02585 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02586 // - Skip any Sp characters. 02587 while ((sfb & ucfSbSp) == ucfSbSp) { 02588 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02589 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02590 // - Skip any Close characters. 02591 while ((sfb & ucfSbSp) == ucfSbSp) { 02592 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02593 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02594 // - Skip any ATerm | STerm characters. 02595 while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) { 02596 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02597 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02598 // Now start moving through the trie. 02599 int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1; 02600 while (true) 02601 { 02602 bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos)); 02603 c = (atEnd ? -1 : (int) src[TVecIdx(pos)]); 02604 TUniChCategory cat = GetCat(c); 02605 if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) { 02606 // Check if the suffix we've read so far is one of those that appear in the trie. 02607 if (len == 1) return ! sbExTrie.Has1Gram(cLast); 02608 if (len == 2) return ! 
sbExTrie.Has2Gram(cLast, cButLast); 02609 IAssert(len >= 3); IAssert(node >= 0); 02610 if (sbExTrie.IsNodeTerminal(node)) return false; 02611 if (atEnd) return true; } 02612 if (len == 1) { cButLast = c; len++; } 02613 else if (len == 2) { cButButLast = c; len++; 02614 // Now we have read the last three characters; start descending the suitable subtrie. 02615 node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast); 02616 if (node < 0) return true; } 02617 else { 02618 // Descend down the trie. 02619 node = sbExTrie.GetChild(node, c); 02620 if (node < 0) return true; } 02621 } 02622 //return true; 02623 } 02624 02625 template<typename TSrcVec> 02626 bool TUniChDb::FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const 02627 { 02628 // SB1. Break at the start of text. 02629 if (position < srcIdx) { position = srcIdx; return true; } 02630 // If we are beyond the end of the text, there aren't any word breaks left. 02631 const size_t srcEnd = srcIdx + srcCount; 02632 if (position >= srcEnd) return false; 02633 // If 'position' is currently at an ignored character, move it back to the last nonignored character. 02634 size_t origPos = position; 02635 if (IsWbIgnored(src[TVecIdx(position)])) { 02636 if (! WbFindPrevNonIgnored(src, srcIdx, position)) 02637 position = origPos; 02638 } 02639 // Determine the previous nonignored character (before 'position'). 02640 size_t posPrev = position; 02641 if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position; 02642 // Sec 6.2. Allow a break between Sep and an ignored character. 02643 if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; } 02644 // Determine the next nonignored character (after 'position'). 02645 size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd); 02646 size_t posNext2; 02647 int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1); 02648 int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1); 02649 int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext); 02650 int cNext2, sbfNext2; 02651 // Initialize the state of the peek-back automaton. 02652 typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState; 02653 TPeekBackState backState; 02654 { 02655 size_t pos = position; 02656 bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false; 02657 while (true) 02658 { 02659 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break; 02660 // Skip at most one Sep. 02661 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp); 02662 if ((sbf & ucfSbSep) == ucfSbSep) { 02663 wasSep = true; 02664 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break; 02665 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02666 // Skip zero or more Sp's. 02667 bool stop = false; 02668 while ((sbf & ucfSbSp) == ucfSbSp) { 02669 wasSp = true; 02670 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; } 02671 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02672 if (stop) break; 02673 // Skip zero or more Close's. 02674 while ((sbf & ucfSbClose) == ucfSbClose) { 02675 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; } 02676 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02677 if (stop) break; 02678 // Process an ATerm or STerm. 
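// (For instance, if the text contains "etc.) " and 'position' is just after the space,
// the scan above walks back over the Sp ' ' and the Close ')', finds the ATerm '.',
// and backState becomes stATermSp below.)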
02679 wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm); 02680 wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm); 02681 break; 02682 } 02683 if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm); 02684 else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm); 02685 else backState = stInit; 02686 } 02687 // Initialize the state of the peek-ahead automaton. This state tells us what follows 02688 // after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}. 02689 // Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string. 02690 // Our peek-ahead automaton must tell us whether it is Lower or something else. 02691 typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState; 02692 TPeekAheadState aheadState = stUnknown; 02693 // 02694 for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2, 02695 cPrev = cCur, cCur = cNext, cNext = cNext2, 02696 sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2) 02697 { 02698 // Should there be a sentence boundary between 'position' and 'posNext' (or, more accurately, 02699 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position' 02700 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)? 02701 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd); 02702 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1); 02703 sbfNext2 = GetSbFlags(cNext2); 02704 // Update the peek-back automaton. 02705 #define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag) 02706 #define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; } 02707 switch (backState) { 02708 case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break; 02709 case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break; 02710 case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break; 02711 case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02712 case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02713 case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02714 case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02715 default: IAssert(false); } 02716 #undef Trans 02717 #undef TestCur 02718 // Update the peek-ahead automaton. 02719 #define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0) 02720 if (! IsPeekAheadSkippable(sbfCur)) { 02721 bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower); 02722 if (aheadState == stLower) IAssert(isLower); 02723 else if (aheadState == stNotLower) IAssert(! isLower); 02724 // We haven't peeked ahead farther than this so far -- invalidate the state. 02725 aheadState = stUnknown; } 02726 if (aheadState == stUnknown) 02727 { 02728 // Peek ahead to the next non-peekahead-skippable character. 02729 size_t pos = posNext; 02730 while (pos < srcEnd) { 02731 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp); 02732 if (!
IsPeekAheadSkippable(sbf)) { 02733 if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower; 02734 else aheadState = stNotLower; 02735 break; } 02736 WbFindNextNonIgnored(src, pos, srcEnd); } 02737 if (! (pos < srcEnd)) aheadState = stNotLower; 02738 } 02739 #undef IsPeekAheadSkippable 02740 // 02741 #define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue 02742 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue 02743 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue 02744 // SB3. Do not break within CRLF. 02745 if (cCur == 13 && cNext == 10) continue; 02746 // SB4. Break after paragraph separators. 02747 if ((sbfCur & ucfSbSep) == ucfSbSep) { 02748 if (! CanSentenceEndHere(src, srcIdx, position)) continue; 02749 position = posNext; return true; } 02750 // Do not break after ambiguous terminators like period, if they are immediately followed by a number 02751 // or lowercase letter, if they are between uppercase letters, or if the first following letter 02752 // (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation 02753 // or numeric period, and thus may not mark the end of a sentence. 02754 TestCurNext(ucfSbATerm, ucfSbNumeric); // SB6 02755 TestPrevCurNext(ucfSbUpper, ucfSbATerm, ucfSbUpper); // SB7 02756 // SB8a. (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm) 02757 if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) && 02758 (sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue; 02759 // SB8*. ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower 02760 if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue; 02761 // Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present). 02762 // SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep ) 02763 if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue; 02764 // SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep ) 02765 // SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break] 02766 if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) { 02767 if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10 02768 if (! CanSentenceEndHere(src, srcIdx, position)) continue; 02769 position = posNext; return true; } // SB11 02770 // SB12. Otherwise, do not break. 02771 continue; 02772 #undef TestCurNext 02773 #undef TestCurNext2 02774 #undef TestPrevCurNext 02775 } 02776 // SB2. Break at the end of text. 02777 IAssert(position == srcEnd); 02778 return true; 02779 } 02780 02781 // ToDo: provide a more efficient implementation of this.
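// Usage sketch (an illustration; 'db', 'text' and 'pos' are hypothetical names, with
// 'text' e.g. a TIntV of codepoints): FindNextSentenceBoundary advances 'pos' to the
// next boundary and returns false only once 'pos' is already at or past the end of the
// text, so all boundaries after the start can be enumerated with
//
//   size_t pos = 0;
//   while (db.FindNextSentenceBoundary(text, 0, text.Len(), pos)) {
//     // 'pos' is now the position just after a sentence (text.Len() at the very end)
//   }
//
// FindSentenceBoundaries below fills a TBoolV of length srcCount + 1 in which dest[i]
// is true iff a boundary falls at position srcIdx + i (dest[0] and dest[srcCount] are
// always set).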
02782 template<typename TSrcVec> 02783 void TUniChDb::FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const 02784 { 02785 if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1)); 02786 dest.PutAll(false); 02787 size_t position = srcIdx; 02788 dest[TVecIdx(position - srcIdx)] = true; 02789 while (position < srcIdx + srcCount) 02790 { 02791 size_t oldPos = position; 02792 FindNextSentenceBoundary(src, srcIdx, srcCount, position); 02793 Assert(oldPos < position); Assert(position <= srcIdx + srcCount); 02794 dest[TVecIdx(position - srcIdx)] = true; 02795 } 02796 Assert(dest[TVecIdx(srcCount)]); 02797 } 02798 02799 //----------------------------------------------------------------------------- 02800 // TUniChDb -- case conversions 02801 //----------------------------------------------------------------------------- 02802 02803 template<typename TSrcVec, typename TDestCh> 02804 void TUniChDb::GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02805 TVec<TDestCh>& dest, const bool clrDest, 02806 const TUniChDb::TCaseConversion how, 02807 const bool turkic, const bool lithuanian) const 02808 { 02809 const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0)); 02810 if (clrDest) dest.Clr(); 02811 enum { 02812 GreekCapitalLetterSigma = 0x3a3, 02813 GreekSmallLetterSigma = 0x3c3, 02814 GreekSmallLetterFinalSigma = 0x3c2, 02815 LatinCapitalLetterI = 0x49, 02816 LatinCapitalLetterJ = 0x4a, 02817 LatinCapitalLetterIWithOgonek = 0x12e, 02818 LatinCapitalLetterIWithGrave = 0xcc, 02819 LatinCapitalLetterIWithAcute = 0xcd, 02820 LatinCapitalLetterIWithTilde = 0x128, 02821 LatinCapitalLetterIWithDotAbove = 0x130, 02822 LatinSmallLetterI = 0x69, 02823 CombiningDotAbove = 0x307 02824 }; 02825 // 02826 bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1; 02827 size_t nextWordBoundary = srcIdx; 02828 TBoolV wordBoundaries; bool wbsKnown = false; 02829 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; ) 02830 { 02831 int cp = src[TVecIdx(srcIdx)]; srcIdx++; 02832 //if (turkic && cp == 0x130 && how == ccLower) printf("!"); 02833 // For conversion to titlecase, the first cased character of each word 02834 // must be converted to titlecase; everything else must be converted 02835 // to lowercase. 02836 TUniChDb::TCaseConversion howHere; 02837 if (how != ccTitle) howHere = how; 02838 else { 02839 if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here. 02840 seenCased = false; seenTwoCased = false; cpFirstCased = -1; 02841 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 02842 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 02843 bool isCased = IsCased(cp); 02844 if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; } 02845 else { howHere = ccLower; 02846 if (isCased && seenCased) seenTwoCased = true; } 02847 } 02848 // First, process the conditional mappings from SpecialCasing.txt. 02849 // These will be processed in code -- they were ignored while 02850 // we were reading SpecialCasing.txt itself. 02851 if (cp == GreekCapitalLetterSigma && howHere == ccLower) 02852 { 02853 // SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of 02854 // the standard doesn't define it. We'll use FinalCased instead. 
02855 // FinalCased: within the closest word boundaries containing C, 02856 // there is a cased letter before C, and there is no cased letter after C. 02857 //size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary); 02858 if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; } 02859 size_t srcIdx2 = srcIdx; bool casedAfter = false; 02860 if (how == ccTitle) 02861 printf("!"); 02862 //while (srcIdx2 < nextBoundary) 02863 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)]) 02864 { 02865 int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02866 if (IsCased(cp2)) { casedAfter = true; break; } 02867 } 02868 if (! casedAfter) 02869 { 02870 //size_t prevBoundary = srcIdx - 1; 02871 //FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary); 02872 srcIdx2 = srcIdx - 1; bool casedBefore = false; 02873 //while (prevBoundary < srcIdx2) 02874 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)]) 02875 { 02876 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02877 if (IsCased(cp2)) { casedBefore = true; break; } 02878 } 02879 if (casedBefore) { 02880 // Now we have a FinalCased character. 02881 dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; } 02882 } 02883 // If we got here, add a non-final sigma. 02884 dest.Add(GreekSmallLetterSigma); continue; 02885 } 02886 else if (lithuanian) 02887 { 02888 if (howHere == ccLower) 02889 { 02890 if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek) 02891 { 02892 bool moreAbove = false; 02893 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; ) 02894 { 02895 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02896 const int cc2 = GetCombiningClass(cp2); 02897 if (cc2 == TUniChInfo::ccStarter) break; 02898 if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; } 02899 } 02900 if (moreAbove) 02901 { 02902 if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; } 02903 if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; } 02904 if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; } 02905 } 02906 } 02907 else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; } 02908 else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; } 02909 else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; } 02910 } 02911 if (cp == CombiningDotAbove) 02912 { 02913 // Lithuanian, howHere != ccLower. 02914 // AfterSoftDotted := the last preceding character with a combining class 02915 // of zero before C was Soft_Dotted, and there is no intervening combining 02916 // character class 230 (ABOVE). 02917 bool afterSoftDotted = false; 02918 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp' 02919 while (origSrcIdx < srcIdx2) 02920 { 02921 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02922 int cc2 = GetCombiningClass(cp2); 02923 if (cc2 == TUniChInfo::ccAbove) break; 02924 if (cc2 == TUniChInfo::ccStarter) { 02925 afterSoftDotted = IsSoftDotted(cp2); break; } 02926 } 02927 if (afterSoftDotted) 02928 { 02929 Assert(lithuanian); 02930 // Remove DOT ABOVE after "i" with upper or titlecase. 02931 // - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle, 02932 // the "i" may have been kept lowercase and thus we shouldn't remove the dot). 
02933 if (how == ccLower) { dest.Add(0x307); continue; } 02934 if (how == ccUpper) continue; 02935 Assert(how == ccTitle); 02936 Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character 02937 if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot. 02938 dest.Add(0x307); continue; 02939 } 02940 } 02941 } 02942 else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri) 02943 { 02944 // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 02945 // The following rules handle those cases. 02946 if (cp == LatinCapitalLetterIWithDotAbove) { 02947 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; } 02948 // When lowercasing, remove dot_above in the sequence I + dot_above, 02949 // which will turn into i. This matches the behavior of the 02950 // canonically equivalent I-dot_above. 02951 else if (cp == CombiningDotAbove) 02952 { 02953 // AfterI: the last preceding base character was an uppercase I, 02954 // and there is no intervening combining character class 230 (ABOVE). 02955 bool afterI = false; 02956 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp' 02957 while (origSrcIdx < srcIdx2) 02958 { 02959 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02960 if (cp2 == LatinCapitalLetterI) { afterI = true; break; } 02961 int cc2 = GetCombiningClass(cp2); 02962 if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break; 02963 } 02964 if (afterI) { 02965 if (how == ccTitle && seenCased && ! seenTwoCased) { 02966 // Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word; 02967 // if found, map it to titlecase; otherwise, map all characters in that word to lowercase. 02968 // This suggests that if a cased character is found, others in that word should be left alone. 02969 // This seems unusual; we map all other characters to lowercase instead. 02970 // But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above 02971 // is not the first cased character (it isn't even cased), we attempt to set it to lowercase; 02972 // but since afterI is also true here, this would mean deleting it. Thus our titlecased 02973 // form of "I followed by dot-above" would be just "I", which is clearly wrong. 02974 // So we treat this as a special case here. 02975 IAssert(cpFirstCased == LatinCapitalLetterI); 02976 dest.Add(0x307); continue; } 02977 if (howHere != ccLower) dest.Add(0x307); 02978 continue; } 02979 } 02980 // When lowercasing, unless an I is before a dot_above, 02981 // it turns into a dotless i. 02982 else if (cp == LatinCapitalLetterI) 02983 { 02984 // BeforeDot: C is followed by U+0307 (combining dot above). 02985 // Any sequence of characters with a combining class that is 02986 // neither 0 nor 230 may intervene between the current character 02987 // and the combining dot above. 02988 bool beforeDot = false; 02989 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; ) 02990 { 02991 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02992 if (cp2 == 0x307) { beforeDot = true; break; } 02993 const int cc2 = GetCombiningClass(cp2); 02994 if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break; 02995 } 02996 if (! beforeDot) { 02997 dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; } 02998 } 02999 // When uppercasing, i turns into a dotted capital I. 03000 else if (cp == LatinSmallLetterI) 03001 { 03002 dest.Add(howHere == ccLower ? 
0x69 : 0x130); continue; 03003 } 03004 } 03005 // Try to use the unconditional mappings. 03006 const TIntIntVH &specHere = ( 03007 howHere == how ? specials : 03008 howHere == ccLower ? specialCasingLower : 03009 howHere == ccTitle ? specialCasingTitle : 03010 howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0)); 03011 int i = specHere.GetKeyId(cp); 03012 if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; } 03013 // Try to use the simple (one-character) mappings. 03014 i = h.GetKeyId(cp); 03015 if (i >= 0) { 03016 const TUniChInfo &ci = h[i]; 03017 int cpNew = ( 03018 howHere == ccLower ? ci.simpleLowerCaseMapping : 03019 howHere == ccUpper ? ci.simpleUpperCaseMapping : 03020 ci.simpleTitleCaseMapping); 03021 if (cpNew < 0) cpNew = cp; 03022 dest.Add(cpNew); continue; } 03023 // As a final resort, leave 'cp' unchanged. 03024 dest.Add(cp); 03025 } 03026 } 03027 03028 template<typename TSrcVec, typename TDestCh> 03029 void TUniChDb::GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03030 TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const 03031 { 03032 if (clrDest) dest.Clr(); 03033 bool seenCased = false; size_t nextWordBoundary = srcIdx; 03034 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; ) 03035 { 03036 const int cp = src[TVecIdx(srcIdx)]; srcIdx++; 03037 int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; } 03038 const TUniChInfo &ci = h[i]; 03039 // With titlecasing, the first cased character of each word must be put into titlecase, 03040 // all others into lowercase. This is what the howHere variable is for. 03041 TUniChDb::TCaseConversion howHere; 03042 if (how != ccTitle) howHere = how; 03043 else { 03044 if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here. 03045 seenCased = false; 03046 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 03047 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 03048 bool isCased = IsCased(cp); 03049 if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; } 03050 else howHere = ccLower; 03051 } 03052 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping); 03053 if (cpNew < 0) cpNew = cp; 03054 dest.Add(cpNew); 03055 } 03056 } 03057 03058 template<typename TSrcVec> 03059 void TUniChDb::ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const 03060 { 03061 bool seenCased = false; size_t nextWordBoundary = srcIdx; 03062 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) 03063 { 03064 const int cp = src[TVecIdx(srcIdx)]; 03065 int i = h.GetKeyId(cp); if (i < 0) continue; 03066 const TUniChInfo &ci = h[i]; 03067 // With titlecasing, the first cased character of each word must be put into titlecase, 03068 // all others into lowercase. This is what the howHere variable is for. 03069 TUniChDb::TCaseConversion howHere; 03070 if (how != ccTitle) howHere = how; 03071 else { 03072 if (srcIdx == nextWordBoundary) { // A word starts/ends here. 03073 seenCased = false; 03074 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 03075 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 03076 bool isCased = IsCased(cp); 03077 if (isCased && ! 
seenCased) { howHere = ccTitle; seenCased = true; } 03078 else howHere = ccLower; 03079 } 03080 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping); 03081 if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew; 03082 } 03083 } 03084 03085 //----------------------------------------------------------------------------- 03086 // TUniChDb -- composition, decomposition, normal forms 03087 //----------------------------------------------------------------------------- 03088 03089 template<typename TDestCh> 03090 void TUniChDb::AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const 03091 { 03092 if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount) 03093 { 03094 // UAX #15, sec. 16: Hangul decomposition 03095 const int SIndex = codePoint - HangulSBase; 03096 const int L = HangulLBase + SIndex / HangulNCount; 03097 const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount; 03098 const int T = HangulTBase + (SIndex % HangulTCount); 03099 dest.Add(L); dest.Add(V); 03100 if (T != HangulTBase) dest.Add(T); 03101 return; 03102 } 03103 int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; } 03104 const TUniChInfo &ci = h[i]; 03105 int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; } 03106 if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; } 03107 while (true) { 03108 int cp = decompositions[ofs++]; if (cp < 0) return; 03109 AddDecomposition(cp, dest, compatibility); } 03110 } 03111 03112 template<typename TSrcVec, typename TDestCh> 03113 void TUniChDb::Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03114 TVec<TDestCh>& dest, const bool compatibility, bool clrDest) const 03115 { 03116 if (clrDest) dest.Clr(); 03117 const size_t destStart = dest.Len()/*, srcEnd = srcIdx + srcCount*/; 03118 // Decompose the string. 03119 while (srcIdx < srcCount) { 03120 AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; } 03121 // Rearrange the decomposed string into canonical order. 03122 for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; ) 03123 { 03124 size_t j = destIdx; 03125 int cp = dest[TVecIdx(destIdx)]; destIdx++; 03126 int cpCls = GetCombiningClass(cp); 03127 if (cpCls == TUniChInfo::ccStarter) continue; 03128 while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) { 03129 dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; } 03130 dest[TVecIdx(j)] = cp; 03131 } 03132 } 03133 03134 template<typename TSrcVec, typename TDestCh> 03135 void TUniChDb::DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03136 TVec<TDestCh>& dest, bool compatibility, bool clrDest) const 03137 { 03138 if (clrDest) dest.Clr(); 03139 TIntV temp; 03140 Decompose(src, srcIdx, srcCount, temp, compatibility); 03141 Compose(temp, 0, temp.Len(), dest, clrDest); 03142 } 03143 03144 template<typename TSrcVec, typename TDestCh> 03145 void TUniChDb::Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03146 TVec<TDestCh>& dest, bool clrDest) const 03147 { 03148 if (clrDest) dest.Clr(); 03149 bool lastStarterKnown = false; // has a starter been encountered yet? 03150 size_t lastStarterPos = size_t(-1); // the index (in 'dest') of the last starter 03151 int cpLastStarter = -1; // the codepoint of the last starter (i.e. 
template<typename TSrcVec, typename TDestCh>
void TUniChDb::Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
  TVec<TDestCh>& dest, bool clrDest) const
{
  if (clrDest) dest.Clr();
  bool lastStarterKnown = false; // has a starter been encountered yet?
  size_t lastStarterPos = size_t(-1); // the index (in 'dest') of the last starter
  int cpLastStarter = -1; // the codepoint of the last starter (i.e. cpLastStarter == dest[lastStarterPos])
  const size_t srcEnd = srcIdx + srcCount;
  int ccMax = -1; // the highest combining class among the characters since the last starter
  while (srcIdx < srcEnd)
  {
    const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
    const int cpClass = GetCombiningClass(cp);
    //int cpCombined = -1;
    // If there is a starter with which 'cp' can be combined, and from which it is not blocked
    // by some intermediate character, we can try to combine them.
    if (lastStarterKnown && ccMax < cpClass)
    {
      int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp));
      int cpCombined = -1;
      do {
        // Try to look up a composition in the inverseDec table.
        if (j >= 0) { cpCombined = inverseDec[j]; break; }
        // UAX #15, sec. 16: Hangul composition
        // - Try to combine L and V.
        const int LIndex = cpLastStarter - HangulLBase;
        if (0 <= LIndex && LIndex < HangulLCount) {
          const int VIndex = cp - HangulVBase;
          if (0 <= VIndex && VIndex < HangulVCount) {
            cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
            break; } }
        // - Try to combine LV and T.
        const int SIndex = cpLastStarter - HangulSBase;
        if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0)
        {
          const int TIndex = cp - HangulTBase;
          if (0 <= TIndex && TIndex < HangulTCount) {
            cpCombined = cpLastStarter + TIndex;
            break; }
        }
      } while (false);
      // If a composite character has been found, use it to replace the previous starter in 'dest'.
      if (cpCombined >= 0) {
        dest[TVecIdx(lastStarterPos)] = cpCombined;
        Assert(GetCombiningClass(cpCombined) == TUniChInfo::ccStarter);
        // if (cpCombined is not a starter) { lastStarterKnown = false; lastStarterPos = size_t(-1); cpLastStarter = -1; } else
        cpLastStarter = cpCombined; continue; }
    }
    if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter, remember it for later. Set ccMax to -1 so that this starter can be combined with another starter.
      lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; }
    else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking).
      ccMax = cpClass;
    dest.Add(cp);
  }
}

template<typename TSrcVec, typename TDestCh>
size_t TUniChDb::ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
  TVec<TDestCh>& dest, bool clrDest) const
{
  if (clrDest) dest.Clr();
  size_t retVal = 0;
  for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
    const int cp = src[TVecIdx(srcIdx)];
    if (GetCombiningClass(cp) == TUniChInfo::ccStarter)
      { dest.Add(cp); retVal++; } }
  return retVal;
}

// Helpers whose result is constant in practice (false resp. true) but is not
// computed at compile time.
inline bool AlwaysFalse()
{
  int sum = 0;
  for (int i = 0; i < 5; i++) sum += i;
  return sum > 100;
}

inline bool AlwaysTrue()
{
  int sum = 0;
  for (int i = 0; i < 5; i++) sum += i;
  return sum < 100;
}
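// Usage sketch (illustrative only, not part of the library): keeping only the starters
// (characters with combining class 0) of a sequence. 'db' is assumed to be an initialized
// TUniChDb (loading not shown).
//
//   TIntV src, starters;
//   src.Add(0x45); src.Add(0x300); src.Add(0x73);   // 'E', combining grave accent (class 230), 's'
//   size_t n = db.ExtractStarters(src, 0, src.Len(), starters, true);
//   // starters now holds 0045 0073 and n == 2; the combining grave accent was dropped.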
/*

Notes on decomposition:

- In UnicodeData.txt, there is a field with the decomposition mapping.
  This field may also include a tag, <...>.
  If there is a tag, this is a compatibility mapping.
  Otherwise it is a canonical mapping.
- Canonical decomposition uses only canonical mappings;
  compatibility decomposition uses both canonical and compatibility mappings.
- Decomposition:
  1. Apply the decomposition mappings (canonical or canonical+compatibility), recursively.
  2. Put the string into canonical order, which means:
       while there exists a pair of characters, A immediately followed by B,
       such that combiningclass(A) > combiningclass(B) > 0 [an "exchangeable pair"]:
         swap A and B;
     (A small standalone sketch of this reordering step is given at the end of this file.)
  This results in NFD (normalized form D, after canonical decomposition)
  or NFKD (normalized form KD, after compatibility decomposition).
- Canonical composition:
  1. Before composition, the string should have been decomposed
     (using either canonical or compatibility decomposition).
  2. For each character C (from left to right):
     2.1. Find the last starter S before C (if not found, continue).
     2.2. If there is, between S and C, some character with a combining class greater than
          or equal to that of C, then continue.
     2.3. If there exists a character L whose canonical decomposition is S followed by C,
          and L is not in the composition exclusion table [i.e. L is a "primary composite"],
          then replace S by L, and remove C.
  This results in NFC (normalized form C, with canonical decomposition followed by canonical composition)
  or NFKC (normalized form KC, with compatibility decomposition followed by canonical composition).
- Composition exclusion table:
  - Anything in CompositionExclusions.txt.
  - Singletons: characters whose canonical decomposition is a single character.
  - Non-starter decompositions: characters whose canonical decomposition begins with a non-starter.

Example:
  E-grave (00c8; combining class 0; canonical decomposition: 0045 0300)
  E-macron (0112; combining class 0; 0045 0304)
  grave (0300; combining class 230)
  macron (0304; combining class 230)
  source string: 00c8 0304
  after canonical decomposition (or compatibility decomposition, they would be the same here): 0045 0300 0304
  after canonical composition: 00c8 0304

cc(horn) = 216
cc(dot below) = 220
cc(dot above) = 230

ToDos:
- Case folding: it is intended primarily for comparing the strings obtained in this way.
  The function f(s) = NFC(toCaseFold(s)) is idempotent.
  The function g(s) = NFKC(toCaseFold(s)) is not -- if we want that, we have to take a few
  additional mappings into account during the folding (see sec. 5.18, last paragraph;
  DerivedNormalizationProps.txt).
- It seems that CaseFolding.txt is essentially just an ordinary folding into lowercase.
  Since we also want the other foldings, it is better to look at SpecialCasing.txt
  (+ the simple case mappings in UnicodeData.txt).
  I suggest that, when reading SpecialCasing.txt, we simply ignore the conditional mappings
  and then handle them separately, directly in the source code of our programs
  [for the detailed definition of the conditions, see table 3.13].
- Postscript: it still seems to me that CaseFolding.txt is slightly different from a plain
  conversion to lowercase. For example, for the small final sigma 03c2 it specifies that it
  should be changed into the ordinary small sigma 03c3. This follows neither from
  UnicodeData.txt nor from SpecialCasing.txt, even though UCD.html says that CaseFolding.txt
  is derived from the two. The main purpose of CaseFolding.txt is supposed to be
  "locale-independent case folding" (table 4.1 and sec. 5.18).
- Before starting to deal with case conversions, have a look at section 3.13,
  and especially p. 90.
- See p. 91 on the combination N[K]FD + caseFold + N[K]FD.
- The definition of "cased" etc. is on p. 89.
- isIdentifierStart(c), isIdentifierEnd(c) -- sec. 5.15.
  See DerivedCoreProperties.txt, where a number of similar things are defined in a similar
  way, among them isLowerCase and isUpperCase. It also contains isLetter, isAlphabetic,
  etc. (sec. 4.9). These are best added among the flags of each individual character.
- general category: sec. 4.5
- motivation for titlecase: 5.18
- compare our current computation of compositionExclusion with what is computed in
  DerivedNormalizationProps.txt under Full_Composition_Exclusion
- script names: Scripts.txt and UAX #24.
- block names: Blocks.txt
- space characters: table 6.2 and reportedly also UCD.html
- dash characters: table 6.3
*/

//#endif
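// A minimal standalone sketch (not part of the library) of the "exchangeable pair"
// reordering step described in the notes above. It assumes a caller-supplied function
// returning the canonical combining class of a codepoint; in the library itself this role
// is played by TUniChDb::GetCombiningClass, and Decompose performs the same reordering
// with an insertion-sort-style loop.
//
//   #include <algorithm>
//   #include <vector>
//
//   // Repeatedly swap adjacent exchangeable pairs (cc(A) > cc(B) > 0) until none remain.
//   // Characters with equal classes are never swapped, so their relative order is kept.
//   inline void CanonicalReorder(std::vector<int>& s, int (*combiningClass)(int))
//   {
//     bool swapped = true;
//     while (swapped) {
//       swapped = false;
//       for (size_t i = 1; i < s.size(); i++) {
//         int ccA = combiningClass(s[i - 1]), ccB = combiningClass(s[i]);
//         if (ccA > ccB && ccB > 0) { std::swap(s[i - 1], s[i]); swapped = true; }
//       }
//     }
//   }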