dd/d90/unicode_8h_source.html

00001 //#ifndef unicode_h
00002 //#define unicode_h
00003
00005 // Includes
00006 //#include "base.h"
00007
00008 typedef int TUniVecIdx;
00009
00010 //-----------------------------------------------------------------------------
00011 // TUniCodec -- an UTF-8 and UTF-16 Encoder/Decoder
00012 //-----------------------------------------------------------------------------
00013
00014 // Error handling modes for the TUniCodec class.
00015 typedef enum TUnicodeErrorHandling_
00016 {
00017         // What happens when an error occurs:
00018         uehIgnore = 0,  // - it is silently ignored (nothing is added to the output vector)
00019         uehThrow = 1,   // - an exception is thrown (TUnicodeException)
00020         uehReplace = 2, // - the replacement character is added to the output vector
00021         uehAbort = 3    // - the encoding/decoding process stops immediately
00022 }
00023 TUnicodeErrorHandling;
00024
00025 class TUnicodeException
00026 {
00027 public:
00028         TStr message;  // error message
00029         size_t srcIdx; // the position in the source vector where the error occurred
00030         int srcChar;   // the source character at the position srcIdx
00031         TUnicodeException(size_t srcIdx_, int srcChar_, const TStr& message_) :
00032                 message(message_), srcIdx(srcIdx_), srcChar(srcChar_) { }
00033 };
00034
00035 typedef enum TUniByteOrder_
00036 {
00037         boMachineEndian = 0,
00038         boLittleEndian = 1,
00039         boBigEndian = 2
00040 }
00041 TUniByteOrder;
00042
00043 typedef enum TUtf16BomHandling_
00044 {
00045         bomAllowed = 0,   // if a BOM is present, it is used to determine the byte order; otherwise, the default byte order is used
00046         bomRequired = 1,  // if a BOM is present, it is used to determine the byte order; otherwise, an error is reported
00047         bomIgnored = 2    // the default byte order is used; if a BOM is present, it is treated like any other character
00048 }
00049 TUtf16BomHandling;
00050
00051 class TUniCodec
00052 {
00053 public:
00054         // 0xfffd is defined as the replacement character by the Unicode standard.
00055         // By default, it is rendered as a question mark inside a diamond: "<?>".
00056         enum { DefaultReplacementChar = 0xfffd };
00057
00058         // The replacement character is inserted into the destination vector
00059         // if an error occurs in the source vector.  By default, this is set
00060         // to DefaultReplacementChar.
00061         int replacementChar;
00062         // The error handling mode.
00063         TUnicodeErrorHandling errorHandling;
00064         // There are a number of situations where there is strictly speaking an error in
00065         // the source data although it can still be decoded in a reasonably meaningful way.
00066         // If strict == true, these situations are treated as errors.  Examples:
00067         // - when decoding UTF-8:
00068         //   - a codepoint represented by more bytes than necessary (e.g. one of the characters 0..127
00069         //     encoded as a two-byte sequence)
00070         //   - a codepoint > 0x10ffff
00071         // - when decoding UTF-16:
00072         //   - a codepoint from the range reserved for the second character of a surrogate pair
00073         //     is not preceded by a codepoint from the range reserved for the first character of a surrogate pair
00074         // - when encoding UTF-8:
00075         //   - a codepoint > 0x10ffff
00076         // - when encoding UTF-16:
00077         //   - a codepoint from the range reserved from the second character of a surrogate pair
00078         //     [note that a codepoint > 0x10ffff, or from the range reserved for the first character of a
00079         //     surrogate pair, is always an error, even with strict == false]
00080         bool strict;
00081         // skipBom == true means: If a byte-order-mark (0xfffe or 0xfeff) occurs at the beginning
00082         // of the source vector, it is skipped (when decoding).
00083         // - Note: a BOM is not really useful in UTF-8 encoded data.  However, the .NET UTF8Encoding
00084         //   emits 0xfeff by default as a kind of preamble.  It gets encoded as 3 bytes, ef bb bf,
00085         //   and can be helpful to make the data easier to recognize as UTF-8 encoded data.
00086         bool skipBom;
00087
00088         TUniCodec() : replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true)
00089         {
00090         }
00091
00092         TUniCodec(TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_) :
00093                 replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
00094         {
00095         }
00096
00097 protected:
00098         enum {
00099 #define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
00100                 DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
00101                 DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
00102                 DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
00103                 DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
00104                 DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
00105                 DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
00106                 DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
00107                 DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
00108                 DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
00109                 DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
00110                 DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
00111                 DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
00112 #undef DefineByte
00113         };
00114
00115         typedef TUniVecIdx TVecIdx;
00116         //friend class TUniChDb;
00117         friend class TUniCaseFolding;
00118
00119 public:
00120
00121         //-----------------------------------------------------------------------
00122         // UTF-8
00123         //-----------------------------------------------------------------------
00124
00125         // Returns the number of characters that have been successfully decoded.
00126         // This does not include any replacement characters that may have been inserted into 'dest'.
00127         template<typename TSrcVec, typename TDestCh>
00128         size_t DecodeUtf8(
00129                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00130                 TVec<TDestCh>& dest, const bool clrDest = true) const;
00131         template<typename TSrcVec, typename TDestCh>
00132         size_t DecodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }
00133
00134         // Returns the number of characters that have been successfully encoded.
00135         // This does not include any replacement characters that may have been inserted into 'dest'.
00136         template<typename TSrcVec, typename TDestCh>
00137         size_t EncodeUtf8(
00138                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00139                 TVec<TDestCh>& dest, const bool clrDest = true) const;
00140         template<typename TSrcVec, typename TDestCh>
00141         size_t EncodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }
00142
00143         // The following wrappers around the UTF-8 encoder return a TStr containing
00144         // the UTF-8-encoded version of the input string.
00145         template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src, size_t srcIdx, const size_t srcCount) const { TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); TStr retVal = &(temp[0]); return retVal; }
00146         template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src) const { TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }
00147
00148         //-----------------------------------------------------------------------
00149         // UTF-16 Decoder
00150         //-----------------------------------------------------------------------
00151
00152 protected:
00153         enum {
00154                 Utf16FirstSurrogate = 0xd800,
00155                 Utf16SecondSurrogate = 0xdc00
00156         };
00157
00158         static bool IsMachineLittleEndian();
00159
00160 public:
00161
00162         // Returns the number of characters that have been successfully decoded.
00163         // This does not include any replacement characters that may have been inserted into 'dest'.
00164         // Each element of 'src' is assumed to contain one byte of data.
00165         // srcCount must be even (though srcIdx doesn't need to be).
00166         template<typename TSrcVec, typename TDestCh>
00167         size_t DecodeUtf16FromBytes(
00168                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00169                 TVec<TDestCh>& dest, const bool clrDest,
00170                 const TUtf16BomHandling bomHandling = bomAllowed,
00171                 const TUniByteOrder defaultByteOrder = boMachineEndian) const;
00172
00173         // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings
00174         // are used to determine if the two bytes of each word should be swapped before further
00175         // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it
00176         // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
00177         // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
00178         // beginning of the source data is used to determine the "original" byte order of the data;
00179         // if this doesn't match the byte order of the local machine, the two bytes of each word will
00180         // be swapped during the decoding process.
00181         template<typename TSrcVec, typename TDestCh>
00182         size_t DecodeUtf16FromWords(
00183                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00184                 TVec<TDestCh>& dest, bool clrDest,
00185                 const TUtf16BomHandling bomHandling = bomAllowed,
00186                 const TUniByteOrder defaultByteOrder = boMachineEndian) const;
00187
00188         //-----------------------------------------------------------------------
00189         // UTF-16 Encoder
00190         //-----------------------------------------------------------------------
00191
00192         // Returns the number of characters that have been successfully encoded.
00193         // This does not include any replacement characters that may have been inserted into 'dest'.
00194         //
00195         // Notes:
00196         // - UTF-16 *cannot* encode characters above 0x10ffff, so their presence is always
00197         //   treated as an error, regardless of the value of 'strict'.
00198         // - Characters from the range Utf16FirstSurrogate through Utf16FirstSurrogate + 1023
00199         //   cannot be encoded by UTF-16 either, as they would be misinterpreted during decoding
00200         //   as the first character of a surrogate pair.
00201         // - Characters from the range Utf16SecondSurrogate through Utf16SecondSurrogate + 1023
00202         //   can be encoded in principle; however, if strict == true, they are treated as errors.
00203         template<typename TSrcVec, typename TDestCh>
00204         size_t EncodeUtf16ToWords(
00205                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00206                 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
00207                 const TUniByteOrder destByteOrder = boMachineEndian) const;
00208
00209         template<typename TSrcVec, typename TDestCh>
00210         size_t EncodeUtf16ToBytes(
00211                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00212                 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
00213                 const TUniByteOrder destByteOrder = boMachineEndian) const;
00214
00215         //-----------------------------------------------------------------------
00216         // Helper declarations for the test drivers
00217         //-----------------------------------------------------------------------
00218
00219 protected:
00220
00221         static uint GetRndUint(TRnd& rnd);
00222         static uint GetRndUint(TRnd& rnd, uint minVal, uint maxVal);
00223
00224         //-----------------------------------------------------------------------
00225         // UTF-8 Test Driver
00226         //-----------------------------------------------------------------------
00227
00228 protected:
00229         void TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f);
00230         // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc',
00231         // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected.
00232         void TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc);
00233 public:
00234         void TestUtf8();
00235
00236         //-----------------------------------------------------------------------
00237         // UTF-16 Test Driver
00238         //-----------------------------------------------------------------------
00239
00240 protected:
00241         void WordsToBytes(const TIntV& src, TIntV& dest);
00242         void TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest,
00243                 // Note: insertBom is only used with the encoder.  When encoding, 'defaultByteOrder' is used as the destination byte order.
00244                 const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom,
00245                 FILE *f);
00246         static inline int SwapBytes(int x) {
00247                 return ((x >> 8) & 0xff) | ((x & 0xff) << 8); }
00248         // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc',
00249         // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected.
00250         void TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc,
00251                 const TUtf16BomHandling bomHandling,
00252                 const TUniByteOrder defaultByteOrder,
00253                 const bool insertBom);
00254 public:
00255         void TestUtf16();
00256
00257 };
00258
00259 //-----------------------------------------------------------------------------
00260 // Case folding
00261 //-----------------------------------------------------------------------------
00262 // Note: there's no need to access this class directly.
00263 // Use TUniChDb::GetCaseFolded() instead.
00264
00265 typedef THash<TInt, TIntV> TIntIntVH;
00266
00267 class TUniCaseFolding
00268 {
00269 protected:
00270         TIntH cfCommon, cfSimple, cfTurkic;
00271         TIntIntVH cfFull;
00272
00273         template<typename TSrcDat, typename TDestDat>
00274         inline static void AppendVector(const TVec<TSrcDat>& src, TVec<TDestDat>& dest) {
00275                 for (int i = 0; i < src.Len(); i++) dest.Add(src[i]); }
00276         friend class TUniChDb;
00277         typedef TUniVecIdx TVecIdx;
00278
00279 public:
00280         TUniCaseFolding() { }
00281         explicit TUniCaseFolding(TSIn& SIn) : cfCommon(SIn), cfSimple(SIn), cfTurkic(SIn), cfFull(SIn) { SIn.LoadCs(); }
00282         void Load(TSIn& SIn) { cfCommon.Load(SIn); cfSimple.Load(SIn); cfFull.Load(SIn); cfTurkic.Load(SIn); SIn.LoadCs(); }
00283         void Save(TSOut& SOut) const { cfCommon.Save(SOut); cfSimple.Save(SOut); cfFull.Save(SOut); cfTurkic.Save(SOut); SOut.SaveCs(); }
00284         void Clr() { cfCommon.Clr(); cfSimple.Clr(); cfFull.Clr(); cfTurkic.Clr(); }
00285         void LoadTxt(const TStr& fileName);
00286
00287         // Use 'turkic' when processing text in a Turkic language (tr, az).  This only affects the uppercase I and I-with-dot-above.
00288         template<typename TSrcVec, typename TDestCh>
00289         void Fold(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00290                 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic) const
00291         {
00292                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
00293                 {
00294                         int c = src[TVecIdx(srcIdx)], i; srcIdx++;
00295                         if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { dest.Add(cfTurkic[i]); continue; }
00296                         if (full && ((i = cfFull.GetKeyId(c)) >= 0)) { AppendVector(cfFull[i], dest); continue; }
00297                         if ((! full) && ((i = cfSimple.GetKeyId(c)) >= 0)) { dest.Add(cfSimple[i]); continue; }
00298                         i = cfCommon.GetKeyId(c); if (i >= 0) dest.Add(cfCommon[i]); else dest.Add(c);
00299                 }
00300         }
00301
00302         template<typename TSrcVec>
00303         void FoldInPlace(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic) const
00304         {
00305                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
00306                 {
00307                         int c = src[TVecIdx(srcIdx)], i;
00308                         if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { src[TVecIdx(srcIdx)] = cfTurkic[i]; continue; }
00309                         if ((i = cfSimple.GetKeyId(c)) >= 0) { src[TVecIdx(srcIdx)] = cfSimple[i]; continue; }
00310                         i = cfCommon.GetKeyId(c); if (i >= 0) src[TVecIdx(srcIdx)] = cfCommon[i];
00311                 }
00312         }
00313
00314 protected:
00315         void Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f);
00316 public:
00317         void Test();
00318 };
00319
00320 //-----------------------------------------------------------------------------
00321 // TCodecBase -- an abstract base class for codecs
00322 //-----------------------------------------------------------------------------
00323
00324 class TCodecBase;
00325 typedef TPt<TCodecBase> PCodecBase;
00326 typedef TVec<PCodecBase> TCodecBaseV;
00327
00328 class TCodecBase
00329 {
00330 protected:
00331         TCRef CRef;
00332         friend class TPt<TCodecBase>;
00333 public:
00334         virtual ~TCodecBase() { }
00335
00336         template<class TCodecImpl>
00337         static PCodecBase New(); /* {
00338                 return new TCodecWrapper<TCodecImpl>(); } */
00339
00340         virtual TStr GetName() const = 0;
00341         virtual void Test() const { }
00342
00343         // Returns the number of characters that have been successfully decoded.
00344         // This does not include any replacement characters that may have been inserted into 'dest'.
00345         virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
00346         virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
00347
00348         size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00349         size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00350
00351         // Returns the number of characters that have been successfully encoded.
00352         // This does not include any replacement characters that may have been inserted into 'dest'.
00353         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
00354         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const = 0;
00355         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const = 0;
00356
00357         size_t FromUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00358         size_t FromUnicode(const TIntV& src, TChA& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00359         size_t FromUnicode(const TIntV& src, TStr& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00360 };
00361
00362 //-----------------------------------------------------------------------------
00363 // TCodecWrapper -- a descendant of TCodecBase; relies on a template
00364 // parameter class for the actual implementation of the codec.
00365 //-----------------------------------------------------------------------------
00366 // Thus, if you know in advance that you'll need ISO-8859-2, just use
00367 // T8BitCodec<TEncoding_ISO8859_2>.  If you don't know the encoding
00368 // in advance, use a PCodecBase pointing to a suitable specialization
00369 // of TCodecWrapper<...>.  You can TUnicode::GetCodec(TStr& name)
00370 // to obtain a suitable pointer.
00371
00372 template<class TCodecImpl_>
00373 class TCodecWrapper : public TCodecBase
00374 {
00375 public:
00376         typedef TCodecImpl_ TCodecImpl;
00377         TCodecImpl impl;
00378 public:
00379
00380         virtual TStr GetName() const { return impl.GetName(); }
00381
00382         virtual void Test() const { impl.Test(); }
00383
00384         virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
00385                 return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
00386         virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
00387                 return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
00388
00389         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
00390                 return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
00391         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const {
00392                 return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
00393         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
00394                 TChA buf; size_t retVal = impl.FromUnicode(src, srcIdx, srcCount, buf, false);
00395                 if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
00396                 return retVal; }
00397 };
00398
00399 template<class TCodecImpl>
00400 PCodecBase TCodecBase::New() {
00401   return new TCodecWrapper<TCodecImpl>();
00402 }
00403
00404 //-----------------------------------------------------------------------------
00405 // TVecElt -- a template for determining the type of a vector's elements
00406 //-----------------------------------------------------------------------------
00407
00408 template<class TVector_>
00409 class TVecElt
00410 {
00411 };
00412
00413 template<class TDat>
00414 class TVecElt<TVec<TDat> >
00415 {
00416 public:
00417         typedef TVec<TDat> TVector;
00418         typedef TDat TElement;
00419         static inline void Add(TVector& vector, const TElement& element) { vector.Add(element); }
00420 };
00421
00422 template<>
00423 class TVecElt<TChA>
00424 {
00425 public:
00426         typedef TChA TVector;
00427         typedef char TElement;
00428         static inline void Add(TVector& vector, const TElement& element) { vector += element; }
00429 };
00430
00431
00432 //-----------------------------------------------------------------------------
00433 // T8BitCodec -- a class for converting between 8-bit encodings and Unicode
00434 //-----------------------------------------------------------------------------
00435
00436 class TEncoding_ISO8859_1
00437 {
00438 public:
00439         static inline TStr GetName() { return "ISO-8859-1"; }
00440         static int ToUnicode(int c) { Assert(0 <= c && c <= 255); return c; }
00441         static int FromUnicode(int c) { if (0 <= c && c <= 255) return c; else return -1; }
00442 };
00443
00444 class TEncoding_ISO8859_2 // ISO Latin 2
00445 {
00446 public:
00447         static inline TStr GetName() { return "ISO-8859-2"; }
00448         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
00449         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00450                 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
00451         static int FromUnicode(int c) {
00452                 if (0 <= c && c < 0xa0) return c;
00453                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00454                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00455                 else return -1; }
00456 };
00457
00458 class TEncoding_ISO8859_3
00459 {
00460 public:
00461         static inline TStr GetName() { return "ISO-8859-3"; }
00462         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2];
00463         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00464                 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
00465         static int FromUnicode(int c) {
00466                 if (0 <= c && c < 0xa0) return c;
00467                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00468                 else if (0x2d8 <= c && c < 0x2da) return fromUnicodeTable2[c - 0x2d8];
00469                 else return -1; }
00470 };
00471
00472 class TEncoding_ISO8859_4
00473 {
00474 public:
00475         static inline TStr GetName() { return "ISO-8859-4"; }
00476         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
00477         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00478                 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
00479         static int FromUnicode(int c) {
00480                 if (0 <= c && c < 0xa0) return c;
00481                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00482                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00483                 else return -1; }
00484 };
00485
00486 class TEncoding_YuAscii
00487 {
00488 public:
00489         static const int uniChars[10], yuAsciiChars[10];
00490         static inline TStr GetName() { return "YU-ASCII"; }
00491         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00492                 for (int i = 0; i < int(sizeof(yuAsciiChars) / sizeof(yuAsciiChars[0])); i++)
00493                         if (c == yuAsciiChars[i]) return uniChars[i];
00494                 return c; }
00495         static int FromUnicode(int c) {
00496                 for (int i = 0; i < int(sizeof(uniChars) / sizeof(uniChars[0])); i++)
00497                         if (c == uniChars[i]) return yuAsciiChars[i];
00498                         else if(c == yuAsciiChars[i]) return -1;
00499                 if (0 <= c && c <= 255) return c; else return -1; }
00500 };
00501
00502 class TEncoding_CP437 // DOS US
00503 {
00504 public:
00505         static inline TStr GetName() { return "CP437"; }
00506         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[6 * 16], fromUnicodeTable2[4 * 16], fromUnicodeTable3[6 * 16], fromUnicodeTable4[11 * 16];
00507         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00508                 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
00509         static int FromUnicode(int c) {
00510                 if (0 <= c && c < 0x80) return c;
00511                 else if (0xa0 <= c && c < 0x100) return fromUnicodeTable1[c - 0xa0];
00512                 else if (0x390 <= c && c < 0x3d0) return fromUnicodeTable2[c - 0x390];
00513                 else if (0x2210 <= c && c < 0x2270) return fromUnicodeTable3[c - 0x2210];
00514                 else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable4[c - 0x2500];
00515                 else if (c == 0x192) return 0x9f;
00516                 else if (c == 0x207f) return 0xfc;
00517                 else if (c == 0x20a7) return 0x9e;
00518                 else if (c == 0x2310) return 0xa9;
00519                 else if (c == 0x2320) return 0xf4;
00520                 else if (c == 0x2321) return 0xf5;
00521                 else return -1; }
00522 };
00523
00524 class TEncoding_CP852 // DOS Latin 2
00525 {
00526 public:
00527         static inline TStr GetName() { return "CP852"; }
00528         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[11 * 16];
00529         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00530                 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
00531         static int FromUnicode(int c) {
00532                 if (0 <= c && c < 0x80) return c;
00533                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00534                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00535                 else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable3[c - 0x2500];
00536                 else return -1; }
00537 };
00538
00539 class TEncoding_CP1250 // Windows-1250, similar to ISO Latin 2
00540 {
00541 public:
00542         static inline TStr GetName() { return "CP1250"; }
00543         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[3 * 16];
00544         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00545                 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
00546         static int FromUnicode(int c) {
00547                 if (0 <= c && c < 0x80) return c;
00548                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00549                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00550                 else if (0x2010 <= c && c < 0x2040) return fromUnicodeTable3[c - 0x2010];
00551                 else if (c == 0x20ac) return 0x80;
00552                 else if (c == 0x2122) return 0x99;
00553                 else return -1; }
00554 };
00555
00556 template<class TEncoding_>
00557 class T8BitCodec
00558 {
00559 protected:
00560         typedef TUniVecIdx TVecIdx;
00561 public:
00562         typedef TEncoding_ TEncoding;
00563         TUnicodeErrorHandling errorHandling;
00564         int replacementChar;
00565
00566         T8BitCodec() : errorHandling(uehIgnore), replacementChar(TUniCodec::DefaultReplacementChar) { }
00567         T8BitCodec(TUnicodeErrorHandling errorHandling_, int replacementChar_ = TUniCodec::DefaultReplacementChar) :
00568                 errorHandling(errorHandling_), replacementChar(replacementChar_) { }
00569         static TStr GetName() { return TEncoding::GetName(); }
00570
00571         void Test() const
00572         {
00573                 int nDecoded = 0;
00574                 for (int c = 0; c <= 255; c++) {
00575                         int cu = TEncoding::ToUnicode(c); if (cu == -1) continue;
00576                         nDecoded++;
00577                         IAssert(0 <= cu && cu < 0x110000);
00578                         int c2 = TEncoding::FromUnicode(cu);
00579                         IAssert(c2 == c); }
00580                 int nEncoded = 0;
00581                 for (int cu = 0; cu < 0x110000; cu++) {
00582                         int c = TEncoding::FromUnicode(cu); if (c == -1) continue;
00583                         nEncoded++;
00584                         IAssert(0 <= c && c <= 255);
00585                         int cu2 = TEncoding::ToUnicode(c);
00586                         IAssert(cu2 == cu); }
00587                 IAssert(nDecoded == nEncoded);
00588         }
00589
00590         // Returns the number of characters that have been successfully decoded.
00591         // This does not include any replacement characters that may have been inserted into 'dest'.
00592         template<typename TSrcVec, typename TDestCh>
00593         size_t ToUnicode(
00594                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00595                 TVec<TDestCh>& dest, const bool clrDest = true) const
00596         {
00597                 if (clrDest) dest.Clr();
00598                 size_t toDo = srcCount;
00599                 while (toDo-- > 0) {
00600                         int chSrc = ((int) src[TVecIdx(srcIdx)]) & 0xff; srcIdx++;
00601                         int chDest = TEncoding::ToUnicode(chSrc);
00602                         dest.Add(chDest); }
00603                 return srcCount;
00604         }
00605         template<typename TSrcVec, typename TDestCh>
00606         size_t ToUnicode(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00607
00608         size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00609         size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00610
00611         // Returns the number of characters that have been successfully encoded.
00612         // This does not include any replacement characters that may have been inserted into 'dest'.
00613         template<typename TSrcVec, typename TDestVec>
00614         size_t FromUnicode(
00615                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00616                 TDestVec& dest, const bool clrDest = true) const
00617         {
00618                 typedef typename TVecElt<TDestVec>::TElement TDestCh;
00619                 if (clrDest) dest.Clr();
00620                 size_t toDo = srcCount, nEncoded = 0;
00621                 while (toDo-- > 0) {
00622                         int chSrc = (int) src[TVecIdx(srcIdx)]; srcIdx++;
00623                         int chDest = TEncoding::FromUnicode(chSrc);
00624                         if (chDest < 0) {
00625                                 switch (errorHandling) {
00626                                 case uehThrow: throw TUnicodeException(srcIdx - 1, chSrc, "Invalid character for encoding into " + GetName() + ".");
00627                                 case uehAbort: return nEncoded;
00628                                 case uehReplace: TVecElt<TDestVec>::Add(dest, TDestCh(replacementChar)); continue;
00629                                 case uehIgnore: continue;
00630                                 default: Fail; } }
00631                         TVecElt<TDestVec>::Add(dest, TDestCh(chDest)); nEncoded++; }
00632                 return nEncoded;
00633         }
00634
00635         template<typename TSrcVec, typename TDestVec>
00636         size_t FromUnicode(const TSrcVec& src, TDestVec& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00637
00638         size_t UniToStr(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
00639                 TChA buf; size_t retVal = FromUnicode(src, srcIdx, srcCount, buf, false);
00640                 if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
00641                 return retVal; }
00642         size_t UniToStr(const TIntV& src, TStr& dest, const bool clrDest = true) const { return UniToStr(src, 0, src.Len(), dest, clrDest); }
00643 };
00644
00645 typedef T8BitCodec<TEncoding_ISO8859_1> TCodec_ISO8859_1;
00646 typedef T8BitCodec<TEncoding_ISO8859_2> TCodec_ISO8859_2;
00647 typedef T8BitCodec<TEncoding_ISO8859_3> TCodec_ISO8859_3;
00648 typedef T8BitCodec<TEncoding_ISO8859_4> TCodec_ISO8859_4;
00649 typedef T8BitCodec<TEncoding_CP852> TCodec_CP852;
00650 typedef T8BitCodec<TEncoding_CP437> TCodec_CP437;
00651 typedef T8BitCodec<TEncoding_CP1250> TCodec_CP1250;
00652 typedef T8BitCodec<TEncoding_YuAscii> TCodec_YuAscii;
00653
00654 //-----------------------------------------------------------------------------
00655 // Various declarations used by the Unicode Character Database
00656 //-----------------------------------------------------------------------------
00657
00658 typedef enum TUniChCategory_
00659 {
00660 #define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff)
00661         DefineUniCat(Letter, 'L'),             // ucLetter
00662         DefineUniCat(Mark, 'M'),
00663         DefineUniCat(Number, 'N'),
00664         DefineUniCat(Punctuation, 'P'),
00665         DefineUniCat(Symbol, 'S'),
00666         DefineUniCat(Separator, 'Z'),
00667         DefineUniCat(Other, 'C')
00668 #undef DefineUniCat
00669 }
00670 TUniChCategory;
00671
00672 typedef enum TUniChSubCategory_
00673 {
00674 #define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
00675         DefineUniSubCat(Letter, Uppercase, 'u'),            // ucLetterUppercase
00676         DefineUniSubCat(Letter, Lowercase, 'l'),
00677         DefineUniSubCat(Letter, Titlecase, 't'),
00678         DefineUniSubCat(Letter, Modifier, 'm'),
00679         DefineUniSubCat(Letter, Other, 'o'),
00680         DefineUniSubCat(Mark, Nonspacing, 'n'),
00681         DefineUniSubCat(Mark, SpacingCombining, 'c'),
00682         DefineUniSubCat(Mark, Enclosing, 'e'),
00683         DefineUniSubCat(Number, DecimalDigit, 'd'),
00684         DefineUniSubCat(Number, Letter, 'l'),
00685         DefineUniSubCat(Number, Other, 'o'),
00686         DefineUniSubCat(Punctuation, Connector, 'c'),
00687         DefineUniSubCat(Punctuation, Dash, 'd'),
00688         DefineUniSubCat(Punctuation, Open, 's'),
00689         DefineUniSubCat(Punctuation, Close, 'e'),
00690         DefineUniSubCat(Punctuation, InitialQuote, 'i'),
00691         DefineUniSubCat(Punctuation, FinalQuote, 'f'),
00692         DefineUniSubCat(Punctuation, Other, 'o'),
00693         DefineUniSubCat(Symbol, Math, 'm'),
00694         DefineUniSubCat(Symbol, Currency, 'c'),
00695         DefineUniSubCat(Symbol, Modifier, 'k'),
00696         DefineUniSubCat(Symbol, Other, 'o'),
00697         DefineUniSubCat(Separator, Space, 's'),
00698         DefineUniSubCat(Separator, Line, 'l'),
00699         DefineUniSubCat(Separator, Paragraph, 'p'),
00700         DefineUniSubCat(Other, Control, 'c'),
00701         DefineUniSubCat(Other, Format, 'f'),
00702         DefineUniSubCat(Other, Surrogate, 's'),
00703         DefineUniSubCat(Other, PrivateUse, 'o'),
00704         DefineUniSubCat(Other, NotAssigned, 'n')
00705 }
00706 TUniChSubCategory;
00707
00708 typedef enum TUniChFlags_
00709 {
00710         ucfCompatibilityDecomposition = 1, // if this flag is not set, the decomposition is canonical
00711         ucfCompositionExclusion = 1 << 1,       // from CompositionExclusions.txt
00712         // Flags used when searching for word boundaries.  See UAX #29.
00713         ucfWbFormat = 1 << 2,
00714         ucfWbKatakana = 1 << 3,
00715         ucfWbALetter = 1 << 4,
00716         ucfWbMidLetter = 1 << 5,
00717         ucfWbMidNum = 1 << 6,
00718         ucfWbNumeric = 1 << 7,
00719         ucfWbExtendNumLet = 1 << 8,
00720         // Flags used with sentence boundaries (Sep is also used with word boundaries).  See UAX #29.
00721         ucfSbSep = 1 << 9,
00722         ucfSbFormat = 1 << 10,
00723         ucfSbSp = 1 << 11,
00724         ucfSbLower = 1 << 12,
00725         ucfSbUpper = 1 << 13,
00726         ucfSbOLetter = 1 << 14,
00727         ucfSbNumeric = 1 << 15,
00728         ucfSbATerm = 1 << 16,
00729         ucfSbSTerm = 1 << 17,
00730         ucfSbClose = 1 << 18,
00731         ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose,
00732         ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep,
00733         // Flags from DerivedCoreProperties.txt.
00734         // [The comments are from UCD.html.]
00735         // - Characters with the Alphabetic property. For more information, see Chapter 4 in [Unicode].
00736         //   Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
00737         ucfDcpAlphabetic = 1 << 19,
00738         // - For programmatic determination of default-ignorable code points.
00739         //   New characters that should be ignored in processing (unless explicitly supported)
00740         //   will be assigned in these ranges, permitting programs to correctly handle the default
00741         //   behavior of such characters when not otherwise supported.  For more information, see
00742         //   UAX #29: Text Boundaries [Breaks].
00743         //   Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - annotation characters
00744         //   [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff, U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors]
00745         ucfDcpDefaultIgnorableCodePoint = 1 << 20,
00746         // - Characters with the Lowercase property.  For more information, see Chapter 4 in [Unicode].
00747         //   Generated from: Other_Lowercase + Ll
00748         ucfDcpLowercase = 1 << 21,
00749         // - For programmatic determination of grapheme cluster boundaries.
00750         //   For more information, see UAX #29: Text Boundaries [Breaks].
00751         //   Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
00752         ucfDcpGraphemeBase = 1 << 22,
00753         // - For programmatic determination of grapheme cluster boundaries.
00754         //   For more information, see UAX #29: Text Boundaries [Breaks].
00755         //   Generated from: Other_Grapheme_Extend + Me + Mn
00756         //   Note: depending on an application's interpretation of Co (private use), they may be either
00757         //         in Grapheme_Base, or in Grapheme_Extend, or in neither.
00758         ucfDcpGraphemeExtend = 1 << 23,
00759         // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
00760         ucfDcpIdStart = 1 << 24,
00761         ucfDcpIdContinue = 1 << 25,
00762         // - Characters with the Math property. For more information, see Chapter 4 in [Unicode].
00763         //   Generated from: Sm + Other_Math
00764         ucfDcpMath = 1 << 26,
00765         // - Characters with the Uppercase property. For more information, see Chapter 4 in [Unicode].
00766         //   Generated from: Lu + Other_Uppercase
00767         ucfDcpUppercase = 1 << 27,
00768         // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
00769         ucfDcpXidStart = 1 << 28,
00770         ucfDcpXidContinue = 1 << 29,
00771         ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase | ucfDcpGraphemeBase | ucfDcpGraphemeExtend |
00772                 ucfDcpIdStart | ucfDcpIdContinue | ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue,
00773 }
00774 TUniChFlags;
00775
00776 typedef enum TUniChProperties_
00777 {
00778         // The flags from PropList.txt.
00779         // [The comments are from UCD.html.]
00780         // - ASCII characters commonly used for the representation of hexadecimal numbers.
00781         //   [= 0123456789abcdefABCDEF]
00782         ucfPrAsciiHexDigit = 1,
00783         // - Those format control characters which have specific functions in the Bidirectional Algorithm.
00784         ucfPrBidiControl = 2,
00785         // - Those punctuation characters explicitly called out as dashes in the Unicode Standard,
00786         //   plus compatibility equivalents to those. Most of these have the Pd General Category,
00787         //   but some have the Sm General Category because of their use in mathematics.
00788         //     U+0002d  HYPHEN-MINUS
00789         //     U+0058a  ARMENIAN HYPHEN
00790         //     U+005be  HEBREW PUNCTUATION MAQAF
00791         //     U+01806  MONGOLIAN TODO SOFT HYPHEN
00792         //     U+02010  HYPHEN
00793         //     U+02011  NON-BREAKING HYPHEN
00794         //     U+02012  FIGURE DASH
00795         //     U+02013  EN DASH
00796         //     U+02014  EM DASH
00797         //     U+02015  HORIZONTAL BAR
00798         //     U+02053  SWUNG DASH
00799         //     U+0207b  SUPERSCRIPT MINUS
00800         //     U+0208b  SUBSCRIPT MINUS
00801         //     U+02212  MINUS SIGN
00802         //     U+02e17  DOUBLE OBLIQUE HYPHEN
00803         //     U+0301c  WAVE DASH
00804         //     U+03030  WAVY DASH
00805         //     U+030a0  KATAKANA-HIRAGANA DOUBLE HYPHEN
00806         //     U+0fe31  PRESENTATION FORM FOR VERTICAL EM DASH
00807         //     U+0fe32  PRESENTATION FORM FOR VERTICAL EN DASH
00808         //     U+0fe58  SMALL EM DASH
00809         //     U+0fe63  SMALL HYPHEN-MINUS
00810         //     U+0ff0d  FULLWIDTH HYPHEN-MINUS
00811         ucfPrDash = 4,
00812         // - For a machine-readable list of deprecated characters.  No characters will ever be removed
00813         //   from the standard, but the usage of deprecated characters is strongly discouraged.
00814         ucfPrDeprecated = 8,
00815         // - Characters that linguistically modify the meaning of another character to which they apply.
00816         //   Some diacritics are not combining characters, and some combining characters are not diacritics.
00817         ucfPrDiacritic = 0x10,
00818         // - Characters whose principal function is to extend the value or shape of a preceding alphabetic
00819         //   character.  Typical of these are length and iteration marks.
00820         ucfPrExtender = 0x20,
00821         // - Used in determining default grapheme cluster boundaries.  For more information, see UAX #29: Text Boundaries.
00822         ucfPrGraphemeLink = 0x40,
00823         // - Characters commonly used for the representation of hexadecimal numbers, plus their compatibility equivalents.
00824         //   [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}]
00825         ucfPrHexDigit = 0x80,
00826         // - Those dashes used to mark connections between pieces of words, plus the Katakana middle dot.
00827         //   The Katakana middle dot functions like a hyphen, but is shaped like a dot rather than a dash.
00828         //     U+0002d  HYPHEN-MINUS
00829         //     U+000ad  SOFT HYPHEN
00830         //     U+0058a  ARMENIAN HYPHEN
00831         //     U+01806  MONGOLIAN TODO SOFT HYPHEN
00832         //     U+02010  HYPHEN
00833         //     U+02011  NON-BREAKING HYPHEN
00834         //     U+02e17  DOUBLE OBLIQUE HYPHEN
00835         //     U+030fb  KATAKANA MIDDLE DOT
00836         //     U+0fe63  SMALL HYPHEN-MINUS
00837         //     U+0ff0d  FULLWIDTH HYPHEN-MINUS
00838         //     U+0ff65  HALFWIDTH KATAKANA MIDDLE DOT
00839         ucfPrHyphen = 0x100,
00840         // - Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) ideographs.
00841         ucfPrIdeographic = 0x200,
00842         // - Those format control characters which have specific functions for control of cursive joining and ligation.
00843         ucfPrJoinControl = 0x400,
00844         // - There are a small number of characters that do not use logical order.
00845         //   These characters require special handling in most processing.
00846         ucfPrLogicalOrderException = 0x800,
00847         // - Code points that are permanently reserved for internal use.
00848         ucfPrNoncharacterCodePoint = 0x1000,
00849         // - Used for pattern syntax as described in UAX #31: Identifier and Pattern Syntax.
00850         ucfPrPatternSyntax = 0x2000,
00851         ucfPrPatternWhiteSpace = 0x4000,
00852         // - Those punctuation characters that function as quotation marks.
00853         //     U+00022  QUOTATION MARK
00854         //     U+00027  APOSTROPHE
00855         //     U+000ab  LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
00856         //     U+000bb  RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
00857         //     U+02018  LEFT SINGLE QUOTATION MARK
00858         //     U+02019  RIGHT SINGLE QUOTATION MARK
00859         //     U+0201a  SINGLE LOW-9 QUOTATION MARK
00860         //     U+0201b  SINGLE HIGH-REVERSED-9 QUOTATION MARK
00861         //     U+0201c  LEFT DOUBLE QUOTATION MARK
00862         //     U+0201d  RIGHT DOUBLE QUOTATION MARK
00863         //     U+0201e  DOUBLE LOW-9 QUOTATION MARK
00864         //     U+0201f  DOUBLE HIGH-REVERSED-9 QUOTATION MARK
00865         //     U+02039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
00866         //     U+0203a  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
00867         //     U+0300c  LEFT CORNER BRACKET
00868         //     U+0300d  RIGHT CORNER BRACKET
00869         //     U+0300e  LEFT WHITE CORNER BRACKET
00870         //     U+0300f  RIGHT WHITE CORNER BRACKET
00871         //     U+0301d  REVERSED DOUBLE PRIME QUOTATION MARK
00872         //     U+0301e  DOUBLE PRIME QUOTATION MARK
00873         //     U+0301f  LOW DOUBLE PRIME QUOTATION MARK
00874         //     U+0fe41  PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
00875         //     U+0fe42  PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
00876         //     U+0fe43  PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
00877         //     U+0fe44  PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
00878         //     U+0ff02  FULLWIDTH QUOTATION MARK
00879         //     U+0ff07  FULLWIDTH APOSTROPHE
00880         //     U+0ff62  HALFWIDTH LEFT CORNER BRACKET
00881         //     U+0ff63  HALFWIDTH RIGHT CORNER BRACKET
00882         ucfPrQuotationMark = 0x8000,
00883         // - Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear.
00884         //   An explicit _dot above_ can be added where required, such as in Lithuanian.
00885         ucfPrSoftDotted = 0x10000,
00886         // - Sentence Terminal. Used in UAX #29: Text Boundaries.
00887         //     U+00021  EXCLAMATION MARK
00888         //     U+0002e  FULL STOP
00889         //     U+0003f  QUESTION MARK
00890         //     U+0203c  DOUBLE EXCLAMATION MARK
00891         //     U+0203d  INTERROBANG
00892         //     U+02047  DOUBLE QUESTION MARK
00893         //     U+02048  QUESTION EXCLAMATION MARK
00894         //     U+02049  EXCLAMATION QUESTION MARK
00895         //     U+03002  IDEOGRAPHIC FULL STOP
00896         //     [plus many characters from other writing systems]
00897         ucfPrSTerm = 0x20000,
00898         // - Those punctuation characters that generally mark the end of textual units.
00899         //   [JB note: this set contains more character than STerm.  For example, it contains
00900         //   the comma, colon and semicolon, whereas STerm doesn't.]
00901         //     U+00021  EXCLAMATION MARK
00902         //     U+0002c  COMMA
00903         //     U+0002e  FULL STOP
00904         //     U+0003a  COLON
00905         //     U+0003b  SEMICOLON
00906         //     U+0003f  QUESTION MARK
00907         //     U+0203c  DOUBLE EXCLAMATION MARK
00908         //     U+0203d  INTERROBANG
00909         //     U+02047  DOUBLE QUESTION MARK
00910         //     U+02048  QUESTION EXCLAMATION MARK
00911         //     U+02049  EXCLAMATION QUESTION MARK
00912         //     [plus *lots* of charcters from other writing systems]
00913         ucfPrTerminalPunctuation = 0x40000,
00914         // - Indicates all those characters that qualify as Variation Selectors.
00915         //   For details on the behavior of these characters, see StandardizedVariants.html and
00916         //   Section 16.4, Variation Selectors in [Unicode].
00917         ucfPrVariationSelector = 0x80000,
00918         // - Those separator characters and control characters which should be treated by
00919         //   programming languages as "white space" for the purpose of parsing elements.
00920         //   Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included,
00921         //         since their functions are restricted to line-break control.
00922         //         Their names are unfortunately misleading in this respect.
00923         //   Note: There are other senses of "whitespace" that encompass a different set of characters.
00924         //         [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt.
00925         //         There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.]
00926         //   This includes the following characters:
00927         //     U+0009  <control>
00928         //     U+000a  <control>
00929         //     U+000b  <control>
00930         //     U+000c  <control>
00931         //     U+000d  <control>
00932         //     U+0020  SPACE
00933         //     U+0085  <control>
00934         //     U+00a0  NO-BREAK SPACE
00935         //     U+1680  OGHAM SPACE MARK
00936         //     U+180e  MONGOLIAN VOWEL SEPARATOR
00937         //     U+2000  EN QUAD
00938         //     U+2001  EM QUAD
00939         //     U+2002  EN SPACE
00940         //     U+2003  EM SPACE
00941         //     U+2004  THREE-PER-EM SPACE
00942         //     U+2005  FOUR-PER-EM SPACE
00943         //     U+2006  SIX-PER-EM SPACE
00944         //     U+2007  FIGURE SPACE
00945         //     U+2008  PUNCTUATION SPACE
00946         //     U+2009  THIN SPACE
00947         //     U+200a  HAIR SPACE
00948         //     U+2028  LINE SEPARATOR
00949         //     U+2029  PARAGRAPH SEPARATOR
00950         //     U+202f  NARROW NO-BREAK SPACE
00951         //     U+205f  MEDIUM MATHEMATICAL SPACE
00952         //     U+3000  IDEOGRAPHIC SPACE
00953         ucfPrWhiteSpace = 0x100000
00954 }
00955 TUniChProperties;
00956
00957 typedef enum TUniChPropertiesX_
00958 {
00959         // More properties from PropList.txt.
00960         // - Used to derive the properties in DerivedCoreProperties.txt.
00961         ucfPxOtherAlphabetic = 1,
00962         ucfPxOtherDefaultIgnorableCodePoint = 2,
00963         ucfPxOtherGraphemeExtend = 4,
00964         ucfPxOtherIdContinue = 8,
00965         ucfPxOtherIdStart = 0x10,
00966         ucfPxOtherLowercase = 0x20,
00967         ucfPxOtherMath = 0x40,
00968         ucfPxOtherUppercase = 0x80,
00969         // - Used in ideographic description sequences.
00970         ucfPxIdsBinaryOperator = 0x100,
00971         ucfPxIdsTrinaryOperator = 0x200,
00972         ucfPxRadical = 0x400,
00973         ucfPxUnifiedIdeograph = 0x800
00974 }
00975 TUniChPropertiesX;
00976
00977 //-----------------------------------------------------------------------------
00978 // TUniChInfo -- contains information about a single Unicode codepoint
00979 //-----------------------------------------------------------------------------
00980
00981 class TUniChInfo
00982 {
00983 public:
00984         enum { // combining classes (for 'combClass'); from UnicodeData.txt
00985                 ccStarter = 0, // 0: Spacing, split, enclosing, reordrant, and Tibetan subjoined
00986                 ccOverlaysAndInterior = 1,
00987                 ccNuktas = 7,
00988                 ccHiraganaKatakanaVoicingMarks = 8,
00989                 ccViramas = 9,
00990                 ccFixedPositionStart = 10, // Start of fixed position classes
00991                 ccFixedPositionEnd = 199, // End of fixed position classes
00992                 ccBelowLeftAttached = 200,
00993                 ccBelowAttached = 202,
00994                 ccBelowRightAttached = 204,
00995                 ccLeftAttached = 208, // Left attached (reordrant around single base character)
00996                 ccRightAttached = 210,
00997                 ccAboveLeftAttached = 212,
00998                 ccAboveAttached = 214,
00999                 ccAboveRightAttached = 216,
01000                 ccBelowLeft = 218,
01001                 ccBelow = 220,
01002                 ccBelowRight = 222,
01003                 ccLeft = 224, // Left (reordrant around single base character)
01004                 ccRight = 226,
01005                 ccAboveLeft = 228,
01006                 ccAbove = 230,
01007                 ccAboveRight = 232,
01008                 ccDoubleBelow = 233,
01009                 ccDoubleAbove = 234,
01010                 ccBelowIotaSubscript = 240, // Below (iota subscript)
01011                 ccInvalid = 255 // not defined by Unicode
01012         };
01013         char chCat, chSubCat; // chCat + chSubCat together comprise the general category (from UnicodeData.txt)
01014         uchar combClass; // canonical combining class
01015         TUniChCategory cat; // = TUniChCategory(chCat)
01016         TUniChSubCategory subCat; // = TUniChSubCategory(cat << 8 | subCat)
01017         signed char script; // keyId into 'TUniChDb.scriptNames'; -1 if unknown
01018         int simpleUpperCaseMapping, simpleLowerCaseMapping, simpleTitleCaseMapping; // from UnicodeData.txt
01019         int decompOffset; // offset into 'TUniChDb.decompositions'; or -1 if the character doesn't change during decomposition
01020         int nameOffset; // offset into 'TUniChDb.charNames'
01021         int flags; // a combination of TUniChFlags
01022         int properties; // a combination of TUniChProperties
01023         int propertiesX; // a combination of TUniChPropertiesX
01024         ushort lineBreak; // from LineBreak.txt
01025
01026         // Converts a 2-letter linebreak code into a 16-bit integer.
01027         static inline ushort GetLineBreakCode(char c1, char c2) { return ((ushort(uchar(c1)) & 0xff) << 8) | ((ushort(uchar(c2)) & 0xff)); }
01028         static const ushort LineBreak_Unknown, LineBreak_ComplexContext, LineBreak_Numeric, LineBreak_InfixNumeric, LineBreak_Quotation;
01029
01030 public:
01031         void InitAfterLoad() {
01032                 cat = (TUniChCategory) chCat;
01033                 subCat = (TUniChSubCategory) (((int(uchar(chCat)) & 0xff) << 8) | (int(uchar(chSubCat)) & 0xff)); }
01034         void SetCatAndSubCat(const TUniChSubCategory catAndSubCat) {
01035                 cat = (TUniChCategory) ((int(catAndSubCat) >> 8) & 0xff);
01036                 subCat = catAndSubCat;
01037                 chCat = (char) cat; chSubCat = (char) (int(subCat) & 0xff); }
01038         friend class TUniChDb;
01039
01040         // Inexplicably missing from TSIn/TSOut...
01041         static inline void LoadUShort(TSIn& SIn, ushort& u) { SIn.LoadBf(&u, sizeof(u)); }
01042         static inline void LoadSChar(TSIn& SIn, signed char& u) { SIn.LoadBf(&u, sizeof(u)); }
01043         static inline void SaveUShort(TSOut& SOut, ushort u) { SOut.SaveBf(&u, sizeof(u)); }
01044         static inline void SaveSChar(TSOut& SOut, signed char u) { SOut.SaveBf(&u, sizeof(u)); }
01045
01046 public:
01047         void Save(TSOut& SOut) const {
01048                 SOut.Save(chCat); SOut.Save(chSubCat); SOut.Save(combClass); SaveSChar(SOut, script);
01049                 SOut.Save(simpleUpperCaseMapping); SOut.Save(simpleLowerCaseMapping); SOut.Save(simpleTitleCaseMapping);
01050                 SOut.Save(decompOffset); SOut.Save(nameOffset);
01051                 SOut.Save(flags); SOut.Save(properties); SOut.Save(propertiesX); SaveUShort(SOut, lineBreak); }
01052         void Load(TSIn& SIn) {
01053                 SIn.Load(chCat); SIn.Load(chSubCat); SIn.Load(combClass); LoadSChar(SIn, script);
01054                 SIn.Load(simpleUpperCaseMapping); SIn.Load(simpleLowerCaseMapping); SIn.Load(simpleTitleCaseMapping);
01055                 SIn.Load(decompOffset); SIn.Load(nameOffset);
01056                 SIn.Load(flags); SIn.Load(properties); SIn.Load(propertiesX); LoadUShort(SIn, lineBreak); InitAfterLoad(); }
01057         explicit TUniChInfo(TSIn& SIn) { Load(SIn); }
01058         TUniChInfo() : chCat(char(ucOther)), chSubCat(char(ucOtherNotAssigned & 0xff)), combClass(ccInvalid),
01059                 script(-1),simpleUpperCaseMapping(-1), simpleLowerCaseMapping(-1), simpleTitleCaseMapping(-1),
01060                 decompOffset(-1), nameOffset(-1), flags(0), properties(0), propertiesX(0), lineBreak(LineBreak_Unknown) {
01061                 InitAfterLoad(); }
01062
01063         // DerivedCoreProperties flags.
01064         bool IsDcpFlag(const TUniChFlags flag) const { Assert((flag & ucfDcpMask) == flag); return (flags & flag) == flag; }
01065         void ClrDcpFlags() { flags = flags & ~ucfDcpMask; }
01066         void SetDcpFlag(const TUniChFlags flag) { Assert((flag & ucfDcpMask) == flag); flags |= flag; }
01067         bool IsAlphabetic() const { return IsDcpFlag(ucfDcpAlphabetic); }
01068         bool IsUppercase() const { return IsDcpFlag(ucfDcpUppercase); }
01069         bool IsLowercase() const { return IsDcpFlag(ucfDcpLowercase); }
01070         bool IsMath() const { return IsDcpFlag(ucfDcpMath); }
01071         bool IsDefaultIgnorable() const { return IsDcpFlag(ucfDcpDefaultIgnorableCodePoint); }
01072         bool IsGraphemeBase() const { return IsDcpFlag(ucfDcpGraphemeBase); }
01073         bool IsGraphemeExtend() const { return IsDcpFlag(ucfDcpGraphemeExtend); }
01074         bool IsIdStart() const { return IsDcpFlag(ucfDcpIdStart); }
01075         bool IsIdContinue() const { return IsDcpFlag(ucfDcpIdContinue); }
01076         bool IsXidStart() const { return IsDcpFlag(ucfDcpXidStart); }
01077         bool IsXidContinue() const { return IsDcpFlag(ucfDcpXidContinue); }
01078
01079         // PropList.txt flags.
01080         bool IsProperty(const TUniChProperties flag) const { return (properties & flag) == flag; }
01081         void SetProperty(const TUniChProperties flag) { properties |= flag; }
01082         bool IsAsciiHexDigit() const { return IsProperty(ucfPrAsciiHexDigit); }
01083         bool IsBidiControl() const { return IsProperty(ucfPrBidiControl); }
01084         bool IsDash() const { return IsProperty(ucfPrDash); }
01085         bool IsDeprecated() const { return IsProperty(ucfPrDeprecated); }
01086         bool IsDiacritic() const { return IsProperty(ucfPrDiacritic); }
01087         bool IsExtender() const { return IsProperty(ucfPrExtender); }
01088         bool IsGraphemeLink() const { return IsProperty(ucfPrGraphemeLink); }
01089         bool IsHexDigit() const { return IsProperty(ucfPrHexDigit); }
01090         bool IsHyphen() const { return IsProperty(ucfPrHyphen); }
01091         bool IsIdeographic() const { return IsProperty(ucfPrIdeographic); }
01092         bool IsJoinControl() const { return IsProperty(ucfPrJoinControl); }
01093         bool IsLogicalOrderException() const { return IsProperty(ucfPrLogicalOrderException); }
01094         bool IsNoncharacter() const { return IsProperty(ucfPrNoncharacterCodePoint); }
01095         bool IsQuotationMark() const { return IsProperty(ucfPrQuotationMark); }
01096         bool IsSoftDotted() const { return IsProperty(ucfPrSoftDotted); }
01097         bool IsSTerminal() const { return IsProperty(ucfPrSTerm); }
01098         bool IsTerminalPunctuation() const { return IsProperty(ucfPrTerminalPunctuation); }
01099         bool IsVariationSelector() const { return IsProperty(ucfPrVariationSelector); }
01100         bool IsWhiteSpace() const { return IsProperty(ucfPrWhiteSpace); }
01101
01102         // Additional PropList.txt flags.
01103         bool IsPropertyX(const TUniChPropertiesX flag) const { return (propertiesX & flag) == flag; }
01104         void SetPropertyX(const TUniChPropertiesX flag) { propertiesX |= flag; }
01105
01106         // Miscellaneous flags.
01107         bool IsCompositionExclusion() const { return (flags & ucfCompositionExclusion) == ucfCompositionExclusion; }
01108         bool IsCompatibilityDecomposition() const { return (flags & ucfCompatibilityDecomposition) == ucfCompatibilityDecomposition; }
01109
01110         // Word-boundary flags.
01111         bool IsWbFlag(const TUniChFlags flag) const { Assert((flag & ucfWbMask) == flag); return (flags & flag) == flag; }
01112         void ClrWbAndSbFlags() { flags = flags & ~(ucfWbMask | ucfSbMask); }
01113         void SetWbFlag(const TUniChFlags flag) { Assert((flag & ucfWbMask) == flag); flags |= flag; }
01114         int GetWbFlags() const { return flags & ucfWbMask; }
01115         bool IsWbFormat() const { return IsWbFlag(ucfWbFormat); }
01116         TStr GetWbFlagsStr() const { return GetWbFlagsStr(GetWbFlags()); }
01117         static TStr GetWbFlagsStr(const int flags) { return TStr("") + (flags & ucfWbALetter ? "A" : "") +
01118                 (flags & ucfWbFormat ? "F" : "") + (flags & ucfWbKatakana ? "K" : "") + (flags & ucfWbMidLetter ? "M" : "") +
01119                 (flags & ucfWbMidNum ? "m" : "") + (flags & ucfWbNumeric ? "N" : "") + (flags & ucfWbExtendNumLet ? "E" : ""); }
01120
01121         // Sentence-boundary flags.
01122         bool IsSbFlag(const TUniChFlags flag) const { Assert((flag & ucfSbMask) == flag); return (flags & flag) == flag; }
01123         void SetSbFlag(const TUniChFlags flag) { Assert((flag & ucfSbMask) == flag); flags |= flag; }
01124         int GetSbFlags() const { return flags & ucfSbMask; }
01125         bool IsSbFormat() const { return IsSbFlag(ucfSbFormat); }
01126         TStr GetSbFlagsStr() const { return GetSbFlagsStr(GetSbFlags()); }
01127         static TStr GetSbFlagsStr(const int flags) { return TStr("") + (flags & ucfSbSep ? "S" : "") +
01128                 (flags & ucfSbFormat ? "F" : "") + (flags & ucfSbSp ? "_" : "") + (flags & ucfSbLower ? "L" : "") +
01129                 (flags & ucfSbUpper ? "U" : "") + (flags & ucfSbOLetter ? "O" : "") + (flags & ucfSbNumeric ? "N" : "") +
01130                 (flags & ucfSbATerm ? "A" : "") + (flags & ucfSbSTerm ? "T" : "") + (flags & ucfSbClose ? "C" : ""); }
01131
01132         bool IsSbSep() const { return (flags & ucfSbSep) == ucfSbSep; }
01133
01134         // Grapheme-boundary flags.
01135         bool IsGbExtend() const { return IsGraphemeExtend(); }
01136
01137         // Sec. 3.13, D47: C is cased iff it is uppercase, lowercase, or general_category == titlecase_letter.
01138         bool IsCased() const { return IsUppercase() || IsLowercase() || (subCat == ucLetterTitlecase); }
01139
01140         // Character categories.
01141         TUniChCategory GetCat() const { return (TUniChCategory) cat; }
01142         TUniChSubCategory GetSubCat() const { return (TUniChSubCategory) subCat; }
01143         // The following characters belong to the 'symbol/currency' subcategory:
01144         //     U+00024  DOLLAR SIGN
01145         //     U+000a2  CENT SIGN
01146         //     U+000a3  POUND SIGN
01147         //     U+000a4  CURRENCY SIGN
01148         //     U+000a5  YEN SIGN
01149         //     U+020a3  FRENCH FRANC SIGN
01150         //     U+020a4  LIRA SIGN
01151         //     U+020ac  EURO SIGN
01152         //     [and plenty of others]
01153         bool IsCurrency() const { return subCat == ucSymbolCurrency; }
01154         // Note: most private-use and surrogate characters aren't listed explicitly in UnicodeData.txt.
01155         // Thus, it's better to call TUniChDb's versions of these methods, which are aware of
01156         // the full ranges of private-use and surrogate characters.
01157         bool IsPrivateUse() const { return subCat == ucOtherPrivateUse; }
01158         bool IsSurrogate() const { return subCat == ucOtherSurrogate; }
01159
01160         inline static bool IsValidSubCat(const char chCat, const char chSubCat) {
01161                 static const char s[] = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoSmScSkSoZsZlZpCcCfCsCoCn";
01162                 for (const char *p = s; *p; p += 2)
01163                         if (chCat == p[0] && chSubCat == p[1]) return true;
01164                 return false; }
01165 };
01166
01167 //-----------------------------------------------------------------------------
01168 // TUniTrie -- a trie for suffixes that should not appear at the end
01169 // of a sentence
01170 //-----------------------------------------------------------------------------
01171
01172 template<typename TItem_>
01173 class TUniTrie
01174 {
01175 public:
01176         typedef TItem_ TItem;
01177 protected:
01178         class TNode {
01179         public:
01180                 TItem item;
01181                 int child, sib;
01182                 bool terminal;
01183                 TNode() : child(-1), sib(-1), terminal(false) { }
01184                 TNode(const TItem& item_, const int child_, const int sib_, const bool terminal_) : item(item_), child(child_), sib(sib_), terminal(terminal_) { }
01185         };
01186         typedef TVec<TNode> TNodeV;
01187         typedef TPair<TItem, TItem> TItemPr;
01188         typedef TTriple<TItem, TItem, TItem> TItemTr;
01189         typedef TUniVecIdx TVecIdx;
01190         THash<TItem, TVoid> singles; //
01191         THash<TItemPr, TVoid> pairs;
01192         THash<TItemTr, TInt> roots;
01193         TNodeV nodes;
01194 public:
01195         TUniTrie() { }
01196         void Clr() { singles.Clr(); pairs.Clr(); roots.Clr(); nodes.Clr(); }
01197
01198         bool Empty() const { return singles.Empty() && pairs.Empty() && roots.Empty(); }
01199
01200         bool Has1Gram(const TItem& item) const { return singles.IsKey(item); }
01201         bool Has2Gram(const TItem& last, const TItem& butLast) const { return pairs.IsKey(TItemPr(last, butLast)); }
01202         int Get3GramRoot(const TItem& last, const TItem& butLast, const TItem& butButLast) const {
01203                 int keyId = roots.GetKeyId(TItemTr(last, butLast, butButLast));
01204                 if (keyId < 0) return 0; else return roots[keyId]; }
01205         int GetChild(const int parentIdx, const TItem& item) const {
01206                 for (int childIdx = nodes[parentIdx].child; childIdx >= 0; ) {
01207                         const TNode &node = nodes[childIdx];
01208                         if (node.item == item) return childIdx;
01209                         childIdx = node.sib; }
01210                 return -1; }
01211         bool IsNodeTerminal(const int nodeIdx) const { return nodes[nodeIdx].terminal; }
01212
01213         // Adds a new string to the trie.  Note that the last characters appear
01214         // closer to the root of the trie.
01215         template<typename TSrcVec>
01216         void Add(const TSrcVec& src, const size_t srcIdx, const size_t srcCount)
01217         {
01218                 IAssert(srcCount > 0);
01219                 if (srcCount == 1) { singles.AddKey(TItem(src[TVecIdx(srcIdx)])); return; }
01220                 if (srcCount == 2) { pairs.AddKey(TItemPr(TItem(src[TVecIdx(srcIdx + 1)]), TItem(src[TVecIdx(srcIdx)]))); return; }
01221                 size_t srcLast = srcIdx + (srcCount - 1);
01222                 TItemTr tr = TItemTr(TItem(src[TVecIdx(srcLast)]), TItem(src[TVecIdx(srcLast - 1)]), TItem(src[TVecIdx(srcLast - 2)]));
01223                 int keyId = roots.GetKeyId(tr), curNodeIdx = -1;
01224                 if (keyId >= 0) curNodeIdx = roots[keyId];
01225                 else { curNodeIdx = nodes.Add(TNode(TItem(0), -1, -1, false)); roots.AddDat(tr, curNodeIdx); }
01226                 //
01227                 if (srcCount > 3) for (size_t srcPos = srcLast - 3; ; )
01228                 {
01229                         const TItem curItem = src[TVecIdx(srcPos)];
01230                         int childNodeIdx = nodes[curNodeIdx].child;
01231                         while (childNodeIdx >= 0) {
01232                                 TNode &childNode = nodes[childNodeIdx];
01233                                 if (childNode.item == curItem) break;
01234                                 childNodeIdx = childNode.sib; }
01235                         if (childNodeIdx < 0) {
01236                                 childNodeIdx = nodes.Add(TNode(curItem, -1, nodes[curNodeIdx].child, false));
01237                                 nodes[curNodeIdx].child = childNodeIdx; }
01238                         curNodeIdx = childNodeIdx;
01239                         if (srcPos == srcIdx) break; else srcPos--;
01240                 }
01241                 nodes[curNodeIdx].terminal = true;
01242         }
01243
01244         template<typename TSrcVec>
01245         void Add(const TSrcVec& src) { Add(src, 0, (size_t) src.Len()); }
01246 };
01247
01248 //-----------------------------------------------------------------------------
01249 // TUniChDb -- provides access to the Unicode Character Database
01250 //-----------------------------------------------------------------------------
01251
01252 class TUniChDb
01253 {
01254 protected:
01255         void InitAfterLoad();
01256         typedef TUniVecIdx TVecIdx;
01257
01258 public:
01259         THash<TInt, TUniChInfo> h; // key: codepoint
01260         TStrPool charNames;
01261         TStrIntH scripts; // keyID = used in TUniChInfo.script; key = script name; dat = number of characters (informative only)
01262         TIntV decompositions;
01263         THash<TIntPr, TInt> inverseDec;
01264         TUniCaseFolding caseFolding;
01265         // These hash tables contain only the unconditional mappings from SpecialCasing.txt.
01266         // The conditional mappings are hardcoded into GetCaseConverted().
01267         TIntIntVH specialCasingLower, specialCasingUpper, specialCasingTitle;
01268         int scriptUnknown; // = scripts.GetKey("Unknown")
01269
01270         TUniChDb() : scriptUnknown(-1) { }
01271         explicit TUniChDb(TSIn& SIn) { Load(SIn); }
01272         void Clr() {
01273                 h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr();
01274                 specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr();
01275                 scripts.Clr(); }
01276         void Save(TSOut& SOut) const {
01277                 h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut);
01278                 inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut);
01279                 specialCasingLower.Save(SOut); specialCasingUpper.Save(SOut); specialCasingTitle.Save(SOut);
01280                 SOut.SaveCs(); }
01281         void Load(TSIn& SIn) {
01282                 h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn);
01283                 decompositions.Load(SIn);
01284                 inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn);
01285                 specialCasingLower.Load(SIn); specialCasingUpper.Load(SIn); specialCasingTitle.Load(SIn);
01286                 SIn.LoadCs(); InitAfterLoad(); }
01287         void LoadBin(const TStr& fnBin) {
01288                 PSIn SIn = TFIn::New(fnBin); Load(*SIn); }
01289         void Test(const TStr& basePath);
01290
01291         // File names used by LoadTxt() and its subroutines.
01292         static TStr GetCaseFoldingFn() { return "CaseFolding.txt"; }
01293         static TStr GetSpecialCasingFn() { return "SpecialCasing.txt"; }
01294         static TStr GetUnicodeDataFn() { return "UnicodeData.txt"; }
01295         static TStr GetCompositionExclusionsFn() { return "CompositionExclusions.txt"; }
01296         static TStr GetScriptsFn() { return "Scripts.txt"; }
01297         static TStr GetDerivedCorePropsFn() { return "DerivedCoreProperties.txt"; }
01298         static TStr GetLineBreakFn() { return "LineBreak.txt"; }
01299         static TStr GetPropListFn() { return "PropList.txt"; }
01300         static TStr GetAuxiliaryDir() { return "auxiliary"; }
01301         static TStr GetWordBreakTestFn() { return "WordBreakTest.txt"; }
01302         static TStr GetWordBreakPropertyFn() { return "WordBreakProperty.txt"; }
01303         static TStr GetSentenceBreakTestFn() { return "SentenceBreakTest.txt"; }
01304         static TStr GetSentenceBreakPropertyFn() { return "SentenceBreakProperty.txt"; }
01305         static TStr GetNormalizationTestFn() { return "NormalizationTest.txt"; }
01306         static TStr GetBinFn() { return "UniChDb.bin"; } // used only by Test()
01307
01308         //-------------------------------------------------------------------------
01309         // Script names
01310         //-------------------------------------------------------------------------
01311
01312         // These constants are used when initializing from the text files.
01313         static TStr GetScriptNameUnknown() { return "Unknown"; }
01314         static TStr GetScriptNameKatakana() { return "Katakana"; }
01315         static TStr GetScriptNameHiragana() { return "Hiragana"; }
01316         //
01317         const TStr& GetScriptName(const int scriptId) const { return scripts.GetKey(scriptId); }
01318         int GetScriptByName(const TStr& scriptName) const { return scripts.GetKeyId(scriptName); }
01319         int GetScript(const TUniChInfo& ci) const { int s = ci.script; if (s < 0) s = scriptUnknown; return s; }
01320         int GetScript(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return scriptUnknown; else return GetScript(h[i]); }
01321
01322         //-------------------------------------------------------------------------
01323         // Character namesnames
01324         //-------------------------------------------------------------------------
01325
01326         // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234".
01327         const char *GetCharName(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; int ofs = h[i].nameOffset; return ofs < 0 ? 0 : charNames.GetCStr(ofs); }
01328         TStr GetCharNameS(const int cp) const {
01329                 // ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16).
01330                 const char *p = GetCharName(cp); if (p) return p;
01331                 char buf[20]; sprintf(buf, "U+%04x", cp); return TStr(buf); }
01332         template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, size_t srcIdx, const size_t srcCount, const TStr& prefix) const {
01333                 if (! f) f = stdout;
01334                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
01335                         fprintf(f, "%s", prefix.CStr());
01336                         int cp = src[TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ? "U+%05x" : "U+%04x "), cp);
01337                         fprintf(f, " %s\n", GetCharNameS(cp).CStr()); }}
01338         template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, const TStr& prefix) const { PrintCharNames(f, src, 0, src.Len(), prefix); }
01339
01340         //-------------------------------------------------------------------------
01341         // Character information
01342         //-------------------------------------------------------------------------
01343         // These methods provide access to a subset of the functionality
01344         // available in TUniChInfo.
01345
01346         bool IsGetChInfo(const int cp, TUniChInfo& ChInfo) {
01347                 int i = h.GetKeyId(cp);
01348                 if (i < 0) return false; else { ChInfo=h[i]; return true; }}
01349         TUniChCategory GetCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOther; else return h[i].cat; }
01350         TUniChSubCategory GetSubCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOtherNotAssigned; else return h[i].subCat; }
01351
01352         bool IsWbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsWbFlag(flag); }
01353         int GetWbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetWbFlags(); }
01354         bool IsSbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsSbFlag(flag); }
01355         int GetSbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetSbFlags(); }
01356
01357 #define ___UniFwd1(name) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); }
01358 #define ___UniFwd2(name1, name2) ___UniFwd1(name1) ___UniFwd1(name2)
01359 #define ___UniFwd3(name1, name2, name3) ___UniFwd2(name1, name2) ___UniFwd1(name3)
01360 #define ___UniFwd4(name1, name2, name3, name4) ___UniFwd3(name1, name2, name3) ___UniFwd1(name4)
01361 #define ___UniFwd5(name1, name2, name3, name4, name5) ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5)
01362
01363 #define DECLARE_FORWARDED_PROPERTY_METHODS \
01364         ___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \
01365         ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic)  \
01366         ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted)  \
01367         ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace)  \
01368         ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable)  \
01369         ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue)  \
01370         ___UniFwd2(IsXidStart, IsXidContinue)  \
01371         ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep)  \
01372         ___UniFwd1(IsGbExtend)  \
01373         ___UniFwd2(IsCased, IsCurrency)
01374
01375         DECLARE_FORWARDED_PROPERTY_METHODS
01376
01377 #undef ___UniFwd1
01378
01379         bool IsPrivateUse(const int cp) const {
01380                 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsPrivateUse();
01381                 return (0xe000 <= cp && cp <= 0xf8ff) ||  // plane 0 private-use area
01382                         // Planes 15 and 16 are entirely for private use.
01383                         (0xf0000 <= cp && cp <= 0xffffd) || (0x100000 <= cp && cp <= 0x10fffd); }
01384         // Note: d800..dbff are high surrogates, dc00..dfff are low surrogates.
01385         // For db80..dbff it is clear that the surrogate pair containing this high surrogate
01386         // will refer to a private-use codepoint, but IsPrivateUse nevertheless returns false
01387         // for db80..dbff.  This is consistent with the category codes assigned in UnicodeData.txt.
01388         bool IsSurrogate(const int cp) const {
01389                 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsSurrogate();
01390                 return 0xd800 <= cp && cp <= 0xdcff; }
01391
01392         // Note: in particular, all Hangul characters (HangulLBase..HangulLBase + HangulLCount - 1
01393         // and HangulSBase..HangulSBase + HangulSCount - 1) should be treated as starters
01394         // for composition to work correctly.
01395         int GetCombiningClass(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return TUniChInfo::ccStarter; else return h[i].combClass; }
01396
01397         //-------------------------------------------------------------------------
01398         // Hangul constants
01399         //-------------------------------------------------------------------------
01400
01401         enum {
01402         HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7,
01403         HangulLCount = 19, HangulVCount = 21, HangulTCount = 28,
01404         HangulNCount = HangulVCount * HangulTCount,   // 588
01405         HangulSCount = HangulLCount * HangulNCount   // 11172
01406         };
01407
01408         //-------------------------------------------------------------------------
01409         // Word boundaries (UAX #29)
01410         //-------------------------------------------------------------------------
01411
01412 protected:
01413         // UAX #29, rule WB3: ignore Format and Extend characters.
01414         // [Note: rule SB5 for sentence boundaries is identical, and thus these methods will also be used for sentence-boundary detection.]
01415         static bool IsWbIgnored(const TUniChInfo& ci) { return ci.IsGbExtend() || ci.IsWbFormat(); }
01416         bool IsWbIgnored(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return IsWbIgnored(h[i]); }
01417         // Sets 'position' to the smallest index from 'position..srcEnd-1' that contains a non-ignored character.
01418         template<typename TSrcVec> void WbFindCurOrNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
01419                 while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
01420         // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character.
01421         template<typename TSrcVec> void WbFindNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
01422                 if (position >= srcEnd) return;
01423                 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
01424         // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character.
01425         template<typename TSrcVec> void WbFindNextNonIgnoredS(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
01426                 if (position >= srcEnd) return;
01427                 if (IsSbSep(src[TVecIdx(position)])) { position++; return; }
01428                 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
01429         // Sets 'position' to the largest index from 'srcStart..position-1' that contains a non-ignored character.
01430         template<typename TSrcVec> bool WbFindPrevNonIgnored(const TSrcVec& src, const size_t srcStart, size_t& position) const {
01431                 if (position <= srcStart) return false;
01432                 while (position > srcStart) {
01433                         position--; if (! IsWbIgnored(src[TVecIdx(position)])) return true; }
01434                 return false; }
01435         // Test driver for WbFind*NonIgnored.
01436         void TestWbFindNonIgnored(const TIntV& src) const;
01437         void TestWbFindNonIgnored() const;
01438 public:
01439         // Finds the next word boundary strictly after 'position'.
01440         // Note that there is a valid word boundary at 'srcIdx + srcCount'.
01441         // If there is no such word boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'.
01442         template<typename TSrcVec>
01443         bool FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const;
01444         // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a word
01445         // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'.  Note that 'dest[0]' and 'dest[srcCount]' are
01446         // always set to 'true'.
01447         template<typename TSrcVec>
01448         void FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const;
01449 protected:
01450         void TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence);
01451
01452         //-------------------------------------------------------------------------
01453         // Sentence boundaries (UAX #29)
01454         //-------------------------------------------------------------------------
01455
01456 protected:
01457         TUniTrie<TInt> sbExTrie;
01458
01459         // Checks whether a sentence that ended at src[position - 1]
01460         // would end in one of the suffixes from sbExTrie.
01461         template<typename TSrcVec>
01462         bool CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const;
01463
01464 public:
01465         // Finds the next sentence boundary strictly after 'position'.
01466         // Note that there is a valid sentence boundary at 'srcIdx + srcCount'.
01467         // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'.
01468         template<typename TSrcVec>
01469         bool FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const;
01470         // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a sentence
01471         // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'.  Note that 'dest[0]' and 'dest[srcCount]' are
01472         // always set to 'true'.
01473         template<typename TSrcVec>
01474         void FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const;
01475
01476         // These methods allow the user to define a set of sentence boundary exceptions.
01477         // This is a set of strings, stored in 'sbExTrie'.  If the Unicode rules require
01478         // a sentence boundary in a position that would cause the sentence to end with
01479         // 'x (STerm | ATerm) Close* Sp* Sep?', where 'x' is a word from 'sbExTrie',
01480         // we will *not* place a sentence boundary there.
01481         //
01482         // NOTE: sbExTrie is not saved or loaded by the Save() and Load() methods.
01483         // By default, it is empty.  Use SbEx_Clr() to clear it, and SbEx_SetStdEnglish() to obtain
01484         // a standard set of English-language exceptions.
01485         void SbEx_Clr() { sbExTrie.Clr(); }
01486         template<class TSrcVec> void SbEx_Add(const TSrcVec& v) { sbExTrie.Add(v); }
01487         // template<> void SbEx_Add(const TStr& s) {
01488         void SbEx_Add(const TStr& s) {
01489           TIntV v; int n = s.Len(); v.Gen(n); for (int i = 0; i < n; i++) v[i] = int(uchar(s[i])); SbEx_Add(v); }
01490         void SbEx_AddUtf8(const TStr& s) { TUniCodec codec; TIntV v; codec.DecodeUtf8(s, v); SbEx_Add(v); }
01491         int SbEx_AddMulti(const TStr& words, const bool wordsAreUtf8 = true) { TStrV vec; words.SplitOnAllCh('|', vec);
01492                 for (int i = 0; i < vec.Len(); i++) if (wordsAreUtf8) SbEx_AddUtf8(vec[i]); else SbEx_Add(vec[i]);
01493                 return vec.Len(); }
01494         void SbEx_Set(const TUniTrie<TInt>& newTrie) { sbExTrie = newTrie; }
01495         int SbEx_SetStdEnglish() {
01496                 static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g.|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv";
01497                 SbEx_Clr(); return SbEx_AddMulti(data, false); }
01498
01499         //-------------------------------------------------------------------------
01500         // Normalization, decomposition, etc. (UAX #15)
01501         //-------------------------------------------------------------------------
01502
01503 protected:
01504         // Adds, to 'dest', the decomposition of 'codePoint' (calling itself recursively if necessary).
01505         // If 'compatibility == false', only canonical decompositions are used.
01506         template<typename TDestCh>
01507         void AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const;
01508 public:
01509         // This appends, to 'dest', the decomposed form of the source string.
01510         // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false;
01511         // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true.
01512         template<typename TSrcVec, typename TDestCh>
01513         void Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01514                         TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const;
01515         template<typename TSrcVec, typename TDestCh>
01516         void Decompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const {
01517                 Decompose(src, 0, src.Len(), dest, compatibility, clrDest); }
01518         // This performs canonical composition on the source string, and appends
01519         // the result to the destination string.  The source string should be the
01520         // result of a (canonical or compatibility) decomposition; if this is the
01521         // case, the composition will lead to a normalization form C (NFC) or
01522         // normalization form KC (NFKC), depending on whether canonical or compatibility
01523         // decomposition was used.
01524         template<typename TSrcVec, typename TDestCh>
01525         void Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01526                         TVec<TDestCh>& dest, bool clrDest = true) const;
01527         template<typename TSrcVec, typename TDestCh>
01528         void Compose(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const {
01529                 Compose(src, 0, src.Len(), dest, clrDest); }
01530         // Calls Decompose, followed by Compose; thus the result is the NFC (if
01531         // compatibility == false) or NFKC (if compatibility == true) of the source string.
01532         // A temporary TIntV is used to contain the intermediate NF(K)D form of the
01533         // source string.
01534         template<typename TSrcVec, typename TDestCh>
01535         void DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01536                         TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const;
01537         template<typename TSrcVec, typename TDestCh>
01538         void DecomposeAndCompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const {
01539                 DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); }
01540         // Copies the starter characters from 'src' to 'dest'; the other
01541         // characters are skipped.  'src' should already have been decomposed.
01542         // Returns the number of characters extracted.
01543         template<typename TSrcVec, typename TDestCh>
01544         size_t ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01545                         TVec<TDestCh>& dest, bool clrDest = true) const;
01546         template<typename TSrcVec, typename TDestCh>
01547         size_t ExtractStarters(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const {
01548                 return ExtractStarters(src, 0, src.Len(), dest, clrDest); }
01549         // Extracts the starters into a temporary vector and then copies it into 'src'.
01550         template<typename TSrcVec>
01551         size_t ExtractStarters(TSrcVec& src) const {
01552                 TIntV temp; size_t retVal = ExtractStarters(src, temp);
01553                 src.Clr(); for (int i = 0; i < temp.Len(); i++) src.Add(temp[i]);
01554                 return retVal; }
01555
01556 protected:
01557         void TestComposition(const TStr& basePath);
01558
01559         //-------------------------------------------------------------------------
01560         // Initialization from the text files
01561         //-------------------------------------------------------------------------
01562
01563 protected:
01564         void InitWordAndSentenceBoundaryFlags(const TStr& basePath);
01565         void InitScripts(const TStr& basePath);
01566         void InitLineBreaks(const TStr& basePath);
01567         void InitDerivedCoreProperties(const TStr& basePath);
01568         void InitPropList(const TStr& basePath);
01569         void InitSpecialCasing(const TStr& basePath);
01570         void LoadTxt_ProcessDecomposition(TUniChInfo& ci, TStr s);
01571 public:
01572         void LoadTxt(const TStr& basePath);
01573         void SaveBin(const TStr& fnBinUcd);
01574
01575         //-------------------------------------------------------------------------
01576         // Case conversions
01577         //-------------------------------------------------------------------------
01578
01579 public:
01580         typedef enum TCaseConversion_ { ccLower = 0, ccUpper = 1, ccTitle = 2, ccMax = 3 } TCaseConversion;
01581         // Appends the case-converted form of 'src' to 'dest'.
01582         // 'how' defines what kind of case conversion is required.
01583         // 'turkic' should be set to true iff the text is in Turkic ('tr') or Azeri ('ar').
01584         // 'lithuanian' should be set to true iff the text is in Lithuanian ('lt').
01585         template<typename TSrcVec, typename TDestCh> void GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const;
01586         template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); }
01587         template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); }
01588         template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); }
01589         template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
01590         template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
01591         template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
01592
01593         // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt).
01594         // This is simpler and faster.  Since each character now maps into exactly one
01595         // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.).
01596         template<typename TSrcVec, typename TDestCh> void GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const;
01597         template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); }
01598         template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); }
01599         template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); }
01600         template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); }
01601         template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); }
01602         template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); }
01603
01604         template<typename TSrcVec> void ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const;
01605         template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); }
01606         template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); }
01607         template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); }
01608         template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src) const { ToSimpleUpperCase(src, 0, src.Len()); }
01609         template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src) const { ToSimpleLowerCase(src, 0, src.Len()); }
01610         template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src) const { ToSimpleTitleCase(src, 0, src.Len()); }
01611
01612 public:
01613         friend class TUniCaseFolding;
01614
01615         // Case folding is an alternative to the above functions.  It is intended primarily
01616         // to produce strings that are suitable for comparisons.  For example,
01617         // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma;
01618         // but ToCaseFolder(sigma) = sigma, ToCaseFolded(final-sigma) = sigma.
01619         // - 'turkic' enables special processing for Turkic languages (I-dot and i-dotless).
01620         // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped
01621         //   into a string of two or more characters.
01622         // - Note: For best results, perform NFD(CaseFold(NFD(x)) or NFKD(CaseFold(NFKD(x)) on
01623         //   each string before comparing them (see sec. 3.13 of the standard).
01624         template<typename TSrcVec, typename TDestCh>
01625         void GetCaseFolded(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01626                 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic = false) const { caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); }
01627         template<typename TSrcVec, typename TDestCh>
01628         void GetCaseFolded(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool full = true, const bool turkic = false) const {
01629                 GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); }
01630         // ToCaseFolded folds the string in place.  However, this means that only the simple
01631         // case foldings can be used (the full ones could increase the length of the string).
01632         template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic = false) const { caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); }
01633         template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, const bool turkic = false) const { ToCaseFolded(src, 0, src.Len(), turkic); }
01634
01635 protected:
01636         void TestCaseConversion(const TStr& source, const TStr& trueLc, const TStr& trueTc, const TStr& trueUc, bool turkic, bool lithuanian);
01637         void TestCaseConversions();
01638
01639         //-------------------------------------------------------------------------
01640         // Text file reader for the Unicode character database
01641         //-------------------------------------------------------------------------
01642
01643 protected:
01644
01645         class TUcdFileReader
01646         {
01647         protected:
01648                 TChA buf;
01649         public:
01650                 TChA comment; // contains '#' and everything after it
01651         protected:
01652                 FILE *f;
01653                 int putBackCh;
01654                 int GetCh() {
01655                         if (putBackCh >= 0) { int c = putBackCh; putBackCh = EOF; return c; }
01656                         return fgetc(f); }
01657                 void PutBack(int c) { Assert(putBackCh == EOF); putBackCh = c; }
01658                 // Returns 'false' iff the EOF was encountered before anything was read.
01659                 bool ReadNextLine() {
01660                         buf.Clr(); comment.Clr();
01661                         bool inComment = false, first = true;
01662                         while (true) {
01663                                 int c = GetCh();
01664                                 if (c == EOF) return ! first;
01665                                 else if (c == 13) {
01666                                         c = GetCh(); if (c != 10) PutBack(c);
01667                                         return true; }
01668                                 else if (c == 10) return true;
01669                                 else if (c == '#') inComment = true;
01670                                 if (! inComment) buf += char(c);
01671                                 else comment += char(c); }
01672                                 /*first = false;*/}
01673         private:
01674                 TUcdFileReader& operator = (const TUcdFileReader& r) { Fail; return *((TUcdFileReader *) 0); }
01675                 TUcdFileReader(const TUcdFileReader& r) { Fail; }
01676         public:
01677                 TUcdFileReader() : f(0) { }
01678                 TUcdFileReader(const TStr& fileName) : f(0), putBackCh(EOF) { Open(fileName); }
01679                 void Open(const TStr& fileName) { Close(); f = fopen(fileName.CStr(), "rt"); IAssertR(f, fileName); putBackCh = EOF; }
01680                 void Close() { putBackCh = EOF; if (f) { fclose(f); f = 0; }}
01681                 ~TUcdFileReader() { Close(); }
01682                 bool GetNextLine(TStrV& dest) {
01683                         dest.Clr();
01684                         while (true) {
01685                                 if (! ReadNextLine()) return false;
01686                                 TStr line = buf; line.ToTrunc();
01687                                 if (line.Len() <= 0) continue;
01688                                 line.SplitOnAllCh(';', dest, false);
01689                                 for (int i = 0; i < dest.Len(); i++) dest[i].ToTrunc();
01690                                 return true; }}
01691                 static int ParseCodePoint(const TStr& s) {
01692                         int c; bool ok = s.IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); return c; }
01693                 static void ParseCodePointList(const TStr& s, TIntV& dest, bool ClrDestP = true) { // space-separated list
01694                         if (ClrDestP) dest.Clr();
01695                         TStrV parts; s.SplitOnWs(parts);
01696                         for (int i = 0; i < parts.Len(); i++) {
01697                                 int c; bool ok = parts[i].IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s);
01698                                 dest.Add(c); } }
01699                 static void ParseCodePointRange(const TStr& s, int& from, int &to) { // xxxx or xxxx..yyyy
01700                         int i = s.SearchStr(".."); if (i < 0) { from = ParseCodePoint(s); to = from; return; }
01701                         from = ParseCodePoint(s.GetSubStr(0, i - 1));
01702                         to = ParseCodePoint(s.GetSubStr(i + 2, s.Len() - 1)); }
01703         };
01704
01705         //-------------------------------------------------------------------------
01706         // Helper class for processing the text files
01707         //-------------------------------------------------------------------------
01708         // Files such as DerivedCoreProps.txt often refer to ranges of codepoints,
01709         // and not all codepoints from the range have also been listed in
01710         // UnicodeData.txt.  Thus, new TUniChInfo instances will be created
01711         // when processing DerivedCoreProps.txt and similar files.
01712         // To assign the correct (sub)categories to these new codepoints,
01713         // the following class will extract the subcategory info from the
01714         // comments in DerivedCoreProps.txt and similar files.
01715
01716         class TSubcatHelper
01717         {
01718         public:
01719                 bool hasCat; TUniChSubCategory subCat;
01720                 TStrH invalidCatCodes;
01721                 TUniChDb &owner;
01722
01723                 TSubcatHelper(TUniChDb &owner_) : owner(owner_) { }
01724
01725                 void ProcessComment(TUniChDb::TUcdFileReader &reader)
01726                 {
01727                         hasCat = false; subCat = ucOtherNotAssigned;
01728                         if (reader.comment.Len() > 3)
01729                         {
01730                                 IAssert(reader.comment[0] == '#');
01731                                 IAssert(reader.comment[1] == ' ');
01732                                 char chCat = reader.comment[2], chSubCat = reader.comment[3];
01733                                 if (reader.comment.Len() > 4) IAssert(isspace(uchar(reader.comment[4])));
01734                                 if (TUniChInfo::IsValidSubCat(chCat, chSubCat)) {
01735                                         hasCat = true; subCat = (TUniChSubCategory) ((int(uchar(chCat)) << 8) | (int(uchar(chSubCat)))); }
01736                                 else invalidCatCodes.AddKey(TStr(chCat) + TStr(chSubCat));
01737                         }
01738                 }
01739
01740                 void SetCat(const int cp) {
01741                         int i = owner.h.GetKeyId(cp); IAssert(i >= 0);
01742                         IAssert(owner.h[i].subCat == ucOtherNotAssigned);
01743                         IAssert(hasCat);
01744                         owner.h[i].SetCatAndSubCat(subCat); }
01745                 void TestCat(const int cp) {
01746                         if (! hasCat) return;
01747                         int i = owner.h.GetKeyId(cp); IAssert(i >= 0);
01748                         IAssert(owner.h[i].subCat == subCat); }
01749
01750                 ~TSubcatHelper()
01751                 {
01752                         if (invalidCatCodes.IsKey("L&")) invalidCatCodes.DelKey("L&");
01753                         // Output any unexpected ones (there shouldn't be any).
01754                         if (! invalidCatCodes.Empty()) {
01755                                 printf("Invalid cat code(s) in the comments: ");
01756                                 for (int i = invalidCatCodes.FFirstKeyId(); invalidCatCodes.FNextKeyId(i); )
01757                                         printf(" \"%s\"", invalidCatCodes.GetKey(i).CStr());
01758                                 printf("\n"); }
01759                 }
01760         };
01761 };
01762
01763 //-----------------------------------------------------------------------------
01764 // TUnicode -- a sadly emasculated wrapper around TUniCodec and TUniChDb
01765 //-----------------------------------------------------------------------------
01766
01767 class TUnicode
01768 {
01769 public:
01770         TUniCodec codec;
01771         TUniChDb ucd;
01772
01773         TUnicode() { Init(); }
01774         explicit TUnicode(const TStr& fnBinUcd) { ucd.LoadBin(fnBinUcd); Init(); }
01775         void Init() { InitCodecs(); }
01776
01777         //-----------------------------------------------------------------------
01778         // UTF-8
01779         //-----------------------------------------------------------------------
01780
01781         // Returns the number of characters that have been successfully decoded.
01782         // This does not include any replacement characters that may have been inserted into 'dest'.
01783         int DecodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); }
01784         int DecodeUtf8(const TStr& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); }
01785
01786         // Returns the number of characters that have been successfully encoded.
01787         // This does not include any replacement characters that may have been inserted into 'dest'.
01788         int EncodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.EncodeUtf8(src, dest); }
01789
01790         // The following wrapper around the UTF-8 encoder returns a TStr containing
01791         // the UTF-8-encoded version of the input string.
01792         TStr EncodeUtf8Str(const TIntV& src) const { return codec.EncodeUtf8Str(src); }
01793
01794         //-----------------------------------------------------------------------
01795         // UTF-16 Decoder
01796         //-----------------------------------------------------------------------
01797
01798         // Returns the number of characters that have been successfully decoded.
01799         // This does not include any replacement characters that may have been inserted into 'dest'.
01800         // Each element of 'src' is assumed to contain one byte of data.
01801         // srcCount must be even (though srcIdx doesn't need to be).
01802         int DecodeUtf16FromBytes(const TIntV& src, TIntV& dest,
01803                 const TUtf16BomHandling bomHandling = bomAllowed,
01804                 const TUniByteOrder defaultByteOrder = boMachineEndian) const {
01805                         return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
01806
01807         // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings
01808         // are used to determine if the two bytes of each word should be swapped before further
01809         // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it
01810         // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
01811         // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
01812         // beginning of the source data is used to determine the "original" byte order of the data;
01813         // if this doesn't match the byte order of the local machine, the two bytes of each word will
01814         // be swapped during the decoding process.
01815         int DecodeUtf16FromWords(const TIntV& src, TIntV& dest,
01816                 const TUtf16BomHandling bomHandling = bomAllowed,
01817                 const TUniByteOrder defaultByteOrder = boMachineEndian) const {
01818                         return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
01819
01820         //-----------------------------------------------------------------------
01821         // UTF-16 Encoder
01822         //-----------------------------------------------------------------------
01823
01824         // Returns the number of characters that have been successfully encoded.
01825         // This does not include any replacement characters that may have been inserted into 'dest'.
01826         int EncodeUtf16ToWords(const TIntV& src, TIntV& dest, const bool insertBom,
01827                 const TUniByteOrder destByteOrder = boMachineEndian) const {
01828                         return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
01829
01830         int EncodeUtf16ToBytes(const TIntV& src, TIntV& dest, const bool insertBom,
01831                 const TUniByteOrder destByteOrder = boMachineEndian) const {
01832                         return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
01833
01834         //-----------------------------------------------------------------------
01835         // 8-bit codecs
01836         //-----------------------------------------------------------------------
01837
01838         T8BitCodec<TEncoding_ISO8859_1> iso8859_1;
01839         T8BitCodec<TEncoding_ISO8859_2> iso8859_2;
01840         T8BitCodec<TEncoding_ISO8859_3> iso8859_3;
01841         T8BitCodec<TEncoding_ISO8859_4> iso8859_4;
01842         T8BitCodec<TEncoding_YuAscii> yuAscii;
01843         T8BitCodec<TEncoding_CP1250> cp1250;
01844         T8BitCodec<TEncoding_CP852> cp852;
01845         T8BitCodec<TEncoding_CP437> cp437;
01846
01847         //-----------------------------------------------------------------------
01848         // Codec registry
01849         //-----------------------------------------------------------------------
01850         // If you know you'll need ISO-8859-2, just use
01851         //   TUnicode unicode;
01852         //   unicode.iso8859_2.Encode(...);
01853         // If you don't know what you'll need, use:
01854         //   TUnicode unicode;
01855         //   PCodecBase myCodec = unicode.GetCodec(myCodecName);
01856         //   myCodec->Encode(...);
01857         // Note that the first approach is slightly more efficient because there
01858         // aren't any virtual method calls involved.
01859
01860 protected:
01861         THash<TStr, PCodecBase> codecs;
01862         static inline TStr NormalizeCodecName(const TStr& name) {
01863                 TStr s = name.GetLc(); s.ChangeStrAll("_", ""); s.ChangeStrAll("-", ""); return s; }
01864 public:
01865         void RegisterCodec(const TStr& nameList, const PCodecBase& codec) {
01866                 TStrV names; nameList.SplitOnWs(names);
01867                 for (int i = 0; i < names.Len(); i++)
01868                         codecs.AddDat(NormalizeCodecName(names[i]), codec); }
01869         void UnregisterCodec(const TStr& nameList) {
01870                 TStrV names; nameList.SplitOnWs(names);
01871                 for (int i = 0; i < names.Len(); i++)
01872                         codecs.DelKey(NormalizeCodecName(names[i])); }
01873         void ClrCodecs() { codecs.Clr(); }
01874         void InitCodecs();
01875         PCodecBase GetCodec(const TStr& name) const {
01876                 TStr s = NormalizeCodecName(name);
01877                 PCodecBase p; if (! codecs.IsKeyGetDat(s, p)) p.Clr();
01878                 return p; }
01879         void GetAllCodecs(TCodecBaseV& dest) const {
01880                 dest.Clr();
01881                 for (int i = codecs.FFirstKeyId(); codecs.FNextKeyId(i); ) {
01882                         PCodecBase codec = codecs[i]; bool found = false;
01883                         for (int j = 0; j < dest.Len(); j++) if (dest[j]() == codec()) { found = true; break; }
01884                         if (! found) dest.Add(codec); }}
01885
01886         //-------------------------------------------------------------------------
01887         // Word boundaries (UAX #29)
01888         //-------------------------------------------------------------------------
01889
01890         // Finds the next word boundary strictly after 'position'.
01891         // Note that there are valid word boundaries at 0 and at 'src.Len()'.
01892         // If there is no such word boundary, it returns 'false' and sets 'position' to 'src.Len()'.
01893         bool FindNextWordBoundary(const TIntV& src, int &position) const {
01894                 if (position < 0) { position = 0; return true; }
01895                 size_t position_; bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }
01896         // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a word
01897         // boundary between 'src[i - 1]' and 'src[i]'.  Note that 'dest[0]' and 'dest[src.Len()]' are
01898         // always set to 'true'.
01899         void FindWordBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindWordBoundaries(src, 0, src.Len(), dest); }
01900
01901         //-------------------------------------------------------------------------
01902         // Sentence boundaries (UAX #29)
01903         //-------------------------------------------------------------------------
01904
01905         // Finds the next sentence boundary strictly after 'position'.
01906         // Note that there are valid sentence boundaries at 0 and at 'src.Len()'.
01907         // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'src.Len()'.
01908         bool FindNextSentenceBoundary(const TIntV& src, int &position) const {
01909                 if (position < 0) { position = 0; return true; }
01910                 size_t position_; bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }
01911         // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a sentence
01912         // boundary between 'src[i - 1]' and 'src[i]'.  Note that 'dest[0]' and 'dest[src.Len()]' are
01913         // always set to 'true'.
01914         void FindSentenceBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); }
01915
01916         void ClrSentenceBoundaryExceptions() { ucd.SbEx_Clr(); }
01917         void UseEnglishSentenceBoundaryExceptions() { ucd.SbEx_SetStdEnglish(); }
01918
01919         //-------------------------------------------------------------------------
01920         // Normalization, decomposition, etc. (UAX #15)
01921         //-------------------------------------------------------------------------
01922
01923         // This sets 'dest' to the decomposed form of the source string.
01924         // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false;
01925         // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true.
01926         void Decompose(const TIntV& src, TIntV& dest, bool compatibility) const { ucd.Decompose(src, dest, compatibility, true); }
01927         // This performs canonical composition on the source string, and stores
01928         // the result in the destination vector.  The source string should be the
01929         // result of a (canonical or compatibility) decomposition; if this is the
01930         // case, the composition will lead to a normalization form C (NFC) or
01931         // normalization form KC (NFKC), depending on whether canonical or compatibility
01932         // decomposition was used.
01933         void Compose(const TIntV& src, TIntV& dest) const { return ucd.Compose(src, dest, true); }
01934         // Calls Decompose, followed by Compose; thus the result is the NFC (if
01935         // compatibility == false) or NFKC (if compatibility == true) of the source string.
01936         // A temporary TIntV is used to contain the intermediate NF(K)D form of the
01937         // source string.
01938         void DecomposeAndCompose(const TIntV& src, TIntV& dest, bool compatibility) const { return ucd.DecomposeAndCompose(src, dest, compatibility); }
01939         // Copies the starter characters from 'src' to 'dest'; the other
01940         // characters are skipped.  'src' should already have been decomposed.
01941         // Returns the number of characters extracted.  This function can be
01942         // used to remove diacritical marks from a string (after it has been decomposed!).
01943         int ExtractStarters(const TIntV& src, TIntV& dest) const { return (int) ucd.ExtractStarters(src, dest); }
01944         // Extracts the starters into a temporary vector and then copies it into 'src'.
01945         int ExtractStarters(TIntV& src) const { return (int) ucd.ExtractStarters(src); }
01946
01947         //-------------------------------------------------------------------------
01948         // Case conversions
01949         //-------------------------------------------------------------------------
01950         // NOTE: if you will be dealing with Turkish, Azeri or Lithuanian text,
01951         // use the case-conversion methods in TUniChDb, which allow the caller
01952         // to request language-specific case mappings for these languages.
01953
01954 public:
01955         typedef TUniChDb::TCaseConversion TCaseConversion;
01956         // Sets 'dest' to the case-converted form of 'src'.
01957         void GetLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetLowerCase(src, dest, true, false, false); }
01958         void GetUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetUpperCase(src, dest, true, false, false); }
01959         void GetTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetTitleCase(src, dest, true, false, false); }
01960
01961         // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt).
01962         // This is simpler and faster.  Since each character now maps into exactly one
01963         // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.).
01964         void GetSimpleLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleLowerCase(src, dest, true); }
01965         void GetSimpleUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleUpperCase(src, dest, true); }
01966         void GetSimpleTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleTitleCase(src, dest, true); }
01967
01968         // These functions perform simple case-conversions in-place.
01969         void ToSimpleUpperCase(TIntV& src) const { ucd.ToSimpleUpperCase(src); }
01970         void ToSimpleLowerCase(TIntV& src) const { ucd.ToSimpleLowerCase(src); }
01971         void ToSimpleTitleCase(TIntV& src) const { ucd.ToSimpleTitleCase(src); }
01972
01973         // Case folding is an alternative to the above functions.  It is intended primarily
01974         // to produce strings that are suitable for comparisons.  For example,
01975         // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma;
01976         // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma.
01977         // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped
01978         //   into a string of two or more characters.
01979         // - Note: For best results, perform NFD(CaseFold(NFD(x)) or NFKD(CaseFold(NFKD(x)) on
01980         //   each string before comparing them (see sec. 3.13 of the standard).
01981         void GetCaseFolded(const TIntV& src, TIntV& dest, const bool full = true) const { return ucd.GetCaseFolded(src, dest, true, full, false); }
01982         // ToCaseFolded folds the string in place.  However, this means that only the simple
01983         // case foldings can be used (the full ones could increase the length of the string).
01984         void ToCaseFolded(TIntV& src) const { return ucd.ToCaseFolded(src, false); }
01985
01986         TStr GetUtf8CaseFolded(const TStr& s) const {
01987                 bool isAscii = true;
01988                 for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; }
01989                 if (isAscii) return s.GetLc();
01990                 TIntV src; DecodeUtf8(s, src);
01991                 TIntV dest; GetCaseFolded(src, dest);
01992                 return EncodeUtf8Str(dest); }
01993
01994         //-------------------------------------------------------------------------
01995         // Character properties
01996         //-------------------------------------------------------------------------
01997         // These methods simply call the corresponding TUniChDb method
01998         // (which typically calls the corresponding method of TUniChInfo).
01999         // See the declaration for DECLARE_FORWARDED_PROPERTY_METHODS for a complete list.
02000         // They are all of the form        bool IsXxxx(const int cp) const
02001         // Some of the more notable ones include:
02002         // - IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsAsciiHexDigit
02003         //   IsDash, IsDeprecated, IsDiacritic, IsHexDigit, IsHyphen, IsIdeographic
02004         //   IsNoncharacter, IsQuotationMark, IsSoftDotted, IsTerminalPunctuation, IsWhiteSpace
02005
02006 #define ___UniFwd1(name) bool name(const int cp) const { return ucd.name(cp); }
02007         DECLARE_FORWARDED_PROPERTY_METHODS
02008 #undef DECLARE_FORWARDED_PROPERTY_METHODS
02009 #undef __UniFwd1
02010         ___UniFwd2(IsPrivateUse, IsSurrogate)
02011
02012         TUniChCategory GetCat(const int cp) const { return ucd.GetCat(cp); }
02013         TUniChSubCategory GetSubCat(const int cp) const { return ucd.GetSubCat(cp); }
02014
02015         // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234".
02016         const char *GetCharName(const int cp) const { return ucd.GetCharName(cp); }
02017         TStr GetCharNameS(const int cp) const { return ucd.GetCharNameS(cp); }
02018
02019 };
02020
02021 //-----------------------------------------------------------------------------
02022 // TUniCodec -- UTF-8 Decoder
02023 //-----------------------------------------------------------------------------
02024
02025 // Returns the number of characters that have been successfully decoded.
02026 // This does not include any replacement characters that may have been inserted into 'dest'.
02027 template<typename TSrcVec, typename TDestCh>
02028 size_t TUniCodec::DecodeUtf8(
02029         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02030         TVec<TDestCh>& dest, const bool clrDest) const
02031 {
02032         size_t nDecoded = 0;
02033         if (clrDest) dest.Clr();
02034         const size_t origSrcIdx = srcIdx;
02035         const size_t srcEnd = srcIdx + srcCount;
02036         while (srcIdx < srcEnd)
02037         {
02038                 const size_t charSrcIdx = srcIdx;
02039                 uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
02040                 if ((c & _1000_0000) == 0) {
02041                         // c is one of the characters 0..0x7f, encoded as a single byte.
02042                         dest.Add(TDestCh(c)); nDecoded++; continue; }
02043                 else if ((c & _1100_0000) == _1000_0000) {
02044                         // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx.
02045                         // We must have been thrown into the middle of a multi-byte character.
02046                         switch (errorHandling) {
02047                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx.");
02048                         case uehAbort: return nDecoded;
02049                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02050                         case uehIgnore: continue;
02051                         default: Fail; } }
02052                 else
02053                 {
02054                         // c introduces a sequence of 2..6 bytes, depending on how many
02055                         // of the most significant bits of c are set.
02056                         uint nMoreBytes = 0, nBits = 0, minVal = 0;
02057                         if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80;
02058                         else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800;
02059                         else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000;
02060                         else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000;
02061                         else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000;
02062                         else {
02063                                 // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8
02064                                 // (which allowed the encoding of codepoints up to 2^31 - 1).  However, in principle this
02065                                 // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh
02066                                 // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh.
02067                                 if (strict)  {
02068                                         switch (errorHandling) {
02069                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x.");
02070                                         case uehAbort: return nDecoded;
02071                                         // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes
02072                                         // and try to decode the character.  Then, since 'strict' is true and
02073                                         // the codepoint is clearly >= 2^31, we'll notice this as an error later
02074                                         // and (in the case of uehReplace) insert a replacement character then.
02075                                         // This is probably better than inserting a replacement character right
02076                                         // away and then trying to read the next byte as if a new character
02077                                         // was beginning there -- if the current byte is really followed by five
02078                                         // 10xxxxxx bytes, we'll just get six replacement characters in a row.
02079                                         case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue;
02080                                         case uehIgnore: break; // continue;
02081                                         default: Fail; } }
02082                                 nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; }
02083                         // Decode this multi-byte sequence.
02084                         uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c.
02085                         bool cancel = false;
02086                         for (uint i = 0; i < nMoreBytes && ! cancel; i++) {
02087                                 // See if there are enough bytes left in the source vector.
02088                                 if (! (srcIdx < srcEnd)) {
02089                                         switch (errorHandling) {
02090                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available.");
02091                                         case uehAbort: return nDecoded;
02092                                         case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue;
02093                                         case uehIgnore: cancel = true; continue;
02094                                         default: Fail; } }
02095                                 // Read the next byte.
02096                                 c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
02097                                 if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx.
02098                                         switch (errorHandling) {
02099                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx.");
02100                                         case uehAbort: return nDecoded;
02101                                         case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue;
02102                                         case uehIgnore: srcIdx--; cancel = true; continue;
02103                                         default: Fail; } }
02104                                 cOut <<= 6; cOut |= (c & _0011_1111); }
02105                         if (cancel) continue;
02106                         if (strict) {
02107                                 // err1: This codepoint has been represented by more bytes than it should have been.
02108                                 // For example, cOut in the range 0..127 should be represented by a single byte,
02109                                 // not by two or more bytes.
02110                                 // - For example, this may happen in the "modified UTF-8" sometimes used for Java
02111                                 // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid
02112                                 // the appearance of null bytes in the encoded stream.
02113                                 bool err1 = (cOut < minVal);
02114                                 // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes.
02115                                 // However, later this was restricted to the codepoints 0..0x10ffff only, because only these
02116                                 // are valid Unicode codepoints.  Thus, no more than 4 bytes are ever necessary.
02117                                 bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff));
02118                                 if (err1 || err2) switch (errorHandling) {
02119                                         case uehThrow:
02120                                                 if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ").");
02121                                                 else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid.");
02122                                                 else { Fail; break; }
02123                                         case uehAbort: return nDecoded;
02124                                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02125                                         case uehIgnore: continue;
02126                                         default: Fail; } }
02127                         // Add the decoded codepoint to the destination vector.
02128                         // If this is the first decoded character, and it's one of the byte-order marks
02129                         // (0xfffe and 0xfeff), we will skip it (unless skipBom is false).
02130                         if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) {
02131                                 dest.Add(cOut); nDecoded++; }
02132                 } // else (multi-byte sequence)
02133         } // while
02134         return nDecoded;
02135 }
02136
02137 //-----------------------------------------------------------------------
02138 // TUniCodec -- UTF-8 Encoder
02139 //-----------------------------------------------------------------------
02140
02141 // Returns the number of characters that have been successfully encoded.
02142 // This does not include any replacement characters that may have been inserted into 'dest'.
02143 template<typename TSrcVec, typename TDestCh>
02144 size_t TUniCodec::EncodeUtf8(
02145         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02146         TVec<TDestCh>& dest, const bool clrDest) const
02147 {
02148         size_t nEncoded = 0;
02149         for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
02150         {
02151                 uint c = uint(src[TVecIdx(srcIdx)]);
02152                 bool err = false;
02153                 if (strict && c > 0x10ffff) {
02154                         err = true;
02155                         switch (errorHandling) {
02156                         case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed).");
02157                         case uehAbort: return nEncoded;
02158                         case uehReplace: c = replacementChar; break;
02159                         case uehIgnore: continue;
02160                         default: Fail; } }
02161                 if (c < 0x80u)
02162                         dest.Add(TDestCh(c & 0xffu));
02163                 else if (c < 0x800u) {
02164                         dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));
02165                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02166                 else if (c < 0x10000u) {
02167                         dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));
02168                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02169                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02170                 else if (c < 0x200000u) {
02171                         dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));
02172                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02173                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02174                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02175                 else if (c < 0x4000000u) {
02176                         dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));
02177                         dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
02178                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02179                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02180                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02181                 else {
02182                         dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));
02183                         dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));
02184                         dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
02185                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02186                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02187                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02188                 if (! err) nEncoded++;
02189         }
02190         return nEncoded;
02191 }
02192
02193 //-----------------------------------------------------------------------
02194 // TUniCodec -- UTF-16 Encoder
02195 //-----------------------------------------------------------------------
02196
02197 // Returns the number of characters that have been successfully decoded.
02198 // This does not include any replacement characters that may have been inserted into 'dest'.
02199 // Each element of 'src' is assumed to contain one byte of data.
02200 // srcCount must be even (though srcIdx doesn't need to be).
02201 template<typename TSrcVec, typename TDestCh>
02202 size_t TUniCodec::DecodeUtf16FromBytes(
02203         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02204         TVec<TDestCh>& dest, const bool clrDest,
02205         const TUtf16BomHandling bomHandling,
02206         const TUniByteOrder defaultByteOrder) const
02207 {
02208         IAssert(srcCount % 2 == 0);
02209         IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
02210         IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
02211         if (clrDest) dest.Clr();
02212         size_t nDecoded = 0;
02213         if (srcCount <= 0) return nDecoded;
02214         const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
02215         bool littleEndian = false;
02216   bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian()));
02217         if (bomHandling == bomIgnored) littleEndian = leDefault;
02218         else if (bomHandling == bomAllowed || bomHandling == bomRequired)
02219         {
02220                 int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff;
02221                 if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; }
02222                 else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; }
02223                 else if (bomHandling == bomAllowed) littleEndian = leDefault;
02224                 else { // Report an error.
02225                         switch (errorHandling) {
02226                         case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead).");
02227                         case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
02228                         default: Fail; } }
02229         }
02230         else Fail;
02231         while (srcIdx < srcEnd)
02232         {
02233                 const size_t charSrcIdx = srcIdx;
02234                 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
02235                 uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
02236                 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
02237                 {
02238                         // c is the first character in a surrogate pair.  Read the next character.
02239                         if (! (srcIdx + 2 <= srcEnd)) {
02240                                 switch (errorHandling) {
02241                                 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
02242                                 case uehAbort: return nDecoded;
02243                                 case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02244                                 case uehIgnore: continue;
02245                                 default: Fail; } }
02246                         uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
02247                         uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
02248                         // c2 should be the second character of the surrogate pair.
02249                         if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
02250                                 switch (errorHandling) {
02251                                 case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
02252                                 case uehAbort: return nDecoded;
02253                                 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
02254                                 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue;
02255                                 case uehIgnore: srcIdx -= 2; continue;
02256                                 default: Fail; } }
02257                         // c and c2 each contain 10 bits of information.
02258                         uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
02259                         cc += 0x10000;
02260                         dest.Add(TDestCh(cc)); nDecoded++; continue;
02261                 }
02262                 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
02263                         switch (errorHandling) {
02264                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
02265                         case uehAbort: return nDecoded;
02266                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02267                         case uehIgnore: continue;
02268                         default: Fail; } }
02269                 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
02270                 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
02271                 // Otherwise, store 'c' to the destination vector.
02272                 dest.Add(TDestCh(c)); nDecoded++;
02273         }
02274         return nDecoded;
02275 }
02276
02277 // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings
02278 // are used to determine if the two bytes of each word should be swapped before further
02279 // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it
02280 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
02281 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
02282 // beginning of the source data is used to determine the "original" byte order of the data;
02283 // if this doesn't match the byte order of the local machine, the two bytes of each word will
02284 // be swapped during the decoding process.
02285 template<typename TSrcVec, typename TDestCh>
02286 size_t TUniCodec::DecodeUtf16FromWords(
02287         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02288         TVec<TDestCh>& dest, bool clrDest,
02289         const TUtf16BomHandling bomHandling,
02290         const TUniByteOrder defaultByteOrder) const
02291 {
02292         IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
02293         IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
02294         if (clrDest) dest.Clr();
02295         size_t nDecoded = 0;
02296         if (srcCount <= 0) return nDecoded;
02297         const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
02298         bool swap = false;
02299   bool isMachineLe = IsMachineLittleEndian();
02300         bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
02301         if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe);
02302         else if (bomHandling == bomAllowed || bomHandling == bomRequired)
02303         {
02304                 int c = uint(src[TVecIdx(srcIdx)]) & 0xffff;
02305                 if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; }
02306                 else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; }
02307                 else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe);
02308                 else { // Report an error.
02309                         switch (errorHandling) {
02310                         case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead).");
02311                         case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
02312                         default: Fail; } }
02313         }
02314         else Fail;
02315         while (srcIdx < srcEnd)
02316         {
02317                 const size_t charSrcIdx = srcIdx;
02318                 uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
02319                 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
02320                 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
02321                 {
02322                         // c is the first character in a surrogate pair.  Read the next character.
02323                         if (! (srcIdx < srcEnd)) {
02324                                 switch (errorHandling) {
02325                                 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
02326                                 case uehAbort: return nDecoded;
02327                                 case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02328                                 case uehIgnore: continue;
02329                                 default: Fail; } }
02330                         uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
02331                         if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8);
02332                         // c2 should be the second character of the surrogate pair.
02333                         if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
02334                                 switch (errorHandling) {
02335                                 case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
02336                                 case uehAbort: return nDecoded;
02337                                 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
02338                                 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue;
02339                                 case uehIgnore: srcIdx -= 1; continue;
02340                                 default: Fail; } }
02341                         // c and c2 each contain 10 bits of information.
02342                         uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
02343                         cc += 0x10000;
02344                         dest.Add(TDestCh(cc)); nDecoded++; continue;
02345                 }
02346                 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
02347                         switch (errorHandling) {
02348                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
02349                         case uehAbort: return nDecoded;
02350                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02351                         case uehIgnore: continue;
02352                         default: Fail; } }
02353                 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
02354                 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
02355                 // Otherwise, store 'c' to the destination vector.
02356                 dest.Add(TDestCh(c)); nDecoded++;
02357         }
02358         return nDecoded;
02359 }
02360
02361 //-----------------------------------------------------------------------
02362 // TUniCodec -- UTF-16 Encoder
02363 //-----------------------------------------------------------------------
02364
02365 // Returns the number of characters that have been successfully encoded.
02366 // This does not include any replacement characters that may have been inserted into 'dest'.
02367 template<typename TSrcVec, typename TDestCh>
02368 size_t TUniCodec::EncodeUtf16ToWords(
02369         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02370         TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
02371         const TUniByteOrder destByteOrder) const
02372 {
02373         bool isMachineLe = IsMachineLittleEndian();
02374         bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe);
02375         size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
02376         if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; }
02377         while (srcIdx < srcEnd)
02378         {
02379                 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
02380                 if (! (c <= 0x10ffffu)) {
02381                         switch (errorHandling) {
02382                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
02383                         case uehAbort: return nEncoded;
02384                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
02385                         case uehIgnore: continue;
02386                         default: Fail; } }
02387                 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
02388                         switch (errorHandling) {
02389                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
02390                         case uehAbort: return nEncoded;
02391                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
02392                         case uehIgnore: continue;
02393                         default: Fail; } }
02394                 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
02395                         switch (errorHandling) {
02396                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
02397                         case uehAbort: return nEncoded;
02398                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
02399                         case uehIgnore: continue;
02400                         default: Fail; } }
02401                 // If c is <= 0xffff, it can be stored directly.
02402                 if (c <= 0xffffu) {
02403                         if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
02404                         dest.Add(TDestCh(c)); nEncoded++; continue; }
02405                 // Otherwise, represent c by a pair of surrogate characters.
02406                 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
02407                 uint c1 = (c >> 10) & 1023, c2 = c & 1023;
02408                 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
02409                 if (swap) {
02410                         c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8);
02411                         c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); }
02412                 dest.Add(TDestCh(c1));
02413                 dest.Add(TDestCh(c2));
02414                 nEncoded++; continue;
02415         }
02416         return nEncoded;
02417 }
02418
02419 template<typename TSrcVec, typename TDestCh>
02420 size_t TUniCodec::EncodeUtf16ToBytes(
02421         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02422         TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
02423         const TUniByteOrder destByteOrder) const
02424 {
02425         bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian()));
02426         size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
02427         if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 0xfe : 0xff); nEncoded++; }
02428         while (srcIdx < srcEnd)
02429         {
02430                 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
02431                 if (! (c <= 0x10ffffu)) {
02432                         switch (errorHandling) {
02433                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
02434                         case uehAbort: return nEncoded;
02435 #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
02436                         case uehReplace: ___OutRepl; continue;
02437                         case uehIgnore: continue;
02438                         default: Fail; } }
02439                 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
02440                         switch (errorHandling) {
02441                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
02442                         case uehAbort: return nEncoded;
02443                         case uehReplace: ___OutRepl; continue;
02444                         case uehIgnore: continue;
02445                         default: Fail; } }
02446                 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
02447                         switch (errorHandling) {
02448                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
02449                         case uehAbort: return nEncoded;
02450                         case uehReplace: ___OutRepl; continue;
02451                         case uehIgnore: continue;
02452                         default: Fail; } }
02453 #undef ___OutRepl
02454                 // If c is <= 0xffff, it can be stored directly.
02455                 if (c <= 0xffffu) {
02456                         if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
02457                         else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); }
02458                         nEncoded++; continue; }
02459                 // Otherwise, represent c by a pair of surrogate characters.
02460                 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
02461                 uint c1 = (c >> 10) & 1023, c2 = c & 1023;
02462                 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
02463                 if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); }
02464                 else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); }
02465                 nEncoded++; continue;
02466         }
02467         return nEncoded;
02468 }
02469
02470 //-----------------------------------------------------------------------------
02471 // TUniChDb -- word boundaries
02472 //-----------------------------------------------------------------------------
02473
02474 template<typename TSrcVec>
02475 bool TUniChDb::FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const
02476 {
02477         // WB1.  Break at the start of text.
02478         if (position < srcIdx) { position = srcIdx; return true; }
02479         // If we are beyond the end of the text, there aren't any word breaks left.
02480         const size_t srcEnd = srcIdx + srcCount;
02481         if (position >= srcEnd) return false;
02482         // If 'position' is currently at an ignored character, move it back to the last nonignored character.
02483         size_t origPos = position;
02484         if (IsWbIgnored(src[TVecIdx(position)])) {
02485                 if (! WbFindPrevNonIgnored(src, srcIdx, position))
02486                         position = origPos;
02487         }
02488         // Determine the previous nonignored character (before 'position').
02489         size_t posPrev = position;
02490         if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
02491         // Sec 6.2.  Allow a break between Sep and an ignored character.
02492         if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
02493         // Determine the next nonignored character (after 'position').
02494         size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
02495         size_t posNext2;
02496         int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
02497         int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
02498         int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext);
02499         int cNext2, wbfNext2;
02500         //
02501         for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
02502                                                            cPrev = cCur, cCur = cNext, cNext = cNext2,
02503                                                            wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2)
02504         {
02505                 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately,
02506                 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
02507                 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
02508                 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
02509                 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
02510                 wbfNext2 = GetWbFlags(cNext2);
02511 #define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
02512 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
02513 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
02514                 // WB3.  Do not break within CRLF.
02515                 if (cCur == 13 && cNext == 10) continue;
02516                 // WB5.  Do not break between most letters.
02517                 TestCurNext(ucfWbALetter, ucfWbALetter);
02518                 // WB6.  Do not break letters across certain punctuation.
02519                 TestCurNext2(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);
02520                 // WB7.  Do not break letters across certain punctuation.
02521                 TestPrevCurNext(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);
02522                 // WB8.  Do not break within sequences of digits, or digits adjacent to letters.
02523                 TestCurNext(ucfWbNumeric, ucfWbNumeric);
02524                 // WB9.  Do not break within sequences of digits, or digits adjacent to letters.
02525                 TestCurNext(ucfWbALetter, ucfWbNumeric);
02526                 // WB10.  Do not break within sequences of digits, or digits adjacent to letters.
02527                 TestCurNext(ucfWbNumeric, ucfWbALetter);
02528                 // WB11.  Do not break within sequences, such as "3.2" or "3.456,789".
02529                 TestPrevCurNext(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);
02530                 // WB12.  Do not break within sequences, such as "3.2" or "3.456,789".
02531                 TestCurNext2(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);
02532                 // WB13.  Do not break between Katakana.
02533                 TestCurNext(ucfWbKatakana, ucfWbKatakana);
02534                 // WB13a.  Do not break from extenders.
02535                 if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 &&
02536                         (wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue;
02537                 // WB13b.  Do not break from extenders.
02538                 if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet &&
02539                         (wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue;
02540                 // WB14.  Otherwise, break everywhere.
02541                 position = posNext; return true;
02542 #undef TestCurNext
02543 #undef TestCurNext2
02544 #undef TestPrevCurNext
02545         }
02546         // WB2.  Break at the end of text.
02547         IAssert(position == srcEnd);
02548         return true;
02549 }
02550
02551 // ToDo: provide a more efficient implementation of this.
02552 template<typename TSrcVec>
02553 void TUniChDb::FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const
02554 {
02555         if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
02556         dest.PutAll(false);
02557         size_t position = srcIdx;
02558         dest[TVecIdx(position - srcIdx)] = true;
02559         while (position < srcIdx + srcCount)
02560         {
02561                 size_t oldPos = position;
02562                 FindNextWordBoundary(src, srcIdx, srcCount, position);
02563                 Assert(oldPos < position); Assert(position <= srcIdx + srcCount);
02564                 dest[TVecIdx(position - srcIdx)] = true;
02565         }
02566         Assert(dest[TVecIdx(srcCount)]);
02567 }
02568
02569 //-----------------------------------------------------------------------------
02570 // TUniChDb -- sentence boundaries
02571 //-----------------------------------------------------------------------------
02572
02573 template<typename TSrcVec>
02574 bool TUniChDb::CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const
02575 {
02576         if (sbExTrie.Empty()) return true;
02577         // We'll move back from the position where a sentence-boundary is being considered.
02578         size_t pos = position;
02579         if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02580         int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c);
02581         // - Skip the Sep, if there is one.
02582         if ((c & ucfSbSep) == ucfSbSep) {
02583                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02584                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02585         // - Skip any Sp characters.
02586         while ((sfb & ucfSbSp) == ucfSbSp) {
02587                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02588                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02589         // - Skip any Close characters.
02590         while ((sfb & ucfSbSp) == ucfSbSp) {
02591                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02592                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02593         // - Skip any ATerm | STerm characters.
02594         while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) {
02595                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02596                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02597         // Now start moving through the trie.
02598         int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1;
02599         while (true)
02600         {
02601                 bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos));
02602                 c = (atEnd ? -1 : (int) src[TVecIdx(pos)]);
02603                 TUniChCategory cat = GetCat(c);
02604                 if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) {
02605                         // Check if the suffix we've read so far is one of those that appear in the trie.
02606                         if (len == 1) return ! sbExTrie.Has1Gram(cLast);
02607                         if (len == 2) return ! sbExTrie.Has2Gram(cLast, cButLast);
02608                         IAssert(len >= 3); IAssert(node >= 0);
02609                         if (sbExTrie.IsNodeTerminal(node)) return false;
02610                         if (atEnd) return true; }
02611                 if (len == 1) { cButLast = c; len++; }
02612                 else if (len == 2) { cButButLast = c; len++;
02613                         // Now we have read the last three characters; start descending the suitable subtrie.
02614                         node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast);
02615                         if (node < 0) return true; }
02616                 else {
02617                         // Descend down the trie.
02618                         node = sbExTrie.GetChild(node, c);
02619                         if (node < 0) return true; }
02620         }
02621         //return true;
02622 }
02623
02624 template<typename TSrcVec>
02625 bool TUniChDb::FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const
02626 {
02627         // SB1.  Break at the start of text.
02628         if (position < srcIdx) { position = srcIdx; return true; }
02629         // If we are beyond the end of the text, there aren't any word breaks left.
02630         const size_t srcEnd = srcIdx + srcCount;
02631         if (position >= srcEnd) return false;
02632         // If 'position' is currently at an ignored character, move it back to the last nonignored character.
02633         size_t origPos = position;
02634         if (IsWbIgnored(src[TVecIdx(position)])) {
02635                 if (! WbFindPrevNonIgnored(src, srcIdx, position))
02636                         position = origPos;
02637         }
02638         // Determine the previous nonignored character (before 'position').
02639         size_t posPrev = position;
02640         if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
02641         // Sec 6.2.  Allow a break between Sep and an ignored character.
02642         if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
02643         // Determine the next nonignored character (after 'position').
02644         size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
02645         size_t posNext2;
02646         int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
02647         int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
02648         int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext);
02649         int cNext2, sbfNext2;
02650         // Initialize the state of the peek-back automaton.
02651         typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState;
02652         TPeekBackState backState;
02653         {
02654                 size_t pos = position;
02655                 bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false;
02656                 while (true)
02657                 {
02658                         if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
02659                         // Skip at most one Sep.
02660                         int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
02661                         if ((sbf & ucfSbSep) == ucfSbSep) {
02662                                 wasSep = true;
02663                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
02664                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
02665                         // Skip zero or more Sp's.
02666                         bool stop = false;
02667                         while ((sbf & ucfSbSp) == ucfSbSp) {
02668                                 wasSp = true;
02669                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
02670                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
02671                         if (stop) break;
02672                         // Skip zero or more Close's.
02673                         while ((sbf & ucfSbClose) == ucfSbClose) {
02674                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
02675                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
02676                         if (stop) break;
02677                         // Process an ATerm or STerm.
02678                         wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm);
02679                         wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm);
02680                         break;
02681                 }
02682                 if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm);
02683                 else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm);
02684                 else backState = stInit;
02685         }
02686         // Initialize the state of the peek-ahead automaton.  This state tells us what follows
02687         // after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}.
02688         // Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string.
02689         // Our peek-ahead automaton must tell us whether it is Lower or something else.
02690         typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState;
02691         TPeekAheadState aheadState = stUnknown;
02692         //
02693         for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
02694                                                            cPrev = cCur, cCur = cNext, cNext = cNext2,
02695                                                            sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2)
02696         {
02697                 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately,
02698                 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
02699                 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
02700                 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
02701                 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
02702                 sbfNext2 = GetSbFlags(cNext2);
02703                 // Update the peek-back automaton.
02704 #define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
02705 #define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; }
02706                 switch (backState) {
02707                         case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break;
02708                         case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break;
02709                         case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break;
02710                         case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02711                         case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02712                         case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02713                         case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02714                         default: IAssert(false); }
02715 #undef Trans
02716 #undef TestCur
02717                 // Update the peek-ahead automaton.
02718 #define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)
02719                 if (! IsPeekAheadSkippable(sbfCur)) {
02720                         bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower);
02721                         if (aheadState == stLower) IAssert(isLower);
02722                         else if (aheadState == stNotLower) IAssert(! isLower);
02723                         // We haven't peaked ahead farther than this so far -- invalidate the state.
02724                         aheadState = stUnknown; }
02725                 if (aheadState == stUnknown)
02726                 {
02727                         // Peak ahead to the next non-peekahead-skippable character.
02728                         size_t pos = posNext;
02729                         while (pos < srcEnd) {
02730                                 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
02731                                 if (! IsPeekAheadSkippable(sbf)) {
02732                                         if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower;
02733                                         else aheadState = stNotLower;
02734                                         break; }
02735                                 WbFindNextNonIgnored(src, pos, srcEnd); }
02736                         if (! (pos < srcEnd)) aheadState = stNotLower;
02737                 }
02738 #undef IsPeekAheadSkippable
02739                 //
02740 #define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
02741 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
02742 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
02743                 // SB3.  Do not break within CRLF.
02744                 if (cCur == 13 && cNext == 10) continue;
02745                 // SB4.  Break ater paragraph separators.
02746                 if ((sbfCur & ucfSbSep) == ucfSbSep) {
02747                         if (! CanSentenceEndHere(src, srcIdx, position)) continue;
02748                         position = posNext; return true; }
02749                 // Do not break after ambiguous terminators like period, if they are immediately followed by a number
02750                 // or lowercase letter, if they are between uppercase letters, or if the first following letter
02751                 // (optionally after certain punctuation) is lowercase.  For example, a period may be an abbreviation
02752                 // or numeric period, and thus may not mark the end of a sentence.
02753                 TestCurNext(ucfSbATerm, ucfSbNumeric); // SB6
02754                 TestPrevCurNext(ucfSbUpper, ucfSbATerm, ucfSbUpper); // SB7
02755                 // SB8a.  (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm)
02756                 if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) &&
02757                         (sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue;
02758                 // SB8*.  ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower
02759                 if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue;
02760                 // Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present).
02761                 // SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep )
02762                 if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue;
02763                 // SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep )
02764                 // SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break]
02765                 if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) {
02766                         if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10
02767                         if (! CanSentenceEndHere(src, srcIdx, position)) continue;
02768                         position = posNext; return true; } // SB11
02769                 // WB12.  Otherwise, do not break.
02770                 continue;
02771 #undef TestCurNext
02772 #undef TestCurNext2
02773 #undef TestPrevCurNext
02774         }
02775         // WB2.  Break at the end of text.
02776         IAssert(position == srcEnd);
02777         return true;
02778 }
02779
02780 // ToDo: provide a more efficient implementation of this.
02781 template<typename TSrcVec>
02782 void TUniChDb::FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const
02783 {
02784         if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
02785         dest.PutAll(false);
02786         size_t position = srcIdx;
02787         dest[TVecIdx(position - srcIdx)] = true;
02788         while (position < srcIdx + srcCount)
02789         {
02790                 size_t oldPos = position;
02791                 FindNextSentenceBoundary(src, srcIdx, srcCount, position);
02792                 Assert(oldPos < position); Assert(position <= srcIdx + srcCount);
02793                 dest[TVecIdx(position - srcIdx)] = true;
02794         }
02795         Assert(dest[TVecIdx(srcCount)]);
02796 }
02797
02798 //-----------------------------------------------------------------------------
02799 // TUniChDb -- case conversions
02800 //-----------------------------------------------------------------------------
02801
02802 template<typename TSrcVec, typename TDestCh>
02803 void TUniChDb::GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02804                                                                 TVec<TDestCh>& dest, const bool clrDest,
02805                                                                 const TUniChDb::TCaseConversion how,
02806                                                                 const bool turkic, const bool lithuanian) const
02807 {
02808         const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0));
02809         if (clrDest) dest.Clr();
02810         enum {
02811                 GreekCapitalLetterSigma = 0x3a3,
02812                 GreekSmallLetterSigma = 0x3c3,
02813                 GreekSmallLetterFinalSigma = 0x3c2,
02814                 LatinCapitalLetterI = 0x49,
02815                 LatinCapitalLetterJ = 0x4a,
02816                 LatinCapitalLetterIWithOgonek = 0x12e,
02817                 LatinCapitalLetterIWithGrave = 0xcc,
02818                 LatinCapitalLetterIWithAcute = 0xcd,
02819                 LatinCapitalLetterIWithTilde = 0x128,
02820                 LatinCapitalLetterIWithDotAbove = 0x130,
02821                 LatinSmallLetterI = 0x69,
02822                 CombiningDotAbove = 0x307
02823         };
02824         //
02825         bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1;
02826         size_t nextWordBoundary = srcIdx;
02827         TBoolV wordBoundaries; bool wbsKnown = false;
02828         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
02829         {
02830                 int cp = src[TVecIdx(srcIdx)]; srcIdx++;
02831                 //if (turkic && cp == 0x130 && how == ccLower) printf("!");
02832                 // For conversion to titlecase, the first cased character of each word
02833                 // must be converted to titlecase; everything else must be converted
02834                 // to lowercase.
02835                 TUniChDb::TCaseConversion howHere;
02836                 if (how != ccTitle) howHere = how;
02837                 else {
02838                         if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
02839                                 seenCased = false; seenTwoCased = false; cpFirstCased = -1;
02840                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
02841                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
02842                         bool isCased = IsCased(cp);
02843                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; }
02844                         else { howHere = ccLower;
02845                                 if (isCased && seenCased) seenTwoCased = true; }
02846                 }
02847                 // First, process the conditional mappings from SpecialCasing.txt.
02848                 // These will be processed in code -- they were ignored while
02849                 // we were reading SpecialCasing.txt itself.
02850                 if (cp == GreekCapitalLetterSigma && howHere == ccLower)
02851                 {
02852                         // SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of
02853                         // the standard doesn't define it.  We'll use FinalCased instead.
02854                         // FinalCased: within the closest word boundaries containing C,
02855                         // there is a cased letter before C, and there is no cased letter after C.
02856                         //size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary);
02857                         if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; }
02858                         size_t srcIdx2 = srcIdx; bool casedAfter = false;
02859                         if (how == ccTitle)
02860                                 printf("!");
02861                         //while (srcIdx2 < nextBoundary)
02862                         while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
02863                         {
02864                                 int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
02865                                 if (IsCased(cp2)) { casedAfter = true; break; }
02866                         }
02867                         if (! casedAfter)
02868                         {
02869                                 //size_t prevBoundary = srcIdx - 1;
02870                                 //FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary);
02871                                 srcIdx2 = srcIdx - 1; bool casedBefore = false;
02872                                 //while (prevBoundary < srcIdx2)
02873                                 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
02874                                 {
02875                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
02876                                         if (IsCased(cp2)) { casedBefore = true; break; }
02877                                 }
02878                                 if (casedBefore) {
02879                                         // Now we have a FinalCased character.
02880                                         dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; }
02881                         }
02882                         // If we got here, add a non-final sigma.
02883                         dest.Add(GreekSmallLetterSigma); continue;
02884                 }
02885                 else if (lithuanian)
02886                 {
02887                         if (howHere == ccLower)
02888                         {
02889                                 if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek)
02890                                 {
02891                                         bool moreAbove = false;
02892                                         for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
02893                                         {
02894                                                 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
02895                                                 const int cc2 = GetCombiningClass(cp2);
02896                                                 if (cc2 == TUniChInfo::ccStarter) break;
02897                                                 if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; }
02898                                         }
02899                                         if (moreAbove)
02900                                         {
02901                                                 if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; }
02902                                                 if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; }
02903                                                 if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; }
02904                                         }
02905                                 }
02906                                 else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; }
02907                                 else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; }
02908                                 else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; }
02909                         }
02910                         if (cp == CombiningDotAbove)
02911                         {
02912                                 // Lithuanian, howHere != ccLower.
02913                                 // AfterSoftDotted := the last preceding character with a combining class
02914                                 // of zero before C was Soft_Dotted, and there is no intervening combining
02915                                 // character class 230 (ABOVE).
02916                                 bool afterSoftDotted = false;
02917                                 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
02918                                 while (origSrcIdx < srcIdx2)
02919                                 {
02920                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
02921                                         int cc2 = GetCombiningClass(cp2);
02922                                         if (cc2 == TUniChInfo::ccAbove) break;
02923                                         if (cc2 == TUniChInfo::ccStarter) {
02924                                                 afterSoftDotted = IsSoftDotted(cp2); break; }
02925                                 }
02926                                 if (afterSoftDotted)
02927                                 {
02928                                         Assert(lithuanian);
02929                                         // Remove DOT ABOVE after "i" with upper or titlecase.
02930                                         // - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle,
02931                                         //   the "i" may have been kept lowercase and thus we shouldn't remove the dot).
02932                                         if (how == ccLower) { dest.Add(0x307); continue; }
02933                                         if (how == ccUpper) continue;
02934                                         Assert(how == ccTitle);
02935                                         Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character
02936                                         if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot.
02937                                         dest.Add(0x307); continue;
02938                                 }
02939                         }
02940                 }
02941                 else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri)
02942                 {
02943                         // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
02944                         // The following rules handle those cases.
02945                         if (cp == LatinCapitalLetterIWithDotAbove) {
02946                                 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; }
02947                         // When lowercasing, remove dot_above in the sequence I + dot_above,
02948                         // which will turn into i.  This matches the behavior of the
02949                         // canonically equivalent I-dot_above.
02950                         else if (cp == CombiningDotAbove)
02951                         {
02952                                 // AfterI: the last preceding base character was an uppercase I,
02953                                 // and there is no intervening combining character class 230 (ABOVE).
02954                                 bool afterI = false;
02955                                 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
02956                                 while (origSrcIdx < srcIdx2)
02957                                 {
02958                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
02959                                         if (cp2 == LatinCapitalLetterI) { afterI = true; break; }
02960                                         int cc2 = GetCombiningClass(cp2);
02961                                         if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break;
02962                                 }
02963                                 if (afterI) {
02964                                         if (how == ccTitle && seenCased && ! seenTwoCased) {
02965                                                 // Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word;
02966                                                 // if found, map it to titlecase; otherwise, map all characters in that word to lowercase.
02967                                                 // This suggests that if a cased character is found, others in that word should be left alone.
02968                                                 // This seems unusual; we map all other characters to lowercase instead.
02969                                                 // But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above
02970                                                 // is not the first cased character (it isn't even cased), we attempt to set it to lowercase;
02971                                                 // but since afterI is also true here, this would mean deleting it.  Thus our titlecased
02972                                                 // form of "I followed by dot-above" would be just "I", which is clearly wrong.
02973                                                 // So we treat this as a special case here.
02974                                                 IAssert(cpFirstCased == LatinCapitalLetterI);
02975                                                 dest.Add(0x307); continue; }
02976                                         if (howHere != ccLower) dest.Add(0x307);
02977                                         continue; }
02978                         }
02979                         // When lowercasing, unless an I is before a dot_above,
02980                         // it turns into a dotless i.
02981                         else if (cp == LatinCapitalLetterI)
02982                         {
02983                                 // BeforeDot: C is followed by U+0307 (combining dot above).
02984                                 // Any sequence of characters with a combining class that is
02985                                 // neither 0 nor 230 may intervene between the current character
02986                                 // and the combining dot above.
02987                                 bool beforeDot = false;
02988                                 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
02989                                 {
02990                                         const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
02991                                         if (cp2 == 0x307) { beforeDot = true; break; }
02992                                         const int cc2 = GetCombiningClass(cp2);
02993                                         if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break;
02994                                 }
02995                                 if (! beforeDot) {
02996                                         dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; }
02997                         }
02998                         // When uppercasing, i turns into a dotted capital I.
02999                         else if (cp == LatinSmallLetterI)
03000                         {
03001                                 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue;
03002                         }
03003                 }
03004                 // Try to use the unconditional mappings.
03005                 const TIntIntVH &specHere = (
03006                         howHere == how ? specials :
03007                         howHere == ccLower ? specialCasingLower :
03008                         howHere == ccTitle ? specialCasingTitle :
03009                         howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0));
03010                 int i = specHere.GetKeyId(cp);
03011                 if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; }
03012                 // Try to use the simple (one-character) mappings.
03013                 i = h.GetKeyId(cp);
03014                 if (i >= 0) {
03015                         const TUniChInfo &ci = h[i];
03016                         int cpNew = (
03017                                 howHere == ccLower ? ci.simpleLowerCaseMapping :
03018                                 howHere == ccUpper ? ci.simpleUpperCaseMapping :
03019                                                                          ci.simpleTitleCaseMapping);
03020                         if (cpNew < 0) cpNew = cp;
03021                         dest.Add(cpNew); continue; }
03022                 // As a final resort, leave 'cp' unchanged.
03023                 dest.Add(cp);
03024         }
03025 }
03026
03027 template<typename TSrcVec, typename TDestCh>
03028 void TUniChDb::GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03029         TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const
03030 {
03031         if (clrDest) dest.Clr();
03032         bool seenCased = false; size_t nextWordBoundary = srcIdx;
03033         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
03034         {
03035                 const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
03036                 int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; }
03037                 const TUniChInfo &ci = h[i];
03038                 // With titlecasing, the first cased character of each word must be put into titlecase,
03039                 // all others into lowercase.  This is what the howHere variable is for.
03040                 TUniChDb::TCaseConversion howHere;
03041                 if (how != ccTitle) howHere = how;
03042                 else {
03043                         if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
03044                                 seenCased = false;
03045                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
03046                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
03047                         bool isCased = IsCased(cp);
03048                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
03049                         else howHere = ccLower;
03050                 }
03051                 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
03052                 if (cpNew < 0) cpNew = cp;
03053                 dest.Add(cpNew);
03054         }
03055 }
03056
03057 template<typename TSrcVec>
03058 void TUniChDb::ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const
03059 {
03060         bool seenCased = false; size_t nextWordBoundary = srcIdx;
03061         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
03062         {
03063                 const int cp = src[TVecIdx(srcIdx)];
03064                 int i = h.GetKeyId(cp); if (i < 0) continue;
03065                 const TUniChInfo &ci = h[i];
03066                 // With titlecasing, the first cased character of each word must be put into titlecase,
03067                 // all others into lowercase.  This is what the howHere variable is for.
03068                 TUniChDb::TCaseConversion howHere;
03069                 if (how != ccTitle) howHere = how;
03070                 else {
03071                         if (srcIdx == nextWordBoundary) { // A word starts/ends here.
03072                                 seenCased = false;
03073                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
03074                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
03075                         bool isCased = IsCased(cp);
03076                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
03077                         else howHere = ccLower;
03078                 }
03079                 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
03080                 if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew;
03081         }
03082 }
03083
03084 //-----------------------------------------------------------------------------
03085 // TUniChDb -- composition, decomposition, normal forms
03086 //-----------------------------------------------------------------------------
03087
03088 template<typename TDestCh>
03089 void TUniChDb::AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const
03090 {
03091         if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount)
03092         {
03093                 // UAX #15, sec. 16: Hangul decomposition
03094                 const int SIndex = codePoint - HangulSBase;
03095                 const int L = HangulLBase + SIndex / HangulNCount;
03096                 const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount;
03097                 const int T = HangulTBase + (SIndex % HangulTCount);
03098                 dest.Add(L); dest.Add(V);
03099                 if (T != HangulTBase) dest.Add(T);
03100                 return;
03101         }
03102         int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; }
03103         const TUniChInfo &ci = h[i];
03104         int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; }
03105         if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; }
03106         while (true) {
03107                 int cp = decompositions[ofs++]; if (cp < 0) return;
03108                 AddDecomposition(cp, dest, compatibility); }
03109 }
03110
03111 template<typename TSrcVec, typename TDestCh>
03112 void TUniChDb::Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03113                 TVec<TDestCh>& dest, const bool compatibility, bool clrDest) const
03114 {
03115         if (clrDest) dest.Clr();
03116         const size_t destStart = dest.Len()/*, srcEnd = srcIdx + srcCount*/;
03117         // Decompose the string.
03118         while (srcIdx < srcCount) {
03119                 AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; }
03120         // Rearrange the decomposed string into canonical order.
03121         for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; )
03122         {
03123                 size_t j = destIdx;
03124                 int cp = dest[TVecIdx(destIdx)]; destIdx++;
03125                 int cpCls = GetCombiningClass(cp);
03126                 if (cpCls == TUniChInfo::ccStarter) continue;
03127                 while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) {
03128                         dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; }
03129                 dest[TVecIdx(j)] = cp;
03130         }
03131 }
03132
03133 template<typename TSrcVec, typename TDestCh>
03134 void TUniChDb::DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03135                 TVec<TDestCh>& dest, bool compatibility, bool clrDest) const
03136 {
03137         if (clrDest) dest.Clr();
03138         TIntV temp;
03139         Decompose(src, srcIdx, srcCount, temp, compatibility);
03140         Compose(temp, 0, temp.Len(), dest, clrDest);
03141 }
03142
03143 template<typename TSrcVec, typename TDestCh>
03144 void TUniChDb::Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03145                 TVec<TDestCh>& dest, bool clrDest) const
03146 {
03147         if (clrDest) dest.Clr();
03148         bool lastStarterKnown = false; // has a starter been encountered yet?
03149         size_t lastStarterPos = size_t(-1);  // the index (in 'dest') of the last starter
03150         int cpLastStarter = -1; // the codepoint of the last starter (i.e. cpLastStarter == dest[lastStarterPos])
03151         const size_t srcEnd = srcIdx + srcCount;
03152         int ccMax = -1; // The highest combining class among the characters since the last starter.
03153         while (srcIdx < srcEnd)
03154         {
03155                 const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
03156                 const int cpClass = GetCombiningClass(cp);
03157                 //int cpCombined = -1;
03158                 // If there is a starter with which 'cp' can be combined, and from which it is not blocked
03159                 // by some intermediate character, we can try to combine them.
03160                 if (lastStarterKnown && ccMax < cpClass)
03161                 {
03162                         int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp));
03163                         int cpCombined = -1;
03164                         do {
03165                                 // Try to look up a composition in the inverseDec table.
03166                                 if (j >= 0) { cpCombined = inverseDec[j]; break; }
03167                                 // UAX #15, sec. 16: Hangul composition
03168                                 // - Try to combine L and V.
03169                                 const int LIndex = cpLastStarter - HangulLBase;
03170                                 if (0 <= LIndex && LIndex < HangulLCount) {
03171                                         const int VIndex = cp - HangulVBase;
03172                                         if (0 <= VIndex && VIndex < HangulVCount) {
03173                                                 cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
03174                                                 break; } }
03175                                 // - Try to combine LV and T.
03176                                 const int SIndex = cpLastStarter - HangulSBase;
03177                                 if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0)
03178                                 {
03179                                         const int TIndex = cp - HangulTBase;
03180                                         if (0 <= TIndex && TIndex < HangulTCount) {
03181                                                 cpCombined = cpLastStarter + TIndex;
03182                                                 break; }
03183                                 }
03184                         } while (false);
03185                         // If a combining character has been found, use it to replace the old cpStarter.
03186                         if (cpCombined >= 0) {
03187                                 dest[TVecIdx(lastStarterPos)] = cpCombined;
03188                                 Assert(GetCombiningClass(cpCombined) == TUniChInfo::ccStarter);
03189                                 // if (cpCombined is not a starter) { starterKnown = false; lastStarterPos = size_t(01); cpLastStarter = -1; } else
03190                                 cpLastStarter = cpCombined; continue; }
03191                 }
03192                 if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter, remember it for later.  Set ccMax to -1 so that this starter can be combined with another starter.
03193                         lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; }
03194                 else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking).
03195                         ccMax = cpClass;
03196                 dest.Add(cp);
03197         }
03198 }
03199
03200 template<typename TSrcVec, typename TDestCh>
03201 size_t TUniChDb::ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03202                 TVec<TDestCh>& dest, bool clrDest) const
03203 {
03204         if (clrDest) dest.Clr();
03205         size_t retVal = 0;
03206         for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
03207                 const int cp = src[TVecIdx(srcIdx)];
03208                 if (GetCombiningClass(cp) == TUniChInfo::ccStarter)
03209                         { dest.Add(cp); retVal++; } }
03210         return retVal;
03211 }
03212
03213 inline bool AlwaysFalse()
03214 {
03215         int sum = 0;
03216         for (int i = 0; i < 5; i++) sum += i;
03217         return sum > 100;
03218 }
03219
03220 inline bool AlwaysTrue()
03221 {
03222         int sum = 0;
03223         for (int i = 0; i < 5; i++) sum += i;
03224         return sum < 100;
03225 }
03226
03227 /*
03228
03229 Notes on decomposition:
03230
03231 - In UnicodeData.txt, there is a field with the decomposition mapping.
03232   This field may also include a tag, <...>.
03233   If there is a tag, this is a compatibility mapping.
03234   Otherwise it is a canonical mapping.
03235 - Canonical decomposition uses only canonical mappings,
03236   compatibility decomposition uses both canonical and compatibility mappings.
03237 - Decomposition:
03238   1. Apply the decomposition mappings (canonical or canonical+compatibility), recursively.
03239   2. Put the string into canonical order, which means:
03240      while there exists a pair of characters, A immediately followed by B,
03241          such that combiningclass(A) > combiningclass(B) > 0  [an "exchangeable pair"]:
03242            swap A and B;
03243   This results in NFD (normalized form D, after canonical decomposition)
03244   or NFKD (normalized form KD, after compatibility decomposition).
03245 - Canonical composition:
03246   1. Before composition, the string should have been decomposed
03247      (using either canonical or compatibility decomposition).
03248   2. For each character C (from left to right):
03249      2.1.  Find the last starter S before C (if not found, continue).
03250          2.2.  If there is, between S and C, some character with a combining class >= than that of C, then continue.
03251          2.3.  If there exists a character L for which the canonical decomposition is S+L
03252                and L is not in the composition exclusion table [i.e. L is a "primary composite"],
03253                    then replace S by L, and remove C.
03254   This results in NFC (normalized form C, with canonical decomposition followed by canonical composition)
03255   or NFKC (normalized form KD, with compatibility decomposition followed by canonical composition).
03256 - Composition exclusion table:
03257   - Anything in CompositionExclusions.txt.
03258   - Singletons: characters whose canonical decomposition is a single character.
03259   - Non-starter decompositions: characters whose canonical decomposition begins with a non-starter.
03260
03261 Example:
03262                  E-grave  (00c8; composition class 0; canonical decomposition: 0045 0300)
03263                                  E-macron (0112; composition class 0;                          0045 0304)
03264                                  grave   (0300; composition class 230)
03265                  macron  (0304; composition class 230)
03266   source string: 00c8 0304
03267   after canonical decomposition (or compatibility decomposition, they would be the same here): 0045 0300 0304
03268   after canonical composition: 00c8 0304
03269
03270   cc(horn) = 216
03271   cc(dot below) = 220
03272   cc(dot above) = 230
03273
03274 ToDos:
03275 - case folding - je misljen predvsem za primerjanje tako dobljenih nizov.
03276   Funkcija f(s) = NFC(toCaseFold(s)) je idempotentna.
03277   Funkcija g(s) = NFKC(toCaseFold(s)) pa ni -- ce hocemo to, moramo pri foldingu
03278   upostevati se nekaj dodatnih mappingov (glej 5.18, zadnji odstavek; DerivedNormalizationProps.txt).
03279 - Zdi se, da je CaseFolding.txt v bistvu cisto navaden folding v lowercase.
03280   Ker hocemo imeti tudi ostale foldinge, glejmo raje SpecialCasing.txt
03281   (+ simple case mappinge v UnicodeData.txt).
03282   Predlagam, da pri branju SpecialCasing.txt conditional mappinge kar ignoriramo
03283   in jih potem upostevamo posebej kar v source kodi nasih programov [za
03284   podrobno definicijo pogojev pa glej tabelo 3.13].
03285   - Pripis: vseeno se mi zdi, da je CaseFolding.txt nekaj malo drugacnega od navadnega lowercase.
03286     Na primer, za small final sigma 03c2 je tam navedeno, naj se spremeni v navadno small sigma 03c3.
03287         To ne sledi niti iz UnicodeData.txt niti iz SpecialCasing.txt, pa ceprav v UCD.html pise,
03288         da je CaseFolding.txt izpeljan iz njiju.  Glavni namen CaseFolding.txt naj bi bil za
03289         potrebe "locale-independent case folding" (table 4.1 in sec. 5.18).
03290   - Preden se zacnes ubadati s case conversioni, si oglej razdelek 3.13
03291     in se posebej str. 90.
03292   - Glej str. 91 o kombinaciji N[K]FD + caseFold + N[K]FD
03293   - definicija cased ipd. na str. 89
03294 - isIdentifierStart(c), isIdentifierEnd(c) -- sec. 5.15
03295   Glej DerivedCoreProperties.txt, kjer je na podoben nacin definiranih se kup podobnih
03296   stvari, med drugim isLowerCase in isUpperCase.  Tam je tudi isLetter, isAlphabetic itd. (sec. 4.9).
03297   To je se najbolje dodati med flagse posameznega characterja.
03298 - general category: sec. 4.5
03299 - motivacija za titlecase: 5.18
03300 - primerjaj nas dosedanji izracun compositionExclusion s tistim, kar je naracunano v DerivedNormalizationProps.txt
03301   pod Full_Composition_Exclusion
03302 - script names: Scripts.txt in UAX #24.
03303 - block names: Blocks.txt
03304 - space characters: table 6.2 in baje tudi UCD.html
03305 - dash characters: table 6.3
03306 */
03307
03308 //#endif
03309