SNAP Library, User Reference
2012-10-15 15:06:59
SNAP, a general purpose network analysis and graph mining library
//#ifndef unicode_h
//#define unicode_h

// Includes
//#include "base.h"
#include <new>

typedef int TUniVecIdx;

//-----------------------------------------------------------------------------
// TUniCodec -- a UTF-8 and UTF-16 Encoder/Decoder
//-----------------------------------------------------------------------------

// Error handling modes for the TUniCodec class.
typedef enum TUnicodeErrorHandling_
{
  // What happens when an error occurs:
  uehIgnore = 0,  // - it is silently ignored (nothing is added to the output vector)
  uehThrow = 1,   // - an exception is thrown (TUnicodeException)
  uehReplace = 2, // - the replacement character is added to the output vector
  uehAbort = 3    // - the encoding/decoding process stops immediately
}
TUnicodeErrorHandling;

class TUnicodeException
{
public:
  TStr message;  // error message
  size_t srcIdx; // the position in the source vector where the error occurred
  int srcChar;   // the source character at the position srcIdx
  TUnicodeException(size_t srcIdx_, int srcChar_, const TStr& message_) :
    message(message_), srcIdx(srcIdx_), srcChar(srcChar_) { }
};

typedef enum TUniByteOrder_
{
  boMachineEndian = 0,
  boLittleEndian = 1,
  boBigEndian = 2
}
TUniByteOrder;

typedef enum TUtf16BomHandling_
{
  bomAllowed = 0,  // if a BOM is present, it is used to determine the byte order; otherwise, the default byte order is used
  bomRequired = 1, // if a BOM is present, it is used to determine the byte order; otherwise, an error is reported
  bomIgnored = 2   // the default byte order is used; if a BOM is present, it is treated like any other character
}
TUtf16BomHandling;

class TUniCodec
{
public:
  // 0xfffd is defined as the replacement character by the Unicode standard.
  // By default, it is rendered as a question mark inside a diamond: "<?>".
  enum { DefaultReplacementChar = 0xfffd };

  // The replacement character is inserted into the destination vector
  // if an error occurs in the source vector. By default, this is set
  // to DefaultReplacementChar.
  int replacementChar;
  // The error handling mode.
  TUnicodeErrorHandling errorHandling;
  // There are a number of situations where there is strictly speaking an error in
  // the source data although it can still be decoded in a reasonably meaningful way.
  // If strict == true, these situations are treated as errors. Examples:
  // - when decoding UTF-8:
  //   - a codepoint represented by more bytes than necessary (e.g. one of the characters 0..127
  //     encoded as a two-byte sequence)
  //   - a codepoint > 0x10ffff
  // - when decoding UTF-16:
  //   - a codepoint from the range reserved for the second character of a surrogate pair
  //     is not preceded by a codepoint from the range reserved for the first character of a surrogate pair
  // - when encoding UTF-8:
  //   - a codepoint > 0x10ffff
  // - when encoding UTF-16:
  //   - a codepoint from the range reserved for the second character of a surrogate pair
  //     [note that a codepoint > 0x10ffff, or from the range reserved for the first character of a
  //     surrogate pair, is always an error, even with strict == false]
  bool strict;
  // skipBom == true means: If a byte-order-mark (0xfffe or 0xfeff) occurs at the beginning
  // of the source vector, it is skipped (when decoding).
  // - Note: a BOM is not really useful in UTF-8 encoded data. However, the .NET UTF8Encoding
  //   emits 0xfeff by default as a kind of preamble. It gets encoded as 3 bytes, ef bb bf,
  //   and can be helpful to make the data easier to recognize as UTF-8 encoded data.
  bool skipBom;

  TUniCodec() : replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true)
  {
  }

  TUniCodec(TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_) :
    replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
  {
  }

protected:
  enum {
#define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
    DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
    DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
#undef DefineByte
  };

  typedef TUniVecIdx TVecIdx;
  //friend class TUniChDb;
  friend class TUniCaseFolding;

public:

  //-----------------------------------------------------------------------
  // UTF-8
  //-----------------------------------------------------------------------

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf8(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest = true) const;
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }
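
  // A minimal usage sketch (not part of the original header; the variable names are
  // illustrative only). It assumes the UTF-8 input is stored one byte per element of a
  // TIntV and that decoded codepoints are collected into another TIntV:
  //
  //   TUniCodec codec(uehThrow, /*strict=*/false, TUniCodec::DefaultReplacementChar, /*skipBom=*/true);
  //   TIntV utf8Bytes;     // filled elsewhere, one byte per element
  //   TIntV codePoints;
  //   size_t nDecoded = codec.DecodeUtf8(utf8Bytes, codePoints);
  //   // ... and back again, into a vector of UTF-8 bytes:
  //   TIntV utf8Out;
  //   size_t nEncoded = codec.EncodeUtf8(codePoints, utf8Out);
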
  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf8(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest = true) const;
  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }

  // The following wrappers around the UTF-8 encoder return a TStr containing
  // the UTF-8-encoded version of the input string.
  template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src, size_t srcIdx, const size_t srcCount) const { TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); TStr retVal = &(temp[0]); return retVal; }
  template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src) const { TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }

  //-----------------------------------------------------------------------
  // UTF-16 Decoder
  //-----------------------------------------------------------------------

protected:
  enum {
    Utf16FirstSurrogate = 0xd800,
    Utf16SecondSurrogate = 0xdc00
  };

  static bool IsMachineLittleEndian();

public:

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  // Each element of 'src' is assumed to contain one byte of data.
  // srcCount must be even (though srcIdx doesn't need to be).
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf16FromBytes(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest,
    const TUtf16BomHandling bomHandling = bomAllowed,
    const TUniByteOrder defaultByteOrder = boMachineEndian) const;

  // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings
  // are used to determine if the two bytes of each word should be swapped before further
  // processing. For example, if a BOM is present, it must have the value 0xfeff; if it
  // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
  // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
  // beginning of the source data is used to determine the "original" byte order of the data;
  // if this doesn't match the byte order of the local machine, the two bytes of each word will
  // be swapped during the decoding process.
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf16FromWords(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, bool clrDest,
    const TUtf16BomHandling bomHandling = bomAllowed,
    const TUniByteOrder defaultByteOrder = boMachineEndian) const;

  //-----------------------------------------------------------------------
  // UTF-16 Encoder
  //-----------------------------------------------------------------------

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  //
  // Notes:
  // - UTF-16 *cannot* encode characters above 0x10ffff, so their presence is always
  //   treated as an error, regardless of the value of 'strict'.
  // - Characters from the range Utf16FirstSurrogate through Utf16FirstSurrogate + 1023
  //   cannot be encoded by UTF-16 either, as they would be misinterpreted during decoding
  //   as the first character of a surrogate pair.
  // - Characters from the range Utf16SecondSurrogate through Utf16SecondSurrogate + 1023
  //   can be encoded in principle; however, if strict == true, they are treated as errors.
  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf16ToWords(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
    const TUniByteOrder destByteOrder = boMachineEndian) const;

  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf16ToBytes(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
    const TUniByteOrder destByteOrder = boMachineEndian) const;
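
  // A minimal usage sketch (not part of the original header; variable names are
  // illustrative only). Decoding a UTF-16 byte stream whose byte order is taken from the
  // BOM if one is present, then re-encoding the codepoints as little-endian 16-bit words:
  //
  //   TUniCodec codec;
  //   TIntV utf16Bytes;    // one byte per element
  //   TIntV codePoints;
  //   codec.DecodeUtf16FromBytes(utf16Bytes, 0, utf16Bytes.Len(), codePoints, true,
  //                              bomAllowed, boMachineEndian);
  //   TIntV utf16Words;    // one 16-bit word per element
  //   codec.EncodeUtf16ToWords(codePoints, 0, codePoints.Len(), utf16Words, true,
  //                            /*insertBom=*/true, boLittleEndian);
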
  //-----------------------------------------------------------------------
  // Helper declarations for the test drivers
  //-----------------------------------------------------------------------

protected:

  static uint GetRndUint(TRnd& rnd);
  static uint GetRndUint(TRnd& rnd, uint minVal, uint maxVal);

  //-----------------------------------------------------------------------
  // UTF-8 Test Driver
  //-----------------------------------------------------------------------

protected:
  void TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f);
  // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc',
  // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected.
  void TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc);
public:
  void TestUtf8();

  //-----------------------------------------------------------------------
  // UTF-16 Test Driver
  //-----------------------------------------------------------------------

protected:
  void WordsToBytes(const TIntV& src, TIntV& dest);
  void TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest,
    // Note: insertBom is only used with the encoder. When encoding, 'defaultByteOrder' is used as the destination byte order.
    const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom,
    FILE *f);
  static inline int SwapBytes(int x) {
    return ((x >> 8) & 0xff) | ((x & 0xff) << 8); }
  // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc',
  // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected.
  void TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc,
    const TUtf16BomHandling bomHandling,
    const TUniByteOrder defaultByteOrder,
    const bool insertBom);
public:
  void TestUtf16();

};

//-----------------------------------------------------------------------------
// Case folding
//-----------------------------------------------------------------------------
// Note: there's no need to access this class directly.
// Use TUniChDb::GetCaseFolded() instead.

typedef THash<TInt, TIntV> TIntIntVH;

class TUniCaseFolding
{
protected:
  TIntH cfCommon, cfSimple, cfTurkic;
  TIntIntVH cfFull;

  template<typename TSrcDat, typename TDestDat>
  inline static void AppendVector(const TVec<TSrcDat>& src, TVec<TDestDat>& dest) {
    for (int i = 0; i < src.Len(); i++) dest.Add(src[i]); }
  friend class TUniChDb;
  typedef TUniVecIdx TVecIdx;

public:
  TUniCaseFolding() { }
  explicit TUniCaseFolding(TSIn& SIn) : cfCommon(SIn), cfSimple(SIn), cfTurkic(SIn), cfFull(SIn) { SIn.LoadCs(); }
  void Load(TSIn& SIn) { cfCommon.Load(SIn); cfSimple.Load(SIn); cfFull.Load(SIn); cfTurkic.Load(SIn); SIn.LoadCs(); }
  void Save(TSOut& SOut) const { cfCommon.Save(SOut); cfSimple.Save(SOut); cfFull.Save(SOut); cfTurkic.Save(SOut); SOut.SaveCs(); }
  void Clr() { cfCommon.Clr(); cfSimple.Clr(); cfFull.Clr(); cfTurkic.Clr(); }
  void LoadTxt(const TStr& fileName);

  // Use 'turkic' when processing text in a Turkic language (tr, az). This only affects the uppercase I and I-with-dot-above.
  template<typename TSrcVec, typename TDestCh>
  void Fold(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic) const
  {
    for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
    {
      int c = src[TVecIdx(srcIdx)], i; srcIdx++;
      if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { dest.Add(cfTurkic[i]); continue; }
      if (full && ((i = cfFull.GetKeyId(c)) >= 0)) { AppendVector(cfFull[i], dest); continue; }
      if ((!full) && ((i = cfSimple.GetKeyId(c)) >= 0)) { dest.Add(cfSimple[i]); continue; }
      i = cfCommon.GetKeyId(c); if (i >= 0) dest.Add(cfCommon[i]); else dest.Add(c);
    }
  }

  template<typename TSrcVec>
  void FoldInPlace(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic) const
  {
    for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
    {
      int c = src[TVecIdx(srcIdx)], i;
      if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { src[TVecIdx(srcIdx)] = cfTurkic[i]; continue; }
      if ((i = cfSimple.GetKeyId(c)) >= 0) { src[TVecIdx(srcIdx)] = cfSimple[i]; continue; }
      i = cfCommon.GetKeyId(c); if (i >= 0) src[TVecIdx(srcIdx)] = cfCommon[i];
    }
  }

protected:
  void Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f);
public:
  void Test();
};
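
// A minimal usage sketch (not part of the original header; as noted above, application code
// would normally go through TUniChDb::GetCaseFolded() rather than using this class directly).
// Folding a vector of codepoints after loading the mappings from a CaseFolding.txt file
// (the file path is illustrative):
//
//   TUniCaseFolding folding;
//   folding.LoadTxt("CaseFolding.txt");
//   TIntV codePoints, folded;   // codePoints filled elsewhere
//   folding.Fold(codePoints, 0, codePoints.Len(), folded, /*clrDest=*/true,
//                /*full=*/true, /*turkic=*/false);
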
//-----------------------------------------------------------------------------
// TCodecBase -- an abstract base class for codecs
//-----------------------------------------------------------------------------

class TCodecBase;
typedef TPt<TCodecBase> PCodecBase;
typedef TVec<PCodecBase> TCodecBaseV;

class TCodecBase
{
protected:
  TCRef CRef;
  friend class TPt<TCodecBase>;
public:
  virtual ~TCodecBase() { }

  template<class TCodecImpl>
  static PCodecBase New(); /* {
    return new TCodecWrapper<TCodecImpl>(); } */

  virtual TStr GetName() const = 0;
  virtual void Test() const { }

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
  virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;

  size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const = 0;
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const = 0;

  size_t FromUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t FromUnicode(const TIntV& src, TChA& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t FromUnicode(const TIntV& src, TStr& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
};

//-----------------------------------------------------------------------------
// TCodecWrapper -- a descendant of TCodecBase; relies on a template
// parameter class for the actual implementation of the codec.
//-----------------------------------------------------------------------------
// Thus, if you know in advance that you'll need ISO-8859-2, just use
// T8BitCodec<TEncoding_ISO8859_2>. If you don't know the encoding
// in advance, use a PCodecBase pointing to a suitable specialization
// of TCodecWrapper<...>. You can use TUnicode::GetCodec(TStr& name)
// to obtain a suitable pointer.

template<class TCodecImpl_>
class TCodecWrapper : public TCodecBase
{
public:
  typedef TCodecImpl_ TCodecImpl;
  TCodecImpl impl;
public:

  virtual TStr GetName() const { return impl.GetName(); }

  virtual void Test() const { impl.Test(); }

  virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
    return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
  virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
    return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }

  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
    return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const {
    return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
    TChA buf; size_t retVal = impl.FromUnicode(src, srcIdx, srcCount, buf, false);
    if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
    return retVal; }
};

template<class TCodecImpl>
PCodecBase TCodecBase::New() {
  return new TCodecWrapper<TCodecImpl>();
}
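
// A minimal usage sketch (not part of the original header). Obtaining a codec behind the
// abstract TCodecBase interface when the concrete encoding is only known at run time, then
// converting a TStr to codepoints; TUnicode::GetCodec is the lookup-by-name route mentioned
// in the comment above:
//
//   PCodecBase codec = TCodecBase::New<TCodec_ISO8859_2>();  // or obtained via TUnicode::GetCodec("ISO-8859-2")
//   TIntV codePoints;
//   codec->ToUnicode(TStr("nekaj besedila"), codePoints);
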
//-----------------------------------------------------------------------------
// TVecElt -- a template for determining the type of a vector's elements
//-----------------------------------------------------------------------------

template<class TVector_>
class TVecElt
{
};

template<class TDat>
class TVecElt<TVec<TDat> >
{
public:
  typedef TVec<TDat> TVector;
  typedef TDat TElement;
  static inline void Add(TVector& vector, const TElement& element) { vector.Add(element); }
};

template<>
class TVecElt<TChA>
{
public:
  typedef TChA TVector;
  typedef char TElement;
  static inline void Add(TVector& vector, const TElement& element) { vector += element; }
};


//-----------------------------------------------------------------------------
// T8BitCodec -- a class for converting between 8-bit encodings and Unicode
//-----------------------------------------------------------------------------

class TEncoding_ISO8859_1
{
public:
  static inline TStr GetName() { return "ISO-8859-1"; }
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255); return c; }
  static int FromUnicode(int c) { if (0 <= c && c <= 255) return c; else return -1; }
};

class TEncoding_ISO8859_2 // ISO Latin 2
{
public:
  static inline TStr GetName() { return "ISO-8859-2"; }
  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0xa0) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else return -1; }
};

class TEncoding_ISO8859_3
{
public:
  static inline TStr GetName() { return "ISO-8859-3"; }
  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0xa0) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2d8 <= c && c < 0x2da) return fromUnicodeTable2[c - 0x2d8];
    else return -1; }
};

class TEncoding_ISO8859_4
{
public:
  static inline TStr GetName() { return "ISO-8859-4"; }
  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0xa0) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else return -1; }
};

class TEncoding_YuAscii
{
public:
  static const int uniChars[10], yuAsciiChars[10];
  static inline TStr GetName() { return "YU-ASCII"; }
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    for (int i = 0; i < int(sizeof(yuAsciiChars) / sizeof(yuAsciiChars[0])); i++)
      if (c == yuAsciiChars[i]) return uniChars[i];
    return c; }
  static int FromUnicode(int c) {
    for (int i = 0; i < int(sizeof(uniChars) / sizeof(uniChars[0])); i++)
      if (c == uniChars[i]) return yuAsciiChars[i];
      else if (c == yuAsciiChars[i]) return -1;
    if (0 <= c && c <= 255) return c; else return -1; }
};

class TEncoding_CP437 // DOS US
{
public:
  static inline TStr GetName() { return "CP437"; }
  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[6 * 16], fromUnicodeTable2[4 * 16], fromUnicodeTable3[6 * 16], fromUnicodeTable4[11 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0x80) return c;
    else if (0xa0 <= c && c < 0x100) return fromUnicodeTable1[c - 0xa0];
    else if (0x390 <= c && c < 0x3d0) return fromUnicodeTable2[c - 0x390];
    else if (0x2210 <= c && c < 0x2270) return fromUnicodeTable3[c - 0x2210];
    else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable4[c - 0x2500];
    else if (c == 0x192) return 0x9f;
    else if (c == 0x207f) return 0xfc;
    else if (c == 0x20a7) return 0x9e;
    else if (c == 0x2310) return 0xa9;
    else if (c == 0x2320) return 0xf4;
    else if (c == 0x2321) return 0xf5;
    else return -1; }
};

class TEncoding_CP852 // DOS Latin 2
{
public:
  static inline TStr GetName() { return "CP852"; }
  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[11 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0x80) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable3[c - 0x2500];
    else return -1; }
};

class TEncoding_CP1250 // Windows-1250, similar to ISO Latin 2
{
public:
  static inline TStr GetName() { return "CP1250"; }
  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[3 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0x80) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else if (0x2010 <= c && c < 0x2040) return fromUnicodeTable3[c - 0x2010];
    else if (c == 0x20ac) return 0x80;
    else if (c == 0x2122) return 0x99;
    else return -1; }
};

template<class TEncoding_>
class T8BitCodec
{
protected:
  typedef TUniVecIdx TVecIdx;
public:
  typedef TEncoding_ TEncoding;
  TUnicodeErrorHandling errorHandling;
  int replacementChar;

  T8BitCodec() : errorHandling(uehIgnore), replacementChar(TUniCodec::DefaultReplacementChar) { }
  T8BitCodec(TUnicodeErrorHandling errorHandling_, int replacementChar_ = TUniCodec::DefaultReplacementChar) :
    errorHandling(errorHandling_), replacementChar(replacementChar_) { }
  static TStr GetName() { return TEncoding::GetName(); }

  void Test() const
  {
    int nDecoded = 0;
    for (int c = 0; c <= 255; c++) {
      int cu = TEncoding::ToUnicode(c); if (cu == -1) continue;
      nDecoded++;
      IAssert(0 <= cu && cu < 0x110000);
      int c2 = TEncoding::FromUnicode(cu);
      IAssert(c2 == c); }
    int nEncoded = 0;
    for (int cu = 0; cu < 0x110000; cu++) {
      int c = TEncoding::FromUnicode(cu); if (c == -1) continue;
      nEncoded++;
      IAssert(0 <= c && c <= 255);
      int cu2 = TEncoding::ToUnicode(c);
      IAssert(cu2 == cu); }
    IAssert(nDecoded == nEncoded);
  }

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestCh>
  size_t ToUnicode(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest = true) const
  {
    if (clrDest) dest.Clr();
    size_t toDo = srcCount;
    while (toDo-- > 0) {
      int chSrc = ((int) src[TVecIdx(srcIdx)]) & 0xff; srcIdx++;
      int chDest = TEncoding::ToUnicode(chSrc);
      dest.Add(chDest); }
    return srcCount;
  }
  template<typename TSrcVec, typename TDestCh>
  size_t ToUnicode(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }

  size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestVec>
  size_t FromUnicode(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TDestVec& dest, const bool clrDest = true) const
  {
    typedef typename TVecElt<TDestVec>::TElement TDestCh;
    if (clrDest) dest.Clr();
    size_t toDo = srcCount, nEncoded = 0;
    while (toDo-- > 0) {
      int chSrc = (int) src[TVecIdx(srcIdx)]; srcIdx++;
      int chDest = TEncoding::FromUnicode(chSrc);
      if (chDest < 0) {
        switch (errorHandling) {
        case uehThrow: throw TUnicodeException(srcIdx - 1, chSrc, "Invalid character for encoding into " + GetName() + ".");
        case uehAbort: return nEncoded;
        case uehReplace: TVecElt<TDestVec>::Add(dest, TDestCh(replacementChar)); continue;
        case uehIgnore: continue;
        default: Fail; } }
      TVecElt<TDestVec>::Add(dest, TDestCh(chDest)); nEncoded++; }
    return nEncoded;
  }

  template<typename TSrcVec, typename TDestVec>
  size_t FromUnicode(const TSrcVec& src, TDestVec& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }

  size_t UniToStr(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
    TChA buf; size_t retVal = FromUnicode(src, srcIdx, srcCount, buf, false);
    if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
    return retVal; }
  size_t UniToStr(const TIntV& src, TStr& dest, const bool clrDest = true) const { return UniToStr(src, 0, src.Len(), dest, clrDest); }
};
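
// A minimal usage sketch (not part of the original header; the input string and variable
// names are illustrative only). Converting ISO Latin 2 bytes to codepoints and back to an
// 8-bit string, substituting '?' for anything Latin 2 cannot represent:
//
//   T8BitCodec<TEncoding_ISO8859_2> latin2(uehReplace, '?');
//   TIntV codePoints;
//   latin2.ToUnicode(TStr("some 8-bit input"), codePoints);
//   TStr roundTripped;
//   latin2.UniToStr(codePoints, roundTripped);
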
typedef T8BitCodec<TEncoding_ISO8859_1> TCodec_ISO8859_1;
typedef T8BitCodec<TEncoding_ISO8859_2> TCodec_ISO8859_2;
typedef T8BitCodec<TEncoding_ISO8859_3> TCodec_ISO8859_3;
typedef T8BitCodec<TEncoding_ISO8859_4> TCodec_ISO8859_4;
typedef T8BitCodec<TEncoding_CP852> TCodec_CP852;
typedef T8BitCodec<TEncoding_CP437> TCodec_CP437;
typedef T8BitCodec<TEncoding_CP1250> TCodec_CP1250;
typedef T8BitCodec<TEncoding_YuAscii> TCodec_YuAscii;

//-----------------------------------------------------------------------------
// Various declarations used by the Unicode Character Database
//-----------------------------------------------------------------------------

typedef enum TUniChCategory_
{
#define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff)
  DefineUniCat(Letter, 'L'),      // ucLetter
  DefineUniCat(Mark, 'M'),
  DefineUniCat(Number, 'N'),
  DefineUniCat(Punctuation, 'P'),
  DefineUniCat(Symbol, 'S'),
  DefineUniCat(Separator, 'Z'),
  DefineUniCat(Other, 'C')
#undef DefineUniCat
}
TUniChCategory;

typedef enum TUniChSubCategory_
{
#define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
  DefineUniSubCat(Letter, Uppercase, 'u'),   // ucLetterUppercase
  DefineUniSubCat(Letter, Lowercase, 'l'),
  DefineUniSubCat(Letter, Titlecase, 't'),
  DefineUniSubCat(Letter, Modifier, 'm'),
  DefineUniSubCat(Letter, Other, 'o'),
  DefineUniSubCat(Mark, Nonspacing, 'n'),
  DefineUniSubCat(Mark, SpacingCombining, 'c'),
  DefineUniSubCat(Mark, Enclosing, 'e'),
  DefineUniSubCat(Number, DecimalDigit, 'd'),
  DefineUniSubCat(Number, Letter, 'l'),
  DefineUniSubCat(Number, Other, 'o'),
  DefineUniSubCat(Punctuation, Connector, 'c'),
  DefineUniSubCat(Punctuation, Dash, 'd'),
  DefineUniSubCat(Punctuation, Open, 's'),
  DefineUniSubCat(Punctuation, Close, 'e'),
  DefineUniSubCat(Punctuation, InitialQuote, 'i'),
  DefineUniSubCat(Punctuation, FinalQuote, 'f'),
  DefineUniSubCat(Punctuation, Other, 'o'),
  DefineUniSubCat(Symbol, Math, 'm'),
  DefineUniSubCat(Symbol, Currency, 'c'),
  DefineUniSubCat(Symbol, Modifier, 'k'),
  DefineUniSubCat(Symbol, Other, 'o'),
  DefineUniSubCat(Separator, Space, 's'),
  DefineUniSubCat(Separator, Line, 'l'),
  DefineUniSubCat(Separator, Paragraph, 'p'),
  DefineUniSubCat(Other, Control, 'c'),
  DefineUniSubCat(Other, Format, 'f'),
  DefineUniSubCat(Other, Surrogate, 's'),
  DefineUniSubCat(Other, PrivateUse, 'o'),
  DefineUniSubCat(Other, NotAssigned, 'n')
}
TUniChSubCategory;

typedef enum TUniChFlags_
{
  ucfCompatibilityDecomposition = 1,  // if this flag is not set, the decomposition is canonical
  ucfCompositionExclusion = 1 << 1,   // from CompositionExclusions.txt
  // Flags used when searching for word boundaries. See UAX #29.
  ucfWbFormat = 1 << 2,
  ucfWbKatakana = 1 << 3,
  ucfWbALetter = 1 << 4,
  ucfWbMidLetter = 1 << 5,
  ucfWbMidNum = 1 << 6,
  ucfWbNumeric = 1 << 7,
  ucfWbExtendNumLet = 1 << 8,
  // Flags used with sentence boundaries (Sep is also used with word boundaries). See UAX #29.
  ucfSbSep = 1 << 9,
  ucfSbFormat = 1 << 10,
  ucfSbSp = 1 << 11,
  ucfSbLower = 1 << 12,
  ucfSbUpper = 1 << 13,
  ucfSbOLetter = 1 << 14,
  ucfSbNumeric = 1 << 15,
  ucfSbATerm = 1 << 16,
  ucfSbSTerm = 1 << 17,
  ucfSbClose = 1 << 18,
  ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose,
  ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep,
  // Flags from DerivedCoreProperties.txt.
  // [The comments are from UCD.html.]
  // - Characters with the Alphabetic property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
  ucfDcpAlphabetic = 1 << 19,
  // - For programmatic determination of default-ignorable code points.
  //   New characters that should be ignored in processing (unless explicitly supported)
  //   will be assigned in these ranges, permitting programs to correctly handle the default
  //   behavior of such characters when not otherwise supported. For more information, see
  //   UAX #29: Text Boundaries [Breaks].
  //   Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - annotation characters
  //   [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff, U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors]
  ucfDcpDefaultIgnorableCodePoint = 1 << 20,
  // - Characters with the Lowercase property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Other_Lowercase + Ll
  ucfDcpLowercase = 1 << 21,
  // - For programmatic determination of grapheme cluster boundaries.
  //   For more information, see UAX #29: Text Boundaries [Breaks].
  //   Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
  ucfDcpGraphemeBase = 1 << 22,
  // - For programmatic determination of grapheme cluster boundaries.
  //   For more information, see UAX #29: Text Boundaries [Breaks].
  //   Generated from: Other_Grapheme_Extend + Me + Mn
  //   Note: depending on an application's interpretation of Co (private use), they may be either
  //   in Grapheme_Base, or in Grapheme_Extend, or in neither.
  ucfDcpGraphemeExtend = 1 << 23,
  // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
  ucfDcpIdStart = 1 << 24,
  ucfDcpIdContinue = 1 << 25,
  // - Characters with the Math property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Sm + Other_Math
  ucfDcpMath = 1 << 26,
  // - Characters with the Uppercase property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Lu + Other_Uppercase
  ucfDcpUppercase = 1 << 27,
  // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
  ucfDcpXidStart = 1 << 28,
  ucfDcpXidContinue = 1 << 29,
  ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase | ucfDcpGraphemeBase | ucfDcpGraphemeExtend |
    ucfDcpIdStart | ucfDcpIdContinue | ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue,
}
TUniChFlags;

typedef enum TUniChProperties_
{
  // The flags from PropList.txt.
  // [The comments are from UCD.html.]
  // - ASCII characters commonly used for the representation of hexadecimal numbers.
  //   [= 0123456789abcdefABCDEF]
  ucfPrAsciiHexDigit = 1,
  // - Those format control characters which have specific functions in the Bidirectional Algorithm.
  ucfPrBidiControl = 2,
  // - Those punctuation characters explicitly called out as dashes in the Unicode Standard,
  //   plus compatibility equivalents to those. Most of these have the Pd General Category,
  //   but some have the Sm General Category because of their use in mathematics.
  //   U+0002d HYPHEN-MINUS
  //   U+0058a ARMENIAN HYPHEN
  //   U+005be HEBREW PUNCTUATION MAQAF
  //   U+01806 MONGOLIAN TODO SOFT HYPHEN
  //   U+02010 HYPHEN
  //   U+02011 NON-BREAKING HYPHEN
  //   U+02012 FIGURE DASH
  //   U+02013 EN DASH
  //   U+02014 EM DASH
  //   U+02015 HORIZONTAL BAR
  //   U+02053 SWUNG DASH
  //   U+0207b SUPERSCRIPT MINUS
  //   U+0208b SUBSCRIPT MINUS
  //   U+02212 MINUS SIGN
  //   U+02e17 DOUBLE OBLIQUE HYPHEN
  //   U+0301c WAVE DASH
  //   U+03030 WAVY DASH
  //   U+030a0 KATAKANA-HIRAGANA DOUBLE HYPHEN
  //   U+0fe31 PRESENTATION FORM FOR VERTICAL EM DASH
  //   U+0fe32 PRESENTATION FORM FOR VERTICAL EN DASH
  //   U+0fe58 SMALL EM DASH
  //   U+0fe63 SMALL HYPHEN-MINUS
  //   U+0ff0d FULLWIDTH HYPHEN-MINUS
  ucfPrDash = 4,
  // - For a machine-readable list of deprecated characters. No characters will ever be removed
  //   from the standard, but the usage of deprecated characters is strongly discouraged.
  ucfPrDeprecated = 8,
  // - Characters that linguistically modify the meaning of another character to which they apply.
  //   Some diacritics are not combining characters, and some combining characters are not diacritics.
  ucfPrDiacritic = 0x10,
  // - Characters whose principal function is to extend the value or shape of a preceding alphabetic
  //   character. Typical of these are length and iteration marks.
  ucfPrExtender = 0x20,
  // - Used in determining default grapheme cluster boundaries. For more information, see UAX #29: Text Boundaries.
  ucfPrGraphemeLink = 0x40,
  // - Characters commonly used for the representation of hexadecimal numbers, plus their compatibility equivalents.
  //   [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}]
  ucfPrHexDigit = 0x80,
  // - Those dashes used to mark connections between pieces of words, plus the Katakana middle dot.
  //   The Katakana middle dot functions like a hyphen, but is shaped like a dot rather than a dash.
  //   U+0002d HYPHEN-MINUS
  //   U+000ad SOFT HYPHEN
  //   U+0058a ARMENIAN HYPHEN
  //   U+01806 MONGOLIAN TODO SOFT HYPHEN
  //   U+02010 HYPHEN
  //   U+02011 NON-BREAKING HYPHEN
  //   U+02e17 DOUBLE OBLIQUE HYPHEN
  //   U+030fb KATAKANA MIDDLE DOT
  //   U+0fe63 SMALL HYPHEN-MINUS
  //   U+0ff0d FULLWIDTH HYPHEN-MINUS
  //   U+0ff65 HALFWIDTH KATAKANA MIDDLE DOT
  ucfPrHyphen = 0x100,
  // - Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) ideographs.
  ucfPrIdeographic = 0x200,
  // - Those format control characters which have specific functions for control of cursive joining and ligation.
  ucfPrJoinControl = 0x400,
  // - There are a small number of characters that do not use logical order.
  //   These characters require special handling in most processing.
  ucfPrLogicalOrderException = 0x800,
  // - Code points that are permanently reserved for internal use.
  ucfPrNoncharacterCodePoint = 0x1000,
  // - Used for pattern syntax as described in UAX #31: Identifier and Pattern Syntax.
  ucfPrPatternSyntax = 0x2000,
  ucfPrPatternWhiteSpace = 0x4000,
  // - Those punctuation characters that function as quotation marks.
  //   U+00022 QUOTATION MARK
  //   U+00027 APOSTROPHE
  //   U+000ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
  //   U+000bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
  //   U+02018 LEFT SINGLE QUOTATION MARK
  //   U+02019 RIGHT SINGLE QUOTATION MARK
  //   U+0201a SINGLE LOW-9 QUOTATION MARK
  //   U+0201b SINGLE HIGH-REVERSED-9 QUOTATION MARK
  //   U+0201c LEFT DOUBLE QUOTATION MARK
  //   U+0201d RIGHT DOUBLE QUOTATION MARK
  //   U+0201e DOUBLE LOW-9 QUOTATION MARK
  //   U+0201f DOUBLE HIGH-REVERSED-9 QUOTATION MARK
  //   U+02039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  //   U+0203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  //   U+0300c LEFT CORNER BRACKET
  //   U+0300d RIGHT CORNER BRACKET
  //   U+0300e LEFT WHITE CORNER BRACKET
  //   U+0300f RIGHT WHITE CORNER BRACKET
  //   U+0301d REVERSED DOUBLE PRIME QUOTATION MARK
  //   U+0301e DOUBLE PRIME QUOTATION MARK
  //   U+0301f LOW DOUBLE PRIME QUOTATION MARK
  //   U+0fe41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
  //   U+0fe42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
  //   U+0fe43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
  //   U+0fe44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
  //   U+0ff02 FULLWIDTH QUOTATION MARK
  //   U+0ff07 FULLWIDTH APOSTROPHE
  //   U+0ff62 HALFWIDTH LEFT CORNER BRACKET
  //   U+0ff63 HALFWIDTH RIGHT CORNER BRACKET
  ucfPrQuotationMark = 0x8000,
  // - Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear.
  //   An explicit _dot above_ can be added where required, such as in Lithuanian.
  ucfPrSoftDotted = 0x10000,
  // - Sentence Terminal. Used in UAX #29: Text Boundaries.
  //   U+00021 EXCLAMATION MARK
  //   U+0002e FULL STOP
  //   U+0003f QUESTION MARK
  //   U+0203c DOUBLE EXCLAMATION MARK
  //   U+0203d INTERROBANG
  //   U+02047 DOUBLE QUESTION MARK
  //   U+02048 QUESTION EXCLAMATION MARK
  //   U+02049 EXCLAMATION QUESTION MARK
  //   U+03002 IDEOGRAPHIC FULL STOP
  //   [plus many characters from other writing systems]
  ucfPrSTerm = 0x20000,
  // - Those punctuation characters that generally mark the end of textual units.
  //   [JB note: this set contains more characters than STerm. For example, it contains
  //   the comma, colon and semicolon, whereas STerm doesn't.]
  //   U+00021 EXCLAMATION MARK
  //   U+0002c COMMA
  //   U+0002e FULL STOP
  //   U+0003a COLON
  //   U+0003b SEMICOLON
  //   U+0003f QUESTION MARK
  //   U+0203c DOUBLE EXCLAMATION MARK
  //   U+0203d INTERROBANG
  //   U+02047 DOUBLE QUESTION MARK
  //   U+02048 QUESTION EXCLAMATION MARK
  //   U+02049 EXCLAMATION QUESTION MARK
  //   [plus *lots* of characters from other writing systems]
  ucfPrTerminalPunctuation = 0x40000,
  // - Indicates all those characters that qualify as Variation Selectors.
  //   For details on the behavior of these characters, see StandardizedVariants.html and
  //   Section 16.4, Variation Selectors in [Unicode].
  ucfPrVariationSelector = 0x80000,
  // - Those separator characters and control characters which should be treated by
  //   programming languages as "white space" for the purpose of parsing elements.
  //   Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included,
  //   since their functions are restricted to line-break control.
  //   Their names are unfortunately misleading in this respect.
  //   Note: There are other senses of "whitespace" that encompass a different set of characters.
  //   [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt.
  //   There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.]
  //   This includes the following characters:
  //   U+0009 <control>
  //   U+000a <control>
  //   U+000b <control>
  //   U+000c <control>
  //   U+000d <control>
  //   U+0020 SPACE
  //   U+0085 <control>
  //   U+00a0 NO-BREAK SPACE
  //   U+1680 OGHAM SPACE MARK
  //   U+180e MONGOLIAN VOWEL SEPARATOR
  //   U+2000 EN QUAD
  //   U+2001 EM QUAD
  //   U+2002 EN SPACE
  //   U+2003 EM SPACE
  //   U+2004 THREE-PER-EM SPACE
  //   U+2005 FOUR-PER-EM SPACE
  //   U+2006 SIX-PER-EM SPACE
  //   U+2007 FIGURE SPACE
  //   U+2008 PUNCTUATION SPACE
  //   U+2009 THIN SPACE
  //   U+200a HAIR SPACE
  //   U+2028 LINE SEPARATOR
  //   U+2029 PARAGRAPH SEPARATOR
  //   U+202f NARROW NO-BREAK SPACE
  //   U+205f MEDIUM MATHEMATICAL SPACE
  //   U+3000 IDEOGRAPHIC SPACE
  ucfPrWhiteSpace = 0x100000
}
TUniChProperties;

typedef enum TUniChPropertiesX_
{
  // More properties from PropList.txt.
  // - Used to derive the properties in DerivedCoreProperties.txt.
  ucfPxOtherAlphabetic = 1,
  ucfPxOtherDefaultIgnorableCodePoint = 2,
  ucfPxOtherGraphemeExtend = 4,
  ucfPxOtherIdContinue = 8,
  ucfPxOtherIdStart = 0x10,
  ucfPxOtherLowercase = 0x20,
  ucfPxOtherMath = 0x40,
  ucfPxOtherUppercase = 0x80,
  // - Used in ideographic description sequences.
  ucfPxIdsBinaryOperator = 0x100,
  ucfPxIdsTrinaryOperator = 0x200,
  ucfPxRadical = 0x400,
  ucfPxUnifiedIdeograph = 0x800
}
TUniChPropertiesX;

//-----------------------------------------------------------------------------
// TUniChInfo -- contains information about a single Unicode codepoint
//-----------------------------------------------------------------------------

class TUniChInfo
{
public:
  enum { // combining classes (for 'combClass'); from UnicodeData.txt
    ccStarter = 0,                 // 0: Spacing, split, enclosing, reordrant, and Tibetan subjoined
    ccOverlaysAndInterior = 1,
    ccNuktas = 7,
    ccHiraganaKatakanaVoicingMarks = 8,
    ccViramas = 9,
    ccFixedPositionStart = 10,     // Start of fixed position classes
    ccFixedPositionEnd = 199,      // End of fixed position classes
    ccBelowLeftAttached = 200,
    ccBelowAttached = 202,
    ccBelowRightAttached = 204,
    ccLeftAttached = 208,          // Left attached (reordrant around single base character)
    ccRightAttached = 210,
    ccAboveLeftAttached = 212,
    ccAboveAttached = 214,
    ccAboveRightAttached = 216,
    ccBelowLeft = 218,
    ccBelow = 220,
    ccBelowRight = 222,
    ccLeft = 224,                  // Left (reordrant around single base character)
    ccRight = 226,
    ccAboveLeft = 228,
    ccAbove = 230,
    ccAboveRight = 232,
    ccDoubleBelow = 233,
    ccDoubleAbove = 234,
    ccBelowIotaSubscript = 240,    // Below (iota subscript)
    ccInvalid = 255                // not defined by Unicode
  };
  char chCat, chSubCat;      // chCat + chSubCat together comprise the general category (from UnicodeData.txt)
  uchar combClass;           // canonical combining class
  TUniChCategory cat;        // = TUniChCategory(chCat)
  TUniChSubCategory subCat;  // = TUniChSubCategory(cat << 8 | subCat)
  signed char script;        // keyId into 'TUniChDb.scriptNames'; -1 if unknown
  int simpleUpperCaseMapping, simpleLowerCaseMapping, simpleTitleCaseMapping; // from UnicodeData.txt
  int decompOffset;          // offset into 'TUniChDb.decompositions'; or -1 if the character doesn't change during decomposition
  int nameOffset;            // offset into 'TUniChDb.charNames'
  int flags;                 // a combination of TUniChFlags
  int properties;            // a combination of TUniChProperties
  int propertiesX;           // a combination of TUniChPropertiesX
  ushort lineBreak;          // from LineBreak.txt

  // Converts a 2-letter linebreak code into a 16-bit integer.
  static inline ushort GetLineBreakCode(char c1, char c2) { return ((ushort(uchar(c1)) & 0xff) << 8) | ((ushort(uchar(c2)) & 0xff)); }
  static const ushort LineBreak_Unknown, LineBreak_ComplexContext, LineBreak_Numeric, LineBreak_InfixNumeric, LineBreak_Quotation;

public:
  void InitAfterLoad() {
    cat = (TUniChCategory) chCat;
    subCat = (TUniChSubCategory) (((int(uchar(chCat)) & 0xff) << 8) | (int(uchar(chSubCat)) & 0xff)); }
  void SetCatAndSubCat(const TUniChSubCategory catAndSubCat) {
    cat = (TUniChCategory) ((int(catAndSubCat) >> 8) & 0xff);
    subCat = catAndSubCat;
    chCat = (char) cat; chSubCat = (char) (int(subCat) & 0xff); }
  friend class TUniChDb;

  // Inexplicably missing from TSIn/TSOut...
  static inline void LoadUShort(TSIn& SIn, ushort& u) { SIn.LoadBf(&u, sizeof(u)); }
  static inline void LoadSChar(TSIn& SIn, signed char& u) { SIn.LoadBf(&u, sizeof(u)); }
  static inline void SaveUShort(TSOut& SOut, ushort u) { SOut.SaveBf(&u, sizeof(u)); }
  static inline void SaveSChar(TSOut& SOut, signed char u) { SOut.SaveBf(&u, sizeof(u)); }

public:
  void Save(TSOut& SOut) const {
    SOut.Save(chCat); SOut.Save(chSubCat); SOut.Save(combClass); SaveSChar(SOut, script);
    SOut.Save(simpleUpperCaseMapping); SOut.Save(simpleLowerCaseMapping); SOut.Save(simpleTitleCaseMapping);
    SOut.Save(decompOffset); SOut.Save(nameOffset);
    SOut.Save(flags); SOut.Save(properties); SOut.Save(propertiesX); SaveUShort(SOut, lineBreak); }
  void Load(TSIn& SIn) {
    SIn.Load(chCat); SIn.Load(chSubCat); SIn.Load(combClass); LoadSChar(SIn, script);
    SIn.Load(simpleUpperCaseMapping); SIn.Load(simpleLowerCaseMapping); SIn.Load(simpleTitleCaseMapping);
    SIn.Load(decompOffset); SIn.Load(nameOffset);
    SIn.Load(flags); SIn.Load(properties); SIn.Load(propertiesX); LoadUShort(SIn, lineBreak); InitAfterLoad(); }
  explicit TUniChInfo(TSIn& SIn) { Load(SIn); }
  TUniChInfo() : chCat(char(ucOther)), chSubCat(char(ucOtherNotAssigned & 0xff)), combClass(ccInvalid),
    script(-1), simpleUpperCaseMapping(-1), simpleLowerCaseMapping(-1), simpleTitleCaseMapping(-1),
    decompOffset(-1), nameOffset(-1), flags(0), properties(0), propertiesX(0), lineBreak(LineBreak_Unknown) {
    InitAfterLoad(); }

  // DerivedCoreProperties flags.
  bool IsDcpFlag(const TUniChFlags flag) const { Assert((flag & ucfDcpMask) == flag); return (flags & flag) == flag; }
  void ClrDcpFlags() { flags = flags & ~ucfDcpMask; }
  void SetDcpFlag(const TUniChFlags flag) { Assert((flag & ucfDcpMask) == flag); flags |= flag; }
  bool IsAlphabetic() const { return IsDcpFlag(ucfDcpAlphabetic); }
  bool IsUppercase() const { return IsDcpFlag(ucfDcpUppercase); }
  bool IsLowercase() const { return IsDcpFlag(ucfDcpLowercase); }
  bool IsMath() const { return IsDcpFlag(ucfDcpMath); }
  bool IsDefaultIgnorable() const { return IsDcpFlag(ucfDcpDefaultIgnorableCodePoint); }
  bool IsGraphemeBase() const { return IsDcpFlag(ucfDcpGraphemeBase); }
  bool IsGraphemeExtend() const { return IsDcpFlag(ucfDcpGraphemeExtend); }
  bool IsIdStart() const { return IsDcpFlag(ucfDcpIdStart); }
  bool IsIdContinue() const { return IsDcpFlag(ucfDcpIdContinue); }
  bool IsXidStart() const { return IsDcpFlag(ucfDcpXidStart); }
  bool IsXidContinue() const { return IsDcpFlag(ucfDcpXidContinue); }

  // PropList.txt flags.
  bool IsProperty(const TUniChProperties flag) const { return (properties & flag) == flag; }
  void SetProperty(const TUniChProperties flag) { properties |= flag; }
  bool IsAsciiHexDigit() const { return IsProperty(ucfPrAsciiHexDigit); }
  bool IsBidiControl() const { return IsProperty(ucfPrBidiControl); }
  bool IsDash() const { return IsProperty(ucfPrDash); }
  bool IsDeprecated() const { return IsProperty(ucfPrDeprecated); }
  bool IsDiacritic() const { return IsProperty(ucfPrDiacritic); }
  bool IsExtender() const { return IsProperty(ucfPrExtender); }
  bool IsGraphemeLink() const { return IsProperty(ucfPrGraphemeLink); }
  bool IsHexDigit() const { return IsProperty(ucfPrHexDigit); }
  bool IsHyphen() const { return IsProperty(ucfPrHyphen); }
  bool IsIdeographic() const { return IsProperty(ucfPrIdeographic); }
  bool IsJoinControl() const { return IsProperty(ucfPrJoinControl); }
  bool IsLogicalOrderException() const { return IsProperty(ucfPrLogicalOrderException); }
  bool IsNoncharacter() const { return IsProperty(ucfPrNoncharacterCodePoint); }
  bool IsQuotationMark() const { return IsProperty(ucfPrQuotationMark); }
  bool IsSoftDotted() const { return IsProperty(ucfPrSoftDotted); }
  bool IsSTerminal() const { return IsProperty(ucfPrSTerm); }
  bool IsTerminalPunctuation() const { return IsProperty(ucfPrTerminalPunctuation); }
  bool IsVariationSelector() const { return IsProperty(ucfPrVariationSelector); }
  bool IsWhiteSpace() const { return IsProperty(ucfPrWhiteSpace); }

  // Additional PropList.txt flags.
  bool IsPropertyX(const TUniChPropertiesX flag) const { return (propertiesX & flag) == flag; }
  void SetPropertyX(const TUniChPropertiesX flag) { propertiesX |= flag; }

  // Miscellaneous flags.
  bool IsCompositionExclusion() const { return (flags & ucfCompositionExclusion) == ucfCompositionExclusion; }
  bool IsCompatibilityDecomposition() const { return (flags & ucfCompatibilityDecomposition) == ucfCompatibilityDecomposition; }

  // Word-boundary flags.
  bool IsWbFlag(const TUniChFlags flag) const { Assert((flag & ucfWbMask) == flag); return (flags & flag) == flag; }
  void ClrWbAndSbFlags() { flags = flags & ~(ucfWbMask | ucfSbMask); }
  void SetWbFlag(const TUniChFlags flag) { Assert((flag & ucfWbMask) == flag); flags |= flag; }
  int GetWbFlags() const { return flags & ucfWbMask; }
  bool IsWbFormat() const { return IsWbFlag(ucfWbFormat); }
  TStr GetWbFlagsStr() const { return GetWbFlagsStr(GetWbFlags()); }
  static TStr GetWbFlagsStr(const int flags) { return TStr("") + (flags & ucfWbALetter ? "A" : "") +
    (flags & ucfWbFormat ? "F" : "") + (flags & ucfWbKatakana ? "K" : "") + (flags & ucfWbMidLetter ? "M" : "") +
    (flags & ucfWbMidNum ? "m" : "") + (flags & ucfWbNumeric ? "N" : "") + (flags & ucfWbExtendNumLet ? "E" : ""); }

  // Sentence-boundary flags.
  bool IsSbFlag(const TUniChFlags flag) const { Assert((flag & ucfSbMask) == flag); return (flags & flag) == flag; }
  void SetSbFlag(const TUniChFlags flag) { Assert((flag & ucfSbMask) == flag); flags |= flag; }
  int GetSbFlags() const { return flags & ucfSbMask; }
  bool IsSbFormat() const { return IsSbFlag(ucfSbFormat); }
  TStr GetSbFlagsStr() const { return GetSbFlagsStr(GetSbFlags()); }
  static TStr GetSbFlagsStr(const int flags) { return TStr("") + (flags & ucfSbSep ? "S" : "") +
    (flags & ucfSbFormat ? "F" : "") + (flags & ucfSbSp ? "_" : "") + (flags & ucfSbLower ? "L" : "") +
    (flags & ucfSbUpper ? "U" : "") + (flags & ucfSbOLetter ? "O" : "") + (flags & ucfSbNumeric ? "N" : "") +
    (flags & ucfSbATerm ? "A" : "") + (flags & ucfSbSTerm ? "T" : "") + (flags & ucfSbClose ? "C" : ""); }

  bool IsSbSep() const { return (flags & ucfSbSep) == ucfSbSep; }

  // Grapheme-boundary flags.
  bool IsGbExtend() const { return IsGraphemeExtend(); }

  // Sec. 3.13, D47: C is cased iff it is uppercase, lowercase, or general_category == titlecase_letter.
  bool IsCased() const { return IsUppercase() || IsLowercase() || (subCat == ucLetterTitlecase); }

  // Character categories.
  TUniChCategory GetCat() const { return (TUniChCategory) cat; }
  TUniChSubCategory GetSubCat() const { return (TUniChSubCategory) subCat; }
  // The following characters belong to the 'symbol/currency' subcategory:
  //   U+00024 DOLLAR SIGN
  //   U+000a2 CENT SIGN
  //   U+000a3 POUND SIGN
  //   U+000a4 CURRENCY SIGN
  //   U+000a5 YEN SIGN
  //   U+020a3 FRENCH FRANC SIGN
  //   U+020a4 LIRA SIGN
  //   U+020ac EURO SIGN
  //   [and plenty of others]
  bool IsCurrency() const { return subCat == ucSymbolCurrency; }
  // Note: most private-use and surrogate characters aren't listed explicitly in UnicodeData.txt.
  // Thus, it's better to call TUniChDb's versions of these methods, which are aware of
  // the full ranges of private-use and surrogate characters.
  bool IsPrivateUse() const { return subCat == ucOtherPrivateUse; }
  bool IsSurrogate() const { return subCat == ucOtherSurrogate; }

  inline static bool IsValidSubCat(const char chCat, const char chSubCat) {
    static const char s[] = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoSmScSkSoZsZlZpCcCfCsCoCn";
    for (const char *p = s; *p; p += 2)
      if (chCat == p[0] && chSubCat == p[1]) return true;
    return false; }
};
int keyId = roots.GetKeyId(TItemTr(last, butLast, butButLast)); 01205 if (keyId < 0) return 0; else return roots[keyId]; } 01206 int GetChild(const int parentIdx, const TItem& item) const { 01207 for (int childIdx = nodes[parentIdx].child; childIdx >= 0; ) { 01208 const TNode &node = nodes[childIdx]; 01209 if (node.item == item) return childIdx; 01210 childIdx = node.sib; } 01211 return -1; } 01212 bool IsNodeTerminal(const int nodeIdx) const { return nodes[nodeIdx].terminal; } 01213 01214 // Adds a new string to the trie. Note that the last characters appear 01215 // closer to the root of the trie. 01216 template<typename TSrcVec> 01217 void Add(const TSrcVec& src, const size_t srcIdx, const size_t srcCount) 01218 { 01219 IAssert(srcCount > 0); 01220 if (srcCount == 1) { singles.AddKey(TItem(src[TVecIdx(srcIdx)])); return; } 01221 if (srcCount == 2) { pairs.AddKey(TItemPr(TItem(src[TVecIdx(srcIdx + 1)]), TItem(src[TVecIdx(srcIdx)]))); return; } 01222 size_t srcLast = srcIdx + (srcCount - 1); 01223 TItemTr tr = TItemTr(TItem(src[TVecIdx(srcLast)]), TItem(src[TVecIdx(srcLast - 1)]), TItem(src[TVecIdx(srcLast - 2)])); 01224 int keyId = roots.GetKeyId(tr), curNodeIdx = -1; 01225 if (keyId >= 0) curNodeIdx = roots[keyId]; 01226 else { curNodeIdx = nodes.Add(TNode(TItem(0), -1, -1, false)); roots.AddDat(tr, curNodeIdx); } 01227 // 01228 if (srcCount > 3) for (size_t srcPos = srcLast - 3; ; ) 01229 { 01230 const TItem curItem = src[TVecIdx(srcPos)]; 01231 int childNodeIdx = nodes[curNodeIdx].child; 01232 while (childNodeIdx >= 0) { 01233 TNode &childNode = nodes[childNodeIdx]; 01234 if (childNode.item == curItem) break; 01235 childNodeIdx = childNode.sib; } 01236 if (childNodeIdx < 0) { 01237 childNodeIdx = nodes.Add(TNode(curItem, -1, nodes[curNodeIdx].child, false)); 01238 nodes[curNodeIdx].child = childNodeIdx; } 01239 curNodeIdx = childNodeIdx; 01240 if (srcPos == srcIdx) break; else srcPos--; 01241 } 01242 nodes[curNodeIdx].terminal = true; 01243 } 01244 01245 template<typename TSrcVec> 01246 void Add(const TSrcVec& src) { Add(src, 0, (size_t) src.Len()); } 01247 }; 01248 01249 //----------------------------------------------------------------------------- 01250 // TUniChDb -- provides access to the Unicode Character Database 01251 //----------------------------------------------------------------------------- 01252 01253 class TUniChDb 01254 { 01255 protected: 01256 void InitAfterLoad(); 01257 typedef TUniVecIdx TVecIdx; 01258 01259 public: 01260 THash<TInt, TUniChInfo> h; // key: codepoint 01261 TStrPool charNames; 01262 TStrIntH scripts; // keyID = used in TUniChInfo.script; key = script name; dat = number of characters (informative only) 01263 TIntV decompositions; 01264 THash<TIntPr, TInt> inverseDec; 01265 TUniCaseFolding caseFolding; 01266 // These hash tables contain only the unconditional mappings from SpecialCasing.txt. 01267 // The conditional mappings are hardcoded into GetCaseConverted(). 
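// Illustrative note (editorial, not from the original header): a typical conditional mapping in SpecialCasing.txt is GREEK CAPITAL LETTER SIGMA (U+03A3), which lowercases to the final form U+03C2 at the end of a word and to U+03C3 elsewhere; context-dependent rules of this kind cannot be stored as plain codepoint-to-codepoint table entries.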
01268 TIntIntVH specialCasingLower, specialCasingUpper, specialCasingTitle; 01269 int scriptUnknown; // = scripts.GetKey("Unknown") 01270 01271 TUniChDb() : scriptUnknown(-1) { } 01272 explicit TUniChDb(TSIn& SIn) { Load(SIn); } 01273 void Clr() { 01274 h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr(); 01275 specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr(); 01276 scripts.Clr(); } 01277 void Save(TSOut& SOut) const { 01278 h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut); 01279 inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut); 01280 specialCasingLower.Save(SOut); specialCasingUpper.Save(SOut); specialCasingTitle.Save(SOut); 01281 SOut.SaveCs(); } 01282 void Load(TSIn& SIn) { 01283 h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn); 01284 decompositions.Load(SIn); 01285 inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn); 01286 specialCasingLower.Load(SIn); specialCasingUpper.Load(SIn); specialCasingTitle.Load(SIn); 01287 SIn.LoadCs(); InitAfterLoad(); } 01288 void LoadBin(const TStr& fnBin) { 01289 PSIn SIn = TFIn::New(fnBin); Load(*SIn); } 01290 void Test(const TStr& basePath); 01291 01292 // File names used by LoadTxt() and its subroutines. 01293 static TStr GetCaseFoldingFn() { return "CaseFolding.txt"; } 01294 static TStr GetSpecialCasingFn() { return "SpecialCasing.txt"; } 01295 static TStr GetUnicodeDataFn() { return "UnicodeData.txt"; } 01296 static TStr GetCompositionExclusionsFn() { return "CompositionExclusions.txt"; } 01297 static TStr GetScriptsFn() { return "Scripts.txt"; } 01298 static TStr GetDerivedCorePropsFn() { return "DerivedCoreProperties.txt"; } 01299 static TStr GetLineBreakFn() { return "LineBreak.txt"; } 01300 static TStr GetPropListFn() { return "PropList.txt"; } 01301 static TStr GetAuxiliaryDir() { return "auxiliary"; } 01302 static TStr GetWordBreakTestFn() { return "WordBreakTest.txt"; } 01303 static TStr GetWordBreakPropertyFn() { return "WordBreakProperty.txt"; } 01304 static TStr GetSentenceBreakTestFn() { return "SentenceBreakTest.txt"; } 01305 static TStr GetSentenceBreakPropertyFn() { return "SentenceBreakProperty.txt"; } 01306 static TStr GetNormalizationTestFn() { return "NormalizationTest.txt"; } 01307 static TStr GetBinFn() { return "UniChDb.bin"; } // used only by Test() 01308 01309 //------------------------------------------------------------------------- 01310 // Script names 01311 //------------------------------------------------------------------------- 01312 01313 // These constants are used when initializing from the text files. 
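// Illustrative sketch (editorial; 'ucd' is a hypothetical, already loaded TUniChDb instance) of typical script lookups:
//   int katakanaId = ucd.GetScriptByName(TUniChDb::GetScriptNameKatakana());
//   int s = ucd.GetScript(0x30a2);            // U+30A2 KATAKANA LETTER A -> katakanaId
//   const TStr& name = ucd.GetScriptName(s);  // "Katakana"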
01314 static TStr GetScriptNameUnknown() { return "Unknown"; } 01315 static TStr GetScriptNameKatakana() { return "Katakana"; } 01316 static TStr GetScriptNameHiragana() { return "Hiragana"; } 01317 // 01318 const TStr& GetScriptName(const int scriptId) const { return scripts.GetKey(scriptId); } 01319 int GetScriptByName(const TStr& scriptName) const { return scripts.GetKeyId(scriptName); } 01320 int GetScript(const TUniChInfo& ci) const { int s = ci.script; if (s < 0) s = scriptUnknown; return s; } 01321 int GetScript(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return scriptUnknown; else return GetScript(h[i]); } 01322 01323 //------------------------------------------------------------------------- 01324 // Character names 01325 //------------------------------------------------------------------------- 01326 01327 // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234". 01328 const char *GetCharName(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; int ofs = h[i].nameOffset; return ofs < 0 ? 0 : charNames.GetCStr(ofs); } 01329 TStr GetCharNameS(const int cp) const { 01330 // ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16). 01331 const char *p = GetCharName(cp); if (p) return p; 01332 char buf[20]; sprintf(buf, "U+%04x", cp); return TStr(buf); } 01333 template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, size_t srcIdx, const size_t srcCount, const TStr& prefix) const { 01334 if (! f) f = stdout; 01335 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) { 01336 fprintf(f, "%s", prefix.CStr()); 01337 int cp = src[TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ? "U+%05x" : "U+%04x "), cp); 01338 fprintf(f, " %s\n", GetCharNameS(cp).CStr()); }} 01339 template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, const TStr& prefix) const { PrintCharNames(f, src, 0, src.Len(), prefix); } 01340 01341 //------------------------------------------------------------------------- 01342 // Character information 01343 //------------------------------------------------------------------------- 01344 // These methods provide access to a subset of the functionality 01345 // available in TUniChInfo.
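// Illustrative sketch (editorial; 'ucd' is a hypothetical, already loaded TUniChDb instance):
//   ucd.GetSubCat(0x20ac);     // ucSymbolCurrency (U+20AC EURO SIGN)
//   ucd.GetCharNameS(0x20ac);  // "EURO SIGN"
//   ucd.GetCharNameS(0xe000);  // "U+e000" -- private-use codepoints carry no name in UnicodeData.txt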
01346 01347 bool IsGetChInfo(const int cp, TUniChInfo& ChInfo) { 01348 int i = h.GetKeyId(cp); 01349 if (i < 0) return false; else { ChInfo=h[i]; return true; }} 01350 TUniChCategory GetCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOther; else return h[i].cat; } 01351 TUniChSubCategory GetSubCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOtherNotAssigned; else return h[i].subCat; } 01352 01353 bool IsWbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsWbFlag(flag); } 01354 int GetWbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetWbFlags(); } 01355 bool IsSbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsSbFlag(flag); } 01356 int GetSbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetSbFlags(); } 01357 01358 #define ___UniFwd1(name) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); } 01359 #define ___UniFwd2(name1, name2) ___UniFwd1(name1) ___UniFwd1(name2) 01360 #define ___UniFwd3(name1, name2, name3) ___UniFwd2(name1, name2) ___UniFwd1(name3) 01361 #define ___UniFwd4(name1, name2, name3, name4) ___UniFwd3(name1, name2, name3) ___UniFwd1(name4) 01362 #define ___UniFwd5(name1, name2, name3, name4, name5) ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5) 01363 01364 #define DECLARE_FORWARDED_PROPERTY_METHODS \ 01365 ___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \ 01366 ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic) \ 01367 ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted) \ 01368 ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace) \ 01369 ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable) \ 01370 ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue) \ 01371 ___UniFwd2(IsXidStart, IsXidContinue) \ 01372 ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep) \ 01373 ___UniFwd1(IsGbExtend) \ 01374 ___UniFwd2(IsCased, IsCurrency) 01375 01376 DECLARE_FORWARDED_PROPERTY_METHODS 01377 01378 #undef ___UniFwd1 01379 01380 bool IsPrivateUse(const int cp) const { 01381 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsPrivateUse(); 01382 return (0xe000 <= cp && cp <= 0xf8ff) || // plane 0 private-use area 01383 // Planes 15 and 16 are entirely for private use. 01384 (0xf0000 <= cp && cp <= 0xffffd) || (0x100000 <= cp && cp <= 0x10fffd); } 01385 // Note: d800..dbff are high surrogates, dc00..dfff are low surrogates. 01386 // For db80..dbff it is clear that the surrogate pair containing this high surrogate 01387 // will refer to a private-use codepoint, but IsPrivateUse nevertheless returns false 01388 // for db80..dbff. This is consistent with the category codes assigned in UnicodeData.txt. 01389 bool IsSurrogate(const int cp) const { 01390 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsSurrogate(); 01391 return 0xd800 <= cp && cp <= 0xdfff; } 01392 01393 // Note: in particular, all Hangul characters (HangulLBase..HangulLBase + HangulLCount - 1 01394 // and HangulSBase..HangulSBase + HangulSCount - 1) should be treated as starters 01395 // for composition to work correctly.
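// Illustrative note (editorial): for example, GetCombiningClass(0x0301) (COMBINING ACUTE ACCENT) yields the canonical combining class 230, whereas base letters such as 'A' and any codepoint missing from the hash table report the starter class.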
01396 int GetCombiningClass(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return TUniChInfo::ccStarter; else return h[i].combClass; } 01397 01398 //------------------------------------------------------------------------- 01399 // Hangul constants 01400 //------------------------------------------------------------------------- 01401 01402 enum { 01403 HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7, 01404 HangulLCount = 19, HangulVCount = 21, HangulTCount = 28, 01405 HangulNCount = HangulVCount * HangulTCount, // 588 01406 HangulSCount = HangulLCount * HangulNCount // 11172 01407 }; 01408 01409 //------------------------------------------------------------------------- 01410 // Word boundaries (UAX #29) 01411 //------------------------------------------------------------------------- 01412 01413 protected: 01414 // UAX #29, rule WB3: ignore Format and Extend characters. 01415 // [Note: rule SB5 for sentence boundaries is identical, and thus these methods will also be used for sentence-boundary detection.] 01416 static bool IsWbIgnored(const TUniChInfo& ci) { return ci.IsGbExtend() || ci.IsWbFormat(); } 01417 bool IsWbIgnored(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return IsWbIgnored(h[i]); } 01418 // Sets 'position' to the smallest index from 'position..srcEnd-1' that contains a non-ignored character. 01419 template<typename TSrcVec> void WbFindCurOrNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01420 while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01421 // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character. 01422 template<typename TSrcVec> void WbFindNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01423 if (position >= srcEnd) return; 01424 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01425 // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character. 01426 template<typename TSrcVec> void WbFindNextNonIgnoredS(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01427 if (position >= srcEnd) return; 01428 if (IsSbSep(src[TVecIdx(position)])) { position++; return; } 01429 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01430 // Sets 'position' to the largest index from 'srcStart..position-1' that contains a non-ignored character. 01431 template<typename TSrcVec> bool WbFindPrevNonIgnored(const TSrcVec& src, const size_t srcStart, size_t& position) const { 01432 if (position <= srcStart) return false; 01433 while (position > srcStart) { 01434 position--; if (! IsWbIgnored(src[TVecIdx(position)])) return true; } 01435 return false; } 01436 // Test driver for WbFind*NonIgnored. 01437 void TestWbFindNonIgnored(const TIntV& src) const; 01438 void TestWbFindNonIgnored() const; 01439 public: 01440 // Finds the next word boundary strictly after 'position'. 01441 // Note that there is a valid word boundary at 'srcIdx + srcCount'. 01442 // If there is no such word boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'. 
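// Illustrative sketch (editorial; 'ucd' and 'src' are hypothetical) of a loop that visits every word boundary after 'srcIdx':
//   size_t pos = srcIdx;
//   while (ucd.FindNextWordBoundary(src, srcIdx, srcCount, pos)) {
//     /* a word boundary lies immediately before src[pos]; the boundary at
//        srcIdx + srcCount is reported last, after which the call returns false */ }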
01443 template<typename TSrcVec> 01444 bool FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const; 01445 // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a word 01446 // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'. Note that 'dest[0]' and 'dest[srcCount]' are 01447 // always set to 'true'. 01448 template<typename TSrcVec> 01449 void FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const; 01450 protected: 01451 void TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence); 01452 01453 //------------------------------------------------------------------------- 01454 // Sentence boundaries (UAX #29) 01455 //------------------------------------------------------------------------- 01456 01457 protected: 01458 TUniTrie<TInt> sbExTrie; 01459 01460 // Checks whether a sentence that ended at src[position - 1] 01461 // would end in one of the suffixes from sbExTrie. 01462 template<typename TSrcVec> 01463 bool CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const; 01464 01465 public: 01466 // Finds the next sentence boundary strictly after 'position'. 01467 // Note that there is a valid sentence boundary at 'srcIdx + srcCount'. 01468 // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'. 01469 template<typename TSrcVec> 01470 bool FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const; 01471 // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a sentence 01472 // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'. Note that 'dest[0]' and 'dest[srcCount]' are 01473 // always set to 'true'. 01474 template<typename TSrcVec> 01475 void FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const; 01476 01477 // These methods allow the user to define a set of sentence boundary exceptions. 01478 // This is a set of strings, stored in 'sbExTrie'. If the Unicode rules require 01479 // a sentence boundary in a position that would cause the sentence to end with 01480 // 'x (STerm | ATerm) Close* Sp* Sep?', where 'x' is a word from 'sbExTrie', 01481 // we will *not* place a sentence boundary there. 01482 // 01483 // NOTE: sbExTrie is not saved or loaded by the Save() and Load() methods. 01484 // By default, it is empty. Use SbEx_Clr() to clear it, and SbEx_SetStdEnglish() to obtain 01485 // a standard set of English-language exceptions. 
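// Illustrative note (editorial): for instance, after SbEx_SetStdEnglish() the trie contains "Dr", so the full stop in "Dr. Smith arrived." no longer introduces a sentence boundary after the abbreviation, while the one after "arrived." still does.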
01486 void SbEx_Clr() { sbExTrie.Clr(); } 01487 template<class TSrcVec> void SbEx_Add(const TSrcVec& v) { sbExTrie.Add(v); } 01488 // template<> void SbEx_Add(const TStr& s) { 01489 void SbEx_Add(const TStr& s) { 01490 TIntV v; int n = s.Len(); v.Gen(n); for (int i = 0; i < n; i++) v[i] = int(uchar(s[i])); SbEx_Add(v); } 01491 void SbEx_AddUtf8(const TStr& s) { TUniCodec codec; TIntV v; codec.DecodeUtf8(s, v); SbEx_Add(v); } 01492 int SbEx_AddMulti(const TStr& words, const bool wordsAreUtf8 = true) { TStrV vec; words.SplitOnAllCh('|', vec); 01493 for (int i = 0; i < vec.Len(); i++) if (wordsAreUtf8) SbEx_AddUtf8(vec[i]); else SbEx_Add(vec[i]); 01494 return vec.Len(); } 01495 void SbEx_Set(const TUniTrie<TInt>& newTrie) { sbExTrie = newTrie; } 01496 int SbEx_SetStdEnglish() { 01497 static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g.|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv"; 01498 SbEx_Clr(); return SbEx_AddMulti(data, false); } 01499 01500 //------------------------------------------------------------------------- 01501 // Normalization, decomposition, etc. (UAX #15) 01502 //------------------------------------------------------------------------- 01503 01504 protected: 01505 // Adds, to 'dest', the decomposition of 'codePoint' (calling itself recursively if necessary). 01506 // If 'compatibility == false', only canonical decompositions are used. 01507 template<typename TDestCh> 01508 void AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const; 01509 public: 01510 // This appends, to 'dest', the decomposed form of the source string. 01511 // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false; 01512 // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true. 01513 template<typename TSrcVec, typename TDestCh> 01514 void Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01515 TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const; 01516 template<typename TSrcVec, typename TDestCh> 01517 void Decompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const { 01518 Decompose(src, 0, src.Len(), dest, compatibility, clrDest); } 01519 // This performs canonical composition on the source string, and appends 01520 // the result to the destination string. The source string should be the 01521 // result of a (canonical or compatibility) decomposition; if this is the 01522 // case, the composition will lead to a normalization form C (NFC) or 01523 // normalization form KC (NFKC), depending on whether canonical or compatibility 01524 // decomposition was used. 01525 template<typename TSrcVec, typename TDestCh> 01526 void Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01527 TVec<TDestCh>& dest, bool clrDest = true) const; 01528 template<typename TSrcVec, typename TDestCh> 01529 void Compose(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const { 01530 Compose(src, 0, src.Len(), dest, clrDest); } 01531 // Calls Decompose, followed by Compose; thus the result is the NFC (if 01532 // compatibility == false) or NFKC (if compatibility == true) of the source string. 01533 // A temporary TIntV is used to contain the intermediate NF(K)D form of the 01534 // source string. 
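// Illustrative sketch (editorial; 'ucd' is a hypothetical, already loaded TUniChDb instance):
//   TIntV src; src.Add(0x0041); src.Add(0x030a);   // 'A' + COMBINING RING ABOVE
//   TIntV nfc; ucd.DecomposeAndCompose(src, nfc, false);
//   // nfc now holds the single codepoint 0x00c5 (LATIN CAPITAL LETTER A WITH RING ABOVE).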
01535 template<typename TSrcVec, typename TDestCh> 01536 void DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01537 TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const; 01538 template<typename TSrcVec, typename TDestCh> 01539 void DecomposeAndCompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const { 01540 DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); } 01541 // Copies the starter characters from 'src' to 'dest'; the other 01542 // characters are skipped. 'src' should already have been decomposed. 01543 // Returns the number of characters extracted. 01544 template<typename TSrcVec, typename TDestCh> 01545 size_t ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01546 TVec<TDestCh>& dest, bool clrDest = true) const; 01547 template<typename TSrcVec, typename TDestCh> 01548 size_t ExtractStarters(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const { 01549 return ExtractStarters(src, 0, src.Len(), dest, clrDest); } 01550 // Extracts the starters into a temporary vector and then copies it into 'src'. 01551 template<typename TSrcVec> 01552 size_t ExtractStarters(TSrcVec& src) const { 01553 TIntV temp; size_t retVal = ExtractStarters(src, temp); 01554 src.Clr(); for (int i = 0; i < temp.Len(); i++) src.Add(temp[i]); 01555 return retVal; } 01556 01557 protected: 01558 void TestComposition(const TStr& basePath); 01559 01560 //------------------------------------------------------------------------- 01561 // Initialization from the text files 01562 //------------------------------------------------------------------------- 01563 01564 protected: 01565 void InitWordAndSentenceBoundaryFlags(const TStr& basePath); 01566 void InitScripts(const TStr& basePath); 01567 void InitLineBreaks(const TStr& basePath); 01568 void InitDerivedCoreProperties(const TStr& basePath); 01569 void InitPropList(const TStr& basePath); 01570 void InitSpecialCasing(const TStr& basePath); 01571 void LoadTxt_ProcessDecomposition(TUniChInfo& ci, TStr s); 01572 public: 01573 void LoadTxt(const TStr& basePath); 01574 void SaveBin(const TStr& fnBinUcd); 01575 01576 //------------------------------------------------------------------------- 01577 // Case conversions 01578 //------------------------------------------------------------------------- 01579 01580 public: 01581 typedef enum TCaseConversion_ { ccLower = 0, ccUpper = 1, ccTitle = 2, ccMax = 3 } TCaseConversion; 01582 // Appends the case-converted form of 'src' to 'dest'. 01583 // 'how' defines what kind of case conversion is required. 01584 // 'turkic' should be set to true iff the text is in Turkish ('tr') or Azeri ('az'). 01585 // 'lithuanian' should be set to true iff the text is in Lithuanian ('lt').
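// Illustrative note (editorial): with turkic == true the dotted and dotless i are kept apart: lowercasing U+0049 (I) gives U+0131 (dotless i) and uppercasing U+0069 (i) gives U+0130 (I with dot above), rather than the plain i/I pair.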
01586 template<typename TSrcVec, typename TDestCh> void GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const; 01587 template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); } 01588 template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); } 01589 template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); } 01590 template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01591 template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01592 template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01593 01594 // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt). 01595 // This is simpler and faster. Since each character now maps into exactly one 01596 // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.). 
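// Illustrative note (editorial): the difference shows up e.g. for U+00DF LATIN SMALL LETTER SHARP S: its full uppercase mapping is the two-character string "SS", which only the full GetCaseConverted/GetUpperCase methods can produce, while the simple mapping leaves the character unchanged and can therefore be applied in place.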
01597 template<typename TSrcVec, typename TDestCh> void GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const; 01598 template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); } 01599 template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); } 01600 template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); } 01601 template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); } 01602 template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); } 01603 template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); } 01604 01605 template<typename TSrcVec> void ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const; 01606 template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); } 01607 template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); } 01608 template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); } 01609 template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src) const { ToSimpleUpperCase(src, 0, src.Len()); } 01610 template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src) const { ToSimpleLowerCase(src, 0, src.Len()); } 01611 template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src) const { ToSimpleTitleCase(src, 0, src.Len()); } 01612 01613 public: 01614 friend class TUniCaseFolding; 01615 01616 // Case folding is an alternative to the above functions. It is intended primarily 01617 // to produce strings that are suitable for comparisons. For example, 01618 // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma; 01619 // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma. 01620 // - 'turkic' enables special processing for Turkic languages (I-dot and i-dotless). 01621 // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped 01622 // into a string of two or more characters. 01623 // - Note: For best results, perform NFD(CaseFold(NFD(x)) or NFKD(CaseFold(NFKD(x)) on 01624 // each string before comparing them (see sec. 3.13 of the standard).
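// Illustrative sketch (editorial; 'FoldedEq' is a hypothetical helper) of a case-insensitive comparison built on GetCaseFolded:
//   bool FoldedEq(const TUniChDb& ucd, const TIntV& a, const TIntV& b) {
//     TIntV fa, fb; ucd.GetCaseFolded(a, fa); ucd.GetCaseFolded(b, fb);
//     return fa == fb; }
// For best results the inputs should additionally be normalized as described above.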
01625 template<typename TSrcVec, typename TDestCh> 01626 void GetCaseFolded(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01627 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic = false) const { caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); } 01628 template<typename TSrcVec, typename TDestCh> 01629 void GetCaseFolded(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool full = true, const bool turkic = false) const { 01630 GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); } 01631 // ToCaseFolded folds the string in place. However, this means that only the simple 01632 // case foldings can be used (the full ones could increase the length of the string). 01633 template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic = false) const { caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); } 01634 template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, const bool turkic = false) const { ToCaseFolded(src, 0, src.Len(), turkic); } 01635 01636 protected: 01637 void TestCaseConversion(const TStr& source, const TStr& trueLc, const TStr& trueTc, const TStr& trueUc, bool turkic, bool lithuanian); 01638 void TestCaseConversions(); 01639 01640 //------------------------------------------------------------------------- 01641 // Text file reader for the Unicode character database 01642 //------------------------------------------------------------------------- 01643 01644 protected: 01645 01646 class TUcdFileReader 01647 { 01648 protected: 01649 TChA buf; 01650 public: 01651 TChA comment; // contains '#' and everything after it 01652 protected: 01653 FILE *f; 01654 int putBackCh; 01655 int GetCh() { 01656 if (putBackCh >= 0) { int c = putBackCh; putBackCh = EOF; return c; } 01657 return fgetc(f); } 01658 void PutBack(int c) { Assert(putBackCh == EOF); putBackCh = c; } 01659 // Returns 'false' iff the EOF was encountered before anything was read. 01660 bool ReadNextLine() { 01661 buf.Clr(); comment.Clr(); 01662 bool inComment = false, first = true; 01663 while (true) { 01664 int c = GetCh(); 01665 if (c == EOF) return ! first; 01666 else if (c == 13) { 01667 c = GetCh(); if (c != 10) PutBack(c); 01668 return true; } 01669 else if (c == 10) return true; 01670 else if (c == '#') inComment = true; 01671 if (! inComment) buf += char(c); 01672 else comment += char(c); } 01673 /*first = false;*/} 01674 private: 01675 TUcdFileReader& operator = (const TUcdFileReader& r) { Fail; return *((TUcdFileReader *) 0); } 01676 TUcdFileReader(const TUcdFileReader& r) { Fail; } 01677 public: 01678 TUcdFileReader() : f(0) { } 01679 TUcdFileReader(const TStr& fileName) : f(0), putBackCh(EOF) { Open(fileName); } 01680 void Open(const TStr& fileName) { Close(); f = fopen(fileName.CStr(), "rt"); IAssertR(f, fileName); putBackCh = EOF; } 01681 void Close() { putBackCh = EOF; if (f) { fclose(f); f = 0; }} 01682 ~TUcdFileReader() { Close(); } 01683 bool GetNextLine(TStrV& dest) { 01684 dest.Clr(); 01685 while (true) { 01686 if (! 
ReadNextLine()) return false; 01687 TStr line = buf; line.ToTrunc(); 01688 if (line.Len() <= 0) continue; 01689 line.SplitOnAllCh(';', dest, false); 01690 for (int i = 0; i < dest.Len(); i++) dest[i].ToTrunc(); 01691 return true; }} 01692 static int ParseCodePoint(const TStr& s) { 01693 int c; bool ok = s.IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); return c; } 01694 static void ParseCodePointList(const TStr& s, TIntV& dest, bool ClrDestP = true) { // space-separated list 01695 if (ClrDestP) dest.Clr(); 01696 TStrV parts; s.SplitOnWs(parts); 01697 for (int i = 0; i < parts.Len(); i++) { 01698 int c; bool ok = parts[i].IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); 01699 dest.Add(c); } } 01700 static void ParseCodePointRange(const TStr& s, int& from, int &to) { // xxxx or xxxx..yyyy 01701 int i = s.SearchStr(".."); if (i < 0) { from = ParseCodePoint(s); to = from; return; } 01702 from = ParseCodePoint(s.GetSubStr(0, i - 1)); 01703 to = ParseCodePoint(s.GetSubStr(i + 2, s.Len() - 1)); } 01704 }; 01705 01706 //------------------------------------------------------------------------- 01707 // Helper class for processing the text files 01708 //------------------------------------------------------------------------- 01709 // Files such as DerivedCoreProps.txt often refer to ranges of codepoints, 01710 // and not all codepoints from the range have also been listed in 01711 // UnicodeData.txt. Thus, new TUniChInfo instances will be created 01712 // when processing DerivedCoreProps.txt and similar files. 01713 // To assign the correct (sub)categories to these new codepoints, 01714 // the following class will extract the subcategory info from the 01715 // comments in DerivedCoreProps.txt and similar files. 01716 01717 class TSubcatHelper 01718 { 01719 public: 01720 bool hasCat; TUniChSubCategory subCat; 01721 TStrH invalidCatCodes; 01722 TUniChDb &owner; 01723 01724 TSubcatHelper(TUniChDb &owner_) : owner(owner_) { } 01725 01726 void ProcessComment(TUniChDb::TUcdFileReader &reader) 01727 { 01728 hasCat = false; subCat = ucOtherNotAssigned; 01729 if (reader.comment.Len() > 3) 01730 { 01731 IAssert(reader.comment[0] == '#'); 01732 IAssert(reader.comment[1] == ' '); 01733 char chCat = reader.comment[2], chSubCat = reader.comment[3]; 01734 if (reader.comment.Len() > 4) IAssert(isspace(uchar(reader.comment[4]))); 01735 if (TUniChInfo::IsValidSubCat(chCat, chSubCat)) { 01736 hasCat = true; subCat = (TUniChSubCategory) ((int(uchar(chCat)) << 8) | (int(uchar(chSubCat)))); } 01737 else invalidCatCodes.AddKey(TStr(chCat) + TStr(chSubCat)); 01738 } 01739 } 01740 01741 void SetCat(const int cp) { 01742 int i = owner.h.GetKeyId(cp); IAssert(i >= 0); 01743 IAssert(owner.h[i].subCat == ucOtherNotAssigned); 01744 IAssert(hasCat); 01745 owner.h[i].SetCatAndSubCat(subCat); } 01746 void TestCat(const int cp) { 01747 if (! hasCat) return; 01748 int i = owner.h.GetKeyId(cp); IAssert(i >= 0); 01749 IAssert(owner.h[i].subCat == subCat); } 01750 01751 ~TSubcatHelper() 01752 { 01753 if (invalidCatCodes.IsKey("L&")) invalidCatCodes.DelKey("L&"); 01754 // Output any unexpected ones (there shouldn't be any). 01755 if (! 
invalidCatCodes.Empty()) { 01756 printf("Invalid cat code(s) in the comments: "); 01757 for (int i = invalidCatCodes.FFirstKeyId(); invalidCatCodes.FNextKeyId(i); ) 01758 printf(" \"%s\"", invalidCatCodes.GetKey(i).CStr()); 01759 printf("\n"); } 01760 } 01761 }; 01762 }; 01763 01764 //----------------------------------------------------------------------------- 01765 // TUnicode -- a sadly emasculated wrapper around TUniCodec and TUniChDb 01766 //----------------------------------------------------------------------------- 01767 01768 class TUnicode 01769 { 01770 public: 01771 TUniCodec codec; 01772 TUniChDb ucd; 01773 01774 TUnicode() { Init(); } 01775 explicit TUnicode(const TStr& fnBinUcd) { ucd.LoadBin(fnBinUcd); Init(); } 01776 void Init() { InitCodecs(); } 01777 01778 //----------------------------------------------------------------------- 01779 // UTF-8 01780 //----------------------------------------------------------------------- 01781 01782 // Returns the number of characters that have been successfully decoded. 01783 // This does not include any replacement characters that may have been inserted into 'dest'. 01784 int DecodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); } 01785 int DecodeUtf8(const TStr& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); } 01786 01787 // Returns the number of characters that have been successfully encoded. 01788 // This does not include any replacement characters that may have been inserted into 'dest'. 01789 int EncodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.EncodeUtf8(src, dest); } 01790 01791 // The following wrapper around the UTF-8 encoder returns a TStr containing 01792 // the UTF-8-encoded version of the input string. 01793 TStr EncodeUtf8Str(const TIntV& src) const { return codec.EncodeUtf8Str(src); } 01794 01795 //----------------------------------------------------------------------- 01796 // UTF-16 Decoder 01797 //----------------------------------------------------------------------- 01798 01799 // Returns the number of characters that have been successfully decoded. 01800 // This does not include any replacement characters that may have been inserted into 'dest'. 01801 // Each element of 'src' is assumed to contain one byte of data. 01802 // srcCount must be even (though srcIdx doesn't need to be). 01803 int DecodeUtf16FromBytes(const TIntV& src, TIntV& dest, 01804 const TUtf16BomHandling bomHandling = bomAllowed, 01805 const TUniByteOrder defaultByteOrder = boMachineEndian) const { 01806 return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); } 01807 01808 // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings 01809 // are used to determine if the two bytes of each word should be swapped before further 01810 // processing. For example, if a BOM is present, it must have the value 0xfeff; if it 01811 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped. 01812 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the 01813 // beginning of the source data is used to determine the "original" byte order of the data; 01814 // if this doesn't match the byte order of the local machine, the two bytes of each word will 01815 // be swapped during the decoding process. 
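// Illustrative sketch (editorial; 'uni' is a hypothetical TUnicode instance) of a round trip through the word-oriented UTF-16 interface:
//   TIntV cps, words, back;
//   cps.Add(0x20ac); cps.Add(0x1d11e);          // EURO SIGN, MUSICAL SYMBOL G CLEF
//   uni.EncodeUtf16ToWords(cps, words, true);   // BOM, 0x20ac, surrogate pair 0xd834 0xdd1e
//   uni.DecodeUtf16FromWords(words, back);      // back should again hold 0x20ac, 0x1d11e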
01816 int DecodeUtf16FromWords(const TIntV& src, TIntV& dest, 01817 const TUtf16BomHandling bomHandling = bomAllowed, 01818 const TUniByteOrder defaultByteOrder = boMachineEndian) const { 01819 return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); } 01820 01821 //----------------------------------------------------------------------- 01822 // UTF-16 Encoder 01823 //----------------------------------------------------------------------- 01824 01825 // Returns the number of characters that have been successfully encoded. 01826 // This does not include any replacement characters that may have been inserted into 'dest'. 01827 int EncodeUtf16ToWords(const TIntV& src, TIntV& dest, const bool insertBom, 01828 const TUniByteOrder destByteOrder = boMachineEndian) const { 01829 return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); } 01830 01831 int EncodeUtf16ToBytes(const TIntV& src, TIntV& dest, const bool insertBom, 01832 const TUniByteOrder destByteOrder = boMachineEndian) const { 01833 return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); } 01834 01835 //----------------------------------------------------------------------- 01836 // 8-bit codecs 01837 //----------------------------------------------------------------------- 01838 01839 T8BitCodec<TEncoding_ISO8859_1> iso8859_1; 01840 T8BitCodec<TEncoding_ISO8859_2> iso8859_2; 01841 T8BitCodec<TEncoding_ISO8859_3> iso8859_3; 01842 T8BitCodec<TEncoding_ISO8859_4> iso8859_4; 01843 T8BitCodec<TEncoding_YuAscii> yuAscii; 01844 T8BitCodec<TEncoding_CP1250> cp1250; 01845 T8BitCodec<TEncoding_CP852> cp852; 01846 T8BitCodec<TEncoding_CP437> cp437; 01847 01848 //----------------------------------------------------------------------- 01849 // Codec registry 01850 //----------------------------------------------------------------------- 01851 // If you know you'll need ISO-8859-2, just use 01852 // TUnicode unicode; 01853 // unicode.iso8859_2.Encode(...); 01854 // If you don't know what you'll need, use: 01855 // TUnicode unicode; 01856 // PCodecBase myCodec = unicode.GetCodec(myCodecName); 01857 // myCodec->Encode(...); 01858 // Note that the first approach is slightly more efficient because there 01859 // aren't any virtual method calls involved. 01860 01861 protected: 01862 THash<TStr, PCodecBase> codecs; 01863 static inline TStr NormalizeCodecName(const TStr& name) { 01864 TStr s = name.GetLc(); s.ChangeStrAll("_", ""); s.ChangeStrAll("-", ""); return s; } 01865 public: 01866 void RegisterCodec(const TStr& nameList, const PCodecBase& codec) { 01867 TStrV names; nameList.SplitOnWs(names); 01868 for (int i = 0; i < names.Len(); i++) 01869 codecs.AddDat(NormalizeCodecName(names[i]), codec); } 01870 void UnregisterCodec(const TStr& nameList) { 01871 TStrV names; nameList.SplitOnWs(names); 01872 for (int i = 0; i < names.Len(); i++) 01873 codecs.DelKey(NormalizeCodecName(names[i])); } 01874 void ClrCodecs() { codecs.Clr(); } 01875 void InitCodecs(); 01876 PCodecBase GetCodec(const TStr& name) const { 01877 TStr s = NormalizeCodecName(name); 01878 PCodecBase p; if (! codecs.IsKeyGetDat(s, p)) p.Clr(); 01879 return p; } 01880 void GetAllCodecs(TCodecBaseV& dest) const { 01881 dest.Clr(); 01882 for (int i = codecs.FFirstKeyId(); codecs.FNextKeyId(i); ) { 01883 PCodecBase codec = codecs[i]; bool found = false; 01884 for (int j = 0; j < dest.Len(); j++) if (dest[j]() == codec()) { found = true; break; } 01885 if (! 
found) dest.Add(codec); }} 01886 01887 //------------------------------------------------------------------------- 01888 // Word boundaries (UAX #29) 01889 //------------------------------------------------------------------------- 01890 01891 // Finds the next word boundary strictly after 'position'. 01892 // Note that there are valid word boundaries at 0 and at 'src.Len()'. 01893 // If there is no such word boundary, it returns 'false' and sets 'position' to 'src.Len()'. 01894 bool FindNextWordBoundary(const TIntV& src, int &position) const { 01895 if (position < 0) { position = 0; return true; } 01896 size_t position_; bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; } 01897 // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a word 01898 // boundary between 'src[i - 1]' and 'src[i]'. Note that 'dest[0]' and 'dest[src.Len()]' are 01899 // always set to 'true'. 01900 void FindWordBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindWordBoundaries(src, 0, src.Len(), dest); } 01901 01902 //------------------------------------------------------------------------- 01903 // Sentence boundaries (UAX #29) 01904 //------------------------------------------------------------------------- 01905 01906 // Finds the next sentence boundary strictly after 'position'. 01907 // Note that there are valid sentence boundaries at 0 and at 'src.Len()'. 01908 // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'src.Len()'. 01909 bool FindNextSentenceBoundary(const TIntV& src, int &position) const { 01910 if (position < 0) { position = 0; return true; } 01911 size_t position_; bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; } 01912 // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a sentence 01913 // boundary between 'src[i - 1]' and 'src[i]'. Note that 'dest[0]' and 'dest[src.Len()]' are 01914 // always set to 'true'. 01915 void FindSentenceBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); } 01916 01917 void ClrSentenceBoundaryExceptions() { ucd.SbEx_Clr(); } 01918 void UseEnglishSentenceBoundaryExceptions() { ucd.SbEx_SetStdEnglish(); } 01919 01920 //------------------------------------------------------------------------- 01921 // Normalization, decomposition, etc. (UAX #15) 01922 //------------------------------------------------------------------------- 01923 01924 // This sets 'dest' to the decomposed form of the source string. 01925 // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false; 01926 // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true. 01927 void Decompose(const TIntV& src, TIntV& dest, bool compatibility) const { ucd.Decompose(src, dest, compatibility, true); } 01928 // This performs canonical composition on the source string, and stores 01929 // the result in the destination vector. The source string should be the 01930 // result of a (canonical or compatibility) decomposition; if this is the 01931 // case, the composition will lead to a normalization form C (NFC) or 01932 // normalization form KC (NFKC), depending on whether canonical or compatibility 01933 // decomposition was used. 
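// Illustrative note (editorial): for example, Decompose of { 0x00e9 } (LATIN SMALL LETTER E WITH ACUTE) yields { 0x0065, 0x0301 }; passing that result to Compose yields { 0x00e9 } again, i.e. the NFC form.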
01934 void Compose(const TIntV& src, TIntV& dest) const { return ucd.Compose(src, dest, true); } 01935 // Calls Decompose, followed by Compose; thus the result is the NFC (if 01936 // compatibility == false) or NFKC (if compatibility == true) of the source string. 01937 // A temporary TIntV is used to contain the intermediate NF(K)D form of the 01938 // source string. 01939 void DecomposeAndCompose(const TIntV& src, TIntV& dest, bool compatibility) const { return ucd.DecomposeAndCompose(src, dest, compatibility); } 01940 // Copies the starter characters from 'src' to 'dest'; the other 01941 // characters are skipped. 'src' should already have been decomposed. 01942 // Returns the number of characters extracted. This function can be 01943 // used to remove diacritical marks from a string (after it has been decomposed!). 01944 int ExtractStarters(const TIntV& src, TIntV& dest) const { return (int) ucd.ExtractStarters(src, dest); } 01945 // Extracts the starters into a temporary vector and then copies it into 'src'. 01946 int ExtractStarters(TIntV& src) const { return (int) ucd.ExtractStarters(src); } 01947 01948 //------------------------------------------------------------------------- 01949 // Case conversions 01950 //------------------------------------------------------------------------- 01951 // NOTE: if you will be dealing with Turkish, Azeri or Lithuanian text, 01952 // use the case-conversion methods in TUniChDb, which allow the caller 01953 // to request language-specific case mappings for these languages. 01954 01955 public: 01956 typedef TUniChDb::TCaseConversion TCaseConversion; 01957 // Sets 'dest' to the case-converted form of 'src'. 01958 void GetLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetLowerCase(src, dest, true, false, false); } 01959 void GetUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetUpperCase(src, dest, true, false, false); } 01960 void GetTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetTitleCase(src, dest, true, false, false); } 01961 01962 // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt). 01963 // This is simpler and faster. Since each character now maps into exactly one 01964 // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.). 01965 void GetSimpleLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleLowerCase(src, dest, true); } 01966 void GetSimpleUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleUpperCase(src, dest, true); } 01967 void GetSimpleTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleTitleCase(src, dest, true); } 01968 01969 // These functions perform simple case-conversions in-place. 01970 void ToSimpleUpperCase(TIntV& src) const { ucd.ToSimpleUpperCase(src); } 01971 void ToSimpleLowerCase(TIntV& src) const { ucd.ToSimpleLowerCase(src); } 01972 void ToSimpleTitleCase(TIntV& src) const { ucd.ToSimpleTitleCase(src); } 01973 01974 // Case folding is an alternative to the above functions. It is intended primarily 01975 // to produce strings that are suitable for comparisons. For example, 01976 // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma; 01977 // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma. 01978 // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped 01979 // into a string of two or more characters. 01980 // - Note: For best results, perform NFD(CaseFold(NFD(x)) or NFKD(CaseFold(NFKD(x)) on 01981 // each string before comparing them (see sec. 
3.13 of the standard). 01982 void GetCaseFolded(const TIntV& src, TIntV& dest, const bool full = true) const { return ucd.GetCaseFolded(src, dest, true, full, false); } 01983 // ToCaseFolded folds the string in place. However, this means that only the simple 01984 // case foldings can be used (the full ones could increase the length of the string). 01985 void ToCaseFolded(TIntV& src) const { return ucd.ToCaseFolded(src, false); } 01986 01987 TStr GetUtf8CaseFolded(const TStr& s) const { 01988 bool isAscii = true; 01989 for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; } 01990 if (isAscii) return s.GetLc(); 01991 TIntV src; DecodeUtf8(s, src); 01992 TIntV dest; GetCaseFolded(src, dest); 01993 return EncodeUtf8Str(dest); } 01994 01995 //------------------------------------------------------------------------- 01996 // Character properties 01997 //------------------------------------------------------------------------- 01998 // These methods simply call the corresponding TUniChDb method 01999 // (which typically calls the corresponding method of TUniChInfo). 02000 // See the declaration for DECLARE_FORWARDED_PROPERTY_METHODS for a complete list. 02001 // They are all of the form bool IsXxxx(const int cp) const 02002 // Some of the more notable ones include: 02003 // - IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsAsciiHexDigit 02004 // IsDash, IsDeprecated, IsDiacritic, IsHexDigit, IsHyphen, IsIdeographic 02005 // IsNoncharacter, IsQuotationMark, IsSoftDotted, IsTerminalPunctuation, IsWhiteSpace 02006 02007 #define ___UniFwd1(name) bool name(const int cp) const { return ucd.name(cp); } 02008 DECLARE_FORWARDED_PROPERTY_METHODS 02009 #undef DECLARE_FORWARDED_PROPERTY_METHODS 02010 #undef __UniFwd1 02011 ___UniFwd2(IsPrivateUse, IsSurrogate) 02012 02013 TUniChCategory GetCat(const int cp) const { return ucd.GetCat(cp); } 02014 TUniChSubCategory GetSubCat(const int cp) const { return ucd.GetSubCat(cp); } 02015 02016 // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234". 02017 const char *GetCharName(const int cp) const { return ucd.GetCharName(cp); } 02018 TStr GetCharNameS(const int cp) const { return ucd.GetCharNameS(cp); } 02019 02020 }; 02021 02022 //----------------------------------------------------------------------------- 02023 // TUniCodec -- UTF-8 Decoder 02024 //----------------------------------------------------------------------------- 02025 02026 // Returns the number of characters that have been successfully decoded. 02027 // This does not include any replacement characters that may have been inserted into 'dest'. 02028 template<typename TSrcVec, typename TDestCh> 02029 size_t TUniCodec::DecodeUtf8( 02030 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02031 TVec<TDestCh>& dest, const bool clrDest) const 02032 { 02033 size_t nDecoded = 0; 02034 if (clrDest) dest.Clr(); 02035 const size_t origSrcIdx = srcIdx; 02036 const size_t srcEnd = srcIdx + srcCount; 02037 while (srcIdx < srcEnd) 02038 { 02039 const size_t charSrcIdx = srcIdx; 02040 uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++; 02041 if ((c & _1000_0000) == 0) { 02042 // c is one of the characters 0..0x7f, encoded as a single byte. 02043 dest.Add(TDestCh(c)); nDecoded++; continue; } 02044 else if ((c & _1100_0000) == _1000_0000) { 02045 // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx. 02046 // We must have been thrown into the middle of a multi-byte character. 
02047 switch (errorHandling) { 02048 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx."); 02049 case uehAbort: return nDecoded; 02050 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02051 case uehIgnore: continue; 02052 default: Fail; } } 02053 else 02054 { 02055 // c introduces a sequence of 2..6 bytes, depending on how many 02056 // of the most significant bits of c are set. 02057 uint nMoreBytes = 0, nBits = 0, minVal = 0; 02058 if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80; 02059 else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800; 02060 else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000; 02061 else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000; 02062 else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000; 02063 else { 02064 // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8 02065 // (which allowed the encoding of codepoints up to 2^31 - 1). However, in principle this 02066 // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh 02067 // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh. 02068 if (strict) { 02069 switch (errorHandling) { 02070 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x."); 02071 case uehAbort: return nDecoded; 02072 // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes 02073 // and try to decode the character. Then, since 'strict' is true and 02074 // the codepoint is clearly >= 2^31, we'll notice this as an error later 02075 // and (in the case of uehReplace) insert a replacement character then. 02076 // This is probably better than inserting a replacement character right 02077 // away and then trying to read the next byte as if a new character 02078 // was beginning there -- if the current byte is really followed by five 02079 // 10xxxxxx bytes, we'll just get six replacement characters in a row. 02080 case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue; 02081 case uehIgnore: break; // continue; 02082 default: Fail; } } 02083 nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; } 02084 // Decode this multi-byte sequence. 02085 uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c. 02086 bool cancel = false; 02087 for (uint i = 0; i < nMoreBytes && ! cancel; i++) { 02088 // See if there are enough bytes left in the source vector. 02089 if (! (srcIdx < srcEnd)) { 02090 switch (errorHandling) { 02091 case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available."); 02092 case uehAbort: return nDecoded; 02093 case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue; 02094 case uehIgnore: cancel = true; continue; 02095 default: Fail; } } 02096 // Read the next byte. 02097 c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++; 02098 if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx. 
02099 switch (errorHandling) { 02100 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx."); 02101 case uehAbort: return nDecoded; 02102 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue; 02103 case uehIgnore: srcIdx--; cancel = true; continue; 02104 default: Fail; } } 02105 cOut <<= 6; cOut |= (c & _0011_1111); } 02106 if (cancel) continue; 02107 if (strict) { 02108 // err1: This codepoint has been represented by more bytes than it should have been. 02109 // For example, cOut in the range 0..127 should be represented by a single byte, 02110 // not by two or more bytes. 02111 // - For example, this may happen in the "modified UTF-8" sometimes used for Java 02112 // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid 02113 // the appearance of null bytes in the encoded stream. 02114 bool err1 = (cOut < minVal); 02115 // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes. 02116 // However, later this was restricted to the codepoints 0..0x10ffff only, because only these 02117 // are valid Unicode codepoints. Thus, no more than 4 bytes are ever necessary. 02118 bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff)); 02119 if (err1 || err2) switch (errorHandling) { 02120 case uehThrow: 02121 if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ")."); 02122 else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid."); 02123 else { Fail; break; } 02124 case uehAbort: return nDecoded; 02125 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02126 case uehIgnore: continue; 02127 default: Fail; } } 02128 // Add the decoded codepoint to the destination vector. 02129 // If this is the first decoded character, and it's one of the byte-order marks 02130 // (0xfffe and 0xfeff), we will skip it (unless skipBom is false). 02131 if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) { 02132 dest.Add(cOut); nDecoded++; } 02133 } // else (multi-byte sequence) 02134 } // while 02135 return nDecoded; 02136 } 02137 02138 //----------------------------------------------------------------------- 02139 // TUniCodec -- UTF-8 Encoder 02140 //----------------------------------------------------------------------- 02141 02142 // Returns the number of characters that have been successfully encoded. 02143 // This does not include any replacement characters that may have been inserted into 'dest'. 
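// Usage sketch (an illustration for this reference; the local names 'codec', 's',
// 'codePoints' and 'utf8Bytes' are hypothetical, and only methods defined in this file
// are used):
//
//   TUniCodec codec;
//   TStr s = "na\xc3\xafve";   // UTF-8 input; \xc3 \xaf encode U+00EF
//   TIntV codePoints, utf8Bytes;
//   codec.DecodeUtf8(s, 0, s.Len(), codePoints, true);                   // bytes -> codepoints 6e 61 ef 76 65
//   codec.EncodeUtf8(codePoints, 0, codePoints.Len(), utf8Bytes, true);  // codepoints -> the original bytes
//
// The number of bytes emitted per codepoint follows from the ranges tested below:
// 1 byte for c < 0x80, 2 for c < 0x800, 3 for c < 0x10000, 4 for c < 0x200000,
// 5 for c < 0x4000000 and 6 otherwise; with strict == true, anything above 0x10ffff is
// treated as an error instead. For example, U+20AC falls into the three-byte range and
// is emitted as e2 82 ac.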
02144 template<typename TSrcVec, typename TDestCh> 02145 size_t TUniCodec::EncodeUtf8( 02146 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02147 TVec<TDestCh>& dest, const bool clrDest) const 02148 { 02149 size_t nEncoded = 0; 02150 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) 02151 { 02152 uint c = uint(src[TVecIdx(srcIdx)]); 02153 bool err = false; 02154 if (strict && c > 0x10ffff) { 02155 err = true; 02156 switch (errorHandling) { 02157 case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed)."); 02158 case uehAbort: return nEncoded; 02159 case uehReplace: c = replacementChar; break; 02160 case uehIgnore: continue; 02161 default: Fail; } } 02162 if (c < 0x80u) 02163 dest.Add(TDestCh(c & 0xffu)); 02164 else if (c < 0x800u) { 02165 dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111))); 02166 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02167 else if (c < 0x10000u) { 02168 dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111))); 02169 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); 02170 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02171 else if (c < 0x200000u) { 02172 dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111))); 02173 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111))); 02174 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); 02175 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02176 else if (c < 0x4000000u) { 02177 dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011))); 02178 dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111))); 02179 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111))); 02180 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); 02181 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02182 else { 02183 dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011))); 02184 dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111))); 02185 dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111))); 02186 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111))); 02187 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111))); 02188 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); } 02189 if (! err) nEncoded++; 02190 } 02191 return nEncoded; 02192 } 02193 02194 //----------------------------------------------------------------------- 02195 // TUniCodec -- UTF-16 Decoder 02196 //----------------------------------------------------------------------- 02197 02198 // Returns the number of characters that have been successfully decoded. 02199 // This does not include any replacement characters that may have been inserted into 'dest'. 02200 // Each element of 'src' is assumed to contain one byte of data. 02201 // srcCount must be even (though srcIdx doesn't need to be).
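// For illustration (hypothetical local names): the byte sequence ff fe ac 20 is a
// little-endian BOM followed by U+20AC, so
//
//   TUniCodec codec;
//   TIntV bytes; bytes.Add(0xff); bytes.Add(0xfe); bytes.Add(0xac); bytes.Add(0x20);
//   TIntV cps;
//   codec.DecodeUtf16FromBytes(bytes, 0, bytes.Len(), cps, true, bomAllowed, boMachineEndian);
//
// detects the little-endian byte order from the BOM and (with skipBom set) leaves the
// single codepoint 0x20ac in 'cps'.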
02202 template<typename TSrcVec, typename TDestCh> 02203 size_t TUniCodec::DecodeUtf16FromBytes( 02204 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02205 TVec<TDestCh>& dest, const bool clrDest, 02206 const TUtf16BomHandling bomHandling, 02207 const TUniByteOrder defaultByteOrder) const 02208 { 02209 IAssert(srcCount % 2 == 0); 02210 IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored); 02211 IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian); 02212 if (clrDest) dest.Clr(); 02213 size_t nDecoded = 0; 02214 if (srcCount <= 0) return nDecoded; 02215 const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; 02216 bool littleEndian = false; 02217 bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian())); 02218 if (bomHandling == bomIgnored) littleEndian = leDefault; 02219 else if (bomHandling == bomAllowed || bomHandling == bomRequired) 02220 { 02221 int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; 02222 if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; } 02223 else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; } 02224 else if (bomHandling == bomAllowed) littleEndian = leDefault; 02225 else { // Report an error. 02226 switch (errorHandling) { 02227 case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead)."); 02228 case uehAbort: case uehReplace: case uehIgnore: return size_t(-1); 02229 default: Fail; } } 02230 } 02231 else Fail; 02232 while (srcIdx < srcEnd) 02233 { 02234 const size_t charSrcIdx = srcIdx; 02235 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2; 02236 uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8)); 02237 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) 02238 { 02239 // c is the first character in a surrogate pair. Read the next character. 02240 if (! (srcIdx + 2 <= srcEnd)) { 02241 switch (errorHandling) { 02242 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing."); 02243 case uehAbort: return nDecoded; 02244 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02245 case uehIgnore: continue; 02246 default: Fail; } } 02247 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2; 02248 uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8)); 02249 // c2 should be the second character of the surrogate pair. 02250 if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) { 02251 switch (errorHandling) { 02252 case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." 
+ TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + "."); 02253 case uehAbort: return nDecoded; 02254 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character 02255 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue; 02256 case uehIgnore: srcIdx -= 2; continue; 02257 default: Fail; } } 02258 // c and c2 each contain 10 bits of information. 02259 uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate); 02260 cc += 0x10000; 02261 dest.Add(TDestCh(cc)); nDecoded++; continue; 02262 } 02263 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) { 02264 switch (errorHandling) { 02265 case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair."); 02266 case uehAbort: return nDecoded; 02267 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02268 case uehIgnore: continue; 02269 default: Fail; } } 02270 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it. 02271 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue; 02272 // Otherwise, store 'c' to the destination vector. 02273 dest.Add(TDestCh(c)); nDecoded++; 02274 } 02275 return nDecoded; 02276 } 02277 02278 // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings 02279 // are used to determine if the two bytes of each word should be swapped before further 02280 // processing. For example, if a BOM is present, it must have the value 0xfeff; if it 02281 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped. 02282 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the 02283 // beginning of the source data is used to determine the "original" byte order of the data; 02284 // if this doesn't match the byte order of the local machine, the two bytes of each word will 02285 // be swapped during the decoding process. 02286 template<typename TSrcVec, typename TDestCh> 02287 size_t TUniCodec::DecodeUtf16FromWords( 02288 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02289 TVec<TDestCh>& dest, bool clrDest, 02290 const TUtf16BomHandling bomHandling, 02291 const TUniByteOrder defaultByteOrder) const 02292 { 02293 IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored); 02294 IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian); 02295 if (clrDest) dest.Clr(); 02296 size_t nDecoded = 0; 02297 if (srcCount <= 0) return nDecoded; 02298 const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; 02299 bool swap = false; 02300 bool isMachineLe = IsMachineLittleEndian(); 02301 bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe)); 02302 if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe); 02303 else if (bomHandling == bomAllowed || bomHandling == bomRequired) 02304 { 02305 int c = uint(src[TVecIdx(srcIdx)]) & 0xffff; 02306 if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; } 02307 else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; } 02308 else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe); 02309 else { // Report an error. 
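// (This is the bomRequired case with no BOM present -- the first word is neither
// 0xfeff nor 0xfffe. Note that, unlike most error paths, the non-throwing modes
// below report this by returning size_t(-1) rather than a count of decoded characters.)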
02310 switch (errorHandling) { 02311 case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead)."); 02312 case uehAbort: case uehReplace: case uehIgnore: return size_t(-1); 02313 default: Fail; } } 02314 } 02315 else Fail; 02316 while (srcIdx < srcEnd) 02317 { 02318 const size_t charSrcIdx = srcIdx; 02319 uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++; 02320 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8); 02321 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) 02322 { 02323 // c is the first character in a surrogate pair. Read the next character. 02324 if (! (srcIdx < srcEnd)) { 02325 switch (errorHandling) { 02326 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing."); 02327 case uehAbort: return nDecoded; 02328 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02329 case uehIgnore: continue; 02330 default: Fail; } } 02331 uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++; 02332 if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); 02333 // c2 should be the second character of the surrogate pair. 02334 if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) { 02335 switch (errorHandling) { 02336 case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + "."); 02337 case uehAbort: return nDecoded; 02338 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character 02339 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue; 02340 case uehIgnore: srcIdx -= 1; continue; 02341 default: Fail; } } 02342 // c and c2 each contain 10 bits of information. 02343 uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate); 02344 cc += 0x10000; 02345 dest.Add(TDestCh(cc)); nDecoded++; continue; 02346 } 02347 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) { 02348 switch (errorHandling) { 02349 case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair."); 02350 case uehAbort: return nDecoded; 02351 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02352 case uehIgnore: continue; 02353 default: Fail; } } 02354 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it. 02355 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue; 02356 // Otherwise, store 'c' to the destination vector. 02357 dest.Add(TDestCh(c)); nDecoded++; 02358 } 02359 return nDecoded; 02360 } 02361 02362 //----------------------------------------------------------------------- 02363 // TUniCodec -- UTF-16 Encoder 02364 //----------------------------------------------------------------------- 02365 02366 // Returns the number of characters that have been successfully encoded. 02367 // This does not include any replacement characters that may have been inserted into 'dest'. 
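// Worked example (assuming the usual surrogate bases, Utf16FirstSurrogate == 0xd800 and
// Utf16SecondSurrogate == 0xdc00, which is what the arithmetic below relies on): to encode
// U+1D11E, c -= 0x10000 gives 0xd11e; the high 10 bits (0x34) plus 0xd800 give the first
// word 0xd834, and the low 10 bits (0x11e) plus 0xdc00 give the second word 0xdd1e.
// Codepoints up to 0xffff are emitted as a single word, byte-swapped if the requested
// byte order differs from the machine's.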
02368 template<typename TSrcVec, typename TDestCh> 02369 size_t TUniCodec::EncodeUtf16ToWords( 02370 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02371 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom, 02372 const TUniByteOrder destByteOrder) const 02373 { 02374 bool isMachineLe = IsMachineLittleEndian(); 02375 bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe); 02376 size_t nEncoded = 0, srcEnd = srcIdx + srcCount; 02377 if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; } 02378 while (srcIdx < srcEnd) 02379 { 02380 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++; 02381 if (! (c <= 0x10ffffu)) { 02382 switch (errorHandling) { 02383 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ")."); 02384 case uehAbort: return nEncoded; 02385 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02386 case uehIgnore: continue; 02387 default: Fail; } } 02388 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) { 02389 switch (errorHandling) { 02390 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ")."); 02391 case uehAbort: return nEncoded; 02392 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02393 case uehIgnore: continue; 02394 default: Fail; } } 02395 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) { 02396 switch (errorHandling) { 02397 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true."); 02398 case uehAbort: return nEncoded; 02399 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02400 case uehIgnore: continue; 02401 default: Fail; } } 02402 // If c is <= 0xffff, it can be stored directly. 02403 if (c <= 0xffffu) { 02404 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8); 02405 dest.Add(TDestCh(c)); nEncoded++; continue; } 02406 // Otherwise, represent c by a pair of surrogate characters. 02407 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu); 02408 uint c1 = (c >> 10) & 1023, c2 = c & 1023; 02409 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate; 02410 if (swap) { 02411 c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8); 02412 c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); } 02413 dest.Add(TDestCh(c1)); 02414 dest.Add(TDestCh(c2)); 02415 nEncoded++; continue; 02416 } 02417 return nEncoded; 02418 } 02419 02420 template<typename TSrcVec, typename TDestCh> 02421 size_t TUniCodec::EncodeUtf16ToBytes( 02422 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02423 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom, 02424 const TUniByteOrder destByteOrder) const 02425 { 02426 bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian())); 02427 size_t nEncoded = 0, srcEnd = srcIdx + srcCount; 02428 if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 
0xfe : 0xff); nEncoded++; } 02429 while (srcIdx < srcEnd) 02430 { 02431 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++; 02432 if (! (c <= 0x10ffffu)) { 02433 switch (errorHandling) { 02434 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ")."); 02435 case uehAbort: return nEncoded; 02436 #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); } 02437 case uehReplace: ___OutRepl; continue; 02438 case uehIgnore: continue; 02439 default: Fail; } } 02440 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) { 02441 switch (errorHandling) { 02442 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ")."); 02443 case uehAbort: return nEncoded; 02444 case uehReplace: ___OutRepl; continue; 02445 case uehIgnore: continue; 02446 default: Fail; } } 02447 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) { 02448 switch (errorHandling) { 02449 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true."); 02450 case uehAbort: return nEncoded; 02451 case uehReplace: ___OutRepl; continue; 02452 case uehIgnore: continue; 02453 default: Fail; } } 02454 #undef ___OutRepl 02455 // If c is <= 0xffff, it can be stored directly. 02456 if (c <= 0xffffu) { 02457 if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); } 02458 else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } 02459 nEncoded++; continue; } 02460 // Otherwise, represent c by a pair of surrogate characters. 02461 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu); 02462 uint c1 = (c >> 10) & 1023, c2 = c & 1023; 02463 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate; 02464 if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); } 02465 else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); } 02466 nEncoded++; continue; 02467 } 02468 return nEncoded; 02469 } 02470 02471 //----------------------------------------------------------------------------- 02472 // TUniChDb -- word boundaries 02473 //----------------------------------------------------------------------------- 02474 02475 template<typename TSrcVec> 02476 bool TUniChDb::FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const 02477 { 02478 // WB1. Break at the start of text. 02479 if (position < srcIdx) { position = srcIdx; return true; } 02480 // If we are beyond the end of the text, there aren't any word breaks left. 02481 const size_t srcEnd = srcIdx + srcCount; 02482 if (position >= srcEnd) return false; 02483 // If 'position' is currently at an ignored character, move it back to the last nonignored character. 02484 size_t origPos = position; 02485 if (IsWbIgnored(src[TVecIdx(position)])) { 02486 if (! 
WbFindPrevNonIgnored(src, srcIdx, position)) 02487 position = origPos; 02488 } 02489 // Determine the previous nonignored character (before 'position'). 02490 size_t posPrev = position; 02491 if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position; 02492 // Sec 6.2. Allow a break between Sep and an ignored character. 02493 if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; } 02494 // Determine the next nonignored character (after 'position'). 02495 size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd); 02496 size_t posNext2; 02497 int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1); 02498 int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1); 02499 int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext); 02500 int cNext2, wbfNext2; 02501 // 02502 for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2, 02503 cPrev = cCur, cCur = cNext, cNext = cNext2, 02504 wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2) 02505 { 02506 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately, 02507 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position' 02508 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)? 02509 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd); 02510 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1); 02511 wbfNext2 = GetWbFlags(cNext2); 02512 #define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue 02513 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue 02514 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue 02515 // WB3. Do not break within CRLF. 02516 if (cCur == 13 && cNext == 10) continue; 02517 // WB5. Do not break between most letters. 02518 TestCurNext(ucfWbALetter, ucfWbALetter); 02519 // WB6. Do not break letters across certain punctuation. 02520 TestCurNext2(ucfWbALetter, ucfWbMidLetter, ucfWbALetter); 02521 // WB7. Do not break letters across certain punctuation. 02522 TestPrevCurNext(ucfWbALetter, ucfWbMidLetter, ucfWbALetter); 02523 // WB8. Do not break within sequences of digits, or digits adjacent to letters. 02524 TestCurNext(ucfWbNumeric, ucfWbNumeric); 02525 // WB9. Do not break within sequences of digits, or digits adjacent to letters. 02526 TestCurNext(ucfWbALetter, ucfWbNumeric); 02527 // WB10. Do not break within sequences of digits, or digits adjacent to letters. 02528 TestCurNext(ucfWbNumeric, ucfWbALetter); 02529 // WB11. Do not break within sequences, such as "3.2" or "3.456,789". 02530 TestPrevCurNext(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric); 02531 // WB12. Do not break within sequences, such as "3.2" or "3.456,789". 02532 TestCurNext2(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric); 02533 // WB13. Do not break between Katakana. 02534 TestCurNext(ucfWbKatakana, ucfWbKatakana); 02535 // WB13a. Do not break from extenders. 
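// (ExtendNumLet covers connector punctuation such as '_', so together with WB13b below
// no boundary is placed inside identifiers like "foo_bar" or "foo_123".)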
02536 if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 && 02537 (wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue; 02538 // WB13b. Do not break from extenders. 02539 if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet && 02540 (wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue; 02541 // WB14. Otherwise, break everywhere. 02542 position = posNext; return true; 02543 #undef TestCurNext 02544 #undef TestCurNext2 02545 #undef TestPrevCurNext 02546 } 02547 // WB2. Break at the end of text. 02548 IAssert(position == srcEnd); 02549 return true; 02550 } 02551 02552 // ToDo: provide a more efficient implementation of this. 02553 template<typename TSrcVec> 02554 void TUniChDb::FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const 02555 { 02556 if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1)); 02557 dest.PutAll(false); 02558 size_t position = srcIdx; 02559 dest[TVecIdx(position - srcIdx)] = true; 02560 while (position < srcIdx + srcCount) 02561 { 02562 size_t oldPos = position; 02563 FindNextWordBoundary(src, srcIdx, srcCount, position); 02564 Assert(oldPos < position); Assert(position <= srcIdx + srcCount); 02565 dest[TVecIdx(position - srcIdx)] = true; 02566 } 02567 Assert(dest[TVecIdx(srcCount)]); 02568 } 02569 02570 //----------------------------------------------------------------------------- 02571 // TUniChDb -- sentence boundaries 02572 //----------------------------------------------------------------------------- 02573 02574 template<typename TSrcVec> 02575 bool TUniChDb::CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const 02576 { 02577 if (sbExTrie.Empty()) return true; 02578 // We'll move back from the position where a sentence-boundary is being considered. 02579 size_t pos = position; 02580 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02581 int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c); 02582 // - Skip the Sep, if there is one. 02583 if ((c & ucfSbSep) == ucfSbSep) { 02584 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02585 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02586 // - Skip any Sp characters. 02587 while ((sfb & ucfSbSp) == ucfSbSp) { 02588 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02589 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02590 // - Skip any Close characters. 02591 while ((sfb & ucfSbSp) == ucfSbSp) { 02592 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02593 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02594 // - Skip any ATerm | STerm characters. 02595 while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) { 02596 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true; 02597 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); } 02598 // Now start moving through the trie. 02599 int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1; 02600 while (true) 02601 { 02602 bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos)); 02603 c = (atEnd ? -1 : (int) src[TVecIdx(pos)]); 02604 TUniChCategory cat = GetCat(c); 02605 if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) { 02606 // Check if the suffix we've read so far is one of those that appear in the trie. 02607 if (len == 1) return ! sbExTrie.Has1Gram(cLast); 02608 if (len == 2) return ! 
sbExTrie.Has2Gram(cLast, cButLast); 02609 IAssert(len >= 3); IAssert(node >= 0); 02610 if (sbExTrie.IsNodeTerminal(node)) return false; 02611 if (atEnd) return true; } 02612 if (len == 1) { cButLast = c; len++; } 02613 else if (len == 2) { cButButLast = c; len++; 02614 // Now we have read the last three characters; start descending the suitable subtrie. 02615 node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast); 02616 if (node < 0) return true; } 02617 else { 02618 // Descend down the trie. 02619 node = sbExTrie.GetChild(node, c); 02620 if (node < 0) return true; } 02621 } 02622 //return true; 02623 } 02624 02625 template<typename TSrcVec> 02626 bool TUniChDb::FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const 02627 { 02628 // SB1. Break at the start of text. 02629 if (position < srcIdx) { position = srcIdx; return true; } 02630 // If we are beyond the end of the text, there aren't any word breaks left. 02631 const size_t srcEnd = srcIdx + srcCount; 02632 if (position >= srcEnd) return false; 02633 // If 'position' is currently at an ignored character, move it back to the last nonignored character. 02634 size_t origPos = position; 02635 if (IsWbIgnored(src[TVecIdx(position)])) { 02636 if (! WbFindPrevNonIgnored(src, srcIdx, position)) 02637 position = origPos; 02638 } 02639 // Determine the previous nonignored character (before 'position'). 02640 size_t posPrev = position; 02641 if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position; 02642 // Sec 6.2. Allow a break between Sep and an ignored character. 02643 if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; } 02644 // Determine the next nonignored character (after 'position'). 02645 size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd); 02646 size_t posNext2; 02647 int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1); 02648 int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1); 02649 int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext); 02650 int cNext2, sbfNext2; 02651 // Initialize the state of the peek-back automaton. 02652 typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState; 02653 TPeekBackState backState; 02654 { 02655 size_t pos = position; 02656 bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false; 02657 while (true) 02658 { 02659 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break; 02660 // Skip at most one Sep. 02661 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp); 02662 if ((sbf & ucfSbSep) == ucfSbSep) { 02663 wasSep = true; 02664 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break; 02665 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02666 // Skip zero or more Sp's. 02667 bool stop = false; 02668 while ((sbf & ucfSbSp) == ucfSbSp) { 02669 wasSp = true; 02670 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; } 02671 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02672 if (stop) break; 02673 // Skip zero or more Close's. 02674 while ((sbf & ucfSbClose) == ucfSbClose) { 02675 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; } 02676 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02677 if (stop) break; 02678 // Process an ATerm or STerm. 
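// (For instance, if the text contains "etc.) " and 'position' is just after the space,
// the scan above walks back over the Sp ' ' and the Close ')', finds the ATerm '.',
// and backState becomes stATermSp below.)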
02679 wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm); 02680 wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm); 02681 break; 02682 } 02683 if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm); 02684 else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm); 02685 else backState = stInit; 02686 } 02687 // Initialize the state of the peek-ahead automaton. This state tells us what follows 02688 // after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}. 02689 // Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string. 02690 // Our peek-ahead automaton must tell us whether it is Lower or something else. 02691 typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState; 02692 TPeekAheadState aheadState = stUnknown; 02693 // 02694 for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2, 02695 cPrev = cCur, cCur = cNext, cNext = cNext2, 02696 sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2) 02697 { 02698 // Should there be a sentence boundary between 'position' and 'posNext' (or, more accurately, 02699 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position' 02700 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)? 02701 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd); 02702 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1); 02703 sbfNext2 = GetSbFlags(cNext2); 02704 // Update the peek-back automaton. 02705 #define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag) 02706 #define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; } 02707 switch (backState) { 02708 case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break; 02709 case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break; 02710 case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break; 02711 case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02712 case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02713 case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02714 case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02715 default: IAssert(false); } 02716 #undef Trans 02717 #undef TestCur 02718 // Update the peek-ahead automaton. 02719 #define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0) 02720 if (! IsPeekAheadSkippable(sbfCur)) { 02721 bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower); 02722 if (aheadState == stLower) IAssert(isLower); 02723 else if (aheadState == stNotLower) IAssert(! isLower); 02724 // We haven't peeked ahead farther than this so far -- invalidate the state. 02725 aheadState = stUnknown; } 02726 if (aheadState == stUnknown) 02727 { 02728 // Peek ahead to the next non-peekahead-skippable character. 02729 size_t pos = posNext; 02730 while (pos < srcEnd) { 02731 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp); 02732 if (!
IsPeekAheadSkippable(sbf)) { 02733 if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower; 02734 else aheadState = stNotLower; 02735 break; } 02736 WbFindNextNonIgnored(src, pos, srcEnd); } 02737 if (! (pos < srcEnd)) aheadState = stNotLower; 02738 } 02739 #undef IsPeekAheadSkippable 02740 // 02741 #define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue 02742 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue 02743 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue 02744 // SB3. Do not break within CRLF. 02745 if (cCur == 13 && cNext == 10) continue; 02746 // SB4. Break after paragraph separators. 02747 if ((sbfCur & ucfSbSep) == ucfSbSep) { 02748 if (! CanSentenceEndHere(src, srcIdx, position)) continue; 02749 position = posNext; return true; } 02750 // Do not break after ambiguous terminators like period, if they are immediately followed by a number 02751 // or lowercase letter, if they are between uppercase letters, or if the first following letter 02752 // (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation 02753 // or numeric period, and thus may not mark the end of a sentence. 02754 TestCurNext(ucfSbATerm, ucfSbNumeric); // SB6 02755 TestPrevCurNext(ucfSbUpper, ucfSbATerm, ucfSbUpper); // SB7 02756 // SB8a. (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm) 02757 if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) && 02758 (sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue; 02759 // SB8*. ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower 02760 if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue; 02761 // Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present). 02762 // SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep ) 02763 if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue; 02764 // SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep ) 02765 // SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break] 02766 if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) { 02767 if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10 02768 if (! CanSentenceEndHere(src, srcIdx, position)) continue; 02769 position = posNext; return true; } // SB11 02770 // SB12. Otherwise, do not break. 02771 continue; 02772 #undef TestCurNext 02773 #undef TestCurNext2 02774 #undef TestPrevCurNext 02775 } 02776 // SB2. Break at the end of text. 02777 IAssert(position == srcEnd); 02778 return true; 02779 } 02780 02781 // ToDo: provide a more efficient implementation of this.
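// Usage sketch (an illustration; 'db', 'text' and 'pos' are hypothetical names, with
// 'text' e.g. a TIntV of codepoints): FindNextSentenceBoundary advances 'pos' to the
// next boundary and returns false only once 'pos' is already at or past the end of the
// text, so all boundaries after the start can be enumerated with
//
//   size_t pos = 0;
//   while (db.FindNextSentenceBoundary(text, 0, text.Len(), pos)) {
//     // 'pos' is now the position just after a sentence (text.Len() at the very end)
//   }
//
// FindSentenceBoundaries below fills a TBoolV of length srcCount + 1 in which dest[i]
// is true iff a boundary falls at position srcIdx + i (dest[0] and dest[srcCount] are
// always set).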
02782 template<typename TSrcVec> 02783 void TUniChDb::FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const 02784 { 02785 if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1)); 02786 dest.PutAll(false); 02787 size_t position = srcIdx; 02788 dest[TVecIdx(position - srcIdx)] = true; 02789 while (position < srcIdx + srcCount) 02790 { 02791 size_t oldPos = position; 02792 FindNextSentenceBoundary(src, srcIdx, srcCount, position); 02793 Assert(oldPos < position); Assert(position <= srcIdx + srcCount); 02794 dest[TVecIdx(position - srcIdx)] = true; 02795 } 02796 Assert(dest[TVecIdx(srcCount)]); 02797 } 02798 02799 //----------------------------------------------------------------------------- 02800 // TUniChDb -- case conversions 02801 //----------------------------------------------------------------------------- 02802 02803 template<typename TSrcVec, typename TDestCh> 02804 void TUniChDb::GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02805 TVec<TDestCh>& dest, const bool clrDest, 02806 const TUniChDb::TCaseConversion how, 02807 const bool turkic, const bool lithuanian) const 02808 { 02809 const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0)); 02810 if (clrDest) dest.Clr(); 02811 enum { 02812 GreekCapitalLetterSigma = 0x3a3, 02813 GreekSmallLetterSigma = 0x3c3, 02814 GreekSmallLetterFinalSigma = 0x3c2, 02815 LatinCapitalLetterI = 0x49, 02816 LatinCapitalLetterJ = 0x4a, 02817 LatinCapitalLetterIWithOgonek = 0x12e, 02818 LatinCapitalLetterIWithGrave = 0xcc, 02819 LatinCapitalLetterIWithAcute = 0xcd, 02820 LatinCapitalLetterIWithTilde = 0x128, 02821 LatinCapitalLetterIWithDotAbove = 0x130, 02822 LatinSmallLetterI = 0x69, 02823 CombiningDotAbove = 0x307 02824 }; 02825 // 02826 bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1; 02827 size_t nextWordBoundary = srcIdx; 02828 TBoolV wordBoundaries; bool wbsKnown = false; 02829 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; ) 02830 { 02831 int cp = src[TVecIdx(srcIdx)]; srcIdx++; 02832 //if (turkic && cp == 0x130 && how == ccLower) printf("!"); 02833 // For conversion to titlecase, the first cased character of each word 02834 // must be converted to titlecase; everything else must be converted 02835 // to lowercase. 02836 TUniChDb::TCaseConversion howHere; 02837 if (how != ccTitle) howHere = how; 02838 else { 02839 if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here. 02840 seenCased = false; seenTwoCased = false; cpFirstCased = -1; 02841 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 02842 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 02843 bool isCased = IsCased(cp); 02844 if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; } 02845 else { howHere = ccLower; 02846 if (isCased && seenCased) seenTwoCased = true; } 02847 } 02848 // First, process the conditional mappings from SpecialCasing.txt. 02849 // These will be processed in code -- they were ignored while 02850 // we were reading SpecialCasing.txt itself. 02851 if (cp == GreekCapitalLetterSigma && howHere == ccLower) 02852 { 02853 // SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of 02854 // the standard doesn't define it. We'll use FinalCased instead. 
02855 // FinalCased: within the closest word boundaries containing C, 02856 // there is a cased letter before C, and there is no cased letter after C. 02857 //size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary); 02858 if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; } 02859 size_t srcIdx2 = srcIdx; bool casedAfter = false; 02860 if (how == ccTitle) 02861 printf("!"); 02862 //while (srcIdx2 < nextBoundary) 02863 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)]) 02864 { 02865 int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02866 if (IsCased(cp2)) { casedAfter = true; break; } 02867 } 02868 if (! casedAfter) 02869 { 02870 //size_t prevBoundary = srcIdx - 1; 02871 //FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary); 02872 srcIdx2 = srcIdx - 1; bool casedBefore = false; 02873 //while (prevBoundary < srcIdx2) 02874 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)]) 02875 { 02876 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02877 if (IsCased(cp2)) { casedBefore = true; break; } 02878 } 02879 if (casedBefore) { 02880 // Now we have a FinalCased character. 02881 dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; } 02882 } 02883 // If we got here, add a non-final sigma. 02884 dest.Add(GreekSmallLetterSigma); continue; 02885 } 02886 else if (lithuanian) 02887 { 02888 if (howHere == ccLower) 02889 { 02890 if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek) 02891 { 02892 bool moreAbove = false; 02893 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; ) 02894 { 02895 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02896 const int cc2 = GetCombiningClass(cp2); 02897 if (cc2 == TUniChInfo::ccStarter) break; 02898 if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; } 02899 } 02900 if (moreAbove) 02901 { 02902 if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; } 02903 if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; } 02904 if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; } 02905 } 02906 } 02907 else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; } 02908 else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; } 02909 else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; } 02910 } 02911 if (cp == CombiningDotAbove) 02912 { 02913 // Lithuanian, howHere != ccLower. 02914 // AfterSoftDotted := the last preceding character with a combining class 02915 // of zero before C was Soft_Dotted, and there is no intervening combining 02916 // character class 230 (ABOVE). 02917 bool afterSoftDotted = false; 02918 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp' 02919 while (origSrcIdx < srcIdx2) 02920 { 02921 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02922 int cc2 = GetCombiningClass(cp2); 02923 if (cc2 == TUniChInfo::ccAbove) break; 02924 if (cc2 == TUniChInfo::ccStarter) { 02925 afterSoftDotted = IsSoftDotted(cp2); break; } 02926 } 02927 if (afterSoftDotted) 02928 { 02929 Assert(lithuanian); 02930 // Remove DOT ABOVE after "i" with upper or titlecase. 02931 // - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle, 02932 // the "i" may have been kept lowercase and thus we shouldn't remove the dot). 
02933 if (how == ccLower) { dest.Add(0x307); continue; } 02934 if (how == ccUpper) continue; 02935 Assert(how == ccTitle); 02936 Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character 02937 if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot. 02938 dest.Add(0x307); continue; 02939 } 02940 } 02941 } 02942 else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri) 02943 { 02944 // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 02945 // The following rules handle those cases. 02946 if (cp == LatinCapitalLetterIWithDotAbove) { 02947 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; } 02948 // When lowercasing, remove dot_above in the sequence I + dot_above, 02949 // which will turn into i. This matches the behavior of the 02950 // canonically equivalent I-dot_above. 02951 else if (cp == CombiningDotAbove) 02952 { 02953 // AfterI: the last preceding base character was an uppercase I, 02954 // and there is no intervening combining character class 230 (ABOVE). 02955 bool afterI = false; 02956 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp' 02957 while (origSrcIdx < srcIdx2) 02958 { 02959 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02960 if (cp2 == LatinCapitalLetterI) { afterI = true; break; } 02961 int cc2 = GetCombiningClass(cp2); 02962 if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break; 02963 } 02964 if (afterI) { 02965 if (how == ccTitle && seenCased && ! seenTwoCased) { 02966 // Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word; 02967 // if found, map it to titlecase; otherwise, map all characters in that word to lowercase. 02968 // This suggests that if a cased character is found, others in that word should be left alone. 02969 // This seems unusual; we map all other characters to lowercase instead. 02970 // But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above 02971 // is not the first cased character (it isn't even cased), we attempt to set it to lowercase; 02972 // but since afterI is also true here, this would mean deleting it. Thus our titlecased 02973 // form of "I followed by dot-above" would be just "I", which is clearly wrong. 02974 // So we treat this as a special case here. 02975 IAssert(cpFirstCased == LatinCapitalLetterI); 02976 dest.Add(0x307); continue; } 02977 if (howHere != ccLower) dest.Add(0x307); 02978 continue; } 02979 } 02980 // When lowercasing, unless an I is before a dot_above, 02981 // it turns into a dotless i. 02982 else if (cp == LatinCapitalLetterI) 02983 { 02984 // BeforeDot: C is followed by U+0307 (combining dot above). 02985 // Any sequence of characters with a combining class that is 02986 // neither 0 nor 230 may intervene between the current character 02987 // and the combining dot above. 02988 bool beforeDot = false; 02989 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; ) 02990 { 02991 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02992 if (cp2 == 0x307) { beforeDot = true; break; } 02993 const int cc2 = GetCombiningClass(cp2); 02994 if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break; 02995 } 02996 if (! beforeDot) { 02997 dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; } 02998 } 02999 // When uppercasing, i turns into a dotted capital I. 03000 else if (cp == LatinSmallLetterI) 03001 { 03002 dest.Add(howHere == ccLower ? 
0x69 : 0x130); continue; 03003 } 03004 } 03005 // Try to use the unconditional mappings. 03006 const TIntIntVH &specHere = ( 03007 howHere == how ? specials : 03008 howHere == ccLower ? specialCasingLower : 03009 howHere == ccTitle ? specialCasingTitle : 03010 howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0)); 03011 int i = specHere.GetKeyId(cp); 03012 if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; } 03013 // Try to use the simple (one-character) mappings. 03014 i = h.GetKeyId(cp); 03015 if (i >= 0) { 03016 const TUniChInfo &ci = h[i]; 03017 int cpNew = ( 03018 howHere == ccLower ? ci.simpleLowerCaseMapping : 03019 howHere == ccUpper ? ci.simpleUpperCaseMapping : 03020 ci.simpleTitleCaseMapping); 03021 if (cpNew < 0) cpNew = cp; 03022 dest.Add(cpNew); continue; } 03023 // As a final resort, leave 'cp' unchanged. 03024 dest.Add(cp); 03025 } 03026 } 03027 03028 template<typename TSrcVec, typename TDestCh> 03029 void TUniChDb::GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03030 TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const 03031 { 03032 if (clrDest) dest.Clr(); 03033 bool seenCased = false; size_t nextWordBoundary = srcIdx; 03034 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; ) 03035 { 03036 const int cp = src[TVecIdx(srcIdx)]; srcIdx++; 03037 int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; } 03038 const TUniChInfo &ci = h[i]; 03039 // With titlecasing, the first cased character of each word must be put into titlecase, 03040 // all others into lowercase. This is what the howHere variable is for. 03041 TUniChDb::TCaseConversion howHere; 03042 if (how != ccTitle) howHere = how; 03043 else { 03044 if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here. 03045 seenCased = false; 03046 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 03047 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 03048 bool isCased = IsCased(cp); 03049 if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; } 03050 else howHere = ccLower; 03051 } 03052 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping); 03053 if (cpNew < 0) cpNew = cp; 03054 dest.Add(cpNew); 03055 } 03056 } 03057 03058 template<typename TSrcVec> 03059 void TUniChDb::ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const 03060 { 03061 bool seenCased = false; size_t nextWordBoundary = srcIdx; 03062 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) 03063 { 03064 const int cp = src[TVecIdx(srcIdx)]; 03065 int i = h.GetKeyId(cp); if (i < 0) continue; 03066 const TUniChInfo &ci = h[i]; 03067 // With titlecasing, the first cased character of each word must be put into titlecase, 03068 // all others into lowercase. This is what the howHere variable is for. 03069 TUniChDb::TCaseConversion howHere; 03070 if (how != ccTitle) howHere = how; 03071 else { 03072 if (srcIdx == nextWordBoundary) { // A word starts/ends here. 03073 seenCased = false; 03074 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 03075 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 03076 bool isCased = IsCased(cp); 03077 if (isCased && ! 
seenCased) { howHere = ccTitle; seenCased = true; } 03078 else howHere = ccLower; 03079 } 03080 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping); 03081 if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew; 03082 } 03083 } 03084 03085 //----------------------------------------------------------------------------- 03086 // TUniChDb -- composition, decomposition, normal forms 03087 //----------------------------------------------------------------------------- 03088 03089 template<typename TDestCh> 03090 void TUniChDb::AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const 03091 { 03092 if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount) 03093 { 03094 // UAX #15, sec. 16: Hangul decomposition 03095 const int SIndex = codePoint - HangulSBase; 03096 const int L = HangulLBase + SIndex / HangulNCount; 03097 const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount; 03098 const int T = HangulTBase + (SIndex % HangulTCount); 03099 dest.Add(L); dest.Add(V); 03100 if (T != HangulTBase) dest.Add(T); 03101 return; 03102 } 03103 int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; } 03104 const TUniChInfo &ci = h[i]; 03105 int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; } 03106 if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; } 03107 while (true) { 03108 int cp = decompositions[ofs++]; if (cp < 0) return; 03109 AddDecomposition(cp, dest, compatibility); } 03110 } 03111 03112 template<typename TSrcVec, typename TDestCh> 03113 void TUniChDb::Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03114 TVec<TDestCh>& dest, const bool compatibility, bool clrDest) const 03115 { 03116 if (clrDest) dest.Clr(); 03117 const size_t destStart = dest.Len()/*, srcEnd = srcIdx + srcCount*/; 03118 // Decompose the string. 03119 while (srcIdx < srcCount) { 03120 AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; } 03121 // Rearrange the decomposed string into canonical order. 03122 for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; ) 03123 { 03124 size_t j = destIdx; 03125 int cp = dest[TVecIdx(destIdx)]; destIdx++; 03126 int cpCls = GetCombiningClass(cp); 03127 if (cpCls == TUniChInfo::ccStarter) continue; 03128 while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) { 03129 dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; } 03130 dest[TVecIdx(j)] = cp; 03131 } 03132 } 03133 03134 template<typename TSrcVec, typename TDestCh> 03135 void TUniChDb::DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03136 TVec<TDestCh>& dest, bool compatibility, bool clrDest) const 03137 { 03138 if (clrDest) dest.Clr(); 03139 TIntV temp; 03140 Decompose(src, srcIdx, srcCount, temp, compatibility); 03141 Compose(temp, 0, temp.Len(), dest, clrDest); 03142 } 03143 03144 template<typename TSrcVec, typename TDestCh> 03145 void TUniChDb::Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03146 TVec<TDestCh>& dest, bool clrDest) const 03147 { 03148 if (clrDest) dest.Clr(); 03149 bool lastStarterKnown = false; // has a starter been encountered yet? 03150 size_t lastStarterPos = size_t(-1); // the index (in 'dest') of the last starter 03151 int cpLastStarter = -1; // the codepoint of the last starter (i.e. 
template<typename TSrcVec, typename TDestCh>
void TUniChDb::Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
  TVec<TDestCh>& dest, bool clrDest) const
{
  if (clrDest) dest.Clr();
  bool lastStarterKnown = false; // has a starter been encountered yet?
  size_t lastStarterPos = size_t(-1); // the index (in 'dest') of the last starter
  int cpLastStarter = -1; // the codepoint of the last starter (i.e. cpLastStarter == dest[lastStarterPos])
  const size_t srcEnd = srcIdx + srcCount;
  int ccMax = -1; // the highest combining class among the characters since the last starter
  while (srcIdx < srcEnd)
  {
    const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
    const int cpClass = GetCombiningClass(cp);
    //int cpCombined = -1;
    // If there is a starter with which 'cp' can be combined, and from which it is not blocked
    // by some intermediate character, we can try to combine them.
    if (lastStarterKnown && ccMax < cpClass)
    {
      int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp));
      int cpCombined = -1;
      do {
        // Try to look up a composition in the inverseDec table.
        if (j >= 0) { cpCombined = inverseDec[j]; break; }
        // UAX #15, sec. 16: Hangul composition
        // - Try to combine L and V.
        const int LIndex = cpLastStarter - HangulLBase;
        if (0 <= LIndex && LIndex < HangulLCount) {
          const int VIndex = cp - HangulVBase;
          if (0 <= VIndex && VIndex < HangulVCount) {
            cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
            break; } }
        // - Try to combine LV and T.
        const int SIndex = cpLastStarter - HangulSBase;
        if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0)
        {
          const int TIndex = cp - HangulTBase;
          if (0 <= TIndex && TIndex < HangulTCount) {
            cpCombined = cpLastStarter + TIndex;
            break; }
        }
      } while (false);
      // If a composite character has been found, use it to replace the previous starter in 'dest'.
      if (cpCombined >= 0) {
        dest[TVecIdx(lastStarterPos)] = cpCombined;
        Assert(GetCombiningClass(cpCombined) == TUniChInfo::ccStarter);
        // if (cpCombined is not a starter) { lastStarterKnown = false; lastStarterPos = size_t(-1); cpLastStarter = -1; } else
        cpLastStarter = cpCombined; continue; }
    }
    if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter, remember it for later. Set ccMax to -1 so that this starter can be combined with another starter.
      lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; }
    else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking).
      ccMax = cpClass;
    dest.Add(cp);
  }
}

template<typename TSrcVec, typename TDestCh>
size_t TUniChDb::ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
  TVec<TDestCh>& dest, bool clrDest) const
{
  if (clrDest) dest.Clr();
  size_t retVal = 0;
  for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
    const int cp = src[TVecIdx(srcIdx)];
    if (GetCombiningClass(cp) == TUniChInfo::ccStarter)
      { dest.Add(cp); retVal++; } }
  return retVal;
}

// Helpers whose result is constant in practice (false resp. true) but is not
// computed at compile time.
inline bool AlwaysFalse()
{
  int sum = 0;
  for (int i = 0; i < 5; i++) sum += i;
  return sum > 100;
}

inline bool AlwaysTrue()
{
  int sum = 0;
  for (int i = 0; i < 5; i++) sum += i;
  return sum < 100;
}
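// Usage sketch (illustrative only, not part of the library): keeping only the starters
// (characters with combining class 0) of a sequence. 'db' is assumed to be an initialized
// TUniChDb (loading not shown).
//
//   TIntV src, starters;
//   src.Add(0x45); src.Add(0x300); src.Add(0x73);   // 'E', combining grave accent (class 230), 's'
//   size_t n = db.ExtractStarters(src, 0, src.Len(), starters, true);
//   // starters now holds 0045 0073 and n == 2; the combining grave accent was dropped.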
/*

Notes on decomposition:

- In UnicodeData.txt, there is a field with the decomposition mapping.
  This field may also include a tag, <...>.
  If there is a tag, this is a compatibility mapping.
  Otherwise it is a canonical mapping.
- Canonical decomposition uses only canonical mappings;
  compatibility decomposition uses both canonical and compatibility mappings.
- Decomposition:
  1. Apply the decomposition mappings (canonical or canonical+compatibility), recursively.
  2. Put the string into canonical order, which means:
       while there exists a pair of characters, A immediately followed by B,
       such that combiningclass(A) > combiningclass(B) > 0 [an "exchangeable pair"]:
         swap A and B;
     (A small standalone sketch of this reordering step is given at the end of this file.)
  This results in NFD (normalized form D, after canonical decomposition)
  or NFKD (normalized form KD, after compatibility decomposition).
- Canonical composition:
  1. Before composition, the string should have been decomposed
     (using either canonical or compatibility decomposition).
  2. For each character C (from left to right):
     2.1. Find the last starter S before C (if not found, continue).
     2.2. If there is, between S and C, some character with a combining class greater than
          or equal to that of C, then continue.
     2.3. If there exists a character L whose canonical decomposition is S followed by C,
          and L is not in the composition exclusion table [i.e. L is a "primary composite"],
          then replace S by L, and remove C.
  This results in NFC (normalized form C, with canonical decomposition followed by canonical composition)
  or NFKC (normalized form KC, with compatibility decomposition followed by canonical composition).
- Composition exclusion table:
  - Anything in CompositionExclusions.txt.
  - Singletons: characters whose canonical decomposition is a single character.
  - Non-starter decompositions: characters whose canonical decomposition begins with a non-starter.

Example:
  E-grave (00c8; combining class 0; canonical decomposition: 0045 0300)
  E-macron (0112; combining class 0; 0045 0304)
  grave (0300; combining class 230)
  macron (0304; combining class 230)
  source string: 00c8 0304
  after canonical decomposition (or compatibility decomposition, they would be the same here): 0045 0300 0304
  after canonical composition: 00c8 0304

cc(horn) = 216
cc(dot below) = 220
cc(dot above) = 230

ToDos:
- Case folding: it is intended primarily for comparing the strings obtained in this way.
  The function f(s) = NFC(toCaseFold(s)) is idempotent.
  The function g(s) = NFKC(toCaseFold(s)) is not -- if we want that, we have to take a few
  additional mappings into account during the folding (see sec. 5.18, last paragraph;
  DerivedNormalizationProps.txt).
- It seems that CaseFolding.txt is essentially just an ordinary folding into lowercase.
  Since we also want the other foldings, it is better to look at SpecialCasing.txt
  (+ the simple case mappings in UnicodeData.txt).
  I suggest that, when reading SpecialCasing.txt, we simply ignore the conditional mappings
  and then handle them separately, directly in the source code of our programs
  [for the detailed definition of the conditions, see table 3.13].
- Postscript: it still seems to me that CaseFolding.txt is slightly different from a plain
  conversion to lowercase. For example, for the small final sigma 03c2 it specifies that it
  should be changed into the ordinary small sigma 03c3. This follows neither from
  UnicodeData.txt nor from SpecialCasing.txt, even though UCD.html says that CaseFolding.txt
  is derived from the two. The main purpose of CaseFolding.txt is supposed to be
  "locale-independent case folding" (table 4.1 and sec. 5.18).
- Before starting to deal with case conversions, have a look at section 3.13,
  and especially p. 90.
- See p. 91 on the combination N[K]FD + caseFold + N[K]FD.
- The definition of "cased" etc. is on p. 89.
- isIdentifierStart(c), isIdentifierEnd(c) -- sec. 5.15.
  See DerivedCoreProperties.txt, where a number of similar things are defined in a similar
  way, among them isLowerCase and isUpperCase. It also contains isLetter, isAlphabetic,
  etc. (sec. 4.9). These are best added among the flags of each individual character.
- general category: sec. 4.5
- motivation for titlecase: 5.18
- compare our current computation of compositionExclusion with what is computed in
  DerivedNormalizationProps.txt under Full_Composition_Exclusion
- script names: Scripts.txt and UAX #24.
- block names: Blocks.txt
- space characters: table 6.2 and reportedly also UCD.html
- dash characters: table 6.3
*/

//#endif
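// A minimal standalone sketch (not part of the library) of the "exchangeable pair"
// reordering step described in the notes above. It assumes a caller-supplied function
// returning the canonical combining class of a codepoint; in the library itself this role
// is played by TUniChDb::GetCombiningClass, and Decompose performs the same reordering
// with an insertion-sort-style loop.
//
//   #include <algorithm>
//   #include <vector>
//
//   // Repeatedly swap adjacent exchangeable pairs (cc(A) > cc(B) > 0) until none remain.
//   // Characters with equal classes are never swapped, so their relative order is kept.
//   inline void CanonicalReorder(std::vector<int>& s, int (*combiningClass)(int))
//   {
//     bool swapped = true;
//     while (swapped) {
//       swapped = false;
//       for (size_t i = 1; i < s.size(); i++) {
//         int ccA = combiningClass(s[i - 1]), ccB = combiningClass(s[i]);
//         if (ccA > ccB && ccB > 0) { std::swap(s[i - 1], s[i]); swapped = true; }
//       }
//     }
//   }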