SNAP Library, Developer Reference
2012-10-02 12:56:23
SNAP, a general purpose network analysis and graph mining library
//#ifndef unicode_h
//#define unicode_h

// Includes
//#include "base.h"

typedef int TUniVecIdx;

//-----------------------------------------------------------------------------
// TUniCodec -- a UTF-8 and UTF-16 Encoder/Decoder
//-----------------------------------------------------------------------------

// Error handling modes for the TUniCodec class.
typedef enum TUnicodeErrorHandling_
{
  // What happens when an error occurs:
  uehIgnore = 0,  // - it is silently ignored (nothing is added to the output vector)
  uehThrow = 1,   // - an exception is thrown (TUnicodeException)
  uehReplace = 2, // - the replacement character is added to the output vector
  uehAbort = 3    // - the encoding/decoding process stops immediately
}
TUnicodeErrorHandling;

class TUnicodeException
{
public:
  TStr message;   // error message
  size_t srcIdx;  // the position in the source vector where the error occurred
  int srcChar;    // the source character at the position srcIdx
  TUnicodeException(size_t srcIdx_, int srcChar_, const TStr& message_) :
    message(message_), srcIdx(srcIdx_), srcChar(srcChar_) { }
};

typedef enum TUniByteOrder_
{
  boMachineEndian = 0,
  boLittleEndian = 1,
  boBigEndian = 2
}
TUniByteOrder;

typedef enum TUtf16BomHandling_
{
  bomAllowed = 0,  // if a BOM is present, it is used to determine the byte order; otherwise, the default byte order is used
  bomRequired = 1, // if a BOM is present, it is used to determine the byte order; otherwise, an error is reported
  bomIgnored = 2   // the default byte order is used; if a BOM is present, it is treated like any other character
}
TUtf16BomHandling;

class TUniCodec
{
public:
  // 0xfffd is defined as the replacement character by the Unicode standard.
  // By default, it is rendered as a question mark inside a diamond: "<?>".
  enum { DefaultReplacementChar = 0xfffd };

  // The replacement character is inserted into the destination vector
  // if an error occurs in the source vector. By default, this is set
  // to DefaultReplacementChar.
  int replacementChar;
  // The error handling mode.
  TUnicodeErrorHandling errorHandling;
  // There are a number of situations where there is strictly speaking an error in
  // the source data although it can still be decoded in a reasonably meaningful way.
  // If strict == true, these situations are treated as errors. Examples:
  // - when decoding UTF-8:
  //   - a codepoint represented by more bytes than necessary (e.g. one of the characters 0..127
  //     encoded as a two-byte sequence)
  //   - a codepoint > 0x10ffff
  // - when decoding UTF-16:
  //   - a codepoint from the range reserved for the second character of a surrogate pair
  //     is not preceded by a codepoint from the range reserved for the first character of a surrogate pair
  // - when encoding UTF-8:
  //   - a codepoint > 0x10ffff
  // - when encoding UTF-16:
  //   - a codepoint from the range reserved for the second character of a surrogate pair
  //     [note that a codepoint > 0x10ffff, or from the range reserved for the first character of a
  //     surrogate pair, is always an error, even with strict == false]
  bool strict;
  // skipBom == true means: If a byte-order-mark (0xfffe or 0xfeff) occurs at the beginning
  // of the source vector, it is skipped (when decoding).
  // - Note: a BOM is not really useful in UTF-8 encoded data. However, the .NET UTF8Encoding
  //   emits 0xfeff by default as a kind of preamble. It gets encoded as 3 bytes, ef bb bf,
  //   and can be helpful to make the data easier to recognize as UTF-8 encoded data.
  bool skipBom;

  TUniCodec() : replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true)
  {
  }

  TUniCodec(TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_) :
    replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
  {
  }

protected:
  enum {
#define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
    DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
    DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
    DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
    DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
#undef DefineByte
  };

  typedef TUniVecIdx TVecIdx;
  //friend class TUniChDb;
  friend class TUniCaseFolding;

public:

  //-----------------------------------------------------------------------
  // UTF-8
  //-----------------------------------------------------------------------

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf8(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest = true) const;
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf8(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest = true) const;
  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }

  // The following wrappers around the UTF-8 encoder return a TStr containing
  // the UTF-8-encoded version of the input string.
  template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src, size_t srcIdx, const size_t srcCount) const { TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); TStr retVal = &(temp[0]); return retVal; }
  template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src) const { TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }

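A minimal usage sketch of the UTF-8 interface declared above (not part of unicode.h; the input vector 'utf8' is a placeholder for bytes obtained elsewhere):

// Usage sketch (illustrative only):
TUniCodec codec;                  // defaults: uehIgnore, non-strict, skipBom = true
codec.errorHandling = uehThrow;   // throw TUnicodeException on malformed input instead
TVec<char> utf8;                  // raw UTF-8 bytes, e.g. read from a file
TIntV codepoints;
size_t nDecoded = codec.DecodeUtf8(utf8, codepoints);   // bytes -> Unicode codepoints
TStr reencoded = codec.EncodeUtf8Str(codepoints);       // codepoints -> UTF-8-encoded TStr
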
  //-----------------------------------------------------------------------
  // UTF-16 Decoder
  //-----------------------------------------------------------------------

protected:
  enum {
    Utf16FirstSurrogate = 0xd800,
    Utf16SecondSurrogate = 0xdc00
  };

  static bool IsMachineLittleEndian();

public:

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  // Each element of 'src' is assumed to contain one byte of data.
  // srcCount must be even (though srcIdx doesn't need to be).
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf16FromBytes(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest,
    const TUtf16BomHandling bomHandling = bomAllowed,
    const TUniByteOrder defaultByteOrder = boMachineEndian) const;

  // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings
  // are used to determine if the two bytes of each word should be swapped before further
  // processing. For example, if a BOM is present, it must have the value 0xfeff; if it
  // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
  // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
  // beginning of the source data is used to determine the "original" byte order of the data;
  // if this doesn't match the byte order of the local machine, the two bytes of each word will
  // be swapped during the decoding process.
  template<typename TSrcVec, typename TDestCh>
  size_t DecodeUtf16FromWords(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, bool clrDest,
    const TUtf16BomHandling bomHandling = bomAllowed,
    const TUniByteOrder defaultByteOrder = boMachineEndian) const;

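A hedged sketch of decoding UTF-16 with the declarations above; 'rawBytes' is a placeholder input, and the BOM and byte-order arguments shown are simply the defaults spelled out:

// Usage sketch (illustrative only):
TUniCodec codec;
TVec<char> rawBytes;              // one byte per element; the count passed must be even
TIntV codepoints;
codec.DecodeUtf16FromBytes(rawBytes, 0, rawBytes.Len(), codepoints,
  /*clrDest=*/true, bomAllowed, boMachineEndian);   // a BOM, if present, overrides the default byte order
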
  //-----------------------------------------------------------------------
  // UTF-16 Encoder
  //-----------------------------------------------------------------------

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  //
  // Notes:
  // - UTF-16 *cannot* encode characters above 0x10ffff, so their presence is always
  //   treated as an error, regardless of the value of 'strict'.
  // - Characters from the range Utf16FirstSurrogate through Utf16FirstSurrogate + 1023
  //   cannot be encoded by UTF-16 either, as they would be misinterpreted during decoding
  //   as the first character of a surrogate pair.
  // - Characters from the range Utf16SecondSurrogate through Utf16SecondSurrogate + 1023
  //   can be encoded in principle; however, if strict == true, they are treated as errors.
  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf16ToWords(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
    const TUniByteOrder destByteOrder = boMachineEndian) const;

  template<typename TSrcVec, typename TDestCh>
  size_t EncodeUtf16ToBytes(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
    const TUniByteOrder destByteOrder = boMachineEndian) const;

  //-----------------------------------------------------------------------
  // Helper declarations for the test drivers
  //-----------------------------------------------------------------------

protected:

  static uint GetRndUint(TRnd& rnd);
  static uint GetRndUint(TRnd& rnd, uint minVal, uint maxVal);

  //-----------------------------------------------------------------------
  // UTF-8 Test Driver
  //-----------------------------------------------------------------------

protected:
  void TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f);
  // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc',
  // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected.
  void TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc);
public:
  void TestUtf8();

  //-----------------------------------------------------------------------
  // UTF-16 Test Driver
  //-----------------------------------------------------------------------

protected:
  void WordsToBytes(const TIntV& src, TIntV& dest);
  void TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest,
    // Note: insertBom is only used with the encoder. When encoding, 'defaultByteOrder' is used as the destination byte order.
    const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom,
    FILE *f);
  static inline int SwapBytes(int x) {
    return ((x >> 8) & 0xff) | ((x & 0xff) << 8); }
  // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc',
  // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected.
  void TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc,
    const TUtf16BomHandling bomHandling,
    const TUniByteOrder defaultByteOrder,
    const bool insertBom);
public:
  void TestUtf16();

};

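Encoding back to UTF-16 follows the same pattern as decoding; a sketch (not part of the header) that writes 16-bit code units with a leading BOM in little-endian order:

// Usage sketch (illustrative only):
TUniCodec codec;
TIntV codepoints, words;          // 'words' receives one 16-bit code unit per element
codec.EncodeUtf16ToWords(codepoints, 0, codepoints.Len(), words,
  /*clrDest=*/true, /*insertBom=*/true, boLittleEndian);
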
//-----------------------------------------------------------------------------
// Case folding
//-----------------------------------------------------------------------------
// Note: there's no need to access this class directly.
// Use TUniChDb::GetCaseFolded() instead.

typedef THash<TInt, TIntV> TIntIntVH;

class TUniCaseFolding
{
protected:
  TIntH cfCommon, cfSimple, cfTurkic;
  TIntIntVH cfFull;

  template<typename TSrcDat, typename TDestDat>
  inline static void AppendVector(const TVec<TSrcDat>& src, TVec<TDestDat>& dest) {
    for (int i = 0; i < src.Len(); i++) dest.Add(src[i]); }
  friend class TUniChDb;
  typedef TUniVecIdx TVecIdx;

public:
  TUniCaseFolding() { }
  explicit TUniCaseFolding(TSIn& SIn) : cfCommon(SIn), cfSimple(SIn), cfTurkic(SIn), cfFull(SIn) { SIn.LoadCs(); }
  void Load(TSIn& SIn) { cfCommon.Load(SIn); cfSimple.Load(SIn); cfFull.Load(SIn); cfTurkic.Load(SIn); SIn.LoadCs(); }
  void Save(TSOut& SOut) const { cfCommon.Save(SOut); cfSimple.Save(SOut); cfFull.Save(SOut); cfTurkic.Save(SOut); SOut.SaveCs(); }
  void Clr() { cfCommon.Clr(); cfSimple.Clr(); cfFull.Clr(); cfTurkic.Clr(); }
  void LoadTxt(const TStr& fileName);

  // Use 'turkic' when processing text in a Turkic language (tr, az). This only affects the uppercase I and I-with-dot-above.
  template<typename TSrcVec, typename TDestCh>
  void Fold(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic) const
  {
    for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
    {
      int c = src[TVecIdx(srcIdx)], i; srcIdx++;
      if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { dest.Add(cfTurkic[i]); continue; }
      if (full && ((i = cfFull.GetKeyId(c)) >= 0)) { AppendVector(cfFull[i], dest); continue; }
      if ((! full) && ((i = cfSimple.GetKeyId(c)) >= 0)) { dest.Add(cfSimple[i]); continue; }
      i = cfCommon.GetKeyId(c); if (i >= 0) dest.Add(cfCommon[i]); else dest.Add(c);
    }
  }

  template<typename TSrcVec>
  void FoldInPlace(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic) const
  {
    for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
    {
      int c = src[TVecIdx(srcIdx)], i;
      if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { src[TVecIdx(srcIdx)] = cfTurkic[i]; continue; }
      if ((i = cfSimple.GetKeyId(c)) >= 0) { src[TVecIdx(srcIdx)] = cfSimple[i]; continue; }
      i = cfCommon.GetKeyId(c); if (i >= 0) src[TVecIdx(srcIdx)] = cfCommon[i];
    }
  }

protected:
  void Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f);
public:
  void Test();
};

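A sketch of the folding call itself, in case the class is used directly; the file name is hypothetical (the tables come from the Unicode CaseFolding data file loaded via LoadTxt), and in normal use TUniChDb::GetCaseFolded() is the recommended entry point:

// Usage sketch (illustrative only):
TUniCaseFolding cf;
cf.LoadTxt("CaseFolding.txt");    // hypothetical path to the UCD CaseFolding file
TIntV src, folded;                // 'src' holds Unicode codepoints
cf.Fold(src, 0, src.Len(), folded, /*clrDest=*/true,
  /*full=*/true,                  // allow one-to-many mappings (cfFull)
  /*turkic=*/false);              // set to true only for Turkish/Azerbaijani text
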
//-----------------------------------------------------------------------------
// TCodecBase -- an abstract base class for codecs
//-----------------------------------------------------------------------------

class TCodecBase;
typedef TPt<TCodecBase> PCodecBase;
typedef TVec<PCodecBase> TCodecBaseV;

class TCodecBase
{
protected:
  TCRef CRef;
  friend class TPt<TCodecBase>;
public:
  virtual ~TCodecBase() { }

  template<class TCodecImpl>
  static PCodecBase New(); /* {
    return new TCodecWrapper<TCodecImpl>(); } */

  virtual TStr GetName() const = 0;
  virtual void Test() const { }

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
  virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;

  size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const = 0;
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const = 0;

  size_t FromUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t FromUnicode(const TIntV& src, TChA& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t FromUnicode(const TIntV& src, TStr& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
};

//-----------------------------------------------------------------------------
// TCodecWrapper -- a descendant of TCodecBase; relies on a template
// parameter class for the actual implementation of the codec.
//-----------------------------------------------------------------------------
// Thus, if you know in advance that you'll need ISO-8859-2, just use
// T8BitCodec<TEncoding_ISO8859_2>. If you don't know the encoding
// in advance, use a PCodecBase pointing to a suitable specialization
// of TCodecWrapper<...>. You can use TUnicode::GetCodec(TStr& name)
// to obtain a suitable pointer.

template<class TCodecImpl_>
class TCodecWrapper : public TCodecBase
{
public:
  typedef TCodecImpl_ TCodecImpl;
  TCodecImpl impl;
public:

  virtual TStr GetName() const { return impl.GetName(); }

  virtual void Test() const { impl.Test(); }

  virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
    return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
  virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
    return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }

  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
    return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const {
    return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
    TChA buf; size_t retVal = impl.FromUnicode(src, srcIdx, srcCount, buf, false);
    if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
    return retVal; }
};

template<class TCodecImpl>
PCodecBase TCodecBase::New() {
  return new TCodecWrapper<TCodecImpl>();
}

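A sketch of the run-time polymorphic use described in the comment above; T8BitCodec and TEncoding_ISO8859_2 are declared further down in this header:

// Usage sketch (illustrative only):
PCodecBase codec = TCodecBase::New< T8BitCodec<TEncoding_ISO8859_2> >();
TIntV codepoints;
codec->ToUnicode(TStr("abc"), codepoints);   // 8-bit text -> Unicode codepoints
TStr back;
codec->FromUnicode(codepoints, back);        // Unicode codepoints -> 8-bit text
// codec->GetName() returns "ISO-8859-2"
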
//-----------------------------------------------------------------------------
// TVecElt -- a template for determining the type of a vector's elements
//-----------------------------------------------------------------------------

template<class TVector_>
class TVecElt
{
};

template<class TDat>
class TVecElt<TVec<TDat> >
{
public:
  typedef TVec<TDat> TVector;
  typedef TDat TElement;
  static inline void Add(TVector& vector, const TElement& element) { vector.Add(element); }
};

template<>
class TVecElt<TChA>
{
public:
  typedef TChA TVector;
  typedef char TElement;
  static inline void Add(TVector& vector, const TElement& element) { vector += element; }
};

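The trait above is what lets T8BitCodec::FromUnicode (below) write into either a TVec or a TChA; an illustrative helper in the same spirit (not part of the library):

// Illustrative helper (not part of unicode.h):
template<class TDestVec>
void AddLowByte(TDestVec& dest, int codepoint) {
  typedef typename TVecElt<TDestVec>::TElement TElem;
  TVecElt<TDestVec>::Add(dest, TElem(codepoint & 0xff));   // appends a char to a TChA, a TInt to a TIntV
}
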
//-----------------------------------------------------------------------------
// T8BitCodec -- a class for converting between 8-bit encodings and Unicode
//-----------------------------------------------------------------------------

class TEncoding_ISO8859_1
{
public:
  static inline TStr GetName() { return "ISO-8859-1"; }
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255); return c; }
  static int FromUnicode(int c) { if (0 <= c && c <= 255) return c; else return -1; }
};

class TEncoding_ISO8859_2 // ISO Latin 2
{
public:
  static inline TStr GetName() { return "ISO-8859-2"; }
  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0xa0) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else return -1; }
};

class TEncoding_ISO8859_3
{
public:
  static inline TStr GetName() { return "ISO-8859-3"; }
  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0xa0) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2d8 <= c && c < 0x2da) return fromUnicodeTable2[c - 0x2d8];
    else return -1; }
};

class TEncoding_ISO8859_4
{
public:
  static inline TStr GetName() { return "ISO-8859-4"; }
  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0xa0) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else return -1; }
};

class TEncoding_YuAscii
{
public:
  static const int uniChars[10], yuAsciiChars[10];
  static inline TStr GetName() { return "YU-ASCII"; }
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    for (int i = 0; i < int(sizeof(yuAsciiChars) / sizeof(yuAsciiChars[0])); i++)
      if (c == yuAsciiChars[i]) return uniChars[i];
    return c; }
  static int FromUnicode(int c) {
    for (int i = 0; i < int(sizeof(uniChars) / sizeof(uniChars[0])); i++)
      if (c == uniChars[i]) return yuAsciiChars[i];
      else if (c == yuAsciiChars[i]) return -1;
    if (0 <= c && c <= 255) return c; else return -1; }
};

class TEncoding_CP437 // DOS US
{
public:
  static inline TStr GetName() { return "CP437"; }
  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[6 * 16], fromUnicodeTable2[4 * 16], fromUnicodeTable3[6 * 16], fromUnicodeTable4[11 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0x80) return c;
    else if (0xa0 <= c && c < 0x100) return fromUnicodeTable1[c - 0xa0];
    else if (0x390 <= c && c < 0x3d0) return fromUnicodeTable2[c - 0x390];
    else if (0x2210 <= c && c < 0x2270) return fromUnicodeTable3[c - 0x2210];
    else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable4[c - 0x2500];
    else if (c == 0x192) return 0x9f;
    else if (c == 0x207f) return 0xfc;
    else if (c == 0x20a7) return 0x9e;
    else if (c == 0x2310) return 0xa9;
    else if (c == 0x2320) return 0xf4;
    else if (c == 0x2321) return 0xf5;
    else return -1; }
};

class TEncoding_CP852 // DOS Latin 2
{
public:
  static inline TStr GetName() { return "CP852"; }
  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[11 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0x80) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable3[c - 0x2500];
    else return -1; }
};

class TEncoding_CP1250 // Windows-1250, similar to ISO Latin 2
{
public:
  static inline TStr GetName() { return "CP1250"; }
  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[3 * 16];
  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
    if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
  static int FromUnicode(int c) {
    if (0 <= c && c < 0x80) return c;
    else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
    else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
    else if (0x2010 <= c && c < 0x2040) return fromUnicodeTable3[c - 0x2010];
    else if (c == 0x20ac) return 0x80;
    else if (c == 0x2122) return 0x99;
    else return -1; }
};

template<class TEncoding_>
class T8BitCodec
{
protected:
  typedef TUniVecIdx TVecIdx;
public:
  typedef TEncoding_ TEncoding;
  TUnicodeErrorHandling errorHandling;
  int replacementChar;

  T8BitCodec() : errorHandling(uehIgnore), replacementChar(TUniCodec::DefaultReplacementChar) { }
  T8BitCodec(TUnicodeErrorHandling errorHandling_, int replacementChar_ = TUniCodec::DefaultReplacementChar) :
    errorHandling(errorHandling_), replacementChar(replacementChar_) { }
  static TStr GetName() { return TEncoding::GetName(); }

  void Test() const
  {
    int nDecoded = 0;
    for (int c = 0; c <= 255; c++) {
      int cu = TEncoding::ToUnicode(c); if (cu == -1) continue;
      nDecoded++;
      IAssert(0 <= cu && cu < 0x110000);
      int c2 = TEncoding::FromUnicode(cu);
      IAssert(c2 == c); }
    int nEncoded = 0;
    for (int cu = 0; cu < 0x110000; cu++) {
      int c = TEncoding::FromUnicode(cu); if (c == -1) continue;
      nEncoded++;
      IAssert(0 <= c && c <= 255);
      int cu2 = TEncoding::ToUnicode(c);
      IAssert(cu2 == cu); }
    IAssert(nDecoded == nEncoded);
  }

  // Returns the number of characters that have been successfully decoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestCh>
  size_t ToUnicode(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TVec<TDestCh>& dest, const bool clrDest = true) const
  {
    if (clrDest) dest.Clr();
    size_t toDo = srcCount;
    while (toDo-- > 0) {
      int chSrc = ((int) src[TVecIdx(srcIdx)]) & 0xff; srcIdx++;
      int chDest = TEncoding::ToUnicode(chSrc);
      dest.Add(chDest); }
    return srcCount;
  }
  template<typename TSrcVec, typename TDestCh>
  size_t ToUnicode(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }

  size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
  size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }

  // Returns the number of characters that have been successfully encoded.
  // This does not include any replacement characters that may have been inserted into 'dest'.
  template<typename TSrcVec, typename TDestVec>
  size_t FromUnicode(
    const TSrcVec& src, size_t srcIdx, const size_t srcCount,
    TDestVec& dest, const bool clrDest = true) const
  {
    typedef typename TVecElt<TDestVec>::TElement TDestCh;
    if (clrDest) dest.Clr();
    size_t toDo = srcCount, nEncoded = 0;
    while (toDo-- > 0) {
      int chSrc = (int) src[TVecIdx(srcIdx)]; srcIdx++;
      int chDest = TEncoding::FromUnicode(chSrc);
      if (chDest < 0) {
        switch (errorHandling) {
        case uehThrow: throw TUnicodeException(srcIdx - 1, chSrc, "Invalid character for encoding into " + GetName() + ".");
        case uehAbort: return nEncoded;
        case uehReplace: TVecElt<TDestVec>::Add(dest, TDestCh(replacementChar)); continue;
        case uehIgnore: continue;
        default: Fail; } }
      TVecElt<TDestVec>::Add(dest, TDestCh(chDest)); nEncoded++; }
    return nEncoded;
  }

  template<typename TSrcVec, typename TDestVec>
  size_t FromUnicode(const TSrcVec& src, TDestVec& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }

  size_t UniToStr(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
    TChA buf; size_t retVal = FromUnicode(src, srcIdx, srcCount, buf, false);
    if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
    return retVal; }
  size_t UniToStr(const TIntV& src, TStr& dest, const bool clrDest = true) const { return UniToStr(src, 0, src.Len(), dest, clrDest); }
};

typedef T8BitCodec<TEncoding_ISO8859_1> TCodec_ISO8859_1;
typedef T8BitCodec<TEncoding_ISO8859_2> TCodec_ISO8859_2;
typedef T8BitCodec<TEncoding_ISO8859_3> TCodec_ISO8859_3;
typedef T8BitCodec<TEncoding_ISO8859_4> TCodec_ISO8859_4;
typedef T8BitCodec<TEncoding_CP852> TCodec_CP852;
typedef T8BitCodec<TEncoding_CP437> TCodec_CP437;
typedef T8BitCodec<TEncoding_CP1250> TCodec_CP1250;
typedef T8BitCodec<TEncoding_YuAscii> TCodec_YuAscii;

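A minimal sketch of using one of these typedefs (the input text is a placeholder; any of the other 8-bit codecs works the same way):

// Usage sketch (illustrative only):
TCodec_ISO8859_2 latin2;                   // same as T8BitCodec<TEncoding_ISO8859_2>
TIntV codepoints;
latin2.ToUnicode(TStr("abc"), codepoints); // 8-bit bytes -> Unicode codepoints
TStr back;
latin2.UniToStr(codepoints, back);         // Unicode codepoints -> 8-bit string
latin2.Test();                             // asserts that every mappable character round-trips
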
//-----------------------------------------------------------------------------
// Various declarations used by the Unicode Character Database
//-----------------------------------------------------------------------------

typedef enum TUniChCategory_
{
#define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff)
  DefineUniCat(Letter, 'L'), // ucLetter
  DefineUniCat(Mark, 'M'),
  DefineUniCat(Number, 'N'),
  DefineUniCat(Punctuation, 'P'),
  DefineUniCat(Symbol, 'S'),
  DefineUniCat(Separator, 'Z'),
  DefineUniCat(Other, 'C')
#undef DefineUniCat
}
TUniChCategory;

typedef enum TUniChSubCategory_
{
#define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
  DefineUniSubCat(Letter, Uppercase, 'u'), // ucLetterUppercase
  DefineUniSubCat(Letter, Lowercase, 'l'),
  DefineUniSubCat(Letter, Titlecase, 't'),
  DefineUniSubCat(Letter, Modifier, 'm'),
  DefineUniSubCat(Letter, Other, 'o'),
  DefineUniSubCat(Mark, Nonspacing, 'n'),
  DefineUniSubCat(Mark, SpacingCombining, 'c'),
  DefineUniSubCat(Mark, Enclosing, 'e'),
  DefineUniSubCat(Number, DecimalDigit, 'd'),
  DefineUniSubCat(Number, Letter, 'l'),
  DefineUniSubCat(Number, Other, 'o'),
  DefineUniSubCat(Punctuation, Connector, 'c'),
  DefineUniSubCat(Punctuation, Dash, 'd'),
  DefineUniSubCat(Punctuation, Open, 's'),
  DefineUniSubCat(Punctuation, Close, 'e'),
  DefineUniSubCat(Punctuation, InitialQuote, 'i'),
  DefineUniSubCat(Punctuation, FinalQuote, 'f'),
  DefineUniSubCat(Punctuation, Other, 'o'),
  DefineUniSubCat(Symbol, Math, 'm'),
  DefineUniSubCat(Symbol, Currency, 'c'),
  DefineUniSubCat(Symbol, Modifier, 'k'),
  DefineUniSubCat(Symbol, Other, 'o'),
  DefineUniSubCat(Separator, Space, 's'),
  DefineUniSubCat(Separator, Line, 'l'),
  DefineUniSubCat(Separator, Paragraph, 'p'),
  DefineUniSubCat(Other, Control, 'c'),
  DefineUniSubCat(Other, Format, 'f'),
  DefineUniSubCat(Other, Surrogate, 's'),
  DefineUniSubCat(Other, PrivateUse, 'o'),
  DefineUniSubCat(Other, NotAssigned, 'n')
}
TUniChSubCategory;

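The macros above pack the one-letter general category into the high byte and the sub-category letter into the low byte (e.g. ucLetter == 'L' and ucLetterUppercase == ('L' << 8) | 'u'), so the major category can be recovered from a sub-category with a shift. An illustrative helper, not part of the header (TUniChInfo::SetCatAndSubCat below does the same computation):

// Illustrative helper (not part of unicode.h):
inline TUniChCategory GetMajorCat(const TUniChSubCategory subCat) {
  return (TUniChCategory)((int(subCat) >> 8) & 0xff);   // e.g. ucLetterUppercase -> ucLetter
}
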
typedef enum TUniChFlags_
{
  ucfCompatibilityDecomposition = 1, // if this flag is not set, the decomposition is canonical
  ucfCompositionExclusion = 1 << 1,  // from CompositionExclusions.txt
  // Flags used when searching for word boundaries. See UAX #29.
  ucfWbFormat = 1 << 2,
  ucfWbKatakana = 1 << 3,
  ucfWbALetter = 1 << 4,
  ucfWbMidLetter = 1 << 5,
  ucfWbMidNum = 1 << 6,
  ucfWbNumeric = 1 << 7,
  ucfWbExtendNumLet = 1 << 8,
  // Flags used with sentence boundaries (Sep is also used with word boundaries). See UAX #29.
  ucfSbSep = 1 << 9,
  ucfSbFormat = 1 << 10,
  ucfSbSp = 1 << 11,
  ucfSbLower = 1 << 12,
  ucfSbUpper = 1 << 13,
  ucfSbOLetter = 1 << 14,
  ucfSbNumeric = 1 << 15,
  ucfSbATerm = 1 << 16,
  ucfSbSTerm = 1 << 17,
  ucfSbClose = 1 << 18,
  ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose,
  ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep,
  // Flags from DerivedCoreProperties.txt.
  // [The comments are from UCD.html.]
  // - Characters with the Alphabetic property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
  ucfDcpAlphabetic = 1 << 19,
  // - For programmatic determination of default-ignorable code points.
  //   New characters that should be ignored in processing (unless explicitly supported)
  //   will be assigned in these ranges, permitting programs to correctly handle the default
  //   behavior of such characters when not otherwise supported. For more information, see
  //   UAX #29: Text Boundaries [Breaks].
  //   Generated from: Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - annotation characters
  //   [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff, U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors]
  ucfDcpDefaultIgnorableCodePoint = 1 << 20,
  // - Characters with the Lowercase property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Other_Lowercase + Ll
  ucfDcpLowercase = 1 << 21,
  // - For programmatic determination of grapheme cluster boundaries.
  //   For more information, see UAX #29: Text Boundaries [Breaks].
  //   Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
  ucfDcpGraphemeBase = 1 << 22,
  // - For programmatic determination of grapheme cluster boundaries.
  //   For more information, see UAX #29: Text Boundaries [Breaks].
  //   Generated from: Other_Grapheme_Extend + Me + Mn
  //   Note: depending on an application's interpretation of Co (private use), they may be either
  //   in Grapheme_Base, or in Grapheme_Extend, or in neither.
  ucfDcpGraphemeExtend = 1 << 23,
  // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
  ucfDcpIdStart = 1 << 24,
  ucfDcpIdContinue = 1 << 25,
  // - Characters with the Math property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Sm + Other_Math
  ucfDcpMath = 1 << 26,
  // - Characters with the Uppercase property. For more information, see Chapter 4 in [Unicode].
  //   Generated from: Lu + Other_Uppercase
  ucfDcpUppercase = 1 << 27,
  // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
  ucfDcpXidStart = 1 << 28,
  ucfDcpXidContinue = 1 << 29,
  ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase | ucfDcpGraphemeBase | ucfDcpGraphemeExtend |
    ucfDcpIdStart | ucfDcpIdContinue | ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue,
}
TUniChFlags;

typedef enum TUniChProperties_
{
  // The flags from PropList.txt.
  // [The comments are from UCD.html.]
  // - ASCII characters commonly used for the representation of hexadecimal numbers.
  //   [= 0123456789abcdefABCDEF]
  ucfPrAsciiHexDigit = 1,
  // - Those format control characters which have specific functions in the Bidirectional Algorithm.
  ucfPrBidiControl = 2,
  // - Those punctuation characters explicitly called out as dashes in the Unicode Standard,
  //   plus compatibility equivalents to those. Most of these have the Pd General Category,
  //   but some have the Sm General Category because of their use in mathematics.
  //   U+0002d HYPHEN-MINUS
  //   U+0058a ARMENIAN HYPHEN
  //   U+005be HEBREW PUNCTUATION MAQAF
  //   U+01806 MONGOLIAN TODO SOFT HYPHEN
  //   U+02010 HYPHEN
  //   U+02011 NON-BREAKING HYPHEN
  //   U+02012 FIGURE DASH
  //   U+02013 EN DASH
  //   U+02014 EM DASH
  //   U+02015 HORIZONTAL BAR
  //   U+02053 SWUNG DASH
  //   U+0207b SUPERSCRIPT MINUS
  //   U+0208b SUBSCRIPT MINUS
  //   U+02212 MINUS SIGN
  //   U+02e17 DOUBLE OBLIQUE HYPHEN
  //   U+0301c WAVE DASH
  //   U+03030 WAVY DASH
  //   U+030a0 KATAKANA-HIRAGANA DOUBLE HYPHEN
  //   U+0fe31 PRESENTATION FORM FOR VERTICAL EM DASH
  //   U+0fe32 PRESENTATION FORM FOR VERTICAL EN DASH
  //   U+0fe58 SMALL EM DASH
  //   U+0fe63 SMALL HYPHEN-MINUS
  //   U+0ff0d FULLWIDTH HYPHEN-MINUS
  ucfPrDash = 4,
  // - For a machine-readable list of deprecated characters. No characters will ever be removed
  //   from the standard, but the usage of deprecated characters is strongly discouraged.
  ucfPrDeprecated = 8,
  // - Characters that linguistically modify the meaning of another character to which they apply.
  //   Some diacritics are not combining characters, and some combining characters are not diacritics.
  ucfPrDiacritic = 0x10,
  // - Characters whose principal function is to extend the value or shape of a preceding alphabetic
  //   character. Typical of these are length and iteration marks.
  ucfPrExtender = 0x20,
  // - Used in determining default grapheme cluster boundaries. For more information, see UAX #29: Text Boundaries.
  ucfPrGraphemeLink = 0x40,
  // - Characters commonly used for the representation of hexadecimal numbers, plus their compatibility equivalents.
  //   [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}]
  ucfPrHexDigit = 0x80,
  // - Those dashes used to mark connections between pieces of words, plus the Katakana middle dot.
  //   The Katakana middle dot functions like a hyphen, but is shaped like a dot rather than a dash.
  //   U+0002d HYPHEN-MINUS
  //   U+000ad SOFT HYPHEN
  //   U+0058a ARMENIAN HYPHEN
  //   U+01806 MONGOLIAN TODO SOFT HYPHEN
  //   U+02010 HYPHEN
  //   U+02011 NON-BREAKING HYPHEN
  //   U+02e17 DOUBLE OBLIQUE HYPHEN
  //   U+030fb KATAKANA MIDDLE DOT
  //   U+0fe63 SMALL HYPHEN-MINUS
  //   U+0ff0d FULLWIDTH HYPHEN-MINUS
  //   U+0ff65 HALFWIDTH KATAKANA MIDDLE DOT
  ucfPrHyphen = 0x100,
  // - Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) ideographs.
  ucfPrIdeographic = 0x200,
  // - Those format control characters which have specific functions for control of cursive joining and ligation.
  ucfPrJoinControl = 0x400,
  // - There are a small number of characters that do not use logical order.
  //   These characters require special handling in most processing.
  ucfPrLogicalOrderException = 0x800,
  // - Code points that are permanently reserved for internal use.
  ucfPrNoncharacterCodePoint = 0x1000,
  // - Used for pattern syntax as described in UAX #31: Identifier and Pattern Syntax.
  ucfPrPatternSyntax = 0x2000,
  ucfPrPatternWhiteSpace = 0x4000,
  // - Those punctuation characters that function as quotation marks.
  //   U+00022 QUOTATION MARK
  //   U+00027 APOSTROPHE
  //   U+000ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
  //   U+000bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
  //   U+02018 LEFT SINGLE QUOTATION MARK
  //   U+02019 RIGHT SINGLE QUOTATION MARK
  //   U+0201a SINGLE LOW-9 QUOTATION MARK
  //   U+0201b SINGLE HIGH-REVERSED-9 QUOTATION MARK
  //   U+0201c LEFT DOUBLE QUOTATION MARK
  //   U+0201d RIGHT DOUBLE QUOTATION MARK
  //   U+0201e DOUBLE LOW-9 QUOTATION MARK
  //   U+0201f DOUBLE HIGH-REVERSED-9 QUOTATION MARK
  //   U+02039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  //   U+0203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  //   U+0300c LEFT CORNER BRACKET
  //   U+0300d RIGHT CORNER BRACKET
  //   U+0300e LEFT WHITE CORNER BRACKET
  //   U+0300f RIGHT WHITE CORNER BRACKET
  //   U+0301d REVERSED DOUBLE PRIME QUOTATION MARK
  //   U+0301e DOUBLE PRIME QUOTATION MARK
  //   U+0301f LOW DOUBLE PRIME QUOTATION MARK
  //   U+0fe41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
  //   U+0fe42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
  //   U+0fe43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
  //   U+0fe44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
  //   U+0ff02 FULLWIDTH QUOTATION MARK
  //   U+0ff07 FULLWIDTH APOSTROPHE
  //   U+0ff62 HALFWIDTH LEFT CORNER BRACKET
  //   U+0ff63 HALFWIDTH RIGHT CORNER BRACKET
  ucfPrQuotationMark = 0x8000,
  // - Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear.
  //   An explicit _dot above_ can be added where required, such as in Lithuanian.
  ucfPrSoftDotted = 0x10000,
  // - Sentence Terminal. Used in UAX #29: Text Boundaries.
  //   U+00021 EXCLAMATION MARK
  //   U+0002e FULL STOP
  //   U+0003f QUESTION MARK
  //   U+0203c DOUBLE EXCLAMATION MARK
  //   U+0203d INTERROBANG
  //   U+02047 DOUBLE QUESTION MARK
  //   U+02048 QUESTION EXCLAMATION MARK
  //   U+02049 EXCLAMATION QUESTION MARK
  //   U+03002 IDEOGRAPHIC FULL STOP
  //   [plus many characters from other writing systems]
  ucfPrSTerm = 0x20000,
  // - Those punctuation characters that generally mark the end of textual units.
  //   [JB note: this set contains more characters than STerm. For example, it contains
  //   the comma, colon and semicolon, whereas STerm doesn't.]
  //   U+00021 EXCLAMATION MARK
  //   U+0002c COMMA
  //   U+0002e FULL STOP
  //   U+0003a COLON
  //   U+0003b SEMICOLON
  //   U+0003f QUESTION MARK
  //   U+0203c DOUBLE EXCLAMATION MARK
  //   U+0203d INTERROBANG
  //   U+02047 DOUBLE QUESTION MARK
  //   U+02048 QUESTION EXCLAMATION MARK
  //   U+02049 EXCLAMATION QUESTION MARK
  //   [plus *lots* of characters from other writing systems]
  ucfPrTerminalPunctuation = 0x40000,
  // - Indicates all those characters that qualify as Variation Selectors.
  //   For details on the behavior of these characters, see StandardizedVariants.html and
  //   Section 16.4, Variation Selectors in [Unicode].
  ucfPrVariationSelector = 0x80000,
  // - Those separator characters and control characters which should be treated by
  //   programming languages as "white space" for the purpose of parsing elements.
  //   Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included,
  //   since their functions are restricted to line-break control.
  //   Their names are unfortunately misleading in this respect.
  //   Note: There are other senses of "whitespace" that encompass a different set of characters.
  //   [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt.
  //   There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.]
  //   This includes the following characters:
  //   U+0009 <control>
  //   U+000a <control>
  //   U+000b <control>
  //   U+000c <control>
  //   U+000d <control>
  //   U+0020 SPACE
  //   U+0085 <control>
  //   U+00a0 NO-BREAK SPACE
  //   U+1680 OGHAM SPACE MARK
  //   U+180e MONGOLIAN VOWEL SEPARATOR
  //   U+2000 EN QUAD
  //   U+2001 EM QUAD
  //   U+2002 EN SPACE
  //   U+2003 EM SPACE
  //   U+2004 THREE-PER-EM SPACE
  //   U+2005 FOUR-PER-EM SPACE
  //   U+2006 SIX-PER-EM SPACE
  //   U+2007 FIGURE SPACE
  //   U+2008 PUNCTUATION SPACE
  //   U+2009 THIN SPACE
  //   U+200a HAIR SPACE
  //   U+2028 LINE SEPARATOR
  //   U+2029 PARAGRAPH SEPARATOR
  //   U+202f NARROW NO-BREAK SPACE
  //   U+205f MEDIUM MATHEMATICAL SPACE
  //   U+3000 IDEOGRAPHIC SPACE
  ucfPrWhiteSpace = 0x100000
}
TUniChProperties;

typedef enum TUniChPropertiesX_
{
  // More properties from PropList.txt.
  // - Used to derive the properties in DerivedCoreProperties.txt.
  ucfPxOtherAlphabetic = 1,
  ucfPxOtherDefaultIgnorableCodePoint = 2,
  ucfPxOtherGraphemeExtend = 4,
  ucfPxOtherIdContinue = 8,
  ucfPxOtherIdStart = 0x10,
  ucfPxOtherLowercase = 0x20,
  ucfPxOtherMath = 0x40,
  ucfPxOtherUppercase = 0x80,
  // - Used in ideographic description sequences.
  ucfPxIdsBinaryOperator = 0x100,
  ucfPxIdsTrinaryOperator = 0x200,
  ucfPxRadical = 0x400,
  ucfPxUnifiedIdeograph = 0x800
}
TUniChPropertiesX;

//-----------------------------------------------------------------------------
// TUniChInfo -- contains information about a single Unicode codepoint
//-----------------------------------------------------------------------------

class TUniChInfo
{
public:
  enum { // combining classes (for 'combClass'); from UnicodeData.txt
    ccStarter = 0,                       // 0: Spacing, split, enclosing, reordrant, and Tibetan subjoined
    ccOverlaysAndInterior = 1,
    ccNuktas = 7,
    ccHiraganaKatakanaVoicingMarks = 8,
    ccViramas = 9,
    ccFixedPositionStart = 10,           // Start of fixed position classes
    ccFixedPositionEnd = 199,            // End of fixed position classes
    ccBelowLeftAttached = 200,
    ccBelowAttached = 202,
    ccBelowRightAttached = 204,
    ccLeftAttached = 208,                // Left attached (reordrant around single base character)
    ccRightAttached = 210,
    ccAboveLeftAttached = 212,
    ccAboveAttached = 214,
    ccAboveRightAttached = 216,
    ccBelowLeft = 218,
    ccBelow = 220,
    ccBelowRight = 222,
    ccLeft = 224,                        // Left (reordrant around single base character)
    ccRight = 226,
    ccAboveLeft = 228,
    ccAbove = 230,
    ccAboveRight = 232,
    ccDoubleBelow = 233,
    ccDoubleAbove = 234,
    ccBelowIotaSubscript = 240,          // Below (iota subscript)
    ccInvalid = 255                      // not defined by Unicode
  };
  char chCat, chSubCat;  // chCat + chSubCat together comprise the general category (from UnicodeData.txt)
  uchar combClass;       // canonical combining class
  TUniChCategory cat;    // = TUniChCategory(chCat)
  TUniChSubCategory subCat; // = TUniChSubCategory(cat << 8 | subCat)
  signed char script;    // keyId into 'TUniChDb.scriptNames'; -1 if unknown
  int simpleUpperCaseMapping, simpleLowerCaseMapping, simpleTitleCaseMapping; // from UnicodeData.txt
  int decompOffset;      // offset into 'TUniChDb.decompositions'; or -1 if the character doesn't change during decomposition
  int nameOffset;        // offset into 'TUniChDb.charNames'
  int flags;             // a combination of TUniChFlags
  int properties;        // a combination of TUniChProperties
  int propertiesX;       // a combination of TUniChPropertiesX
  ushort lineBreak;      // from LineBreak.txt

  // Converts a 2-letter linebreak code into a 16-bit integer.
  static inline ushort GetLineBreakCode(char c1, char c2) { return ((ushort(uchar(c1)) & 0xff) << 8) | ((ushort(uchar(c2)) & 0xff)); }
  static const ushort LineBreak_Unknown, LineBreak_ComplexContext, LineBreak_Numeric, LineBreak_InfixNumeric, LineBreak_Quotation;

public:
  void InitAfterLoad() {
    cat = (TUniChCategory) chCat;
    subCat = (TUniChSubCategory) (((int(uchar(chCat)) & 0xff) << 8) | (int(uchar(chSubCat)) & 0xff)); }
  void SetCatAndSubCat(const TUniChSubCategory catAndSubCat) {
    cat = (TUniChCategory) ((int(catAndSubCat) >> 8) & 0xff);
    subCat = catAndSubCat;
    chCat = (char) cat; chSubCat = (char) (int(subCat) & 0xff); }
  friend class TUniChDb;

  // Inexplicably missing from TSIn/TSOut...
  static inline void LoadUShort(TSIn& SIn, ushort& u) { SIn.LoadBf(&u, sizeof(u)); }
  static inline void LoadSChar(TSIn& SIn, signed char& u) { SIn.LoadBf(&u, sizeof(u)); }
  static inline void SaveUShort(TSOut& SOut, ushort u) { SOut.SaveBf(&u, sizeof(u)); }
  static inline void SaveSChar(TSOut& SOut, signed char u) { SOut.SaveBf(&u, sizeof(u)); }

public:
  void Save(TSOut& SOut) const {
    SOut.Save(chCat); SOut.Save(chSubCat); SOut.Save(combClass); SaveSChar(SOut, script);
    SOut.Save(simpleUpperCaseMapping); SOut.Save(simpleLowerCaseMapping); SOut.Save(simpleTitleCaseMapping);
    SOut.Save(decompOffset); SOut.Save(nameOffset);
    SOut.Save(flags); SOut.Save(properties); SOut.Save(propertiesX); SaveUShort(SOut, lineBreak); }
  void Load(TSIn& SIn) {
    SIn.Load(chCat); SIn.Load(chSubCat); SIn.Load(combClass); LoadSChar(SIn, script);
    SIn.Load(simpleUpperCaseMapping); SIn.Load(simpleLowerCaseMapping); SIn.Load(simpleTitleCaseMapping);
    SIn.Load(decompOffset); SIn.Load(nameOffset);
    SIn.Load(flags); SIn.Load(properties); SIn.Load(propertiesX); LoadUShort(SIn, lineBreak); InitAfterLoad(); }
  explicit TUniChInfo(TSIn& SIn) { Load(SIn); }
  TUniChInfo() : chCat(char(ucOther)), chSubCat(char(ucOtherNotAssigned & 0xff)), combClass(ccInvalid),
    script(-1), simpleUpperCaseMapping(-1), simpleLowerCaseMapping(-1), simpleTitleCaseMapping(-1),
    decompOffset(-1), nameOffset(-1), flags(0), properties(0), propertiesX(0), lineBreak(LineBreak_Unknown) {
    InitAfterLoad(); }

  // DerivedCoreProperties flags.
  bool IsDcpFlag(const TUniChFlags flag) const { Assert((flag & ucfDcpMask) == flag); return (flags & flag) == flag; }
  void ClrDcpFlags() { flags = flags & ~ucfDcpMask; }
  void SetDcpFlag(const TUniChFlags flag) { Assert((flag & ucfDcpMask) == flag); flags |= flag; }
  bool IsAlphabetic() const { return IsDcpFlag(ucfDcpAlphabetic); }
  bool IsUppercase() const { return IsDcpFlag(ucfDcpUppercase); }
  bool IsLowercase() const { return IsDcpFlag(ucfDcpLowercase); }
  bool IsMath() const { return IsDcpFlag(ucfDcpMath); }
  bool IsDefaultIgnorable() const { return IsDcpFlag(ucfDcpDefaultIgnorableCodePoint); }
  bool IsGraphemeBase() const { return IsDcpFlag(ucfDcpGraphemeBase); }
  bool IsGraphemeExtend() const { return IsDcpFlag(ucfDcpGraphemeExtend); }
  bool IsIdStart() const { return IsDcpFlag(ucfDcpIdStart); }
  bool IsIdContinue() const { return IsDcpFlag(ucfDcpIdContinue); }
  bool IsXidStart() const { return IsDcpFlag(ucfDcpXidStart); }
  bool IsXidContinue() const { return IsDcpFlag(ucfDcpXidContinue); }

  // PropList.txt flags.
  bool IsProperty(const TUniChProperties flag) const { return (properties & flag) == flag; }
  void SetProperty(const TUniChProperties flag) { properties |= flag; }
  bool IsAsciiHexDigit() const { return IsProperty(ucfPrAsciiHexDigit); }
  bool IsBidiControl() const { return IsProperty(ucfPrBidiControl); }
  bool IsDash() const { return IsProperty(ucfPrDash); }
  bool IsDeprecated() const { return IsProperty(ucfPrDeprecated); }
  bool IsDiacritic() const { return IsProperty(ucfPrDiacritic); }
  bool IsExtender() const { return IsProperty(ucfPrExtender); }
  bool IsGraphemeLink() const { return IsProperty(ucfPrGraphemeLink); }
  bool IsHexDigit() const { return IsProperty(ucfPrHexDigit); }
  bool IsHyphen() const { return IsProperty(ucfPrHyphen); }
  bool IsIdeographic() const { return IsProperty(ucfPrIdeographic); }
  bool IsJoinControl() const { return IsProperty(ucfPrJoinControl); }
  bool IsLogicalOrderException() const { return IsProperty(ucfPrLogicalOrderException); }
  bool IsNoncharacter() const { return IsProperty(ucfPrNoncharacterCodePoint); }
  bool IsQuotationMark() const { return IsProperty(ucfPrQuotationMark); }
  bool IsSoftDotted() const { return IsProperty(ucfPrSoftDotted); }
  bool IsSTerminal() const { return IsProperty(ucfPrSTerm); }
  bool IsTerminalPunctuation() const { return IsProperty(ucfPrTerminalPunctuation); }
  bool IsVariationSelector() const { return IsProperty(ucfPrVariationSelector); }
  bool IsWhiteSpace() const { return IsProperty(ucfPrWhiteSpace); }

  // Additional PropList.txt flags.
  bool IsPropertyX(const TUniChPropertiesX flag) const { return (propertiesX & flag) == flag; }
  void SetPropertyX(const TUniChPropertiesX flag) { propertiesX |= flag; }

  // Miscellaneous flags.
  bool IsCompositionExclusion() const { return (flags & ucfCompositionExclusion) == ucfCompositionExclusion; }
  bool IsCompatibilityDecomposition() const { return (flags & ucfCompatibilityDecomposition) == ucfCompatibilityDecomposition; }

  // Word-boundary flags.
  bool IsWbFlag(const TUniChFlags flag) const { Assert((flag & ucfWbMask) == flag); return (flags & flag) == flag; }
  void ClrWbAndSbFlags() { flags = flags & ~(ucfWbMask | ucfSbMask); }
  void SetWbFlag(const TUniChFlags flag) { Assert((flag & ucfWbMask) == flag); flags |= flag; }
  int GetWbFlags() const { return flags & ucfWbMask; }
  bool IsWbFormat() const { return IsWbFlag(ucfWbFormat); }
  TStr GetWbFlagsStr() const { return GetWbFlagsStr(GetWbFlags()); }
  static TStr GetWbFlagsStr(const int flags) { return TStr("") + (flags & ucfWbALetter ? "A" : "") +
    (flags & ucfWbFormat ? "F" : "") + (flags & ucfWbKatakana ? "K" : "") + (flags & ucfWbMidLetter ? "M" : "") +
    (flags & ucfWbMidNum ? "m" : "") + (flags & ucfWbNumeric ? "N" : "") + (flags & ucfWbExtendNumLet ? "E" : ""); }

  // Sentence-boundary flags.
  bool IsSbFlag(const TUniChFlags flag) const { Assert((flag & ucfSbMask) == flag); return (flags & flag) == flag; }
  void SetSbFlag(const TUniChFlags flag) { Assert((flag & ucfSbMask) == flag); flags |= flag; }
  int GetSbFlags() const { return flags & ucfSbMask; }
  bool IsSbFormat() const { return IsSbFlag(ucfSbFormat); }
  TStr GetSbFlagsStr() const { return GetSbFlagsStr(GetSbFlags()); }
  static TStr GetSbFlagsStr(const int flags) { return TStr("") + (flags & ucfSbSep ? "S" : "") +
    (flags & ucfSbFormat ? "F" : "") + (flags & ucfSbSp ? "_" : "") + (flags & ucfSbLower ? "L" : "") +
    (flags & ucfSbUpper ? "U" : "") + (flags & ucfSbOLetter ? "O" : "") + (flags & ucfSbNumeric ? "N" : "") +
    (flags & ucfSbATerm ? "A" : "") + (flags & ucfSbSTerm ? "T" : "") + (flags & ucfSbClose ? "C" : ""); }

  bool IsSbSep() const { return (flags & ucfSbSep) == ucfSbSep; }

  // Grapheme-boundary flags.
  bool IsGbExtend() const { return IsGraphemeExtend(); }

  // Sec. 3.13, D47: C is cased iff it is uppercase, lowercase, or general_category == titlecase_letter.
  bool IsCased() const { return IsUppercase() || IsLowercase() || (subCat == ucLetterTitlecase); }

  // Character categories.
  TUniChCategory GetCat() const { return (TUniChCategory) cat; }
  TUniChSubCategory GetSubCat() const { return (TUniChSubCategory) subCat; }
  // The following characters belong to the 'symbol/currency' subcategory:
  //   U+00024 DOLLAR SIGN
  //   U+000a2 CENT SIGN
  //   U+000a3 POUND SIGN
  //   U+000a4 CURRENCY SIGN
  //   U+000a5 YEN SIGN
  //   U+020a3 FRENCH FRANC SIGN
  //   U+020a4 LIRA SIGN
  //   U+020ac EURO SIGN
  //   [and plenty of others]
  bool IsCurrency() const { return subCat == ucSymbolCurrency; }
  // Note: most private-use and surrogate characters aren't listed explicitly in UnicodeData.txt.
  // Thus, it's better to call TUniChDb's versions of these methods, which are aware of
  // the full ranges of private-use and surrogate characters.
  bool IsPrivateUse() const { return subCat == ucOtherPrivateUse; }
  bool IsSurrogate() const { return subCat == ucOtherSurrogate; }

  inline static bool IsValidSubCat(const char chCat, const char chSubCat) {
    static const char s[] = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoSmScSkSoZsZlZpCcCfCsCoCn";
    for (const char *p = s; *p; p += 2)
      if (chCat == p[0] && chSubCat == p[1]) return true;
    return false; }
};

int keyId = roots.GetKeyId(TItemTr(last, butLast, butButLast)); 01204 if (keyId < 0) return 0; else return roots[keyId]; } 01205 int GetChild(const int parentIdx, const TItem& item) const { 01206 for (int childIdx = nodes[parentIdx].child; childIdx >= 0; ) { 01207 const TNode &node = nodes[childIdx]; 01208 if (node.item == item) return childIdx; 01209 childIdx = node.sib; } 01210 return -1; } 01211 bool IsNodeTerminal(const int nodeIdx) const { return nodes[nodeIdx].terminal; } 01212 01213 // Adds a new string to the trie. Note that the last characters appear 01214 // closer to the root of the trie. 01215 template<typename TSrcVec> 01216 void Add(const TSrcVec& src, const size_t srcIdx, const size_t srcCount) 01217 { 01218 IAssert(srcCount > 0); 01219 if (srcCount == 1) { singles.AddKey(TItem(src[TVecIdx(srcIdx)])); return; } 01220 if (srcCount == 2) { pairs.AddKey(TItemPr(TItem(src[TVecIdx(srcIdx + 1)]), TItem(src[TVecIdx(srcIdx)]))); return; } 01221 size_t srcLast = srcIdx + (srcCount - 1); 01222 TItemTr tr = TItemTr(TItem(src[TVecIdx(srcLast)]), TItem(src[TVecIdx(srcLast - 1)]), TItem(src[TVecIdx(srcLast - 2)])); 01223 int keyId = roots.GetKeyId(tr), curNodeIdx = -1; 01224 if (keyId >= 0) curNodeIdx = roots[keyId]; 01225 else { curNodeIdx = nodes.Add(TNode(TItem(0), -1, -1, false)); roots.AddDat(tr, curNodeIdx); } 01226 // 01227 if (srcCount > 3) for (size_t srcPos = srcLast - 3; ; ) 01228 { 01229 const TItem curItem = src[TVecIdx(srcPos)]; 01230 int childNodeIdx = nodes[curNodeIdx].child; 01231 while (childNodeIdx >= 0) { 01232 TNode &childNode = nodes[childNodeIdx]; 01233 if (childNode.item == curItem) break; 01234 childNodeIdx = childNode.sib; } 01235 if (childNodeIdx < 0) { 01236 childNodeIdx = nodes.Add(TNode(curItem, -1, nodes[curNodeIdx].child, false)); 01237 nodes[curNodeIdx].child = childNodeIdx; } 01238 curNodeIdx = childNodeIdx; 01239 if (srcPos == srcIdx) break; else srcPos--; 01240 } 01241 nodes[curNodeIdx].terminal = true; 01242 } 01243 01244 template<typename TSrcVec> 01245 void Add(const TSrcVec& src) { Add(src, 0, (size_t) src.Len()); } 01246 }; 01247 01248 //----------------------------------------------------------------------------- 01249 // TUniChDb -- provides access to the Unicode Character Database 01250 //----------------------------------------------------------------------------- 01251 01252 class TUniChDb 01253 { 01254 protected: 01255 void InitAfterLoad(); 01256 typedef TUniVecIdx TVecIdx; 01257 01258 public: 01259 THash<TInt, TUniChInfo> h; // key: codepoint 01260 TStrPool charNames; 01261 TStrIntH scripts; // keyID = used in TUniChInfo.script; key = script name; dat = number of characters (informative only) 01262 TIntV decompositions; 01263 THash<TIntPr, TInt> inverseDec; 01264 TUniCaseFolding caseFolding; 01265 // These hash tables contain only the unconditional mappings from SpecialCasing.txt. 01266 // The conditional mappings are hardcoded into GetCaseConverted(). 
01267 TIntIntVH specialCasingLower, specialCasingUpper, specialCasingTitle; 01268 int scriptUnknown; // = scripts.GetKey("Unknown") 01269 01270 TUniChDb() : scriptUnknown(-1) { } 01271 explicit TUniChDb(TSIn& SIn) { Load(SIn); } 01272 void Clr() { 01273 h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr(); 01274 specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr(); 01275 scripts.Clr(); } 01276 void Save(TSOut& SOut) const { 01277 h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut); 01278 inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut); 01279 specialCasingLower.Save(SOut); specialCasingUpper.Save(SOut); specialCasingTitle.Save(SOut); 01280 SOut.SaveCs(); } 01281 void Load(TSIn& SIn) { 01282 h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn); 01283 decompositions.Load(SIn); 01284 inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn); 01285 specialCasingLower.Load(SIn); specialCasingUpper.Load(SIn); specialCasingTitle.Load(SIn); 01286 SIn.LoadCs(); InitAfterLoad(); } 01287 void LoadBin(const TStr& fnBin) { 01288 PSIn SIn = TFIn::New(fnBin); Load(*SIn); } 01289 void Test(const TStr& basePath); 01290 01291 // File names used by LoadTxt() and its subroutines. 01292 static TStr GetCaseFoldingFn() { return "CaseFolding.txt"; } 01293 static TStr GetSpecialCasingFn() { return "SpecialCasing.txt"; } 01294 static TStr GetUnicodeDataFn() { return "UnicodeData.txt"; } 01295 static TStr GetCompositionExclusionsFn() { return "CompositionExclusions.txt"; } 01296 static TStr GetScriptsFn() { return "Scripts.txt"; } 01297 static TStr GetDerivedCorePropsFn() { return "DerivedCoreProperties.txt"; } 01298 static TStr GetLineBreakFn() { return "LineBreak.txt"; } 01299 static TStr GetPropListFn() { return "PropList.txt"; } 01300 static TStr GetAuxiliaryDir() { return "auxiliary"; } 01301 static TStr GetWordBreakTestFn() { return "WordBreakTest.txt"; } 01302 static TStr GetWordBreakPropertyFn() { return "WordBreakProperty.txt"; } 01303 static TStr GetSentenceBreakTestFn() { return "SentenceBreakTest.txt"; } 01304 static TStr GetSentenceBreakPropertyFn() { return "SentenceBreakProperty.txt"; } 01305 static TStr GetNormalizationTestFn() { return "NormalizationTest.txt"; } 01306 static TStr GetBinFn() { return "UniChDb.bin"; } // used only by Test() 01307 01308 //------------------------------------------------------------------------- 01309 // Script names 01310 //------------------------------------------------------------------------- 01311 01312 // These constants are used when initializing from the text files. 
01313 static TStr GetScriptNameUnknown() { return "Unknown"; }
01314 static TStr GetScriptNameKatakana() { return "Katakana"; }
01315 static TStr GetScriptNameHiragana() { return "Hiragana"; }
01316 //
01317 const TStr& GetScriptName(const int scriptId) const { return scripts.GetKey(scriptId); }
01318 int GetScriptByName(const TStr& scriptName) const { return scripts.GetKeyId(scriptName); }
01319 int GetScript(const TUniChInfo& ci) const { int s = ci.script; if (s < 0) s = scriptUnknown; return s; }
01320 int GetScript(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return scriptUnknown; else return GetScript(h[i]); }
01321
01322 //-------------------------------------------------------------------------
01323 // Character names
01324 //-------------------------------------------------------------------------
01325
01326 // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234" in that case.
01327 const char *GetCharName(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; int ofs = h[i].nameOffset; return ofs < 0 ? 0 : charNames.GetCStr(ofs); }
01328 TStr GetCharNameS(const int cp) const {
01329 // ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16).
01330 const char *p = GetCharName(cp); if (p) return p;
01331 char buf[20]; sprintf(buf, "U+%04x", cp); return TStr(buf); }
01332 template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, size_t srcIdx, const size_t srcCount, const TStr& prefix) const {
01333 if (! f) f = stdout;
01334 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
01335 fprintf(f, "%s", prefix.CStr());
01336 int cp = src[TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ? "U+%05x" : "U+%04x "), cp);
01337 fprintf(f, " %s\n", GetCharNameS(cp).CStr()); }}
01338 template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, const TStr& prefix) const { PrintCharNames(f, src, 0, src.Len(), prefix); }
01339
01340 //-------------------------------------------------------------------------
01341 // Character information
01342 //-------------------------------------------------------------------------
01343 // These methods provide access to a subset of the functionality
01344 // available in TUniChInfo.
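As a quick illustration of the name and script lookups above, the following sketch loads a character database and queries a single codepoint. It is an editor's example, not part of the header; the helper name and the file names ("UniChDb.bin", the UCD directory) are assumptions, and it presumes the usual SNAP/glib includes.

// Illustrative sketch only.
void PrintCodePointInfo() {
  TUniChDb ucd;
  // Build the database once from the UCD text files and cache it ...
  //   ucd.LoadTxt("path/to/ucd"); ucd.SaveBin("UniChDb.bin");
  // ... or load a previously saved binary (file name is an assumption):
  ucd.LoadBin("UniChDb.bin");
  const int cp = 0x20ac; // EURO SIGN
  printf("%s\n", ucd.GetCharNameS(cp).CStr());                  // character name, or "U+...." if unknown
  printf("%s\n", ucd.GetScriptName(ucd.GetScript(cp)).CStr());  // script name, e.g. "Common"
}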
01345 01346 bool IsGetChInfo(const int cp, TUniChInfo& ChInfo) { 01347 int i = h.GetKeyId(cp); 01348 if (i < 0) return false; else { ChInfo=h[i]; return true; }} 01349 TUniChCategory GetCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOther; else return h[i].cat; } 01350 TUniChSubCategory GetSubCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOtherNotAssigned; else return h[i].subCat; } 01351 01352 bool IsWbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsWbFlag(flag); } 01353 int GetWbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetWbFlags(); } 01354 bool IsSbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsSbFlag(flag); } 01355 int GetSbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetSbFlags(); } 01356 01357 #define ___UniFwd1(name) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); } 01358 #define ___UniFwd2(name1, name2) ___UniFwd1(name1) ___UniFwd1(name2) 01359 #define ___UniFwd3(name1, name2, name3) ___UniFwd2(name1, name2) ___UniFwd1(name3) 01360 #define ___UniFwd4(name1, name2, name3, name4) ___UniFwd3(name1, name2, name3) ___UniFwd1(name4) 01361 #define ___UniFwd5(name1, name2, name3, name4, name5) ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5) 01362 01363 #define DECLARE_FORWARDED_PROPERTY_METHODS \ 01364 ___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \ 01365 ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic) \ 01366 ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted) \ 01367 ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace) \ 01368 ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable) \ 01369 ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue) \ 01370 ___UniFwd2(IsXidStart, IsXidContinue) \ 01371 ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep) \ 01372 ___UniFwd1(IsGbExtend) \ 01373 ___UniFwd2(IsCased, IsCurrency) 01374 01375 DECLARE_FORWARDED_PROPERTY_METHODS 01376 01377 #undef ___UniFwd1 01378 01379 bool IsPrivateUse(const int cp) const { 01380 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsPrivateUse(); 01381 return (0xe000 <= cp && cp <= 0xf8ff) || // plane 0 private-use area 01382 // Planes 15 and 16 are entirely for private use. 01383 (0xf0000 <= cp && cp <= 0xffffd) || (0x100000 <= cp && cp <= 0x10fffd); } 01384 // Note: d800..dbff are high surrogates, dc00..dfff are low surrogates. 01385 // For db80..dbff it is clear that the surrogate pair containing this high surrogate 01386 // will refer to a private-use codepoint, but IsPrivateUse nevertheless returns false 01387 // for db80..dbff. This is consistent with the category codes assigned in UnicodeData.txt. 01388 bool IsSurrogate(const int cp) const { 01389 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsSurrogate(); 01390 return 0xd800 <= cp && cp <= 0xdcff; } 01391 01392 // Note: in particular, all Hangul characters (HangulLBase..HangulLBase + HangulLCount - 1 01393 // and HangulSBase..HangulSBase + HangulSCount - 1) should be treated as starters 01394 // for composition to work correctly. 
01395 int GetCombiningClass(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return TUniChInfo::ccStarter; else return h[i].combClass; } 01396 01397 //------------------------------------------------------------------------- 01398 // Hangul constants 01399 //------------------------------------------------------------------------- 01400 01401 enum { 01402 HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7, 01403 HangulLCount = 19, HangulVCount = 21, HangulTCount = 28, 01404 HangulNCount = HangulVCount * HangulTCount, // 588 01405 HangulSCount = HangulLCount * HangulNCount // 11172 01406 }; 01407 01408 //------------------------------------------------------------------------- 01409 // Word boundaries (UAX #29) 01410 //------------------------------------------------------------------------- 01411 01412 protected: 01413 // UAX #29, rule WB3: ignore Format and Extend characters. 01414 // [Note: rule SB5 for sentence boundaries is identical, and thus these methods will also be used for sentence-boundary detection.] 01415 static bool IsWbIgnored(const TUniChInfo& ci) { return ci.IsGbExtend() || ci.IsWbFormat(); } 01416 bool IsWbIgnored(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return IsWbIgnored(h[i]); } 01417 // Sets 'position' to the smallest index from 'position..srcEnd-1' that contains a non-ignored character. 01418 template<typename TSrcVec> void WbFindCurOrNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01419 while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01420 // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character. 01421 template<typename TSrcVec> void WbFindNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01422 if (position >= srcEnd) return; 01423 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01424 // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character. 01425 template<typename TSrcVec> void WbFindNextNonIgnoredS(const TSrcVec& src, size_t& position, const size_t srcEnd) const { 01426 if (position >= srcEnd) return; 01427 if (IsSbSep(src[TVecIdx(position)])) { position++; return; } 01428 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; } 01429 // Sets 'position' to the largest index from 'srcStart..position-1' that contains a non-ignored character. 01430 template<typename TSrcVec> bool WbFindPrevNonIgnored(const TSrcVec& src, const size_t srcStart, size_t& position) const { 01431 if (position <= srcStart) return false; 01432 while (position > srcStart) { 01433 position--; if (! IsWbIgnored(src[TVecIdx(position)])) return true; } 01434 return false; } 01435 // Test driver for WbFind*NonIgnored. 01436 void TestWbFindNonIgnored(const TIntV& src) const; 01437 void TestWbFindNonIgnored() const; 01438 public: 01439 // Finds the next word boundary strictly after 'position'. 01440 // Note that there is a valid word boundary at 'srcIdx + srcCount'. 01441 // If there is no such word boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'. 
01442 template<typename TSrcVec> 01443 bool FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const; 01444 // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a word 01445 // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'. Note that 'dest[0]' and 'dest[srcCount]' are 01446 // always set to 'true'. 01447 template<typename TSrcVec> 01448 void FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const; 01449 protected: 01450 void TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence); 01451 01452 //------------------------------------------------------------------------- 01453 // Sentence boundaries (UAX #29) 01454 //------------------------------------------------------------------------- 01455 01456 protected: 01457 TUniTrie<TInt> sbExTrie; 01458 01459 // Checks whether a sentence that ended at src[position - 1] 01460 // would end in one of the suffixes from sbExTrie. 01461 template<typename TSrcVec> 01462 bool CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const; 01463 01464 public: 01465 // Finds the next sentence boundary strictly after 'position'. 01466 // Note that there is a valid sentence boundary at 'srcIdx + srcCount'. 01467 // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'. 01468 template<typename TSrcVec> 01469 bool FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const; 01470 // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a sentence 01471 // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'. Note that 'dest[0]' and 'dest[srcCount]' are 01472 // always set to 'true'. 01473 template<typename TSrcVec> 01474 void FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const; 01475 01476 // These methods allow the user to define a set of sentence boundary exceptions. 01477 // This is a set of strings, stored in 'sbExTrie'. If the Unicode rules require 01478 // a sentence boundary in a position that would cause the sentence to end with 01479 // 'x (STerm | ATerm) Close* Sp* Sep?', where 'x' is a word from 'sbExTrie', 01480 // we will *not* place a sentence boundary there. 01481 // 01482 // NOTE: sbExTrie is not saved or loaded by the Save() and Load() methods. 01483 // By default, it is empty. Use SbEx_Clr() to clear it, and SbEx_SetStdEnglish() to obtain 01484 // a standard set of English-language exceptions. 
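For example, the exception list and the boundary finders above can be combined as in the following editor's sketch (the helper name is made up). It assumes 'ucd' has already been loaded and uses TUniCodec to turn a UTF-8 TStr into a vector of codepoints, just as SbEx_AddUtf8 below does.

// Illustrative sketch only.
void ShowSentenceBreaks(TUniChDb& ucd, const TStr& utf8Text) {
  ucd.SbEx_SetStdEnglish();                 // install the standard English abbreviation exceptions
  TUniCodec codec; TIntV cps;
  codec.DecodeUtf8(utf8Text, cps);          // work on codepoints, not bytes
  TBoolV isBreak;
  ucd.FindSentenceBoundaries(cps, 0, cps.Len(), isBreak);   // isBreak.Len() == cps.Len() + 1
  for (int i = 0; i <= cps.Len(); i++)
    if (isBreak[i]) printf("sentence boundary before position %d\n", i);
  // FindWordBoundaries() is used in exactly the same way for word boundaries.
}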
01485 void SbEx_Clr() { sbExTrie.Clr(); } 01486 template<class TSrcVec> void SbEx_Add(const TSrcVec& v) { sbExTrie.Add(v); } 01487 // template<> void SbEx_Add(const TStr& s) { 01488 void SbEx_Add(const TStr& s) { 01489 TIntV v; int n = s.Len(); v.Gen(n); for (int i = 0; i < n; i++) v[i] = int(uchar(s[i])); SbEx_Add(v); } 01490 void SbEx_AddUtf8(const TStr& s) { TUniCodec codec; TIntV v; codec.DecodeUtf8(s, v); SbEx_Add(v); } 01491 int SbEx_AddMulti(const TStr& words, const bool wordsAreUtf8 = true) { TStrV vec; words.SplitOnAllCh('|', vec); 01492 for (int i = 0; i < vec.Len(); i++) if (wordsAreUtf8) SbEx_AddUtf8(vec[i]); else SbEx_Add(vec[i]); 01493 return vec.Len(); } 01494 void SbEx_Set(const TUniTrie<TInt>& newTrie) { sbExTrie = newTrie; } 01495 int SbEx_SetStdEnglish() { 01496 static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g.|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv"; 01497 SbEx_Clr(); return SbEx_AddMulti(data, false); } 01498 01499 //------------------------------------------------------------------------- 01500 // Normalization, decomposition, etc. (UAX #15) 01501 //------------------------------------------------------------------------- 01502 01503 protected: 01504 // Adds, to 'dest', the decomposition of 'codePoint' (calling itself recursively if necessary). 01505 // If 'compatibility == false', only canonical decompositions are used. 01506 template<typename TDestCh> 01507 void AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const; 01508 public: 01509 // This appends, to 'dest', the decomposed form of the source string. 01510 // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false; 01511 // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true. 01512 template<typename TSrcVec, typename TDestCh> 01513 void Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01514 TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const; 01515 template<typename TSrcVec, typename TDestCh> 01516 void Decompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const { 01517 Decompose(src, 0, src.Len(), dest, compatibility, clrDest); } 01518 // This performs canonical composition on the source string, and appends 01519 // the result to the destination string. The source string should be the 01520 // result of a (canonical or compatibility) decomposition; if this is the 01521 // case, the composition will lead to a normalization form C (NFC) or 01522 // normalization form KC (NFKC), depending on whether canonical or compatibility 01523 // decomposition was used. 01524 template<typename TSrcVec, typename TDestCh> 01525 void Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01526 TVec<TDestCh>& dest, bool clrDest = true) const; 01527 template<typename TSrcVec, typename TDestCh> 01528 void Compose(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const { 01529 Compose(src, 0, src.Len(), dest, clrDest); } 01530 // Calls Decompose, followed by Compose; thus the result is the NFC (if 01531 // compatibility == false) or NFKC (if compatibility == true) of the source string. 01532 // A temporary TIntV is used to contain the intermediate NF(K)D form of the 01533 // source string. 
01534 template<typename TSrcVec, typename TDestCh>
01535 void DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01536 TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const;
01537 template<typename TSrcVec, typename TDestCh>
01538 void DecomposeAndCompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const {
01539 DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); }
01540 // Copies the starter characters from 'src' to 'dest'; the other
01541 // characters are skipped. 'src' should already have been decomposed.
01542 // Returns the number of characters extracted.
01543 template<typename TSrcVec, typename TDestCh>
01544 size_t ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01545 TVec<TDestCh>& dest, bool clrDest = true) const;
01546 template<typename TSrcVec, typename TDestCh>
01547 size_t ExtractStarters(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const {
01548 return ExtractStarters(src, 0, src.Len(), dest, clrDest); }
01549 // Extracts the starters into a temporary vector and then copies it into 'src'.
01550 template<typename TSrcVec>
01551 size_t ExtractStarters(TSrcVec& src) const {
01552 TIntV temp; size_t retVal = ExtractStarters(src, temp);
01553 src.Clr(); for (int i = 0; i < temp.Len(); i++) src.Add(temp[i]);
01554 return retVal; }
01555
01556 protected:
01557 void TestComposition(const TStr& basePath);
01558
01559 //-------------------------------------------------------------------------
01560 // Initialization from the text files
01561 //-------------------------------------------------------------------------
01562
01563 protected:
01564 void InitWordAndSentenceBoundaryFlags(const TStr& basePath);
01565 void InitScripts(const TStr& basePath);
01566 void InitLineBreaks(const TStr& basePath);
01567 void InitDerivedCoreProperties(const TStr& basePath);
01568 void InitPropList(const TStr& basePath);
01569 void InitSpecialCasing(const TStr& basePath);
01570 void LoadTxt_ProcessDecomposition(TUniChInfo& ci, TStr s);
01571 public:
01572 void LoadTxt(const TStr& basePath);
01573 void SaveBin(const TStr& fnBinUcd);
01574
01575 //-------------------------------------------------------------------------
01576 // Case conversions
01577 //-------------------------------------------------------------------------
01578
01579 public:
01580 typedef enum TCaseConversion_ { ccLower = 0, ccUpper = 1, ccTitle = 2, ccMax = 3 } TCaseConversion;
01581 // Appends the case-converted form of 'src' to 'dest'.
01582 // 'how' defines what kind of case conversion is required.
01583 // 'turkic' should be set to true iff the text is in Turkish ('tr') or Azeri ('az').
01584 // 'lithuanian' should be set to true iff the text is in Lithuanian ('lt').
01585 template<typename TSrcVec, typename TDestCh> void GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const; 01586 template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); } 01587 template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); } 01588 template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); } 01589 template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01590 template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01591 template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); } 01592 01593 // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt). 01594 // This is simpler and faster. Since each character now maps into exactly one 01595 // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.). 
01596 template<typename TSrcVec, typename TDestCh> void GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const;
01597 template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); }
01598 template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); }
01599 template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); }
01600 template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); }
01601 template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); }
01602 template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); }
01603
01604 template<typename TSrcVec> void ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const;
01605 template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); }
01606 template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); }
01607 template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); }
01608 template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src) const { ToSimpleUpperCase(src, 0, src.Len()); }
01609 template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src) const { ToSimpleLowerCase(src, 0, src.Len()); }
01610 template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src) const { ToSimpleTitleCase(src, 0, src.Len()); }
01611
01612 public:
01613 friend class TUniCaseFolding;
01614
01615 // Case folding is an alternative to the above functions. It is intended primarily
01616 // to produce strings that are suitable for comparisons. For example,
01617 // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma;
01618 // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma.
01619 // - 'turkic' enables special processing for Turkic languages (I-dot and i-dotless).
01620 // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped
01621 // into a string of two or more characters.
01622 // - Note: For best results, perform NFD(CaseFold(NFD(x))) or NFKD(CaseFold(NFKD(x))) on
01623 // each string before comparing them (see sec. 3.13 of the standard).
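To make the distinction above concrete, here is an editor's sketch of a case-insensitive comparison via case folding and of a Turkish-aware lowercasing. The helper names are made up, and 'ucd' is assumed to be a loaded TUniChDb.

// Illustrative sketch only.
bool EqCaseInsensitive(const TUniChDb& ucd, const TIntV& a, const TIntV& b) {
  TIntV fa, fb;
  ucd.GetCaseFolded(a, fa);   // full case folding, intended for comparisons
  ucd.GetCaseFolded(b, fb);   // (per the note above, folding NF(K)D forms is even more robust)
  return fa == fb;
}

void LowerCaseTurkish(const TUniChDb& ucd, const TIntV& src, TIntV& dest) {
  // turkic = true: U+0049 'I' lowercases to U+0131 (dotless i) instead of 'i'.
  ucd.GetLowerCase(src, 0, src.Len(), dest, /*clrDest=*/true, /*turkic=*/true, /*lithuanian=*/false);
}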
01624 template<typename TSrcVec, typename TDestCh> 01625 void GetCaseFolded(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 01626 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic = false) const { caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); } 01627 template<typename TSrcVec, typename TDestCh> 01628 void GetCaseFolded(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool full = true, const bool turkic = false) const { 01629 GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); } 01630 // ToCaseFolded folds the string in place. However, this means that only the simple 01631 // case foldings can be used (the full ones could increase the length of the string). 01632 template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic = false) const { caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); } 01633 template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, const bool turkic = false) const { ToCaseFolded(src, 0, src.Len(), turkic); } 01634 01635 protected: 01636 void TestCaseConversion(const TStr& source, const TStr& trueLc, const TStr& trueTc, const TStr& trueUc, bool turkic, bool lithuanian); 01637 void TestCaseConversions(); 01638 01639 //------------------------------------------------------------------------- 01640 // Text file reader for the Unicode character database 01641 //------------------------------------------------------------------------- 01642 01643 protected: 01644 01645 class TUcdFileReader 01646 { 01647 protected: 01648 TChA buf; 01649 public: 01650 TChA comment; // contains '#' and everything after it 01651 protected: 01652 FILE *f; 01653 int putBackCh; 01654 int GetCh() { 01655 if (putBackCh >= 0) { int c = putBackCh; putBackCh = EOF; return c; } 01656 return fgetc(f); } 01657 void PutBack(int c) { Assert(putBackCh == EOF); putBackCh = c; } 01658 // Returns 'false' iff the EOF was encountered before anything was read. 01659 bool ReadNextLine() { 01660 buf.Clr(); comment.Clr(); 01661 bool inComment = false, first = true; 01662 while (true) { 01663 int c = GetCh(); 01664 if (c == EOF) return ! first; 01665 else if (c == 13) { 01666 c = GetCh(); if (c != 10) PutBack(c); 01667 return true; } 01668 else if (c == 10) return true; 01669 else if (c == '#') inComment = true; 01670 if (! inComment) buf += char(c); 01671 else comment += char(c); } 01672 /*first = false;*/} 01673 private: 01674 TUcdFileReader& operator = (const TUcdFileReader& r) { Fail; return *((TUcdFileReader *) 0); } 01675 TUcdFileReader(const TUcdFileReader& r) { Fail; } 01676 public: 01677 TUcdFileReader() : f(0) { } 01678 TUcdFileReader(const TStr& fileName) : f(0), putBackCh(EOF) { Open(fileName); } 01679 void Open(const TStr& fileName) { Close(); f = fopen(fileName.CStr(), "rt"); IAssertR(f, fileName); putBackCh = EOF; } 01680 void Close() { putBackCh = EOF; if (f) { fclose(f); f = 0; }} 01681 ~TUcdFileReader() { Close(); } 01682 bool GetNextLine(TStrV& dest) { 01683 dest.Clr(); 01684 while (true) { 01685 if (! 
ReadNextLine()) return false; 01686 TStr line = buf; line.ToTrunc(); 01687 if (line.Len() <= 0) continue; 01688 line.SplitOnAllCh(';', dest, false); 01689 for (int i = 0; i < dest.Len(); i++) dest[i].ToTrunc(); 01690 return true; }} 01691 static int ParseCodePoint(const TStr& s) { 01692 int c; bool ok = s.IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); return c; } 01693 static void ParseCodePointList(const TStr& s, TIntV& dest, bool ClrDestP = true) { // space-separated list 01694 if (ClrDestP) dest.Clr(); 01695 TStrV parts; s.SplitOnWs(parts); 01696 for (int i = 0; i < parts.Len(); i++) { 01697 int c; bool ok = parts[i].IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); 01698 dest.Add(c); } } 01699 static void ParseCodePointRange(const TStr& s, int& from, int &to) { // xxxx or xxxx..yyyy 01700 int i = s.SearchStr(".."); if (i < 0) { from = ParseCodePoint(s); to = from; return; } 01701 from = ParseCodePoint(s.GetSubStr(0, i - 1)); 01702 to = ParseCodePoint(s.GetSubStr(i + 2, s.Len() - 1)); } 01703 }; 01704 01705 //------------------------------------------------------------------------- 01706 // Helper class for processing the text files 01707 //------------------------------------------------------------------------- 01708 // Files such as DerivedCoreProps.txt often refer to ranges of codepoints, 01709 // and not all codepoints from the range have also been listed in 01710 // UnicodeData.txt. Thus, new TUniChInfo instances will be created 01711 // when processing DerivedCoreProps.txt and similar files. 01712 // To assign the correct (sub)categories to these new codepoints, 01713 // the following class will extract the subcategory info from the 01714 // comments in DerivedCoreProps.txt and similar files. 01715 01716 class TSubcatHelper 01717 { 01718 public: 01719 bool hasCat; TUniChSubCategory subCat; 01720 TStrH invalidCatCodes; 01721 TUniChDb &owner; 01722 01723 TSubcatHelper(TUniChDb &owner_) : owner(owner_) { } 01724 01725 void ProcessComment(TUniChDb::TUcdFileReader &reader) 01726 { 01727 hasCat = false; subCat = ucOtherNotAssigned; 01728 if (reader.comment.Len() > 3) 01729 { 01730 IAssert(reader.comment[0] == '#'); 01731 IAssert(reader.comment[1] == ' '); 01732 char chCat = reader.comment[2], chSubCat = reader.comment[3]; 01733 if (reader.comment.Len() > 4) IAssert(isspace(uchar(reader.comment[4]))); 01734 if (TUniChInfo::IsValidSubCat(chCat, chSubCat)) { 01735 hasCat = true; subCat = (TUniChSubCategory) ((int(uchar(chCat)) << 8) | (int(uchar(chSubCat)))); } 01736 else invalidCatCodes.AddKey(TStr(chCat) + TStr(chSubCat)); 01737 } 01738 } 01739 01740 void SetCat(const int cp) { 01741 int i = owner.h.GetKeyId(cp); IAssert(i >= 0); 01742 IAssert(owner.h[i].subCat == ucOtherNotAssigned); 01743 IAssert(hasCat); 01744 owner.h[i].SetCatAndSubCat(subCat); } 01745 void TestCat(const int cp) { 01746 if (! hasCat) return; 01747 int i = owner.h.GetKeyId(cp); IAssert(i >= 0); 01748 IAssert(owner.h[i].subCat == subCat); } 01749 01750 ~TSubcatHelper() 01751 { 01752 if (invalidCatCodes.IsKey("L&")) invalidCatCodes.DelKey("L&"); 01753 // Output any unexpected ones (there shouldn't be any). 01754 if (! 
invalidCatCodes.Empty()) { 01755 printf("Invalid cat code(s) in the comments: "); 01756 for (int i = invalidCatCodes.FFirstKeyId(); invalidCatCodes.FNextKeyId(i); ) 01757 printf(" \"%s\"", invalidCatCodes.GetKey(i).CStr()); 01758 printf("\n"); } 01759 } 01760 }; 01761 }; 01762 01763 //----------------------------------------------------------------------------- 01764 // TUnicode -- a sadly emasculated wrapper around TUniCodec and TUniChDb 01765 //----------------------------------------------------------------------------- 01766 01767 class TUnicode 01768 { 01769 public: 01770 TUniCodec codec; 01771 TUniChDb ucd; 01772 01773 TUnicode() { Init(); } 01774 explicit TUnicode(const TStr& fnBinUcd) { ucd.LoadBin(fnBinUcd); Init(); } 01775 void Init() { InitCodecs(); } 01776 01777 //----------------------------------------------------------------------- 01778 // UTF-8 01779 //----------------------------------------------------------------------- 01780 01781 // Returns the number of characters that have been successfully decoded. 01782 // This does not include any replacement characters that may have been inserted into 'dest'. 01783 int DecodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); } 01784 int DecodeUtf8(const TStr& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); } 01785 01786 // Returns the number of characters that have been successfully encoded. 01787 // This does not include any replacement characters that may have been inserted into 'dest'. 01788 int EncodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.EncodeUtf8(src, dest); } 01789 01790 // The following wrapper around the UTF-8 encoder returns a TStr containing 01791 // the UTF-8-encoded version of the input string. 01792 TStr EncodeUtf8Str(const TIntV& src) const { return codec.EncodeUtf8Str(src); } 01793 01794 //----------------------------------------------------------------------- 01795 // UTF-16 Decoder 01796 //----------------------------------------------------------------------- 01797 01798 // Returns the number of characters that have been successfully decoded. 01799 // This does not include any replacement characters that may have been inserted into 'dest'. 01800 // Each element of 'src' is assumed to contain one byte of data. 01801 // srcCount must be even (though srcIdx doesn't need to be). 01802 int DecodeUtf16FromBytes(const TIntV& src, TIntV& dest, 01803 const TUtf16BomHandling bomHandling = bomAllowed, 01804 const TUniByteOrder defaultByteOrder = boMachineEndian) const { 01805 return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); } 01806 01807 // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings 01808 // are used to determine if the two bytes of each word should be swapped before further 01809 // processing. For example, if a BOM is present, it must have the value 0xfeff; if it 01810 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped. 01811 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the 01812 // beginning of the source data is used to determine the "original" byte order of the data; 01813 // if this doesn't match the byte order of the local machine, the two bytes of each word will 01814 // be swapped during the decoding process. 
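The UTF-8 and UTF-16 wrappers above can be exercised as in the following editor's sketch; the function name and the byte values are made up for illustration.

// Illustrative sketch only.
void DecodeExamples(const TUnicode& unicode) {
  // UTF-8: TStr -> codepoints -> TStr round trip.
  TIntV cps; unicode.DecodeUtf8(TStr("na\xc3\xafve"), cps);   // UTF-8 bytes of "naive" with i-diaeresis
  TStr again = unicode.EncodeUtf8Str(cps);
  // UTF-16 given as one byte per element: FF FE 41 00 42 00 (little-endian BOM, then 'A', 'B').
  TIntV bytes;
  bytes.Add(0xff); bytes.Add(0xfe); bytes.Add(0x41); bytes.Add(0x00); bytes.Add(0x42); bytes.Add(0x00);
  TIntV decoded;
  unicode.DecodeUtf16FromBytes(bytes, decoded);   // the BOM selects the byte order; decoded = {0x41, 0x42}
}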
01815 int DecodeUtf16FromWords(const TIntV& src, TIntV& dest, 01816 const TUtf16BomHandling bomHandling = bomAllowed, 01817 const TUniByteOrder defaultByteOrder = boMachineEndian) const { 01818 return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); } 01819 01820 //----------------------------------------------------------------------- 01821 // UTF-16 Encoder 01822 //----------------------------------------------------------------------- 01823 01824 // Returns the number of characters that have been successfully encoded. 01825 // This does not include any replacement characters that may have been inserted into 'dest'. 01826 int EncodeUtf16ToWords(const TIntV& src, TIntV& dest, const bool insertBom, 01827 const TUniByteOrder destByteOrder = boMachineEndian) const { 01828 return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); } 01829 01830 int EncodeUtf16ToBytes(const TIntV& src, TIntV& dest, const bool insertBom, 01831 const TUniByteOrder destByteOrder = boMachineEndian) const { 01832 return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); } 01833 01834 //----------------------------------------------------------------------- 01835 // 8-bit codecs 01836 //----------------------------------------------------------------------- 01837 01838 T8BitCodec<TEncoding_ISO8859_1> iso8859_1; 01839 T8BitCodec<TEncoding_ISO8859_2> iso8859_2; 01840 T8BitCodec<TEncoding_ISO8859_3> iso8859_3; 01841 T8BitCodec<TEncoding_ISO8859_4> iso8859_4; 01842 T8BitCodec<TEncoding_YuAscii> yuAscii; 01843 T8BitCodec<TEncoding_CP1250> cp1250; 01844 T8BitCodec<TEncoding_CP852> cp852; 01845 T8BitCodec<TEncoding_CP437> cp437; 01846 01847 //----------------------------------------------------------------------- 01848 // Codec registry 01849 //----------------------------------------------------------------------- 01850 // If you know you'll need ISO-8859-2, just use 01851 // TUnicode unicode; 01852 // unicode.iso8859_2.Encode(...); 01853 // If you don't know what you'll need, use: 01854 // TUnicode unicode; 01855 // PCodecBase myCodec = unicode.GetCodec(myCodecName); 01856 // myCodec->Encode(...); 01857 // Note that the first approach is slightly more efficient because there 01858 // aren't any virtual method calls involved. 01859 01860 protected: 01861 THash<TStr, PCodecBase> codecs; 01862 static inline TStr NormalizeCodecName(const TStr& name) { 01863 TStr s = name.GetLc(); s.ChangeStrAll("_", ""); s.ChangeStrAll("-", ""); return s; } 01864 public: 01865 void RegisterCodec(const TStr& nameList, const PCodecBase& codec) { 01866 TStrV names; nameList.SplitOnWs(names); 01867 for (int i = 0; i < names.Len(); i++) 01868 codecs.AddDat(NormalizeCodecName(names[i]), codec); } 01869 void UnregisterCodec(const TStr& nameList) { 01870 TStrV names; nameList.SplitOnWs(names); 01871 for (int i = 0; i < names.Len(); i++) 01872 codecs.DelKey(NormalizeCodecName(names[i])); } 01873 void ClrCodecs() { codecs.Clr(); } 01874 void InitCodecs(); 01875 PCodecBase GetCodec(const TStr& name) const { 01876 TStr s = NormalizeCodecName(name); 01877 PCodecBase p; if (! codecs.IsKeyGetDat(s, p)) p.Clr(); 01878 return p; } 01879 void GetAllCodecs(TCodecBaseV& dest) const { 01880 dest.Clr(); 01881 for (int i = codecs.FFirstKeyId(); codecs.FNextKeyId(i); ) { 01882 PCodecBase codec = codecs[i]; bool found = false; 01883 for (int j = 0; j < dest.Len(); j++) if (dest[j]() == codec()) { found = true; break; } 01884 if (! 
found) dest.Add(codec); }} 01885 01886 //------------------------------------------------------------------------- 01887 // Word boundaries (UAX #29) 01888 //------------------------------------------------------------------------- 01889 01890 // Finds the next word boundary strictly after 'position'. 01891 // Note that there are valid word boundaries at 0 and at 'src.Len()'. 01892 // If there is no such word boundary, it returns 'false' and sets 'position' to 'src.Len()'. 01893 bool FindNextWordBoundary(const TIntV& src, int &position) const { 01894 if (position < 0) { position = 0; return true; } 01895 size_t position_; bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; } 01896 // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a word 01897 // boundary between 'src[i - 1]' and 'src[i]'. Note that 'dest[0]' and 'dest[src.Len()]' are 01898 // always set to 'true'. 01899 void FindWordBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindWordBoundaries(src, 0, src.Len(), dest); } 01900 01901 //------------------------------------------------------------------------- 01902 // Sentence boundaries (UAX #29) 01903 //------------------------------------------------------------------------- 01904 01905 // Finds the next sentence boundary strictly after 'position'. 01906 // Note that there are valid sentence boundaries at 0 and at 'src.Len()'. 01907 // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'src.Len()'. 01908 bool FindNextSentenceBoundary(const TIntV& src, int &position) const { 01909 if (position < 0) { position = 0; return true; } 01910 size_t position_; bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; } 01911 // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a sentence 01912 // boundary between 'src[i - 1]' and 'src[i]'. Note that 'dest[0]' and 'dest[src.Len()]' are 01913 // always set to 'true'. 01914 void FindSentenceBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); } 01915 01916 void ClrSentenceBoundaryExceptions() { ucd.SbEx_Clr(); } 01917 void UseEnglishSentenceBoundaryExceptions() { ucd.SbEx_SetStdEnglish(); } 01918 01919 //------------------------------------------------------------------------- 01920 // Normalization, decomposition, etc. (UAX #15) 01921 //------------------------------------------------------------------------- 01922 01923 // This sets 'dest' to the decomposed form of the source string. 01924 // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false; 01925 // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true. 01926 void Decompose(const TIntV& src, TIntV& dest, bool compatibility) const { ucd.Decompose(src, dest, compatibility, true); } 01927 // This performs canonical composition on the source string, and stores 01928 // the result in the destination vector. The source string should be the 01929 // result of a (canonical or compatibility) decomposition; if this is the 01930 // case, the composition will lead to a normalization form C (NFC) or 01931 // normalization form KC (NFKC), depending on whether canonical or compatibility 01932 // decomposition was used. 
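The normalization wrappers in this section compose as in the following editor's sketch (helper name made up; 'unicode' is assumed to wrap a loaded character database). It produces the NFC form of a codepoint vector and, separately, a copy with diacritics stripped.

// Illustrative sketch only.
void NormalizeExamples(const TUnicode& unicode, const TIntV& src) {
  TIntV nfc;
  unicode.DecomposeAndCompose(src, nfc, /*compatibility=*/false);   // NFC
  // Removing diacritics: decompose first, then keep only the starters.
  TIntV stripped;
  unicode.Decompose(src, stripped, /*compatibility=*/false);        // NFD
  unicode.ExtractStarters(stripped);                                // in place; e.g. U+00E9 'e-acute' -> 'e'
}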
01933 void Compose(const TIntV& src, TIntV& dest) const { return ucd.Compose(src, dest, true); } 01934 // Calls Decompose, followed by Compose; thus the result is the NFC (if 01935 // compatibility == false) or NFKC (if compatibility == true) of the source string. 01936 // A temporary TIntV is used to contain the intermediate NF(K)D form of the 01937 // source string. 01938 void DecomposeAndCompose(const TIntV& src, TIntV& dest, bool compatibility) const { return ucd.DecomposeAndCompose(src, dest, compatibility); } 01939 // Copies the starter characters from 'src' to 'dest'; the other 01940 // characters are skipped. 'src' should already have been decomposed. 01941 // Returns the number of characters extracted. This function can be 01942 // used to remove diacritical marks from a string (after it has been decomposed!). 01943 int ExtractStarters(const TIntV& src, TIntV& dest) const { return (int) ucd.ExtractStarters(src, dest); } 01944 // Extracts the starters into a temporary vector and then copies it into 'src'. 01945 int ExtractStarters(TIntV& src) const { return (int) ucd.ExtractStarters(src); } 01946 01947 //------------------------------------------------------------------------- 01948 // Case conversions 01949 //------------------------------------------------------------------------- 01950 // NOTE: if you will be dealing with Turkish, Azeri or Lithuanian text, 01951 // use the case-conversion methods in TUniChDb, which allow the caller 01952 // to request language-specific case mappings for these languages. 01953 01954 public: 01955 typedef TUniChDb::TCaseConversion TCaseConversion; 01956 // Sets 'dest' to the case-converted form of 'src'. 01957 void GetLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetLowerCase(src, dest, true, false, false); } 01958 void GetUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetUpperCase(src, dest, true, false, false); } 01959 void GetTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetTitleCase(src, dest, true, false, false); } 01960 01961 // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt). 01962 // This is simpler and faster. Since each character now maps into exactly one 01963 // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.). 01964 void GetSimpleLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleLowerCase(src, dest, true); } 01965 void GetSimpleUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleUpperCase(src, dest, true); } 01966 void GetSimpleTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleTitleCase(src, dest, true); } 01967 01968 // These functions perform simple case-conversions in-place. 01969 void ToSimpleUpperCase(TIntV& src) const { ucd.ToSimpleUpperCase(src); } 01970 void ToSimpleLowerCase(TIntV& src) const { ucd.ToSimpleLowerCase(src); } 01971 void ToSimpleTitleCase(TIntV& src) const { ucd.ToSimpleTitleCase(src); } 01972 01973 // Case folding is an alternative to the above functions. It is intended primarily 01974 // to produce strings that are suitable for comparisons. For example, 01975 // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma; 01976 // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma. 01977 // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped 01978 // into a string of two or more characters. 01979 // - Note: For best results, perform NFD(CaseFold(NFD(x)) or NFKD(CaseFold(NFKD(x)) on 01980 // each string before comparing them (see sec. 
3.13 of the standard). 01981 void GetCaseFolded(const TIntV& src, TIntV& dest, const bool full = true) const { return ucd.GetCaseFolded(src, dest, true, full, false); } 01982 // ToCaseFolded folds the string in place. However, this means that only the simple 01983 // case foldings can be used (the full ones could increase the length of the string). 01984 void ToCaseFolded(TIntV& src) const { return ucd.ToCaseFolded(src, false); } 01985 01986 TStr GetUtf8CaseFolded(const TStr& s) const { 01987 bool isAscii = true; 01988 for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; } 01989 if (isAscii) return s.GetLc(); 01990 TIntV src; DecodeUtf8(s, src); 01991 TIntV dest; GetCaseFolded(src, dest); 01992 return EncodeUtf8Str(dest); } 01993 01994 //------------------------------------------------------------------------- 01995 // Character properties 01996 //------------------------------------------------------------------------- 01997 // These methods simply call the corresponding TUniChDb method 01998 // (which typically calls the corresponding method of TUniChInfo). 01999 // See the declaration for DECLARE_FORWARDED_PROPERTY_METHODS for a complete list. 02000 // They are all of the form bool IsXxxx(const int cp) const 02001 // Some of the more notable ones include: 02002 // - IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsAsciiHexDigit 02003 // IsDash, IsDeprecated, IsDiacritic, IsHexDigit, IsHyphen, IsIdeographic 02004 // IsNoncharacter, IsQuotationMark, IsSoftDotted, IsTerminalPunctuation, IsWhiteSpace 02005 02006 #define ___UniFwd1(name) bool name(const int cp) const { return ucd.name(cp); } 02007 DECLARE_FORWARDED_PROPERTY_METHODS 02008 #undef DECLARE_FORWARDED_PROPERTY_METHODS 02009 #undef __UniFwd1 02010 ___UniFwd2(IsPrivateUse, IsSurrogate) 02011 02012 TUniChCategory GetCat(const int cp) const { return ucd.GetCat(cp); } 02013 TUniChSubCategory GetSubCat(const int cp) const { return ucd.GetSubCat(cp); } 02014 02015 // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234". 02016 const char *GetCharName(const int cp) const { return ucd.GetCharName(cp); } 02017 TStr GetCharNameS(const int cp) const { return ucd.GetCharNameS(cp); } 02018 02019 }; 02020 02021 //----------------------------------------------------------------------------- 02022 // TUniCodec -- UTF-8 Decoder 02023 //----------------------------------------------------------------------------- 02024 02025 // Returns the number of characters that have been successfully decoded. 02026 // This does not include any replacement characters that may have been inserted into 'dest'. 02027 template<typename TSrcVec, typename TDestCh> 02028 size_t TUniCodec::DecodeUtf8( 02029 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02030 TVec<TDestCh>& dest, const bool clrDest) const 02031 { 02032 size_t nDecoded = 0; 02033 if (clrDest) dest.Clr(); 02034 const size_t origSrcIdx = srcIdx; 02035 const size_t srcEnd = srcIdx + srcCount; 02036 while (srcIdx < srcEnd) 02037 { 02038 const size_t charSrcIdx = srcIdx; 02039 uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++; 02040 if ((c & _1000_0000) == 0) { 02041 // c is one of the characters 0..0x7f, encoded as a single byte. 02042 dest.Add(TDestCh(c)); nDecoded++; continue; } 02043 else if ((c & _1100_0000) == _1000_0000) { 02044 // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx. 02045 // We must have been thrown into the middle of a multi-byte character. 
02046 switch (errorHandling) { 02047 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx."); 02048 case uehAbort: return nDecoded; 02049 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02050 case uehIgnore: continue; 02051 default: Fail; } } 02052 else 02053 { 02054 // c introduces a sequence of 2..6 bytes, depending on how many 02055 // of the most significant bits of c are set. 02056 uint nMoreBytes = 0, nBits = 0, minVal = 0; 02057 if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80; 02058 else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800; 02059 else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000; 02060 else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000; 02061 else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000; 02062 else { 02063 // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8 02064 // (which allowed the encoding of codepoints up to 2^31 - 1). However, in principle this 02065 // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh 02066 // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh. 02067 if (strict) { 02068 switch (errorHandling) { 02069 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x."); 02070 case uehAbort: return nDecoded; 02071 // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes 02072 // and try to decode the character. Then, since 'strict' is true and 02073 // the codepoint is clearly >= 2^31, we'll notice this as an error later 02074 // and (in the case of uehReplace) insert a replacement character then. 02075 // This is probably better than inserting a replacement character right 02076 // away and then trying to read the next byte as if a new character 02077 // was beginning there -- if the current byte is really followed by five 02078 // 10xxxxxx bytes, we'll just get six replacement characters in a row. 02079 case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue; 02080 case uehIgnore: break; // continue; 02081 default: Fail; } } 02082 nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; } 02083 // Decode this multi-byte sequence. 02084 uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c. 02085 bool cancel = false; 02086 for (uint i = 0; i < nMoreBytes && ! cancel; i++) { 02087 // See if there are enough bytes left in the source vector. 02088 if (! (srcIdx < srcEnd)) { 02089 switch (errorHandling) { 02090 case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available."); 02091 case uehAbort: return nDecoded; 02092 case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue; 02093 case uehIgnore: cancel = true; continue; 02094 default: Fail; } } 02095 // Read the next byte. 02096 c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++; 02097 if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx. 
02098 switch (errorHandling) { 02099 case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx."); 02100 case uehAbort: return nDecoded; 02101 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue; 02102 case uehIgnore: srcIdx--; cancel = true; continue; 02103 default: Fail; } } 02104 cOut <<= 6; cOut |= (c & _0011_1111); } 02105 if (cancel) continue; 02106 if (strict) { 02107 // err1: This codepoint has been represented by more bytes than it should have been. 02108 // For example, cOut in the range 0..127 should be represented by a single byte, 02109 // not by two or more bytes. 02110 // - For example, this may happen in the "modified UTF-8" sometimes used for Java 02111 // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid 02112 // the appearance of null bytes in the encoded stream. 02113 bool err1 = (cOut < minVal); 02114 // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes. 02115 // However, later this was restricted to the codepoints 0..0x10ffff only, because only these 02116 // are valid Unicode codepoints. Thus, no more than 4 bytes are ever necessary. 02117 bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff)); 02118 if (err1 || err2) switch (errorHandling) { 02119 case uehThrow: 02120 if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ")."); 02121 else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid."); 02122 else { Fail; break; } 02123 case uehAbort: return nDecoded; 02124 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02125 case uehIgnore: continue; 02126 default: Fail; } } 02127 // Add the decoded codepoint to the destination vector. 02128 // If this is the first decoded character, and it's one of the byte-order marks 02129 // (0xfffe and 0xfeff), we will skip it (unless skipBom is false). 02130 if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) { 02131 dest.Add(cOut); nDecoded++; } 02132 } // else (multi-byte sequence) 02133 } // while 02134 return nDecoded; 02135 } 02136 02137 //----------------------------------------------------------------------- 02138 // TUniCodec -- UTF-8 Encoder 02139 //----------------------------------------------------------------------- 02140 02141 // Returns the number of characters that have been successfully encoded. 02142 // This does not include any replacement characters that may have been inserted into 'dest'. 
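// For illustration, a minimal round-trip sketch (not part of the header; it assumes a
// default-constructed codec and TIntV containers, with one byte per element in 'utf8'):
//   TUniCodec codec;                       // uehIgnore, non-strict, skipBom == true
//   TIntV codepoints; codepoints.Add(0x48); codepoints.Add(0x20ac);   // 'H', EURO SIGN
//   TIntV utf8, decoded;
//   codec.EncodeUtf8(codepoints, 0, codepoints.Len(), utf8, true);    // utf8: 48 e2 82 ac
//   codec.DecodeUtf8(utf8, 0, utf8.Len(), decoded, true);             // decoded == codepoints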
02143 template<typename TSrcVec, typename TDestCh>
02144 size_t TUniCodec::EncodeUtf8(
02145 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02146 TVec<TDestCh>& dest, const bool clrDest) const
02147 {
02148 size_t nEncoded = 0;
02149 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
02150 {
02151 uint c = uint(src[TVecIdx(srcIdx)]);
02152 bool err = false;
02153 if (strict && c > 0x10ffff) {
02154 err = true;
02155 switch (errorHandling) {
02156 case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed).");
02157 case uehAbort: return nEncoded;
02158 case uehReplace: c = replacementChar; break;
02159 case uehIgnore: continue;
02160 default: Fail; } }
02161 if (c < 0x80u)
02162 dest.Add(TDestCh(c & 0xffu));
02163 else if (c < 0x800u) {
02164 dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));
02165 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02166 else if (c < 0x10000u) {
02167 dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));
02168 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02169 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02170 else if (c < 0x200000u) {
02171 dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));
02172 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02173 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02174 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02175 else if (c < 0x4000000u) {
02176 dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));
02177 dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
02178 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02179 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02180 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02181 else {
02182 dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));
02183 dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));
02184 dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
02185 dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02186 dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02187 dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02188 if (! err) nEncoded++;
02189 }
02190 return nEncoded;
02191 }
02192
02193 //-----------------------------------------------------------------------
02194 // TUniCodec -- UTF-16 Decoder
02195 //-----------------------------------------------------------------------
02196
02197 // Returns the number of characters that have been successfully decoded.
02198 // This does not include any replacement characters that may have been inserted into 'dest'.
02199 // Each element of 'src' is assumed to contain one byte of data.
02200 // srcCount must be even (though srcIdx doesn't need to be).
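// For illustration, a minimal sketch of decoding a UTF-16 byte stream that starts with a
// little-endian BOM (not part of the header; the byte values are an assumed example):
//   TUniCodec codec;
//   TIntV bytes;                           // "Hi" as UTF-16-LE with BOM: ff fe 48 00 69 00
//   int raw[] = { 0xff, 0xfe, 0x48, 0x00, 0x69, 0x00 };
//   for (int i = 0; i < 6; i++) bytes.Add(raw[i]);
//   TIntV cps;
//   codec.DecodeUtf16FromBytes(bytes, 0, bytes.Len(), cps, true, bomAllowed, boMachineEndian);
//   // The BOM selects little-endian order and is skipped (skipBom == true); cps holds 0x48, 0x69.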
02201 template<typename TSrcVec, typename TDestCh> 02202 size_t TUniCodec::DecodeUtf16FromBytes( 02203 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02204 TVec<TDestCh>& dest, const bool clrDest, 02205 const TUtf16BomHandling bomHandling, 02206 const TUniByteOrder defaultByteOrder) const 02207 { 02208 IAssert(srcCount % 2 == 0); 02209 IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored); 02210 IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian); 02211 if (clrDest) dest.Clr(); 02212 size_t nDecoded = 0; 02213 if (srcCount <= 0) return nDecoded; 02214 const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; 02215 bool littleEndian = false; 02216 bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian())); 02217 if (bomHandling == bomIgnored) littleEndian = leDefault; 02218 else if (bomHandling == bomAllowed || bomHandling == bomRequired) 02219 { 02220 int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; 02221 if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; } 02222 else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; } 02223 else if (bomHandling == bomAllowed) littleEndian = leDefault; 02224 else { // Report an error. 02225 switch (errorHandling) { 02226 case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead)."); 02227 case uehAbort: case uehReplace: case uehIgnore: return size_t(-1); 02228 default: Fail; } } 02229 } 02230 else Fail; 02231 while (srcIdx < srcEnd) 02232 { 02233 const size_t charSrcIdx = srcIdx; 02234 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2; 02235 uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8)); 02236 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) 02237 { 02238 // c is the first character in a surrogate pair. Read the next character. 02239 if (! (srcIdx + 2 <= srcEnd)) { 02240 switch (errorHandling) { 02241 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing."); 02242 case uehAbort: return nDecoded; 02243 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02244 case uehIgnore: continue; 02245 default: Fail; } } 02246 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2; 02247 uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8)); 02248 // c2 should be the second character of the surrogate pair. 02249 if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) { 02250 switch (errorHandling) { 02251 case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." 
+ TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + "."); 02252 case uehAbort: return nDecoded; 02253 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character 02254 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue; 02255 case uehIgnore: srcIdx -= 2; continue; 02256 default: Fail; } } 02257 // c and c2 each contain 10 bits of information. 02258 uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate); 02259 cc += 0x10000; 02260 dest.Add(TDestCh(cc)); nDecoded++; continue; 02261 } 02262 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) { 02263 switch (errorHandling) { 02264 case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair."); 02265 case uehAbort: return nDecoded; 02266 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02267 case uehIgnore: continue; 02268 default: Fail; } } 02269 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it. 02270 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue; 02271 // Otherwise, store 'c' to the destination vector. 02272 dest.Add(TDestCh(c)); nDecoded++; 02273 } 02274 return nDecoded; 02275 } 02276 02277 // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings 02278 // are used to determine if the two bytes of each word should be swapped before further 02279 // processing. For example, if a BOM is present, it must have the value 0xfeff; if it 02280 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped. 02281 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the 02282 // beginning of the source data is used to determine the "original" byte order of the data; 02283 // if this doesn't match the byte order of the local machine, the two bytes of each word will 02284 // be swapped during the decoding process. 02285 template<typename TSrcVec, typename TDestCh> 02286 size_t TUniCodec::DecodeUtf16FromWords( 02287 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02288 TVec<TDestCh>& dest, bool clrDest, 02289 const TUtf16BomHandling bomHandling, 02290 const TUniByteOrder defaultByteOrder) const 02291 { 02292 IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored); 02293 IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian); 02294 if (clrDest) dest.Clr(); 02295 size_t nDecoded = 0; 02296 if (srcCount <= 0) return nDecoded; 02297 const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; 02298 bool swap = false; 02299 bool isMachineLe = IsMachineLittleEndian(); 02300 bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe)); 02301 if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe); 02302 else if (bomHandling == bomAllowed || bomHandling == bomRequired) 02303 { 02304 int c = uint(src[TVecIdx(srcIdx)]) & 0xffff; 02305 if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; } 02306 else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; } 02307 else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe); 02308 else { // Report an error. 
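// (With bomRequired and no BOM present, uehThrow raises an exception; the other modes give up
// and return size_t(-1) rather than guessing the byte order -- see the switch below.)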
02309 switch (errorHandling) { 02310 case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead)."); 02311 case uehAbort: case uehReplace: case uehIgnore: return size_t(-1); 02312 default: Fail; } } 02313 } 02314 else Fail; 02315 while (srcIdx < srcEnd) 02316 { 02317 const size_t charSrcIdx = srcIdx; 02318 uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++; 02319 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8); 02320 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023) 02321 { 02322 // c is the first character in a surrogate pair. Read the next character. 02323 if (! (srcIdx < srcEnd)) { 02324 switch (errorHandling) { 02325 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing."); 02326 case uehAbort: return nDecoded; 02327 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02328 case uehIgnore: continue; 02329 default: Fail; } } 02330 uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++; 02331 if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); 02332 // c2 should be the second character of the surrogate pair. 02333 if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) { 02334 switch (errorHandling) { 02335 case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + "."); 02336 case uehAbort: return nDecoded; 02337 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character 02338 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue; 02339 case uehIgnore: srcIdx -= 1; continue; 02340 default: Fail; } } 02341 // c and c2 each contain 10 bits of information. 02342 uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate); 02343 cc += 0x10000; 02344 dest.Add(TDestCh(cc)); nDecoded++; continue; 02345 } 02346 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) { 02347 switch (errorHandling) { 02348 case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair."); 02349 case uehAbort: return nDecoded; 02350 case uehReplace: dest.Add(TDestCh(replacementChar)); continue; 02351 case uehIgnore: continue; 02352 default: Fail; } } 02353 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it. 02354 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue; 02355 // Otherwise, store 'c' to the destination vector. 02356 dest.Add(TDestCh(c)); nDecoded++; 02357 } 02358 return nDecoded; 02359 } 02360 02361 //----------------------------------------------------------------------- 02362 // TUniCodec -- UTF-16 Encoder 02363 //----------------------------------------------------------------------- 02364 02365 // Returns the number of characters that have been successfully encoded. 02366 // This does not include any replacement characters that may have been inserted into 'dest'. 
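// For illustration, a minimal sketch of encoding a supplementary-plane codepoint (not part of
// the header; machine byte order is requested so the word values below do not depend on the
// endianness of the local machine):
//   TUniCodec codec;
//   TIntV cps; cps.Add(0x1f600);           // needs a surrogate pair
//   TIntV words;                            // one 16-bit value per element
//   codec.EncodeUtf16ToWords(cps, 0, cps.Len(), words, true, true, boMachineEndian);
//   // words: feff d83d de00 -- a BOM followed by the surrogate pair for U+1F600.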
02367 template<typename TSrcVec, typename TDestCh> 02368 size_t TUniCodec::EncodeUtf16ToWords( 02369 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02370 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom, 02371 const TUniByteOrder destByteOrder) const 02372 { 02373 bool isMachineLe = IsMachineLittleEndian(); 02374 bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe); 02375 size_t nEncoded = 0, srcEnd = srcIdx + srcCount; 02376 if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; } 02377 while (srcIdx < srcEnd) 02378 { 02379 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++; 02380 if (! (c <= 0x10ffffu)) { 02381 switch (errorHandling) { 02382 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ")."); 02383 case uehAbort: return nEncoded; 02384 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02385 case uehIgnore: continue; 02386 default: Fail; } } 02387 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) { 02388 switch (errorHandling) { 02389 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ")."); 02390 case uehAbort: return nEncoded; 02391 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02392 case uehIgnore: continue; 02393 default: Fail; } } 02394 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) { 02395 switch (errorHandling) { 02396 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true."); 02397 case uehAbort: return nEncoded; 02398 case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue; 02399 case uehIgnore: continue; 02400 default: Fail; } } 02401 // If c is <= 0xffff, it can be stored directly. 02402 if (c <= 0xffffu) { 02403 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8); 02404 dest.Add(TDestCh(c)); nEncoded++; continue; } 02405 // Otherwise, represent c by a pair of surrogate characters. 02406 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu); 02407 uint c1 = (c >> 10) & 1023, c2 = c & 1023; 02408 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate; 02409 if (swap) { 02410 c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8); 02411 c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); } 02412 dest.Add(TDestCh(c1)); 02413 dest.Add(TDestCh(c2)); 02414 nEncoded++; continue; 02415 } 02416 return nEncoded; 02417 } 02418 02419 template<typename TSrcVec, typename TDestCh> 02420 size_t TUniCodec::EncodeUtf16ToBytes( 02421 const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02422 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom, 02423 const TUniByteOrder destByteOrder) const 02424 { 02425 bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian())); 02426 size_t nEncoded = 0, srcEnd = srcIdx + srcCount; 02427 if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 
0xfe : 0xff); nEncoded++; } 02428 while (srcIdx < srcEnd) 02429 { 02430 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++; 02431 if (! (c <= 0x10ffffu)) { 02432 switch (errorHandling) { 02433 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ")."); 02434 case uehAbort: return nEncoded; 02435 #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); } 02436 case uehReplace: ___OutRepl; continue; 02437 case uehIgnore: continue; 02438 default: Fail; } } 02439 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) { 02440 switch (errorHandling) { 02441 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ")."); 02442 case uehAbort: return nEncoded; 02443 case uehReplace: ___OutRepl; continue; 02444 case uehIgnore: continue; 02445 default: Fail; } } 02446 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) { 02447 switch (errorHandling) { 02448 case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true."); 02449 case uehAbort: return nEncoded; 02450 case uehReplace: ___OutRepl; continue; 02451 case uehIgnore: continue; 02452 default: Fail; } } 02453 #undef ___OutRepl 02454 // If c is <= 0xffff, it can be stored directly. 02455 if (c <= 0xffffu) { 02456 if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); } 02457 else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } 02458 nEncoded++; continue; } 02459 // Otherwise, represent c by a pair of surrogate characters. 02460 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu); 02461 uint c1 = (c >> 10) & 1023, c2 = c & 1023; 02462 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate; 02463 if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); } 02464 else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); } 02465 nEncoded++; continue; 02466 } 02467 return nEncoded; 02468 } 02469 02470 //----------------------------------------------------------------------------- 02471 // TUniChDb -- word boundaries 02472 //----------------------------------------------------------------------------- 02473 02474 template<typename TSrcVec> 02475 bool TUniChDb::FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const 02476 { 02477 // WB1. Break at the start of text. 02478 if (position < srcIdx) { position = srcIdx; return true; } 02479 // If we are beyond the end of the text, there aren't any word breaks left. 02480 const size_t srcEnd = srcIdx + srcCount; 02481 if (position >= srcEnd) return false; 02482 // If 'position' is currently at an ignored character, move it back to the last nonignored character. 02483 size_t origPos = position; 02484 if (IsWbIgnored(src[TVecIdx(position)])) { 02485 if (! 
WbFindPrevNonIgnored(src, srcIdx, position)) 02486 position = origPos; 02487 } 02488 // Determine the previous nonignored character (before 'position'). 02489 size_t posPrev = position; 02490 if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position; 02491 // Sec 6.2. Allow a break between Sep and an ignored character. 02492 if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; } 02493 // Determine the next nonignored character (after 'position'). 02494 size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd); 02495 size_t posNext2; 02496 int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1); 02497 int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1); 02498 int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext); 02499 int cNext2, wbfNext2; 02500 // 02501 for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2, 02502 cPrev = cCur, cCur = cNext, cNext = cNext2, 02503 wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2) 02504 { 02505 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately, 02506 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position' 02507 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)? 02508 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd); 02509 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1); 02510 wbfNext2 = GetWbFlags(cNext2); 02511 #define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue 02512 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue 02513 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue 02514 // WB3. Do not break within CRLF. 02515 if (cCur == 13 && cNext == 10) continue; 02516 // WB5. Do not break between most letters. 02517 TestCurNext(ucfWbALetter, ucfWbALetter); 02518 // WB6. Do not break letters across certain punctuation. 02519 TestCurNext2(ucfWbALetter, ucfWbMidLetter, ucfWbALetter); 02520 // WB7. Do not break letters across certain punctuation. 02521 TestPrevCurNext(ucfWbALetter, ucfWbMidLetter, ucfWbALetter); 02522 // WB8. Do not break within sequences of digits, or digits adjacent to letters. 02523 TestCurNext(ucfWbNumeric, ucfWbNumeric); 02524 // WB9. Do not break within sequences of digits, or digits adjacent to letters. 02525 TestCurNext(ucfWbALetter, ucfWbNumeric); 02526 // WB10. Do not break within sequences of digits, or digits adjacent to letters. 02527 TestCurNext(ucfWbNumeric, ucfWbALetter); 02528 // WB11. Do not break within sequences, such as "3.2" or "3.456,789". 02529 TestPrevCurNext(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric); 02530 // WB12. Do not break within sequences, such as "3.2" or "3.456,789". 02531 TestCurNext2(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric); 02532 // WB13. Do not break between Katakana. 02533 TestCurNext(ucfWbKatakana, ucfWbKatakana); 02534 // WB13a. Do not break from extenders. 
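// (ExtendNumLet covers connector punctuation such as U+005F LOW LINE, so sequences like
// "foo_bar" are kept together by WB13a/WB13b.)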
02535 if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 &&
02536 (wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue;
02537 // WB13b. Do not break from extenders.
02538 if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet &&
02539 (wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue;
02540 // WB14. Otherwise, break everywhere.
02541 position = posNext; return true;
02542 #undef TestCurNext
02543 #undef TestCurNext2
02544 #undef TestPrevCurNext
02545 }
02546 // WB2. Break at the end of text.
02547 IAssert(position == srcEnd);
02548 return true;
02549 }
02550
02551 // ToDo: provide a more efficient implementation of this.
02552 template<typename TSrcVec>
02553 void TUniChDb::FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const
02554 {
02555 if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
02556 dest.PutAll(false);
02557 size_t position = srcIdx;
02558 dest[TVecIdx(position - srcIdx)] = true;
02559 while (position < srcIdx + srcCount)
02560 {
02561 size_t oldPos = position;
02562 FindNextWordBoundary(src, srcIdx, srcCount, position);
02563 Assert(oldPos < position); Assert(position <= srcIdx + srcCount);
02564 dest[TVecIdx(position - srcIdx)] = true;
02565 }
02566 Assert(dest[TVecIdx(srcCount)]);
02567 }
02568
02569 //-----------------------------------------------------------------------------
02570 // TUniChDb -- sentence boundaries
02571 //-----------------------------------------------------------------------------
02572
02573 template<typename TSrcVec>
02574 bool TUniChDb::CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const
02575 {
02576 if (sbExTrie.Empty()) return true;
02577 // We'll move back from the position where a sentence-boundary is being considered.
02578 size_t pos = position;
02579 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02580 int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c);
02581 // - Skip the Sep, if there is one.
02582 if ((sfb & ucfSbSep) == ucfSbSep) {
02583 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02584 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02585 // - Skip any Sp characters.
02586 while ((sfb & ucfSbSp) == ucfSbSp) {
02587 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02588 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02589 // - Skip any Close characters.
02590 while ((sfb & ucfSbClose) == ucfSbClose) {
02591 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02592 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02593 // - Skip any ATerm | STerm characters.
02594 while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) {
02595 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02596 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02597 // Now start moving through the trie.
02598 int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1;
02599 while (true)
02600 {
02601 bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos));
02602 c = (atEnd ? -1 : (int) src[TVecIdx(pos)]);
02603 TUniChCategory cat = GetCat(c);
02604 if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) {
02605 // Check if the suffix we've read so far is one of those that appear in the trie.
02606 if (len == 1) return ! sbExTrie.Has1Gram(cLast);
02607 if (len == 2) return !
sbExTrie.Has2Gram(cLast, cButLast); 02608 IAssert(len >= 3); IAssert(node >= 0); 02609 if (sbExTrie.IsNodeTerminal(node)) return false; 02610 if (atEnd) return true; } 02611 if (len == 1) { cButLast = c; len++; } 02612 else if (len == 2) { cButButLast = c; len++; 02613 // Now we have read the last three characters; start descending the suitable subtrie. 02614 node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast); 02615 if (node < 0) return true; } 02616 else { 02617 // Descend down the trie. 02618 node = sbExTrie.GetChild(node, c); 02619 if (node < 0) return true; } 02620 } 02621 //return true; 02622 } 02623 02624 template<typename TSrcVec> 02625 bool TUniChDb::FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const 02626 { 02627 // SB1. Break at the start of text. 02628 if (position < srcIdx) { position = srcIdx; return true; } 02629 // If we are beyond the end of the text, there aren't any word breaks left. 02630 const size_t srcEnd = srcIdx + srcCount; 02631 if (position >= srcEnd) return false; 02632 // If 'position' is currently at an ignored character, move it back to the last nonignored character. 02633 size_t origPos = position; 02634 if (IsWbIgnored(src[TVecIdx(position)])) { 02635 if (! WbFindPrevNonIgnored(src, srcIdx, position)) 02636 position = origPos; 02637 } 02638 // Determine the previous nonignored character (before 'position'). 02639 size_t posPrev = position; 02640 if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position; 02641 // Sec 6.2. Allow a break between Sep and an ignored character. 02642 if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; } 02643 // Determine the next nonignored character (after 'position'). 02644 size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd); 02645 size_t posNext2; 02646 int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1); 02647 int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1); 02648 int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext); 02649 int cNext2, sbfNext2; 02650 // Initialize the state of the peek-back automaton. 02651 typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState; 02652 TPeekBackState backState; 02653 { 02654 size_t pos = position; 02655 bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false; 02656 while (true) 02657 { 02658 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break; 02659 // Skip at most one Sep. 02660 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp); 02661 if ((sbf & ucfSbSep) == ucfSbSep) { 02662 wasSep = true; 02663 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break; 02664 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02665 // Skip zero or more Sp's. 02666 bool stop = false; 02667 while ((sbf & ucfSbSp) == ucfSbSp) { 02668 wasSp = true; 02669 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; } 02670 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02671 if (stop) break; 02672 // Skip zero or more Close's. 02673 while ((sbf & ucfSbClose) == ucfSbClose) { 02674 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; } 02675 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); } 02676 if (stop) break; 02677 // Process an ATerm or STerm. 
02678 wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm); 02679 wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm); 02680 break; 02681 } 02682 if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm); 02683 else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm); 02684 else backState = stInit; 02685 } 02686 // Initialize the state of the peek-ahead automaton. This state tells us what follows 02687 // after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}. 02688 // Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string. 02689 // Our peek-ahead automaton must tell us whether it is Lower or something else. 02690 typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState; 02691 TPeekAheadState aheadState = stUnknown; 02692 // 02693 for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2, 02694 cPrev = cCur, cCur = cNext, cNext = cNext2, 02695 sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2) 02696 { 02697 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately, 02698 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position' 02699 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)? 02700 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd); 02701 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1); 02702 sbfNext2 = GetSbFlags(cNext2); 02703 // Update the peek-back automaton. 02704 #define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag) 02705 #define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; } 02706 switch (backState) { 02707 case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break; 02708 case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break; 02709 case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break; 02710 case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02711 case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02712 case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02713 case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break; 02714 default: IAssert(false); } 02715 #undef Trans 02716 #undef TestCur 02717 // Update the peek-ahead automaton. 02718 #define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0) 02719 if (! IsPeekAheadSkippable(sbfCur)) { 02720 bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower); 02721 if (aheadState == stLower) IAssert(isLower); 02722 else if (aheadState == stNotLower) IAssert(! isLower); 02723 // We haven't peaked ahead farther than this so far -- invalidate the state. 02724 aheadState = stUnknown; } 02725 if (aheadState == stUnknown) 02726 { 02727 // Peak ahead to the next non-peekahead-skippable character. 02728 size_t pos = posNext; 02729 while (pos < srcEnd) { 02730 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp); 02731 if (! 
IsPeekAheadSkippable(sbf)) {
02732 if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower;
02733 else aheadState = stNotLower;
02734 break; }
02735 WbFindNextNonIgnored(src, pos, srcEnd); }
02736 if (! (pos < srcEnd)) aheadState = stNotLower;
02737 }
02738 #undef IsPeekAheadSkippable
02739 //
02740 #define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
02741 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
02742 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
02743 // SB3. Do not break within CRLF.
02744 if (cCur == 13 && cNext == 10) continue;
02745 // SB4. Break after paragraph separators.
02746 if ((sbfCur & ucfSbSep) == ucfSbSep) {
02747 if (! CanSentenceEndHere(src, srcIdx, position)) continue;
02748 position = posNext; return true; }
02749 // Do not break after ambiguous terminators like period, if they are immediately followed by a number
02750 // or lowercase letter, if they are between uppercase letters, or if the first following letter
02751 // (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation
02752 // or numeric period, and thus may not mark the end of a sentence.
02753 TestCurNext(ucfSbATerm, ucfSbNumeric); // SB6
02754 TestPrevCurNext(ucfSbUpper, ucfSbATerm, ucfSbUpper); // SB7
02755 // SB8a. (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm)
02756 if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) &&
02757 (sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue;
02758 // SB8*. ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower
02759 if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue;
02760 // Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present).
02761 // SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep )
02762 if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue;
02763 // SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep )
02764 // SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break]
02765 if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) {
02766 if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10
02767 if (! CanSentenceEndHere(src, srcIdx, position)) continue;
02768 position = posNext; return true; } // SB11
02769 // SB12. Otherwise, do not break.
02770 continue;
02771 #undef TestCurNext
02772 #undef TestCurNext2
02773 #undef TestPrevCurNext
02774 }
02775 // SB2. Break at the end of text.
02776 IAssert(position == srcEnd);
02777 return true;
02778 }
02779
02780 // ToDo: provide a more efficient implementation of this.
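// For illustration, a minimal sketch of scanning a codepoint vector for boundaries (not part of
// the header; 'ucd' stands for an initialized TUniChDb whose character data has already been loaded):
//   TIntV cps;                 // ... fill with Unicode codepoints ...
//   TBoolV wordBounds, sentBounds;
//   ucd.FindWordBoundaries(cps, 0, cps.Len(), wordBounds);       // wordBounds.Len() == cps.Len() + 1
//   ucd.FindSentenceBoundaries(cps, 0, cps.Len(), sentBounds);   // sentBounds[i]: a boundary precedes position i
//   size_t pos = 0;            // or step through word boundaries one at a time:
//   while (ucd.FindNextWordBoundary(cps, 0, cps.Len(), pos)) { /* pos is the next boundary */ }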
02781 template<typename TSrcVec> 02782 void TUniChDb::FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const 02783 { 02784 if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1)); 02785 dest.PutAll(false); 02786 size_t position = srcIdx; 02787 dest[TVecIdx(position - srcIdx)] = true; 02788 while (position < srcIdx + srcCount) 02789 { 02790 size_t oldPos = position; 02791 FindNextSentenceBoundary(src, srcIdx, srcCount, position); 02792 Assert(oldPos < position); Assert(position <= srcIdx + srcCount); 02793 dest[TVecIdx(position - srcIdx)] = true; 02794 } 02795 Assert(dest[TVecIdx(srcCount)]); 02796 } 02797 02798 //----------------------------------------------------------------------------- 02799 // TUniChDb -- case conversions 02800 //----------------------------------------------------------------------------- 02801 02802 template<typename TSrcVec, typename TDestCh> 02803 void TUniChDb::GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 02804 TVec<TDestCh>& dest, const bool clrDest, 02805 const TUniChDb::TCaseConversion how, 02806 const bool turkic, const bool lithuanian) const 02807 { 02808 const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0)); 02809 if (clrDest) dest.Clr(); 02810 enum { 02811 GreekCapitalLetterSigma = 0x3a3, 02812 GreekSmallLetterSigma = 0x3c3, 02813 GreekSmallLetterFinalSigma = 0x3c2, 02814 LatinCapitalLetterI = 0x49, 02815 LatinCapitalLetterJ = 0x4a, 02816 LatinCapitalLetterIWithOgonek = 0x12e, 02817 LatinCapitalLetterIWithGrave = 0xcc, 02818 LatinCapitalLetterIWithAcute = 0xcd, 02819 LatinCapitalLetterIWithTilde = 0x128, 02820 LatinCapitalLetterIWithDotAbove = 0x130, 02821 LatinSmallLetterI = 0x69, 02822 CombiningDotAbove = 0x307 02823 }; 02824 // 02825 bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1; 02826 size_t nextWordBoundary = srcIdx; 02827 TBoolV wordBoundaries; bool wbsKnown = false; 02828 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; ) 02829 { 02830 int cp = src[TVecIdx(srcIdx)]; srcIdx++; 02831 //if (turkic && cp == 0x130 && how == ccLower) printf("!"); 02832 // For conversion to titlecase, the first cased character of each word 02833 // must be converted to titlecase; everything else must be converted 02834 // to lowercase. 02835 TUniChDb::TCaseConversion howHere; 02836 if (how != ccTitle) howHere = how; 02837 else { 02838 if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here. 02839 seenCased = false; seenTwoCased = false; cpFirstCased = -1; 02840 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 02841 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 02842 bool isCased = IsCased(cp); 02843 if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; } 02844 else { howHere = ccLower; 02845 if (isCased && seenCased) seenTwoCased = true; } 02846 } 02847 // First, process the conditional mappings from SpecialCasing.txt. 02848 // These will be processed in code -- they were ignored while 02849 // we were reading SpecialCasing.txt itself. 02850 if (cp == GreekCapitalLetterSigma && howHere == ccLower) 02851 { 02852 // SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of 02853 // the standard doesn't define it. We'll use FinalCased instead. 
02854 // FinalCased: within the closest word boundaries containing C, 02855 // there is a cased letter before C, and there is no cased letter after C. 02856 //size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary); 02857 if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; } 02858 size_t srcIdx2 = srcIdx; bool casedAfter = false; 02859 if (how == ccTitle) 02860 printf("!"); 02861 //while (srcIdx2 < nextBoundary) 02862 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)]) 02863 { 02864 int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02865 if (IsCased(cp2)) { casedAfter = true; break; } 02866 } 02867 if (! casedAfter) 02868 { 02869 //size_t prevBoundary = srcIdx - 1; 02870 //FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary); 02871 srcIdx2 = srcIdx - 1; bool casedBefore = false; 02872 //while (prevBoundary < srcIdx2) 02873 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)]) 02874 { 02875 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02876 if (IsCased(cp2)) { casedBefore = true; break; } 02877 } 02878 if (casedBefore) { 02879 // Now we have a FinalCased character. 02880 dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; } 02881 } 02882 // If we got here, add a non-final sigma. 02883 dest.Add(GreekSmallLetterSigma); continue; 02884 } 02885 else if (lithuanian) 02886 { 02887 if (howHere == ccLower) 02888 { 02889 if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek) 02890 { 02891 bool moreAbove = false; 02892 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; ) 02893 { 02894 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02895 const int cc2 = GetCombiningClass(cp2); 02896 if (cc2 == TUniChInfo::ccStarter) break; 02897 if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; } 02898 } 02899 if (moreAbove) 02900 { 02901 if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; } 02902 if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; } 02903 if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; } 02904 } 02905 } 02906 else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; } 02907 else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; } 02908 else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; } 02909 } 02910 if (cp == CombiningDotAbove) 02911 { 02912 // Lithuanian, howHere != ccLower. 02913 // AfterSoftDotted := the last preceding character with a combining class 02914 // of zero before C was Soft_Dotted, and there is no intervening combining 02915 // character class 230 (ABOVE). 02916 bool afterSoftDotted = false; 02917 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp' 02918 while (origSrcIdx < srcIdx2) 02919 { 02920 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02921 int cc2 = GetCombiningClass(cp2); 02922 if (cc2 == TUniChInfo::ccAbove) break; 02923 if (cc2 == TUniChInfo::ccStarter) { 02924 afterSoftDotted = IsSoftDotted(cp2); break; } 02925 } 02926 if (afterSoftDotted) 02927 { 02928 Assert(lithuanian); 02929 // Remove DOT ABOVE after "i" with upper or titlecase. 02930 // - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle, 02931 // the "i" may have been kept lowercase and thus we shouldn't remove the dot). 
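// For example, uppercasing the Lithuanian sequence i (0069) + COMBINING DOT ABOVE (0307)
// drops the 0307 here, so the result is just I (0049): the dot would be redundant on an
// uppercase I.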
02932 if (how == ccLower) { dest.Add(0x307); continue; } 02933 if (how == ccUpper) continue; 02934 Assert(how == ccTitle); 02935 Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character 02936 if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot. 02937 dest.Add(0x307); continue; 02938 } 02939 } 02940 } 02941 else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri) 02942 { 02943 // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 02944 // The following rules handle those cases. 02945 if (cp == LatinCapitalLetterIWithDotAbove) { 02946 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; } 02947 // When lowercasing, remove dot_above in the sequence I + dot_above, 02948 // which will turn into i. This matches the behavior of the 02949 // canonically equivalent I-dot_above. 02950 else if (cp == CombiningDotAbove) 02951 { 02952 // AfterI: the last preceding base character was an uppercase I, 02953 // and there is no intervening combining character class 230 (ABOVE). 02954 bool afterI = false; 02955 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp' 02956 while (origSrcIdx < srcIdx2) 02957 { 02958 --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)]; 02959 if (cp2 == LatinCapitalLetterI) { afterI = true; break; } 02960 int cc2 = GetCombiningClass(cp2); 02961 if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break; 02962 } 02963 if (afterI) { 02964 if (how == ccTitle && seenCased && ! seenTwoCased) { 02965 // Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word; 02966 // if found, map it to titlecase; otherwise, map all characters in that word to lowercase. 02967 // This suggests that if a cased character is found, others in that word should be left alone. 02968 // This seems unusual; we map all other characters to lowercase instead. 02969 // But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above 02970 // is not the first cased character (it isn't even cased), we attempt to set it to lowercase; 02971 // but since afterI is also true here, this would mean deleting it. Thus our titlecased 02972 // form of "I followed by dot-above" would be just "I", which is clearly wrong. 02973 // So we treat this as a special case here. 02974 IAssert(cpFirstCased == LatinCapitalLetterI); 02975 dest.Add(0x307); continue; } 02976 if (howHere != ccLower) dest.Add(0x307); 02977 continue; } 02978 } 02979 // When lowercasing, unless an I is before a dot_above, 02980 // it turns into a dotless i. 02981 else if (cp == LatinCapitalLetterI) 02982 { 02983 // BeforeDot: C is followed by U+0307 (combining dot above). 02984 // Any sequence of characters with a combining class that is 02985 // neither 0 nor 230 may intervene between the current character 02986 // and the combining dot above. 02987 bool beforeDot = false; 02988 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; ) 02989 { 02990 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++; 02991 if (cp2 == 0x307) { beforeDot = true; break; } 02992 const int cc2 = GetCombiningClass(cp2); 02993 if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break; 02994 } 02995 if (! beforeDot) { 02996 dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; } 02997 } 02998 // When uppercasing, i turns into a dotted capital I. 02999 else if (cp == LatinSmallLetterI) 03000 { 03001 dest.Add(howHere == ccLower ? 
0x69 : 0x130); continue; 03002 } 03003 } 03004 // Try to use the unconditional mappings. 03005 const TIntIntVH &specHere = ( 03006 howHere == how ? specials : 03007 howHere == ccLower ? specialCasingLower : 03008 howHere == ccTitle ? specialCasingTitle : 03009 howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0)); 03010 int i = specHere.GetKeyId(cp); 03011 if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; } 03012 // Try to use the simple (one-character) mappings. 03013 i = h.GetKeyId(cp); 03014 if (i >= 0) { 03015 const TUniChInfo &ci = h[i]; 03016 int cpNew = ( 03017 howHere == ccLower ? ci.simpleLowerCaseMapping : 03018 howHere == ccUpper ? ci.simpleUpperCaseMapping : 03019 ci.simpleTitleCaseMapping); 03020 if (cpNew < 0) cpNew = cp; 03021 dest.Add(cpNew); continue; } 03022 // As a final resort, leave 'cp' unchanged. 03023 dest.Add(cp); 03024 } 03025 } 03026 03027 template<typename TSrcVec, typename TDestCh> 03028 void TUniChDb::GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03029 TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const 03030 { 03031 if (clrDest) dest.Clr(); 03032 bool seenCased = false; size_t nextWordBoundary = srcIdx; 03033 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; ) 03034 { 03035 const int cp = src[TVecIdx(srcIdx)]; srcIdx++; 03036 int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; } 03037 const TUniChInfo &ci = h[i]; 03038 // With titlecasing, the first cased character of each word must be put into titlecase, 03039 // all others into lowercase. This is what the howHere variable is for. 03040 TUniChDb::TCaseConversion howHere; 03041 if (how != ccTitle) howHere = how; 03042 else { 03043 if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here. 03044 seenCased = false; 03045 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 03046 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 03047 bool isCased = IsCased(cp); 03048 if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; } 03049 else howHere = ccLower; 03050 } 03051 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping); 03052 if (cpNew < 0) cpNew = cp; 03053 dest.Add(cpNew); 03054 } 03055 } 03056 03057 template<typename TSrcVec> 03058 void TUniChDb::ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const 03059 { 03060 bool seenCased = false; size_t nextWordBoundary = srcIdx; 03061 for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) 03062 { 03063 const int cp = src[TVecIdx(srcIdx)]; 03064 int i = h.GetKeyId(cp); if (i < 0) continue; 03065 const TUniChInfo &ci = h[i]; 03066 // With titlecasing, the first cased character of each word must be put into titlecase, 03067 // all others into lowercase. This is what the howHere variable is for. 03068 TUniChDb::TCaseConversion howHere; 03069 if (how != ccTitle) howHere = how; 03070 else { 03071 if (srcIdx == nextWordBoundary) { // A word starts/ends here. 03072 seenCased = false; 03073 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next); 03074 IAssert(next > nextWordBoundary); nextWordBoundary = next; } 03075 bool isCased = IsCased(cp); 03076 if (isCased && ! 
seenCased) { howHere = ccTitle; seenCased = true; } 03077 else howHere = ccLower; 03078 } 03079 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping); 03080 if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew; 03081 } 03082 } 03083 03084 //----------------------------------------------------------------------------- 03085 // TUniChDb -- composition, decomposition, normal forms 03086 //----------------------------------------------------------------------------- 03087 03088 template<typename TDestCh> 03089 void TUniChDb::AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const 03090 { 03091 if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount) 03092 { 03093 // UAX #15, sec. 16: Hangul decomposition 03094 const int SIndex = codePoint - HangulSBase; 03095 const int L = HangulLBase + SIndex / HangulNCount; 03096 const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount; 03097 const int T = HangulTBase + (SIndex % HangulTCount); 03098 dest.Add(L); dest.Add(V); 03099 if (T != HangulTBase) dest.Add(T); 03100 return; 03101 } 03102 int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; } 03103 const TUniChInfo &ci = h[i]; 03104 int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; } 03105 if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; } 03106 while (true) { 03107 int cp = decompositions[ofs++]; if (cp < 0) return; 03108 AddDecomposition(cp, dest, compatibility); } 03109 } 03110 03111 template<typename TSrcVec, typename TDestCh> 03112 void TUniChDb::Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03113 TVec<TDestCh>& dest, const bool compatibility, bool clrDest) const 03114 { 03115 if (clrDest) dest.Clr(); 03116 const size_t destStart = dest.Len()/*, srcEnd = srcIdx + srcCount*/; 03117 // Decompose the string. 03118 while (srcIdx < srcCount) { 03119 AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; } 03120 // Rearrange the decomposed string into canonical order. 03121 for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; ) 03122 { 03123 size_t j = destIdx; 03124 int cp = dest[TVecIdx(destIdx)]; destIdx++; 03125 int cpCls = GetCombiningClass(cp); 03126 if (cpCls == TUniChInfo::ccStarter) continue; 03127 while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) { 03128 dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; } 03129 dest[TVecIdx(j)] = cp; 03130 } 03131 } 03132 03133 template<typename TSrcVec, typename TDestCh> 03134 void TUniChDb::DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03135 TVec<TDestCh>& dest, bool compatibility, bool clrDest) const 03136 { 03137 if (clrDest) dest.Clr(); 03138 TIntV temp; 03139 Decompose(src, srcIdx, srcCount, temp, compatibility); 03140 Compose(temp, 0, temp.Len(), dest, clrDest); 03141 } 03142 03143 template<typename TSrcVec, typename TDestCh> 03144 void TUniChDb::Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount, 03145 TVec<TDestCh>& dest, bool clrDest) const 03146 { 03147 if (clrDest) dest.Clr(); 03148 bool lastStarterKnown = false; // has a starter been encountered yet? 03149 size_t lastStarterPos = size_t(-1); // the index (in 'dest') of the last starter 03150 int cpLastStarter = -1; // the codepoint of the last starter (i.e. 
03143 template<typename TSrcVec, typename TDestCh>
03144 void TUniChDb::Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03145   TVec<TDestCh>& dest, bool clrDest) const
03146 {
03147   if (clrDest) dest.Clr();
03148   bool lastStarterKnown = false;      // has a starter been encountered yet?
03149   size_t lastStarterPos = size_t(-1); // the index (in 'dest') of the last starter
03150   int cpLastStarter = -1;             // the codepoint of the last starter (i.e. cpLastStarter == dest[lastStarterPos])
03151   const size_t srcEnd = srcIdx + srcCount;
03152   int ccMax = -1; // The highest combining class among the characters since the last starter.
03153   while (srcIdx < srcEnd)
03154   {
03155     const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
03156     const int cpClass = GetCombiningClass(cp);
03157     //int cpCombined = -1;
03158     // If there is a starter with which 'cp' can be combined, and from which it is not blocked
03159     // by some intermediate character, we can try to combine them.
03160     if (lastStarterKnown && ccMax < cpClass)
03161     {
03162       int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp));
03163       int cpCombined = -1;
03164       do {
03165         // Try to look up a composition in the inverseDec table.
03166         if (j >= 0) { cpCombined = inverseDec[j]; break; }
03167         // UAX #15, sec. 16: Hangul composition
03168         // - Try to combine L and V.
03169         const int LIndex = cpLastStarter - HangulLBase;
03170         if (0 <= LIndex && LIndex < HangulLCount) {
03171           const int VIndex = cp - HangulVBase;
03172           if (0 <= VIndex && VIndex < HangulVCount) {
03173             cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
03174             break; } }
03175         // - Try to combine LV and T.
03176         const int SIndex = cpLastStarter - HangulSBase;
03177         if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0)
03178         {
03179           const int TIndex = cp - HangulTBase;
03180           if (0 <= TIndex && TIndex < HangulTCount) {
03181             cpCombined = cpLastStarter + TIndex;
03182             break; }
03183         }
03184       } while (false);
03185       // If a composite character has been found, use it to replace the last starter.
03186       if (cpCombined >= 0) {
03187         dest[TVecIdx(lastStarterPos)] = cpCombined;
03188         Assert(GetCombiningClass(cpCombined) == TUniChInfo::ccStarter);
03189         // if (cpCombined is not a starter) { lastStarterKnown = false; lastStarterPos = size_t(-1); cpLastStarter = -1; } else
03190         cpLastStarter = cpCombined; continue; }
03191     }
03192     if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter, remember it for later.  Set ccMax to -1 so that this starter can be combined with another starter.
03193       lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; }
03194     else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking).
03195       ccMax = cpClass;
03196     dest.Add(cp);
03197   }
03198 }
03199
03200 template<typename TSrcVec, typename TDestCh>
03201 size_t TUniChDb::ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03202   TVec<TDestCh>& dest, bool clrDest) const
03203 {
03204   if (clrDest) dest.Clr();
03205   size_t retVal = 0;
03206   for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
03207     const int cp = src[TVecIdx(srcIdx)];
03208     if (GetCombiningClass(cp) == TUniChInfo::ccStarter)
03209       { dest.Add(cp); retVal++; } }
03210   return retVal;
03211 }
03212
03213 inline bool AlwaysFalse()
03214 {
03215   int sum = 0;
03216   for (int i = 0; i < 5; i++) sum += i;
03217   return sum > 100;
03218 }
03219
03220 inline bool AlwaysTrue()
03221 {
03222   int sum = 0;
03223   for (int i = 0; i < 5; i++) sum += i;
03224   return sum < 100;
03225 }
03226
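// The algorithmic (non-table) part of Compose() above is the Hangul branch:
// an L + V jamo pair combines into an LV syllable, and an LV syllable plus a
// T jamo combines into an LVT syllable.  The following standalone sketch (not
// part of SNAP; the helper name ComposeHangulPair is ours) shows just that
// arithmetic, again assuming the standard UAX #15 values for the Hangul
// constants; it is the inverse of the decomposition sketch above.

#include <cstdio>

static const int SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
static const int LCount = 19, VCount = 21, TCount = 28;
static const int NCount = VCount * TCount, SCount = LCount * NCount;

// Returns the composition of the pair (first, second), or -1 if the pair
// is not an L+V or LV+T Hangul pair.
static int ComposeHangulPair(const int first, const int second)
{
  const int LIndex = first - LBase;
  if (0 <= LIndex && LIndex < LCount) {                           // L + V -> LV
    const int VIndex = second - VBase;
    if (0 <= VIndex && VIndex < VCount)
      return SBase + (LIndex * VCount + VIndex) * TCount;
  }
  const int SIndex = first - SBase;
  if (0 <= SIndex && SIndex < SCount && SIndex % TCount == 0) {   // LV + T -> LVT
    const int TIndex = second - TBase;
    if (0 < TIndex && TIndex < TCount) return first + TIndex;     // TIndex == 0 would mean "no trailing consonant"
  }
  return -1;
}

int main()
{
  const int lv = ComposeHangulPair(0x1111, 0x1171);   // -> U+D4CC
  const int lvt = ComposeHangulPair(lv, 0x11B6);      // -> U+D4DB
  std::printf("U+%04X U+%04X\n", lv, lvt);
  return 0;
}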
03227 /*
03228
03229 Notes on decomposition:
03230
03231 - In UnicodeData.txt, there is a field with the decomposition mapping.
03232   This field may also include a tag, <...>.
03233   If there is a tag, this is a compatibility mapping.
03234   Otherwise it is a canonical mapping.
03235 - Canonical decomposition uses only canonical mappings;
03236   compatibility decomposition uses both canonical and compatibility mappings.
03237 - Decomposition:
03238   1. Apply the decomposition mappings (canonical or canonical+compatibility), recursively.
03239   2. Put the string into canonical order, which means:
03240      while there exists a pair of characters, A immediately followed by B,
03241      such that combiningclass(A) > combiningclass(B) > 0 [an "exchangeable pair"]:
03242        swap A and B;
03243   This results in NFD (normalized form D, after canonical decomposition)
03244   or NFKD (normalized form KD, after compatibility decomposition).
03245 - Canonical composition:
03246   1. Before composition, the string should have been decomposed
03247      (using either canonical or compatibility decomposition).
03248   2. For each character C (from left to right):
03249      2.1. Find the last starter S before C (if not found, continue).
03250      2.2. If, between S and C, there is some character with a combining class greater than or equal to that of C, then continue.
03251      2.3. If there exists a character L whose canonical decomposition is S+C
03252           and L is not in the composition exclusion table [i.e. L is a "primary composite"],
03253           then replace S by L, and remove C.
03254   This results in NFC (normalized form C, with canonical decomposition followed by canonical composition)
03255   or NFKC (normalized form KC, with compatibility decomposition followed by canonical composition).
03256 - Composition exclusion table:
03257   - Anything in CompositionExclusions.txt.
03258   - Singletons: characters whose canonical decomposition is a single character.
03259   - Non-starter decompositions: characters whose canonical decomposition begins with a non-starter.
03260
03261 Example:
03262   E-grave (00c8; combining class 0; canonical decomposition: 0045 0300)
03263   E-macron (0112; combining class 0; 0045 0304)
03264   grave (0300; combining class 230)
03265   macron (0304; combining class 230)
03266   source string: 00c8 0304
03267   after canonical decomposition (or compatibility decomposition, they would be the same here): 0045 0300 0304
03268   after canonical composition: 00c8 0304
03269
03270 cc(horn) = 216
03271 cc(dot below) = 220
03272 cc(dot above) = 230
03273
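The canonical-ordering step (step 2 of decomposition above) is what Decompose()
implements with its insertion sort over non-starters.  A standalone sketch of
that step (not part of SNAP; the combining classes are hard-coded for the three
marks quoted above, and everything else is treated as a starter of class 0):

  #include <cstdio>
  #include <map>
  #include <vector>

  int main()
  {
    std::map<int, int> cc = { { 0x031B, 216 },    // combining horn
                              { 0x0323, 220 },    // combining dot below
                              { 0x0307, 230 } };  // combining dot above
    auto cls = [&cc](int cp) { auto it = cc.find(cp); return it == cc.end() ? 0 : it->second; };

    // 'o' + dot above (230) + dot below (220): the last two form an exchangeable pair.
    std::vector<int> v = { 0x006F, 0x0307, 0x0323 };
    for (size_t i = 0; i < v.size(); i++) {
      const int cp = v[i]; const int c = cls(cp);
      if (c == 0) continue;                               // starters stay put and act as barriers
      size_t j = i;
      while (j > 0 && cls(v[j - 1]) > c) { v[j] = v[j - 1]; j--; }
      v[j] = cp;
    }
    for (int cp : v) std::printf("U+%04X ", cp);          // prints: U+006F U+0323 U+0307
    std::printf("\n");
    return 0;
  }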
03274 ToDos:
03275 - Case folding is intended primarily for comparing the strings obtained this way.
03276   The function f(s) = NFC(toCaseFold(s)) is idempotent.
03277   The function g(s) = NFKC(toCaseFold(s)) is not -- if we want that, we have to take
03278   a few additional mappings into account during folding (see sec. 5.18, last paragraph; DerivedNormalizationProps.txt).
03279 - It seems that CaseFolding.txt is essentially just a plain folding to lowercase.
03280   Since we also want the other foldings, we should rather look at SpecialCasing.txt
03281   (+ the simple case mappings in UnicodeData.txt).
03282   I suggest that, when reading SpecialCasing.txt, we simply ignore the conditional mappings
03283   and then handle them separately in the source code of our programs [for a
03284   detailed definition of the conditions, see table 3.13].
03285 - Postscript: still, it seems to me that CaseFolding.txt is slightly different from plain lowercasing.
03286   For example, for the small final sigma 03c2 it says there that it should be changed into an ordinary small sigma 03c3.
03287   This follows neither from UnicodeData.txt nor from SpecialCasing.txt, even though UCD.html says
03288   that CaseFolding.txt is derived from the two.  The main purpose of CaseFolding.txt is supposedly
03289   "locale-independent case folding" (table 4.1 and sec. 5.18).
03290 - Before starting to deal with case conversions, have a look at sec. 3.13
03291   and especially p. 90.
03292 - See p. 91 on the combination N[K]FD + caseFold + N[K]FD.
03293 - The definition of "cased" etc. is on p. 89.
03294 - isIdentifierStart(c), isIdentifierEnd(c) -- sec. 5.15.
03295   See DerivedCoreProperties.txt, where a whole pile of similar things is defined in a similar way,
03296   among them isLowerCase and isUpperCase.  There are also isLetter, isAlphabetic etc. (sec. 4.9).
03297   These are best added among the flags of each individual character.
03298 - general category: sec. 4.5
03299 - motivation for titlecase: sec. 5.18
03300 - Compare our current computation of compositionExclusion with what is computed in DerivedNormalizationProps.txt
03301   under Full_Composition_Exclusion.
03302 - script names: Scripts.txt and UAX #24.
03303 - block names: Blocks.txt
03304 - space characters: table 6.2 and reportedly also UCD.html
03305 - dash characters: table 6.3
03306 */
03307
03308 //#endif
03309