35 message(message_), srcIdx(srcIdx_), srcChar(srcChar_) { }
96 replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
102 #define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
131 template<
typename TSrcVec,
typename TDestCh>
133 const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
135 template<
typename TSrcVec,
typename TDestCh>
140 template<
typename TSrcVec,
typename TDestCh>
142 const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
144 template<
typename TSrcVec,
typename TDestCh>
170 template<
typename TSrcVec,
typename TDestCh>
172 const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
185 template<
typename TSrcVec,
typename TDestCh>
187 const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
207 template<
typename TSrcVec,
typename TDestCh>
209 const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
210 TVec<TDestCh>& dest,
const bool clrDest,
const bool insertBom,
213 template<
typename TSrcVec,
typename TDestCh>
215 const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
216 TVec<TDestCh>& dest,
const bool clrDest,
const bool insertBom,
233 void TestUtf8(
bool decode,
size_t expectedRetVal,
bool expectedThrow,
const TIntV& src,
const TIntV& expectedDest, FILE *f);
246 void TestUtf16(
bool decode,
size_t expectedRetVal,
bool expectedThrow,
const TIntV& src,
const TIntV& expectedDest,
251 return ((x >> 8) & 0xff) | ((x & 0xff) << 8); }
257 const bool insertBom);
277 template<
typename TSrcDat,
typename TDestDat>
279 for (
int i = 0; i < src.
Len(); i++) dest.
Add(src[i]); }
292 template<
typename TSrcVec,
typename TDestCh>
293 void Fold(
const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
294 TVec<TDestCh>& dest,
const bool clrDest,
const bool full,
const bool turkic)
const
296 for (
const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
298 int c = src[
TVecIdx(srcIdx)], i; srcIdx++;
299 if (turkic && ((i = cfTurkic.
GetKeyId(c)) >= 0)) { dest.
Add(cfTurkic[i]);
continue; }
301 if ((! full) && ((i = cfSimple.
GetKeyId(c)) >= 0)) { dest.
Add(cfSimple[i]);
continue; }
302 i = cfCommon.
GetKeyId(c);
if (i >= 0) dest.
Add(cfCommon[i]);
else dest.
Add(c);
306 template<
typename TSrcVec>
307 void FoldInPlace(TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
const bool turkic)
const
309 for (
const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
311 int c = src[
TVecIdx(srcIdx)], i;
312 if (turkic && ((i = cfTurkic.
GetKeyId(c)) >= 0)) { src[
TVecIdx(srcIdx)] = cfTurkic[i];
continue; }
313 if ((i = cfSimple.
GetKeyId(c)) >= 0) { src[
TVecIdx(srcIdx)] = cfSimple[i];
continue; }
314 i = cfCommon.
GetKeyId(c);
if (i >= 0) src[
TVecIdx(srcIdx)] = cfCommon[i];
319 void Test(
const TIntV& src,
const TIntV& expectedDest,
const bool full,
const bool turkic, FILE *f);
340 template<
class TCodecImpl>
341 static PCodecBase
New();
349 virtual size_t ToUnicode(
const TIntV& src,
size_t srcIdx,
const size_t srcCount,
TIntV& dest,
const bool clrDest =
true)
const = 0;
350 virtual size_t ToUnicode(
const TStr& src,
size_t srcIdx,
const size_t srcCount,
TIntV& dest,
const bool clrDest =
true)
const = 0;
357 virtual size_t FromUnicode(
const TIntV& src,
size_t srcIdx,
const size_t srcCount,
TIntV& dest,
const bool clrDest =
true)
const = 0;
358 virtual size_t FromUnicode(
const TIntV& src,
size_t srcIdx,
const size_t srcCount,
TChA& dest,
const bool clrDest =
true)
const = 0;
359 virtual size_t FromUnicode(
const TIntV& src,
size_t srcIdx,
const size_t srcCount,
TStr& dest,
const bool clrDest =
true)
const = 0;
376 template<
class TCodecImpl_>
386 virtual void Test()
const { impl.Test(); }
388 virtual size_t ToUnicode(
const TIntV& src,
size_t srcIdx,
const size_t srcCount,
TIntV& dest,
const bool clrDest =
true)
const {
389 return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
390 virtual size_t ToUnicode(
const TStr& src,
size_t srcIdx,
const size_t srcCount,
TIntV& dest,
const bool clrDest =
true)
const {
391 return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
393 virtual size_t FromUnicode(
const TIntV& src,
size_t srcIdx,
const size_t srcCount,
TIntV& dest,
const bool clrDest =
true)
const {
394 return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
395 virtual size_t FromUnicode(
const TIntV& src,
size_t srcIdx,
const size_t srcCount,
TChA& dest,
const bool clrDest =
true)
const {
396 return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
397 virtual size_t FromUnicode(
const TIntV& src,
size_t srcIdx,
const size_t srcCount,
TStr& dest,
const bool clrDest =
true)
const {
398 TChA buf;
size_t retVal = impl.FromUnicode(src, srcIdx, srcCount, buf,
false);
399 if (clrDest) dest += buf.
CStr();
else dest = buf.
CStr();
403 template<
class TCodecImpl>
412 template<
class TVector_>
423 static inline void Add(TVector& vector,
const TElement& element) { vector.
Add(element); }
432 static inline void Add(TVector& vector,
const TElement& element) { vector += element; }
// Maps a code point to a single-byte value: code points 0..255 map to
// themselves, everything else (including negative values) yields -1.
static int FromUnicode(int c)
{
    const bool fitsInOneByte = (c >= 0) && (c < 256);
    return fitsInOneByte ? c : -1;
}
454 if (c < 0xa0)
return c;
else return toUnicodeTable[c - 0xa0]; }
456 if (0 <= c && c < 0xa0)
return c;
457 else if (0xa0 <= c && c < 0x180)
return fromUnicodeTable1[c - 0xa0];
458 else if (0x2c0 <= c && c < 0x2e0)
return fromUnicodeTable2[c - 0x2c0];
468 if (c < 0xa0)
return c;
else return toUnicodeTable[c - 0xa0]; }
470 if (0 <= c && c < 0xa0)
return c;
471 else if (0xa0 <= c && c < 0x180)
return fromUnicodeTable1[c - 0xa0];
472 else if (0x2d8 <= c && c < 0x2da)
return fromUnicodeTable2[c - 0x2d8];
482 if (c < 0xa0)
return c;
else return toUnicodeTable[c - 0xa0]; }
484 if (0 <= c && c < 0xa0)
return c;
485 else if (0xa0 <= c && c < 0x180)
return fromUnicodeTable1[c - 0xa0];
486 else if (0x2c0 <= c && c < 0x2e0)
return fromUnicodeTable2[c - 0x2c0];
496 for (
int i = 0; i < int(
sizeof(yuAsciiChars) /
sizeof(yuAsciiChars[0])); i++)
497 if (c == yuAsciiChars[i])
return uniChars[i];
500 for (
int i = 0; i < int(
sizeof(uniChars) /
sizeof(uniChars[0])); i++)
501 if (c == uniChars[i])
return yuAsciiChars[i];
502 else if(c == yuAsciiChars[i])
return -1;
503 if (0 <= c && c <= 255)
return c;
else return -1; }
512 if (c < 0x80)
return c;
else return toUnicodeTable[c - 0x80]; }
514 if (0 <= c && c < 0x80)
return c;
515 else if (0xa0 <= c && c < 0x100)
return fromUnicodeTable1[c - 0xa0];
516 else if (0x390 <= c && c < 0x3d0)
return fromUnicodeTable2[c - 0x390];
517 else if (0x2210 <= c && c < 0x2270)
return fromUnicodeTable3[c - 0x2210];
518 else if (0x2500 <= c && c < 0x25b0)
return fromUnicodeTable4[c - 0x2500];
519 else if (c == 0x192)
return 0x9f;
520 else if (c == 0x207f)
return 0xfc;
521 else if (c == 0x20a7)
return 0x9e;
522 else if (c == 0x2310)
return 0xa9;
523 else if (c == 0x2320)
return 0xf4;
524 else if (c == 0x2321)
return 0xf5;
534 if (c < 0x80)
return c;
else return toUnicodeTable[c - 0x80]; }
536 if (0 <= c && c < 0x80)
return c;
537 else if (0xa0 <= c && c < 0x180)
return fromUnicodeTable1[c - 0xa0];
538 else if (0x2c0 <= c && c < 0x2e0)
return fromUnicodeTable2[c - 0x2c0];
539 else if (0x2500 <= c && c < 0x25b0)
return fromUnicodeTable3[c - 0x2500];
549 if (c < 0x80)
return c;
else return toUnicodeTable[c - 0x80]; }
551 if (0 <= c && c < 0x80)
return c;
552 else if (0xa0 <= c && c < 0x180)
return fromUnicodeTable1[c - 0xa0];
553 else if (0x2c0 <= c && c < 0x2e0)
return fromUnicodeTable2[c - 0x2c0];
554 else if (0x2010 <= c && c < 0x2040)
return fromUnicodeTable3[c - 0x2010];
555 else if (c == 0x20ac)
return 0x80;
556 else if (c == 0x2122)
return 0x99;
560 template<
class TEncoding_>
572 errorHandling(errorHandling_), replacementChar(replacementChar_) { }
578 for (
int c = 0; c <= 255; c++) {
579 int cu = TEncoding::ToUnicode(c);
if (cu == -1)
continue;
581 IAssert(0 <= cu && cu < 0x110000);
582 int c2 = TEncoding::FromUnicode(cu);
585 for (
int cu = 0; cu < 0x110000; cu++) {
586 int c = TEncoding::FromUnicode(cu);
if (c == -1)
continue;
589 int cu2 = TEncoding::ToUnicode(c);
596 template<
typename TSrcVec,
typename TDestCh>
598 const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
601 if (clrDest) dest.
Clr();
602 size_t toDo = srcCount;
604 int chSrc = ((int) src[
TVecIdx(srcIdx)]) & 0xff; srcIdx++;
605 int chDest = TEncoding::ToUnicode(chSrc);
609 template<
typename TSrcVec,
typename TDestCh>
617 template<
typename TSrcVec,
typename TDestVec>
619 const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
620 TDestVec& dest,
const bool clrDest =
true)
const
623 if (clrDest) dest.Clr();
624 size_t toDo = srcCount, nEncoded = 0;
626 int chSrc = (int) src[
TVecIdx(srcIdx)]; srcIdx++;
627 int chDest = TEncoding::FromUnicode(chSrc);
629 switch (errorHandling) {
639 template<
typename TSrcVec,
typename TDestVec>
640 size_t FromUnicode(
const TSrcVec& src, TDestVec& dest,
const bool clrDest =
true)
const {
return FromUnicode(src, 0, src.Len(), dest, clrDest); }
642 size_t UniToStr(
const TIntV& src,
size_t srcIdx,
const size_t srcCount,
TStr& dest,
const bool clrDest =
true)
const {
643 TChA buf;
size_t retVal =
FromUnicode(src, srcIdx, srcCount, buf,
false);
644 if (clrDest) dest += buf.
CStr();
else dest = buf.
CStr();
664 #define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff)
678 #define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
1031 static inline ushort GetLineBreakCode(
char c1,
char c2) {
return ((static_cast<ushort>(static_cast<uchar>(c1)) & 0xff) << 8) | ((
static_cast<ushort>(
static_cast<uchar>(c2)) & 0xff)); }
1037 subCat = (
TUniChSubCategory) (((static_cast<int>(static_cast<uchar>(chCat)) & 0xff) << 8) | (static_cast<int>(static_cast<uchar>(chSubCat)) & 0xff)); }
1040 subCat = catAndSubCat;
1041 chCat = (char) cat; chSubCat = (char) (
int(subCat) & 0xff); }
1053 SOut.
Save(simpleUpperCaseMapping); SOut.
Save(simpleLowerCaseMapping); SOut.
Save(simpleTitleCaseMapping);
1054 SOut.
Save(decompOffset); SOut.
Save(nameOffset);
1058 SIn.
Load(simpleUpperCaseMapping); SIn.
Load(simpleLowerCaseMapping); SIn.
Load(simpleTitleCaseMapping);
1059 SIn.
Load(decompOffset); SIn.
Load(nameOffset);
1063 script(-1),simpleUpperCaseMapping(-1), simpleLowerCaseMapping(-1), simpleTitleCaseMapping(-1),
1064 decompOffset(-1), nameOffset(-1), flags(0), properties(0), propertiesX(0), lineBreak(LineBreak_Unknown) {
1165 static const char s[] =
"LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoSmScSkSoZsZlZpCcCfCsCoCn";
1166 for (
const char *p = s; *p; p += 2)
1167 if (chCat == p[0] && chSubCat == p[1])
return true;
1176 template<
typename TItem_>
1187 TNode() : child(-1), sib(-1), terminal(false) { }
1188 TNode(
const TItem& item_,
const int child_,
const int sib_,
const bool terminal_) : item(item_), child(child_), sib(sib_), terminal(terminal_) { }
1206 int Get3GramRoot(
const TItem& last,
const TItem& butLast,
const TItem& butButLast)
const {
1208 if (keyId < 0)
return 0;
else return roots[keyId]; }
1209 int GetChild(
const int parentIdx,
const TItem& item)
const {
1210 for (
int childIdx = nodes[parentIdx].child; childIdx >= 0; ) {
1211 const TNode &node = nodes[childIdx];
1212 if (node.item == item)
return childIdx;
1213 childIdx = node.sib; }
1219 template<
typename TSrcVec>
1220 void Add(
const TSrcVec& src,
const size_t srcIdx,
const size_t srcCount)
1225 size_t srcLast = srcIdx + (srcCount - 1);
1227 int keyId = roots.
GetKeyId(tr), curNodeIdx = -1;
1228 if (keyId >= 0) curNodeIdx = roots[keyId];
1229 else { curNodeIdx = nodes.
Add(TNode(
TItem(0), -1, -1,
false)); roots.
AddDat(tr, curNodeIdx); }
1231 if (srcCount > 3)
for (
size_t srcPos = srcLast - 3; ; )
1233 const TItem curItem = src[
TVecIdx(srcPos)];
1234 int childNodeIdx = nodes[curNodeIdx].child;
1235 while (childNodeIdx >= 0) {
1236 TNode &childNode = nodes[childNodeIdx];
1237 if (childNode.item == curItem)
break;
1238 childNodeIdx = childNode.sib; }
1239 if (childNodeIdx < 0) {
1240 childNodeIdx = nodes.
Add(TNode(curItem, -1, nodes[curNodeIdx].child,
false));
1241 nodes[curNodeIdx].child = childNodeIdx; }
1242 curNodeIdx = childNodeIdx;
1243 if (srcPos == srcIdx)
break;
else srcPos--;
1245 nodes[curNodeIdx].terminal =
true;
// Whole-sequence convenience overload: adds all of src by calling the ranged
// Add with start index 0 and count src.Len().
template<typename TSrcVec>
void Add(const TSrcVec& src)
{
    const size_t nItems = (size_t) src.Len();
    Add(src, 0, nItems);
}
1277 h.
Clr(); charNames.
Clr(); decompositions.
Clr(); inverseDec.
Clr(); caseFolding.
Clr();
1278 specialCasingLower.
Clr(); specialCasingUpper.
Clr(); specialCasingTitle.
Clr();
1281 h.
Save(SOut); charNames.
Save(SOut); decompositions.
Save(SOut);
1282 inverseDec.
Save(SOut); caseFolding.
Save(SOut); scripts.
Save(SOut);
1283 specialCasingLower.
Save(SOut); specialCasingUpper.
Save(SOut); specialCasingTitle.
Save(SOut);
1287 decompositions.
Load(SIn);
1288 inverseDec.
Load(SIn); caseFolding.
Load(SIn); scripts.
Load(SIn);
1289 specialCasingLower.
Load(SIn); specialCasingUpper.
Load(SIn); specialCasingTitle.
Load(SIn);
1331 const char *
GetCharName(
const int cp)
const {
int i = h.
GetKeyId(cp);
if (i < 0)
return 0;
int ofs = h[i].nameOffset;
return ofs < 0 ? 0 : charNames.
GetCStr(ofs); }
1335 char buf[20]; sprintf(buf,
"U+%04x", cp);
return TStr(buf); }
1336 template<
class TSrcVec>
void PrintCharNames(FILE *f,
const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
const TStr& prefix)
const {
1337 if (! f) f = stdout;
1338 for (
const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
1339 fprintf(f,
"%s", prefix.
CStr());
1340 int cp = src[
TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ?
"U+%05x" :
"U+%04x "), cp);
1352 if (i < 0)
return false;
else { ChInfo=h[i];
return true; }}
1357 int GetWbFlags(
const int cp)
const {
int i = h.
GetKeyId(cp);
if (i < 0)
return 0;
else return h[i].GetWbFlags(); }
1359 int GetSbFlags(
const int cp)
const {
int i = h.
GetKeyId(cp);
if (i < 0)
return 0;
else return h[i].GetSbFlags(); }
1361 #define ___UniFwd1(name) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); }
1362 #define ___UniFwd2(name1, name2) ___UniFwd1(name1) ___UniFwd1(name2)
1363 #define ___UniFwd3(name1, name2, name3) ___UniFwd2(name1, name2) ___UniFwd1(name3)
1364 #define ___UniFwd4(name1, name2, name3, name4) ___UniFwd3(name1, name2, name3) ___UniFwd1(name4)
1365 #define ___UniFwd5(name1, name2, name3, name4, name5) ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5)
1367 #define DECLARE_FORWARDED_PROPERTY_METHODS \
1368 ___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \
1369 ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic) \
1370 ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted) \
1371 ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace) \
1372 ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable) \
1373 ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue) \
1374 ___UniFwd2(IsXidStart, IsXidContinue) \
1375 ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep) \
1376 ___UniFwd1(IsGbExtend) \
1377 ___UniFwd2(IsCased, IsCurrency)
1384 int i = h.
GetKeyId(cp);
if (i >= 0)
return h[i].IsPrivateUse();
1385 return (0xe000 <= cp && cp <= 0xf8ff) ||
1387 (0xf0000 <= cp && cp <= 0xffffd) || (0x100000 <= cp && cp <= 0x10fffd); }
1393 int i = h.
GetKeyId(cp);
if (i >= 0)
return h[i].IsSurrogate();
1394 return 0xd800 <= cp && cp <= 0xdcff; }
1425 template<
typename TSrcVec>
void WbFindNextNonIgnored(
const TSrcVec& src,
size_t& position,
const size_t srcEnd)
const {
1426 if (position >= srcEnd)
return;
1427 position++;
while (position < srcEnd &&
IsWbIgnored(src[
TVecIdx(position)])) position++; }
1429 template<
typename TSrcVec>
void WbFindNextNonIgnoredS(
const TSrcVec& src,
size_t& position,
const size_t srcEnd)
const {
1430 if (position >= srcEnd)
return;
1431 if (IsSbSep(src[
TVecIdx(position)])) { position++;
return; }
1432 position++;
while (position < srcEnd &&
IsWbIgnored(src[
TVecIdx(position)])) position++; }
1434 template<
typename TSrcVec>
bool WbFindPrevNonIgnored(
const TSrcVec& src,
const size_t srcStart,
size_t& position)
const {
1435 if (position <= srcStart)
return false;
1436 while (position > srcStart) {
1446 template<
typename TSrcVec>
1447 bool FindNextWordBoundary(
const TSrcVec& src,
const size_t srcIdx,
const size_t srcCount,
size_t &position)
const;
1451 template<
typename TSrcVec>
1465 template<
typename TSrcVec>
1466 bool CanSentenceEndHere(
const TSrcVec& src,
const size_t srcIdx,
const size_t position)
const;
1472 template<
typename TSrcVec>
1473 bool FindNextSentenceBoundary(
const TSrcVec& src,
const size_t srcIdx,
const size_t srcCount,
size_t &position)
const;
1477 template<
typename TSrcVec>
1490 template<
class TSrcVec>
void SbEx_Add(
const TSrcVec& v) { sbExTrie.
Add(v); }
1500 static const TStr data =
"Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g.|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv";
1510 template<
typename TDestCh>
1516 template<
typename TSrcVec,
typename TDestCh>
1517 void Decompose(
const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
1518 TVec<TDestCh>& dest,
bool compatibility,
bool clrDest =
true)
const;
1519 template<
typename TSrcVec,
typename TDestCh>
1521 Decompose(src, 0, src.Len(), dest, compatibility, clrDest); }
1528 template<
typename TSrcVec,
typename TDestCh>
1529 void Compose(
const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
1531 template<
typename TSrcVec,
typename TDestCh>
1533 Compose(src, 0, src.Len(), dest, clrDest); }
1538 template<
typename TSrcVec,
typename TDestCh>
1540 TVec<TDestCh>& dest,
bool compatibility,
bool clrDest =
true)
const;
1541 template<
typename TSrcVec,
typename TDestCh>
1547 template<
typename TSrcVec,
typename TDestCh>
1548 size_t ExtractStarters(
const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
1550 template<
typename TSrcVec,
typename TDestCh>
1554 template<
typename TSrcVec>
1557 src.Clr();
for (
int i = 0; i < temp.
Len(); i++) src.Add(temp[i]);
1589 template<
typename TSrcVec,
typename TDestCh>
void GetCaseConverted(
const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
TVec<TDestCh>& dest,
const bool clrDest,
const TCaseConversion how,
const bool turkic,
const bool lithuanian)
const;
1590 template<
typename TSrcVec,
typename TDestCh>
void GetLowerCase(
const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
TVec<TDestCh>& dest,
const bool clrDest =
true,
const bool turkic =
false,
const bool lithuanian =
false)
const {
GetCaseConverted(src, srcIdx, srcCount, dest, clrDest,
ccLower, turkic, lithuanian); }
1591 template<
typename TSrcVec,
typename TDestCh>
void GetUpperCase(
const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
TVec<TDestCh>& dest,
const bool clrDest =
true,
const bool turkic =
false,
const bool lithuanian =
false)
const {
GetCaseConverted(src, srcIdx, srcCount, dest, clrDest,
ccUpper, turkic, lithuanian); }
1592 template<
typename TSrcVec,
typename TDestCh>
void GetTitleCase(
const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
TVec<TDestCh>& dest,
const bool clrDest =
true,
const bool turkic =
false,
const bool lithuanian =
false)
const {
GetCaseConverted(src, srcIdx, srcCount, dest, clrDest,
ccTitle, turkic, lithuanian); }
1593 template<
typename TSrcVec,
typename TDestCh>
void GetLowerCase(
const TSrcVec& src,
TVec<TDestCh>& dest,
const bool clrDest =
true,
const bool turkic =
false,
const bool lithuanian =
false)
const {
GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
1594 template<
typename TSrcVec,
typename TDestCh>
void GetUpperCase(
const TSrcVec& src,
TVec<TDestCh>& dest,
const bool clrDest =
true,
const bool turkic =
false,
const bool lithuanian =
false)
const {
GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
1595 template<
typename TSrcVec,
typename TDestCh>
void GetTitleCase(
const TSrcVec& src,
TVec<TDestCh>& dest,
const bool clrDest =
true,
const bool turkic =
false,
const bool lithuanian =
false)
const {
GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
1601 template<
typename TSrcVec,
typename TDestCh>
void GetSimpleLowerCase(
const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
TVec<TDestCh>& dest,
const bool clrDest =
true)
const {
GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest,
ccLower); }
1602 template<
typename TSrcVec,
typename TDestCh>
void GetSimpleUpperCase(
const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
TVec<TDestCh>& dest,
const bool clrDest =
true)
const {
GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest,
ccUpper); }
1603 template<
typename TSrcVec,
typename TDestCh>
void GetSimpleTitleCase(
const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
TVec<TDestCh>& dest,
const bool clrDest =
true)
const {
GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest,
ccTitle); }
1628 template<
typename TSrcVec,
typename TDestCh>
1630 TVec<TDestCh>& dest,
const bool clrDest,
const bool full,
const bool turkic =
false)
const { caseFolding.
Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); }
1631 template<
typename TSrcVec,
typename TDestCh>
1632 void GetCaseFolded(
const TSrcVec& src,
TVec<TDestCh>& dest,
const bool clrDest =
true,
const bool full =
true,
const bool turkic =
false)
const {
1633 GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); }
1636 template<
typename TSrcVec>
void ToCaseFolded(TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
const bool turkic =
false)
const { caseFolding.
FoldInPlace(src, srcIdx, srcCount, turkic); }
1637 template<
typename TSrcVec>
void ToCaseFolded(TSrcVec& src,
const bool turkic =
false)
const {
ToCaseFolded(src, 0, src.Len(), turkic); }
1659 if (putBackCh >= 0) {
int c =
putBackCh; putBackCh = EOF;
return c; }
1664 buf.
Clr(); comment.
Clr();
1665 bool inComment =
false, first =
true;
1668 if (c == EOF)
return ! first;
1672 else if (c == 10)
return true;
1673 else if (c ==
'#') inComment =
true;
1674 if (! inComment) buf += char(c);
1675 else comment += char(c); }
1684 void Close() { putBackCh = EOF;
if (f) { fclose(f); f = 0; }}
1691 if (line.
Len() <= 0)
continue;
1693 for (
int i = 0; i < dest.
Len(); i++) dest[i].ToTrunc();
1696 int c;
bool ok = s.
IsHexInt(
true, 0, 0x10ffff, c);
IAssertR(ok, s);
return c; }
1698 if (ClrDestP) dest.
Clr();
1700 for (
int i = 0; i < parts.
Len(); i++) {
1701 int c;
bool ok = parts[i].IsHexInt(
true, 0, 0x10ffff, c);
IAssertR(ok, s);
1731 hasCat =
false; subCat = ucOtherNotAssigned;
1746 IAssert(owner.
h[i].subCat == ucOtherNotAssigned);
1748 owner.
h[i].SetCatAndSubCat(subCat); }
1750 if (! hasCat)
return;
1752 IAssert(owner.
h[i].subCat == subCat); }
1756 if (invalidCatCodes.
IsKey(
"L&")) invalidCatCodes.
DelKey(
"L&");
1758 if (! invalidCatCodes.
Empty()) {
1759 printf(
"Invalid cat code(s) in the comments: ");
1761 printf(
" \"%s\"", invalidCatCodes.
GetKey(i).
CStr());
1875 for (
int i = 0; i < names.
Len(); i++)
1879 for (
int i = 0; i < names.
Len(); i++)
1890 PCodecBase codec = codecs[i];
bool found =
false;
1891 for (
int j = 0; j < dest.
Len(); j++)
if (dest[j]() ==
codec()) { found =
true;
break; }
1892 if (! found) dest.
Add(codec); }}
1902 if (position < 0) { position = 0;
return true; }
1903 size_t position_;
bool retVal = ucd.
FindNextWordBoundary(src, 0, src.
Len(), position_); position = int(position_);
return retVal; }
1917 if (position < 0) { position = 0;
return true; }
1918 size_t position_;
bool retVal = ucd.
FindNextSentenceBoundary(src, 0, src.
Len(), position_); position = int(position_);
return retVal; }
1995 bool isAscii =
true;
1996 for (
int i = 0, n = s.
Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii =
false;
break; }
1997 if (isAscii)
return s.
GetLc();
2014 #define ___UniFwd1(name) bool name(const int cp) const { return ucd.name(cp); }
2016 #undef DECLARE_FORWARDED_PROPERTY_METHODS
2035 template<
typename TSrcVec,
typename TDestCh>
2037 const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
2040 size_t nDecoded = 0;
2041 if (clrDest) dest.
Clr();
2042 const size_t origSrcIdx = srcIdx;
2043 const size_t srcEnd = srcIdx + srcCount;
2044 while (srcIdx < srcEnd)
2046 const size_t charSrcIdx = srcIdx;
2048 if ((c & _1000_0000) == 0) {
2050 dest.
Add(TDestCh(c)); nDecoded++;
continue; }
2051 else if ((c & _1100_0000) == _1000_0000) {
2054 switch (errorHandling) {
2057 case uehReplace: dest.
Add(TDestCh(replacementChar));
continue;
2064 uint nMoreBytes = 0, nBits = 0, minVal = 0;
2065 if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80;
2066 else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800;
2067 else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000;
2068 else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000;
2069 else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000;
2076 switch (errorHandling) {
2090 nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; }
2092 uint cOut = c & ((1 << nBits) - 1);
2093 bool cancel =
false;
2094 for (
uint i = 0; i < nMoreBytes && ! cancel; i++) {
2096 if (! (srcIdx < srcEnd)) {
2097 switch (errorHandling) {
2100 case uehReplace: dest.
Add(TDestCh(replacementChar)); cancel =
true;
continue;
2101 case uehIgnore: cancel =
true;
continue;
2104 c = src[
TVecIdx(srcIdx)] & 0xff; srcIdx++;
2105 if ((c & _1100_0000) != _1000_0000) {
2106 switch (errorHandling) {
2109 case uehReplace: dest.
Add(TDestCh(replacementChar)); srcIdx--; cancel =
true;
continue;
2110 case uehIgnore: srcIdx--; cancel =
true;
continue;
2112 cOut <<= 6; cOut |= (c & _0011_1111); }
2113 if (cancel)
continue;
2121 bool err1 = (cOut < minVal);
2125 bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff));
2126 if (err1 || err2)
switch (errorHandling) {
2129 else if (err2)
throw TUnicodeException(charSrcIdx, c,
"Invalid multibyte sequence: it decodes into 0x" +
TInt::GetStr(cOut,
"%08x") +
", but only codepoints 0..0x10ffff are valid.");
2130 else {
Fail;
break; }
2132 case uehReplace: dest.
Add(TDestCh(replacementChar));
continue;
2138 if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) {
2139 dest.
Add(cOut); nDecoded++; }
2151 template<
typename TSrcVec,
typename TDestCh>
2153 const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
2156 size_t nEncoded = 0;
2157 for (
const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
2161 if (strict && c > 0x10ffff) {
2163 switch (errorHandling) {
2170 dest.
Add(TDestCh(c & 0xffu));
2171 else if (c < 0x800u) {
2172 dest.
Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));
2173 dest.
Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2174 else if (c < 0x10000u) {
2175 dest.
Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));
2176 dest.
Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
2177 dest.
Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2178 else if (c < 0x200000u) {
2179 dest.
Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));
2180 dest.
Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
2181 dest.
Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
2182 dest.
Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2183 else if (c < 0x4000000u) {
2184 dest.
Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));
2185 dest.
Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
2186 dest.
Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
2187 dest.
Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
2188 dest.
Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2190 dest.
Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));
2191 dest.
Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));
2192 dest.
Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
2193 dest.
Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
2194 dest.
Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
2195 dest.
Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2196 if (! err) nEncoded++;
2209 template<
typename TSrcVec,
typename TDestCh>
2211 const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
2219 if (clrDest) dest.
Clr();
2220 size_t nDecoded = 0;
2221 if (srcCount <= 0)
return nDecoded;
2222 const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
2223 bool littleEndian =
false;
2225 if (bomHandling ==
bomIgnored) littleEndian = leDefault;
2229 if (byte1 == 0xfe && byte2 == 0xff) { littleEndian =
false;
if (skipBom) srcIdx += 2; }
2230 else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian =
true;
if (skipBom) srcIdx += 2; }
2231 else if (bomHandling ==
bomAllowed) littleEndian = leDefault;
2233 switch (errorHandling) {
2239 while (srcIdx < srcEnd)
2241 const size_t charSrcIdx = srcIdx;
2243 uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
2247 if (! (srcIdx + 2 <= srcEnd)) {
2248 switch (errorHandling) {
2251 case uehReplace: dest.
Add(TDestCh(replacementChar));
continue;
2255 uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
2258 switch (errorHandling) {
2262 case uehReplace: dest.
Add(TDestCh(replacementChar)); srcIdx -= 2;
continue;
2268 dest.
Add(TDestCh(cc)); nDecoded++;
continue;
2271 switch (errorHandling) {
2272 case uehThrow:
throw TUnicodeException(charSrcIdx, c,
"This 16-bit value should be used only as the second character of a surrogate pair.");
2274 case uehReplace: dest.
Add(TDestCh(replacementChar));
continue;
2278 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom)
continue;
2280 dest.
Add(TDestCh(c)); nDecoded++;
2293 template<
typename TSrcVec,
typename TDestCh>
2295 const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
2302 if (clrDest) dest.
Clr();
2303 size_t nDecoded = 0;
2304 if (srcCount <= 0)
return nDecoded;
2305 const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
2309 if (bomHandling ==
bomIgnored) swap = (isDefaultLe != isMachineLe);
2313 if (c == 0xfeff) { swap =
false;
if (skipBom) srcIdx += 1; }
2314 else if (c == 0xfffe) { swap =
true;
if (skipBom) srcIdx += 1; }
2315 else if (bomHandling ==
bomAllowed) swap = (isMachineLe != isDefaultLe);
2317 switch (errorHandling) {
2323 while (srcIdx < srcEnd)
2325 const size_t charSrcIdx = srcIdx;
2327 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
2331 if (! (srcIdx < srcEnd)) {
2332 switch (errorHandling) {
2335 case uehReplace: dest.
Add(TDestCh(replacementChar));
continue;
2339 if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8);
2342 switch (errorHandling) {
2346 case uehReplace: dest.
Add(TDestCh(replacementChar)); srcIdx -= 1;
continue;
2352 dest.
Add(TDestCh(cc)); nDecoded++;
continue;
2355 switch (errorHandling) {
2356 case uehThrow:
throw TUnicodeException(charSrcIdx, c,
"This 16-bit value should be used only as the second character of a surrogate pair.");
2358 case uehReplace: dest.
Add(TDestCh(replacementChar));
continue;
2362 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom)
continue;
2364 dest.
Add(TDestCh(c)); nDecoded++;
2375 template<
typename TSrcVec,
typename TDestCh>
2377 const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
2378 TVec<TDestCh>& dest,
const bool clrDest,
const bool insertBom,
2383 size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
2384 if (insertBom) { dest.
Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; }
2385 while (srcIdx < srcEnd)
2388 if (! (c <= 0x10ffffu)) {
2389 switch (errorHandling) {
2396 switch (errorHandling) {
2403 switch (errorHandling) {
2411 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
2412 dest.
Add(TDestCh(c)); nEncoded++;
continue; }
2414 c -= 0x10000u;
IAssert( c <= 0xfffffu);
2415 uint c1 = (c >> 10) & 1023, c2 = c & 1023;
2418 c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8);
2419 c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); }
2420 dest.
Add(TDestCh(c1));
2421 dest.
Add(TDestCh(c2));
2422 nEncoded++;
continue;
// UTF-16 encoder that writes each 16-bit code unit as two separate byte-sized
// elements of dest. Presumably this is EncodeUtf16ToBytes -- the line carrying
// the function name is not part of this listing; confirm against the full file.
// NOTE(review): the embedded line numbers jump, so several source lines are
// missing here. The comments describe only the statements that are visible.
2427 template<
typename TSrcVec,
typename TDestCh>
2429 const TSrcVec& src,
size_t srcIdx,
const size_t srcCount,
2430 TVec<TDestCh>& dest,
const bool clrDest,
const bool insertBom,
// nEncoded is incremented once per emitted character (and once for the BOM).
2434 size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
// Optional byte-order mark: bytes 0xff 0xfe when isDestLe, 0xfe 0xff otherwise.
2435 if (insertBom) { dest.
Add(isDestLe ? 0xff : 0xfe); dest.
Add(isDestLe ? 0xfe : 0xff); nEncoded++; }
2436 while (srcIdx < srcEnd)
// Values above 0x10ffff are dispatched on errorHandling (the cases are not visible here).
2439 if (! (c <= 0x10ffffu)) {
2440 switch (errorHandling) {
// ___OutRepl writes replacementChar as two bytes: low byte first when isDestLe, high byte first otherwise.
2443 #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
2448 switch (errorHandling) {
2455 switch (errorHandling) {
// Single-unit case: the two bytes of c are written in the order selected by isDestLe.
2464 if (isDestLe) { dest.
Add(c & 0xff); dest.
Add((c >> 8) & 0xff); }
2465 else { dest.
Add((c >> 8) & 0xff); dest.
Add(c & 0xff); }
2466 nEncoded++;
continue; }
// Two-unit case: subtract 0x10000 (asserted to leave at most 20 bits) and
// split the remainder into its upper and lower 10-bit halves.
2468 c -= 0x10000u;
IAssert( c <= 0xfffffu);
2469 uint c1 = (c >> 10) & 1023, c2 = c & 1023;
// Four bytes are written (c1 then c2, each in the byte order selected by isDestLe),
// yet nEncoded grows by one: the pair counts as a single character.
2471 if (isDestLe) { dest.
Add(c1 & 0xff); dest.
Add((c1 >> 8) & 0xff); dest.
Add(c2 & 0xff); dest.
Add((c2 >> 8) & 0xff); }
2472 else { dest.
Add((c1 >> 8) & 0xff); dest.
Add(c1 & 0xff); dest.
Add((c2 >> 8) & 0xff); dest.
Add(c2 & 0xff); }
2473 nEncoded++;
continue;
// Advances `position` to the next boundary inside [srcIdx, srcIdx + srcCount)
// using word-break flags (wbf*). Presumably this is FindNextWordBoundary -- the
// line carrying the function name is not part of this listing; confirm.
// Visible return values: true when `position` was moved to a boundary, false
// when `position` is already at or past the end of the range.
// NOTE(review): the embedded line numbers jump, so several source lines are
// missing here. The comments describe only the statements that are visible.
2482 template<
typename TSrcVec>
// A position before the start of the range is clamped to srcIdx and reported as a boundary.
2486 if (position < srcIdx) { position = srcIdx;
return true; }
2488 const size_t srcEnd = srcIdx + srcCount;
// Nothing left to find at or beyond the end of the range.
2489 if (position >= srcEnd)
return false;
2491 size_t origPos = position;
2497 size_t posPrev = position;
// Still at the starting position: a character satisfying IsSbSep that is directly
// followed by one satisfying IsWbIgnored gives a boundary right after the first.
2500 if (position == origPos && position + 1 < srcEnd && IsSbSep(src[
TVecIdx(position)]) &&
IsWbIgnored(src[
TVecIdx(position + 1)])) { position += 1;
return true; }
// Code points at the previous/current/next positions; -1 stands for "not available".
2504 int cPrev = (posPrev < position ? (int) src[
TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (
int) src[
TVecIdx(position)] : -1);
2505 int cNext = (position < posNext && posNext < srcEnd ? (int) src[
TVecIdx(posNext)] : -1);
2507 int cNext2, wbfNext2;
// Every iteration shifts the prev/cur/next/next2 window (positions, code points, flags) one step forward.
2509 for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
2510 cPrev = cCur, cCur = cNext, cNext = cNext2,
2511 wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2)
2517 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[
TVecIdx(posNext2)] : -1);
// Each Test* macro executes `continue` (i.e. moves on without reporting a boundary)
// when all of the given flags are set on the named window slots.
2519 #define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
2520 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
2521 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
// Code point 13 followed by 10 (CR LF): move on without reporting a boundary between them.
2523 if (cCur == 13 && cNext == 10)
continue;
2546 if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet &&
// Boundary found: report posNext.
2549 position = posNext;
return true;
2552 #undef TestPrevCurNext
// Fills the boolean vector `dest` with boundary marks for the range starting at
// srcIdx. Presumably this is FindWordBoundaries -- the line carrying the function
// name and the call that advances `position` are not part of this listing; confirm.
// dest is indexed relative to srcIdx and holds srcCount + 1 entries.
2560 template<
typename TSrcVec>
// (Re)allocate dest only when its length is not already srcCount + 1.
2563 if (
size_t(dest.
Len()) != srcCount + 1) dest.
Gen(
TVecIdx(srcCount + 1));
2565 size_t position = srcIdx;
// The start of the range is marked unconditionally.
2566 dest[
TVecIdx(position - srcIdx)] =
true;
2567 while (position < srcIdx + srcCount)
2569 size_t oldPos = position;
// If `position` did not move past oldPos, the Assert below fails by construction.
2571 if (oldPos >= position) {
2572 Assert(oldPos < position);
2574 Assert(position <= srcIdx + srcCount);
// Mark the newly reached position.
2575 dest[
TVecIdx(position - srcIdx)] =
true;
// Presumably CanSentenceEndHere -- the line carrying the function name is not
// part of this listing; confirm against the full file.
// NOTE(review): most of the body is missing (the embedded line numbers jump).
// Visible pieces: a scan starting at `position`, two loops that run while the
// current flags contain ucfSbSp, and a collection of up to three code points
// (cLast, cButLast, cButButLast) with several early `return true` exits.
2584 template<
typename TSrcVec>
2589 size_t pos = position;
2597 while ((sfb &
ucfSbSp) == ucfSbSp) {
2601 while ((sfb & ucfSbSp) == ucfSbSp) {
// len counts how many of the three slots are filled; node starts out negative.
2609 int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1;
// -1 stands for "no character" once atEnd is set.
2613 c = (atEnd ? -1 : (int) src[
TVecIdx(pos)]);
// Taken at the end of the data or when the category is none of letter, number, symbol.
2615 if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) {
2621 if (atEnd)
return true; }
// Store c in the next free slot.
2622 if (len == 1) { cButLast = c; len++; }
2623 else if (len == 2) { cButButLast = c; len++;
// A negative node leads to `return true` (how node is computed is not visible here).
2626 if (node < 0)
return true; }
2630 if (node < 0)
return true; }
// Advances `position` to the next boundary inside [srcIdx, srcIdx + srcCount)
// using sentence-break flags (sbf*). Presumably this is FindNextSentenceBoundary --
// the line carrying the function name is not part of this listing; confirm.
// Visible return values: true when `position` was moved to a boundary, false
// when `position` is already at or past the end of the range.
// Two pieces of state accompany the scan: backState (built from the was* flags
// collected before the scan, then updated per character) and aheadState (a
// cached result of looking ahead for a lowercase character).
// NOTE(review): the embedded line numbers jump, so several source lines are
// missing here. The comments describe only the statements that are visible.
2635 template<
typename TSrcVec>
// A position before the start of the range is clamped to srcIdx and reported as a boundary.
2639 if (position < srcIdx) { position = srcIdx;
return true; }
2641 const size_t srcEnd = srcIdx + srcCount;
// Nothing left to find at or beyond the end of the range.
2642 if (position >= srcEnd)
return false;
2644 size_t origPos = position;
2650 size_t posPrev = position;
// Still at the starting position: a character satisfying IsSbSep that is directly
// followed by one satisfying IsWbIgnored gives a boundary right after the first.
2653 if (position == origPos && position + 1 < srcEnd && IsSbSep(src[
TVecIdx(position)]) &&
IsWbIgnored(src[
TVecIdx(position + 1)])) { position += 1;
return true; }
// Code points at the previous/current/next positions; -1 stands for "not available".
2657 int cPrev = (posPrev < position ? (int) src[
TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (
int) src[
TVecIdx(position)] : -1);
2658 int cNext = (position < posNext && posNext < srcEnd ? (int) src[
TVecIdx(posNext)] : -1);
2660 int cNext2, sbfNext2;
// Look-behind state: which terminator kind (ATerm / STerm) was seen, optionally
// followed by Sp or Sep characters.
2662 typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState;
2663 TPeekBackState backState;
2665 size_t pos = position;
2666 bool wasSep =
false, wasSp =
false, wasATerm =
false, wasSTerm =
false;
2672 if ((sbf &
ucfSbSep) == ucfSbSep) {
2678 while ((sbf &
ucfSbSp) == ucfSbSp) {
// Fold the collected flags into the initial backState; wasSep takes precedence over wasSp.
2693 if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm);
2694 else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm);
2695 else backState = stInit;
// Look-ahead state; stUnknown means the look-ahead has to be (re)computed.
2701 typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState;
2702 TPeekAheadState aheadState = stUnknown;
// Every iteration shifts the prev/cur/next/next2 window (positions, code points, flags) one step forward.
2704 for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
2705 cPrev = cCur, cCur = cNext, cNext = cNext2,
2706 sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2)
2712 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[
TVecIdx(posNext2)] : -1);
// TestCur(X) is true when the current flags contain ucfSbX.
// Trans(X, S): if TestCur(X) holds, set backState to stS and leave the switch.
// Where a case ends in "backState = stInit", that reset runs only if no Trans matched.
2715 #define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
2716 #define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; }
2717 switch (backState) {
2718 case stInit:
Trans(ATerm, ATerm);
Trans(STerm, STerm);
break;
2719 case stATerm:
Trans(Sp, ATermSp);
Trans(Sep, ATermSep);
Trans(ATerm, ATerm);
Trans(STerm, STerm);
Trans(Close, ATerm); backState = stInit;
break;
2720 case stSTerm:
Trans(Sp, STermSp);
Trans(Sep, STermSep);
Trans(ATerm, ATerm);
Trans(STerm, STerm);
Trans(Close, STerm); backState = stInit;
break;
2721 case stATermSp:
Trans(Sp, ATermSp);
Trans(Sep, ATermSep);
Trans(ATerm, ATerm);
Trans(STerm, STerm); backState = stInit;
break;
2722 case stSTermSp:
Trans(Sp, STermSp);
Trans(Sep, STermSep);
Trans(ATerm, ATerm);
Trans(STerm, STerm); backState = stInit;
break;
2723 case stATermSep:
Trans(ATerm, ATerm);
Trans(STerm, STerm); backState = stInit;
break;
2724 case stSTermSep:
Trans(ATerm, ATerm);
Trans(STerm, STerm); backState = stInit;
break;
// A character is skippable for the look-ahead when it carries none of the
// OLetter, Upper, Lower, Sep, STerm, ATerm flags.
2729 #define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)
// Consistency checks of a cached aheadState against isLower (isLower is computed on a missing line).
2732 if (aheadState == stLower)
IAssert(isLower);
2733 else if (aheadState == stNotLower)
IAssert(! isLower);
2735 aheadState = stUnknown; }
// Recompute the look-ahead starting at posNext.
2736 if (aheadState == stUnknown)
2739 size_t pos = posNext;
2740 while (pos < srcEnd) {
2743 if ((sbf &
ucfSbLower) == ucfSbLower) aheadState = stLower;
2744 else aheadState = stNotLower;
// Running off the end of the range counts as "not lower".
2747 if (! (pos < srcEnd)) aheadState = stNotLower;
2749 #undef IsPeekAheadSkippable
// Each Test* macro executes `continue` (i.e. moves on without reporting a boundary)
// when all of the given flags are set on the named window slots.
2751 #define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
2752 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
2753 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
// Code point 13 followed by 10 (CR LF): move on without reporting a boundary between them.
2755 if (cCur == 13 && cNext == 10)
continue;
// After a character flagged Sep the boundary is reported at posNext.
2757 if ((sbfCur &
ucfSbSep) == ucfSbSep) {
2759 position = posNext;
return true; }
2767 if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) &&
// ATerm (optionally followed by Sp) with a lowercase character ahead: move on without reporting a boundary.
2770 if ((backState == stATerm || backState == stATermSp) && aheadState == stLower)
continue;
// Directly after a terminator, a following Close, Sp or Sep character is passed over.
2773 if ((backState == stATerm || backState == stSTerm) && (sbfNext & (
ucfSbClose |
ucfSbSp | ucfSbSep)) != 0)
continue;
// In any terminator state: pass over a following Sp/Sep, otherwise report the boundary at posNext.
2776 if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) {
2777 if ((sbfNext & (
ucfSbSp | ucfSbSep)) != 0)
continue;
2779 position = posNext;
return true; }
2784 #undef TestPrevCurNext
// Fills the boolean vector `dest` with boundary marks for the range starting at
// srcIdx. Presumably this is FindSentenceBoundaries -- the line carrying the
// function name and the call that advances `position` are not part of this
// listing; confirm. dest is indexed relative to srcIdx and holds srcCount + 1 entries.
2792 template<
typename TSrcVec>
// (Re)allocate dest only when its length is not already srcCount + 1.
2795 if (
size_t(dest.
Len()) != srcCount + 1) dest.
Gen(
TVecIdx(srcCount + 1));
2797 size_t position = srcIdx;
// The start of the range is marked unconditionally.
2798 dest[
TVecIdx(position - srcIdx)] =
true;
2799 while (position < srcIdx + srcCount)
2801 size_t oldPos = position;
// If `position` did not move past oldPos, the Assert below fails by construction.
2803 if (oldPos >= position) {
2804 Assert(oldPos < position);
2806 Assert(position <= srcIdx + srcCount);
// Mark the newly reached position.
2807 dest[
TVecIdx(position - srcIdx)] =
true;
// Case conversion with language-specific special cases, appending the result to
// dest. Presumably this is GetCaseConverted -- the line carrying the function
// name is not part of this listing; confirm against the full file.
// `how` is the requested conversion; `howHere` is the conversion applied to the
// current code point (it differs from `how` only for title-casing, where the
// first cased character after a word boundary is treated with ccTitle).
// NOTE(review): the embedded line numbers jump, so several source lines --
// including most of the conditions that select between branches -- are missing.
// The comments describe only the statements that are visible.
2816 template<
typename TSrcVec,
typename TDestCh>
2820 const bool turkic,
const bool lithuanian)
const
2823 if (clrDest) dest.
Clr();
// Named code points used by the special cases below.
2825 GreekCapitalLetterSigma = 0x3a3,
2826 GreekSmallLetterSigma = 0x3c3,
2827 GreekSmallLetterFinalSigma = 0x3c2,
2828 LatinCapitalLetterI = 0x49,
2829 LatinCapitalLetterJ = 0x4a,
2830 LatinCapitalLetterIWithOgonek = 0x12e,
2831 LatinCapitalLetterIWithGrave = 0xcc,
2832 LatinCapitalLetterIWithAcute = 0xcd,
2833 LatinCapitalLetterIWithTilde = 0x128,
2834 LatinCapitalLetterIWithDotAbove = 0x130,
2835 LatinSmallLetterI = 0x69,
2836 CombiningDotAbove = 0x307
// Per-word bookkeeping: whether one / two cased characters were seen, and the first of them.
2839 bool seenCased =
false, seenTwoCased =
false;
int cpFirstCased = -1;
2840 size_t nextWordBoundary = srcIdx;
// wordBoundaries is filled lazily; wbsKnown records whether that has happened.
2841 TBoolV wordBoundaries;
bool wbsKnown =
false;
2842 for (
const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
// cp is the current code point; srcIdx already points past it from here on.
2844 int cp = src[
TVecIdx(srcIdx)]; srcIdx++;
2850 if (how !=
ccTitle) howHere = how;
// The current character sits on a word boundary: reset the per-word bookkeeping.
2852 if (srcIdx - 1 == nextWordBoundary) {
2853 seenCased =
false; seenTwoCased =
false; cpFirstCased = -1;
2855 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
2856 bool isCased = IsCased(cp);
// The first cased character of the word is converted with ccTitle.
2857 if (isCased && ! seenCased) { howHere =
ccTitle; seenCased =
true; cpFirstCased = cp; }
2859 if (isCased && seenCased) seenTwoCased =
true; }
// Capital sigma being lowercased: look for cased characters after and before it
// within the same word (i.e. up to the nearest marked word boundary).
2864 if (cp == GreekCapitalLetterSigma && howHere ==
ccLower)
// Compute the word-boundary table at most once per call.
2871 if (! wbsKnown) {
FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown =
true; }
2872 size_t srcIdx2 = srcIdx;
bool casedAfter =
false;
// Forward scan, stopping at the next word boundary.
2876 while (! wordBoundaries[
TVecIdx(srcIdx2 - origSrcIdx)])
2878 int cp2 = src[
TVecIdx(srcIdx2)]; srcIdx2++;
2879 if (IsCased(cp2)) { casedAfter =
true;
break; }
// Backward scan from the sigma itself, stopping at the preceding word boundary.
2885 srcIdx2 = srcIdx - 1;
bool casedBefore =
false;
2887 while (! wordBoundaries[
TVecIdx(srcIdx2 - origSrcIdx)])
2889 --srcIdx2;
int cp2 = src[
TVecIdx(srcIdx2)];
2890 if (IsCased(cp2)) { casedBefore =
true;
break; }
// Final-sigma form; the condition selecting it is on a line missing from the listing.
2894 dest.
Add(GreekSmallLetterFinalSigma);
Assert(howHere ==
ccLower);
continue; }
// Otherwise the ordinary small sigma.
2897 dest.
Add(GreekSmallLetterSigma);
continue;
2899 else if (lithuanian)
2903 if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek)
// moreAbove is determined by scanning the following code points (the test itself is not visible here).
2905 bool moreAbove =
false;
2906 for (
size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
2908 const int cp2 = src[
TVecIdx(srcIdx2)]; srcIdx2++;
// I / J / I-with-ogonek are written as 0x69 / 0x6a / 0x12f followed by 0x307.
2915 if (cp == LatinCapitalLetterI) { dest.
Add(0x69); dest.
Add(0x307);
continue; }
2916 if (cp == LatinCapitalLetterJ) { dest.
Add(0x6a); dest.
Add(0x307);
continue; }
2917 if (cp == LatinCapitalLetterIWithOgonek) { dest.
Add(0x12f); dest.
Add(0x307);
continue; }
// I with grave / acute / tilde are written as 0x69, 0x307 and then 0x300 / 0x301 / 0x303.
2920 else if (cp == LatinCapitalLetterIWithGrave) { dest.
Add(0x69); dest.
Add(0x307); dest.
Add(0x300);
continue; }
2921 else if (cp == LatinCapitalLetterIWithAcute) { dest.
Add(0x69); dest.
Add(0x307); dest.
Add(0x301);
continue; }
2922 else if (cp == LatinCapitalLetterIWithTilde) { dest.
Add(0x69); dest.
Add(0x307); dest.
Add(0x303);
continue; }
// Combining dot above: scan backwards to learn whether it follows a soft-dotted character.
2924 if (cp == CombiningDotAbove)
2930 bool afterSoftDotted =
false;
2931 size_t srcIdx2 = srcIdx - 1;
2932 while (origSrcIdx < srcIdx2)
2934 --srcIdx2;
int cp2 = src[
TVecIdx(srcIdx2)];
2938 afterSoftDotted = IsSoftDotted(cp2);
break; }
2940 if (afterSoftDotted)
// When lowercasing, the dot is kept.
2946 if (how ==
ccLower) { dest.
Add(0x307);
continue; }
// Exactly one cased character seen so far in this word: the dot is dropped (nothing is added).
2950 if (seenCased && ! seenTwoCased)
continue;
2951 dest.
Add(0x307);
continue;
// Capital I with dot above: 0x69 when lowercasing, otherwise left as 0x130.
2959 if (cp == LatinCapitalLetterIWithDotAbove) {
2960 dest.
Add(howHere ==
ccLower ? 0x69 : 0x130);
continue; }
// Combining dot above: scan backwards for a preceding capital I.
2964 else if (cp == CombiningDotAbove)
2968 bool afterI =
false;
2969 size_t srcIdx2 = srcIdx - 1;
2970 while (origSrcIdx < srcIdx2)
2972 --srcIdx2;
int cp2 = src[
TVecIdx(srcIdx2)];
2973 if (cp2 == LatinCapitalLetterI) { afterI =
true;
break; }
2978 if (how ==
ccTitle && seenCased && ! seenTwoCased) {
2988 IAssert(cpFirstCased == LatinCapitalLetterI);
2989 dest.
Add(0x307);
continue; }
// Capital I: scan forward for a 0x307 (combining dot above).
2995 else if (cp == LatinCapitalLetterI)
3001 bool beforeDot =
false;
3002 for (
size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
3004 const int cp2 = src[
TVecIdx(srcIdx2)]; srcIdx2++;
3005 if (cp2 == 0x307) { beforeDot =
true;
break; }
// 0x131 when lowercasing, otherwise left as 0x49.
3010 dest.
Add(howHere ==
ccLower ? 0x131 : 0x49);
continue; }
// Small i: left as 0x69 when lowercasing, otherwise 0x130.
3013 else if (cp == LatinSmallLetterI)
3015 dest.
Add(howHere ==
ccLower ? 0x69 : 0x130);
continue;
// General path: pick the special-casing table that matches howHere.
3019 const TIntIntVH &specHere = (
3020 howHere == how ? specials :
// A negative cpNew means "no mapping": the code point is copied unchanged.
3034 if (cpNew < 0) cpNew = cp;
3035 dest.
Add(cpNew);
continue; }
// Simple (one code point to one code point) case conversion that appends to dest.
// Presumably this is GetSimpleCaseConverted -- the line carrying the function
// name is not part of this listing; confirm against the full file.
// NOTE(review): the embedded line numbers jump, so several source lines are
// missing here. The comments describe only the statements that are visible.
3041 template<
typename TSrcVec,
typename TDestCh>
3045 if (clrDest) dest.
Clr();
3046 bool seenCased =
false;
size_t nextWordBoundary = srcIdx;
3047 for (
const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
// cp is the current code point; srcIdx already points past it from here on.
3049 const int cp = src[
TVecIdx(srcIdx)]; srcIdx++;
// Code points that are not a key of h are copied through unchanged.
3050 int i =
h.
GetKeyId(cp);
if (i < 0) { dest.
Add(cp);
continue; }
3055 if (how !=
ccTitle) howHere = how;
// The current character sits on a word boundary.
3057 if (srcIdx - 1 == nextWordBoundary) {
3060 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
3061 bool isCased = IsCased(cp);
// The first cased character seen is converted with ccTitle.
3062 if (isCased && ! seenCased) { howHere =
ccTitle; seenCased =
true; }
// A negative cpNew means "no mapping": fall back to the original code point.
3066 if (cpNew < 0) cpNew = cp;
// In-place simple case conversion: mapped code points are written back into src.
// Presumably this is ToSimpleCaseConverted -- the line carrying the function
// name is not part of this listing; confirm against the full file.
// NOTE(review): the embedded line numbers jump, so several source lines are
// missing here. The comments describe only the statements that are visible.
3071 template<
typename TSrcVec>
3074 bool seenCased =
false;
size_t nextWordBoundary = srcIdx;
3075 for (
const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
3077 const int cp = src[
TVecIdx(srcIdx)];
// Code points that are not a key of h are left untouched.
3078 int i =
h.
GetKeyId(cp);
if (i < 0)
continue;
3083 if (how !=
ccTitle) howHere = how;
// The current character sits on a word boundary.
3085 if (srcIdx == nextWordBoundary) {
3088 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
3089 bool isCased = IsCased(cp);
// The first cased character seen is converted with ccTitle.
3090 if (isCased && ! seenCased) { howHere =
ccTitle; seenCased =
true; }
// Only overwrite the element when a mapping exists (cpNew >= 0).
3094 if (cpNew >= 0) src[
TVecIdx(srcIdx)] = cpNew;
// Appends the decomposition of a single code point to dest.
// Presumably this is AddDecomposition -- the line carrying the function name is
// not part of this listing; confirm against the full file.
// NOTE(review): the embedded line numbers jump, so several source lines are
// missing here. The comments describe only the statements that are visible.
3102 template<
typename TDestCh>
// One branch emits the two values L and V (computed on lines not shown here).
3112 dest.
Add(L); dest.
Add(V);
// A code point that is not a key of h is appended unchanged.
3116 int i =
h.
GetKeyId(codePoint);
if (i < 0) { dest.
Add(codePoint);
return; }
// A negative decompOffset likewise means the code point is appended unchanged.
3118 int ofs = ci.
decompOffset;
if (ofs < 0) { dest.
Add(codePoint);
return; }
// Decomposes a range of src into dest and then makes a second pass over the
// newly appended part of dest. Presumably this is Decompose -- the line carrying
// the function name is not part of this listing; confirm against the full file.
// NOTE(review): the embedded line numbers jump, so several source lines are
// missing here. The comments describe only the statements that are visible.
3125 template<
typename TSrcVec,
typename TDestCh>
3127 TVec<TDestCh>& dest,
const bool compatibility,
bool clrDest)
const
3129 if (clrDest) dest.
Clr();
// Remember where this call's output starts, so existing content of dest is left alone.
3130 const size_t destStart = dest.
Len();
// NOTE(review): this loop compares srcIdx against srcCount, whereas the other
// routines in this file iterate up to srcIdx + srcCount. With a nonzero starting
// srcIdx the two bounds differ -- verify whether this is intended.
3132 while (srcIdx < srcCount) {
// Second pass over the elements appended since destStart.
3135 for (
size_t destIdx = destStart, destEnd = dest.
Len(); destIdx < destEnd; )
3138 int cp = dest[
TVecIdx(destIdx)]; destIdx++;
// Runs Decompose on the given range into a temporary vector `temp` (declared on
// a line not shown here). Presumably this is DecomposeAndCompose -- the line
// carrying the function name and the step that follows the Decompose call are
// not part of this listing; confirm against the full file.
3147 template<
typename TSrcVec,
typename TDestCh>
3149 TVec<TDestCh>& dest,
bool compatibility,
bool clrDest)
const
3151 if (clrDest) dest.
Clr();
3153 Decompose(src, srcIdx, srcCount, temp, compatibility);
// Combines code points of src with the most recent "starter" already written to
// dest, where a combination exists. Presumably this is Compose -- the line
// carrying the function name is not part of this listing; confirm.
// NOTE(review): the embedded line numbers jump, so several source lines are
// missing here. The comments describe only the statements that are visible.
3157 template<
typename TSrcVec,
typename TDestCh>
3161 if (clrDest) dest.
Clr();
// State about the last starter: whether one is known, its index in dest, and its code point.
3162 bool lastStarterKnown =
false;
3163 size_t lastStarterPos = size_t(-1);
3164 int cpLastStarter = -1;
3165 const size_t srcEnd = srcIdx + srcCount;
3167 while (srcIdx < srcEnd)
// cp is the current code point; srcIdx already points past it from here on.
3169 const int cp = src[
TVecIdx(srcIdx)]; srcIdx++;
// A combination is only attempted when a starter is known and cpClass exceeds ccMax.
3174 if (lastStarterKnown && ccMax < cpClass)
// cpCombined stays negative when no combination was found.
3177 int cpCombined = -1;
// Table lookup: a non-negative j indexes the combined code point in inverseDec.
3180 if (j >= 0) { cpCombined =
inverseDec[j];
break; }
// Arithmetic case: with TIndex in [0, HangulTCount) the result is cpLastStarter + TIndex.
3194 if (0 <= TIndex && TIndex < HangulTCount) {
3195 cpCombined = cpLastStarter + TIndex;
// On success the starter stored in dest is overwritten with the combined code point,
// which then becomes the new last starter.
3200 if (cpCombined >= 0) {
3201 dest[
TVecIdx(lastStarterPos)] = cpCombined;
3204 cpLastStarter = cpCombined;
continue; }
// Record cp as the new last starter; its position is the current length of dest.
3207 lastStarterKnown =
true; lastStarterPos = dest.
Len(); cpLastStarter = cp; ccMax = cpClass - 1; }
3208 else if (cpClass > ccMax)
// Copies selected code points of the range into dest and counts them in retVal.
// Presumably this is ExtractStarters -- the line carrying the function name and
// the selection condition (original line 3222) are not part of this listing; confirm.
3214 template<
typename TSrcVec,
typename TDestCh>
3218 if (clrDest) dest.
Clr();
3220 for (
const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
3221 const int cp = src[
TVecIdx(srcIdx)];
// Each code point that passes the (unseen) test is appended and counted.
3223 { dest.
Add(cp); retVal++; } }
// Two identical loops taken from different places in the original file (lines
// 3230 and 3237); each adds 0 + 1 + 2 + 3 + 4 to `sum`. The enclosing functions
// are not part of this listing.
3230 for (
int i = 0; i < 5; i++) sum += i;
3237 for (
int i = 0; i < 5; i++) sum += i;
bool IsVariationSelector() const
void ToSimpleTitleCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const
static int SwapBytes(int x)
TPair< TInt, TInt > TIntPr
bool IsSbFlag(const int cp, const TUniChFlags flag) const
static int FromUnicode(int c)
void ToSimpleUpperCase(TIntV &src) const
void GetUpperCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
static void Add(TVector &vector, const TElement &element)
void TestDecodeUtf16(TRnd &rnd, const TStr &testCaseDesc, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom)
static const int fromUnicodeTable1[6 *16]
void DecomposeAndCompose(const TSrcVec &src, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
const char * GetCharName(const int cp) const
void Compose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const
enum TUniChProperties_ TUniChProperties
T8BitCodec< TEncoding_ISO8859_4 > iso8859_4
#define IAssertR(Cond, Reason)
bool Has1Gram(const TItem &item) const
void ToSimpleTitleCase(TSrcVec &src) const
void SetPropertyX(const TUniChPropertiesX flag)
static void AppendVector(const TVec< TSrcDat > &src, TVec< TDestDat > &dest)
TUniChCategory GetCat(const int cp) const
static int ToUnicode(int c)
int GetScriptByName(const TStr &scriptName) const
TUcdFileReader & operator=(const TUcdFileReader &r)
void ToSimpleUpperCase(TSrcVec &src) const
void SbEx_Add(const TStr &s)
TStr EncodeUtf8Str(const TIntV &src) const
enum TUniChFlags_ TUniChFlags
bool IsCompositionExclusion() const
static void Add(TVector &vector, const TElement &element)
THash< TItemPr, TVoid > pairs
TUniCaseFolding(TSIn &SIn)
void SaveBin(const TStr &fnBinUcd)
TUniChDb::TCaseConversion TCaseConversion
bool IsDcpFlag(const TUniChFlags flag) const
static const ushort LineBreak_Quotation
void SetProperty(const TUniChProperties flag)
void Add(const TSrcVec &src)
bool IsGraphemeExtend() const
void SetSbFlag(const TUniChFlags flag)
void GetSimpleCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how) const
static const int fromUnicodeTable1[14 *16]
static TStr GetSpecialCasingFn()
enum TUniChSubCategory_ TUniChSubCategory
TPair< TItem, TItem > TItemPr
T8BitCodec< TEncoding_ISO8859_1 > iso8859_1
void Save(TSOut &SOut) const
int GetWbFlags(const int cp) const
TPt< TCodecBase > PCodecBase
void WbFindNextNonIgnoredS(const TSrcVec &src, size_t &position, const size_t srcEnd) const
void SetDcpFlag(const TUniChFlags flag)
void SetWbFlag(const TUniChFlags flag)
virtual void Test() const
bool IsGraphemeBase() const
TStr EncodeUtf8Str(const TSrcVec &src, size_t srcIdx, const size_t srcCount) const
void ToSimpleUpperCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const
enum TUniChCategory_ TUniChCategory
enum TUnicodeErrorHandling_ TUnicodeErrorHandling
T8BitCodec< TEncoding_ISO8859_3 > TCodec_ISO8859_3
static int ToUnicode(int c)
TUniChSubCategory GetSubCat(const int cp) const
TSubcatHelper(TUniChDb &owner_)
size_t FromUnicode(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TDestVec &dest, const bool clrDest=true) const
size_t ToUnicode(const TStr &src, TIntV &dest, const bool clrDest=true) const
static TStr GetScriptNameKatakana()
TUniTrie< TInt > sbExTrie
const char * GetCStr(const uint &Offset) const
static const ushort LineBreak_InfixNumeric
static uint GetRndUint(TRnd &rnd)
bool IsLogicalOrderException() const
T8BitCodec< TEncoding_ISO8859_4 > TCodec_ISO8859_4
TSizeTy Len() const
Returns the number of elements in the vector.
static int ToUnicode(int c)
virtual TStr GetName() const =0
void InitPropList(const TStr &basePath)
static const int toUnicodeTable[8 *16]
TUniChSubCategory GetSubCat() const
void ToCaseFolded(TSrcVec &src, const bool turkic=false) const
void Save(TSOut &SOut) const
void GetSimpleTitleCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
bool IsDefaultIgnorable() const
void ToSimpleLowerCase(TSrcVec &src) const
T8BitCodec(TUnicodeErrorHandling errorHandling_, int replacementChar_=TUniCodec::DefaultReplacementChar)
enum TUniChDb::TCaseConversion_ TCaseConversion
void GetSimpleUpperCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
bool IsAlphabetic() const
void GetLowerCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
int GetSbFlags(const int cp) const
void WbFindCurOrNextNonIgnored(const TSrcVec &src, size_t &position, const size_t srcEnd) const
virtual size_t FromUnicode(const TIntV &src, size_t srcIdx, const size_t srcCount, TStr &dest, const bool clrDest=true) const
static const ushort LineBreak_ComplexContext
T8BitCodec< TEncoding_CP852 > cp852
const TStr & GetScriptName(const int scriptId) const
TUniChCategory GetCat() const
TIntIntVH specialCasingUpper
int GetScript(const TUniChInfo &ci) const
static const int yuAsciiChars[10]
TStr GetSubStr(const int &BChN, const int &EChN) const
void GetUpperCase(const TIntV &src, TIntV &dest) const
T8BitCodec< TEncoding_ISO8859_3 > iso8859_3
THash< TItem, TVoid > singles
void RegisterCodec(const TStr &nameList, const PCodecBase &codec)
void InitDerivedCoreProperties(const TStr &basePath)
virtual void Test() const
static int FromUnicode(int c)
bool IsWhiteSpace() const
static int FromUnicode(int c)
void InitLineBreaks(const TStr &basePath)
static const int uniChars[10]
void Decompose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
void GetLowerCase(const TIntV &src, TIntV &dest) const
void WbFindNextNonIgnored(const TSrcVec &src, size_t &position, const size_t srcEnd) const
void ToSimpleLowerCase(TIntV &src) const
bool WbFindPrevNonIgnored(const TSrcVec &src, const size_t srcStart, size_t &position) const
#define Trans(curFlag, newState)
void Add(const TSrcVec &src, const size_t srcIdx, const size_t srcCount)
static TStr GetNormalizationTestFn()
T8BitCodec< TEncoding_ISO8859_2 > TCodec_ISO8859_2
void GetCaseFolded(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool full=true, const bool turkic=false) const
enum TUniChPropertiesX_ TUniChPropertiesX
void GetSimpleLowerCase(const TIntV &src, TIntV &dest) const
bool IsWbFlag(const int cp, const TUniChFlags flag) const
static void SaveUShort(TSOut &SOut, ushort u)
TUnicodeErrorHandling errorHandling
THash< TInt, TIntV > TIntIntVH
void ToCaseFolded(TIntV &src) const
void DecomposeAndCompose(const TIntV &src, TIntV &dest, bool compatibility) const
void Test(const TStr &basePath)
static const int fromUnicodeTable2[2 *16]
static const int fromUnicodeTable2[4 *16]
T8BitCodec< TEncoding_CP437 > cp437
static void ParseCodePointRange(const TStr &s, int &from, int &to)
TIntIntVH specialCasingLower
virtual TStr GetName() const
int simpleUpperCaseMapping
size_t ExtractStarters(const TSrcVec &src, TVec< TDestCh > &dest, bool clrDest=true) const
bool FindNextWordBoundary(const TIntV &src, int &position) const
size_t ToUnicode(const TStr &src, TIntV &dest, const bool clrDest=true) const
static void LoadUShort(TSIn &SIn, ushort &u)
size_t UniToStr(const TIntV &src, TStr &dest, const bool clrDest=true) const
void TestCaseConversion(const TStr &source, const TStr &trueLc, const TStr &trueTc, const TStr &trueUc, bool turkic, bool lithuanian)
static int FromUnicode(int c)
void Save(TSOut &SOut) const
static TStr GetUnicodeDataFn()
T8BitCodec< TEncoding_YuAscii > TCodec_YuAscii
THash< TIntPr, TInt > inverseDec
void FindWordBoundaries(const TIntV &src, TBoolV &dest) const
bool IsPropertyX(const TUniChPropertiesX flag) const
bool FindNextSentenceBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
size_t EncodeUtf8(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
void ClrSentenceBoundaryExceptions()
bool IsPrivateUse() const
size_t DecodeUtf16FromWords(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
TStr GetWbFlagsStr() const
static TStr GetScriptsFn()
static const int fromUnicodeTable3[6 *16]
void ToSimpleCaseConverted(TSrcVec &src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const
void FindSentenceBoundaries(const TIntV &src, TBoolV &dest) const
static int ToUnicode(int c)
void Clr(bool DoDel=false)
size_t FromUnicode(const TIntV &src, TChA &dest, const bool clrDest=true) const
bool IsDeprecated() const
void TestCaseConversions()
int simpleTitleCaseMapping
static PSIn New(const TStr &FNm)
void DelKey(const TKey &Key)
void SetCatAndSubCat(const TUniChSubCategory catAndSubCat)
static TStr GetWbFlagsStr(const int flags)
size_t UniToStr(const TIntV &src, size_t srcIdx, const size_t srcCount, TStr &dest, const bool clrDest=true) const
size_t ToUnicode(const TIntV &src, TIntV &dest, const bool clrDest=true) const
TStr GetCharNameS(const int cp) const
PCodecBase GetCodec(const TStr &name) const
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
int ChangeStrAll(const TStr &SrcStr, const TStr &DstStr, const bool &FromStartP=false)
static const int fromUnicodeTable2[2 *16]
int DecodeUtf16FromWords(const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
TUcdFileReader(const TUcdFileReader &r)
void TestDecodeUtf8(TRnd &rnd, const TStr &testCaseDesc)
int ExtractStarters(TIntV &src) const
bool IsCompatibilityDecomposition() const
void ToSimpleTitleCase(TIntV &src) const
void PrintCharNames(FILE *f, const TSrcVec &src, const TStr &prefix) const
static TStr GetScriptNameUnknown()
bool IsHexInt(const bool &Check, const int &MnVal, const int &MxVal, int &Val) const
virtual size_t FromUnicode(const TIntV &src, size_t srcIdx, const size_t srcCount, TChA &dest, const bool clrDest=true) const
void PutAll(const TVal &Val)
Sets all elements of the vector to value Val.
void GetSimpleUpperCase(const TIntV &src, TIntV &dest) const
TVec< PCodecBase > TCodecBaseV
bool IsNodeTerminal(const int nodeIdx) const
static TStr GetSentenceBreakTestFn()
#define TestCurNext(curFlag, nextFlag)
size_t EncodeUtf16ToBytes(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
int SearchStr(const TStr &Str, const int &BChN=0) const
void Save(TSOut &SOut) const
TStr GetUtf8CaseFolded(const TStr &s) const
bool IsAsciiHexDigit() const
#define DefineUniSubCat(cat, subCat, c)
T8BitCodec< TEncoding_ISO8859_1 > TCodec_ISO8859_1
bool IsWbIgnored(const int cp) const
enum TUniByteOrder_ TUniByteOrder
bool FNextKeyId(int &KeyId) const
void GetCaseFolded(const TIntV &src, TIntV &dest, const bool full=true) const
static const int fromUnicodeTable3[3 *16]
int ExtractStarters(const TIntV &src, TIntV &dest) const
bool IsKeyGetDat(const TKey &Key, TDat &Dat) const
TStr GetSbFlagsStr() const
size_t ExtractStarters(TSrcVec &src) const
void LoadTxt_ProcessDecomposition(TUniChInfo &ci, TStr s)
bool FindNextWordBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
size_t FromUnicode(const TIntV &src, TIntV &dest, const bool clrDest=true) const
static const int toUnicodeTable[6 *16]
void InitSpecialCasing(const TStr &basePath)
void AddDecomposition(const int codePoint, TVec< TDestCh > &dest, const bool compatibility) const
size_t DecodeUtf8(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
static TStr GetDerivedCorePropsFn()
bool IsSurrogate(const int cp) const
THash< TItemTr, TInt > roots
TUnicodeException(size_t srcIdx_, int srcChar_, const TStr &message_)
bool IsNoncharacter() const
static void LoadSChar(TSIn &SIn, signed char &u)
static TStr GetWordBreakPropertyFn()
static const int fromUnicodeTable3[11 *16]
THash< TInt, TUniChInfo > h
bool GetNextLine(TStrV &dest)
void SbEx_AddUtf8(const TStr &s)
void GetSimpleLowerCase(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
bool CanSentenceEndHere(const TSrcVec &src, const size_t srcIdx, const size_t position) const
void Save(TSOut &SOut) const
void GetSimpleLowerCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
void GetTitleCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
int DecodeUtf16FromBytes(const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
void Open(const TStr &fileName)
T8BitCodec< TEncoding_CP852 > TCodec_CP852
void UseEnglishSentenceBoundaryExceptions()
static bool IsValidSubCat(const char chCat, const char chSubCat)
static const int fromUnicodeTable1[14 *16]
bool IsSbFlag(const TUniChFlags flag) const
static int ToUnicode(int c)
static const ushort LineBreak_Numeric
void SaveBf(const void *Bf, const TSize &BfL)
void LoadBin(const TStr &fnBin)
void InitScripts(const TStr &basePath)
virtual size_t ToUnicode(const TStr &src, size_t srcIdx, const size_t srcCount, TIntV &dest, const bool clrDest=true) const
void TestComposition(const TStr &basePath)
enum TUtf16BomHandling_ TUtf16BomHandling
void GetCaseFolded(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool full, const bool turkic=false) const
static TStr GetLineBreakFn()
void Fold(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool full, const bool turkic) const
static const int fromUnicodeTable1[14 *16]
static void ParseCodePointList(const TStr &s, TIntV &dest, bool ClrDestP=true)
bool IsGetChInfo(const int cp, TUniChInfo &ChInfo)
bool FindNextSentenceBoundary(const TIntV &src, int &position) const
T8BitCodec< TEncoding_YuAscii > yuAscii
static TStr GetWordBreakTestFn()
TUniCodec(TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_)
size_t ExtractStarters(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const
TUcdFileReader(const TStr &fileName)
size_t DecodeUtf16FromBytes(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
static bool IsWbIgnored(const TUniChInfo &ci)
void Save(const bool &Bool)
void Decompose(const TSrcVec &src, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
size_t FromUnicode(const TSrcVec &src, TDestVec &dest, const bool clrDest=true) const
static int ToUnicode(int c)
int Get3GramRoot(const TItem &last, const TItem &butLast, const TItem &butButLast) const
static TStr NormalizeCodecName(const TStr &name)
static const int toUnicodeTable[8 *16]
static TStr GetSbFlagsStr(const int flags)
TStr EncodeUtf8Str(const TSrcVec &src) const
TStr GetCharNameS(const int cp) const
void UnregisterCodec(const TStr &nameList)
T8BitCodec< TEncoding_CP437 > TCodec_CP437
int GetScript(const int cp) const
int DecodeUtf8(const TStr &src, TIntV &dest) const
TUniChSubCategory GetSubCat(const int cp) const
int GetKeyId(const TKey &Key) const
int DecodeUtf8(const TIntV &src, TIntV &dest) const
#define IsPeekAheadSkippable(sbf)
static int FromUnicode(int c)
DECLARE_FORWARDED_PROPERTY_METHODS bool IsPrivateUse(const int cp) const
void SetCat(const int cp)
static int FromUnicode(int c)
void ToCaseFolded(TSrcVec &src, size_t srcIdx, const size_t srcCount, const bool turkic=false) const
int GetCombiningClass(const int cp) const
static const int toUnicodeTable[8 *16]
static TStr GetSentenceBreakPropertyFn()
void SbEx_Add(const TSrcVec &v)
void GetSimpleTitleCase(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
void LoadTxt(const TStr &fileName)
void PrintCharNames(FILE *f, const TSrcVec &src, size_t srcIdx, const size_t srcCount, const TStr &prefix) const
static bool IsMachineLittleEndian()
int AddKey(const TKey &Key)
void InitWordAndSentenceBoundaryFlags(const TStr &basePath)
void Compose(const TSrcVec &src, TVec< TDestCh > &dest, bool clrDest=true) const
#define DECLARE_FORWARDED_PROPERTY_METHODS
void TestFindNextWordOrSentenceBoundary(const TStr &basePath, bool sentence)
int SbEx_AddMulti(const TStr &words, const bool wordsAreUtf8=true)
static const int toUnicodeTable[6 *16]
int simpleLowerCaseMapping
bool IsXidContinue() const
static const ushort LineBreak_Unknown
static TStr GetCompositionExclusionsFn()
virtual size_t ToUnicode(const TIntV &src, size_t srcIdx, const size_t srcCount, TIntV &dest, const bool clrDest=true) const
static int ToUnicode(int c)
bool IsGraphemeLink() const
virtual size_t FromUnicode(const TIntV &src, size_t srcIdx, const size_t srcCount, TIntV &dest, const bool clrDest=true) const =0
bool IsQuotationMark() const
void LoadTxt(const TStr &basePath)
T8BitCodec< TEncoding_CP1250 > TCodec_CP1250
size_t EncodeUtf16ToWords(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
DECLARE_FORWARDED_PROPERTY_METHODS ___UniFwd2(IsPrivateUse, IsSurrogate) TUniChCategory GetCat(const int cp) const
void ProcessComment(TUniChDb::TUcdFileReader &reader)
T8BitCodec< TEncoding_CP1250 > cp1250
size_t ToUnicode(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
void FindWordBoundaries(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
bool Has2Gram(const TItem &last, const TItem &butLast) const
static TStr GetPropListFn()
void ToSimpleLowerCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const
void TestCat(const int cp)
size_t FromUnicode(const TIntV &src, TStr &dest, const bool clrDest=true) const
void SplitOnAllCh(const char &SplitCh, TStrV &StrV, const bool &SkipEmpty=true) const
const char * GetCharName(const int cp) const
void LoadBf(const void *Bf, const TSize &BfL)
void Compose(const TIntV &src, TIntV &dest) const
void GetSimpleTitleCase(const TIntV &src, TIntV &dest) const
size_t DecodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
bool IsIdContinue() const
void GetUpperCase(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
void GetLowerCase(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
void Clr(const bool &DoDel=true, const int &NoDelLim=-1, const bool &ResetDat=true)
#define TestCurNext2(curFlag, nextFlag, next2Flag)
TTriple< TItem, TItem, TItem > TItemTr
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
void GetSimpleUpperCase(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
void Save(TSOut &SOut) const
void FindSentenceBoundaries(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
void Decompose(const TIntV &src, TIntV &dest, bool compatibility) const
int EncodeUtf8(const TIntV &src, TIntV &dest) const
static const int toUnicodeTable[6 *16]
bool IsProperty(const TUniChProperties flag) const
void GetTitleCase(const TIntV &src, TIntV &dest) const
static int FromUnicode(int c)
void SplitOnWs(TStrV &StrV) const
bool IsWbFlag(const TUniChFlags flag) const
void GetAllCodecs(TCodecBaseV &dest) const
TUnicode(const TStr &fnBinUcd)
TUnicodeErrorHandling errorHandling
static void SaveSChar(TSOut &SOut, signed char u)
static int FromUnicode(int c)
bool IsKey(const TKey &Key) const
size_t ToUnicode(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
bool IsTerminalPunctuation() const
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
static const int fromUnicodeTable2[2 *16]
int EncodeUtf16ToBytes(const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
TUniCaseFolding caseFolding
static const int fromUnicodeTable2[2 *16]
TNode(const TItem &item_, const int child_, const int sib_, const bool terminal_)
bool IsIdeographic() const
TIntIntVH specialCasingTitle
bool IsJoinControl() const
void FoldInPlace(TSrcVec &src, size_t srcIdx, const size_t srcCount, const bool turkic) const
int EncodeUtf16ToWords(const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
TDat & AddDat(const TKey &Key)
void DecomposeAndCompose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
static const int fromUnicodeTable1[14 *16]
static const int fromUnicodeTable1[14 *16]
THash< TStr, PCodecBase > codecs
static const int fromUnicodeTable4[11 *16]
void SbEx_Set(const TUniTrie< TInt > &newTrie)
const TKey & GetKey(const int &KeyId) const
static const int fromUnicodeTable2[2]
void TestWbFindNonIgnored() const
static int ParseCodePoint(const TStr &s)
virtual size_t FromUnicode(const TIntV &src, size_t srcIdx, const size_t srcCount, TIntV &dest, const bool clrDest=true) const
static TStr GetCaseFoldingFn()
virtual size_t ToUnicode(const TIntV &src, size_t srcIdx, const size_t srcCount, TIntV &dest, const bool clrDest=true) const =0
static int ToUnicode(int c)
bool IsSoftDotted() const
bool IsBidiControl() const
int GetChild(const int parentIdx, const TItem &item) const
#define TestPrevCurNext(prevFlag, curFlag, nextFlag)
size_t EncodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
void GetTitleCase(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Vector is a sequence of TVal objects representing an array that can change in size.
static TStr GetAuxiliaryDir()
T8BitCodec< TEncoding_ISO8859_2 > iso8859_2
static TStr GetScriptNameHiragana()
static ushort GetLineBreakCode(char c1, char c2)
#define DefineUniCat(cat, c)
void GetCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const
size_t ToUnicode(const TIntV &src, TIntV &dest, const bool clrDest=true) const
void WordsToBytes(const TIntV &src, TIntV &dest)