SNAP Library 3.0, User Reference  2016-07-20 17:56:49
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
TUniChDb Class Reference

#include <unicode.h>

Classes

class  TSubcatHelper
 
class  TUcdFileReader
 

Public Types

enum  {
  HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7,
  HangulLCount = 19, HangulVCount = 21, HangulTCount = 28, HangulNCount = HangulVCount * HangulTCount,
  HangulSCount = HangulLCount * HangulNCount
}
 
enum  TCaseConversion_ { ccLower = 0, ccUpper = 1, ccTitle = 2, ccMax = 3 }
 
typedef enum TUniChDb::TCaseConversion_ TCaseConversion
 

Public Member Functions

 TUniChDb ()
 
 TUniChDb (TSIn &SIn)
 
void Clr ()
 
void Save (TSOut &SOut) const
 
void Load (TSIn &SIn)
 
void LoadBin (const TStr &fnBin)
 
void Test (const TStr &basePath)
 
const TStr & GetScriptName (const int scriptId) const
 
int GetScriptByName (const TStr &scriptName) const
 
int GetScript (const TUniChInfo &ci) const
 
int GetScript (const int cp) const
 
const char * GetCharName (const int cp) const
 
TStr GetCharNameS (const int cp) const
 
template<class TSrcVec >
void PrintCharNames (FILE *f, const TSrcVec &src, size_t srcIdx, const size_t srcCount, const TStr &prefix) const
 
template<class TSrcVec >
void PrintCharNames (FILE *f, const TSrcVec &src, const TStr &prefix) const
 
bool IsGetChInfo (const int cp, TUniChInfo &ChInfo)
 
TUniChCategory GetCat (const int cp) const
 
TUniChSubCategory GetSubCat (const int cp) const
 
bool IsWbFlag (const int cp, const TUniChFlags flag) const
 
int GetWbFlags (const int cp) const
 
bool IsSbFlag (const int cp, const TUniChFlags flag) const
 
int GetSbFlags (const int cp) const
 
DECLARE_FORWARDED_PROPERTY_METHODS
bool 
IsPrivateUse (const int cp) const
 
bool IsSurrogate (const int cp) const
 
int GetCombiningClass (const int cp) const
 
template<typename TSrcVec >
bool FindNextWordBoundary (const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
 
template<typename TSrcVec >
void FindWordBoundaries (const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
 
template<typename TSrcVec >
bool FindNextSentenceBoundary (const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
 
template<typename TSrcVec >
void FindSentenceBoundaries (const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
 
void SbEx_Clr ()
 
template<class TSrcVec >
void SbEx_Add (const TSrcVec &v)
 
void SbEx_Add (const TStr &s)
 
void SbEx_AddUtf8 (const TStr &s)
 
int SbEx_AddMulti (const TStr &words, const bool wordsAreUtf8=true)
 
void SbEx_Set (const TUniTrie< TInt > &newTrie)
 
int SbEx_SetStdEnglish ()
 
template<typename TSrcVec , typename TDestCh >
void Decompose (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
void Decompose (const TSrcVec &src, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
void Compose (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
void Compose (const TSrcVec &src, TVec< TDestCh > &dest, bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
void DecomposeAndCompose (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
void DecomposeAndCompose (const TSrcVec &src, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
size_t ExtractStarters (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
size_t ExtractStarters (const TSrcVec &src, TVec< TDestCh > &dest, bool clrDest=true) const
 
template<typename TSrcVec >
size_t ExtractStarters (TSrcVec &src) const
 
void LoadTxt (const TStr &basePath)
 
void SaveBin (const TStr &fnBinUcd)
 
template<typename TSrcVec , typename TDestCh >
void GetCaseConverted (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const
 
template<typename TSrcVec , typename TDestCh >
void GetLowerCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
 
template<typename TSrcVec , typename TDestCh >
void GetUpperCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
 
template<typename TSrcVec , typename TDestCh >
void GetTitleCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
 
template<typename TSrcVec , typename TDestCh >
void GetLowerCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
 
template<typename TSrcVec , typename TDestCh >
void GetUpperCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
 
template<typename TSrcVec , typename TDestCh >
void GetTitleCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
 
template<typename TSrcVec , typename TDestCh >
void GetSimpleCaseConverted (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how) const
 
template<typename TSrcVec , typename TDestCh >
void GetSimpleLowerCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
void GetSimpleUpperCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
void GetSimpleTitleCase (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
void GetSimpleLowerCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
void GetSimpleUpperCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
void GetSimpleTitleCase (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
 
template<typename TSrcVec >
void ToSimpleCaseConverted (TSrcVec &src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const
 
template<typename TSrcVec >
void ToSimpleUpperCase (TSrcVec &src, size_t srcIdx, const size_t srcCount) const
 
template<typename TSrcVec >
void ToSimpleLowerCase (TSrcVec &src, size_t srcIdx, const size_t srcCount) const
 
template<typename TSrcVec >
void ToSimpleTitleCase (TSrcVec &src, size_t srcIdx, const size_t srcCount) const
 
template<typename TSrcVec >
void ToSimpleUpperCase (TSrcVec &src) const
 
template<typename TSrcVec >
void ToSimpleLowerCase (TSrcVec &src) const
 
template<typename TSrcVec >
void ToSimpleTitleCase (TSrcVec &src) const
 
template<typename TSrcVec , typename TDestCh >
void GetCaseFolded (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool full, const bool turkic=false) const
 
template<typename TSrcVec , typename TDestCh >
void GetCaseFolded (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool full=true, const bool turkic=false) const
 
template<typename TSrcVec >
void ToCaseFolded (TSrcVec &src, size_t srcIdx, const size_t srcCount, const bool turkic=false) const
 
template<typename TSrcVec >
void ToCaseFolded (TSrcVec &src, const bool turkic=false) const
 

Static Public Member Functions

static TStr GetCaseFoldingFn ()
 
static TStr GetSpecialCasingFn ()
 
static TStr GetUnicodeDataFn ()
 
static TStr GetCompositionExclusionsFn ()
 
static TStr GetScriptsFn ()
 
static TStr GetDerivedCorePropsFn ()
 
static TStr GetLineBreakFn ()
 
static TStr GetPropListFn ()
 
static TStr GetAuxiliaryDir ()
 
static TStr GetWordBreakTestFn ()
 
static TStr GetWordBreakPropertyFn ()
 
static TStr GetSentenceBreakTestFn ()
 
static TStr GetSentenceBreakPropertyFn ()
 
static TStr GetNormalizationTestFn ()
 
static TStr GetBinFn ()
 
static TStr GetScriptNameUnknown ()
 
static TStr GetScriptNameKatakana ()
 
static TStr GetScriptNameHiragana ()
 

Public Attributes

THash< TInt, TUniChInfo > h
 
TStrPool charNames
 
TStrIntH scripts
 
TIntV decompositions
 
THash< TIntPr, TInt > inverseDec
 
TUniCaseFolding caseFolding
 
TIntIntVH specialCasingLower
 
TIntIntVH specialCasingUpper
 
TIntIntVH specialCasingTitle
 
int scriptUnknown
 

Protected Types

typedef TUniVecIdx TVecIdx
 

Protected Member Functions

void InitAfterLoad ()
 
bool IsWbIgnored (const int cp) const
 
template<typename TSrcVec >
void WbFindCurOrNextNonIgnored (const TSrcVec &src, size_t &position, const size_t srcEnd) const
 
template<typename TSrcVec >
void WbFindNextNonIgnored (const TSrcVec &src, size_t &position, const size_t srcEnd) const
 
template<typename TSrcVec >
void WbFindNextNonIgnoredS (const TSrcVec &src, size_t &position, const size_t srcEnd) const
 
template<typename TSrcVec >
bool WbFindPrevNonIgnored (const TSrcVec &src, const size_t srcStart, size_t &position) const
 
void TestWbFindNonIgnored (const TIntV &src) const
 
void TestWbFindNonIgnored () const
 
void TestFindNextWordOrSentenceBoundary (const TStr &basePath, bool sentence)
 
template<typename TSrcVec >
bool CanSentenceEndHere (const TSrcVec &src, const size_t srcIdx, const size_t position) const
 
template<typename TDestCh >
void AddDecomposition (const int codePoint, TVec< TDestCh > &dest, const bool compatibility) const
 
void TestComposition (const TStr &basePath)
 
void InitWordAndSentenceBoundaryFlags (const TStr &basePath)
 
void InitScripts (const TStr &basePath)
 
void InitLineBreaks (const TStr &basePath)
 
void InitDerivedCoreProperties (const TStr &basePath)
 
void InitPropList (const TStr &basePath)
 
void InitSpecialCasing (const TStr &basePath)
 
void LoadTxt_ProcessDecomposition (TUniChInfo &ci, TStr s)
 
void TestCaseConversion (const TStr &source, const TStr &trueLc, const TStr &trueTc, const TStr &trueUc, bool turkic, bool lithuanian)
 
void TestCaseConversions ()
 

Static Protected Member Functions

static bool IsWbIgnored (const TUniChInfo &ci)
 

Protected Attributes

TUniTrie< TInt > sbExTrie
 

Friends

class TUniCaseFolding
 

Detailed Description

Definition at line 1256 of file unicode.h.

Member Typedef Documentation

typedef TUniVecIdx TUniChDb::TVecIdx
protected

Definition at line 1260 of file unicode.h.

Member Enumeration Documentation

anonymous enum
Enumerator
HangulSBase 
HangulLBase 
HangulVBase 
HangulTBase 
HangulLCount 
HangulVCount 
HangulTCount 
HangulNCount 
HangulSCount 

Definition at line 1405 of file unicode.h.

enum TUniChDb::TCaseConversion_
Enumerator
ccLower 
ccUpper 
ccTitle 
ccMax 

Definition at line 1584 of file unicode.h.

1584 { ccLower = 0, ccUpper = 1, ccTitle = 2, ccMax = 3 } TCaseConversion;
enum TUniChDb::TCaseConversion_ TCaseConversion

Constructor & Destructor Documentation

TUniChDb::TUniChDb ( )
inline

Definition at line 1274 of file unicode.h.

1274 : scriptUnknown(-1) { }
int scriptUnknown
Definition: unicode.h:1272
TUniChDb::TUniChDb ( TSIn &  SIn)
inline explicit

Definition at line 1275 of file unicode.h.

1275 { Load(SIn); }
void Load(TSIn &SIn)
Definition: unicode.h:1285

Member Function Documentation

template<typename TDestCh >
void TUniChDb::AddDecomposition ( const int  codePoint,
TVec< TDestCh > &  dest,
const bool  compatibility 
) const
protected

Definition at line 3103 of file unicode.h.

3104 {
3105  if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount)
3106  {
3107  // UAX #15, sec. 16: Hangul decomposition
3108  const int SIndex = codePoint - HangulSBase;
3109  const int L = HangulLBase + SIndex / HangulNCount;
3110  const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount;
3111  const int T = HangulTBase + (SIndex % HangulTCount);
3112  dest.Add(L); dest.Add(V);
3113  if (T != HangulTBase) dest.Add(T);
3114  return;
3115  }
3116  int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; }
3117  const TUniChInfo &ci = h[i];
3118  int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; }
3119  if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; }
3120  while (true) {
3121  int cp = decompositions[ofs++]; if (cp < 0) return;
3122  AddDecomposition(cp, dest, compatibility); }
3123 }
bool IsCompatibilityDecomposition() const
Definition: unicode.h:1112
void AddDecomposition(const int codePoint, TVec< TDestCh > &dest, const bool compatibility) const
Definition: unicode.h:3103
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
TIntV decompositions
Definition: unicode.h:1266
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
int decompOffset
Definition: unicode.h:1023
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
template<typename TSrcVec >
bool TUniChDb::CanSentenceEndHere ( const TSrcVec &  src,
const size_t  srcIdx,
const size_t  position 
) const
protected

Definition at line 2585 of file unicode.h.

2586 {
2587  if (sbExTrie.Empty()) return true;
2588  // We'll move back from the position where a sentence-boundary is being considered.
2589  size_t pos = position;
2590  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
2591  int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c);
2592  // - Skip the Sep, if there is one.
2593  if ((c & ucfSbSep) == ucfSbSep) {
2594  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
2595  c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
2596  // - Skip any Sp characters.
2597  while ((sfb & ucfSbSp) == ucfSbSp) {
2598  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
2599  c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
2600  // - Skip any Close characters.
2601  while ((sfb & ucfSbSp) == ucfSbSp) {
2602  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
2603  c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
2604  // - Skip any ATerm | STerm characters.
2605  while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) {
2606  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
2607  c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
2608  // Now start moving through the trie.
2609  int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1;
2610  while (true)
2611  {
2612  bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos));
2613  c = (atEnd ? -1 : (int) src[TVecIdx(pos)]);
2614  TUniChCategory cat = GetCat(c);
2615  if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) {
2616  // Check if the suffix we've read so far is one of those that appear in the trie.
2617  if (len == 1) return ! sbExTrie.Has1Gram(cLast);
2618  if (len == 2) return ! sbExTrie.Has2Gram(cLast, cButLast);
2619  IAssert(len >= 3); IAssert(node >= 0);
2620  if (sbExTrie.IsNodeTerminal(node)) return false;
2621  if (atEnd) return true; }
2622  if (len == 1) { cButLast = c; len++; }
2623  else if (len == 2) { cButButLast = c; len++;
2624  // Now we have read the last three characters; start descending the suitable subtrie.
2625  node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast);
2626  if (node < 0) return true; }
2627  else {
2628  // Descend down the trie.
2629  node = sbExTrie.GetChild(node, c);
2630  if (node < 0) return true; }
2631  }
2632  //return true;
2633 }
#define IAssert(Cond)
Definition: bd.h:262
bool Has1Gram(const TItem &item) const
Definition: unicode.h:1204
TUniChCategory GetCat(const int cp) const
Definition: unicode.h:1353
bool Empty() const
Definition: unicode.h:1202
enum TUniChCategory_ TUniChCategory
TUniTrie< TInt > sbExTrie
Definition: unicode.h:1461
int GetSbFlags(const int cp) const
Definition: unicode.h:1359
bool WbFindPrevNonIgnored(const TSrcVec &src, const size_t srcStart, size_t &position) const
Definition: unicode.h:1434
bool IsNodeTerminal(const int nodeIdx) const
Definition: unicode.h:1215
int Get3GramRoot(const TItem &last, const TItem &butLast, const TItem &butButLast) const
Definition: unicode.h:1206
bool Has2Gram(const TItem &last, const TItem &butLast) const
Definition: unicode.h:1205
int GetChild(const int parentIdx, const TItem &item) const
Definition: unicode.h:1209
TUniVecIdx TVecIdx
Definition: unicode.h:1260
void TUniChDb::Clr ( )
inline

Definition at line 1276 of file unicode.h.

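1276  {
1277  h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr();
1278  specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr();
1279  scripts.Clr(); }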
TIntIntVH specialCasingUpper
Definition: unicode.h:1271
TIntIntVH specialCasingLower
Definition: unicode.h:1271
THash< TIntPr, TInt > inverseDec
Definition: unicode.h:1267
void Clr(bool DoDel=false)
Definition: dt.h:816
void Clr()
Definition: unicode.h:288
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
TIntV decompositions
Definition: unicode.h:1266
TStrIntH scripts
Definition: unicode.h:1265
void Clr(const bool &DoDel=true, const int &NoDelLim=-1, const bool &ResetDat=true)
Definition: hash.h:319
TStrPool charNames
Definition: unicode.h:1264
TUniCaseFolding caseFolding
Definition: unicode.h:1268
TIntIntVH specialCasingTitle
Definition: unicode.h:1271
template<typename TSrcVec , typename TDestCh >
void TUniChDb::Compose ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
bool  clrDest = true 
) const

Definition at line 3158 of file unicode.h.

3160 {
3161  if (clrDest) dest.Clr();
3162  bool lastStarterKnown = false; // has a starter been encountered yet?
3163  size_t lastStarterPos = size_t(-1); // the index (in 'dest') of the last starter
3164  int cpLastStarter = -1; // the codepoint of the last starter (i.e. cpLastStarter == dest[lastStarterPos])
3165  const size_t srcEnd = srcIdx + srcCount;
3166  int ccMax = -1; // The highest combining class among the characters since the last starter.
3167  while (srcIdx < srcEnd)
3168  {
3169  const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
3170  const int cpClass = GetCombiningClass(cp);
3171  //int cpCombined = -1;
3172  // If there is a starter with which 'cp' can be combined, and from which it is not blocked
3173  // by some intermediate character, we can try to combine them.
3174  if (lastStarterKnown && ccMax < cpClass)
3175  {
3176  int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp));
3177  int cpCombined = -1;
3178  do {
3179  // Try to look up a composition in the inverseDec table.
3180  if (j >= 0) { cpCombined = inverseDec[j]; break; }
3181  // UAX #15, sec. 16: Hangul composition
3182  // - Try to combine L and V.
3183  const int LIndex = cpLastStarter - HangulLBase;
3184  if (0 <= LIndex && LIndex < HangulLCount) {
3185  const int VIndex = cp - HangulVBase;
3186  if (0 <= VIndex && VIndex < HangulVCount) {
3187  cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
3188  break; } }
3189  // - Try to combine LV and T.
3190  const int SIndex = cpLastStarter - HangulSBase;
3191  if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0)
3192  {
3193  const int TIndex = cp - HangulTBase;
3194  if (0 <= TIndex && TIndex < HangulTCount) {
3195  cpCombined = cpLastStarter + TIndex;
3196  break; }
3197  }
3198  } while (false);
3199  // If a combining character has been found, use it to replace the old cpStarter.
3200  if (cpCombined >= 0) {
3201  dest[TVecIdx(lastStarterPos)] = cpCombined;
3203  // if (cpCombined is not a starter) { starterKnown = false; lastStarterPos = size_t(-1); cpLastStarter = -1; } else
3204  cpLastStarter = cpCombined; continue; }
3205  }
3206  if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter, remember it for later. Set ccMax to -1 so that this starter can be combined with another starter.
3207  lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; }
3208  else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking).
3209  ccMax = cpClass;
3210  dest.Add(cp);
3211  }
3212 }
TPair< TInt, TInt > TIntPr
Definition: ds.h:83
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
THash< TIntPr, TInt > inverseDec
Definition: unicode.h:1267
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
#define Assert(Cond)
Definition: bd.h:251
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
int GetCombiningClass(const int cp) const
Definition: unicode.h:1399
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
TUniVecIdx TVecIdx
Definition: unicode.h:1260
template<typename TSrcVec , typename TDestCh >
void TUniChDb::Compose ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
bool  clrDest = true 
) const
inline

Definition at line 1532 of file unicode.h.

1532  {
1533  Compose(src, 0, src.Len(), dest, clrDest); }
void Compose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const
Definition: unicode.h:3158
template<typename TSrcVec , typename TDestCh >
void TUniChDb::Decompose ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
bool  compatibility,
bool  clrDest = true 
) const

Definition at line 3126 of file unicode.h.

3128 {
3129  if (clrDest) dest.Clr();
3130  const size_t destStart = dest.Len()/*, srcEnd = srcIdx + srcCount*/;
3131  // Decompose the string.
3132  while (srcIdx < srcCount) {
3133  AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; }
3134  // Rearrange the decomposed string into canonical order.
3135  for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; )
3136  {
3137  size_t j = destIdx;
3138  int cp = dest[TVecIdx(destIdx)]; destIdx++;
3139  int cpCls = GetCombiningClass(cp);
3140  if (cpCls == TUniChInfo::ccStarter) continue;
3141  while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) {
3142  dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; }
3143  dest[TVecIdx(j)] = cp;
3144  }
3145 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
void AddDecomposition(const int codePoint, TVec< TDestCh > &dest, const bool compatibility) const
Definition: unicode.h:3103
int GetCombiningClass(const int cp) const
Definition: unicode.h:1399
TUniVecIdx TVecIdx
Definition: unicode.h:1260
template<typename TSrcVec , typename TDestCh >
void TUniChDb::Decompose ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
bool  compatibility,
bool  clrDest = true 
) const
inline

Definition at line 1520 of file unicode.h.

1520  {
1521  Decompose(src, 0, src.Len(), dest, compatibility, clrDest); }
void Decompose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
Definition: unicode.h:3126
template<typename TSrcVec , typename TDestCh >
void TUniChDb::DecomposeAndCompose ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
bool  compatibility,
bool  clrDest = true 
) const

Definition at line 3148 of file unicode.h.

3150 {
3151  if (clrDest) dest.Clr();
3152  TIntV temp;
3153  Decompose(src, srcIdx, srcCount, temp, compatibility);
3154  Compose(temp, 0, temp.Len(), dest, clrDest);
3155 }
void Compose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const
Definition: unicode.h:3158
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
void Decompose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
Definition: unicode.h:3126
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
template<typename TSrcVec , typename TDestCh >
void TUniChDb::DecomposeAndCompose ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
bool  compatibility,
bool  clrDest = true 
) const
inline

Definition at line 1542 of file unicode.h.

1542  {
1543  DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); }
void DecomposeAndCompose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
Definition: unicode.h:3148
template<typename TSrcVec , typename TDestCh >
size_t TUniChDb::ExtractStarters ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
bool  clrDest = true 
) const

Definition at line 3215 of file unicode.h.

3217 {
3218  if (clrDest) dest.Clr();
3219  size_t retVal = 0;
3220  for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
3221  const int cp = src[TVecIdx(srcIdx)];
3222  if (GetCombiningClass(cp) == TUniChInfo::ccStarter)
3223  { dest.Add(cp); retVal++; } }
3224  return retVal;
3225 }
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
int GetCombiningClass(const int cp) const
Definition: unicode.h:1399
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
TUniVecIdx TVecIdx
Definition: unicode.h:1260
template<typename TSrcVec , typename TDestCh >
size_t TUniChDb::ExtractStarters ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
bool  clrDest = true 
) const
inline

Definition at line 1551 of file unicode.h.

1551  {
1552  return ExtractStarters(src, 0, src.Len(), dest, clrDest); }
size_t ExtractStarters(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const
Definition: unicode.h:3215
template<typename TSrcVec >
size_t TUniChDb::ExtractStarters ( TSrcVec &  src) const
inline

Definition at line 1555 of file unicode.h.

1555  {
1556  TIntV temp; size_t retVal = ExtractStarters(src, temp);
1557  src.Clr(); for (int i = 0; i < temp.Len(); i++) src.Add(temp[i]);
1558  return retVal; }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
size_t ExtractStarters(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const
Definition: unicode.h:3215
template<typename TSrcVec >
bool TUniChDb::FindNextSentenceBoundary ( const TSrcVec &  src,
const size_t  srcIdx,
const size_t  srcCount,
size_t &  position 
) const

Definition at line 2636 of file unicode.h.

2637 {
2638  // SB1. Break at the start of text.
2639  if (position < srcIdx) { position = srcIdx; return true; }
2640  // If we are beyond the end of the text, there aren't any sentence breaks left.
2641  const size_t srcEnd = srcIdx + srcCount;
2642  if (position >= srcEnd) return false;
2643  // If 'position' is currently at an ignored character, move it back to the last nonignored character.
2644  size_t origPos = position;
2645  if (IsWbIgnored(src[TVecIdx(position)])) {
2646  if (! WbFindPrevNonIgnored(src, srcIdx, position))
2647  position = origPos;
2648  }
2649  // Determine the previous nonignored character (before 'position').
2650  size_t posPrev = position;
2651  if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
2652  // Sec 6.2. Allow a break between Sep and an ignored character.
2653  if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
2654  // Determine the next nonignored character (after 'position').
2655  size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
2656  size_t posNext2;
2657  int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
2658  int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
2659  int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext);
2660  int cNext2, sbfNext2;
2661  // Initialize the state of the peek-back automaton.
2662  typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState;
2663  TPeekBackState backState;
2664  {
2665  size_t pos = position;
2666  bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false;
2667  while (true)
2668  {
2669  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
2670  // Skip at most one Sep.
2671  int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
2672  if ((sbf & ucfSbSep) == ucfSbSep) {
2673  wasSep = true;
2674  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
2675  cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
2676  // Skip zero or more Sp's.
2677  bool stop = false;
2678  while ((sbf & ucfSbSp) == ucfSbSp) {
2679  wasSp = true;
2680  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
2681  cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
2682  if (stop) break;
2683  // Skip zero or more Close's.
2684  while ((sbf & ucfSbClose) == ucfSbClose) {
2685  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
2686  cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
2687  if (stop) break;
2688  // Process an ATerm or STerm.
2689  wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm);
2690  wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm);
2691  break;
2692  }
2693  if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm);
2694  else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm);
2695  else backState = stInit;
2696  }
2697  // Initialize the state of the peek-ahead automaton. This state tells us what follows
2698  // after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}.
2699  // Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string.
2700  // Our peek-ahead automaton must tell us whether it is Lower or something else.
2701  typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState;
2702  TPeekAheadState aheadState = stUnknown;
2703  //
2704  for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
2705  cPrev = cCur, cCur = cNext, cNext = cNext2,
2706  sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2)
2707  {
2708  // Should there be a word boundary between 'position' and 'posNext' (or, more accurately,
2709  // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
2710  // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
2711  posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
2712  cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
2713  sbfNext2 = GetSbFlags(cNext2);
2714  // Update the peek-back automaton.
2715 #define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
2716 #define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; }
2717  switch (backState) {
2718  case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break;
2719  case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break;
2720  case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break;
2721  case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
2722  case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
2723  case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
2724  case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
2725  default: IAssert(false); }
2726 #undef Trans
2727 #undef TestCur
2728  // Update the peek-ahead automaton.
2729 #define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)
2730  if (! IsPeekAheadSkippable(sbfCur)) {
2731  bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower);
2732  if (aheadState == stLower) IAssert(isLower);
2733  else if (aheadState == stNotLower) IAssert(! isLower);
2734  // We haven't peeked ahead farther than this so far -- invalidate the state.
2735  aheadState = stUnknown; }
2736  if (aheadState == stUnknown)
2737  {
2738  // Peek ahead to the next non-peekahead-skippable character.
2739  size_t pos = posNext;
2740  while (pos < srcEnd) {
2741  int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
2742  if (! IsPeekAheadSkippable(sbf)) {
2743  if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower;
2744  else aheadState = stNotLower;
2745  break; }
2746  WbFindNextNonIgnored(src, pos, srcEnd); }
2747  if (! (pos < srcEnd)) aheadState = stNotLower;
2748  }
2749 #undef IsPeekAheadSkippable
2750  //
2751 #define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
2752 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
2753 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
2754  // SB3. Do not break within CRLF.
2755  if (cCur == 13 && cNext == 10) continue;
2756  // SB4. Break after paragraph separators.
2757  if ((sbfCur & ucfSbSep) == ucfSbSep) {
2758  if (! CanSentenceEndHere(src, srcIdx, position)) continue;
2759  position = posNext; return true; }
2760  // Do not break after ambiguous terminators like period, if they are immediately followed by a number
2761  // or lowercase letter, if they are between uppercase letters, or if the first following letter
2762  // (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation
2763  // or numeric period, and thus may not mark the end of a sentence.
2766  // SB8a. (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm)
2767  if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) &&
2768  (sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue;
2769  // SB8*. ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower
2770  if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue;
2771  // Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present).
2772  // SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep )
2773  if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue;
2774  // SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep )
2775  // SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break]
2776  if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) {
2777  if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10
2778  if (! CanSentenceEndHere(src, srcIdx, position)) continue;
2779  position = posNext; return true; } // SB11
2780  // SB12. Otherwise, do not break.
2781  continue;
2782 #undef TestCurNext
2783 #undef TestCurNext2
2784 #undef TestPrevCurNext
2785  }
2786  // SB2. Break at the end of text.
2787  IAssert(position == srcEnd);
2788  return true;
2789 }
#define IAssert(Cond)
Definition: bd.h:262
int GetSbFlags(const int cp) const
Definition: unicode.h:1359
void WbFindNextNonIgnored(const TSrcVec &src, size_t &position, const size_t srcEnd) const
Definition: unicode.h:1425
bool WbFindPrevNonIgnored(const TSrcVec &src, const size_t srcStart, size_t &position) const
Definition: unicode.h:1434
#define Trans(curFlag, newState)
#define TestCurNext(curFlag, nextFlag)
bool CanSentenceEndHere(const TSrcVec &src, const size_t srcIdx, const size_t position) const
Definition: unicode.h:2585
static bool IsWbIgnored(const TUniChInfo &ci)
Definition: unicode.h:1419
#define IsPeekAheadSkippable(sbf)
#define TestPrevCurNext(prevFlag, curFlag, nextFlag)
TUniVecIdx TVecIdx
Definition: unicode.h:1260
template<typename TSrcVec >
bool TUniChDb::FindNextWordBoundary ( const TSrcVec &  src,
const size_t  srcIdx,
const size_t  srcCount,
size_t &  position 
) const

Definition at line 2483 of file unicode.h.

2484 {
2485  // WB1. Break at the start of text.
2486  if (position < srcIdx) { position = srcIdx; return true; }
2487  // If we are beyond the end of the text, there aren't any word breaks left.
2488  const size_t srcEnd = srcIdx + srcCount;
2489  if (position >= srcEnd) return false;
2490  // If 'position' is currently at an ignored character, move it back to the last nonignored character.
2491  size_t origPos = position;
2492  if (IsWbIgnored(src[TVecIdx(position)])) {
2493  if (! WbFindPrevNonIgnored(src, srcIdx, position))
2494  position = origPos;
2495  }
2496  // Determine the previous nonignored character (before 'position').
2497  size_t posPrev = position;
2498  if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
2499  // Sec 6.2. Allow a break between Sep and an ignored character.
2500  if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
2501  // Determine the next nonignored character (after 'position').
2502  size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
2503  size_t posNext2;
2504  int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
2505  int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
2506  int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext);
2507  int cNext2, wbfNext2;
2508  //
2509  for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
2510  cPrev = cCur, cCur = cNext, cNext = cNext2,
2511  wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2)
2512  {
2513  // Should there be a word boundary between 'position' and 'posNext' (or, more accurately,
2514  // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
2515  // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
2516  posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
2517  cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
2518  wbfNext2 = GetWbFlags(cNext2);
2519 #define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
2520 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
2521 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
2522  // WB3. Do not break within CRLF.
2523  if (cCur == 13 && cNext == 10) continue;
2524  // WB5. Do not break between most letters.
2526  // WB6. Do not break letters across certain punctuation.
2528  // WB7. Do not break letters across certain punctuation.
2530  // WB8. Do not break within sequences of digits, or digits adjacent to letters.
2532  // WB9. Do not break within sequences of digits, or digits adjacent to letters.
2534  // WB10. Do not break within sequences of digits, or digits adjacent to letters.
2536  // WB11. Do not break within sequences, such as "3.2" or "3.456,789".
2538  // WB12. Do not break within sequences, such as "3.2" or "3.456,789".
2540  // WB13. Do not break between Katakana.
2542  // WB13a. Do not break from extenders.
2543  if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 &&
2544  (wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue;
2545  // WB13b. Do not break from extenders.
2546  if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet &&
2547  (wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue;
2548  // WB14. Otherwise, break everywhere.
2549  position = posNext; return true;
2550 #undef TestCurNext
2551 #undef TestCurNext2
2552 #undef TestPrevCurNext
2553  }
2554  // WB2. Break at the end of text.
2555  IAssert(position == srcEnd);
2556  return true;
2557 }
#define IAssert(Cond)
Definition: bd.h:262
int GetWbFlags(const int cp) const
Definition: unicode.h:1357
void WbFindNextNonIgnored(const TSrcVec &src, size_t &position, const size_t srcEnd) const
Definition: unicode.h:1425
bool WbFindPrevNonIgnored(const TSrcVec &src, const size_t srcStart, size_t &position) const
Definition: unicode.h:1434
#define TestCurNext(curFlag, nextFlag)
static bool IsWbIgnored(const TUniChInfo &ci)
Definition: unicode.h:1419
#define TestCurNext2(curFlag, nextFlag, next2Flag)
#define TestPrevCurNext(prevFlag, curFlag, nextFlag)
TUniVecIdx TVecIdx
Definition: unicode.h:1260
template<typename TSrcVec >
void TUniChDb::FindSentenceBoundaries ( const TSrcVec &  src,
const size_t  srcIdx,
const size_t  srcCount,
TBoolV &  dest 
) const

Definition at line 2793 of file unicode.h.

2794 {
2795  if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
2796  dest.PutAll(false);
2797  size_t position = srcIdx;
2798  dest[TVecIdx(position - srcIdx)] = true;
2799  while (position < srcIdx + srcCount)
2800  {
2801  size_t oldPos = position;
2802  FindNextSentenceBoundary(src, srcIdx, srcCount, position);
2803  if (oldPos >= position) {
2804  Assert(oldPos < position);
2805  }
2806  Assert(position <= srcIdx + srcCount);
2807  dest[TVecIdx(position - srcIdx)] = true;
2808  }
2809  Assert(dest[TVecIdx(srcCount)]);
2810 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
bool FindNextSentenceBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2636
void PutAll(const TVal &Val)
Sets all elements of the vector to value Val.
Definition: ds.h:1166
#define Assert(Cond)
Definition: bd.h:251
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:495
TUniVecIdx TVecIdx
Definition: unicode.h:1260
template<typename TSrcVec >
void TUniChDb::FindWordBoundaries ( const TSrcVec &  src,
const size_t  srcIdx,
const size_t  srcCount,
TBoolV &  dest 
) const

Definition at line 2561 of file unicode.h.

2562 {
2563  if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
2564  dest.PutAll(false);
2565  size_t position = srcIdx;
2566  dest[TVecIdx(position - srcIdx)] = true;
2567  while (position < srcIdx + srcCount)
2568  {
2569  size_t oldPos = position;
2570  FindNextWordBoundary(src, srcIdx, srcCount, position);
2571  if (oldPos >= position) {
2572  Assert(oldPos < position);
2573  }
2574  Assert(position <= srcIdx + srcCount);
2575  dest[TVecIdx(position - srcIdx)] = true;
2576  }
2577  Assert(dest[TVecIdx(srcCount)]);
2578 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
void PutAll(const TVal &Val)
Sets all elements of the vector to value Val.
Definition: ds.h:1166
#define Assert(Cond)
Definition: bd.h:251
bool FindNextWordBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2483
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:495
TUniVecIdx TVecIdx
Definition: unicode.h:1260
static TStr TUniChDb::GetAuxiliaryDir ( )
inline static

Definition at line 1304 of file unicode.h.

1304 { return "auxiliary"; }
static TStr TUniChDb::GetBinFn ( )
inline static

Definition at line 1310 of file unicode.h.

1310 { return "UniChDb.bin"; } // used only by Test()
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetCaseConverted ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest,
const TCaseConversion  how,
const bool  turkic,
const bool  lithuanian 
) const

Definition at line 2817 of file unicode.h.

2821 {
2822  const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0));
2823  if (clrDest) dest.Clr();
2824  enum {
2825  GreekCapitalLetterSigma = 0x3a3,
2826  GreekSmallLetterSigma = 0x3c3,
2827  GreekSmallLetterFinalSigma = 0x3c2,
2828  LatinCapitalLetterI = 0x49,
2829  LatinCapitalLetterJ = 0x4a,
2830  LatinCapitalLetterIWithOgonek = 0x12e,
2831  LatinCapitalLetterIWithGrave = 0xcc,
2832  LatinCapitalLetterIWithAcute = 0xcd,
2833  LatinCapitalLetterIWithTilde = 0x128,
2834  LatinCapitalLetterIWithDotAbove = 0x130,
2835  LatinSmallLetterI = 0x69,
2836  CombiningDotAbove = 0x307
2837  };
2838  //
2839  bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1;
2840  size_t nextWordBoundary = srcIdx;
2841  TBoolV wordBoundaries; bool wbsKnown = false;
2842  for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
2843  {
2844  int cp = src[TVecIdx(srcIdx)]; srcIdx++;
2845  //if (turkic && cp == 0x130 && how == ccLower) printf("!");
2846  // For conversion to titlecase, the first cased character of each word
2847  // must be converted to titlecase; everything else must be converted
2848  // to lowercase.
2849  TUniChDb::TCaseConversion howHere;
2850  if (how != ccTitle) howHere = how;
2851  else {
2852  if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
2853  seenCased = false; seenTwoCased = false; cpFirstCased = -1;
2854  size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
2855  IAssert(next > nextWordBoundary); nextWordBoundary = next; }
2856  bool isCased = IsCased(cp);
2857  if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; }
2858  else { howHere = ccLower;
2859  if (isCased && seenCased) seenTwoCased = true; }
2860  }
2861  // First, process the conditional mappings from SpecialCasing.txt.
2862  // These will be processed in code -- they were ignored while
2863  // we were reading SpecialCasing.txt itself.
2864  if (cp == GreekCapitalLetterSigma && howHere == ccLower)
2865  {
2866  // SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of
2867  // the standard doesn't define it. We'll use FinalCased instead.
2868  // FinalCased: within the closest word boundaries containing C,
2869  // there is a cased letter before C, and there is no cased letter after C.
2870  //size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary);
2871  if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; }
2872  size_t srcIdx2 = srcIdx; bool casedAfter = false;
2873  if (how == ccTitle)
2874  printf("!");
2875  //while (srcIdx2 < nextBoundary)
2876  while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
2877  {
2878  int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
2879  if (IsCased(cp2)) { casedAfter = true; break; }
2880  }
2881  if (! casedAfter)
2882  {
2883  //size_t prevBoundary = srcIdx - 1;
2884  //FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary);
2885  srcIdx2 = srcIdx - 1; bool casedBefore = false;
2886  //while (prevBoundary < srcIdx2)
2887  while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
2888  {
2889  --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
2890  if (IsCased(cp2)) { casedBefore = true; break; }
2891  }
2892  if (casedBefore) {
2893  // Now we have a FinalCased character.
2894  dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; }
2895  }
2896  // If we got here, add a non-final sigma.
2897  dest.Add(GreekSmallLetterSigma); continue;
2898  }
2899  else if (lithuanian)
2900  {
2901  if (howHere == ccLower)
2902  {
2903  if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek)
2904  {
2905  bool moreAbove = false;
2906  for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
2907  {
2908  const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
2909  const int cc2 = GetCombiningClass(cp2);
2910  if (cc2 == TUniChInfo::ccStarter) break;
2911  if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; }
2912  }
2913  if (moreAbove)
2914  {
2915  if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; }
2916  if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; }
2917  if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; }
2918  }
2919  }
2920  else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; }
2921  else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; }
2922  else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; }
2923  }
2924  if (cp == CombiningDotAbove)
2925  {
2926  // Lithuanian, howHere != ccLower.
2927  // AfterSoftDotted := the last preceding character with a combining class
2928  // of zero before C was Soft_Dotted, and there is no intervening combining
2929  // character class 230 (ABOVE).
2930  bool afterSoftDotted = false;
2931  size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
2932  while (origSrcIdx < srcIdx2)
2933  {
2934  --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
2935  int cc2 = GetCombiningClass(cp2);
2936  if (cc2 == TUniChInfo::ccAbove) break;
2937  if (cc2 == TUniChInfo::ccStarter) {
2938  afterSoftDotted = IsSoftDotted(cp2); break; }
2939  }
2940  if (afterSoftDotted)
2941  {
2942  Assert(lithuanian);
2943  // Remove DOT ABOVE after "i" with upper or titlecase.
2944  // - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle,
2945  // the "i" may have been kept lowercase and thus we shouldn't remove the dot).
2946  if (how == ccLower) { dest.Add(0x307); continue; }
2947  if (how == ccUpper) continue;
2948  Assert(how == ccTitle);
2949  Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character
2950  if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot.
2951  dest.Add(0x307); continue;
2952  }
2953  }
2954  }
2955  else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri)
2956  {
2957  // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
2958  // The following rules handle those cases.
2959  if (cp == LatinCapitalLetterIWithDotAbove) {
2960  dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; }
2961  // When lowercasing, remove dot_above in the sequence I + dot_above,
2962  // which will turn into i. This matches the behavior of the
2963  // canonically equivalent I-dot_above.
2964  else if (cp == CombiningDotAbove)
2965  {
2966  // AfterI: the last preceding base character was an uppercase I,
2967  // and there is no intervening combining character class 230 (ABOVE).
2968  bool afterI = false;
2969  size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
2970  while (origSrcIdx < srcIdx2)
2971  {
2972  --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
2973  if (cp2 == LatinCapitalLetterI) { afterI = true; break; }
2974  int cc2 = GetCombiningClass(cp2);
2975  if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break;
2976  }
2977  if (afterI) {
2978  if (how == ccTitle && seenCased && ! seenTwoCased) {
2979  // Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word;
2980  // if found, map it to titlecase; otherwise, map all characters in that word to lowercase.
2981  // This suggests that if a cased character is found, others in that word should be left alone.
2982  // This seems unusual; we map all other characters to lowercase instead.
2983  // But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above
2984  // is not the first cased character (it isn't even cased), we attempt to set it to lowercase;
2985  // but since afterI is also true here, this would mean deleting it. Thus our titlecased
2986  // form of "I followed by dot-above" would be just "I", which is clearly wrong.
2987  // So we treat this as a special case here.
2988  IAssert(cpFirstCased == LatinCapitalLetterI);
2989  dest.Add(0x307); continue; }
2990  if (howHere != ccLower) dest.Add(0x307);
2991  continue; }
2992  }
2993  // When lowercasing, unless an I is before a dot_above,
2994  // it turns into a dotless i.
2995  else if (cp == LatinCapitalLetterI)
2996  {
2997  // BeforeDot: C is followed by U+0307 (combining dot above).
2998  // Any sequence of characters with a combining class that is
2999  // neither 0 nor 230 may intervene between the current character
3000  // and the combining dot above.
3001  bool beforeDot = false;
3002  for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
3003  {
3004  const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
3005  if (cp2 == 0x307) { beforeDot = true; break; }
3006  const int cc2 = GetCombiningClass(cp2);
3007  if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break;
3008  }
3009  if (! beforeDot) {
3010  dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; }
3011  }
3012  // When uppercasing, i turns into a dotted capital I.
3013  else if (cp == LatinSmallLetterI)
3014  {
3015  dest.Add(howHere == ccLower ? 0x69 : 0x130); continue;
3016  }
3017  }
3018  // Try to use the unconditional mappings.
3019  const TIntIntVH &specHere = (
3020  howHere == how ? specials :
3021  howHere == ccLower ? specialCasingLower :
3022  howHere == ccTitle ? specialCasingTitle :
3023  howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0));
3024  int i = specHere.GetKeyId(cp);
3025  if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; }
3026  // Try to use the simple (one-character) mappings.
3027  i = h.GetKeyId(cp);
3028  if (i >= 0) {
3029  const TUniChInfo &ci = h[i];
3030  int cpNew = (
3031  howHere == ccLower ? ci.simpleLowerCaseMapping :
3032  howHere == ccUpper ? ci.simpleUpperCaseMapping :
3034  if (cpNew < 0) cpNew = cp;
3035  dest.Add(cpNew); continue; }
3036  // As a final resort, leave 'cp' unchanged.
3037  dest.Add(cp);
3038  }
3039 }
#define IAssert(Cond)
Definition: bd.h:262
static void AppendVector(const TVec< TSrcDat > &src, TVec< TDestDat > &dest)
Definition: unicode.h:278
enum TUniChDb::TCaseConversion_ TCaseConversion
TIntIntVH specialCasingUpper
Definition: unicode.h:1271
TIntIntVH specialCasingLower
Definition: unicode.h:1271
int simpleUpperCaseMapping
Definition: unicode.h:1022
int simpleTitleCaseMapping
Definition: unicode.h:1022
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
#define Assert(Cond)
Definition: bd.h:251
bool FindNextWordBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2483
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
int GetCombiningClass(const int cp) const
Definition: unicode.h:1399
int simpleLowerCaseMapping
Definition: unicode.h:1022
void FindWordBoundaries(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
Definition: unicode.h:2561
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
TIntIntVH specialCasingTitle
Definition: unicode.h:1271
TUniVecIdx TVecIdx
Definition: unicode.h:1260
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetCaseFolded ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest,
const bool  full,
const bool  turkic = false 
) const
inline

Definition at line 1629 of file unicode.h.

1630  { caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); }
void Fold(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool full, const bool turkic) const
Definition: unicode.h:293
TUniCaseFolding caseFolding
Definition: unicode.h:1268
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetCaseFolded ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
const bool  clrDest = true,
const bool  full = true,
const bool  turkic = false 
) const
inline

Definition at line 1632 of file unicode.h.

1632  {
1633  GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); }
void GetCaseFolded(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool full, const bool turkic=false) const
Definition: unicode.h:1629
static TStr TUniChDb::GetCaseFoldingFn ( )
inline static

Definition at line 1296 of file unicode.h.

1296 { return "CaseFolding.txt"; }
TUniChCategory TUniChDb::GetCat ( const int  cp) const
inline

Definition at line 1353 of file unicode.h.

1353 { int i = h.GetKeyId(cp); if (i < 0) return ucOther; else return h[i].cat; }
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
const char* TUniChDb::GetCharName ( const int  cp) const
inline

Definition at line 1331 of file unicode.h.

1331 { int i = h.GetKeyId(cp); if (i < 0) return 0; int ofs = h[i].nameOffset; return ofs < 0 ? 0 : charNames.GetCStr(ofs); }
const char * GetCStr(const uint &Offset) const
Definition: dt.h:811
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
TStrPool charNames
Definition: unicode.h:1264
TStr TUniChDb::GetCharNameS ( const int  cp) const
inline

Definition at line 1332 of file unicode.h.

1332  {
1333  // ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16).
1334  const char *p = GetCharName(cp); if (p) return p;
1335  char buf[20]; sprintf(buf, "U+%04x", cp); return TStr(buf); }
const char * GetCharName(const int cp) const
Definition: unicode.h:1331
Definition: dt.h:412
int TUniChDb::GetCombiningClass ( const int  cp) const
inline

Definition at line 1399 of file unicode.h.

1399 { int i = h.GetKeyId(cp); if (i < 0) return TUniChInfo::ccStarter; else return h[i].combClass; }
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
static TStr TUniChDb::GetCompositionExclusionsFn ( )
inline static

Definition at line 1299 of file unicode.h.

1299 { return "CompositionExclusions.txt"; }
static TStr TUniChDb::GetDerivedCorePropsFn ( )
inline static

Definition at line 1301 of file unicode.h.

1301 { return "DerivedCoreProperties.txt"; }
static TStr TUniChDb::GetLineBreakFn ( )
inline static

Definition at line 1302 of file unicode.h.

1302 { return "LineBreak.txt"; }
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetLowerCase ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest = true,
const bool  turkic = false,
const bool  lithuanian = false 
) const
inline

Definition at line 1590 of file unicode.h.

1590 { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); }
void GetCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const
Definition: unicode.h:2817
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetLowerCase ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
const bool  clrDest = true,
const bool  turkic = false,
const bool  lithuanian = false 
) const
inline

Definition at line 1593 of file unicode.h.

1593 { GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
void GetLowerCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Definition: unicode.h:1590
static TStr TUniChDb::GetNormalizationTestFn ( )
inline static

Definition at line 1309 of file unicode.h.

1309 { return "NormalizationTest.txt"; }
static TStr TUniChDb::GetPropListFn ( )
inline static

Definition at line 1303 of file unicode.h.

1303 { return "PropList.txt"; }
int TUniChDb::GetSbFlags ( const int  cp) const
inline

Definition at line 1359 of file unicode.h.

1359 { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetSbFlags(); }
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
int TUniChDb::GetScript ( const TUniChInfo &  ci) const
inline

Definition at line 1323 of file unicode.h.

1323 { int s = ci.script; if (s < 0) s = scriptUnknown; return s; }
signed char script
Definition: unicode.h:1021
int scriptUnknown
Definition: unicode.h:1272
int TUniChDb::GetScript ( const int  cp) const
inline

Definition at line 1324 of file unicode.h.

1324 { int i = h.GetKeyId(cp); if (i < 0) return scriptUnknown; else return GetScript(h[i]); }
int GetScript(const TUniChInfo &ci) const
Definition: unicode.h:1323
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
int scriptUnknown
Definition: unicode.h:1272
int TUniChDb::GetScriptByName ( const TStr &  scriptName) const
inline

Definition at line 1322 of file unicode.h.

1322 { return scripts.GetKeyId(scriptName); }
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
TStrIntH scripts
Definition: unicode.h:1265
const TStr& TUniChDb::GetScriptName ( const int  scriptId) const
inline

Definition at line 1321 of file unicode.h.

1321 { return scripts.GetKey(scriptId); }
TStrIntH scripts
Definition: unicode.h:1265
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:210
static TStr TUniChDb::GetScriptNameHiragana ( )
inline static

Definition at line 1319 of file unicode.h.

1319 { return "Hiragana"; }
static TStr TUniChDb::GetScriptNameKatakana ( )
inline static

Definition at line 1318 of file unicode.h.

1318 { return "Katakana"; }
static TStr TUniChDb::GetScriptNameUnknown ( )
inline static

Definition at line 1317 of file unicode.h.

1317 { return "Unknown"; }
static TStr TUniChDb::GetScriptsFn ( )
inline static

Definition at line 1300 of file unicode.h.

1300 { return "Scripts.txt"; }
static TStr TUniChDb::GetSentenceBreakPropertyFn ( )
inline static

Definition at line 1308 of file unicode.h.

1308 { return "SentenceBreakProperty.txt"; }
static TStr TUniChDb::GetSentenceBreakTestFn ( )
inline static

Definition at line 1307 of file unicode.h.

1307 { return "SentenceBreakTest.txt"; }
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetSimpleCaseConverted ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest,
const TCaseConversion  how 
) const

Definition at line 3042 of file unicode.h.

3044 {
3045  if (clrDest) dest.Clr();
3046  bool seenCased = false; size_t nextWordBoundary = srcIdx;
3047  for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
3048  {
3049  const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
3050  int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; }
3051  const TUniChInfo &ci = h[i];
3052  // With titlecasing, the first cased character of each word must be put into titlecase,
3053  // all others into lowercase. This is what the howHere variable is for.
3054  TUniChDb::TCaseConversion howHere;
3055  if (how != ccTitle) howHere = how;
3056  else {
3057  if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
3058  seenCased = false;
3059  size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
3060  IAssert(next > nextWordBoundary); nextWordBoundary = next; }
3061  bool isCased = IsCased(cp);
3062  if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
3063  else howHere = ccLower;
3064  }
3065  int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
3066  if (cpNew < 0) cpNew = cp;
3067  dest.Add(cpNew);
3068  }
3069 }
#define IAssert(Cond)
Definition: bd.h:262
enum TUniChDb::TCaseConversion_ TCaseConversion
int simpleUpperCaseMapping
Definition: unicode.h:1022
int simpleTitleCaseMapping
Definition: unicode.h:1022
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
bool FindNextWordBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2483
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
int simpleLowerCaseMapping
Definition: unicode.h:1022
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
TUniVecIdx TVecIdx
Definition: unicode.h:1260
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetSimpleLowerCase ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest = true 
) const
inline

Definition at line 1601 of file unicode.h.

1601 { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); }
void GetSimpleCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how) const
Definition: unicode.h:3042
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetSimpleLowerCase ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
const bool  clrDest = true 
) const
inline

Definition at line 1604 of file unicode.h.

1604 { GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); }
void GetSimpleLowerCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:1601
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetSimpleTitleCase ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest = true 
) const
inline

Definition at line 1603 of file unicode.h.

1603 { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); }
void GetSimpleCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how) const
Definition: unicode.h:3042
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetSimpleTitleCase ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
const bool  clrDest = true 
) const
inline

Definition at line 1606 of file unicode.h.

1606 { GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); }
void GetSimpleTitleCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:1603
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetSimpleUpperCase ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest = true 
) const
inline

Definition at line 1602 of file unicode.h.

1602 { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); }
void GetSimpleCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how) const
Definition: unicode.h:3042
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetSimpleUpperCase ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
const bool  clrDest = true 
) const
inline

Definition at line 1605 of file unicode.h.

1605 { GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); }
void GetSimpleUpperCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:1602
static TStr TUniChDb::GetSpecialCasingFn ( )
inlinestatic

Definition at line 1297 of file unicode.h.

1297 { return "SpecialCasing.txt"; }
TUniChSubCategory TUniChDb::GetSubCat ( const int  cp) const
inline

Definition at line 1354 of file unicode.h.

1354 { int i = h.GetKeyId(cp); if (i < 0) return ucOtherNotAssigned; else return h[i].subCat; }
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetTitleCase ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest = true,
const bool  turkic = false,
const bool  lithuanian = false 
) const
inline

Definition at line 1592 of file unicode.h.

1592 { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); }
void GetCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const
Definition: unicode.h:2817
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetTitleCase ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
const bool  clrDest = true,
const bool  turkic = false,
const bool  lithuanian = false 
) const
inline

Definition at line 1595 of file unicode.h.

1595 { GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
void GetTitleCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Definition: unicode.h:1592
static TStr TUniChDb::GetUnicodeDataFn ( )
inlinestatic

Definition at line 1298 of file unicode.h.

1298 { return "UnicodeData.txt"; }
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetUpperCase ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest = true,
const bool  turkic = false,
const bool  lithuanian = false 
) const
inline

Definition at line 1591 of file unicode.h.

1591 { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); }
void GetCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const
Definition: unicode.h:2817
template<typename TSrcVec , typename TDestCh >
void TUniChDb::GetUpperCase ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
const bool  clrDest = true,
const bool  turkic = false,
const bool  lithuanian = false 
) const
inline

Definition at line 1594 of file unicode.h.

1594 { GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
void GetUpperCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Definition: unicode.h:1591
int TUniChDb::GetWbFlags ( const int  cp) const
inline

Definition at line 1357 of file unicode.h.

1357 { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetWbFlags(); }
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
static TStr TUniChDb::GetWordBreakPropertyFn ( )
inlinestatic

Definition at line 1306 of file unicode.h.

1306 { return "WordBreakProperty.txt"; }
static TStr TUniChDb::GetWordBreakTestFn ( )
inlinestatic

Definition at line 1305 of file unicode.h.

1305 { return "WordBreakTest.txt"; }
void TUniChDb::InitAfterLoad ( )
protected

Definition at line 1368 of file unicode.cpp.

1369 {
1370  scriptUnknown = GetScriptByName(GetScriptNameUnknown()); IAssert(scriptUnknown >= 0);
1371 }
#define IAssert(Cond)
Definition: bd.h:262
int GetScriptByName(const TStr &scriptName) const
Definition: unicode.h:1322
static TStr GetScriptNameUnknown()
Definition: unicode.h:1317
int scriptUnknown
Definition: unicode.h:1272
void TUniChDb::InitDerivedCoreProperties ( const TStr & basePath)
protected

Definition at line 1007 of file unicode.cpp.

1008 {
1009  TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0;
1010  reader.Open(CombinePath(basePath, GetDerivedCorePropsFn()));
1011  TSubcatHelper helper(*this);
1012  while (reader.GetNextLine(fields))
1013  {
1014  IAssert(fields.Len() == 2);
1015  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1016  TStr s = fields[1];
1018  if (s == "Math") flag = ucfDcpMath;
1019  else if (s == "Alphabetic") flag = ucfDcpAlphabetic;
1020  else if (s == "Lowercase") flag = ucfDcpLowercase;
1021  else if (s == "Uppercase") flag = ucfDcpUppercase;
1022  else if (s == "ID_Start") flag = ucfDcpIdStart;
1023  else if (s == "ID_Continue") flag = ucfDcpIdContinue;
1024  else if (s == "XID_Start") flag = ucfDcpXidStart;
1025  else if (s == "XID_Continue") flag = ucfDcpXidContinue;
1026  else if (s == "Default_Ignorable_Code_Point") flag = ucfDcpDefaultIgnorableCodePoint;
1027  else if (s == "Grapheme_Extend") flag = ucfDcpGraphemeExtend;
1028  else if (s == "Grapheme_Base") flag = ucfDcpGraphemeBase;
1029  else if (s == "Grapheme_Link") continue; // this flag is deprecated; test for combClass == Virama instead
1030  else FailR(s.CStr());
1031  // If we add new codepoints to the hash table, we should also set their category.
1032  // This is supposed to be provided in the comment, e.g. "# Cf SOFT HYPHEN".
1033  helper.ProcessComment(reader);
1034  //
1035  for (int cp = from; cp <= to; cp++) {
1036  int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
1037  helper.TestCat(cp);
1038  TUniChInfo &ci = h[i]; IAssert(! ci.IsDcpFlag(flag));
1039  ci.SetDcpFlag(flag); nCps++; }
1040  nLines++;
1041  }
1042  reader.Close();
1043  printf("TUniChDb::InitDerivedCoreProperties: %d lines, %d code points.\n", nLines, nCps);
1044 }
#define IAssert(Cond)
Definition: bd.h:262
enum TUniChFlags_ TUniChFlags
bool IsDcpFlag(const TUniChFlags flag) const
Definition: unicode.h:1068
void SetDcpFlag(const TUniChFlags flag)
Definition: unicode.h:1070
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
static TStr GetDerivedCorePropsFn()
Definition: unicode.h:1301
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
#define FailR(Reason)
Definition: bd.h:240
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
int AddKey(const TKey &Key)
Definition: hash.h:331
TStr CombinePath(const TStr &s, const TStr &t)
Definition: unicode.cpp:32
Definition: dt.h:412
char * CStr()
Definition: dt.h:476
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:429
void TUniChDb::InitLineBreaks ( const TStr & basePath)
protected

Definition at line 1046 of file unicode.cpp.

1047 {
1048  // Clear old linebreak values.
1049  const ushort xx = TUniChInfo::LineBreak_Unknown;
1050  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) h[i].lineBreak = xx;
1051  // Read LineBreak.txt.
1052  TUcdFileReader reader; TStrV fields;
1053  reader.Open(CombinePath(basePath, GetLineBreakFn()));
1054  int nLines = 0, nCps = 0;
1055  while (reader.GetNextLine(fields))
1056  {
1057  IAssert(fields.Len() == 2);
1058  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1059  TStr s = fields[1]; IAssert(s.Len() == 2);
1060  ushort us = TUniChInfo::GetLineBreakCode(s[0], s[1]);
1061  if (us == xx) continue;
1062  for (int cp = from; cp <= to; cp++) {
1063  int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp);
1064  printf("TUniChDb::InitLineBreaks: warning, adding codepoint %d, its category will remain unknown.\n", cp); }
1065  IAssert(h[i].lineBreak == xx);
1066  h[i].lineBreak = us; nCps++; }
1067  nLines++;
1068  }
1069  reader.Close();
1070  printf("TUniChDb::InitLineBreaks: %d lines, %d codepoints processed (excluding \'xx\' values).\n", nLines, nCps);
1071 }
#define IAssert(Cond)
Definition: bd.h:262
int Len() const
Definition: dt.h:487
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
bool FNextKeyId(int &KeyId) const
Definition: hash.h:436
int FFirstKeyId() const
Definition: hash.h:236
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
static TStr GetLineBreakFn()
Definition: unicode.h:1302
unsigned short ushort
Definition: bd.h:13
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
int AddKey(const TKey &Key)
Definition: hash.h:331
TStr CombinePath(const TStr &s, const TStr &t)
Definition: unicode.cpp:32
static const ushort LineBreak_Unknown
Definition: unicode.h:1032
Definition: dt.h:412
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:429
static ushort GetLineBreakCode(char c1, char c2)
Definition: unicode.h:1031
void TUniChDb::InitPropList ( const TStr & basePath)
protected

Definition at line 950 of file unicode.cpp.

951 {
952  TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0;
953  reader.Open(CombinePath(basePath, GetPropListFn()));
954  TSubcatHelper helper(*this);
955  while (reader.GetNextLine(fields))
956  {
957  IAssert(fields.Len() == 2);
958  int from, to; reader.ParseCodePointRange(fields[0], from, to);
959  TStr s = fields[1];
961  if (s == "White_Space") prop = ucfPrWhiteSpace;
962  else if (s == "Bidi_Control") prop = ucfPrBidiControl;
963  else if (s == "Join_Control") prop = ucfPrJoinControl;
964  else if (s == "Dash") prop = ucfPrDash;
965  else if (s == "Hyphen") prop = ucfPrHyphen;
966  else if (s == "Quotation_Mark") prop = ucfPrQuotationMark;
967  else if (s == "Terminal_Punctuation") prop = ucfPrTerminalPunctuation;
968  else if (s == "Other_Math") propx = ucfPxOtherMath;
969  else if (s == "Hex_Digit") prop = ucfPrHexDigit;
970  else if (s == "ASCII_Hex_Digit") prop = ucfPrAsciiHexDigit;
971  else if (s == "Other_Alphabetic") propx = ucfPxOtherAlphabetic;
972  else if (s == "Ideographic") prop = ucfPrIdeographic;
973  else if (s == "Diacritic") prop = ucfPrDiacritic;
974  else if (s == "Extender") prop = ucfPrExtender;
975  else if (s == "Other_Lowercase") propx = ucfPxOtherLowercase;
976  else if (s == "Other_Uppercase") propx = ucfPxOtherUppercase;
977  else if (s == "Noncharacter_Code_Point") prop = ucfPrNoncharacterCodePoint;
978  else if (s == "Other_Grapheme_Extend") propx = ucfPxOtherGraphemeExtend;
979  else if (s == "IDS_Binary_Operator") propx = ucfPxIdsBinaryOperator;
980  else if (s == "IDS_Trinary_Operator") propx = ucfPxIdsTrinaryOperator;
981  else if (s == "Radical") propx = ucfPxRadical;
982  else if (s == "Unified_Ideograph") propx = ucfPxUnifiedIdeograph;
983  else if (s == "Other_Default_Ignorable_Code_Point") propx = ucfPxOtherDefaultIgnorableCodePoint;
984  else if (s == "Deprecated") prop = ucfPrDeprecated;
985  else if (s == "Soft_Dotted") prop = ucfPrSoftDotted;
986  else if (s == "Logical_Order_Exception") prop = ucfPrLogicalOrderException;
987  else if (s == "Other_ID_Start") propx = ucfPxOtherIdStart;
988  else if (s == "Other_ID_Continue") propx = ucfPxOtherIdContinue;
989  else if (s == "STerm") prop = ucfPrSTerm;
990  else if (s == "Variation_Selector") prop = ucfPrVariationSelector;
991  else if (s == "Pattern_White_Space") prop = ucfPrPatternWhiteSpace;
992  else if (s == "Pattern_Syntax") prop = ucfPrPatternSyntax;
993  else FailR(s.CStr());
994  helper.ProcessComment(reader);
995  for (int cp = from; cp <= to; cp++) {
996  int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
997  TUniChInfo &ci = h[i]; helper.TestCat(cp);
998  if (prop) { IAssert(! ci.IsProperty(prop)); ci.SetProperty(prop); }
999  if (propx) { IAssert(! ci.IsPropertyX(propx)); ci.SetPropertyX(propx); }
1000  nCps++; }
1001  nLines++;
1002  }
1003  reader.Close();
1004  printf("TUniChDb::InitPropList: %d lines, %d code points.\n", nLines, nCps);
1005 }
#define IAssert(Cond)
Definition: bd.h:262
enum TUniChProperties_ TUniChProperties
void SetPropertyX(const TUniChPropertiesX flag)
Definition: unicode.h:1108
void SetProperty(const TUniChProperties flag)
Definition: unicode.h:1085
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
enum TUniChPropertiesX_ TUniChPropertiesX
bool IsPropertyX(const TUniChPropertiesX flag) const
Definition: unicode.h:1107
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
#define FailR(Reason)
Definition: bd.h:240
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
int AddKey(const TKey &Key)
Definition: hash.h:331
TStr CombinePath(const TStr &s, const TStr &t)
Definition: unicode.cpp:32
Definition: dt.h:412
static TStr GetPropListFn()
Definition: unicode.h:1303
bool IsProperty(const TUniChProperties flag) const
Definition: unicode.h:1084
char * CStr()
Definition: dt.h:476
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:429
void TUniChDb::InitScripts ( const TStr & basePath)
protected

Definition at line 1073 of file unicode.cpp.

1074 {
1075  TUcdFileReader reader; TStrV fields;
1076  reader.Open(CombinePath(basePath, GetScriptsFn()));
1077  TSubcatHelper helper(*this);
1078  while (reader.GetNextLine(fields))
1079  {
1080  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1081  TStr scriptName = fields[1];
1082  int scriptNo = scripts.GetKeyId(scriptName);
1083  if (scriptNo < 0) { scriptNo = scripts.AddKey(scriptName); scripts[scriptNo] = 0; }
1084  IAssert(scriptNo >= 0 && scriptNo < SCHAR_MAX); // because TUniChInfo.script is a signed char
1085  scripts[scriptNo] += 1;
1086  helper.ProcessComment(reader);
1087  for (int cp = from; cp <= to; cp++) {
1088  int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
1089  helper.TestCat(cp);
1090  TUniChInfo &ci = h[i]; ci.script = scriptNo; }
1091  }
1092  reader.Close();
1093  scripts.AddDat(GetScriptNameUnknown()) = 0;
1094  printf("TUniChDb::InitScripts: %d scripts: ", scripts.Len());
1095  if (AlwaysFalse()) for (int i = scripts.FFirstKeyId(); scripts.FNextKeyId(i); )
1096  printf(" %d:%s (%d)", i, scripts.GetKey(i).CStr(), int(scripts[i]));
1097  printf("\n");
1098 }
#define IAssert(Cond)
Definition: bd.h:262
static TStr GetScriptsFn()
Definition: unicode.h:1300
static TStr GetScriptNameUnknown()
Definition: unicode.h:1317
bool FNextKeyId(int &KeyId) const
Definition: hash.h:436
int FFirstKeyId() const
Definition: hash.h:236
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
int AddKey(const TKey &Key)
Definition: hash.h:331
TStr CombinePath(const TStr &s, const TStr &t)
Definition: unicode.cpp:32
TStrIntH scripts
Definition: unicode.h:1265
Definition: dt.h:412
signed char script
Definition: unicode.h:1021
char * CStr()
Definition: dt.h:476
int Len() const
Definition: hash.h:186
TDat & AddDat(const TKey &Key)
Definition: hash.h:196
bool AlwaysFalse()
Definition: unicode.h:3227
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:210
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:429
void TUniChDb::InitSpecialCasing ( const TStr & basePath)
protected

Definition at line 1225 of file unicode.cpp.

1226 {
1227  TUcdFileReader reader; TStrV fields;
1228  reader.Open(CombinePath(basePath, GetSpecialCasingFn()));
1229  while (reader.GetNextLine(fields))
1230  {
1231  IAssert(fields.Len() == 5 || fields.Len() == 6);
1232  IAssert(fields.Last().Empty());
1233  // Skip conditional mappings -- they will be hardcoded in the GetCaseConverted method.
1234  TStr conditions = "";
1235  if (fields.Len() == 6) conditions = fields[4];
1236  conditions.ToTrunc(); if (! conditions.Empty()) continue;
1237  // Keep the other mappings.
1238  const int cp = reader.ParseCodePoint(fields[0]);
1239  TIntV v; reader.ParseCodePointList(fields[1], v);
1240  specialCasingLower.AddDat(cp, v);
1241  reader.ParseCodePointList(fields[2], v);
1242  specialCasingTitle.AddDat(cp, v);
1243  reader.ParseCodePointList(fields[3], v);
1244  specialCasingUpper.AddDat(cp, v);
1245  }
1246  reader.Close();
1247 }
#define IAssert(Cond)
Definition: bd.h:262
static TStr GetSpecialCasingFn()
Definition: unicode.h:1297
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TIntIntVH specialCasingUpper
Definition: unicode.h:1271
TIntIntVH specialCasingLower
Definition: unicode.h:1271
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:551
TStr CombinePath(const TStr &s, const TStr &t)
Definition: unicode.cpp:32
Definition: dt.h:412
TIntIntVH specialCasingTitle
Definition: unicode.h:1271
TDat & AddDat(const TKey &Key)
Definition: hash.h:196
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:429
void TUniChDb::InitWordAndSentenceBoundaryFlags ( const TStr & basePath)
protected

Definition at line 1100 of file unicode.cpp.

1101 {
1102  // UAX #29, sec. 4.1 and 5.1.
1103  // Note: these flags can also be initialized from auxiliary\\WordBreakProperty.txt.
1104  int katakana = GetScriptByName(GetScriptNameKatakana()); IAssert(katakana >= 0);
1105  int hiragana = GetScriptByName(GetScriptNameHiragana()); IAssert(hiragana >= 0);
1106  // Clear any existing word-boundary flags and initialize them again.
1107  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1108  {
1109  const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
1110  ci.ClrWbAndSbFlags();
1111  // Word-boundary flags.
1112  if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetWbFlag(ucfWbFormat);
1113  if (ci.script == katakana) ci.SetWbFlag(ucfWbKatakana);
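1114  if (ci.lineBreak == TUniChInfo::LineBreak_InfixNumeric && cp != 0x3a) ci.SetWbFlag(ucfWbMidNum);
1115  if (ci.lineBreak == TUniChInfo::LineBreak_Numeric) ci.SetWbFlag(ucfWbNumeric);
1116  if (ci.subCat == ucPunctuationConnector) ci.SetWbFlag(ucfWbExtendNumLet);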
1117  // Sentence-boundary flags. Some are identical to some word-boundary flags.
1118  if (cp == 0xa || cp == 0xd || cp == 0x85 || cp == 0x2028 || cp == 0x2029) ci.SetSbFlag(ucfSbSep);
1119  if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetSbFlag(ucfSbFormat);
1120  if (ci.IsWhiteSpace() && ! ci.IsSbFlag(ucfSbSep) && cp != 0xa0) ci.SetSbFlag(ucfSbSp);
1121  if (ci.IsLowercase() && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbLower);
1122  if (ci.IsUppercase() || ci.subCat == ucLetterTitlecase) ci.SetSbFlag(ucfSbUpper);
1123  if ((ci.IsAlphabetic() || cp == 0xa0 || cp == 0x5f3) && ! ci.IsSbFlag(ucfSbLower) && ! ci.IsSbFlag(ucfSbUpper) && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbOLetter);
1124  if (ci.lineBreak == TUniChInfo::LineBreak_Numeric) ci.SetSbFlag(ucfSbNumeric);
1125  if (cp == 0x2e) ci.SetSbFlag(ucfSbATerm);
1126  // Note: UAX #29 says that if the property STerm = true, then the character should belong to the STerm class for
1127  // the purposes of sentence-boundary detection. Now in PropList.txt there is no doubt that 002E has the STerm
1128  // property; thus, it should also belong to the STerm sentence-boundary class. However, in
1129  // SentenceBreakProperty.txt, 002E is only listed in the ATerm class, but not in the STerm class.
1130  if (ci.IsSTerminal() && cp != 0x2e) ci.SetSbFlag(ucfSbSTerm);
1131  if ((ci.subCat == ucPunctuationOpen || ci.subCat == ucPunctuationClose || ci.lineBreak == TUniChInfo::LineBreak_Quotation) && cp != 0x5f3 && ! ci.IsSbFlag(ucfSbATerm) && ! ci.IsSbFlag(ucfSbSTerm)) ci.SetSbFlag(ucfSbClose);
1132  }
1133  // Some additional characters for Katakana and MidLetter.
1134  TIntV v = (VB, 0x3031, 0x3032, 0x3033, 0x3034, 0x3035, 0x309b, 0x309c, 0x30a0, 0x30fc, 0xff70, 0xff9e, 0xff9f);
1135  for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbKatakana);
1136  v = (VB, 0x27, 0xb7, 0x5f4, 0x2019, 0x2027, 0x3a);
1137  for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbMidLetter);
1138  // WbALetter depends on Katakana, so it cannot be initialized earlier.
1139  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1140  {
1141  const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
1142  if ((ci.IsAlphabetic() || cp == 0x5f3) && ! ci.IsIdeographic() && ! ci.IsWbFlag(ucfWbKatakana) && ci.lineBreak != TUniChInfo::LineBreak_ComplexContext && ci.script != hiragana && ! ci.IsGraphemeExtend())
1143  ci.SetWbFlag(ucfWbALetter);
1144  }
1145  // An alternative is to extract the flags from WordBreakProperty.txt.
1146  // The results should be the same.
1147  {TUcdFileReader reader; TStrV fields;
1148  reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), GetWordBreakPropertyFn()));
1149  THash<TInt, TInt> hh;
1150  while (reader.GetNextLine(fields))
1151  {
1152  IAssert(fields.Len() == 2);
1153  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1154  TStr s = fields[1];
1156  if (s == "Format") flag = ucfWbFormat;
1157  else if (s == "Katakana") flag = ucfWbKatakana;
1158  else if (s == "ALetter") flag = ucfWbALetter;
1159  else if (s == "MidLetter") flag = ucfWbMidLetter;
1160  else if (s == "MidNum") flag = ucfWbMidNum;
1161  else if (s == "Numeric") flag = ucfWbNumeric;
1162  else if (s == "ExtendNumLet") flag = ucfWbExtendNumLet;
1163  else FailR(s.CStr());
1164  for (int c = from; c <= to; c++) {
1165  int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
1166  else hh[i].Val |= flag; }
1167  }
1168  reader.Close();
1169  TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
1170  for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
1171  cps.Sort(); cps.Merge();
1172  for (int i = 0; i < cps.Len(); i++)
1173  {
1174  int cp = cps[i];
1175  int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetWbFlags();
1176  int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
1177  flags1 &= ~ucfSbSep; flags2 &= ~ucfSbSep;
1178  if (flags1 != flags2) {
1179  printf("cp = %04x: flags1 = %08x flags2 = %08x xor = %08x\n", cp, flags1, flags2, flags1 ^ flags2);
1180  Fail; }
1181  }}
1182  // Likewise, for sentence boundary flags we have SentenceBreakProperty.txt.
1183  {TUcdFileReader reader; TStrV fields;
1184  reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), GetSentenceBreakPropertyFn()));
1185  THash<TInt, TInt> hh;
1186  while (reader.GetNextLine(fields))
1187  {
1188  IAssert(fields.Len() == 2);
1189  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1190  TStr s = fields[1];
1192  if (s == "Sep") flag = ucfSbSep;
1193  else if (s == "Format") flag = ucfSbFormat;
1194  else if (s == "Sp") flag = ucfSbSp;
1195  else if (s == "Lower") flag = ucfSbLower;
1196  else if (s == "Upper") flag = ucfSbUpper;
1197  else if (s == "OLetter") flag = ucfSbOLetter;
1198  else if (s == "Numeric") flag = ucfSbNumeric;
1199  else if (s == "ATerm") flag = ucfSbATerm;
1200  else if (s == "STerm") flag = ucfSbSTerm;
1201  else if (s == "Close") flag = ucfSbClose;
1202  else FailR(s.CStr());
1203  for (int c = from; c <= to; c++) {
1204  int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
1205  else hh[i].Val |= flag; }
1206  }
1207  reader.Close();
1208  TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
1209  for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
1210  cps.Sort(); cps.Merge();
1211  for (int i = 0; i < cps.Len(); i++)
1212  {
1213  int cp = cps[i];
1214  int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetSbFlags();
1215  int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
1216  if (flags1 != flags2) {
1217  printf("cp = %04x: flags1 = %08x [%s] flags2 = %08x [%s] xor = %08x\n", cp,
1218  flags1, TUniChInfo::GetSbFlagsStr(flags1).CStr(),
1219  flags2, TUniChInfo::GetSbFlagsStr(flags2).CStr(),
1220  flags1 ^ flags2);
1221  Fail; }
1222  }}
1223 }
#define IAssert(Cond)
Definition: bd.h:262
int GetWbFlags() const
Definition: unicode.h:1118
int GetScriptByName(const TStr &scriptName) const
Definition: unicode.h:1322
void Merge()
Sorts the vector and only keeps a single element of each value.
Definition: ds.h:1292
enum TUniChFlags_ TUniChFlags
static const ushort LineBreak_Quotation
Definition: unicode.h:1032
bool IsGraphemeExtend() const
Definition: unicode.h:1077
void SetSbFlag(const TUniChFlags flag)
Definition: unicode.h:1127
TUniChSubCategory subCat
Definition: unicode.h:1020
void SetWbFlag(const TUniChFlags flag)
Definition: unicode.h:1117
#define Fail
Definition: bd.h:238
static TStr GetScriptNameKatakana()
Definition: unicode.h:1318
static const ushort LineBreak_InfixNumeric
Definition: unicode.h:1032
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
bool IsAlphabetic() const
Definition: unicode.h:1071
static const ushort LineBreak_ComplexContext
Definition: unicode.h:1032
bool IsWhiteSpace() const
Definition: unicode.h:1104
const TDat & GetDat(const TKey &Key) const
Definition: hash.h:220
bool IsUppercase() const
Definition: unicode.h:1072
bool IsLowercase() const
Definition: unicode.h:1073
void Sort(const bool &Asc=true)
Sorts the elements of the vector.
Definition: ds.h:1254
ushort lineBreak
Definition: unicode.h:1028
bool FNextKeyId(int &KeyId) const
Definition: hash.h:436
TStr GetSbFlagsStr() const
Definition: unicode.h:1130
int FFirstKeyId() const
Definition: hash.h:236
static TStr GetWordBreakPropertyFn()
Definition: unicode.h:1306
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
#define FailR(Reason)
Definition: bd.h:240
bool IsSbFlag(const TUniChFlags flag) const
Definition: unicode.h:1126
void ClrWbAndSbFlags()
Definition: unicode.h:1116
static const ushort LineBreak_Numeric
Definition: unicode.h:1032
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
static TStr GetSentenceBreakPropertyFn()
Definition: unicode.h:1308
TStr CombinePath(const TStr &s, const TStr &t)
Definition: unicode.cpp:32
Definition: dt.h:412
int GetSbFlags() const
Definition: unicode.h:1128
bool IsSTerminal() const
Definition: unicode.h:1101
bool IsWbFlag(const TUniChFlags flag) const
Definition: unicode.h:1115
signed char script
Definition: unicode.h:1021
char * CStr()
Definition: dt.h:476
bool IsKey(const TKey &Key) const
Definition: hash.h:216
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
bool IsIdeographic() const
Definition: unicode.h:1095
TDat & AddDat(const TKey &Key)
Definition: hash.h:196
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:210
static TStr GetAuxiliaryDir()
Definition: unicode.h:1304
static TStr GetScriptNameHiragana()
Definition: unicode.h:1319
bool TUniChDb::IsGetChInfo ( const int  cp,
TUniChInfo & ChInfo 
)
inline

Definition at line 1350 of file unicode.h.

1350  {
1351  int i = h.GetKeyId(cp);
1352  if (i < 0) return false; else { ChInfo=h[i]; return true; }}
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
DECLARE_FORWARDED_PROPERTY_METHODS bool TUniChDb::IsPrivateUse ( const int  cp) const
inline

Definition at line 1383 of file unicode.h.

1383  {
1384  int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsPrivateUse();
1385  return (0xe000 <= cp && cp <= 0xf8ff) || // plane 0 private-use area
1386  // Planes 15 and 16 are entirely for private use.
1387  (0xf0000 <= cp && cp <= 0xffffd) || (0x100000 <= cp && cp <= 0x10fffd); }
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
bool TUniChDb::IsSbFlag ( const int  cp,
const TUniChFlags  flag 
) const
inline

Definition at line 1358 of file unicode.h.

1358 { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsSbFlag(flag); }
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
bool TUniChDb::IsSurrogate ( const int  cp) const
inline

Definition at line 1392 of file unicode.h.

1392  {
1393  int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsSurrogate();
1394  return 0xd800 <= cp && cp <= 0xdcff; }
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
bool TUniChDb::IsWbFlag ( const int  cp,
const TUniChFlags  flag 
) const
inline

Definition at line 1356 of file unicode.h.

1356 { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsWbFlag(flag); }
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
static bool TUniChDb::IsWbIgnored ( const TUniChInfo ci)
inlinestaticprotected

Definition at line 1419 of file unicode.h.

1419 { return ci.IsGbExtend() || ci.IsWbFormat(); }
bool IsGbExtend() const
Definition: unicode.h:1139
bool IsWbFormat() const
Definition: unicode.h:1119
bool TUniChDb::IsWbIgnored ( const int  cp) const
inlineprotected

Definition at line 1420 of file unicode.h.

1420 { int i = h.GetKeyId(cp); if (i < 0) return false; else return IsWbIgnored(h[i]); }
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
static bool IsWbIgnored(const TUniChInfo &ci)
Definition: unicode.h:1419
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
void TUniChDb::Load ( TSIn SIn)
inline

Definition at line 1285 of file unicode.h.

1285  {
1286  h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn);
1287  decompositions.Load(SIn);
1288  inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn);
1290  SIn.LoadCs(); InitAfterLoad(); }
void InitAfterLoad()
Definition: unicode.cpp:1368
void Load(TSIn &SIn)
Definition: unicode.h:286
TIntIntVH specialCasingUpper
Definition: unicode.h:1271
void Load(TSIn &SIn)
Definition: ds.h:895
void LoadCs()
Definition: fl.cpp:28
void Load(TSIn &SIn)
Definition: hash.h:137
TIntIntVH specialCasingLower
Definition: unicode.h:1271
THash< TIntPr, TInt > inverseDec
Definition: unicode.h:1267
Definition: dt.h:778
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
TIntV decompositions
Definition: unicode.h:1266
TStrIntH scripts
Definition: unicode.h:1265
~TStrPool()
Definition: dt.h:789
TStrPool charNames
Definition: unicode.h:1264
TUniCaseFolding caseFolding
Definition: unicode.h:1268
TIntIntVH specialCasingTitle
Definition: unicode.h:1271
void TUniChDb::LoadBin ( const TStr fnBin)
inline

Definition at line 1291 of file unicode.h.

1291  {
1292  PSIn SIn = TFIn::New(fnBin); Load(*SIn); }
static PSIn New(const TStr &FNm)
Definition: fl.cpp:290
void Load(TSIn &SIn)
Definition: unicode.h:1285
void TUniChDb::LoadTxt ( const TStr basePath)

Definition at line 1249 of file unicode.cpp.

1250 {
1251  Clr();
1252  // Set up a hash table with enough ports that there will be more or less no chains longer than 1 element.
1253  h = THash<TInt, TUniChInfo>(196613, true);
1254  //
1256  //
1257  TUcdFileReader reader; TStrV fields; TIntH seen;
1258  reader.Open(CombinePath(basePath, GetUnicodeDataFn()));
1259  while (reader.GetNextLine(fields))
1260  {
1261  // Codepoint.
1262  int cp = reader.ParseCodePoint(fields[0]);
1263  IAssert(! seen.IsKey(cp)); seen.AddKey(cp);
1264  TUniChInfo& ci = h.AddDat(cp);
1265  // Name.
1266  ci.nameOffset = charNames.AddStr(fields[1]);
1267  // Category.
1268  TStr& s = fields[2]; IAssert(s.Len() == 2);
1269  ci.chCat = s[0]; ci.chSubCat = s[1];
1270  // Canonical combining class.
1271  s = fields[3]; IAssert(s.Len() > 0);
1272  int i; bool ok = s.IsInt(true, TUCh::Mn, TUCh::Mx, i); IAssertR(ok, s);
1273  ci.combClass = (uchar) i;
1274  // Decomposition type and mapping.
1275  LoadTxt_ProcessDecomposition(ci, fields[5]);
1276  // Simple case mappings.
1277  s = fields[12]; ci.simpleUpperCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
1278  s = fields[13]; ci.simpleLowerCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
1279  s = fields[14]; ci.simpleTitleCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
1280  //
1281  ci.InitAfterLoad(); // initializes ci.cat, ci.subCat
1282  }
1283  reader.Close();
1284  //
1285  InitScripts(basePath);
1286  //
1287  InitPropList(basePath);
1288  InitDerivedCoreProperties(basePath);
1289  InitLineBreaks(basePath);
1290  InitSpecialCasing(basePath);
1291  // Process the composition exclusions (UAX #15, sec. 6).
1292  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1293  {
1294  TUniChInfo& ci = h[i];
1295  int ofs = ci.decompOffset; if (ofs < 0) continue;
1296  int n = 0; while (decompositions[ofs + n] >= 0) n++;
1297  IAssert(n > 0);
1298  // Singleton decompositions.
1299  if (n == 1) { ci.flags |= ucfCompositionExclusion; continue; }
1300  // Non-starter decompositions.
1301  int cp1 = decompositions[ofs];
1302  IAssert(h.IsKey(cp1));
1303  uchar ccc = h.GetDat(cp1).combClass;
1304  if (ccc != TUniChInfo::ccStarter) { ci.flags |= ucfCompositionExclusion; continue; }
1305  }
1306  // Process the composition exclusion table.
1307  reader.Open(CombinePath(basePath, GetCompositionExclusionsFn()));
1308  int nExclusionTable = 0;
1309  while (reader.GetNextLine(fields))
1310  {
1311  IAssert(fields.Len() == 1);
1312  int cp = reader.ParseCodePoint(fields[0]);
1313  int i = h.GetKeyId(cp); IAssert(i >= 0);
1314  h[i].flags |= ucfCompositionExclusion;
1315  nExclusionTable++;
1316  }
1317  reader.Close();
1318  // Prepare the inverted index for composition pairs.
1319  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1320  {
1321  int cp = h.GetKey(i);
1322  TUniChInfo& ci = h[i];
1323  int ofs = ci.decompOffset; if (ofs < 0) continue;
1324  if (ci.IsCompositionExclusion()) continue;
1325  if (ci.IsCompatibilityDecomposition()) continue;
1326  int n = 0; while (decompositions[ofs + n] >= 0) n++;
1327  if (n != 2) continue;
1328  TIntPr pr = TIntPr(decompositions[ofs], decompositions[ofs + 1]);
1329  IAssert(! inverseDec.IsKey(pr));
1331  inverseDec.AddDat(pr, cp);
1332  }
1333  printf("TUniChDb(%s): %d chars in h, %d in decomp inverse index; %d in decomp vector; %d in exclusion table\n",
1334  basePath.CStr(), h.Len(), inverseDec.Len(), decompositions.Len(), nExclusionTable);
1335  // Before calling InitWordAndSentenceBoundaryFlags(), scripts must have been initialized, as well as
1336  // flags such as Alphabetic, Word_Break, and Grapheme_Extend.
1337  InitWordAndSentenceBoundaryFlags(basePath); // Note: scripts must have been initialized by this point.
1338  // Make sure that Hangul combined characters are treated as starters.
1339  for (int cp = HangulSBase; cp < HangulSBase + HangulSCount; cp++)
1340  {
1341  int j = h.GetKeyId(cp); if (j < 0) continue;
1342  TUniChInfo& ci = h[j];
1345  }
1346  // There should be no more additions to 'h' beyond this point.
1347  const int oldHLen = h.Len();
1348  // Provide default (identity) case mappings if any were missing from UnicodeData.txt
1349  // (or if any entirely new characters were added later, e.g. while reading LineBreaks.txt).
1351  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1352  {
1353  int cp = h.GetKey(i); TUniChInfo &ci = h[i];
1354  if (ci.simpleLowerCaseMapping < 0) ci.simpleLowerCaseMapping = cp;
1355  if (ci.simpleUpperCaseMapping < 0) ci.simpleUpperCaseMapping = cp;
1356  if (ci.simpleTitleCaseMapping < 0) ci.simpleTitleCaseMapping = cp;
1357  if (ci.script < 0) ci.script = scriptUnknown;
1358  }
1359  IAssert(h.Len() == oldHLen);
1360 }
#define IAssert(Cond)
Definition: bd.h:262
TPair< TInt, TInt > TIntPr
Definition: ds.h:83
void Clr()
Definition: unicode.h:1276
#define IAssertR(Cond, Reason)
Definition: bd.h:265
int Len() const
Definition: dt.h:487
bool IsInt(const bool &Check, const int &MnVal, const int &MxVal, int &Val) const
Definition: dt.cpp:1159
int GetScriptByName(const TStr &scriptName) const
Definition: unicode.h:1322
bool IsCompositionExclusion() const
Definition: unicode.h:1111
uchar combClass
Definition: unicode.h:1018
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
void InitPropList(const TStr &basePath)
Definition: unicode.cpp:950
void InitDerivedCoreProperties(const TStr &basePath)
Definition: unicode.cpp:1007
void InitAfterLoad()
Definition: unicode.h:1035
void InitLineBreaks(const TStr &basePath)
Definition: unicode.cpp:1046
const TDat & GetDat(const TKey &Key) const
Definition: hash.h:220
char chCat
Definition: unicode.h:1017
int simpleUpperCaseMapping
Definition: unicode.h:1022
static TStr GetUnicodeDataFn()
Definition: unicode.h:1298
THash< TIntPr, TInt > inverseDec
Definition: unicode.h:1267
int simpleTitleCaseMapping
Definition: unicode.h:1022
bool IsCompatibilityDecomposition() const
Definition: unicode.h:1112
static TStr GetScriptNameUnknown()
Definition: unicode.h:1317
bool FNextKeyId(int &KeyId) const
Definition: hash.h:436
void LoadTxt_ProcessDecomposition(TUniChInfo &ci, TStr s)
Definition: unicode.cpp:937
void InitSpecialCasing(const TStr &basePath)
Definition: unicode.cpp:1225
int FFirstKeyId() const
Definition: hash.h:236
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
unsigned char uchar
Definition: bd.h:10
void InitScripts(const TStr &basePath)
Definition: unicode.cpp:1073
TIntV decompositions
Definition: unicode.h:1266
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
void LoadTxt(const TStr &fileName)
Definition: unicode.cpp:505
Definition: ds.h:32
int AddKey(const TKey &Key)
Definition: hash.h:331
void InitWordAndSentenceBoundaryFlags(const TStr &basePath)
Definition: unicode.cpp:1100
char chSubCat
Definition: unicode.h:1017
int simpleLowerCaseMapping
Definition: unicode.h:1022
TStr CombinePath(const TStr &s, const TStr &t)
Definition: unicode.cpp:32
static TStr GetCompositionExclusionsFn()
Definition: unicode.h:1299
Definition: dt.h:412
bool Empty() const
Definition: dt.h:488
int decompOffset
Definition: unicode.h:1023
TStrPool charNames
Definition: unicode.h:1264
static const uchar Mx
Definition: dt.h:1005
signed char script
Definition: unicode.h:1021
int nameOffset
Definition: unicode.h:1024
int scriptUnknown
Definition: unicode.h:1272
char * CStr()
Definition: dt.h:476
bool IsKey(const TKey &Key) const
Definition: hash.h:216
TUniCaseFolding caseFolding
Definition: unicode.h:1268
uint AddStr(const char *Str, const uint &Len)
Definition: dt.cpp:1711
int Len() const
Definition: hash.h:186
int flags
Definition: unicode.h:1025
TDat & AddDat(const TKey &Key)
Definition: hash.h:196
static const uchar Mn
Definition: dt.h:1004
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:210
static TStr GetCaseFoldingFn()
Definition: unicode.h:1296
Vector is a sequence of TVal objects representing an array that can change in size.
Definition: ds.h:429
void TUniChDb::LoadTxt_ProcessDecomposition ( TUniChInfo ci,
TStr  s 
)
protected

Definition at line 937 of file unicode.cpp.

938 {
939  if (s.Empty()) return;
940  if (s[0] == '<') {
941  int i = s.SearchCh('>'); IAssert(i > 0);
943  s = s.GetSubStr(i + 1, s.Len() - 1); s.ToTrunc(); }
945  IAssert(dec.Len() > 0);
948 }
#define IAssert(Cond)
Definition: bd.h:262
int SearchCh(const char &Ch, const int &BChN=0) const
Definition: dt.cpp:1043
int Len() const
Definition: dt.h:487
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TStr GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:811
static void ParseCodePointList(const TStr &s, TIntV &dest, bool ClrDestP=true)
Definition: unicode.h:1697
TIntV decompositions
Definition: unicode.h:1266
bool Empty() const
Definition: dt.h:488
TStr & ToTrunc()
Definition: dt.cpp:770
int decompOffset
Definition: unicode.h:1023
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
int flags
Definition: unicode.h:1025
TSizeTy AddV(const TVec< TVal, TSizeTy > &ValV)
Adds the elements of the vector ValV to the end of the vector.
Definition: ds.h:1056
template<class TSrcVec >
void TUniChDb::PrintCharNames ( FILE *  f,
const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
const TStr prefix 
) const
inline

Definition at line 1336 of file unicode.h.

1336  {
1337  if (! f) f = stdout;
1338  for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
1339  fprintf(f, "%s", prefix.CStr());
1340  int cp = src[TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ? "U+%05x" : "U+%04x "), cp);
1341  fprintf(f, " %s\n", GetCharNameS(cp).CStr()); }}
TStr GetCharNameS(const int cp) const
Definition: unicode.h:1332
char * CStr()
Definition: dt.h:476
TUniVecIdx TVecIdx
Definition: unicode.h:1260
template<class TSrcVec >
void TUniChDb::PrintCharNames ( FILE *  f,
const TSrcVec &  src,
const TStr prefix 
) const
inline

Definition at line 1342 of file unicode.h.

1342 { PrintCharNames(f, src, 0, src.Len(), prefix); }
void PrintCharNames(FILE *f, const TSrcVec &src, size_t srcIdx, const size_t srcCount, const TStr &prefix) const
Definition: unicode.h:1336
void TUniChDb::Save ( TSOut SOut) const
inline

Definition at line 1280 of file unicode.h.

1280  {
1281  h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut);
1282  inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut);
1284  SOut.SaveCs(); }
void Save(TSOut &SOut) const
Definition: dt.cpp:1694
void Save(TSOut &SOut) const
Definition: hash.h:141
TIntIntVH specialCasingUpper
Definition: unicode.h:1271
TIntIntVH specialCasingLower
Definition: unicode.h:1271
void Save(TSOut &SOut) const
Definition: ds.h:903
THash< TIntPr, TInt > inverseDec
Definition: unicode.h:1267
void SaveCs()
Definition: fl.h:171
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
TIntV decompositions
Definition: unicode.h:1266
TStrIntH scripts
Definition: unicode.h:1265
TStrPool charNames
Definition: unicode.h:1264
void Save(TSOut &SOut) const
Definition: unicode.h:287
TUniCaseFolding caseFolding
Definition: unicode.h:1268
TIntIntVH specialCasingTitle
Definition: unicode.h:1271
void TUniChDb::SaveBin ( const TStr fnBinUcd)

Definition at line 1362 of file unicode.cpp.

1363 {
1364  PSOut SOut=TFOut::New(fnBinUcd);
1365  Save(*SOut);
1366 }
static PSOut New(const TStr &FNm, const bool &Append=false)
Definition: fl.cpp:442
void Save(TSOut &SOut) const
Definition: unicode.h:1280
Definition: bd.h:196
template<class TSrcVec >
void TUniChDb::SbEx_Add ( const TSrcVec &  v)
inline

Definition at line 1490 of file unicode.h.

1490 { sbExTrie.Add(v); }
TUniTrie< TInt > sbExTrie
Definition: unicode.h:1461
void Add(const TSrcVec &src, const size_t srcIdx, const size_t srcCount)
Definition: unicode.h:1220
void TUniChDb::SbEx_Add ( const TStr s)
inline

Definition at line 1492 of file unicode.h.

1492  {
1493  TIntV v; int n = s.Len(); v.Gen(n); for (int i = 0; i < n; i++) v[i] = int(uchar(s[i])); SbEx_Add(v); }
int Len() const
Definition: dt.h:487
unsigned char uchar
Definition: bd.h:10
void SbEx_Add(const TSrcVec &v)
Definition: unicode.h:1490
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:495
int TUniChDb::SbEx_AddMulti ( const TStr words,
const bool  wordsAreUtf8 = true 
)
inline

Definition at line 1495 of file unicode.h.

1495  { TStrV vec; words.SplitOnAllCh('|', vec);
1496  for (int i = 0; i < vec.Len(); i++) if (wordsAreUtf8) SbEx_AddUtf8(vec[i]); else SbEx_Add(vec[i]);
1497  return vec.Len(); }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
void SbEx_AddUtf8(const TStr &s)
Definition: unicode.h:1494
void SbEx_Add(const TSrcVec &v)
Definition: unicode.h:1490
void SplitOnAllCh(const char &SplitCh, TStrV &StrV, const bool &SkipEmpty=true) const
Definition: dt.cpp:926
Vector is a sequence of TVal objects representing an array that can change in size.
Definition: ds.h:429
void TUniChDb::SbEx_AddUtf8 ( const TStr s)
inline

Definition at line 1494 of file unicode.h.

1494 { TUniCodec codec; TIntV v; codec.DecodeUtf8(s, v); SbEx_Add(v); }
void SbEx_Add(const TSrcVec &v)
Definition: unicode.h:1490
size_t DecodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2036
void TUniChDb::SbEx_Clr ( )
inline

Definition at line 1489 of file unicode.h.

1489 { sbExTrie.Clr(); }
void Clr()
Definition: unicode.h:1200
TUniTrie< TInt > sbExTrie
Definition: unicode.h:1461
void TUniChDb::SbEx_Set ( const TUniTrie< TInt > &  newTrie)
inline

Definition at line 1498 of file unicode.h.

1498 { sbExTrie = newTrie; }
TUniTrie< TInt > sbExTrie
Definition: unicode.h:1461
int TUniChDb::SbEx_SetStdEnglish ( )
inline

Definition at line 1499 of file unicode.h.

1499  {
1500  static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g.|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv";
1501  SbEx_Clr(); return SbEx_AddMulti(data, false); }
void SbEx_Clr()
Definition: unicode.h:1489
int SbEx_AddMulti(const TStr &words, const bool wordsAreUtf8=true)
Definition: unicode.h:1495
Definition: dt.h:412
void TUniChDb::Test ( const TStr basePath)

Definition at line 1377 of file unicode.cpp.

1378 {
1379  TStr fnBin = CombinePath(basePath, GetBinFn());
1380  if (true || ! TFile::Exists(fnBin))
1381  {
1382  // Test LoadTxt.
1383  LoadTxt(basePath);
1384  // Test Save.
1385  {PSOut SOut = TFOut::New(fnBin);
1386  Save(*SOut);}
1387  }
1388  // Test Load.
1389  this->~TUniChDb();
1390  new(this) TUniChDb();
1391  {PSIn SIn = TFIn::New(fnBin);
1392  Load(*SIn);}
1393  // Test the case folding.
1394  caseFolding.Test();
1395  // Test the word breaking.
1397  // Test the sentence breaking.
1398  TestFindNextWordOrSentenceBoundary(basePath, true);
1399  TestFindNextWordOrSentenceBoundary(basePath, false);
1400  // Test composition and decomposition.
1401  TestComposition(basePath);
1402  // Test the case conversions.
1404 }
static PSOut New(const TStr &FNm, const bool &Append=false)
Definition: fl.cpp:442
static TStr GetBinFn()
Definition: unicode.h:1310
void Test(const TIntV &src, const TIntV &expectedDest, const bool full, const bool turkic, FILE *f)
Definition: unicode.cpp:531
static bool Exists(const TStr &FNm)
Definition: fl.cpp:1100
void TestCaseConversions()
Definition: unicode.cpp:853
static PSIn New(const TStr &FNm)
Definition: fl.cpp:290
TUniChDb()
Definition: unicode.h:1274
void Save(TSOut &SOut) const
Definition: unicode.h:1280
void Load(TSIn &SIn)
Definition: unicode.h:1285
void TestComposition(const TStr &basePath)
Definition: unicode.cpp:745
void TestFindNextWordOrSentenceBoundary(const TStr &basePath, bool sentence)
Definition: unicode.cpp:649
TStr CombinePath(const TStr &s, const TStr &t)
Definition: unicode.cpp:32
void LoadTxt(const TStr &basePath)
Definition: unicode.cpp:1249
Definition: dt.h:412
Definition: bd.h:196
TUniCaseFolding caseFolding
Definition: unicode.h:1268
void TestWbFindNonIgnored() const
Definition: unicode.cpp:619
void TUniChDb::TestCaseConversion ( const TStr source,
const TStr trueLc,
const TStr trueTc,
const TStr trueUc,
bool  turkic,
bool  lithuanian 
)
protected

Definition at line 825 of file unicode.cpp.

828 {
829  TIntV src;
831  FILE *f = stderr;
832  for (int i = 0; i < 3; i++)
833  {
834  TCaseConversion how = (i == 0) ? ccLower : (i == 1) ? ccTitle : ccUpper;
835  const TStr &trueDestS = (how == ccLower ? trueLc : how == ccTitle ? trueTc : trueUc);
836  TIntV trueDest; TUcdFileReader::ParseCodePointList(trueDestS, trueDest);
837  TIntV dest;
838  GetCaseConverted(src, 0, src.Len(), dest, true, how, turkic, lithuanian);
839  bool ok = (dest.Len() == trueDest.Len());
840  if (ok) for (int i = 0; i < dest.Len() && ok; i++) ok = ok && (dest[i] == trueDest[i]);
841  if (ok) continue;
842  fprintf(f, "%s(", (how == ccLower ? "toLowercase" : how == ccTitle ? "toTitlecase" : "toUppercase"));
843  for (int i = 0; i < src.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(src[i]));
844  fprintf(f, ")\nCorrect: (");
845  for (int i = 0; i < trueDest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(trueDest[i]));
846  fprintf(f, ")\nOur output:(");
847  for (int i = 0; i < dest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(dest[i]));
848  fprintf(f, ")\n");
849  IAssert(ok);
850  }
851 }
#define IAssert(Cond)
Definition: bd.h:262
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
enum TUniChDb::TCaseConversion_ TCaseConversion
static void ParseCodePointList(const TStr &s, TIntV &dest, bool ClrDestP=true)
Definition: unicode.h:1697
Definition: dt.h:412
void GetCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const
Definition: unicode.h:2817
void TUniChDb::TestCaseConversions ( )
protected

Definition at line 853 of file unicode.cpp.

854 {
855  // Because no thorough case-conversion test files have been provided as part
856  // of the Unicode standard, we'll have to test things on a few test cases of our own.
857  // - First, test some unconditional special mappings, such as 'ss', 'ffl', 'dz', etc.
858  const TStr F = "0046 ", L = "004C ", S = "0053 ", T = "0054 ", W = "0057 ";
859  const TStr f = "0066 ", l = "006c ", s = "0073 ", t = "0074 ", w = "0077 ";
860  const TStr ss = "00df ", ffl = "fb04 ", longs = "017f ", longst = "fb05 ", wRing = "1e98 ", Ring = "030a ";
861  const TStr DZ = "01c4 ", Dz = "01c5 ", dz = "01c6 ";
862  const TStr space = "0020 ", Grave = "0300 ";
864  F + L + s + t + space + Dz + w + T + ss + wRing + space + longs + DZ + space + dz + longst, // source
865  f + l + s + t + space + dz + w + t + ss + wRing + space + longs + dz + space + dz + longst, // lowercase
866  F + l + s + t + space + Dz + w + t + ss + wRing + space + S + dz + space + Dz + longst, // titlecase
867  F + L + S + T + space + DZ + W + T + S + S + W + Ring + space + S + DZ + space + DZ + S + T, // uppercase
868  false, false);
869  // - Dotted I, dotless i, etc., but with turkic == false.
870  const TStr I = "0049 ", J = "004a ", i = "0069 ", j = "006a ", iDotless = "0131 ", IDot = "0130 ", DotA = "0307 ";
872  s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + s, // source
873  s + i + t + i + w + iDotless + f + i + DotA + l + space + iDotless + DotA + f + i + DotA + s, // lowercase
874  S + i + t + i + w + iDotless + f + i + DotA + l + space + I + DotA + f + i + DotA + s, // titlecase
875  S + I + T + I + W + I + F + IDot + L + space + I + DotA + F + I + DotA + S, // uppercase
876  false, false);
877  // - Sigma (final vs. non-final forms).
878  const TStr Sigma = "03a3 ", sigma = "03c3 ", fsigma = "03c2 ";
880  Sigma + s + space + s + Sigma + space + s + Sigma + s + space + Sigma + S + Sigma + space + Sigma, // source
881  sigma + s + space + s + fsigma + space + s + sigma + s + space + sigma + s + fsigma + space + sigma, // lowercase
882  Sigma + s + space + S + fsigma + space + S + sigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase
883  Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
884  false, false);
886  sigma + s + space + s + sigma + space + s + sigma + s + space + sigma + S + sigma + space + sigma, // source
887  sigma + s + space + s + sigma + space + s + sigma + s + space + sigma + s + sigma + space + sigma, // lowercase
888  Sigma + s + space + S + sigma + space + S + sigma + s + space + Sigma + s + sigma + space + Sigma, // titlecase
889  Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
890  false, false);
892  fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + S + fsigma + space + fsigma, // source
893  fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + s + fsigma + space + fsigma, // lowercase
894  Sigma + s + space + S + fsigma + space + S + fsigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase
895  Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
896  false, false);
897  const TStr nonSA = "0315 0321 0322 "; // characters that are neither ccStarter nor ccAbove
898  // Special case mappings for Turkic languages:
899  // - After_I
901  s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + s, // source
902  s + iDotless + t + i + w + iDotless + f + i + l + space + iDotless + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // lowercase
903  S + iDotless + t + i + w + iDotless + f + i + l + space + I + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // titlecase
904  S + I + T + IDot + W + I + F + IDot + L + space + I + DotA + F + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + S, // uppercase
905  true, false); // turkic
906  // - Not_Before_Dot
908  I + Grave + t + I + DotA + f + I + nonSA + DotA + j + space + I + nonSA + DotA + space + I + Grave + t, // source
909  iDotless + Grave + t + i + f + i + nonSA + j + space + i + nonSA + space + iDotless + Grave + t, // lowercase
910  I + Grave + t + i + f + i + nonSA + j + space + I + nonSA + DotA + space + I + Grave + t, // titlecase
911  I + Grave + T + I + DotA + F + I + nonSA + DotA + J + space + I + nonSA + DotA + space + I + Grave + T, // uppercase
912  true, false); // turkic
913  // Special case mappings for Lithuanian:
914  // - After_Soft_Dotted [note: I + DotA turns into i + DotA + DotA when lowercasing due to More_Above]
916  i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + I + DotA + t + DotA + i + DotA + Grave, // source
917  i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // lowercase
918  I + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // titlecase
919  I + T + I + Grave + DotA + F + I + DotA + F + I + nonSA + I + DotA + T + DotA + I + Grave, // uppercase
920  false, true); // lithuanian
921  // - More_Above [note: j + DotA turns into just J when uppercasing due to After_Soft_Dotted]
923  J + Grave + space + J + nonSA + DotA + space + j + Grave + space + j + DotA + space + J + nonSA + J + nonSA + Grave + space + j + nonSA, // source
924  j + DotA + Grave + space + j + DotA + nonSA + DotA + space + j + Grave + space + j + DotA + space + j + nonSA + j + DotA + nonSA + Grave + space + j + nonSA, // lowercase
925  J + Grave + space + J + nonSA + DotA + space + J + Grave + space + J + space + J + nonSA + j + DotA + nonSA + Grave + space + J + nonSA, // titlecase
926  J + Grave + space + J + nonSA + DotA + space + J + Grave + space + J + space + J + nonSA + J + nonSA + Grave + space + J + nonSA, // uppercase
927  false, true); // lithuanian
928  // SoftDotted [^ Starter Above]* 0307 --(uc,tc)--> without 0307
929  // SoftDotted [^ Starter Above]* 0307 --(
930  //TestCaseConversion("", "", "", "", false, false);
931 }
void TestCaseConversion(const TStr &source, const TStr &trueLc, const TStr &trueTc, const TStr &trueUc, bool turkic, bool lithuanian)
Definition: unicode.cpp:825
Definition: dt.h:412
void TUniChDb::TestComposition ( const TStr basePath)
protected

Definition at line 745 of file unicode.cpp.

746 {
747  TUcdFileReader reader; TStrV fields; int nLines = 0;
748  reader.Open(CombinePath(basePath, GetNormalizationTestFn()));
749  bool inPart1 = false; TIntH testedInPart1;
750  while (reader.GetNextLine(fields))
751  {
752  nLines += 1;
753  if (fields.Len() == 1) {
754  IAssert(fields[0].IsPrefix("@Part"));
755  inPart1 = (fields[0] == "@Part1"); continue; }
756  IAssert(fields.Len() == 6);
757  IAssert(fields[5].Len() == 0);
758  TIntV c1, c2, c3, c4, c5;
759  reader.ParseCodePointList(fields[0], c1);
760  reader.ParseCodePointList(fields[1], c2);
761  reader.ParseCodePointList(fields[2], c3);
762  reader.ParseCodePointList(fields[3], c4);
763  reader.ParseCodePointList(fields[4], c5);
764  TIntV v;
765 #define AssE_(v1, v2, expl) AssertEq(v1, v2, TStr(expl) + " (line " + TInt::GetStr(nLines) + ")", 0)
766 #define NFC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFC(" #operand ")")
767 #define NFD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFD(" #operand ")")
768 #define NFKC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKC(" #operand ")")
769 #define NFKD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKD(" #operand ")")
770  // NFD:
771  NFD_(c3, c1); // c3 == NFD(c1)
772  NFD_(c3, c2); // c3 == NFD(c2)
773  NFD_(c3, c3); // c3 == NFD(c3)
774  NFD_(c5, c4); // c5 == NFD(c4)
775  NFD_(c5, c5); // c5 == NFD(c5)
776  // NFC:
777  NFC_(c2, c1); // c2 == NFC(c1)
778  NFC_(c2, c2); // c2 == NFC(c2)
779  NFC_(c2, c3); // c2 == NFC(c3)
780  NFC_(c4, c4); // c4 == NFC(c4)
781  NFC_(c4, c5); // c4 == NFC(c5)
782  // NFKD:
783  NFKD_(c5, c1); // c5 == NFKD(c1)
784  NFKD_(c5, c2); // c5 == NFKD(c2)
785  NFKD_(c5, c3); // c5 == NFKD(c3)
786  NFKD_(c5, c4); // c5 == NFKD(c4)
787  NFKD_(c5, c5); // c5 == NFKD(c5)
788  // NFKC:
789  NFKC_(c4, c1); // c4 == NFKC(c1)
790  NFKC_(c4, c2); // c4 == NFKC(c2)
791  NFKC_(c4, c3); // c4 == NFKC(c3)
792  NFKC_(c4, c4); // c4 == NFKC(c4)
793  NFKC_(c4, c5); // c4 == NFKC(c5)
794  //
795  if (inPart1) {
796  IAssert(c1.Len() == 1);
797  testedInPart1.AddKey(c1[0]); }
798  }
799  reader.Close();
800  // Test other individual codepoints that were not mentioned in part 1.
801  int nOther = 0;
802  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
803  {
804  const int cp = h.GetKey(i), nLines = -1;
805  if (testedInPart1.IsKey(cp)) continue;
806  TIntV x, v; x.Add(cp);
807  NFC_(x, x); // x == NFC(x)
808  NFD_(x, x); // x == NFD(x)
809  NFKC_(x, x); // x == NFKC(x)
810  NFKD_(x, x); // x == NFKD(x)
811  nOther += 1;
812  }
813 #undef AssE_
814 #undef NFC_
815 #undef NFD_
816 #undef NFKC_
817 #undef NFKD_
818  printf("TUniChDb::TestComposition: %d lines processed + %d other individual codepoints.\n", nLines, nOther);
819 }
#define IAssert(Cond)
Definition: bd.h:262
#define NFC_(cmpWith, operand)
#define NFD_(cmpWith, operand)
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
static TStr GetNormalizationTestFn()
Definition: unicode.h:1309
bool FNextKeyId(int &KeyId) const
Definition: hash.h:436
int FFirstKeyId() const
Definition: hash.h:236
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int AddKey(const TKey &Key)
Definition: hash.h:331
TStr CombinePath(const TStr &s, const TStr &t)
Definition: unicode.cpp:32
#define NFKC_(cmpWith, operand)
#define NFKD_(cmpWith, operand)
bool IsKey(const TKey &Key) const
Definition: hash.h:216
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:210
Vector is a sequence of TVal objects representing an array that can change in size.
Definition: ds.h:429
void TUniChDb::TestFindNextWordOrSentenceBoundary ( const TStr basePath,
bool  sentence 
)
protected

Definition at line 649 of file unicode.cpp.

650 {
651  TUcdFileReader reader; TStrV fields;
652  reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), (sentence ? GetSentenceBreakTestFn() : GetWordBreakTestFn())));
653  int nLines = 0; TRnd rnd = TRnd(123);
654  while (reader.GetNextLine(fields))
655  {
656  nLines += 1;
657  IAssert(fields.Len() == 1);
658  TStrV parts; fields[0].SplitOnWs(parts);
659  const int n = parts.Len(); IAssert((n % 2) == 1);
660  TIntV chars; TBoolV isBreak, isPredicted, isPredicted2;
661  // Each line is a sequence of codepoints, with a \times or \div in between each
662  // pair of codepoints (as well as at the beginning and the end of the sequence) to
663  // indicate whether a boundary exists there or not.
664  for (int i = 0; i < n; i++)
665  {
666  const TStr& s = parts[i];
667  if ((i % 2) == 0) {
668  if (s == "\xc3\x97") // multiplication sign (U+00D7) in UTF-8
669  isBreak.Add(false);
670  else if (s == "\xc3\xb7") // division sign (U+00F7) in UTF-8
671  isBreak.Add(true);
672  else FailR(s.CStr()); }
673  else chars.Add(reader.ParseCodePoint(s));
674  }
675  const int m = n / 2; IAssert(chars.Len() == m); IAssert(isBreak.Len() == m + 1);
676  IAssert(isBreak[0]); IAssert(isBreak[m]);
677  isPredicted.Gen(m + 1); isPredicted.PutAll(false);
678  if (AlwaysFalse()) { printf("%3d", nLines); for (int i = 0; i < m; i++) printf(" %04x", int(chars[i])); printf("\n"); }
679  // We'll insert a few random characters at the beginning of the sequence
680  // so that srcPos doesn't always begin at 0.
681  for (int nBefore = 0; nBefore < 5; nBefore++)
682  {
683  TIntV chars2; for (int i = 0; i < nBefore; i++) chars2.Add(0, rnd.GetUniDevInt(0x10ffff + 1));
684  chars2.AddV(chars);
685  // Use FindNextBoundary to find all the word boundaries.
686  size_t position = (nBefore > 0 ? nBefore - 1 : nBefore); size_t prevPosition = position;
687  while (sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position))
688  {
689  IAssert(prevPosition < position);
690  IAssert(position <= size_t(nBefore + m));
691  isPredicted[int(position) - nBefore] = true;
692  prevPosition = position;
693  }
694  IAssert(position == size_t(nBefore + m));
695  if (sentence) FindSentenceBoundaries(chars2, nBefore, m, isPredicted2);
696  else FindWordBoundaries(chars2, nBefore, m, isPredicted2);
697  IAssert(isPredicted2.Len() == m + 1);
698  bool ok = true;
699  // If we start at 0, the word boundary at the beginning of the sequence was
700  // not found explicitly, so we'll add it now.
701  if (nBefore == 0) isPredicted[0] = true;
702  // Compare the predicted and the true boundaries.
703  for (int i = 0; i <= m; i++) {
704  if (isBreak[i] != isPredicted[i]) ok = false;
705  IAssert(isPredicted2[i] == isPredicted[i]); }
706  FILE *f = stderr;
707  if (! ok)
708  {
709  fprintf(f, "\nError in line %d:\n", nLines);
710  fprintf(f, "True: ");
711  for (int i = 0; i <= m; i++) {
712  fprintf(f, "%s ", (isBreak[i] ? "|" : "."));
713  if (i < m) fprintf(f, "%04x ", int(chars[i + nBefore])); }
714  fprintf(f, "\nPredicted: ");
715  for (int i = 0; i <= m; i++) {
716  fprintf(f, "%s ", (isPredicted[i] ? "|" : "."));
717  if (i < m) {
718  const int cp = chars[i + nBefore];
719  TStr s = sentence ? TUniChInfo::GetSbFlagsStr(GetSbFlags(cp)) : TUniChInfo::GetWbFlagsStr(GetWbFlags(cp));
720  if (IsWbIgnored(cp)) s = "*" + s;
721  fprintf(f, "%4s ", s.CStr()); }}
722  fprintf(f, "\n");
723  Fail;
724  }
725  // Test FindNextBoundary if we start in the middle of the sequence,
726  // i.e. not at an existing boundary.
727  for (int i = 0; i < m; i++) {
728  position = i + nBefore; bool ok = sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position);
729  IAssert(ok); // at the very least, there should be the 'boundary' at nBefore + m
730  IAssert(size_t(i + nBefore) < position); IAssert(position <= size_t(nBefore + m));
731  position -= nBefore;
732  for (int j = i + 1; j < int(position); j++)
733  IAssert(! isBreak[j]);
734  IAssert(isBreak[int(position)]); }
735  }
736  }
737  reader.Close();
738  printf("TUniChDb::TestFindNext%sBoundary: %d lines processed.\n", (sentence ? "Sentence" : "Word"), nLines);
739 }
#define IAssert(Cond)
Definition: bd.h:262
Definition: dt.h:11
int GetWbFlags(const int cp) const
Definition: unicode.h:1357
#define Fail
Definition: bd.h:238
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
int GetSbFlags(const int cp) const
Definition: unicode.h:1359
bool FindNextSentenceBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2636
void PutAll(const TVal &Val)
Sets all elements of the vector to value Val.
Definition: ds.h:1166
static TStr GetSentenceBreakTestFn()
Definition: unicode.h:1307
TStr GetSbFlagsStr() const
Definition: unicode.h:1130
bool FindNextWordBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2483
#define FailR(Reason)
Definition: bd.h:240
static TStr GetWordBreakTestFn()
Definition: unicode.h:1305
static bool IsWbIgnored(const TUniChInfo &ci)
Definition: unicode.h:1419
TStr CombinePath(const TStr &s, const TStr &t)
Definition: unicode.cpp:32
Definition: dt.h:412
void FindWordBoundaries(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
Definition: unicode.h:2561
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:495
void FindSentenceBoundaries(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
Definition: unicode.h:2793
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
char * CStr()
Definition: dt.h:476
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
bool AlwaysFalse()
Definition: unicode.h:3227
Vector is a sequence of TVal objects representing an array that can change in size.
Definition: ds.h:429
static TStr GetAuxiliaryDir()
Definition: unicode.h:1304
TSizeTy AddV(const TVec< TVal, TSizeTy > &ValV)
Adds the elements of the vector ValV to the end of the vector.
Definition: ds.h:1056
void TUniChDb::TestWbFindNonIgnored ( const TIntV src) const
protected

Definition at line 579 of file unicode.cpp.

580 {
581  int n = src.Len();
582  TBoolV isIgnored; isIgnored.Gen(n);
583  for (int i = 0; i < n; i++) isIgnored[i] = IsWbIgnored(src[i]);
584  TIntV prevNonIgnored, nextNonIgnored, curOrNextNonIgnored;
585  prevNonIgnored.Gen(n); nextNonIgnored.Gen(n); curOrNextNonIgnored.Gen(n);
586  FILE *f = 0; // stderr;
587  for (int srcIdx = 0; srcIdx < n; srcIdx++) for (int srcLen = 1; srcLen < n - srcIdx; srcLen++)
588  {
589  int prev = -1;
590  for (int i = 0; i < srcLen; i++) {
591  prevNonIgnored[i] = prev;
592  if (! isIgnored[srcIdx + i]) prev = srcIdx + i; }
593  int next = srcIdx + srcLen;
594  for (int i = srcLen - 1; i >= 0; i--) {
595  nextNonIgnored[i] = next;
596  if (! isIgnored[srcIdx + i]) next = srcIdx + i;
597  curOrNextNonIgnored[i] = next; }
598  if (f) {
599  fprintf(f, "\nIndex: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", srcIdx + i);
600  fprintf(f, "\nNonIgn: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %s", (isIgnored[srcIdx + i] ? " ." : " Y"));
601  fprintf(f, "\nPrevNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(prevNonIgnored[i]));
602  fprintf(f, "\nNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(nextNonIgnored[i]));
603  fprintf(f, "\nCurNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(curOrNextNonIgnored[i]));
604  fprintf(f, "\n"); }
605  for (int i = 0; i < srcLen; i++)
606  {
607  size_t s;
608  s = size_t(srcIdx + i); WbFindNextNonIgnored(src, s, size_t(srcIdx + srcLen));
609  IAssert(s == size_t(nextNonIgnored[i]));
610  s = size_t(srcIdx + i); WbFindCurOrNextNonIgnored(src, s, size_t(srcIdx + srcLen));
611  IAssert(s == size_t(curOrNextNonIgnored[i]));
612  s = size_t(srcIdx + i); bool ok = WbFindPrevNonIgnored(src, size_t(srcIdx), s);
613  if (prevNonIgnored[i] < 0) { IAssert(! ok); IAssert(s == size_t(srcIdx)); }
614  else { IAssert(ok); IAssert(s == size_t(prevNonIgnored[i])); }
615  }
616  }
617 }
#define IAssert(Cond)
Definition: bd.h:262
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
void WbFindCurOrNextNonIgnored(const TSrcVec &src, size_t &position, const size_t srcEnd) const
Definition: unicode.h:1422
void WbFindNextNonIgnored(const TSrcVec &src, size_t &position, const size_t srcEnd) const
Definition: unicode.h:1425
bool WbFindPrevNonIgnored(const TSrcVec &src, const size_t srcStart, size_t &position) const
Definition: unicode.h:1434
static bool IsWbIgnored(const TUniChInfo &ci)
Definition: unicode.h:1419
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:495
void TUniChDb::TestWbFindNonIgnored ( ) const
protected

Definition at line 619 of file unicode.cpp.

620 {
621  TIntV chIgnored, chNonIgnored;
622  FILE *f = 0; // stderr;
623  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) {
624  const int cp = h.GetKey(i); const TUniChInfo& ci = h[i];
625  if (f) fprintf(f, "%04x: flags %08x props %08x %08x script \"%s\"\n", cp,
626  ci.flags, ci.properties, ci.propertiesX, GetScriptName(ci.script).CStr());
627  (IsWbIgnored(h[i]) ? chIgnored : chNonIgnored).Add(h.GetKey(i));
628  }
629  chIgnored.Sort(); chNonIgnored.Sort();
630  printf("TUniChDb::TestWbNonIgnored: %d ignored, %d nonignored chars.\n", chIgnored.Len(), chNonIgnored.Len());
631  TRnd rnd = TRnd(123);
632  for (int iter = 0; iter <= 50; iter++)
633  {
634  int percIgnored = 2 * iter;
635  for (int n = 0; n <= 20; n++)
636  {
637  // Prepare a random sequence of 'n' codepoints.
638  TIntV v; v.Gen(n);
639  for (int i = 0; i < n; i++) {
640  TIntV& chars = (rnd.GetUniDevInt(100) < percIgnored) ? chIgnored : chNonIgnored;
641  int j = rnd.GetUniDevInt(chars.Len());
642  v.Add(chars[j]); }
643  // Run the tests with this sequence.
644  TestWbFindNonIgnored(v);
645  }
646  }
647 }
Definition: dt.h:11
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
const TStr & GetScriptName(const int scriptId) const
Definition: unicode.h:1321
int propertiesX
Definition: unicode.h:1027
void Sort(const bool &Asc=true)
Sorts the elements of the vector.
Definition: ds.h:1254
bool FNextKeyId(int &KeyId) const
Definition: hash.h:436
int FFirstKeyId() const
Definition: hash.h:236
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int properties
Definition: unicode.h:1026
static bool IsWbIgnored(const TUniChInfo &ci)
Definition: unicode.h:1419
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:495
signed char script
Definition: unicode.h:1021
char * CStr()
Definition: dt.h:476
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
int flags
Definition: unicode.h:1025
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:210
void TestWbFindNonIgnored() const
Definition: unicode.cpp:619
template<typename TSrcVec >
void TUniChDb::ToCaseFolded ( TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
const bool  turkic = false 
) const
inline

Definition at line 1636 of file unicode.h.

1636 { caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); }
TUniCaseFolding caseFolding
Definition: unicode.h:1268
void FoldInPlace(TSrcVec &src, size_t srcIdx, const size_t srcCount, const bool turkic) const
Definition: unicode.h:307
template<typename TSrcVec >
void TUniChDb::ToCaseFolded ( TSrcVec &  src,
const bool  turkic = false 
) const
inline

Definition at line 1637 of file unicode.h.

1637 { ToCaseFolded(src, 0, src.Len(), turkic); }
void ToCaseFolded(TSrcVec &src, size_t srcIdx, const size_t srcCount, const bool turkic=false) const
Definition: unicode.h:1636
template<typename TSrcVec >
void TUniChDb::ToSimpleCaseConverted ( TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
const TCaseConversion  how 
) const

Definition at line 3072 of file unicode.h.

3073 {
3074  bool seenCased = false; size_t nextWordBoundary = srcIdx;
3075  for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
3076  {
3077  const int cp = src[TVecIdx(srcIdx)];
3078  int i = h.GetKeyId(cp); if (i < 0) continue;
3079  const TUniChInfo &ci = h[i];
3080  // With titlecasing, the first cased character of each word must be put into titlecase,
3081  // all others into lowercase. This is what the howHere variable is for.
3082  TUniChDb::TCaseConversion howHere;
3083  if (how != ccTitle) howHere = how;
3084  else {
3085  if (srcIdx == nextWordBoundary) { // A word starts/ends here.
3086  seenCased = false;
3087  size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
3088  IAssert(next > nextWordBoundary); nextWordBoundary = next; }
3089  bool isCased = IsCased(cp);
3090  if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
3091  else howHere = ccLower;
3092  }
3093  int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
3094  if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew;
3095  }
3096 }
#define IAssert(Cond)
Definition: bd.h:262
enum TUniChDb::TCaseConversion_ TCaseConversion
int simpleUpperCaseMapping
Definition: unicode.h:1022
int simpleTitleCaseMapping
Definition: unicode.h:1022
bool FindNextWordBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2483
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
int simpleLowerCaseMapping
Definition: unicode.h:1022
TUniVecIdx TVecIdx
Definition: unicode.h:1260
template<typename TSrcVec >
void TUniChDb::ToSimpleLowerCase ( TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount 
) const
inline

Definition at line 1610 of file unicode.h.

1610 { ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); }
void ToSimpleCaseConverted(TSrcVec &src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const
Definition: unicode.h:3072
template<typename TSrcVec >
void TUniChDb::ToSimpleLowerCase ( TSrcVec &  src) const
inline

Definition at line 1613 of file unicode.h.

1613 { ToSimpleLowerCase(src, 0, src.Len()); }
void ToSimpleLowerCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const
Definition: unicode.h:1610
template<typename TSrcVec >
void TUniChDb::ToSimpleTitleCase ( TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount 
) const
inline

Definition at line 1611 of file unicode.h.

1611 { ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); }
void ToSimpleCaseConverted(TSrcVec &src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const
Definition: unicode.h:3072
template<typename TSrcVec >
void TUniChDb::ToSimpleTitleCase ( TSrcVec &  src) const
inline

Definition at line 1614 of file unicode.h.

1614 { ToSimpleTitleCase(src, 0, src.Len()); }
void ToSimpleTitleCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const
Definition: unicode.h:1611
template<typename TSrcVec >
void TUniChDb::ToSimpleUpperCase ( TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount 
) const
inline

Definition at line 1609 of file unicode.h.

1609 { ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); }
void ToSimpleCaseConverted(TSrcVec &src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const
Definition: unicode.h:3072
template<typename TSrcVec >
void TUniChDb::ToSimpleUpperCase ( TSrcVec &  src) const
inline

Definition at line 1612 of file unicode.h.

1612 { ToSimpleUpperCase(src, 0, src.Len()); }
void ToSimpleUpperCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const
Definition: unicode.h:1609
template<typename TSrcVec >
void TUniChDb::WbFindCurOrNextNonIgnored ( const TSrcVec &  src,
size_t &  position,
const size_t  srcEnd 
) const
inline protected

Definition at line 1422 of file unicode.h.

1422  {
1423  while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
static bool IsWbIgnored(const TUniChInfo &ci)
Definition: unicode.h:1419
TUniVecIdx TVecIdx
Definition: unicode.h:1260
template<typename TSrcVec >
void TUniChDb::WbFindNextNonIgnored ( const TSrcVec &  src,
size_t &  position,
const size_t  srcEnd 
) const
inline protected

Definition at line 1425 of file unicode.h.

1425  {
1426  if (position >= srcEnd) return;
1427  position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
static bool IsWbIgnored(const TUniChInfo &ci)
Definition: unicode.h:1419
TUniVecIdx TVecIdx
Definition: unicode.h:1260
template<typename TSrcVec >
void TUniChDb::WbFindNextNonIgnoredS ( const TSrcVec &  src,
size_t &  position,
const size_t  srcEnd 
) const
inline protected

Definition at line 1429 of file unicode.h.

1429  {
1430  if (position >= srcEnd) return;
1431  if (IsSbSep(src[TVecIdx(position)])) { position++; return; }
1432  position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
static bool IsWbIgnored(const TUniChInfo &ci)
Definition: unicode.h:1419
TUniVecIdx TVecIdx
Definition: unicode.h:1260
template<typename TSrcVec >
bool TUniChDb::WbFindPrevNonIgnored ( const TSrcVec &  src,
const size_t  srcStart,
size_t &  position 
) const
inline protected

Definition at line 1434 of file unicode.h.

1434  {
1435  if (position <= srcStart) return false;
1436  while (position > srcStart) {
1437  position--; if (! IsWbIgnored(src[TVecIdx(position)])) return true; }
1438  return false; }
static bool IsWbIgnored(const TUniChInfo &ci)
Definition: unicode.h:1419
TUniVecIdx TVecIdx
Definition: unicode.h:1260

Friends And Related Function Documentation

friend class TUniCaseFolding
friend

Definition at line 1617 of file unicode.h.

Member Data Documentation

TUniCaseFolding TUniChDb::caseFolding

Definition at line 1268 of file unicode.h.

TStrPool TUniChDb::charNames

Definition at line 1264 of file unicode.h.

TIntV TUniChDb::decompositions

Definition at line 1266 of file unicode.h.

THash<TInt, TUniChInfo> TUniChDb::h

Definition at line 1263 of file unicode.h.

THash<TIntPr, TInt> TUniChDb::inverseDec

Definition at line 1267 of file unicode.h.

TUniTrie<TInt> TUniChDb::sbExTrie
protected

Definition at line 1461 of file unicode.h.

TStrIntH TUniChDb::scripts

Definition at line 1265 of file unicode.h.

int TUniChDb::scriptUnknown

Definition at line 1272 of file unicode.h.

TIntIntVH TUniChDb::specialCasingLower

Definition at line 1271 of file unicode.h.

TIntIntVH TUniChDb::specialCasingTitle

Definition at line 1271 of file unicode.h.

TIntIntVH TUniChDb::specialCasingUpper

Definition at line 1271 of file unicode.h.


The documentation for this class was generated from the following files: