SNAP Library 6.0, User Reference  2020-12-09 16:24:20
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
unicodestring.cpp
Go to the documentation of this file.
1 // Unicode-Definition
3 
5 
7  return TSysProc::GetExeFNm().GetFPath()+"UnicodeDef.Bin";
8 }
9 
11 // Unicode-String
12 TUStr::TUStr(const TStr& Str){
15  TIntV NfcUniChV; TUnicodeDef::GetDef()->Decompose(UniChV, NfcUniChV, true);
16  UniChV=NfcUniChV;
17 }
18 
21 }
22 
25 }
26 
28  TIntV StarterUniChV;
29  TUnicodeDef::GetDef()->ExtractStarters(UniChV, StarterUniChV);
30  TUnicodeDef::GetDef()->Decompose(StarterUniChV, UniChV, true);
31 }
32 
33 void TUStr::GetWordBoundPV(TBoolV& WordBoundPV){
35 }
36 
// Splits this string into words and stores them in WordUStrV.
// WordUStrV is cleared first, then receives one TUStr per extracted word, in
// the order the words occur. Word-boundary flags come from GetWordBoundPV();
// flag [i+1] is read as "there is a boundary right after character i".
// Characters are accumulated into a pending word that is flushed at every
// boundary and at the end of the string. A character that sits alone between
// two boundaries is kept only if IsAlphabetic() accepts it, so isolated
// non-alphabetic characters produce no word.
37 void TUStr::GetWordUStrV(TUStrV& WordUStrV){
38  // start from an empty result vector
39  WordUStrV.Clr();
40  // fetch the boundary flags; expected: one more flag than characters, last flag set
41  TBoolV WordBoundPV; GetWordBoundPV(WordBoundPV);
42  IAssert(Len()==WordBoundPV.Len()-1);
43  IAssert((WordBoundPV.Len()>0)&&(WordBoundPV.Last()));
44  // walk the characters plus one extra step (UniChN==UniChs) that flushes the final word
45  int UniChs=Len(); TIntV WordUniChV;
46  for (int UniChN=0; UniChN<=UniChs; UniChN++){
47  if ((UniChN==UniChs)||(WordBoundPV[UniChN+1])){ // past-the-end step, or boundary after this char; the end test comes first so flag [UniChs+1] is never read
48  if (UniChN<UniChs){ // a real character, not the past-the-end step
49  // keep it if it closes a pending word, or if it is a lone alphabetic character
50  if ((!WordUniChV.Empty())||(IsAlphabetic(UniChV[UniChN]))){
51  WordUniChV.Add(UniChV[UniChN]); // append to the pending word
52  }
53  }
54  if (!WordUniChV.Empty()){ // flush the pending word, if there is one
55  TUStr WordUStr(WordUniChV); // build the word from the collected code points
56  WordUStrV.Add(WordUStr); // append it to the result
57  WordUniChV.Clr(false); // empty the pending word (DoDel=false; presumably keeps the buffer - TODO confirm)
58  }
59  } else {
60  // no boundary after this character: it belongs to the pending word (no alphabetic check here)
61  WordUniChV.Add(UniChV[UniChN]);
62  }
63  }
64 }
65 
68  return Str;
69 }
70 
72  TIntV UniChV1; TIntV UniChV2;
74  TUnicodeDef::GetDef()->Decompose(UniChV1, UniChV2, true);
75  TStr Str=TUnicodeDef::GetDef()->EncodeUtf8Str(UniChV2);
76  return Str;
77 }
78 
80  TIntV UniChV1; TIntV UniChV2; TIntV UniChV3;
82  TUnicodeDef::GetDef()->ExtractStarters(UniChV1, UniChV2);
83  TUnicodeDef::GetDef()->Decompose(UniChV2, UniChV3, true);
84  TStr Str=TUnicodeDef::GetDef()->EncodeUtf8Str(UniChV3);
85  return Str;
86 }
87 
// Returns the script id for the script name ScriptNm.
// Forwards to ucd.GetScriptByName() on the shared Unicode definition
// (TUnicodeDef::GetDef()); the result for an unknown name is whatever that
// call yields.
88 int TUStr::GetScriptId(const TStr& ScriptNm){
89  return TUnicodeDef::GetDef()->ucd.GetScriptByName(ScriptNm);
90 }
91 
// Returns the script name for the script id ScriptId.
// Forwards to ucd.GetScriptName() on the shared Unicode definition and
// returns the result by value.
92 TStr TUStr::GetScriptNm(const int& ScriptId){
93  return TUnicodeDef::GetDef()->ucd.GetScriptName(ScriptId);
94 }
95 
// Returns the script id of the code point UniCh.
// Forwards to ucd.GetScript() on the shared Unicode definition.
96 int TUStr::GetChScriptId(const int& UniCh){
97  return TUnicodeDef::GetDef()->ucd.GetScript(UniCh);
98 }
99 
// Returns the script name of the code point UniCh.
// Composes GetChScriptId() (code point -> script id) with GetScriptNm()
// (script id -> name).
100 TStr TUStr::GetChScriptNm(const int& UniCh){
101  return GetScriptNm(GetChScriptId(UniCh));
102 }
103 
// Returns the character name of the code point UniCh.
// The name is obtained from ucd.GetCharNameS() on the shared Unicode
// definition and returned as a TStr.
104 TStr TUStr::GetChNm(const int& UniCh){
105  TStr UniChNm(TUnicodeDef::GetDef()->ucd.GetCharNameS(UniCh));
106  return UniChNm;
107 }
108 
// Returns a bracketed, comma-separated list of the properties that hold for
// the code point UniCh, e.g. "[Case,UpperCase,Alphabetic]".
// The properties are tested in the fixed order Case, UpperCase, LowerCase,
// Alphabetic, Math; when none of them holds the result is "[]".
109 TStr TUStr::GetChTypeStr(const int& UniCh){
110  TChA ChTypeChA;
111  ChTypeChA+='[';
  // each matching property appends its name followed by a comma
112  if (IsCase(UniCh)){ChTypeChA+="Case,";}
113  if (IsUpperCase(UniCh)){ChTypeChA+="UpperCase,";}
114  if (IsLowerCase(UniCh)){ChTypeChA+="LowerCase,";}
115  if (IsAlphabetic(UniCh)){ChTypeChA+="Alphabetic,";}
116  if (IsMath(UniCh)){ChTypeChA+="Math,";}
  // nothing appended: just close the bracket; otherwise overwrite the trailing comma with ']'
117  if (ChTypeChA.LastCh()=='['){ChTypeChA+=']';}
118  else {ChTypeChA[ChTypeChA.Len()-1]=']';}
119  return ChTypeChA;
120 }
121 
// Tells whether the code point UniCh is cased.
// Looks up the character info with ucd.IsGetChInfo(); returns
// ChInfo.IsCased() when the lookup succeeds and false when it does not.
122 bool TUStr::IsCase(const int& UniCh){
123  TUniChInfo ChInfo;
124  if (TUnicodeDef::GetDef()->ucd.IsGetChInfo(UniCh, ChInfo)){
125  return ChInfo.IsCased();}
126  else {return false;} // no character info available for UniCh
127 }
128 
// Tells whether the code point UniCh is uppercase.
// Looks up the character info with ucd.IsGetChInfo(); returns
// ChInfo.IsUppercase() when the lookup succeeds and false when it does not.
129 bool TUStr::IsUpperCase(const int& UniCh){
130  TUniChInfo ChInfo;
131  if (TUnicodeDef::GetDef()->ucd.IsGetChInfo(UniCh, ChInfo)){
132  return ChInfo.IsUppercase();}
133  else {return false;} // no character info available for UniCh
134 }
135 
// Tells whether the code point UniCh is lowercase.
// Looks up the character info with ucd.IsGetChInfo(); returns
// ChInfo.IsLowercase() when the lookup succeeds and false when it does not.
136 bool TUStr::IsLowerCase(const int& UniCh){
137  TUniChInfo ChInfo;
138  if (TUnicodeDef::GetDef()->ucd.IsGetChInfo(UniCh, ChInfo)){
139  return ChInfo.IsLowercase();}
140  else {return false;} // no character info available for UniCh
141 }
142 
// Tells whether the code point UniCh is alphabetic.
// Looks up the character info with ucd.IsGetChInfo(); returns
// ChInfo.IsAlphabetic() when the lookup succeeds and false when it does not.
143 bool TUStr::IsAlphabetic(const int& UniCh){
144  TUniChInfo ChInfo;
145  if (TUnicodeDef::GetDef()->ucd.IsGetChInfo(UniCh, ChInfo)){
146  return ChInfo.IsAlphabetic();}
147  else {return false;} // no character info available for UniCh
148 }
149 
// Tells whether the code point UniCh is a math character.
// Looks up the character info with ucd.IsGetChInfo(); returns
// ChInfo.IsMath() when the lookup succeeds and false when it does not.
150 bool TUStr::IsMath(const int& UniCh){
151  TUniChInfo ChInfo;
152  if (TUnicodeDef::GetDef()->ucd.IsGetChInfo(UniCh, ChInfo)){
153  return ChInfo.IsMath();}
154  else {return false;} // no character info available for UniCh
155 }
156 
157 TStr TUStr::EncodeUtf8(const int& UniCh) {
160 }
161 
#define IAssert(Cond)
Definition: bd.h:262
static bool IsAlphabetic(const int &UniCh)
void ToSimpleUpperCase(TIntV &src) const
Definition: unicode.h:1977
void ToStarterCase()
int GetScriptByName(const TStr &scriptName) const
Definition: unicode.h:1322
TStr EncodeUtf8Str(const TIntV &src) const
Definition: unicode.h:1796
static TUnicodeDef UnicodeDef
Definition: unicodestring.h:5
TStr GetFPath() const
Definition: dt.cpp:1389
static TStr GetChNm(const int &UniCh)
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
bool IsAlphabetic() const
Definition: unicode.h:1071
void ToUpperCase()
const TStr & GetScriptName(const int scriptId) const
Definition: unicode.h:1321
int Len() const
Definition: dt.h:259
int GetScript(const TUniChInfo &ci) const
Definition: unicode.h:1323
TUniChDb ucd
Definition: unicode.h:1775
static TUnicode * GetDef()
Definition: unicodestring.h:23
void ToSimpleLowerCase(TIntV &src) const
Definition: unicode.h:1978
void GetSimpleLowerCase(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1972
TStr GetStarterLowerCaseStr() const
static TStr GetChScriptNm(const int &UniCh)
static TStr GetDfFNm()
bool IsUppercase() const
Definition: unicode.h:1072
static TStr GetChTypeStr(const int &UniCh)
bool Empty() const
Tests whether the vector is empty.
Definition: ds.h:570
void FindWordBoundaries(const TIntV &src, TBoolV &dest) const
Definition: unicode.h:1907
bool IsLowercase() const
Definition: unicode.h:1073
static bool IsLowerCase(const int &UniCh)
TStr GetCharNameS(const int cp) const
Definition: unicode.h:2025
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:1022
static bool IsMath(const int &UniCh)
char LastCh() const
Definition: dt.h:281
int ExtractStarters(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1951
static int GetChScriptId(const int &UniCh)
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
static bool IsUpperCase(const int &UniCh)
static int GetScriptId(const TStr &ScriptNm)
void GetWordUStrV(TUStrV &UStrV)
void GetWordBoundPV(TBoolV &WordBoundPV)
int DecodeUtf8(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1787
Definition: dt.h:201
Definition: dt.h:412
TIntV UniChV
Definition: unicodestring.h:34
static TStr GetScriptNm(const int &ScriptId)
bool IsCased() const
Definition: unicode.h:1142
void Decompose(const TIntV &src, TIntV &dest, bool compatibility) const
Definition: unicode.h:1934
void ToLowerCase()
static TVec< TInt, int > GetV(const TInt &Val1)
Returns a vector on element Val1.
Definition: ds.h:848
TStr GetStarterStr() const
static TStr EncodeUtf8(const int &UniCh)
static void AssertUnicodeDefOk()
Definition: unicodestring.h:35
bool IsMath() const
Definition: unicode.h:1074
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
int Len() const
Definition: unicodestring.h:57
TStr GetStr() const
static bool IsCase(const int &UniCh)