SNAP Library 3.0, User Reference  2016-07-20 17:56:49
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
unicode.cpp
Go to the documentation of this file.
1 // Unicode.cpp : Defines the entry point for the console application.
2 //
3 
5 // Includes
6 //#include "unicode.h"
7 
8 //-----------------------------------------------------------------------------
9 // Private declarations of this module
10 //-----------------------------------------------------------------------------
11 
12 namespace {
13 
// NOTE(review): the class-head line (listing line 14) and the line declaring the member
// 'v' (listing line 17) are absent from this listing. Judging by the constructor name this
// is the body of TVectorBuilder2 and 'v' is presumably a TIntV -- confirm against the real file.
// Helper that collects ints into 'v' through chained uses of the overloaded comma operator.
15 {
16 public:
    // Starts the collection with the single value 'i'.
18  TVectorBuilder2(int i) { v.Add(i); }
    // Yields a copy of the values collected so far.
19  operator TIntV() const { return v; }
    // Appends 'i' and returns this builder so that further ", value" terms can follow.
20  TVectorBuilder2& operator ,(int i) { v.Add(i); return *this; }
21 };
22 
// NOTE(review): the class-head line (listing line 23) is absent from this listing;
// presumably this is the body of the TVectorBuilder class -- confirm against the real file.
// Empty starting point of a comma-built vector: converts to an empty TIntV, and the first
// ", value" hands over to a TVectorBuilder2 that holds that value.
24 {
25 public:
    // With no values appended, the result is an empty vector.
26  operator TIntV() const { return TIntV(); }
    // Returns a new TVectorBuilder2 by value; this object itself stays unchanged.
27  TVectorBuilder2 operator ,(int i) { return TVectorBuilder2(i); }
28 };
29 
31 
32 TStr CombinePath(const TStr& s, const TStr& t)
33 {
34  int n = s.Len(); if (n <= 0) return t;
35  if (s[n - 1] == '\\' || s[n - 1] == '/' || s[n - 1] == ':') return s + t;
36  return s + "\\" + t;
37 }
38 
39 void AssertEq(const TIntV& v1, const TIntV& v2, const TStr& explanation, FILE *f)
40 {
41  const int n = v1.Len();
42  bool ok = (n == v2.Len());
43  if (ok) for (int i = 0; i < n && ok; i++) ok = ok && (v1[i] == v2[i]);
44  if (! ok)
45  {
46  if (! f) f = stderr;
47  fprintf(f, "%s: [", explanation.CStr());
48  for (int i = 0; i < v1.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(v1[i]));
49  fprintf(f, "] != [");
50  for (int i = 0; i < v2.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(v2[i]));
51  fprintf(f, "]\n");
52  Fail;
53  }
54 }
55 
56 };
57 
58 //-----------------------------------------------------------------------------
59 // TUniCodec -- miscellaneous declarations
60 //-----------------------------------------------------------------------------
61 
// NOTE(review): the signature line (listing line 62) is absent from this listing;
// presumably this is the single-argument TUniCodec::GetRndUint(TRnd& rnd) overload that the
// three-argument GetRndUint calls -- confirm against the real file.
// Builds a uint from four successive draws of 'rnd', contributing 8 bits each; the first
// draw ends up in the most significant position.
63 {
64  uint u = rnd.GetUniDevUInt(256) & 0xff;
65  u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
66  u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
67  u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
68  return u;
69 }
70 
71 uint TUniCodec::GetRndUint(TRnd& rnd, uint minVal, uint maxVal)
72 {
73  if (minVal == TUInt::Mn && maxVal == TUInt::Mx) return GetRndUint(rnd);
74  uint range = maxVal - minVal + 1;
75  if (range > (uint(1) << (8 * sizeof(uint) - 1)))
76  while (true) { uint u = GetRndUint(rnd); if (u < range) return minVal + u; }
77  uint mask = 1;
78  while (mask < range) mask <<= 1;
79  mask -= 1;
80  while (true) { uint u = GetRndUint(rnd) & mask; if (u < range) return minVal + u; }
81 }
82 
// NOTE(review): the signature line (listing line 83) is absent from this listing;
// presumably this is TUniCodec::IsMachineLittleEndian(), which WordsToBytes calls -- confirm.
// Reports whether the first byte in memory of the int value 1 is 1. The answer is computed
// on the first call and kept in function-local statics for later calls.
// NOTE(review): no synchronization around the two statics is visible here.
84 {
85  static bool isLE, initialized = false;
86  if (initialized) return isLE;
87  int i = 1;
    // Look at the byte stored at the lowest address of 'i'.
88  if(*(char *)&i == 1) isLE = true;
89  else isLE = false;
90 
91  initialized = true;
92  return isLE;
93 }
94 
95 //-----------------------------------------------------------------------------
96 // TUniCodec -- UTF-8 test driver
97 //-----------------------------------------------------------------------------
98 
99 void TUniCodec::TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f)
100 {
101  TIntV dest;
102  if (f) {
103  fprintf(f, "Settings: %s %s %s replacementChar = %x\n",
104  (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
105  (strict ? "STRICT" : ""), (skipBom ? "skipBom" : ""), uint(replacementChar));
106  fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %02x" : " %x"), uint(src[i])); }
107  try
108  {
109  size_t retVal = (decode ? DecodeUtf8(src, 0, src.Len(), dest, true) : EncodeUtf8(src, 0, src.Len(), dest, true));
110  if (f) {
111  fprintf(f, "\n -> dest: "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" : " %02x"), uint(dest[i]));
112  fprintf(f, "\n expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : " %02x"), uint(expectedDest[i]));
113  fprintf(f, "\n retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
114  if (retVal != expectedRetVal)
115  printf("!!!");
116  IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
117  if (dest.Len() != expectedDest.Len())
118  printf("!!!");
119  IAssert(dest.Len() == expectedDest.Len());
120  for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]);
121  }
122  catch (TUnicodeException e)
123  {
124  if (f) {
125  fprintf(f, "\n -> expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, " %x", uint(expectedDest[i]));
126  fprintf(f, "\n exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
127  IAssert(expectedThrow);
128  }
129 }
130 
131 // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc',
132 // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected.
133 void TUniCodec::TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc)
134 {
135  TIntV src; TIntV expectedDest; int expectedRetVal = 0;
136  bool expectedAbort = false;
137  FILE *f = 0; // stderr
138  // testCaseDesc should consist of pairs or triples of characters, 'cd[e]', where:
139  // - 'c' defines the range from which the codepoint should be taken ('A'..'H', 'X'..'Z');
140  // - 'd' defines how many bytes the codepoint should be encoded with ('1'..'6');
141  // - 'e' defines how many bytes will be removed from the end of the encoded sequence for this codepoint.
142  // (absent = 0, 'a' = 1, 'b' = 2 and so on).
143  for (int i = 0; i < testCaseDesc.Len(); )
144  {
145  IAssert(i + 2 <= testCaseDesc.Len());
146  const char c = testCaseDesc[i], d = testCaseDesc[i + 1]; i += 2;
147  uint cp = 0; int nBytes = -1, minBytes = -1; bool eighties = false;
148  IAssert('1' <= d && d <= '6'); nBytes = d - '0';
149  if (c == 'A') { cp = GetRndUint(rnd, 0u, 0x7fu); minBytes = 1; } // 1 byte
150  else if (c == 'B') { cp = GetRndUint(rnd, 0x80u, 0x7ffu); minBytes = 2; } // 2 bytes
151  else if (c == 'C') { cp = GetRndUint(rnd, 0x800u, 0xffffu); minBytes = 3; } // 3 bytes
152  else if (c == 'D') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); minBytes = 4; } // 4 bytes, valid Unicode
153  else if (c == 'E') { cp = GetRndUint(rnd, 0x110000u, 0x1fffffu); minBytes = 4; } // 4 bytes, invalid Unicode
154  else if (c == 'F') { cp = GetRndUint(rnd, 0x200000u, 0x3ffffffu); minBytes = 5; } // 5 bytes
155  else if (c == 'G') { cp = GetRndUint(rnd, 0x4000000u, 0x7fffffffu); minBytes = 6; } // 6 bytes, 31 bits
156  else if (c == 'H') { cp = GetRndUint(rnd, 0x80000000u, 0xffffffffu); minBytes = 6; } // 6 bytes, 32 bits
157  else if (c == 'X') { cp = 0xfffe; minBytes = 3; }
158  else if (c == 'Y') { cp = 0xfeff; minBytes = 3; }
159  else if (c == 'Z') { eighties = true; minBytes = 1; } // insert several random 10xxxxxx bytes (= 0x80 | random(0..0x3f))
160  else Fail;
161  IAssert(nBytes >= minBytes);
162  // Process 'e'.
163  int nToDel = 0;
164  if (i < testCaseDesc.Len()) {
165  const char e = testCaseDesc[i];
166  if (e >= 'a' && e <= 'e') { i += 1; nToDel = e - 'a' + 1; }}
167  IAssert(nToDel < nBytes);
168  // Will an error occur during the decoding of this codepoint?
169  bool errHere = false;
170  if (eighties) errHere = true;
171  else if (nToDel > 0) errHere = true;
172  else if (strict && (cp >= 0x10ffff || nBytes > minBytes)) errHere = true;
173  // Update 'expectedDest' and 'expetedRetVal'.
174  if (! expectedAbort) {
175  if (! errHere) {
176  if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
177  else { expectedDest.Add(cp); expectedRetVal += 1; } }
178  else if (errorHandling == uehReplace) {
179  if (eighties) for (int j = 0; j < nBytes; j++) expectedDest.Add(replacementChar);
180  else expectedDest.Add(replacementChar); }
181  if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
182  // Update 'src'.
183  if (eighties) for (int j = 0; j < nBytes; j++) src.Add(GetRndUint(rnd, 0x80, 0xff));
184  else if (nBytes == 1) src.Add(cp);
185  else {
186  int mask = (1 << nBytes) - 1; mask <<= (8 - nBytes);
187  src.Add(mask | (uint(cp) >> (6 * (nBytes - 1))));
188  for (int j = 1; j < nBytes - nToDel; j++) src.Add(0x80 | ((cp >> (6 * (nBytes - j - 1))) & _0011_1111)); }
189  }
190  if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
191  TestUtf8(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, f);
192 }
193 
// NOTE(review): the signature line (listing line 194) is absent from this listing;
// presumably this is the argument-less TUniCodec::TestUtf8() driver -- confirm.
// Runs the UTF-8 tests for every combination of skipBom (2) x strict (2) x error-handling
// mode (4). The codec's 'strict', 'errorHandling' and 'skipBom' members are overwritten for
// each combination and are not restored in this body.
195 {
    // UTF-8 encoding of the replacement character; used below as the expected encoder output.
196  TIntV utf8ReplCh; EncodeUtf8((TVectorBuilder(), replacementChar).v, 0, 1, utf8ReplCh, true);
197  for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
198  for (int strict_ = 0; strict_ < 2; strict_++)
199  for (int errMode_ = 0; errMode_ < 4; errMode_++)
200  {
201  strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
    // The generator is re-created with the same seed for every combination of settings.
202  TRnd rnd = TRnd(123);
203  // Test DecodeUtf8 on various random UTF-8-encoded sequences.
204  for (int i = 0; i < 10; i++)
205  {
206  TestDecodeUtf8(rnd, "X3A1A2A3A4A5A6B2B3B4B5B6C3C4C5C6D4D5D6E5E6F6G6");
207  TestDecodeUtf8(rnd, "X3A5dA6d");
208  TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A1G6H6Y3X3A1");
209  TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A2G6H6Y3X3A1");
210  TestDecodeUtf8(rnd, "Y3A1B2C3D4E4F5A1G6H6Y3X3A1");
211  TestDecodeUtf8(rnd, "A1B2C3D4E4F5A1G6H6Y3X3A1");
212  TestDecodeUtf8(rnd, "G6A1A1D4E4A1B2");
213  TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2");
214  TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2D4a");
215  TestDecodeUtf8(rnd, "X3A1B2C3D5E4F5A1G6H6Y3X3A1");
216  TestDecodeUtf8(rnd, "X3A1B2C3D4E5F5A1G6H6Y3X3A1");
217  TestDecodeUtf8(rnd, "X3A1B2C3D4aE4F5A1G6H6Y3X3A1");
218  TestDecodeUtf8(rnd, "X3A1B2C3D4bE4F5A1G6H6Y3X3A1");
219  TestDecodeUtf8(rnd, "X3A2aA3aA4aA5aA6aB2aB3aB4aB5aB6aC3aC4aC5aC6aD4aD5aD6aE5aE6aF6aG6a");
220  TestDecodeUtf8(rnd, "X3A3bA4bA5bA6aB3bB4bB5bB6bC3bC4bC5bC6bD4bD5bD6bE5bE6bF6bG6b");
221  TestDecodeUtf8(rnd, "X3A4cA5cA6cB4cB5cB6cC4cC5cC6cD4cD5cD6cE5cE6cF6cG6c");
222  TestDecodeUtf8(rnd, "X3A5dA6dB5dB6dC5dC6dD5dD6dE5dE6dF6dG6d");
223  TestDecodeUtf8(rnd, "X3A6eB6eC6eD6eE6eF6eG6e");
224  }
225  // Test both DecodeUtf8 and EncodeUtf8 systematically on various characters
226  // close to powers of 2.
227  TIntV src, expectedDest, src2;
228  expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
229  for (int pow = 8; pow <= 32; pow++)
230  {
    // Range under test: [0, 2^8] for pow == 8, the top 2^8 + 1 values of uint for pow == 32,
    // and [2^pow - 2^8, 2^pow + 2^8] otherwise.
231  uint uFrom, uTo;
232  if (pow == 8) uFrom = 0, uTo = 1u << pow;
233  else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
234  else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
235  printf("%u..%u \r", uFrom, uTo);
    // No loop condition: the loop is left by the 'break' at the bottom once u == uTo,
    // so u is never incremented past uTo (which may be TUInt::Mx).
236  for (uint u = uFrom; ; u++)
237  {
    // Number of bytes used to encode 'u' (1..6).
238  int nBytes = 0;
239  if (u < (1u << 7)) nBytes = 1;
240  else if (u < (1u << 11)) nBytes = 2;
241  else if (u < (1u << 16)) nBytes = 3;
242  else if (u < (1u << 21)) nBytes = 4;
243  else if (u < (1u << 26)) nBytes = 5;
244  else nBytes = 6;
    // Hand-encode 'u' into 'src': lead byte first, then 6-bit continuation bytes.
245  src.Gen(6, nBytes);
246  if (nBytes == 1) src[0] = u;
247  else {
248  src[0] = (((1 << nBytes) - 1) << (8 - nBytes)) | (u >> (6 * (nBytes - 1)));
249  for (int i = 1; i < nBytes; i++) src[i] = 0x80 | ((u >> (6 * (nBytes - i - 1))) & _0011_1111); }
    // Only strict mode treats values above 0x10ffff as an error here.
250  bool err = (strict && u > 0x10ffff);
251  expectedDest.Reserve(1, 0);
252  if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
253  else if (! err) expectedDest.Add(u);
254  int erv = (err ? 0 : 1);
255  if (skipBom && (u == 0xfeff || u == 0xfffe)) expectedDest.Clr(), erv = 0;
256  TestUtf8(true, erv, (err && errorHandling == uehThrow), src, expectedDest, 0);
257  // We can also test the UTF-8 encoder.
    // From here on 'src' serves as the expected encoder output for the one-element input src2.
258  src2[0] = u;
259  if (err) {
260  if (errorHandling == uehReplace) src = utf8ReplCh;
261  else src.Clr(false); }
262  TestUtf8(false, (err ? 0 : 1), (err && errorHandling == uehThrow), src2, src, 0);
263  //
264  if (u == uTo) break;
265  }
266  }
267  }
268 }
269 
270 //-----------------------------------------------------------------------------
271 // TUniCodec -- UTF-16 test driver
272 //-----------------------------------------------------------------------------
273 
274 void TUniCodec::WordsToBytes(const TIntV& src, TIntV& dest)
275 {
276  dest.Clr();
277  bool isLE = IsMachineLittleEndian();
278  for (int i = 0; i < src.Len(); i++) {
279  int c = src[i] & 0xffff;
280  if (isLE) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
281  else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } }
282 }
283 
// Runs one UTF-16 conversion test twice -- once through the word-based API and once through
// the byte-based API -- and checks the outcome with IAssert.
//   decode           - true: DecodeUtf16FromWords / DecodeUtf16FromBytes;
//                      false: EncodeUtf16ToWords / EncodeUtf16ToBytes.
//   expectedRetVal   - value the conversion call is expected to return.
//   expectedThrow    - true if a TUnicodeException is expected instead of a normal return.
//   src              - input sequence (16-bit words when decoding, codepoints when encoding).
//   expectedDest     - expected output; compared byte-wise when encoding to bytes.
//   bomHandling, defaultByteOrder - passed to the decoders; the byte order also goes to the encoders.
//   insertBom        - passed to the encoders.
//   f                - optional log file; nothing is logged when it is null.
284 void TUniCodec::TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest,
285  const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom,
286  FILE *f)
287 {
    // Byte-wise versions of the input and of the expected output, for the byte-based calls.
288  TIntV srcBytes, expectedDestBytes;
289  WordsToBytes(src, srcBytes); WordsToBytes(expectedDest, expectedDestBytes);
290  TIntV dest;
291  if (f) {
    // NOTE(review): listing line 297, which should close this fprintf call (the argument for
    // the trailing %x), is absent from this listing.
292  fprintf(f, "Settings: %s %s %s %s %s replacementChar = %x \n",
293  (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
294  (strict ? "STRICT" : ""), (decode ? (skipBom ? "skipBom" : "") : (insertBom ? "insrtBom" : "")),
295  (bomHandling == bomAllowed ? "bomAllowed" : bomHandling == bomRequired ? "bomRequired" : "bomIgnored"),
296  (defaultByteOrder == boBigEndian ? "boBigEndian" : defaultByteOrder == boLittleEndian ? "boLittleEndian" : "boMachineEndian"),
298  fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %04x" : " %x"), uint(src[i])); }
    // useBytes == 0: word-based API; useBytes == 1: byte-based API.
299  for (int useBytes = 0; useBytes < 2; useBytes++)
300  {
301  const char *fmt = (useBytes ? " %02x" : " %04x");
302  try
303  {
304  dest.Clr();
305  size_t retVal;
306  if (! useBytes) {
307  if (decode) retVal = DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder);
308  else retVal = EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
309  else {
310  if (decode) retVal = DecodeUtf16FromBytes(srcBytes, 0, srcBytes.Len(), dest, true, bomHandling, defaultByteOrder);
311  else retVal = EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
    // Only the encode-to-bytes case produces bytes; every other case is compared against
    // 'expectedDest' as given.
312  const TIntV& ed = (useBytes && ! decode ? expectedDestBytes : expectedDest);
313  if (f) {
314  fprintf(f, "\n -> dest: "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(dest[i]));
315  fprintf(f, "\n expDest "); for (int i = 0; i < ed.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(ed[i]));
316  fprintf(f, "\n retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
    // 'ok' only decides whether "!!!" is printed before the assertions below.
317  bool ok = true;
318  if (retVal != expectedRetVal) ok = false;
319  if (dest.Len() != ed.Len()) ok = false;
320  if (ok) for (int i = 0; i < dest.Len(); i++) if (dest[i] != ed[i]) ok = false;
321  if (! ok)
322  {
323  printf("!!!\n");
324  }
325  IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
326  IAssert(dest.Len() == ed.Len());
327  for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == ed[i]);
328  }
    // NOTE(review): the exception is caught by value, which copies it; a const reference would do.
329  catch (TUnicodeException e)
330  {
331  if (f) {
332  fprintf(f, "\n -> expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(expectedDest[i]));
333  fprintf(f, "\n exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
334  IAssert(expectedThrow);
335  }
336  }
337 }
338 
339 // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc',
340 // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected.
// The expected output is predicted from the codec's current 'strict', 'skipBom',
// 'errorHandling' and 'replacementChar' settings together with the arguments:
//   rnd              - random generator used to pick the codepoints.
//   testCaseDesc     - sequence of 'c[e]' items describing the stream (format explained below).
//   bomHandling      - BOM policy handed to the decoder.
//   defaultByteOrder - byte order in which the generated words are laid out.
//   insertBom        - whether a BOM word is placed at the start of the generated stream.
341 void TUniCodec::TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc,
342  const TUtf16BomHandling bomHandling,
343  const TUniByteOrder defaultByteOrder,
344  const bool insertBom)
345 {
346  TIntV src; TIntV expectedDest; int expectedRetVal = 0;
347  bool expectedAbort = false;
348  FILE *f = 0;
    // 'swap' is true when the requested byte order differs from the machine's, in which case
    // every generated word is stored byte-swapped.
349  bool isMachineLe = IsMachineLittleEndian();
350  bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
351  bool swap = (isMachineLe != isDefaultLe);
352  if (insertBom) {
353  src.Add(swap ? 0xfffe : 0xfeff);
354  if (! skipBom) { expectedRetVal += 1; expectedDest.Add(0xfeff); } }
    // A required BOM that is missing is expected to stop decoding at once. The int -1 is
    // converted to size_t when it is later passed to TestUtf16.
355  else if (bomHandling == bomRequired) {
356  expectedAbort = true; expectedRetVal = -1; }
357  // testCaseDesc should consist of single characters or pairs of characters, 'c[e]', where:
358  // - 'c' defines the range from which the codepoint should be taken ('A'..'E', 'X'..'Y');
359  // - 'e' defines how many words will be removed from the end of the encoded sequence for this codepoint.
360  // (absent = 0, 'a' = 1).
361  for (int i = 0; i < testCaseDesc.Len(); )
362  {
363  const char c = testCaseDesc[i++];
364  uint cp = 0; int nWords = -1;
365  if (c == 'X' || c == 'Y') IAssert(i > 1); // if you want a BOM at the beginning of your data, use insertBom -- if we permit X and Y here, predicting the expectedDest and expectedRetVal gets more complicated
366  if (c == 'A') { cp = GetRndUint(rnd, 0u, Utf16FirstSurrogate - 1); nWords = 1; } // characters below the first surrogate range
367  else if (c == 'B') { cp = GetRndUint(rnd, Utf16FirstSurrogate, Utf16FirstSurrogate + 1023); nWords = 1; } // the first surrogate range
368  else if (c == 'C') { cp = GetRndUint(rnd, Utf16SecondSurrogate, Utf16SecondSurrogate + 1023); nWords = 1; } // the second surrogate range
369  else if (c == 'D') { do { cp = GetRndUint(rnd, Utf16SecondSurrogate + 1024, 0xffffu); } while (cp == 0xfffe || cp == 0xfeff); nWords = 1; } // above the second surrogate range, but still in the BMP
370  else if (c == 'E') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); nWords = 2; } // above the BMP, but still within the range for UTF-16
371  else if (c == 'X') { cp = 0xfffe; nWords = 1; }
372  else if (c == 'Y') { cp = 0xfeff; nWords = 1; }
373  else Fail;
    // A lone first surrogate ('B') must not be followed by a second surrogate ('C').
374  if (c == 'B' && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
375  // Process 'e'.
376  int nToDel = 0;
377  if (i < testCaseDesc.Len()) {
378  const char e = testCaseDesc[i];
    // Any character >= 'a' is consumed here as the "drop one word" marker.
379  if (e >= 'a') { i += 1; nToDel = 1; }}
380  IAssert((nWords == 1 && nToDel == 0) || (nWords == 2 && (nToDel == 0 || nToDel == 1)));
    // Likewise, a truncated pair (only its first surrogate remains) must not be followed by 'C'.
381  if (nWords == 2 && nToDel == 1 && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
382  // Will an error occur during the decoding of this codepoint?
383  bool errHere = false;
384  if (Utf16FirstSurrogate <= cp && cp <= Utf16FirstSurrogate + 1023) errHere = true;
    // None of the ranges above yields a value beyond 0x10ffff, so this branch calls Fail.
385  else if (cp > 0x10ffff) { Fail; errHere = true; }
386  else if (nToDel > 0) errHere = true;
387  else if (strict && (Utf16SecondSurrogate <= cp && cp <= Utf16SecondSurrogate + 1023)) errHere = true;
388  // Update 'expectedDest' and 'expectedRetVal'.
389  if (! expectedAbort) {
390  if (! errHere) {
391  if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
392  else { expectedDest.Add(cp); expectedRetVal += 1; } }
393  else if (errorHandling == uehReplace) {
394  expectedDest.Add(replacementChar); }
395  if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
396  // Update 'src'.
397  if (nWords == 1) src.Add(swap ? SwapBytes(cp) : cp);
398  else {
    // Split cp - 0x10000 into its upper and lower 10 bits to form the surrogate pair.
399  int c1 = ((cp - 0x10000) >> 10) & 1023; c1 += Utf16FirstSurrogate;
400  int c2 = (cp - 0x10000) & 1023; c2 += Utf16SecondSurrogate;
401  src.Add(swap ? SwapBytes(c1) : c1);
402  if (nToDel == 0) src.Add(swap ? SwapBytes(c2) : c2); }
403  }
404  if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
    // The BOM, if any, is already part of 'src', so 'false' is passed as TestUtf16's insertBom.
405  TestUtf16(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, bomHandling, defaultByteOrder, false, f);
406 }
407 
// NOTE(review): the signature line (listing line 408) is absent from this listing;
// presumably this is the argument-less TUniCodec::TestUtf16() driver -- confirm.
// Runs the UTF-16 tests for every combination of skipBom (2) x strict (2) x error-handling
// mode (4) x BOM handling (3) x byte order (3) x insertBom (2). The codec's 'strict',
// 'errorHandling' and 'skipBom' members are overwritten for each combination and are not
// restored in this body.
409 {
410  TIntV utf16ReplCh; utf16ReplCh.Add(replacementChar);
411  for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
412  for (int strict_ = 0; strict_ < 2; strict_++)
413  for (int errMode_ = 0; errMode_ < 4; errMode_++)
414  for (int bomHandling_ = 0; bomHandling_ < 3; bomHandling_++)
415  for (int byteOrder_ = 0; byteOrder_ < 3; byteOrder_++)
416  for (int insertBom_ = 0; insertBom_ < 2; insertBom_++)
417  {
418  strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
419  bool insertBom = (insertBom_ == 1);
420  TUniByteOrder byteOrder = (TUniByteOrder) byteOrder_;
421  TUtf16BomHandling bomHandling = (TUtf16BomHandling) bomHandling_;
    // The generator is re-created with the same seed for every combination of settings.
422  TRnd rnd = TRnd(123);
423  // Test DecodeUtf16 on various random UTF-16-encoded sequences.
424  for (int i = 0; i < 10; i++)
425  {
426  TestDecodeUtf16(rnd, "A", bomHandling, byteOrder, insertBom);
427  TestDecodeUtf16(rnd, "AAA", bomHandling, byteOrder, insertBom);
428  TestDecodeUtf16(rnd, "B", bomHandling, byteOrder, insertBom);
429  TestDecodeUtf16(rnd, "DDAADADAAADDDAA", bomHandling, byteOrder, insertBom);
430  TestDecodeUtf16(rnd, "DEEEDAAEEDADEEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
431  TestDecodeUtf16(rnd, "DEaEaEDAAEaEDADEaEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
432  TestDecodeUtf16(rnd, "CABDEBACCEaB", bomHandling, byteOrder, insertBom);
433  TestDecodeUtf16(rnd, "EaEEEEaBBACABXABYXXEaYDDXBDCEA", bomHandling, byteOrder, insertBom);
434  TestDecodeUtf16(rnd, "EaEEEEaBDCAAXADYXXEaYDDXDCEA", bomHandling, byteOrder, insertBom);
435  }
436  //continue;
437  // Test both DecodeUtf16 and EncodeUtf16 systematically on various characters
438  // close to powers of 2.
439  TIntV src, expectedDest, src2;
440  expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
441  for (int pow = 8; pow <= 32; pow++)
442  {
    // Range under test: [0, 2^8] for pow == 8, the top 2^8 + 1 values of uint for pow == 32,
    // and [2^pow - 2^8, 2^pow + 2^8] otherwise.
443  uint uFrom, uTo;
444  if (pow == 8) uFrom = 0, uTo = 1u << pow;
445  else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
446  else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
447  printf("%u..%u \r", uFrom, uTo);
    // No loop condition: the loop is left by the 'break' at the bottom once u == uTo.
448  for (uint u = uFrom; ; u++)
449  {
450  int nWords = 0;
451  if (u < 0x10000) nWords = 1;
452  else nWords = 2;
453  bool isMachineLe = IsMachineLittleEndian(), isDestLe = (byteOrder == boLittleEndian || (byteOrder == boMachineEndian && isMachineLe));
454  bool swap = (isMachineLe != isDestLe);
    // Errors: values beyond 0x10ffff, first-surrogate values, and (strict mode only)
    // second-surrogate values.
455  bool err = (u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023) || (strict && Utf16SecondSurrogate <= u && u <= Utf16SecondSurrogate + 1023);
456  src.Gen(3, (err ? 0 : nWords) + (insertBom ? 1 : 0));
457  if (insertBom) src[0] = (swap ? 0xfffe : 0xfeff);
    // The decoder test is skipped for values that are out of range or first surrogates.
458  if (! ((u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023)))
459  {
460  // Try to encode 'u' and see if it gets decoded correctly.
461  if (nWords == 1) src[insertBom ? 1 : 0] = (swap ? SwapBytes(u) : u);
462  else {
463  int u1 = Utf16FirstSurrogate + (((u - 0x10000) >> 10) & 1023);
464  int u2 = Utf16SecondSurrogate + ((u - 0x10000) & 1023);
465  src[insertBom ? 1 : 0] = (swap ? SwapBytes(u1) : u1);
466  src[insertBom ? 2 : 1] = (swap ? SwapBytes(u2) : u2); }
467  if (! ((u == 0xfffe || u == 0xfeff) && bomHandling == bomAllowed && ! insertBom)) // this will just create a mess when decoding
468  {
469  expectedDest.Reserve(2, 0);
470  if (insertBom && ! skipBom) expectedDest.Add(0xfeff);
471  if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
472  else if (! err) expectedDest.Add(u);
473  int erv = (err ? 0 : expectedDest.Len());
474  if (skipBom && (u == 0xfeff || u == 0xfffe) && ! insertBom) expectedDest.Clr(), erv = 0;
475  bool errD = err;
    // With a required BOM and none inserted, only a 'u' that is itself a BOM value decodes;
    // anything else is expected to fail with a return value of -1.
476  if (bomHandling == bomRequired && ! insertBom) {
477  expectedDest.Clr(false);
478  if (u == 0xfeff || u == 0xfffe) { erv = (skipBom ? 0 : 1); if (! skipBom) expectedDest.Add(0xfeff); }
479  else { erv = -1; errD = true;
480  /*if (errorHandling == uehReplace) expectedDest.Add(replacementChar);*/ }}
481  TestUtf16(true, erv, (errD && errorHandling == uehThrow), src, expectedDest, bomHandling, byteOrder, insertBom, 0);
482  }
483  }
484  // We can also test the UTF-16 encoder.
    // From here on 'src' serves as the expected encoder output for the one-element input src2.
485  src2[0] = u;
486  if (err) {
487  src.Clr(false); if (insertBom) src.Add(swap ? 0xfffe : 0xfeff);
    // NOTE(review): listing line 489 is absent from this listing, so the visible body of this
    // 'if' is only the commented-out code below; 'utf16ReplCh' is not used anywhere in the
    // visible lines -- check the real file for what line 489 adds to 'src'.
488  if (errorHandling == uehReplace) {
490  /*if (byteOrder == boBigEndian || (byteOrder == boMachineEndian && ! TUniCodec::IsMachineLittleEndian())) { src.Add((replacementChar >> 8) & 0xff); src.Add(replacementChar & 0xff); }
491  else { src.Add(replacementChar & 0xff); src.Add((replacementChar >> 8) & 0xff); } */
492  }}
493  TestUtf16(false, (err ? 0 : 1) + (insertBom ? 1 : 0), (err && errorHandling == uehThrow), src2, src, bomHandling, byteOrder, insertBom, 0);
494  //
495  if (u == uTo) break;
496  }
497  }
498  }
499 }
500 
501 //-----------------------------------------------------------------------------
502 // TUniCaseFolding
503 //-----------------------------------------------------------------------------
504 
505 void TUniCaseFolding::LoadTxt(const TStr& fileName)
506 {
507  Clr();
508  TUniChDb::TUcdFileReader reader; reader.Open(fileName);
509  TStrV fields;
510  while (reader.GetNextLine(fields))
511  {
512  int cp = reader.ParseCodePoint(fields[0]);
513  const TStr status = fields[1], mapsTo = fields[2];
514  if (status == "C" || status == "S" || status == "T") {
515  TIntH &dest = (status == "C" ? cfCommon : status == "S" ? cfSimple : cfTurkic);
516  IAssert(! dest.IsKey(cp));
517  int cp2 = reader.ParseCodePoint(mapsTo);
518  dest.AddDat(cp, cp2); }
519  else if (status == "F") {
520  TIntIntVH &dest = cfFull;
521  IAssert(! dest.IsKey(cp));
522  TIntV cps; reader.ParseCodePointList(mapsTo, cps); IAssert(cps.Len() > 0);
523  dest.AddDat(cp, cps); }
524  else
525  FailR(status.CStr());
526  }
527  printf("TUniCaseFolding(\"%s\"): %d common, %d simple, %d full, %d Turkic.\n",
528  fileName.CStr(), cfCommon.Len(), cfSimple.Len(), cfFull.Len(), cfTurkic.Len());
529 }
530 
531 void TUniCaseFolding::Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f)
532 {
533  fprintf(f, "TUniCaseFolding(%s%s): ", (full ? "full" : "simple"), (turkic ? ", turkic" : ""));
534  for (int i = 0; i < src.Len(); i++) fprintf(f, " %04x", int(src[i]));
535  TIntV dest; Fold(src, 0, src.Len(), dest, true, full, turkic);
536  fprintf(f, "\n -> ");
537  for (int i = 0; i < dest.Len(); i++) fprintf(f, " %04x", int(dest[i]));
538  fprintf(f, "\n");
539  IAssert(dest.Len() == expectedDest.Len());
540  for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]);
541 }
542 
543 /*
544 void TUniCaseFolding::Test(const TIntV& src, FILE *f) {
545  Test(src, false, false, f); Test(src, false, true, f);
546  Test(src, true, false, f); Test(src, true, true, f); }
547 */
548 
// NOTE(review): the signature line (listing line 549) is absent from this listing;
// presumably this is the argument-less TUniCaseFolding::Test() driver -- confirm.
// Folds one fixed five-codepoint input (0x41 0x62 0x49 0x43 0xdf) under the four
// combinations of 'full' and 'turkic' and checks each result; progress is logged to stderr.
550 {
551  FILE *f = stderr;
    // 'VB' is the empty builder; "(VB, a, b, ...)" builds the vectors passed to Test below.
552  TVectorBuilder VB;
553  // simple
554  Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x69, 0x63, 0xdf), false, false, f);
555  // simple + turkic
556  Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x131, 0x63, 0xdf), false, true, f);
557  // full
558  Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x69, 0x63, 0x73, 0x73), true, false, f);
559  // full + turkic
560  Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x131, 0x63, 0x73, 0x73), true, true, f);
561 }
562 
563 //-----------------------------------------------------------------------------
564 // TUniChInfo
565 //-----------------------------------------------------------------------------
566 
567 // UAX #14
573 
574 //-----------------------------------------------------------------------------
575 // TUniChDb -- word breaking
576 //-----------------------------------------------------------------------------
577 
578 // Test driver for WbFind*NonIgnored.
579 void TUniChDb::TestWbFindNonIgnored(const TIntV& src) const
580 {
581  int n = src.Len();
582  TBoolV isIgnored; isIgnored.Gen(n);
583  for (int i = 0; i < n; i++) isIgnored[i] = IsWbIgnored(src[i]);
584  TIntV prevNonIgnored, nextNonIgnored, curOrNextNonIgnored;
585  prevNonIgnored.Gen(n); nextNonIgnored.Gen(n); curOrNextNonIgnored.Gen(n);
586  FILE *f = 0; // stderr;
587  for (int srcIdx = 0; srcIdx < n; srcIdx++) for (int srcLen = 1; srcLen < n - srcIdx; srcLen++)
588  {
589  int prev = -1;
590  for (int i = 0; i < srcLen; i++) {
591  prevNonIgnored[i] = prev;
592  if (! isIgnored[srcIdx + i]) prev = srcIdx + i; }
593  int next = srcIdx + srcLen;
594  for (int i = srcLen - 1; i >= 0; i--) {
595  nextNonIgnored[i] = next;
596  if (! isIgnored[srcIdx + i]) next = srcIdx + i;
597  curOrNextNonIgnored[i] = next; }
598  if (f) {
599  fprintf(f, "\nIndex: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", srcIdx + i);
600  fprintf(f, "\nNonIgn: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %s", (isIgnored[srcIdx + i] ? " ." : " Y"));
601  fprintf(f, "\nPrevNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(prevNonIgnored[i]));
602  fprintf(f, "\nNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(nextNonIgnored[i]));
603  fprintf(f, "\nCurNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(curOrNextNonIgnored[i]));
604  fprintf(f, "\n"); }
605  for (int i = 0; i < srcLen; i++)
606  {
607  size_t s;
608  s = size_t(srcIdx + i); WbFindNextNonIgnored(src, s, size_t(srcIdx + srcLen));
609  IAssert(s == size_t(nextNonIgnored[i]));
610  s = size_t(srcIdx + i); WbFindCurOrNextNonIgnored(src, s, size_t(srcIdx + srcLen));
611  IAssert(s == size_t(curOrNextNonIgnored[i]));
612  s = size_t(srcIdx + i); bool ok = WbFindPrevNonIgnored(src, size_t(srcIdx), s);
613  if (prevNonIgnored[i] < 0) { IAssert(! ok); IAssert(s == size_t(srcIdx)); }
614  else { IAssert(ok); IAssert(s == size_t(prevNonIgnored[i])); }
615  }
616  }
617 }
618 
// NOTE(review): the signature line (listing line 619) and listing lines 626 and 644 are absent
// from this listing; presumably this is the argument-less TUniChDb::TestWbFindNonIgnored()
// and line 644 invokes the TIntV overload on 'v' -- confirm against the real file.
// Splits the keys of 'h' into word-break-ignored and non-ignored codepoints, then builds
// random sequences of 0..20 entries in which the chance of picking an ignored codepoint
// rises from 0% to 100% in steps of 2.
620 {
621  TIntV chIgnored, chNonIgnored;
622  FILE *f = 0; // stderr;
623  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) {
624  const int cp = h.GetKey(i); const TUniChInfo& ci = h[i];
    // NOTE(review): the remaining arguments of this fprintf (listing line 626) are missing.
625  if (f) fprintf(f, "%04x: flags %08x props %08x %08x script \"%s\"\n", cp,
627  (IsWbIgnored(h[i]) ? chIgnored : chNonIgnored).Add(h.GetKey(i));
628  }
629  chIgnored.Sort(); chNonIgnored.Sort();
630  printf("TUniChDb::TestWbNonIgnored: %d ignored, %d nonignored chars.\n", chIgnored.Len(), chNonIgnored.Len());
631  TRnd rnd = TRnd(123);
632  for (int iter = 0; iter <= 50; iter++)
633  {
634  int percIgnored = 2 * iter;
635  for (int n = 0; n <= 20; n++)
636  {
637  // Prepare a random sequence of 'n' codepoints.
    // NOTE(review): 'v' is created with Gen(n) and the codepoints are then appended with Add.
    // If Gen(n) already gives 'v' n elements (as its use with indexed writes elsewhere in this
    // file suggests), 'v' ends up with 2n entries -- confirm whether v[i] = chars[j] was meant.
    // NOTE(review): the chosen list is indexed without checking that it is non-empty.
638  TIntV v; v.Gen(n);
639  for (int i = 0; i < n; i++) {
640  TIntV& chars = (rnd.GetUniDevInt(100) < percIgnored) ? chIgnored : chNonIgnored;
641  int j = rnd.GetUniDevInt(chars.Len());
642  v.Add(chars[j]); }
643  // Run the tests with this sequence.
645  }
646  }
647 }
648 
649 void TUniChDb::TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence)
650 {
651  TUcdFileReader reader; TStrV fields;
652  reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), (sentence ? GetSentenceBreakTestFn() : GetWordBreakTestFn())));
653  int nLines = 0; TRnd rnd = TRnd(123);
654  while (reader.GetNextLine(fields))
655  {
656  nLines += 1;
657  IAssert(fields.Len() == 1);
658  TStrV parts; fields[0].SplitOnWs(parts);
659  const int n = parts.Len(); IAssert((n % 2) == 1);
660  TIntV chars; TBoolV isBreak, isPredicted, isPredicted2;
661  // Each line is a sequence of codepoints, with a \times or \div in between each
662  // pair of codepoints (as well as at the beginning and the end of the sequence) to
663  // indicate whether a boundary exists there or not.
664  for (int i = 0; i < n; i++)
665  {
666  const TStr& s = parts[i];
667  if ((i % 2) == 0) {
668  if (s == "\xc3\x97") // multiplication sign (U+00D7) in UTF-8
669  isBreak.Add(false);
670  else if (s == "\xc3\xb7") // division sign (U+00F7) in UTF-8
671  isBreak.Add(true);
672  else FailR(s.CStr()); }
673  else chars.Add(reader.ParseCodePoint(s));
674  }
675  const int m = n / 2; IAssert(chars.Len() == m); IAssert(isBreak.Len() == m + 1);
676  IAssert(isBreak[0]); IAssert(isBreak[m]);
677  isPredicted.Gen(m + 1); isPredicted.PutAll(false);
678  if (AlwaysFalse()) { printf("%3d", nLines); for (int i = 0; i < m; i++) printf(" %04x", int(chars[i])); printf("\n"); }
679  // We'll insert a few random characters at the beginning of the sequence
680  // so that srcPos doesn't always begin at 0.
681  for (int nBefore = 0; nBefore < 5; nBefore++)
682  {
683  TIntV chars2; for (int i = 0; i < nBefore; i++) chars2.Add(0, rnd.GetUniDevInt(0x10ffff + 1));
684  chars2.AddV(chars);
685  // Use FindNextBoundary to find all the word boundaries.
686  size_t position = (nBefore > 0 ? nBefore - 1 : nBefore); size_t prevPosition = position;
687  while (sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position))
688  {
689  IAssert(prevPosition < position);
690  IAssert(position <= size_t(nBefore + m));
691  isPredicted[int(position) - nBefore] = true;
692  prevPosition = position;
693  }
694  IAssert(position == size_t(nBefore + m));
695  if (sentence) FindSentenceBoundaries(chars2, nBefore, m, isPredicted2);
696  else FindWordBoundaries(chars2, nBefore, m, isPredicted2);
697  IAssert(isPredicted2.Len() == m + 1);
698  bool ok = true;
699  // If we start at 0, the word boundary at the beginning of the sequence was
700  // not found explicitly, so we'll add it now.
701  if (nBefore == 0) isPredicted[0] = true;
702  // Compare the predicted and the true boundaries.
703  for (int i = 0; i <= m; i++) {
704  if (isBreak[i] != isPredicted[i]) ok = false;
705  IAssert(isPredicted2[i] == isPredicted[i]); }
706  FILE *f = stderr;
707  if (! ok)
708  {
709  fprintf(f, "\nError in line %d:\n", nLines);
710  fprintf(f, "True: ");
711  for (int i = 0; i <= m; i++) {
712  fprintf(f, "%s ", (isBreak[i] ? "|" : "."));
713  if (i < m) fprintf(f, "%04x ", int(chars[i + nBefore])); }
714  fprintf(f, "\nPredicted: ");
715  for (int i = 0; i <= m; i++) {
716  fprintf(f, "%s ", (isPredicted[i] ? "|" : "."));
717  if (i < m) {
718  const int cp = chars[i + nBefore];
720  if (IsWbIgnored(cp)) s = "*" + s;
721  fprintf(f, "%4s ", s.CStr()); }}
722  fprintf(f, "\n");
723  Fail;
724  }
725  // Test FindNextBoundary if we start in the middle of the sequence,
726  // i.e. not at an existing boundary.
727  for (int i = 0; i < m; i++) {
728  position = i + nBefore; bool ok = sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position);
729  IAssert(ok); // at the very least, there should be the 'boundary' at nBefore + m
730  IAssert(size_t(i + nBefore) < position); IAssert(position <= size_t(nBefore + m));
731  position -= nBefore;
732  for (int j = i + 1; j < int(position); j++)
733  IAssert(! isBreak[j]);
734  IAssert(isBreak[int(position)]); }
735  }
736  }
737  reader.Close();
738  printf("TUniChDb::TestFindNext%sBoundary: %d lines processed.\n", (sentence ? "Sentence" : "Word"), nLines);
739 }
740 
741 //-----------------------------------------------------------------------------
742 // TUniChDb -- composition and decomposition
743 //-----------------------------------------------------------------------------
744 
// Checks Decompose and DecomposeAndCompose against the normalization test file
// (GetNormalizationTestFn() below 'basePath').  Each data line supplies five
// codepoint lists c1..c5; the relations asserted between them are spelled out in
// the comments next to each check below.  Afterwards every codepoint of 'h' that
// did not appear as c1 in part 1 of the file is checked to be unchanged by all
// four normalization calls.  A failed comparison is handled by AssertEq.
745 void TUniChDb::TestComposition(const TStr& basePath)
746 {
747  TUcdFileReader reader; TStrV fields; int nLines = 0;
748  reader.Open(CombinePath(basePath, GetNormalizationTestFn()));
749  bool inPart1 = false; TIntH testedInPart1;
750  while (reader.GetNextLine(fields))
751  {
752  nLines += 1;
// A line with a single field must be a part header ("@Part..."); it only
// updates inPart1 and carries no test data.
753  if (fields.Len() == 1) {
754  IAssert(fields[0].IsPrefix("@Part"));
755  inPart1 = (fields[0] == "@Part1"); continue; }
// Data lines: five codepoint lists followed by an empty sixth field.
756  IAssert(fields.Len() == 6);
757  IAssert(fields[5].Len() == 0);
758  TIntV c1, c2, c3, c4, c5;
759  reader.ParseCodePointList(fields[0], c1);
760  reader.ParseCodePointList(fields[1], c2);
761  reader.ParseCodePointList(fields[2], c3);
762  reader.ParseCodePointList(fields[3], c4);
763  reader.ParseCodePointList(fields[4], c5);
764  TIntV v;
// Helper macros: each NF*_ macro normalizes 'operand' into the scratch vector
// 'v' and then compares 'cmpWith' with 'v' through AssE_.  The argument names
// are stringified into the failure message, and AssE_ appends whatever 'nLines'
// is in scope at the point of use.  Each NF*_ macro expands to two statements,
// so it must not be used as the unbraced body of an if/for.
765 #define AssE_(v1, v2, expl) AssertEq(v1, v2, TStr(expl) + " (line " + TInt::GetStr(nLines) + ")", 0)
766 #define NFC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFC(" #operand ")")
767 #define NFD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFD(" #operand ")")
768 #define NFKC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKC(" #operand ")")
769 #define NFKD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKD(" #operand ")")
770  // NFD:
771  NFD_(c3, c1); // c3 == NFD(c1)
772  NFD_(c3, c2); // c3 == NFD(c2)
773  NFD_(c3, c3); // c3 == NFD(c3)
774  NFD_(c5, c4); // c5 == NFD(c4)
775  NFD_(c5, c5); // c5 == NFD(c5)
776  // NFC:
777  NFC_(c2, c1); // c2 == NFC(c1)
778  NFC_(c2, c2); // c2 == NFC(c2)
779  NFC_(c2, c3); // c2 == NFC(c3)
780  NFC_(c4, c4); // c4 == NFC(c4)
781  NFC_(c4, c5); // c4 == NFC(c5)
782  // NFKD:
783  NFKD_(c5, c1); // c5 == NFKD(c1)
784  NFKD_(c5, c2); // c5 == NFKD(c2)
785  NFKD_(c5, c3); // c5 == NFKD(c3)
786  NFKD_(c5, c4); // c5 == NFKD(c4)
787  NFKD_(c5, c5); // c5 == NFKD(c5)
788  // NFKC:
789  NFKC_(c4, c1); // c4 == NFKC(c1)
790  NFKC_(c4, c2); // c4 == NFKC(c2)
791  NFKC_(c4, c3); // c4 == NFKC(c3)
792  NFKC_(c4, c4); // c4 == NFKC(c4)
793  NFKC_(c4, c5); // c4 == NFKC(c5)
794  //
// Remember which single codepoints part 1 covered, so that the loop after the
// file has been read can test all the remaining ones.
795  if (inPart1) {
796  IAssert(c1.Len() == 1);
797  testedInPart1.AddKey(c1[0]); }
798  }
799  reader.Close();
800  // Test other individual codepoints that were not mentioned in part 1.
801  int nOther = 0;
802  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
803  {
// The local 'nLines = -1' deliberately shadows the outer counter: the macros
// pick it up, so failures in this loop are reported as "(line -1)".
804  const int cp = h.GetKey(i), nLines = -1;
805  if (testedInPart1.IsKey(cp)) continue;
// 'x' and 'v' are local to this loop; 'v' shadows the scratch vector above.
806  TIntV x, v; x.Add(cp);
807  NFC_(x, x); // x == NFC(x)
808  NFD_(x, x); // x == NFD(x)
809  NFKC_(x, x); // x == NFKC(x)
810  NFKD_(x, x); // x == NFKD(x)
811  nOther += 1;
812  }
813 #undef AssE_
814 #undef NFC_
815 #undef NFD_
816 #undef NFKC_
817 #undef NFKD_
818  printf("TUniChDb::TestComposition: %d lines processed + %d other individual codepoints.\n", nLines, nOther);
819 }
820 
821 //-----------------------------------------------------------------------------
822 // TUniChDb -- case conversion tests
823 //-----------------------------------------------------------------------------
824 
825 void TUniChDb::TestCaseConversion(const TStr& source, const TStr& trueLc,
826  const TStr& trueTc, const TStr& trueUc,
827  bool turkic, bool lithuanian)
828 {
829  TIntV src;
831  FILE *f = stderr;
832  for (int i = 0; i < 3; i++)
833  {
834  TCaseConversion how = (i == 0) ? ccLower : (i == 1) ? ccTitle : ccUpper;
835  const TStr &trueDestS = (how == ccLower ? trueLc : how == ccTitle ? trueTc : trueUc);
836  TIntV trueDest; TUcdFileReader::ParseCodePointList(trueDestS, trueDest);
837  TIntV dest;
838  GetCaseConverted(src, 0, src.Len(), dest, true, how, turkic, lithuanian);
839  bool ok = (dest.Len() == trueDest.Len());
840  if (ok) for (int i = 0; i < dest.Len() && ok; i++) ok = ok && (dest[i] == trueDest[i]);
841  if (ok) continue;
842  fprintf(f, "%s(", (how == ccLower ? "toLowercase" : how == ccTitle ? "toTitlecase" : "toUppercase"));
843  for (int i = 0; i < src.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(src[i]));
844  fprintf(f, ")\nCorrect: (");
845  for (int i = 0; i < trueDest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(trueDest[i]));
846  fprintf(f, ")\nOur output:(");
847  for (int i = 0; i < dest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(dest[i]));
848  fprintf(f, ")\n");
849  IAssert(ok);
850  }
851 }
852 
854 {
855  // Because no thorough case-conversion test files have been provided as part
856  // of the Unicode standard, we'll have to test things on a few test cases of our own.
857  // - First, test some unconditional special mappings, such as 'ss', 'ffl', 'dz', etc.
858  const TStr F = "0046 ", L = "004C ", S = "0053 ", T = "0054 ", W = "0057 ";
859  const TStr f = "0066 ", l = "006c ", s = "0073 ", t = "0074 ", w = "0077 ";
860  const TStr ss = "00df ", ffl = "fb04 ", longs = "017f ", longst = "fb05 ", wRing = "1e98 ", Ring = "030a ";
861  const TStr DZ = "01c4 ", Dz = "01c5 ", dz = "01c6 ";
862  const TStr space = "0020 ", Grave = "0300 ";
864  F + L + s + t + space + Dz + w + T + ss + wRing + space + longs + DZ + space + dz + longst, // source
865  f + l + s + t + space + dz + w + t + ss + wRing + space + longs + dz + space + dz + longst, // lowercase
866  F + l + s + t + space + Dz + w + t + ss + wRing + space + S + dz + space + Dz + longst, // titlecase
867  F + L + S + T + space + DZ + W + T + S + S + W + Ring + space + S + DZ + space + DZ + S + T, // uppercase
868  false, false);
869  // - Dotted I, dotless i, etc., but with turkic == false.
870  const TStr I = "0049 ", J = "004a ", i = "0069 ", j = "006a ", iDotless = "0131 ", IDot = "0130 ", DotA = "0307 ";
872  s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + s, // source
873  s + i + t + i + w + iDotless + f + i + DotA + l + space + iDotless + DotA + f + i + DotA + s, // lowercase
874  S + i + t + i + w + iDotless + f + i + DotA + l + space + I + DotA + f + i + DotA + s, // titlecase
875  S + I + T + I + W + I + F + IDot + L + space + I + DotA + F + I + DotA + S, // uppercase
876  false, false);
877  // - Sigma (final vs. non-final forms).
878  const TStr Sigma = "03a3 ", sigma = "03c3 ", fsigma = "03c2 ";
880  Sigma + s + space + s + Sigma + space + s + Sigma + s + space + Sigma + S + Sigma + space + Sigma, // source
881  sigma + s + space + s + fsigma + space + s + sigma + s + space + sigma + s + fsigma + space + sigma, // lowercase
882  Sigma + s + space + S + fsigma + space + S + sigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase
883  Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
884  false, false);
886  sigma + s + space + s + sigma + space + s + sigma + s + space + sigma + S + sigma + space + sigma, // source
887  sigma + s + space + s + sigma + space + s + sigma + s + space + sigma + s + sigma + space + sigma, // lowercase
888  Sigma + s + space + S + sigma + space + S + sigma + s + space + Sigma + s + sigma + space + Sigma, // titlecase
889  Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
890  false, false);
892  fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + S + fsigma + space + fsigma, // source
893  fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + s + fsigma + space + fsigma, // lowercase
894  Sigma + s + space + S + fsigma + space + S + fsigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase
895  Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
896  false, false);
897  const TStr nonSA = "0315 0321 0322 "; // characters that are neither ccStarter nor ccAbove
898  // Special case mappings for Turkic languages:
899  // - After_I
901  s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + s, // source
902  s + iDotless + t + i + w + iDotless + f + i + l + space + iDotless + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // lowercase
903  S + iDotless + t + i + w + iDotless + f + i + l + space + I + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // titlecase
904  S + I + T + IDot + W + I + F + IDot + L + space + I + DotA + F + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + S, // uppercase
905  true, false); // turkic
906  // - Not_Before_Dot
908  I + Grave + t + I + DotA + f + I + nonSA + DotA + j + space + I + nonSA + DotA + space + I + Grave + t, // source
909  iDotless + Grave + t + i + f + i + nonSA + j + space + i + nonSA + space + iDotless + Grave + t, // lowercase
910  I + Grave + t + i + f + i + nonSA + j + space + I + nonSA + DotA + space + I + Grave + t, // titlecase
911  I + Grave + T + I + DotA + F + I + nonSA + DotA + J + space + I + nonSA + DotA + space + I + Grave + T, // uppercase
912  true, false); // turkic
913  // Special case mappings for Lithuanian:
914  // - After_Soft_Dotted [note: I + DotA turns into i + DotA + DotA when lowercasing due to More_Above]
916  i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + I + DotA + t + DotA + i + DotA + Grave, // source
917  i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // lowercase
918  I + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // titlecase
919  I + T + I + Grave + DotA + F + I + DotA + F + I + nonSA + I + DotA + T + DotA + I + Grave, // uppercase
920  false, true); // lithuanian
921  // - More_Above [note: j + DotA turns into just J when uppercasing due to After_Soft_Dotted]
923  J + Grave + space + J + nonSA + DotA + space + j + Grave + space + j + DotA + space + J + nonSA + J + nonSA + Grave + space + j + nonSA, // source
924  j + DotA + Grave + space + j + DotA + nonSA + DotA + space + j + Grave + space + j + DotA + space + j + nonSA + j + DotA + nonSA + Grave + space + j + nonSA, // lowercase
925  J + Grave + space + J + nonSA + DotA + space + J + Grave + space + J + space + J + nonSA + j + DotA + nonSA + Grave + space + J + nonSA, // titlecase
926  J + Grave + space + J + nonSA + DotA + space + J + Grave + space + J + space + J + nonSA + J + nonSA + Grave + space + J + nonSA, // uppercase
927  false, true); // lithuanian
928  // SoftDotted [^ Starter Above]* 0307 --(uc,tc)--> without 0307
929  // SoftDotted [^ Starter Above]* 0307 --(
930  //TestCaseConversion("", "", "", "", false, false);
931 }
932 
933 //-----------------------------------------------------------------------------
934 // TUniChDb -- initialization from the text files
935 //-----------------------------------------------------------------------------
936 
938 {
939  if (s.Empty()) return;
940  if (s[0] == '<') {
941  int i = s.SearchCh('>'); IAssert(i > 0);
943  s = s.GetSubStr(i + 1, s.Len() - 1); s.ToTrunc(); }
945  IAssert(dec.Len() > 0);
948 }
949 
// Reads the property-list file (GetPropListFn() below 'basePath') and sets the
// property named on each line for every codepoint in that line's range.
// Properties are recorded either through SetProperty (the ucfPr* values) or
// through SetPropertyX (the ucfPx* values).  An unrecognized property name is a
// fatal error.  Prints the number of lines and codepoints processed.
950 void TUniChDb::InitPropList(const TStr& basePath)
951 {
952  TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0;
953  reader.Open(CombinePath(basePath, GetPropListFn()));
954  TSubcatHelper helper(*this);
955  while (reader.GetNextLine(fields))
956  {
// Each line: <codepoint or codepoint range> ; <property name>.
957  IAssert(fields.Len() == 2);
958  int from, to; reader.ParseCodePointRange(fields[0], from, to);
959  TStr s = fields[1];
// NOTE(review): 'prop' and 'propx' are not declared in the visible lines (this
// listing jumps from 959 to 961).  The code below presumably relies on both
// being reset to a zero value for every line, since exactly one of them is
// assigned per line and each is tested with 'if (prop)' / 'if (propx)' -- confirm.
961  if (s == "White_Space") prop = ucfPrWhiteSpace;
962  else if (s == "Bidi_Control") prop = ucfPrBidiControl;
963  else if (s == "Join_Control") prop = ucfPrJoinControl;
964  else if (s == "Dash") prop = ucfPrDash;
965  else if (s == "Hyphen") prop = ucfPrHyphen;
966  else if (s == "Quotation_Mark") prop = ucfPrQuotationMark;
967  else if (s == "Terminal_Punctuation") prop = ucfPrTerminalPunctuation;
968  else if (s == "Other_Math") propx = ucfPxOtherMath;
969  else if (s == "Hex_Digit") prop = ucfPrHexDigit;
970  else if (s == "ASCII_Hex_Digit") prop = ucfPrAsciiHexDigit;
971  else if (s == "Other_Alphabetic") propx = ucfPxOtherAlphabetic;
972  else if (s == "Ideographic") prop = ucfPrIdeographic;
973  else if (s == "Diacritic") prop = ucfPrDiacritic;
974  else if (s == "Extender") prop = ucfPrExtender;
975  else if (s == "Other_Lowercase") propx = ucfPxOtherLowercase;
976  else if (s == "Other_Uppercase") propx = ucfPxOtherUppercase;
977  else if (s == "Noncharacter_Code_Point") prop = ucfPrNoncharacterCodePoint;
978  else if (s == "Other_Grapheme_Extend") propx = ucfPxOtherGraphemeExtend;
979  else if (s == "IDS_Binary_Operator") propx = ucfPxIdsBinaryOperator;
980  else if (s == "IDS_Trinary_Operator") propx = ucfPxIdsTrinaryOperator;
981  else if (s == "Radical") propx = ucfPxRadical;
982  else if (s == "Unified_Ideograph") propx = ucfPxUnifiedIdeograph;
983  else if (s == "Other_Default_Ignorable_Code_Point") propx = ucfPxOtherDefaultIgnorableCodePoint;
984  else if (s == "Deprecated") prop = ucfPrDeprecated;
985  else if (s == "Soft_Dotted") prop = ucfPrSoftDotted;
986  else if (s == "Logical_Order_Exception") prop = ucfPrLogicalOrderException;
987  else if (s == "Other_ID_Start") propx = ucfPxOtherIdStart;
988  else if (s == "Other_ID_Continue") propx = ucfPxOtherIdContinue;
989  else if (s == "STerm") prop = ucfPrSTerm;
990  else if (s == "Variation_Selector") prop = ucfPrVariationSelector;
991  else if (s == "Pattern_White_Space") prop = ucfPrPatternWhiteSpace;
992  else if (s == "Pattern_Syntax") prop = ucfPrPatternSyntax;
993  else FailR(s.CStr());
// The helper is given the reader (for the line's trailing comment) before any
// codepoint of this line is touched; SetCat/TestCat below depend on it.
// NOTE(review): presumably the comment carries the general category -- confirm
// in TSubcatHelper.
994  helper.ProcessComment(reader);
995  for (int cp = from; cp <= to; cp++) {
// Codepoints not yet present in 'h' are added here and get their category
// from the helper.
996  int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
997  TUniChInfo &ci = h[i]; helper.TestCat(cp);
// A property may be set at most once per codepoint.
998  if (prop) { IAssert(! ci.IsProperty(prop)); ci.SetProperty(prop); }
999  if (propx) { IAssert(! ci.IsPropertyX(propx)); ci.SetPropertyX(propx); }
1000  nCps++; }
1001  nLines++;
1002  }
1003  reader.Close();
1004  printf("TUniChDb::InitPropList: %d lines, %d code points.\n", nLines, nCps);
1005 }
1006 
1008 {
1009  TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0;
1010  reader.Open(CombinePath(basePath, GetDerivedCorePropsFn()));
1011  TSubcatHelper helper(*this);
1012  while (reader.GetNextLine(fields))
1013  {
1014  IAssert(fields.Len() == 2);
1015  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1016  TStr s = fields[1];
1018  if (s == "Math") flag = ucfDcpMath;
1019  else if (s == "Alphabetic") flag = ucfDcpAlphabetic;
1020  else if (s == "Lowercase") flag = ucfDcpLowercase;
1021  else if (s == "Uppercase") flag = ucfDcpUppercase;
1022  else if (s == "ID_Start") flag = ucfDcpIdStart;
1023  else if (s == "ID_Continue") flag = ucfDcpIdContinue;
1024  else if (s == "XID_Start") flag = ucfDcpXidStart;
1025  else if (s == "XID_Continue") flag = ucfDcpXidContinue;
1026  else if (s == "Default_Ignorable_Code_Point") flag = ucfDcpDefaultIgnorableCodePoint;
1027  else if (s == "Grapheme_Extend") flag = ucfDcpGraphemeExtend;
1028  else if (s == "Grapheme_Base") flag = ucfDcpGraphemeBase;
1029  else if (s == "Grapheme_Link") continue; // this flag is deprecated; test for combClass == Virama instead
1030  else FailR(s.CStr());
1031  // If we add new codepoints to the hash table, we should also set their category.
1032  // This is supposed to be provided in the comment, e.g. "# Cf SOFT HYPHEN".
1033  helper.ProcessComment(reader);
1034  //
1035  for (int cp = from; cp <= to; cp++) {
1036  int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
1037  helper.TestCat(cp);
1038  TUniChInfo &ci = h[i]; IAssert(! ci.IsDcpFlag(flag));
1039  ci.SetDcpFlag(flag); nCps++; }
1040  nLines++;
1041  }
1042  reader.Close();
1043  printf("TUniChDb::InitDerivedCoreProperties: %d lines, %d code points.\n", nLines, nCps);
1044 }
1045 
// Resets the lineBreak value of every entry in 'h' and then assigns the values
// read from the line-break file (GetLineBreakFn() below 'basePath').  Prints the
// number of lines and codepoints processed.
1046 void TUniChDb::InitLineBreaks(const TStr& basePath)
1047 {
1048  // Clear old linebreak values.
// NOTE(review): 'xx' is not declared in the visible lines (this listing jumps
// from 1048 to 1050); it is used below as the "no value" line-break code.
1050  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) h[i].lineBreak = xx;
1051  // Read LineBreak.txt.
1052  TUcdFileReader reader; TStrV fields;
1053  reader.Open(CombinePath(basePath, GetLineBreakFn()));
1054  int nLines = 0, nCps = 0;
1055  while (reader.GetNextLine(fields))
1056  {
// Each line: <codepoint or codepoint range> ; <two-character line-break class>.
1057  IAssert(fields.Len() == 2);
1058  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1059  TStr s = fields[1]; IAssert(s.Len() == 2);
1060  ushort us = TUniChInfo::GetLineBreakCode(s[0], s[1]);
// Lines whose class maps to 'xx' are skipped entirely; they are counted in
// neither nLines nor nCps.
1061  if (us == xx) continue;
1062  for (int cp = from; cp <= to; cp++) {
// Unlike the other Init* readers, a codepoint that is new to 'h' is added
// without a category; only a warning is printed.
1063  int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp);
1064  printf("TUniChDb::InitLineBreaks: warning, adding codepoint %d, its category will remain unknown.\n", cp); }
// Every codepoint may receive a line-break value at most once.
1065  IAssert(h[i].lineBreak == xx);
1066  h[i].lineBreak = us; nCps++; }
1067  nLines++;
1068  }
1069  reader.Close();
1070  printf("TUniChDb::InitLineBreaks: %d lines, %d codepoints processed (excluding \'xx\' values).\n", nLines, nCps);
1071 }
1072 
// Reads the scripts file (GetScriptsFn() below 'basePath'), registers every
// script name in 'scripts' and stores the script's key id in the 'script' member
// of each codepoint in the listed range.
1073 void TUniChDb::InitScripts(const TStr& basePath)
1074 {
1075  TUcdFileReader reader; TStrV fields;
1076  reader.Open(CombinePath(basePath, GetScriptsFn()));
1077  TSubcatHelper helper(*this);
1078  while (reader.GetNextLine(fields))
1079  {
// NOTE(review): fields[0] and fields[1] are used without the
// 'IAssert(fields.Len() == 2)' check that InitPropList and InitLineBreaks
// perform on their input -- consider adding it.
1080  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1081  TStr scriptName = fields[1];
// Look the script up by name, creating an entry with a zero counter on first use.
1082  int scriptNo = scripts.GetKeyId(scriptName);
1083  if (scriptNo < 0) { scriptNo = scripts.AddKey(scriptName); scripts[scriptNo] = 0; }
1084  IAssert(scriptNo >= 0 && scriptNo < SCHAR_MAX); // because TUniChInfo.script is a signed char
// The value stored per script counts the file lines (ranges) that mention it,
// not the number of codepoints.
1085  scripts[scriptNo] += 1;
1086  helper.ProcessComment(reader);
1087  for (int cp = from; cp <= to; cp++) {
// Codepoints not yet present in 'h' are added and get their category from
// the helper.
1088  int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
1089  helper.TestCat(cp);
1090  TUniChInfo &ci = h[i]; ci.script = scriptNo; }
1091  }
1092  reader.Close();
// NOTE(review): listing line 1093 is not visible here; something may happen
// between closing the reader and the summary output below.
1094  printf("TUniChDb::InitScripts: %d scripts: ", scripts.Len());
// Per-script details are compiled in but disabled through AlwaysFalse().
1095  if (AlwaysFalse()) for (int i = scripts.FFirstKeyId(); scripts.FNextKeyId(i); )
1096  printf(" %d:%s (%d)", i, scripts.GetKey(i).CStr(), int(scripts[i]));
1097  printf("\n");
1098 }
1099 
1101 {
1102  // UAX #29, sec. 4.1 and 5.1.
1103  // Note: these flags can also be initialized from auxiliary\\WordBreakProperty.txt.
1104  int katakana = GetScriptByName(GetScriptNameKatakana()); IAssert(katakana >= 0);
1105  int hiragana = GetScriptByName(GetScriptNameHiragana()); IAssert(hiragana >= 0);
1106  // Clear any existing word-boundary flags and initialize them again.
1107  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1108  {
1109  const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
1110  ci.ClrWbAndSbFlags();
1111  // Word-boundary flags.
1112  if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetWbFlag(ucfWbFormat);
1113  if (ci.script == katakana) ci.SetWbFlag(ucfWbKatakana);
1116  if (ci.subCat == ucPunctuationConnector) ci.SetWbFlag(ucfWbExtendNumLet);
1117  // Sentence-boundary flags. Some are identical to some word-boundary flags.
1118  if (cp == 0xa || cp == 0xd || cp == 0x85 || cp == 0x2028 || cp == 0x2029) ci.SetSbFlag(ucfSbSep);
1119  if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetSbFlag(ucfSbFormat);
1120  if (ci.IsWhiteSpace() && ! ci.IsSbFlag(ucfSbSep) && cp != 0xa0) ci.SetSbFlag(ucfSbSp);
1121  if (ci.IsLowercase() && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbLower);
1122  if (ci.IsUppercase() || ci.subCat == ucLetterTitlecase) ci.SetSbFlag(ucfSbUpper);
1123  if ((ci.IsAlphabetic() || cp == 0xa0 || cp == 0x5f3) && ! ci.IsSbFlag(ucfSbLower) && ! ci.IsSbFlag(ucfSbUpper) && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbOLetter);
1125  if (cp == 0x2e) ci.SetSbFlag(ucfSbATerm);
1126  // Note: UAX #29 says that if the property STerm = true, then the character should belong to the STerm class for
1127  // the purposes of sentence-boundary detection. Now in PropList.txt there is no doubt that 002E has the STerm
1128  // property; thus, it should also belong to the STerm sentence-boundary class. However, in
1129  // SentenceBreakProperty.txt, 002E is only listed in the ATerm class, but not in the STerm class.
1130  if (ci.IsSTerminal() && cp != 0x2e) ci.SetSbFlag(ucfSbSTerm);
1131  if ((ci.subCat == ucPunctuationOpen || ci.subCat == ucPunctuationClose || ci.lineBreak == TUniChInfo::LineBreak_Quotation) && cp != 0x5f3 && ! ci.IsSbFlag(ucfSbATerm) && ! ci.IsSbFlag(ucfSbSTerm)) ci.SetSbFlag(ucfSbClose);
1132  }
1133  // Some additional characters for Katakana and MidLetter.
1134  TIntV v = (VB, 0x3031, 0x3032, 0x3033, 0x3034, 0x3035, 0x309b, 0x309c, 0x30a0, 0x30fc, 0xff70, 0xff9e, 0xff9f);
1135  for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbKatakana);
1136  v = (VB, 0x27, 0xb7, 0x5f4, 0x2019, 0x2027, 0x3a);
1137  for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbMidLetter);
1138  // WbALetter depends on Katakana, so it cannot be initialized earlier.
1139  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1140  {
1141  const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
1142  if ((ci.IsAlphabetic() || cp == 0x5f3) && ! ci.IsIdeographic() && ! ci.IsWbFlag(ucfWbKatakana) && ci.lineBreak != TUniChInfo::LineBreak_ComplexContext && ci.script != hiragana && ! ci.IsGraphemeExtend())
1143  ci.SetWbFlag(ucfWbALetter);
1144  }
1145  // An alternative is to extract the flags from WordBreakProperty.txt.
1146  // The results should be the same.
1147  {TUcdFileReader reader; TStrV fields;
1149  THash<TInt, TInt> hh;
1150  while (reader.GetNextLine(fields))
1151  {
1152  IAssert(fields.Len() == 2);
1153  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1154  TStr s = fields[1];
1156  if (s == "Format") flag = ucfWbFormat;
1157  else if (s == "Katakana") flag = ucfWbKatakana;
1158  else if (s == "ALetter") flag = ucfWbALetter;
1159  else if (s == "MidLetter") flag = ucfWbMidLetter;
1160  else if (s == "MidNum") flag = ucfWbMidNum;
1161  else if (s == "Numeric") flag = ucfWbNumeric;
1162  else if (s == "ExtendNumLet") flag = ucfWbExtendNumLet;
1163  else FailR(s.CStr());
1164  for (int c = from; c <= to; c++) {
1165  int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
1166  else hh[i].Val |= flag; }
1167  }
1168  reader.Close();
1169  TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
1170  for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
1171  cps.Sort(); cps.Merge();
1172  for (int i = 0; i < cps.Len(); i++)
1173  {
1174  int cp = cps[i];
1175  int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetWbFlags();
1176  int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
1177  flags1 &= ~ucfSbSep; flags2 &= ~ucfSbSep;
1178  if (flags1 != flags2) {
1179  printf("cp = %04x: flags1 = %08x flags2 = %08x xor = %08x\n", cp, flags1, flags2, flags1 ^ flags2);
1180  Fail; }
1181  }}
1182  // Likewise, for sentence boundary flags we have SentenceBreakProperty.txt.
1183  {TUcdFileReader reader; TStrV fields;
1185  THash<TInt, TInt> hh;
1186  while (reader.GetNextLine(fields))
1187  {
1188  IAssert(fields.Len() == 2);
1189  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1190  TStr s = fields[1];
1192  if (s == "Sep") flag = ucfSbSep;
1193  else if (s == "Format") flag = ucfSbFormat;
1194  else if (s == "Sp") flag = ucfSbSp;
1195  else if (s == "Lower") flag = ucfSbLower;
1196  else if (s == "Upper") flag = ucfSbUpper;
1197  else if (s == "OLetter") flag = ucfSbOLetter;
1198  else if (s == "Numeric") flag = ucfSbNumeric;
1199  else if (s == "ATerm") flag = ucfSbATerm;
1200  else if (s == "STerm") flag = ucfSbSTerm;
1201  else if (s == "Close") flag = ucfSbClose;
1202  else FailR(s.CStr());
1203  for (int c = from; c <= to; c++) {
1204  int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
1205  else hh[i].Val |= flag; }
1206  }
1207  reader.Close();
1208  TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
1209  for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
1210  cps.Sort(); cps.Merge();
1211  for (int i = 0; i < cps.Len(); i++)
1212  {
1213  int cp = cps[i];
1214  int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetSbFlags();
1215  int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
1216  if (flags1 != flags2) {
1217  printf("cp = %04x: flags1 = %08x [%s] flags2 = %08x [%s] xor = %08x\n", cp,
1218  flags1, TUniChInfo::GetSbFlagsStr(flags1).CStr(),
1219  flags2, TUniChInfo::GetSbFlagsStr(flags2).CStr(),
1220  flags1 ^ flags2);
1221  Fail; }
1222  }}
1223 }
1224 
1225 void TUniChDb::InitSpecialCasing(const TStr& basePath)
1226 {
1227  TUcdFileReader reader; TStrV fields;
1228  reader.Open(CombinePath(basePath, GetSpecialCasingFn()));
1229  while (reader.GetNextLine(fields))
1230  {
1231  IAssert(fields.Len() == 5 || fields.Len() == 6);
1232  IAssert(fields.Last().Empty());
1233  // Skip conditional mappings -- they will be hardcoded in the GetCaseConverted method.
1234  TStr conditions = "";
1235  if (fields.Len() == 6) conditions = fields[4];
1236  conditions.ToTrunc(); if (! conditions.Empty()) continue;
1237  // Keep the other mappings.
1238  const int cp = reader.ParseCodePoint(fields[0]);
1239  TIntV v; reader.ParseCodePointList(fields[1], v);
1240  specialCasingLower.AddDat(cp, v);
1241  reader.ParseCodePointList(fields[2], v);
1242  specialCasingTitle.AddDat(cp, v);
1243  reader.ParseCodePointList(fields[3], v);
1244  specialCasingUpper.AddDat(cp, v);
1245  }
1246  reader.Close();
1247 }
1248 
1249 void TUniChDb::LoadTxt(const TStr& basePath)
1250 {
1251  Clr();
1252  // Set up a hash table with enough ports that there will be more or less no chains longer than 1 element.
1253  h = THash<TInt, TUniChInfo>(196613, true);
1254  //
1256  //
1257  TUcdFileReader reader; TStrV fields; TIntH seen;
1258  reader.Open(CombinePath(basePath, GetUnicodeDataFn()));
1259  while (reader.GetNextLine(fields))
1260  {
1261  // Codepoint.
1262  int cp = reader.ParseCodePoint(fields[0]);
1263  IAssert(! seen.IsKey(cp)); seen.AddKey(cp);
1264  TUniChInfo& ci = h.AddDat(cp);
1265  // Name.
1266  ci.nameOffset = charNames.AddStr(fields[1]);
1267  // Category.
1268  TStr& s = fields[2]; IAssert(s.Len() == 2);
1269  ci.chCat = s[0]; ci.chSubCat = s[1];
1270  // Canonical combining class.
1271  s = fields[3]; IAssert(s.Len() > 0);
1272  int i; bool ok = s.IsInt(true, TUCh::Mn, TUCh::Mx, i); IAssertR(ok, s);
1273  ci.combClass = (uchar) i;
1274  // Decomposition type and mapping.
1275  LoadTxt_ProcessDecomposition(ci, fields[5]);
1276  // Simple case mappings.
1277  s = fields[12]; ci.simpleUpperCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
1278  s = fields[13]; ci.simpleLowerCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
1279  s = fields[14]; ci.simpleTitleCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
1280  //
1281  ci.InitAfterLoad(); // initializes ci.cat, ci.subCat
1282  }
1283  reader.Close();
1284  //
1285  InitScripts(basePath);
1286  //
1287  InitPropList(basePath);
1288  InitDerivedCoreProperties(basePath);
1289  InitLineBreaks(basePath);
1290  InitSpecialCasing(basePath);
1291  // Process the composition exclusions (UAX #15, sec. 6).
1292  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1293  {
1294  TUniChInfo& ci = h[i];
1295  int ofs = ci.decompOffset; if (ofs < 0) continue;
1296  int n = 0; while (decompositions[ofs + n] >= 0) n++;
1297  IAssert(n > 0);
1298  // Singleton decompositions.
1299  if (n == 1) { ci.flags |= ucfCompositionExclusion; continue; }
1300  // Non-starter decompositions.
1301  int cp1 = decompositions[ofs];
1302  IAssert(h.IsKey(cp1));
1303  uchar ccc = h.GetDat(cp1).combClass;
1304  if (ccc != TUniChInfo::ccStarter) { ci.flags |= ucfCompositionExclusion; continue; }
1305  }
1306  // Process the composition exclusion table.
1307  reader.Open(CombinePath(basePath, GetCompositionExclusionsFn()));
1308  int nExclusionTable = 0;
1309  while (reader.GetNextLine(fields))
1310  {
1311  IAssert(fields.Len() == 1);
1312  int cp = reader.ParseCodePoint(fields[0]);
1313  int i = h.GetKeyId(cp); IAssert(i >= 0);
1314  h[i].flags |= ucfCompositionExclusion;
1315  nExclusionTable++;
1316  }
1317  reader.Close();
1318  // Prepare the inverted index for composition pairs.
1319  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1320  {
1321  int cp = h.GetKey(i);
1322  TUniChInfo& ci = h[i];
1323  int ofs = ci.decompOffset; if (ofs < 0) continue;
1324  if (ci.IsCompositionExclusion()) continue;
1325  if (ci.IsCompatibilityDecomposition()) continue;
1326  int n = 0; while (decompositions[ofs + n] >= 0) n++;
1327  if (n != 2) continue;
1328  TIntPr pr = TIntPr(decompositions[ofs], decompositions[ofs + 1]);
1329  IAssert(! inverseDec.IsKey(pr));
1331  inverseDec.AddDat(pr, cp);
1332  }
1333  printf("TUniChDb(%s): %d chars in h, %d in decomp inverse index; %d in decomp vector; %d in exclusion table\n",
1334  basePath.CStr(), h.Len(), inverseDec.Len(), decompositions.Len(), nExclusionTable);
1335  // Before calling InitWordAndSentenceBoundaryFlags(), scripts must have been initialized, as well as
1336  // flags such as Alphabetic, Word_Break, and Grapheme_Extend.
1337  InitWordAndSentenceBoundaryFlags(basePath); // Note: scripts must have been initialized by this point.
1338  // Make sure that Hangul combined characters are treated as starters.
1339  for (int cp = HangulSBase; cp < HangulSBase + HangulSCount; cp++)
1340  {
1341  int j = h.GetKeyId(cp); if (j < 0) continue;
1342  TUniChInfo& ci = h[j];
1345  }
1346  // There should be no more additions to 'h' beyond this point.
1347  const int oldHLen = h.Len();
1348  // Provide default (identity) case mappings if any were missing from UnicodeData.txt
1349  // (or if any entirely new characters were added later, e.g. while reading LineBreaks.txt).
1351  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1352  {
1353  int cp = h.GetKey(i); TUniChInfo &ci = h[i];
1354  if (ci.simpleLowerCaseMapping < 0) ci.simpleLowerCaseMapping = cp;
1355  if (ci.simpleUpperCaseMapping < 0) ci.simpleUpperCaseMapping = cp;
1356  if (ci.simpleTitleCaseMapping < 0) ci.simpleTitleCaseMapping = cp;
1357  if (ci.script < 0) ci.script = scriptUnknown;
1358  }
1359  IAssert(h.Len() == oldHLen);
1360 }
1361 
1362 void TUniChDb::SaveBin(const TStr& fnBinUcd)
1363 {
1364  PSOut SOut=TFOut::New(fnBinUcd);
1365  Save(*SOut);
1366 }
1367 
1369 {
1371 }
1372 
1373 //-----------------------------------------------------------------------------
1374 // TUniChDb -- main test driver
1375 //-----------------------------------------------------------------------------
1376 
// Self-test driver. Loads the database from the text files under basePath,
// saves it to the binary file CombinePath(basePath, GetBinFn()), re-creates
// this object and loads it back from that binary file, then runs the test
// routines called below.
//   basePath - directory passed to LoadTxt and to the individual tests, and
//              used as the location of the binary file.
1377 void TUniChDb::Test(const TStr& basePath)
1378 {
1379  TStr fnBin = CombinePath(basePath, GetBinFn());
 // 'true ||' short-circuits the Exists() check, so the text load and the
 // binary save below are performed on every run.
1380  if (true || ! TFile::Exists(fnBin))
1381  {
1382  // Test LoadTxt.
1383  LoadTxt(basePath);
1384  // Test Save.
 // The inner braces limit SOut's lifetime to the Save call.
1385  {PSOut SOut = TFOut::New(fnBin);
1386  Save(*SOut);}
1387  }
1388  // Test Load.
 // Destroy and default-construct this object in place, so that Load() fills
 // a fresh TUniChDb rather than the one just populated by LoadTxt().
1389  this->~TUniChDb();
1390  new(this) TUniChDb();
 // Same scoping trick as above, this time for the input stream.
1391  {PSIn SIn = TFIn::New(fnBin);
1392  Load(*SIn);}
1393  // Test the case folding.
1394  caseFolding.Test();
1395  // Test the word breaking.
1397  // Test the sentence breaking.
1398  TestFindNextWordOrSentenceBoundary(basePath, true);
1399  TestFindNextWordOrSentenceBoundary(basePath, false);
1400  // Test composition and decomposition.
1401  TestComposition(basePath);
1402  // Test the case conversions.
1404 }
1405 
1406 //-----------------------------------------------------------------------------
1407 // T8BitCodec -- a class for converting between 8-bit encodings and Unicode
1408 //-----------------------------------------------------------------------------
1409 
1410 //-----------------------------------------------------------------------------
1411 // ISO-8859-2
1412 //-----------------------------------------------------------------------------
1413 
// ISO-8859-2 -> Unicode: code points for bytes 0xa0..0xff, sixteen per row.
// The comment at the start of each row is the byte value of its first entry.
1414 const int TEncoding_ISO8859_2::toUnicodeTable[6 * 16] =
1415 {
1416  /* 0xa0 */ 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
1417  /* 0xb0 */ 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
1418  /* 0xc0 */ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
1419  /* 0xd0 */ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
1420  /* 0xe0 */ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
1421  /* 0xf0 */ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9
1422 };
1423 
// Unicode -> ISO-8859-2 for U+00a0..U+017f: the entry at offset (cp - 0xa0) is
// the ISO-8859-2 byte for code point cp. Row comments give each row's first
// code point.
// NOTE(review): -1 presumably marks a code point with no ISO-8859-2 byte -- confirm in the codec that reads this table.
1424 const int TEncoding_ISO8859_2::fromUnicodeTable1[14 * 16] = {
1425  /* U+00a0 */ 0x00a0, -1, -1, -1, 0x00a4, -1, -1, 0x00a7, 0x00a8, -1, -1, -1, -1, 0x00ad, -1, -1,
1426  /* U+00b0 */ 0x00b0, -1, -1, -1, 0x00b4, -1, -1, -1, 0x00b8, -1, -1, -1, -1, -1, -1, -1,
1427  /* U+00c0 */ -1, 0x00c1, 0x00c2, -1, 0x00c4, -1, -1, 0x00c7, -1, 0x00c9, -1, 0x00cb, -1, 0x00cd, 0x00ce, -1,
1428  /* U+00d0 */ -1, -1, -1, 0x00d3, 0x00d4, -1, 0x00d6, 0x00d7, -1, -1, 0x00da, -1, 0x00dc, 0x00dd, -1, 0x00df,
1429  /* U+00e0 */ -1, 0x00e1, 0x00e2, -1, 0x00e4, -1, -1, 0x00e7, -1, 0x00e9, -1, 0x00eb, -1, 0x00ed, 0x00ee, -1,
1430  /* U+00f0 */ -1, -1, -1, 0x00f3, 0x00f4, -1, 0x00f6, 0x00f7, -1, -1, 0x00fa, -1, 0x00fc, 0x00fd, -1, -1,
1431  /* U+0100 */ -1, -1, 0x00c3, 0x00e3, 0x00a1, 0x00b1, 0x00c6, 0x00e6, -1, -1, -1, -1, 0x00c8, 0x00e8, 0x00cf, 0x00ef,
1432  /* U+0110 */ 0x00d0, 0x00f0, -1, -1, -1, -1, -1, -1, 0x00ca, 0x00ea, 0x00cc, 0x00ec, -1, -1, -1, -1,
1433  /* U+0120 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1434  /* U+0130 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00c5, 0x00e5, -1, -1, 0x00a5, 0x00b5, -1,
1435  /* U+0140 */ -1, 0x00a3, 0x00b3, 0x00d1, 0x00f1, -1, -1, 0x00d2, 0x00f2, -1, -1, -1, -1, -1, -1, -1,
1436  /* U+0150 */ 0x00d5, 0x00f5, -1, -1, 0x00c0, 0x00e0, -1, -1, 0x00d8, 0x00f8, 0x00a6, 0x00b6, -1, -1, 0x00aa, 0x00ba,
1437  /* U+0160 */ 0x00a9, 0x00b9, 0x00de, 0x00fe, 0x00ab, 0x00bb, -1, -1, -1, -1, -1, -1, -1, -1, 0x00d9, 0x00f9,
1438  /* U+0170 */ 0x00db, 0x00fb, -1, -1, -1, -1, -1, -1, -1, 0x00ac, 0x00bc, 0x00af, 0x00bf, 0x00ae, 0x00be, -1
1439 };
1440 
// Unicode -> ISO-8859-2 for U+02c0..U+02df (the spacing accents used by this
// encoding); laid out like fromUnicodeTable1, with offset (cp - 0x2c0).
// NOTE(review): -1 presumably means "no mapping" -- confirm in the codec.
1441 const int TEncoding_ISO8859_2::fromUnicodeTable2[2 * 16] = {
1442  /* U+02c0 */ -1, -1, -1, -1, -1, -1, -1, 0x00b7, -1, -1, -1, -1, -1, -1, -1, -1,
1443  /* U+02d0 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00a2, 0x00ff, -1, 0x00b2, -1, 0x00bd, -1, -1
1444 };
1445 
1446 //-----------------------------------------------------------------------------
1447 // ISO-8859-3
1448 //-----------------------------------------------------------------------------
1449 
// ISO-8859-3 -> Unicode: code points for bytes 0xa0..0xff, sixteen per row.
// The comment at the start of each row is the byte value of its first entry.
// NOTE(review): the -1 entries presumably stand for bytes that ISO-8859-3 leaves unassigned -- confirm in the codec.
1450 const int TEncoding_ISO8859_3::toUnicodeTable[6 * 16] = {
1451  /* 0xa0 */ 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, -1, 0x0124, 0x00a7, 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, -1, 0x017b,
1452  /* 0xb0 */ 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, -1, 0x017c,
1453  /* 0xc0 */ 0x00c0, 0x00c1, 0x00c2, -1, 0x00c4, 0x010a, 0x0108, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
1454  /* 0xd0 */ -1, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df,
1455  /* 0xe0 */ 0x00e0, 0x00e1, 0x00e2, -1, 0x00e4, 0x010b, 0x0109, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
1456  /* 0xf0 */ -1, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9
1457 };
1458 
// Unicode -> ISO-8859-3 for U+00a0..U+017f: the entry at offset (cp - 0xa0) is
// the ISO-8859-3 byte for code point cp. Row comments give each row's first
// code point.
// NOTE(review): -1 presumably marks a code point with no ISO-8859-3 byte -- confirm in the codec that reads this table.
1459 const int TEncoding_ISO8859_3::fromUnicodeTable1[14 * 16] = {
1460  /* U+00a0 */ 0x00a0, -1, -1, 0x00a3, 0x00a4, -1, -1, 0x00a7, 0x00a8, -1, -1, -1, -1, 0x00ad, -1, -1,
1461  /* U+00b0 */ 0x00b0, -1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, -1, 0x00b7, 0x00b8, -1, -1, -1, -1, 0x00bd, -1, -1,
1462  /* U+00c0 */ 0x00c0, 0x00c1, 0x00c2, -1, 0x00c4, -1, -1, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
1463  /* U+00d0 */ -1, 0x00d1, 0x00d2, 0x00d3, 0x00d4, -1, 0x00d6, 0x00d7, -1, 0x00d9, 0x00da, 0x00db, 0x00dc, -1, -1, 0x00df,
1464  /* U+00e0 */ 0x00e0, 0x00e1, 0x00e2, -1, 0x00e4, -1, -1, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
1465  /* U+00f0 */ -1, 0x00f1, 0x00f2, 0x00f3, 0x00f4, -1, 0x00f6, 0x00f7, -1, 0x00f9, 0x00fa, 0x00fb, 0x00fc, -1, -1, -1,
1466  /* U+0100 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00c6, 0x00e6, 0x00c5, 0x00e5, -1, -1, -1, -1,
1467  /* U+0110 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00d8, 0x00f8, 0x00ab, 0x00bb,
1468  /* U+0120 */ 0x00d5, 0x00f5, -1, -1, 0x00a6, 0x00b6, 0x00a1, 0x00b1, -1, -1, -1, -1, -1, -1, -1, -1,
1469  /* U+0130 */ 0x00a9, 0x00b9, -1, -1, 0x00ac, 0x00bc, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1470  /* U+0140 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1471  /* U+0150 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00de, 0x00fe, 0x00aa, 0x00ba,
1472  /* U+0160 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00dd, 0x00fd, -1, -1,
1473  /* U+0170 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00af, 0x00bf, -1, -1, -1,
1474 };
1476  /* U+02d8 */ 0x00a2, 0x00ff
1477 };
1478 
1479 //-----------------------------------------------------------------------------
1480 // ISO-8859-4
1481 //-----------------------------------------------------------------------------
1482 
// ISO-8859-4 -> Unicode: code points for bytes 0xa0..0xff, sixteen per row.
// The comment at the start of each row is the byte value of its first entry.
1483 const int TEncoding_ISO8859_4::toUnicodeTable[6 * 16] = {
1484  /* 0xa0 */ 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af,
1485  /* 0xb0 */ 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b,
1486  /* 0xc0 */ 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a,
1487  /* 0xd0 */ 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df,
1488  /* 0xe0 */ 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b,
1489  /* 0xf0 */ 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9
1490 };
1491 
// Unicode -> ISO-8859-4 for U+00a0..U+017f: the entry at offset (cp - 0xa0) is
// the ISO-8859-4 byte for code point cp. Row comments give each row's first
// code point.
// NOTE(review): -1 presumably marks a code point with no ISO-8859-4 byte -- confirm in the codec that reads this table.
1492 const int TEncoding_ISO8859_4::fromUnicodeTable1[14 * 16] = {
1493  /* U+00a0 */ 0x00a0, -1, -1, -1, 0x00a4, -1, -1, 0x00a7, 0x00a8, -1, -1, -1, -1, 0x00ad, -1, 0x00af,
1494  /* U+00b0 */ 0x00b0, -1, -1, -1, 0x00b4, -1, -1, -1, 0x00b8, -1, -1, -1, -1, -1, -1, -1,
1495  /* U+00c0 */ -1, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, -1, -1, 0x00c9, -1, 0x00cb, -1, 0x00cd, 0x00ce, -1,
1496  /* U+00d0 */ -1, -1, -1, -1, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, -1, 0x00da, 0x00db, 0x00dc, -1, -1, 0x00df,
1497  /* U+00e0 */ -1, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, -1, -1, 0x00e9, -1, 0x00eb, -1, 0x00ed, 0x00ee, -1,
1498  /* U+00f0 */ -1, -1, -1, -1, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, -1, 0x00fa, 0x00fb, 0x00fc, -1, -1, -1,
1499  /* U+0100 */ 0x00c0, 0x00e0, -1, -1, 0x00a1, 0x00b1, -1, -1, -1, -1, -1, -1, 0x00c8, 0x00e8, -1, -1,
1500  /* U+0110 */ 0x00d0, 0x00f0, 0x00aa, 0x00ba, -1, -1, 0x00cc, 0x00ec, 0x00ca, 0x00ea, -1, -1, -1, -1, -1, -1,
1501  /* U+0120 */ -1, -1, 0x00ab, 0x00bb, -1, -1, -1, -1, 0x00a5, 0x00b5, 0x00cf, 0x00ef, -1, -1, 0x00c7, 0x00e7,
1502  /* U+0130 */ -1, -1, -1, -1, -1, -1, 0x00d3, 0x00f3, 0x00a2, -1, -1, 0x00a6, 0x00b6, -1, -1, -1,
1503  /* U+0140 */ -1, -1, -1, -1, -1, 0x00d1, 0x00f1, -1, -1, -1, 0x00bd, 0x00bf, 0x00d2, 0x00f2, -1, -1,
1504  /* U+0150 */ -1, -1, -1, -1, -1, -1, 0x00a3, 0x00b3, -1, -1, -1, -1, -1, -1, -1, -1,
1505  /* U+0160 */ 0x00a9, 0x00b9, -1, -1, -1, -1, 0x00ac, 0x00bc, 0x00dd, 0x00fd, 0x00de, 0x00fe, -1, -1, -1, -1,
1506  /* U+0170 */ -1, -1, 0x00d9, 0x00f9, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00ae, 0x00be, -1,
1507 };
1508 
// Unicode -> ISO-8859-4 for U+02c0..U+02df; laid out like fromUnicodeTable1,
// with offset (cp - 0x2c0).
// NOTE(review): -1 presumably means "no mapping" -- confirm in the codec.
1509 const int TEncoding_ISO8859_4::fromUnicodeTable2[2 * 16] = {
1510  /* U+02c0 */ -1, -1, -1, -1, -1, -1, -1, 0x00b7, -1, -1, -1, -1, -1, -1, -1, -1,
1511  /* U+02d0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00ff, -1, 0x00b2, -1, -1, -1, -1
1512 };
1513 
1514 //-----------------------------------------------------------------------------
1515 // CP 437
1516 //-----------------------------------------------------------------------------
1517 
// CP437 -> Unicode: code points for bytes 0x80..0xff, sixteen per row.
// The comment at the start of each row is the byte value of its first entry.
1518 const int TEncoding_CP437::toUnicodeTable[8 * 16] = {
1519  /* 0x80 */ 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
1520  /* 0x90 */ 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192,
1521  /* 0xa0 */ 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
1522  /* 0xb0 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,
1523  /* 0xc0 */ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,
1524  /* 0xd0 */ 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,
1525  /* 0xe0 */ 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229,
1526  /* 0xf0 */ 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0
1527 };
1528 
// Unicode -> CP437 for U+00a0..U+00ff: the entry at offset (cp - 0xa0) is the
// CP437 byte for code point cp. Row comments give each row's first code point.
// NOTE(review): -1 presumably marks a code point with no CP437 byte -- confirm in the codec that reads this table.
1529 const int TEncoding_CP437::fromUnicodeTable1[6 * 16] = {
1530  /* U+00a0 */ 0x00ff, 0x00ad, 0x009b, 0x009c, -1, 0x009d, -1, -1, -1, -1, 0x00a6, 0x00ae, 0x00aa, -1, -1, -1,
1531  /* U+00b0 */ 0x00f8, 0x00f1, 0x00fd, -1, -1, 0x00e6, -1, 0x00fa, -1, -1, 0x00a7, 0x00af, 0x00ac, 0x00ab, -1, 0x00a8,
1532  /* U+00c0 */ -1, -1, -1, -1, 0x008e, 0x008f, 0x0092, 0x0080, -1, 0x0090, -1, -1, -1, -1, -1, -1,
1533  /* U+00d0 */ -1, 0x00a5, -1, -1, -1, -1, 0x0099, -1, -1, -1, -1, -1, 0x009a, -1, -1, 0x00e1,
1534  /* U+00e0 */ 0x0085, 0x00a0, 0x0083, -1, 0x0084, 0x0086, 0x0091, 0x0087, 0x008a, 0x0082, 0x0088, 0x0089, 0x008d, 0x00a1, 0x008c, 0x008b,
1535  /* U+00f0 */ -1, 0x00a4, 0x0095, 0x00a2, 0x0093, -1, 0x0094, 0x00f6, -1, 0x0097, 0x00a3, 0x0096, 0x0081, -1, -1, 0x0098,
1536 };
1537 
// Unicode -> CP437 for U+0390..U+03cf (Greek letters present in CP437); the
// entry at offset (cp - 0x390) is the CP437 byte for code point cp.
// NOTE(review): -1 presumably means "no mapping" -- confirm in the codec.
1538 const int TEncoding_CP437::fromUnicodeTable2[4 * 16] = {
1539  /* U+0390 */ -1, -1, -1, 0x00e2, -1, -1, -1, -1, 0x00e9, -1, -1, -1, -1, -1, -1, -1,
1540  /* U+03a0 */ -1, -1, -1, 0x00e4, -1, -1, 0x00e8, -1, -1, 0x00ea, -1, -1, -1, -1, -1, -1,
1541  /* U+03b0 */ -1, 0x00e0, -1, -1, 0x00eb, 0x00ee, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1542  /* U+03c0 */ 0x00e3, -1, -1, 0x00e5, 0x00e7, -1, 0x00ed, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1543 };
1544 
// Unicode -> CP437 for U+2210..U+226f (mathematical operators present in
// CP437); the entry at offset (cp - 0x2210) is the CP437 byte for code point cp.
// NOTE(review): -1 presumably means "no mapping" -- confirm in the codec.
1545 const int TEncoding_CP437::fromUnicodeTable3[6 * 16] = {
1546  /* U+2210 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00f9, 0x00fb, -1, -1, -1, 0x00ec, -1,
1547  /* U+2220 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00ef, -1, -1, -1, -1, -1, -1,
1548  /* U+2230 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1549  /* U+2240 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00f7, -1, -1, -1, -1, -1, -1, -1,
1550  /* U+2250 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1551  /* U+2260 */ -1, 0x00f0, -1, -1, 0x00f3, 0x00f2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1552 };
1553 
// Unicode -> CP437 for U+2500..U+25af (box-drawing and block characters); the
// entry at offset (cp - 0x2500) is the CP437 byte for code point cp.
// NOTE(review): -1 presumably means "no mapping" -- confirm in the codec.
1554 const int TEncoding_CP437::fromUnicodeTable4[11 * 16] = {
1555  /* U+2500 */ 0x00c4, -1, 0x00b3, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00da, -1, -1, -1,
1556  /* U+2510 */ 0x00bf, -1, -1, -1, 0x00c0, -1, -1, -1, 0x00d9, -1, -1, -1, 0x00c3, -1, -1, -1,
1557  /* U+2520 */ -1, -1, -1, -1, 0x00b4, -1, -1, -1, -1, -1, -1, -1, 0x00c2, -1, -1, -1,
1558  /* U+2530 */ -1, -1, -1, -1, 0x00c1, -1, -1, -1, -1, -1, -1, -1, 0x00c5, -1, -1, -1,
1559  /* U+2540 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1560  /* U+2550 */ 0x00cd, 0x00ba, 0x00d5, 0x00d6, 0x00c9, 0x00b8, 0x00b7, 0x00bb, 0x00d4, 0x00d3, 0x00c8, 0x00be, 0x00bd, 0x00bc, 0x00c6, 0x00c7,
1561  /* U+2560 */ 0x00cc, 0x00b5, 0x00b6, 0x00b9, 0x00d1, 0x00d2, 0x00cb, 0x00cf, 0x00d0, 0x00ca, 0x00d8, 0x00d7, 0x00ce, -1, -1, -1,
1562  /* U+2570 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1563  /* U+2580 */ 0x00df, -1, -1, -1, 0x00dc, -1, -1, -1, 0x00db, -1, -1, -1, 0x00dd, -1, -1, -1,
1564  /* U+2590 */ 0x00de, 0x00b0, 0x00b1, 0x00b2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1565  /* U+25a0 */ 0x00fe, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
1566 };
1567 // /* U+0190 */ -1, -1, 0x009f, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1568 // /* U+2070 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00fc,
1569 // /* U+20a0 */ -1, -1, -1, -1, -1, -1, -1, 0x009e, -1, -1, -1, -1, -1, -1, -1, -1,
1570 // /* U+2310 */ 0x00a9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1571 // /* U+2320 */ 0x00f4, 0x00f5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1572 
1573 //-----------------------------------------------------------------------------
1574 // CP 852
1575 //-----------------------------------------------------------------------------
1576 
// CP852 -> Unicode: code points for bytes 0x80..0xff, sixteen per row.
// The comment at the start of each row is the byte value of its first entry.
1577 const int TEncoding_CP852::toUnicodeTable[8 * 16] = {
1578  /* 0x80 */ 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x016f, 0x0107, 0x00e7, 0x0142, 0x00eb, 0x0150, 0x0151, 0x00ee, 0x0179, 0x00c4, 0x0106,
1579  /* 0x90 */ 0x00c9, 0x0139, 0x013a, 0x00f4, 0x00f6, 0x013d, 0x013e, 0x015a, 0x015b, 0x00d6, 0x00dc, 0x0164, 0x0165, 0x0141, 0x00d7, 0x010d,
1580  /* 0xa0 */ 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x0104, 0x0105, 0x017d, 0x017e, 0x0118, 0x0119, 0x00ac, 0x017a, 0x010c, 0x015f, 0x00ab, 0x00bb,
1581  /* 0xb0 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x011a, 0x015e, 0x2563, 0x2551, 0x2557, 0x255d, 0x017b, 0x017c, 0x2510,
1582  /* 0xc0 */ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x0102, 0x0103, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
1583  /* 0xd0 */ 0x0111, 0x0110, 0x010e, 0x00cb, 0x010f, 0x0147, 0x00cd, 0x00ce, 0x011b, 0x2518, 0x250c, 0x2588, 0x2584, 0x0162, 0x016e, 0x2580,
1584  /* 0xe0 */ 0x00d3, 0x00df, 0x00d4, 0x0143, 0x0144, 0x0148, 0x0160, 0x0161, 0x0154, 0x00da, 0x0155, 0x0170, 0x00fd, 0x00dd, 0x0163, 0x00b4,
1585  /* 0xf0 */ 0x00ad, 0x02dd, 0x02db, 0x02c7, 0x02d8, 0x00a7, 0x00f7, 0x00b8, 0x00b0, 0x00a8, 0x02d9, 0x0171, 0x0158, 0x0159, 0x25a0, 0x00a0
1586 };
1587 
// Unicode -> CP852 for U+00a0..U+017f: the entry at offset (cp - 0xa0) is the
// CP852 byte for code point cp. Row comments give each row's first code point.
// NOTE(review): -1 presumably marks a code point with no CP852 byte -- confirm in the codec that reads this table.
1588 const int TEncoding_CP852::fromUnicodeTable1[14 * 16] = {
1589  /* U+00a0 */ 0x00ff, -1, -1, -1, 0x00cf, -1, -1, 0x00f5, 0x00f9, -1, -1, 0x00ae, 0x00aa, 0x00f0, -1, -1,
1590  /* U+00b0 */ 0x00f8, -1, -1, -1, 0x00ef, -1, -1, -1, 0x00f7, -1, -1, 0x00af, -1, -1, -1, -1,
1591  /* U+00c0 */ -1, 0x00b5, 0x00b6, -1, 0x008e, -1, -1, 0x0080, -1, 0x0090, -1, 0x00d3, -1, 0x00d6, 0x00d7, -1,
1592  /* U+00d0 */ -1, -1, -1, 0x00e0, 0x00e2, -1, 0x0099, 0x009e, -1, -1, 0x00e9, -1, 0x009a, 0x00ed, -1, 0x00e1,
1593  /* U+00e0 */ -1, 0x00a0, 0x0083, -1, 0x0084, -1, -1, 0x0087, -1, 0x0082, -1, 0x0089, -1, 0x00a1, 0x008c, -1,
1594  /* U+00f0 */ -1, -1, -1, 0x00a2, 0x0093, -1, 0x0094, 0x00f6, -1, -1, 0x00a3, -1, 0x0081, 0x00ec, -1, -1,
1595  /* U+0100 */ -1, -1, 0x00c6, 0x00c7, 0x00a4, 0x00a5, 0x008f, 0x0086, -1, -1, -1, -1, 0x00ac, 0x009f, 0x00d2, 0x00d4,
1596  /* U+0110 */ 0x00d1, 0x00d0, -1, -1, -1, -1, -1, -1, 0x00a8, 0x00a9, 0x00b7, 0x00d8, -1, -1, -1, -1,
1597  /* U+0120 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1598  /* U+0130 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0091, 0x0092, -1, -1, 0x0095, 0x0096, -1,
1599  /* U+0140 */ -1, 0x009d, 0x0088, 0x00e3, 0x00e4, -1, -1, 0x00d5, 0x00e5, -1, -1, -1, -1, -1, -1, -1,
1600  /* U+0150 */ 0x008a, 0x008b, -1, -1, 0x00e8, 0x00ea, -1, -1, 0x00fc, 0x00fd, 0x0097, 0x0098, -1, -1, 0x00b8, 0x00ad,
1601  /* U+0160 */ 0x00e6, 0x00e7, 0x00dd, 0x00ee, 0x009b, 0x009c, -1, -1, -1, -1, -1, -1, -1, -1, 0x00de, 0x0085,
1602  /* U+0170 */ 0x00eb, 0x00fb, -1, -1, -1, -1, -1, -1, -1, 0x008d, 0x00ab, 0x00bd, 0x00be, 0x00a6, 0x00a7, -1
1603 };
1604 
// Unicode -> CP852 for U+02c0..U+02df; the entry at offset (cp - 0x2c0) is the
// CP852 byte for code point cp.
// NOTE(review): -1 presumably means "no mapping" -- confirm in the codec.
1605 const int TEncoding_CP852::fromUnicodeTable2[2* 16] = {
1606  /* U+02c0 */ -1, -1, -1, -1, -1, -1, -1, 0x00f3, -1, -1, -1, -1, -1, -1, -1, -1,
1607  /* U+02d0 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00f4, 0x00fa, -1, 0x00f2, -1, 0x00f1, -1, -1
1608 };
1609 
// Unicode -> CP852 for U+2500..U+25af (box-drawing and block characters); the
// entry at offset (cp - 0x2500) is the CP852 byte for code point cp.
// NOTE(review): -1 presumably means "no mapping" -- confirm in the codec.
1610 const int TEncoding_CP852::fromUnicodeTable3[11 * 16] = {
1611  /* U+2500 */ 0x00c4, -1, 0x00b3, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00da, -1, -1, -1,
1612  /* U+2510 */ 0x00bf, -1, -1, -1, 0x00c0, -1, -1, -1, 0x00d9, -1, -1, -1, 0x00c3, -1, -1, -1,
1613  /* U+2520 */ -1, -1, -1, -1, 0x00b4, -1, -1, -1, -1, -1, -1, -1, 0x00c2, -1, -1, -1,
1614  /* U+2530 */ -1, -1, -1, -1, 0x00c1, -1, -1, -1, -1, -1, -1, -1, 0x00c5, -1, -1, -1,
1615  /* U+2540 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1616  /* U+2550 */ 0x00cd, 0x00ba, -1, -1, 0x00c9, -1, -1, 0x00bb, -1, -1, 0x00c8, -1, -1, 0x00bc, -1, -1,
1617  /* U+2560 */ 0x00cc, -1, -1, 0x00b9, -1, -1, 0x00cb, -1, -1, 0x00ca, -1, -1, 0x00ce, -1, -1, -1,
1618  /* U+2570 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1619  /* U+2580 */ 0x00df, -1, -1, -1, 0x00dc, -1, -1, -1, 0x00db, -1, -1, -1, -1, -1, -1, -1,
1620  /* U+2590 */ -1, 0x00b0, 0x00b1, 0x00b2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1621  /* U+25a0 */ 0x00fe, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
1622 };
1623 
1624 //-----------------------------------------------------------------------------
1625 // Windows-1250
1626 //-----------------------------------------------------------------------------
1627 
// Windows-1250 -> Unicode: code points for bytes 0x80..0xff, sixteen per row.
// The comment at the start of each row is the byte value of its first entry.
// NOTE(review): the -1 entries presumably stand for bytes that Windows-1250 leaves undefined -- confirm in the codec.
1628 const int TEncoding_CP1250::toUnicodeTable[8 * 16] = {
1629  /* 0x80 */ 0x20ac, -1, 0x201a, -1, 0x201e, 0x2026, 0x2020, 0x2021, -1, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179,
1630  /* 0x90 */ -1, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, -1, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a,
1631  /* 0xa0 */ 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b,
1632  /* 0xb0 */ 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c,
1633  /* 0xc0 */ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
1634  /* 0xd0 */ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
1635  /* 0xe0 */ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
1636  /* 0xf0 */ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9
1637 };
1638 
// Unicode -> Windows-1250 for U+00a0..U+017f: the entry at offset (cp - 0xa0)
// is the Windows-1250 byte for code point cp. Row comments give each row's
// first code point.
// NOTE(review): -1 presumably marks a code point with no Windows-1250 byte -- confirm in the codec that reads this table.
1639 const int TEncoding_CP1250::fromUnicodeTable1[14 * 16] = {
1640  /* U+00a0 */ 0x00a0, -1, -1, -1, 0x00a4, -1, 0x00a6, 0x00a7, 0x00a8, 0x00a9, -1, 0x00ab, 0x00ac, 0x00ad, 0x00ae, -1,
1641  /* U+00b0 */ 0x00b0, 0x00b1, -1, -1, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, -1, -1, 0x00bb, -1, -1, -1, -1,
1642  /* U+00c0 */ -1, 0x00c1, 0x00c2, -1, 0x00c4, -1, -1, 0x00c7, -1, 0x00c9, -1, 0x00cb, -1, 0x00cd, 0x00ce, -1,
1643  /* U+00d0 */ -1, -1, -1, 0x00d3, 0x00d4, -1, 0x00d6, 0x00d7, -1, -1, 0x00da, -1, 0x00dc, 0x00dd, -1, 0x00df,
1644  /* U+00e0 */ -1, 0x00e1, 0x00e2, -1, 0x00e4, -1, -1, 0x00e7, -1, 0x00e9, -1, 0x00eb, -1, 0x00ed, 0x00ee, -1,
1645  /* U+00f0 */ -1, -1, -1, 0x00f3, 0x00f4, -1, 0x00f6, 0x00f7, -1, -1, 0x00fa, -1, 0x00fc, 0x00fd, -1, -1,
1646  /* U+0100 */ -1, -1, 0x00c3, 0x00e3, 0x00a5, 0x00b9, 0x00c6, 0x00e6, -1, -1, -1, -1, 0x00c8, 0x00e8, 0x00cf, 0x00ef,
1647  /* U+0110 */ 0x00d0, 0x00f0, -1, -1, -1, -1, -1, -1, 0x00ca, 0x00ea, 0x00cc, 0x00ec, -1, -1, -1, -1,
1648  /* U+0120 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1649  /* U+0130 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00c5, 0x00e5, -1, -1, 0x00bc, 0x00be, -1,
1650  /* U+0140 */ -1, 0x00a3, 0x00b3, 0x00d1, 0x00f1, -1, -1, 0x00d2, 0x00f2, -1, -1, -1, -1, -1, -1, -1,
1651  /* U+0150 */ 0x00d5, 0x00f5, -1, -1, 0x00c0, 0x00e0, -1, -1, 0x00d8, 0x00f8, 0x008c, 0x009c, -1, -1, 0x00aa, 0x00ba,
1652  /* U+0160 */ 0x008a, 0x009a, 0x00de, 0x00fe, 0x008d, 0x009d, -1, -1, -1, -1, -1, -1, -1, -1, 0x00d9, 0x00f9,
1653  /* U+0170 */ 0x00db, 0x00fb, -1, -1, -1, -1, -1, -1, -1, 0x008f, 0x009f, 0x00af, 0x00bf, 0x008e, 0x009e, -1,
1654 };
1655 
// Unicode -> Windows-1250 for U+02c0..U+02df; the entry at offset (cp - 0x2c0)
// is the Windows-1250 byte for code point cp.
// NOTE(review): -1 presumably means "no mapping" -- confirm in the codec.
1656 const int TEncoding_CP1250::fromUnicodeTable2[2 * 16] = {
1657  /* U+02c0 */ -1, -1, -1, -1, -1, -1, -1, 0x00a1, -1, -1, -1, -1, -1, -1, -1, -1,
1658  /* U+02d0 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00a2, 0x00ff, -1, 0x00b2, -1, 0x00bd, -1, -1,
1659 };
1660 
// Unicode -> Windows-1250 for U+2010..U+203f (dashes, quotation marks and
// related punctuation); the entry at offset (cp - 0x2010) is the Windows-1250
// byte for code point cp.
// NOTE(review): -1 presumably means "no mapping" -- confirm in the codec.
1661 const int TEncoding_CP1250::fromUnicodeTable3[3 * 16] = {
1662  /* U+2010 */ -1, -1, -1, 0x0096, 0x0097, -1, -1, -1, 0x0091, 0x0092, 0x0082, -1, 0x0093, 0x0094, 0x0084, -1,
1663  /* U+2020 */ 0x0086, 0x0087, 0x0095, -1, -1, -1, 0x0085, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1664  /* U+2030 */ 0x0089, -1, -1, -1, -1, -1, -1, -1, -1, 0x008b, 0x009b, -1, -1, -1, -1, -1,
1665 };
1666 // /* U+20a0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0080, -1, -1, -1,
1667 // /* U+2120 */ -1, -1, 0x0099, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
1668 
1669 //-----------------------------------------------------------------------------
1670 // YU-ASCII
1671 //-----------------------------------------------------------------------------
1672 
// The ten letters that YU-ASCII substitutes for ASCII punctuation. The two
// arrays list the same letters in the same order: uniChars holds the Unicode
// code points and yuAsciiChars the 7-bit byte values (the ASCII characters
// shown on the last line below).
// NOTE(review): presumably the codec pairs the arrays by index -- confirm where they are used.
1673 // C acute c acute C caron c caron S caron s caron Z caron z caron D stroke d stroke
1674 const int TEncoding_YuAscii::uniChars[10] = { 0x106, 0x107, 0x10c, 0x10d, 0x160, 0x161, 0x17d, 0x17e, 0x110, 0x111 };
1675 const int TEncoding_YuAscii::yuAsciiChars[10] = { 0x5d, 0x7d, 0x5e, 0x7e, 0x5b, 0x7b, 0x40, 0x60, 0x5c, 0x7c };
1676 // ']' '}' '^' '~' '[' '{' '@' '`' '\\' '|'
1677 
1678 
1679 //-----------------------------------------------------------------------------
1680 // TUnicode - codec registry
1681 //-----------------------------------------------------------------------------
1682 
1684 {
1685  ClrCodecs();
1686  RegisterCodec("ISO-8859-1 ISO_8859-1 ISO_8859-1:1987 ISO-IR-100 CP819 IBM819 LATIN1 L1 csISOLatin1 ISO8859-1 ISO8859_1 CP28591", TCodecBase::New<TCodec_ISO8859_1>());
1687  RegisterCodec("ISO-8859-2 ISO_8859-2 ISO_8859-2:1987 ISO-IR-101 LATIN2 L2 csISOLatin2 ISO8859-2 ISO8859_2 CP28592", TCodecBase::New<TCodec_ISO8859_2>());
1688  RegisterCodec("ISO-8859-3 ISO_8859-3 ISO_8859-3:1988 ISO-IR-109 LATIN3 L3 csISOLatin3 ISO8859-3 ISO8859_3 CP28593", TCodecBase::New<TCodec_ISO8859_3>());
1689  RegisterCodec("ISO-8859-4 ISO_8859-4 ISO_8859-4:1988 ISO-IR-110 LATIN4 L4 csISOLatin4 ISO8859-4 ISO8859_4 CP28594", TCodecBase::New<TCodec_ISO8859_4>());
1690  RegisterCodec("YUASCII YU-ASCII YU_ASCII", TCodecBase::New<TCodec_YuAscii>());
1691  RegisterCodec("CP1250 Windows-1250 MS-EE", TCodecBase::New<TCodec_CP1250>());
1692  RegisterCodec("CP852 cp852_DOSLatin2 DOSLatin2", TCodecBase::New<TCodec_CP852>());
1693  RegisterCodec("CP437 cp437_DOSLatinUS DOSLatinUS", TCodecBase::New<TCodec_CP437>());
1694 }
1695 
1696 void TUnicode::EncodeUtf8(const uint& c, TChA& dest) {
1697  if (c > 0x10ffff) {
1698  throw TExcept::New(TStr::Fmt("Unkown Unicode character %u", c)); }
1699  if (c < 0x80u)
1700  dest.AddCh(char(c & 0xffu));
1701  else if (c < 0x800u) {
1702  dest.AddCh(char(TUniCodec::_1100_0000 | ((c >> 6) & TUniCodec::_0001_1111)));
1703  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1704  else if (c < 0x10000u) {
1705  dest.AddCh(char(TUniCodec::_1110_0000 | ((c >> 12) & TUniCodec::_0000_1111)));
1706  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
1707  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1708  else if (c < 0x200000u) {
1709  dest.AddCh(char(TUniCodec::_1111_0000 | ((c >> 18) & TUniCodec::_0000_0111)));
1710  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
1711  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
1712  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1713  else if (c < 0x4000000u) {
1714  dest.AddCh(char(TUniCodec::_1111_1000 | ((c >> 24) & TUniCodec::_0000_0011)));
1715  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 18) & TUniCodec::_0011_1111)));
1716  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
1717  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
1718  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1719  else {
1720  dest.AddCh(char(TUniCodec::_1111_1100 | ((c >> 30) & TUniCodec::_0000_0011)));
1721  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 24) & TUniCodec::_0011_1111)));
1722  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 18) & TUniCodec::_0011_1111)));
1723  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
1724  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
1725  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1726 }
1727 
1729  TChA ChA; EncodeUtf8(Ch, ChA); return ChA;
1730 }
void InitAfterLoad()
Definition: unicode.cpp:1368
#define IAssert(Cond)
Definition: bd.h:262
int GetWbFlags() const
Definition: unicode.h:1118
static int SwapBytes(int x)
Definition: unicode.h:250
TPair< TInt, TInt > TIntPr
Definition: ds.h:83
void InitCodecs()
Definition: unicode.cpp:1683
int SearchCh(const char &Ch, const int &BChN=0) const
Definition: dt.cpp:1043
static PExcept New(const TStr &MsgStr, const TStr &LocStr=TStr())
Definition: ut.h:169
void Clr()
Definition: unicode.h:1276
void TestDecodeUtf16(TRnd &rnd, const TStr &testCaseDesc, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom)
Definition: unicode.cpp:341
static const int fromUnicodeTable1[6 *16]
Definition: unicode.h:510
bool strict
Definition: unicode.h:83
enum TUniChProperties_ TUniChProperties
static const uint Mn
Definition: dt.h:1153
#define IAssertR(Cond, Reason)
Definition: bd.h:265
size_t srcIdx
Definition: unicode.h:32
static PSOut New(const TStr &FNm, const bool &Append=false)
Definition: fl.cpp:442
int Len() const
Definition: dt.h:487
void SetPropertyX(const TUniChPropertiesX flag)
Definition: unicode.h:1108
bool IsInt(const bool &Check, const int &MnVal, const int &MxVal, int &Val) const
Definition: dt.cpp:1159
int GetScriptByName(const TStr &scriptName) const
Definition: unicode.h:1322
void Merge()
Sorts the vector and only keeps a single element of each value.
Definition: ds.h:1292
static TStr GetBinFn()
Definition: unicode.h:1310
void Test(const TIntV &src, const TIntV &expectedDest, const bool full, const bool turkic, FILE *f)
Definition: unicode.cpp:531
enum TUniChFlags_ TUniChFlags
bool IsCompositionExclusion() const
Definition: unicode.h:1111
#define NFC_(cmpWith, operand)
void SaveBin(const TStr &fnBinUcd)
Definition: unicode.cpp:1362
Definition: dt.h:11
bool IsDcpFlag(const TUniChFlags flag) const
Definition: unicode.h:1068
static const ushort LineBreak_Quotation
Definition: unicode.h:1032
void SetProperty(const TUniChProperties flag)
Definition: unicode.h:1085
bool IsGraphemeExtend() const
Definition: unicode.h:1077
void SetSbFlag(const TUniChFlags flag)
Definition: unicode.h:1127
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:480
static bool Exists(const TStr &FNm)
Definition: fl.cpp:1100
static TStr GetSpecialCasingFn()
Definition: unicode.h:1297
TUniChSubCategory subCat
Definition: unicode.h:1020
int GetWbFlags(const int cp) const
Definition: unicode.h:1357
static const uint Mx
Definition: dt.h:1154
void AssertEq(const TIntV &v1, const TIntV &v2, const TStr &explanation, FILE *f)
Definition: unicode.cpp:39
void SetDcpFlag(const TUniChFlags flag)
Definition: unicode.h:1070
void SetWbFlag(const TUniChFlags flag)
Definition: unicode.h:1117
enum TUnicodeErrorHandling_ TUnicodeErrorHandling
unsigned int uint
Definition: bd.h:11
uchar combClass
Definition: unicode.h:1018
#define Fail
Definition: bd.h:238
static TStr GetScriptNameKatakana()
Definition: unicode.h:1318
static const ushort LineBreak_InfixNumeric
Definition: unicode.h:1032
#define NFD_(cmpWith, operand)
static uint GetRndUint(TRnd &rnd)
Definition: unicode.cpp:62
void AddCh(const char &Ch, const int &MxLen=-1)
Definition: dt.h:271
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
void InitPropList(const TStr &basePath)
Definition: unicode.cpp:950
static const int toUnicodeTable[8 *16]
Definition: unicode.h:532
void ClrCodecs()
Definition: unicode.h:1881
enum TUniChDb::TCaseConversion_ TCaseConversion
bool IsAlphabetic() const
Definition: unicode.h:1071
int GetSbFlags(const int cp) const
Definition: unicode.h:1359
void WbFindCurOrNextNonIgnored(const TSrcVec &src, size_t &position, const size_t srcEnd) const
Definition: unicode.h:1422
static const ushort LineBreak_ComplexContext
Definition: unicode.h:1032
TIntIntVH cfFull
Definition: unicode.h:275
const TStr & GetScriptName(const int scriptId) const
Definition: unicode.h:1321
TIntIntVH specialCasingUpper
Definition: unicode.h:1271
static const int yuAsciiChars[10]
Definition: unicode.h:493
TStr GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:811
void RegisterCodec(const TStr &nameList, const PCodecBase &codec)
Definition: unicode.h:1873
void InitDerivedCoreProperties(const TStr &basePath)
Definition: unicode.cpp:1007
void InitAfterLoad()
Definition: unicode.h:1035
bool IsWhiteSpace() const
Definition: unicode.h:1104
void InitLineBreaks(const TStr &basePath)
Definition: unicode.cpp:1046
static const int uniChars[10]
Definition: unicode.h:493
void WbFindNextNonIgnored(const TSrcVec &src, size_t &position, const size_t srcEnd) const
Definition: unicode.h:1425
bool WbFindPrevNonIgnored(const TSrcVec &src, const size_t srcStart, size_t &position) const
Definition: unicode.h:1434
const TDat & GetDat(const TKey &Key) const
Definition: hash.h:220
char chCat
Definition: unicode.h:1017
static TStr GetNormalizationTestFn()
Definition: unicode.h:1309
enum TUniChPropertiesX_ TUniChPropertiesX
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
bool IsUppercase() const
Definition: unicode.h:1072
void Test(const TStr &basePath)
Definition: unicode.cpp:1377
static const int fromUnicodeTable2[2 *16]
Definition: unicode.h:532
static const int fromUnicodeTable2[4 *16]
Definition: unicode.h:510
static void ParseCodePointRange(const TStr &s, int &from, int &to)
Definition: unicode.h:1703
TIntIntVH specialCasingLower
Definition: unicode.h:1271
int simpleUpperCaseMapping
Definition: unicode.h:1022
void TestCaseConversion(const TStr &source, const TStr &trueLc, const TStr &trueTc, const TStr &trueUc, bool turkic, bool lithuanian)
Definition: unicode.cpp:825
static TStr GetUnicodeDataFn()
Definition: unicode.h:1298
THash< TIntPr, TInt > inverseDec
Definition: unicode.h:1267
bool IsPropertyX(const TUniChPropertiesX flag) const
Definition: unicode.h:1107
bool FindNextSentenceBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2636
size_t DecodeUtf16FromWords(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
Definition: unicode.h:2294
TStr GetWbFlagsStr() const
Definition: unicode.h:1120
static TStr GetScriptsFn()
Definition: unicode.h:1300
static const int fromUnicodeTable3[6 *16]
Definition: unicode.h:510
int propertiesX
Definition: unicode.h:1027
bool IsLowercase() const
Definition: unicode.h:1073
void Clr()
Definition: unicode.h:288
void TestCaseConversions()
Definition: unicode.cpp:853
int simpleTitleCaseMapping
Definition: unicode.h:1022
static PSIn New(const TStr &FNm)
Definition: fl.cpp:290
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
static const int fromUnicodeTable2[2 *16]
Definition: unicode.h:547
void TestDecodeUtf8(TRnd &rnd, const TStr &testCaseDesc)
Definition: unicode.cpp:133
void Sort(const bool &Asc=true)
Sorts the elements of the vector.
Definition: ds.h:1254
bool IsCompatibilityDecomposition() const
Definition: unicode.h:1112
static TStr GetScriptNameUnknown()
Definition: unicode.h:1317
void PutAll(const TVal &Val)
Sets all elements of the vector to value Val.
Definition: ds.h:1166
ushort lineBreak
Definition: unicode.h:1028
static TStr GetSentenceBreakTestFn()
Definition: unicode.h:1307
TUniChDb()
Definition: unicode.h:1274
size_t EncodeUtf16ToBytes(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
Definition: unicode.h:2428
void Save(TSOut &SOut) const
Definition: unicode.h:1280
void TestUtf8()
Definition: unicode.cpp:194
uint GetUniDevUInt(const uint &Range=0)
Definition: dt.cpp:45
enum TUniByteOrder_ TUniByteOrder
bool FNextKeyId(int &KeyId) const
Definition: hash.h:436
static const int fromUnicodeTable3[3 *16]
Definition: unicode.h:547
TStr GetSbFlagsStr() const
Definition: unicode.h:1130
void LoadTxt_ProcessDecomposition(TUniChInfo &ci, TStr s)
Definition: unicode.cpp:937
bool FindNextWordBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2483
static const int toUnicodeTable[6 *16]
Definition: unicode.h:452
void InitSpecialCasing(const TStr &basePath)
Definition: unicode.cpp:1225
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:551
int FFirstKeyId() const
Definition: hash.h:236
static TStr GetDerivedCorePropsFn()
Definition: unicode.h:1301
static TStr GetWordBreakPropertyFn()
Definition: unicode.h:1306
static const int fromUnicodeTable3[11 *16]
Definition: unicode.h:532
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
bool GetNextLine(TStrV &dest)
Definition: unicode.h:1686
int properties
Definition: unicode.h:1026
void Load(TSIn &SIn)
Definition: unicode.h:1285
void Open(const TStr &fileName)
Definition: unicode.h:1683
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:452
#define FailR(Reason)
Definition: bd.h:240
bool IsSbFlag(const TUniChFlags flag) const
Definition: unicode.h:1126
void ClrWbAndSbFlags()
Definition: unicode.h:1116
static const ushort LineBreak_Numeric
Definition: unicode.h:1032
unsigned char uchar
Definition: bd.h:10
void InitScripts(const TStr &basePath)
Definition: unicode.cpp:1073
void TestComposition(const TStr &basePath)
Definition: unicode.cpp:745
enum TUtf16BomHandling_ TUtf16BomHandling
TIntH cfTurkic
Definition: unicode.h:274
static TStr GetLineBreakFn()
Definition: unicode.h:1302
void Fold(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool full, const bool turkic) const
Definition: unicode.h:293
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:532
bool skipBom
Definition: unicode.h:89
static void ParseCodePointList(const TStr &s, TIntV &dest, bool ClrDestP=true)
Definition: unicode.h:1697
TIntH cfCommon
Definition: unicode.h:274
static TStr GetWordBreakTestFn()
Definition: unicode.h:1305
size_t DecodeUtf16FromBytes(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
Definition: unicode.h:2210
static bool IsWbIgnored(const TUniChInfo &ci)
Definition: unicode.h:1419
static const int toUnicodeTable[8 *16]
Definition: unicode.h:547
TIntV decompositions
Definition: unicode.h:1266
unsigned short ushort
Definition: bd.h:13
int GetKeyId(const TKey &Key) const
Definition: hash.h:424
Definition: dt.h:201
void SetCat(const int cp)
Definition: unicode.h:1744
int replacementChar
Definition: unicode.h:64
static const int toUnicodeTable[8 *16]
Definition: unicode.h:510
static TStr GetSentenceBreakPropertyFn()
Definition: unicode.h:1308
void LoadTxt(const TStr &fileName)
Definition: unicode.cpp:505
static bool IsMachineLittleEndian()
Definition: unicode.cpp:83
Definition: ds.h:32
int AddKey(const TKey &Key)
Definition: hash.h:331
void InitWordAndSentenceBoundaryFlags(const TStr &basePath)
Definition: unicode.cpp:1100
void TestFindNextWordOrSentenceBoundary(const TStr &basePath, bool sentence)
Definition: unicode.cpp:649
static const int toUnicodeTable[6 *16]
Definition: unicode.h:466
char chSubCat
Definition: unicode.h:1017
int simpleLowerCaseMapping
Definition: unicode.h:1022
TStr CombinePath(const TStr &s, const TStr &t)
Definition: unicode.cpp:32
static const ushort LineBreak_Unknown
Definition: unicode.h:1032
static TStr GetCompositionExclusionsFn()
Definition: unicode.h:1299
void LoadTxt(const TStr &basePath)
Definition: unicode.cpp:1249
TStrIntH scripts
Definition: unicode.h:1265
Definition: dt.h:412
size_t EncodeUtf16ToWords(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
Definition: unicode.h:2376
void ProcessComment(TUniChDb::TUcdFileReader &reader)
Definition: unicode.h:1729
bool Empty() const
Definition: dt.h:488
TStr & ToTrunc()
Definition: dt.cpp:770
void FindWordBoundaries(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
Definition: unicode.h:2561
int decompOffset
Definition: unicode.h:1023
static TStr Fmt(const char *FmtStr,...)
Definition: dt.cpp:1599
static TStr GetPropListFn()
Definition: unicode.h:1303
void TestCat(const int cp)
Definition: unicode.h:1749
#define NFKC_(cmpWith, operand)
size_t DecodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2036
TIntH cfSimple
Definition: unicode.h:274
TVec< TInt > TIntV
Definition: ds.h:1529
TStrPool charNames
Definition: unicode.h:1264
int GetSbFlags() const
Definition: unicode.h:1128
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:495
bool IsSTerminal() const
Definition: unicode.h:1101
void FindSentenceBoundaries(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
Definition: unicode.h:2793
static const uchar Mx
Definition: dt.h:1005
#define NFKD_(cmpWith, operand)
int EncodeUtf8(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1792
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
void Reserve(const TSizeTy &_MxVals)
Reserves enough memory for the vector to store _MxVals elements.
Definition: ds.h:515
static const int toUnicodeTable[6 *16]
Definition: unicode.h:480
bool IsProperty(const TUniChProperties flag) const
Definition: unicode.h:1084
void TestUtf16()
Definition: unicode.cpp:408
bool IsWbFlag(const TUniChFlags flag) const
Definition: unicode.h:1115
signed char script
Definition: unicode.h:1021
int nameOffset
Definition: unicode.h:1024
int scriptUnknown
Definition: unicode.h:1272
char * CStr()
Definition: dt.h:476
bool IsKey(const TKey &Key) const
Definition: hash.h:216
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
static const int fromUnicodeTable2[2 *16]
Definition: unicode.h:480
TUniCaseFolding caseFolding
Definition: unicode.h:1268
uint AddStr(const char *Str, const uint &Len)
Definition: dt.cpp:1711
static const int fromUnicodeTable2[2 *16]
Definition: unicode.h:452
bool IsIdeographic() const
Definition: unicode.h:1095
TIntIntVH specialCasingTitle
Definition: unicode.h:1271
int Len() const
Definition: hash.h:186
int flags
Definition: unicode.h:1025
TDat & AddDat(const TKey &Key)
Definition: hash.h:196
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:466
bool AlwaysFalse()
Definition: unicode.h:3227
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:547
static const int fromUnicodeTable4[11 *16]
Definition: unicode.h:510
static const uchar Mn
Definition: dt.h:1004
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:210
static const int fromUnicodeTable2[2]
Definition: unicode.h:466
void TestWbFindNonIgnored() const
Definition: unicode.cpp:619
static int ParseCodePoint(const TStr &s)
Definition: unicode.h:1695
static TStr GetCaseFoldingFn()
Definition: unicode.h:1296
size_t EncodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2152
static TStr GetAuxiliaryDir()
Definition: unicode.h:1304
static TStr GetScriptNameHiragana()
Definition: unicode.h:1319
static ushort GetLineBreakCode(char c1, char c2)
Definition: unicode.h:1031
TSizeTy AddV(const TVec< TVal, TSizeTy > &ValV)
Adds the elements of the vector ValV to the end of the vector.
Definition: ds.h:1056
void GetCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const
Definition: unicode.h:2817
void WordsToBytes(const TIntV &src, TIntV &dest)
Definition: unicode.cpp:274