SNAP Library 6.0, Developer Reference  2020-12-09 16:24:20
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
unicode.cpp
Go to the documentation of this file.
1 // Unicode.cpp : Defines the entry point for the console application.
2 //
3 
5 // Includes
6 //#include "unicode.h"
7 
8 //-----------------------------------------------------------------------------
9 // Private declarations of this module
10 //-----------------------------------------------------------------------------
11 
12 namespace {
13 
15 {
16 public:
18  TVectorBuilder2(int i) { v.Add(i); }
19  operator TIntV() const { return v; }
20  TVectorBuilder2& operator ,(int i) { v.Add(i); return *this; }
21 };
22 
24 {
25 public:
26  operator TIntV() const { return TIntV(); }
27  TVectorBuilder2 operator ,(int i) { return TVectorBuilder2(i); }
28 };
29 
31 
32 TStr CombinePath(const TStr& s, const TStr& t)
33 {
34  int n = s.Len(); if (n <= 0) return t;
35  if (s[n - 1] == '\\' || s[n - 1] == '/' || s[n - 1] == ':') return s + t;
36  return s + "\\" + t;
37 }
38 
39 void AssertEq(const TIntV& v1, const TIntV& v2, const TStr& explanation, FILE *f)
40 {
41  const int n = v1.Len();
42  bool ok = (n == v2.Len());
43  if (ok) for (int i = 0; i < n && ok; i++) ok = ok && (v1[i] == v2[i]);
44  if (! ok)
45  {
46  if (! f) f = stderr;
47  fprintf(f, "%s: [", explanation.CStr());
48  for (int i = 0; i < v1.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(v1[i]));
49  fprintf(f, "] != [");
50  for (int i = 0; i < v2.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(v2[i]));
51  fprintf(f, "]\n");
52  Fail;
53  }
54 }
55 
56 };
57 
58 //-----------------------------------------------------------------------------
59 // TUniCodec -- miscellaneous declarations
60 //-----------------------------------------------------------------------------
61 
63 {
64  uint u = rnd.GetUniDevUInt(256) & 0xff;
65  u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
66  u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
67  u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
68  return u;
69 }
70 
71 uint TUniCodec::GetRndUint(TRnd& rnd, uint minVal, uint maxVal)
72 {
73  if (minVal == TUInt::Mn && maxVal == TUInt::Mx) return GetRndUint(rnd);
74  uint range = maxVal - minVal + 1;
75  if (range > (uint(1) << (8 * sizeof(uint) - 1)))
76  while (true) { uint u = GetRndUint(rnd); if (u < range) return minVal + u; }
77  uint mask = 1;
78  while (mask < range) mask <<= 1;
79  mask -= 1;
80  while (true) { uint u = GetRndUint(rnd) & mask; if (u < range) return minVal + u; }
81 }
82 
84 {
85  static bool isLE, initialized = false;
86  if (initialized) return isLE;
87  int i = 1;
88  if(*(char *)&i == 1) isLE = true;
89  else isLE = false;
90 
91  initialized = true;
92  return isLE;
93 }
94 
95 //-----------------------------------------------------------------------------
96 // TUniCodec -- UTF-8 test driver
97 //-----------------------------------------------------------------------------
98 
99 void TUniCodec::TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f)
100 {
101  TIntV dest;
102  if (f) {
103  fprintf(f, "Settings: %s %s %s replacementChar = %x\n",
104  (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
105  (strict ? "STRICT" : ""), (skipBom ? "skipBom" : ""), uint(replacementChar));
106  fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %02x" : " %x"), uint(src[i])); }
107  try
108  {
109  size_t retVal = (decode ? DecodeUtf8(src, 0, src.Len(), dest, true) : EncodeUtf8(src, 0, src.Len(), dest, true));
110  if (f) {
111  fprintf(f, "\n -> dest: "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" : " %02x"), uint(dest[i]));
112  fprintf(f, "\n expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : " %02x"), uint(expectedDest[i]));
113  fprintf(f, "\n retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
114  if (retVal != expectedRetVal)
115  printf("!!!");
116  IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
117  if (dest.Len() != expectedDest.Len())
118  printf("!!!");
119  IAssert(dest.Len() == expectedDest.Len());
120  for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]);
121  }
122  catch (TUnicodeException e)
123  {
124  if (f) {
125  fprintf(f, "\n -> expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, " %x", uint(expectedDest[i]));
126  fprintf(f, "\n exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
127  IAssert(expectedThrow);
128  }
129 }
130 
131 // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc',
132 // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected.
133 void TUniCodec::TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc)
134 {
135  TIntV src; TIntV expectedDest; int expectedRetVal = 0;
136  bool expectedAbort = false;
137  FILE *f = 0; // stderr
138  // testCaseDesc should consist of pairs or triples of characters, 'cd[e]', where:
139  // - 'c' defines the range from which the codepoint should be taken ('A'..'H', 'X'..'Z');
140  // - 'd' defines how many bytes the codepoint should be encoded with ('1'..'6');
141  // - 'e' defines how many bytes will be removed from the end of the encoded sequence for this codepoint.
142  // (absent = 0, 'a' = 1, 'b' = 2 and so on).
143  for (int i = 0; i < testCaseDesc.Len(); )
144  {
145  IAssert(i + 2 <= testCaseDesc.Len());
146  const char c = testCaseDesc[i], d = testCaseDesc[i + 1]; i += 2;
147  uint cp = 0; int nBytes = -1, minBytes = -1; bool eighties = false;
148  IAssert('1' <= d && d <= '6'); nBytes = d - '0';
149  if (c == 'A') { cp = GetRndUint(rnd, 0u, 0x7fu); minBytes = 1; } // 1 byte
150  else if (c == 'B') { cp = GetRndUint(rnd, 0x80u, 0x7ffu); minBytes = 2; } // 2 bytes
151  else if (c == 'C') { cp = GetRndUint(rnd, 0x800u, 0xffffu); minBytes = 3; } // 3 bytes
152  else if (c == 'D') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); minBytes = 4; } // 4 bytes, valid Unicode
153  else if (c == 'E') { cp = GetRndUint(rnd, 0x110000u, 0x1fffffu); minBytes = 4; } // 4 bytes, invalid Unicode
154  else if (c == 'F') { cp = GetRndUint(rnd, 0x200000u, 0x3ffffffu); minBytes = 5; } // 5 bytes
155  else if (c == 'G') { cp = GetRndUint(rnd, 0x4000000u, 0x7fffffffu); minBytes = 6; } // 6 bytes, 31 bits
156  else if (c == 'H') { cp = GetRndUint(rnd, 0x80000000u, 0xffffffffu); minBytes = 6; } // 6 bytes, 32 bits
157  else if (c == 'X') { cp = 0xfffe; minBytes = 3; }
158  else if (c == 'Y') { cp = 0xfeff; minBytes = 3; }
159  else if (c == 'Z') { eighties = true; minBytes = 1; } // insert several random 10xxxxxx bytes (= 0x80 | random(0..0x3f))
160  else Fail;
161  IAssert(nBytes >= minBytes);
162  // Process 'e'.
163  int nToDel = 0;
164  if (i < testCaseDesc.Len()) {
165  const char e = testCaseDesc[i];
166  if (e >= 'a' && e <= 'e') { i += 1; nToDel = e - 'a' + 1; }}
167  IAssert(nToDel < nBytes);
168  // Will an error occur during the decoding of this codepoint?
169  bool errHere = false;
170  if (eighties) errHere = true;
171  else if (nToDel > 0) errHere = true;
172  else if (strict && (cp >= 0x10ffff || nBytes > minBytes)) errHere = true;
173  // Update 'expectedDest' and 'expetedRetVal'.
174  if (! expectedAbort) {
175  if (! errHere) {
176  if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
177  else { expectedDest.Add(cp); expectedRetVal += 1; } }
178  else if (errorHandling == uehReplace) {
179  if (eighties) for (int j = 0; j < nBytes; j++) expectedDest.Add(replacementChar);
180  else expectedDest.Add(replacementChar); }
181  if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
182  // Update 'src'.
183  if (eighties) for (int j = 0; j < nBytes; j++) src.Add(GetRndUint(rnd, 0x80, 0xff));
184  else if (nBytes == 1) src.Add(cp);
185  else {
186  int mask = (1 << nBytes) - 1; mask <<= (8 - nBytes);
187  src.Add(mask | (uint(cp) >> (6 * (nBytes - 1))));
188  for (int j = 1; j < nBytes - nToDel; j++) src.Add(0x80 | ((cp >> (6 * (nBytes - j - 1))) & _0011_1111)); }
189  }
190  if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
191  TestUtf8(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, f);
192 }
193 
195 {
196  TIntV utf8ReplCh; EncodeUtf8((TVectorBuilder(), replacementChar).v, 0, 1, utf8ReplCh, true);
197  for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
198  for (int strict_ = 0; strict_ < 2; strict_++)
199  for (int errMode_ = 0; errMode_ < 4; errMode_++)
200  {
201  strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
202  TRnd rnd = TRnd(123);
203  // Test DecodeUtf8 on various random UTF-8-encoded sequences.
204  for (int i = 0; i < 10; i++)
205  {
206  TestDecodeUtf8(rnd, "X3A1A2A3A4A5A6B2B3B4B5B6C3C4C5C6D4D5D6E5E6F6G6");
207  TestDecodeUtf8(rnd, "X3A5dA6d");
208  TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A1G6H6Y3X3A1");
209  TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A2G6H6Y3X3A1");
210  TestDecodeUtf8(rnd, "Y3A1B2C3D4E4F5A1G6H6Y3X3A1");
211  TestDecodeUtf8(rnd, "A1B2C3D4E4F5A1G6H6Y3X3A1");
212  TestDecodeUtf8(rnd, "G6A1A1D4E4A1B2");
213  TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2");
214  TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2D4a");
215  TestDecodeUtf8(rnd, "X3A1B2C3D5E4F5A1G6H6Y3X3A1");
216  TestDecodeUtf8(rnd, "X3A1B2C3D4E5F5A1G6H6Y3X3A1");
217  TestDecodeUtf8(rnd, "X3A1B2C3D4aE4F5A1G6H6Y3X3A1");
218  TestDecodeUtf8(rnd, "X3A1B2C3D4bE4F5A1G6H6Y3X3A1");
219  TestDecodeUtf8(rnd, "X3A2aA3aA4aA5aA6aB2aB3aB4aB5aB6aC3aC4aC5aC6aD4aD5aD6aE5aE6aF6aG6a");
220  TestDecodeUtf8(rnd, "X3A3bA4bA5bA6aB3bB4bB5bB6bC3bC4bC5bC6bD4bD5bD6bE5bE6bF6bG6b");
221  TestDecodeUtf8(rnd, "X3A4cA5cA6cB4cB5cB6cC4cC5cC6cD4cD5cD6cE5cE6cF6cG6c");
222  TestDecodeUtf8(rnd, "X3A5dA6dB5dB6dC5dC6dD5dD6dE5dE6dF6dG6d");
223  TestDecodeUtf8(rnd, "X3A6eB6eC6eD6eE6eF6eG6e");
224  }
225  // Test both DecodeUtf8 and EncodeUtf8 systematically on various characters
226  // close to powers of 2.
227  TIntV src, expectedDest, src2;
228  expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
229  for (int pow = 8; pow <= 32; pow++)
230  {
231  uint uFrom, uTo;
232  if (pow == 8) uFrom = 0, uTo = 1u << pow;
233  else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
234  else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
235  printf("%u..%u \r", uFrom, uTo);
236  for (uint u = uFrom; ; u++)
237  {
238  int nBytes = 0;
239  if (u < (1u << 7)) nBytes = 1;
240  else if (u < (1u << 11)) nBytes = 2;
241  else if (u < (1u << 16)) nBytes = 3;
242  else if (u < (1u << 21)) nBytes = 4;
243  else if (u < (1u << 26)) nBytes = 5;
244  else nBytes = 6;
245  src.Gen(6, nBytes);
246  if (nBytes == 1) src[0] = u;
247  else {
248  src[0] = (((1 << nBytes) - 1) << (8 - nBytes)) | (u >> (6 * (nBytes - 1)));
249  for (int i = 1; i < nBytes; i++) src[i] = 0x80 | ((u >> (6 * (nBytes - i - 1))) & _0011_1111); }
250  bool err = (strict && u > 0x10ffff);
251  expectedDest.Reserve(1, 0);
252  if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
253  else if (! err) expectedDest.Add(u);
254  int erv = (err ? 0 : 1);
255  if (skipBom && (u == 0xfeff || u == 0xfffe)) expectedDest.Clr(), erv = 0;
256  TestUtf8(true, erv, (err && errorHandling == uehThrow), src, expectedDest, 0);
257  // We can also test the UTF-8 encoder.
258  src2[0] = u;
259  if (err) {
260  if (errorHandling == uehReplace) src = utf8ReplCh;
261  else src.Clr(false); }
262  TestUtf8(false, (err ? 0 : 1), (err && errorHandling == uehThrow), src2, src, 0);
263  //
264  if (u == uTo) break;
265  }
266  }
267  }
268 }
269 
270 //-----------------------------------------------------------------------------
271 // TUniCodec -- UTF-16 test driver
272 //-----------------------------------------------------------------------------
273 
274 void TUniCodec::WordsToBytes(const TIntV& src, TIntV& dest)
275 {
276  dest.Clr();
277  bool isLE = IsMachineLittleEndian();
278  for (int i = 0; i < src.Len(); i++) {
279  int c = src[i] & 0xffff;
280  if (isLE) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
281  else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } }
282 }
283 
284 void TUniCodec::TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest,
285  const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom,
286  FILE *f)
287 {
288  TIntV srcBytes, expectedDestBytes;
289  WordsToBytes(src, srcBytes); WordsToBytes(expectedDest, expectedDestBytes);
290  TIntV dest;
291  if (f) {
292  fprintf(f, "Settings: %s %s %s %s %s replacementChar = %x \n",
293  (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
294  (strict ? "STRICT" : ""), (decode ? (skipBom ? "skipBom" : "") : (insertBom ? "insrtBom" : "")),
295  (bomHandling == bomAllowed ? "bomAllowed" : bomHandling == bomRequired ? "bomRequired" : "bomIgnored"),
296  (defaultByteOrder == boBigEndian ? "boBigEndian" : defaultByteOrder == boLittleEndian ? "boLittleEndian" : "boMachineEndian"),
298  fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %04x" : " %x"), uint(src[i])); }
299  for (int useBytes = 0; useBytes < 2; useBytes++)
300  {
301  const char *fmt = (useBytes ? " %02x" : " %04x");
302  try
303  {
304  dest.Clr();
305  size_t retVal;
306  if (! useBytes) {
307  if (decode) retVal = DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder);
308  else retVal = EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
309  else {
310  if (decode) retVal = DecodeUtf16FromBytes(srcBytes, 0, srcBytes.Len(), dest, true, bomHandling, defaultByteOrder);
311  else retVal = EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
312  const TIntV& ed = (useBytes && ! decode ? expectedDestBytes : expectedDest);
313  if (f) {
314  fprintf(f, "\n -> dest: "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(dest[i]));
315  fprintf(f, "\n expDest "); for (int i = 0; i < ed.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(ed[i]));
316  fprintf(f, "\n retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
317  bool ok = true;
318  if (retVal != expectedRetVal) ok = false;
319  if (dest.Len() != ed.Len()) ok = false;
320  if (ok) for (int i = 0; i < dest.Len(); i++) if (dest[i] != ed[i]) ok = false;
321  if (! ok)
322  {
323  printf("!!!\n");
324  }
325  IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
326  IAssert(dest.Len() == ed.Len());
327  for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == ed[i]);
328  }
329  catch (TUnicodeException e)
330  {
331  if (f) {
332  fprintf(f, "\n -> expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(expectedDest[i]));
333  fprintf(f, "\n exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
334  IAssert(expectedThrow);
335  }
336  }
337 }
338 
339 // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc',
340 // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected.
341 void TUniCodec::TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc,
342  const TUtf16BomHandling bomHandling,
343  const TUniByteOrder defaultByteOrder,
344  const bool insertBom)
345 {
346  TIntV src; TIntV expectedDest; int expectedRetVal = 0;
347  bool expectedAbort = false;
348  FILE *f = 0;
349  bool isMachineLe = IsMachineLittleEndian();
350  bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
351  bool swap = (isMachineLe != isDefaultLe);
352  if (insertBom) {
353  src.Add(swap ? 0xfffe : 0xfeff);
354  if (! skipBom) { expectedRetVal += 1; expectedDest.Add(0xfeff); } }
355  else if (bomHandling == bomRequired) {
356  expectedAbort = true; expectedRetVal = -1; }
357  // testCaseDesc should consist single characters or pairs of characters, 'c[e]', where:
358  // - 'c' defines the range from which the codepoint should be taken ('A'..'E', 'X'..'Y');
359  // - 'e' defines how many words will be removed from the end of the encoded sequence for this codepoint.
360  // (absent = 0, 'a' = 1).
361  for (int i = 0; i < testCaseDesc.Len(); )
362  {
363  const char c = testCaseDesc[i++];
364  uint cp = 0; int nWords = -1;
365  if (c == 'X' || c == 'Y') IAssert(i > 1); // if you want a BOM at the beginning of your data, use insertBom -- if we permit X and Y here, predicting the expectedDest and expectedRetVal gets more complicated
366  if (c == 'A') { cp = GetRndUint(rnd, 0u, Utf16FirstSurrogate - 1); nWords = 1; } // characters below the first surrogate range
367  else if (c == 'B') { cp = GetRndUint(rnd, Utf16FirstSurrogate, Utf16FirstSurrogate + 1023); nWords = 1; } // the first surrogate range
368  else if (c == 'C') { cp = GetRndUint(rnd, Utf16SecondSurrogate, Utf16SecondSurrogate + 1023); nWords = 1; } // the second surrogate range
369  else if (c == 'D') { do { cp = GetRndUint(rnd, Utf16SecondSurrogate + 1024, 0xffffu); } while (cp == 0xfffe || cp == 0xfeff); nWords = 1; } // above the second surrogate range, but still in the BMP
370  else if (c == 'E') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); nWords = 2; } // above the BMP, but still within the range for UTF-16
371  else if (c == 'X') { cp = 0xfffe; nWords = 1; }
372  else if (c == 'Y') { cp = 0xfeff; nWords = 1; }
373  else Fail;
374  if (c == 'B' && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
375  // Process 'e'.
376  int nToDel = 0;
377  if (i < testCaseDesc.Len()) {
378  const char e = testCaseDesc[i];
379  if (e >= 'a') { i += 1; nToDel = 1; }}
380  IAssert((nWords == 1 && nToDel == 0) || (nWords == 2 && (nToDel == 0 || nToDel == 1)));
381  if (nWords == 2 && nToDel == 1 && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
382  // Will an error occur during the decoding of this codepoint?
383  bool errHere = false;
384  if (Utf16FirstSurrogate <= cp && cp <= Utf16FirstSurrogate + 1023) errHere = true;
385  else if (cp > 0x10ffff) { Fail; errHere = true; }
386  else if (nToDel > 0) errHere = true;
387  else if (strict && (Utf16SecondSurrogate <= cp && cp <= Utf16SecondSurrogate + 1023)) errHere = true;
388  // Update 'expectedDest' and 'expectedRetVal'.
389  if (! expectedAbort) {
390  if (! errHere) {
391  if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
392  else { expectedDest.Add(cp); expectedRetVal += 1; } }
393  else if (errorHandling == uehReplace) {
394  expectedDest.Add(replacementChar); }
395  if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
396  // Update 'src'.
397  if (nWords == 1) src.Add(swap ? SwapBytes(cp) : cp);
398  else {
399  int c1 = ((cp - 0x10000) >> 10) & 1023; c1 += Utf16FirstSurrogate;
400  int c2 = (cp - 0x10000) & 1023; c2 += Utf16SecondSurrogate;
401  src.Add(swap ? SwapBytes(c1) : c1);
402  if (nToDel == 0) src.Add(swap ? SwapBytes(c2) : c2); }
403  }
404  if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
405  TestUtf16(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, bomHandling, defaultByteOrder, false, f);
406 }
407 
409 {
410  TIntV utf16ReplCh; utf16ReplCh.Add(replacementChar);
411  for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
412  for (int strict_ = 0; strict_ < 2; strict_++)
413  for (int errMode_ = 0; errMode_ < 4; errMode_++)
414  for (int bomHandling_ = 0; bomHandling_ < 3; bomHandling_++)
415  for (int byteOrder_ = 0; byteOrder_ < 3; byteOrder_++)
416  for (int insertBom_ = 0; insertBom_ < 2; insertBom_++)
417  {
418  strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
419  bool insertBom = (insertBom_ == 1);
420  TUniByteOrder byteOrder = (TUniByteOrder) byteOrder_;
421  TUtf16BomHandling bomHandling = (TUtf16BomHandling) bomHandling_;
422  TRnd rnd = TRnd(123);
423  // Test DecodeUtf16 on various random UTF-16-encoded sequences.
424  for (int i = 0; i < 10; i++)
425  {
426  TestDecodeUtf16(rnd, "A", bomHandling, byteOrder, insertBom);
427  TestDecodeUtf16(rnd, "AAA", bomHandling, byteOrder, insertBom);
428  TestDecodeUtf16(rnd, "B", bomHandling, byteOrder, insertBom);
429  TestDecodeUtf16(rnd, "DDAADADAAADDDAA", bomHandling, byteOrder, insertBom);
430  TestDecodeUtf16(rnd, "DEEEDAAEEDADEEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
431  TestDecodeUtf16(rnd, "DEaEaEDAAEaEDADEaEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
432  TestDecodeUtf16(rnd, "CABDEBACCEaB", bomHandling, byteOrder, insertBom);
433  TestDecodeUtf16(rnd, "EaEEEEaBBACABXABYXXEaYDDXBDCEA", bomHandling, byteOrder, insertBom);
434  TestDecodeUtf16(rnd, "EaEEEEaBDCAAXADYXXEaYDDXDCEA", bomHandling, byteOrder, insertBom);
435  }
436  //continue;
437  // Test both DecodeUtf16 and EncodeUtf16 systematically on various characters
438  // close to powers of 2.
439  TIntV src, expectedDest, src2;
440  expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
441  for (int pow = 8; pow <= 32; pow++)
442  {
443  uint uFrom, uTo;
444  if (pow == 8) uFrom = 0, uTo = 1u << pow;
445  else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
446  else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
447  printf("%u..%u \r", uFrom, uTo);
448  for (uint u = uFrom; ; u++)
449  {
450  int nWords = 0;
451  if (u < 0x10000) nWords = 1;
452  else nWords = 2;
453  bool isMachineLe = IsMachineLittleEndian(), isDestLe = (byteOrder == boLittleEndian || (byteOrder == boMachineEndian && isMachineLe));
454  bool swap = (isMachineLe != isDestLe);
455  bool err = (u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023) || (strict && Utf16SecondSurrogate <= u && u <= Utf16SecondSurrogate + 1023);
456  src.Gen(3, (err ? 0 : nWords) + (insertBom ? 1 : 0));
457  if (insertBom) src[0] = (swap ? 0xfffe : 0xfeff);
458  if (! ((u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023)))
459  {
460  // Try to encode 'u' and see if it gets decoded correctly.
461  if (nWords == 1) src[insertBom ? 1 : 0] = (swap ? SwapBytes(u) : u);
462  else {
463  int u1 = Utf16FirstSurrogate + (((u - 0x10000) >> 10) & 1023);
464  int u2 = Utf16SecondSurrogate + ((u - 0x10000) & 1023);
465  src[insertBom ? 1 : 0] = (swap ? SwapBytes(u1) : u1);
466  src[insertBom ? 2 : 1] = (swap ? SwapBytes(u2) : u2); }
467  if (! ((u == 0xfffe || u == 0xfeff) && bomHandling == bomAllowed && ! insertBom)) // this will just create a mess when decoding
468  {
469  expectedDest.Reserve(2, 0);
470  if (insertBom && ! skipBom) expectedDest.Add(0xfeff);
471  if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
472  else if (! err) expectedDest.Add(u);
473  int erv = (err ? 0 : expectedDest.Len());
474  if (skipBom && (u == 0xfeff || u == 0xfffe) && ! insertBom) expectedDest.Clr(), erv = 0;
475  bool errD = err;
476  if (bomHandling == bomRequired && ! insertBom) {
477  expectedDest.Clr(false);
478  if (u == 0xfeff || u == 0xfffe) { erv = (skipBom ? 0 : 1); if (! skipBom) expectedDest.Add(0xfeff); }
479  else { erv = -1; errD = true;
480  /*if (errorHandling == uehReplace) expectedDest.Add(replacementChar);*/ }}
481  TestUtf16(true, erv, (errD && errorHandling == uehThrow), src, expectedDest, bomHandling, byteOrder, insertBom, 0);
482  }
483  }
484  // We can also test the UTF-16 encoder.
485  src2[0] = u;
486  if (err) {
487  src.Clr(false); if (insertBom) src.Add(swap ? 0xfffe : 0xfeff);
488  if (errorHandling == uehReplace) {
490  /*if (byteOrder == boBigEndian || (byteOrder == boMachineEndian && ! TUniCodec::IsMachineLittleEndian())) { src.Add((replacementChar >> 8) & 0xff); src.Add(replacementChar & 0xff); }
491  else { src.Add(replacementChar & 0xff); src.Add((replacementChar >> 8) & 0xff); } */
492  }}
493  TestUtf16(false, (err ? 0 : 1) + (insertBom ? 1 : 0), (err && errorHandling == uehThrow), src2, src, bomHandling, byteOrder, insertBom, 0);
494  //
495  if (u == uTo) break;
496  }
497  }
498  }
499 }
500 
501 //-----------------------------------------------------------------------------
502 // TUniCaseFolding
503 //-----------------------------------------------------------------------------
504 
505 void TUniCaseFolding::LoadTxt(const TStr& fileName)
506 {
507  Clr();
508  TUniChDb::TUcdFileReader reader; reader.Open(fileName);
509  TStrV fields;
510  while (reader.GetNextLine(fields))
511  {
512  int cp = reader.ParseCodePoint(fields[0]);
513  const TStr status = fields[1], mapsTo = fields[2];
514  if (status == "C" || status == "S" || status == "T") {
515  TIntH &dest = (status == "C" ? cfCommon : status == "S" ? cfSimple : cfTurkic);
516  IAssert(! dest.IsKey(cp));
517  int cp2 = reader.ParseCodePoint(mapsTo);
518  dest.AddDat(cp, cp2); }
519  else if (status == "F") {
520  TIntIntVH &dest = cfFull;
521  IAssert(! dest.IsKey(cp));
522  TIntV cps; reader.ParseCodePointList(mapsTo, cps); IAssert(cps.Len() > 0);
523  dest.AddDat(cp, cps); }
524  else
525  FailR(status.CStr());
526  }
527  printf("TUniCaseFolding(\"%s\"): %d common, %d simple, %d full, %d Turkic.\n",
528  fileName.CStr(), cfCommon.Len(), cfSimple.Len(), cfFull.Len(), cfTurkic.Len());
529 }
530 
531 void TUniCaseFolding::Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f)
532 {
533  fprintf(f, "TUniCaseFolding(%s%s): ", (full ? "full" : "simple"), (turkic ? ", turkic" : ""));
534  for (int i = 0; i < src.Len(); i++) fprintf(f, " %04x", int(src[i]));
535  TIntV dest; Fold(src, 0, src.Len(), dest, true, full, turkic);
536  fprintf(f, "\n -> ");
537  for (int i = 0; i < dest.Len(); i++) fprintf(f, " %04x", int(dest[i]));
538  fprintf(f, "\n");
539  IAssert(dest.Len() == expectedDest.Len());
540  for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]);
541 }
542 
543 /*
544 void TUniCaseFolding::Test(const TIntV& src, FILE *f) {
545  Test(src, false, false, f); Test(src, false, true, f);
546  Test(src, true, false, f); Test(src, true, true, f); }
547 */
548 
550 {
551  FILE *f = stderr;
552  TVectorBuilder VB;
553  // simple
554  Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x69, 0x63, 0xdf), false, false, f);
555  // simple + turkic
556  Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x131, 0x63, 0xdf), false, true, f);
557  // full
558  Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x69, 0x63, 0x73, 0x73), true, false, f);
559  // full + turkic
560  Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x131, 0x63, 0x73, 0x73), true, true, f);
561 }
562 
563 //-----------------------------------------------------------------------------
564 // TUniChInfo
565 //-----------------------------------------------------------------------------
566 
567 // UAX #14
573 
574 //-----------------------------------------------------------------------------
575 // TUniChDb -- word breaking
576 //-----------------------------------------------------------------------------
577 
578 // Test driver for WbFind*NonIgnored.
579 void TUniChDb::TestWbFindNonIgnored(const TIntV& src) const
580 {
581  int n = src.Len();
582  TBoolV isIgnored; isIgnored.Gen(n);
583  for (int i = 0; i < n; i++) isIgnored[i] = IsWbIgnored(src[i]);
584  TIntV prevNonIgnored, nextNonIgnored, curOrNextNonIgnored;
585  prevNonIgnored.Gen(n); nextNonIgnored.Gen(n); curOrNextNonIgnored.Gen(n);
586  FILE *f = 0; // stderr;
587  for (int srcIdx = 0; srcIdx < n; srcIdx++) for (int srcLen = 1; srcLen < n - srcIdx; srcLen++)
588  {
589  int prev = -1;
590  for (int i = 0; i < srcLen; i++) {
591  prevNonIgnored[i] = prev;
592  if (! isIgnored[srcIdx + i]) prev = srcIdx + i; }
593  int next = srcIdx + srcLen;
594  for (int i = srcLen - 1; i >= 0; i--) {
595  nextNonIgnored[i] = next;
596  if (! isIgnored[srcIdx + i]) next = srcIdx + i;
597  curOrNextNonIgnored[i] = next; }
598  if (f) {
599  fprintf(f, "\nIndex: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", srcIdx + i);
600  fprintf(f, "\nNonIgn: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %s", (isIgnored[srcIdx + i] ? " ." : " Y"));
601  fprintf(f, "\nPrevNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(prevNonIgnored[i]));
602  fprintf(f, "\nNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(nextNonIgnored[i]));
603  fprintf(f, "\nCurNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(curOrNextNonIgnored[i]));
604  fprintf(f, "\n"); }
605  for (int i = 0; i < srcLen; i++)
606  {
607  size_t s;
608  s = size_t(srcIdx + i); WbFindNextNonIgnored(src, s, size_t(srcIdx + srcLen));
609  IAssert(s == size_t(nextNonIgnored[i]));
610  s = size_t(srcIdx + i); WbFindCurOrNextNonIgnored(src, s, size_t(srcIdx + srcLen));
611  IAssert(s == size_t(curOrNextNonIgnored[i]));
612  s = size_t(srcIdx + i); bool ok = WbFindPrevNonIgnored(src, size_t(srcIdx), s);
613  if (prevNonIgnored[i] < 0) { IAssert(! ok); IAssert(s == size_t(srcIdx)); }
614  else { IAssert(ok); IAssert(s == size_t(prevNonIgnored[i])); }
615  }
616  }
617 }
618 
620 {
621  TIntV chIgnored, chNonIgnored;
622  FILE *f = 0; // stderr;
623  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) {
624  const int cp = h.GetKey(i); const TUniChInfo& ci = h[i];
625  if (f) fprintf(f, "%04x: flags %08x props %08x %08x script \"%s\"\n", cp,
627  (IsWbIgnored(h[i]) ? chIgnored : chNonIgnored).Add(h.GetKey(i));
628  }
629  chIgnored.Sort(); chNonIgnored.Sort();
630  printf("TUniChDb::TestWbNonIgnored: %d ignored, %d nonignored chars.\n", chIgnored.Len(), chNonIgnored.Len());
631  TRnd rnd = TRnd(123);
632  for (int iter = 0; iter <= 50; iter++)
633  {
634  int percIgnored = 2 * iter;
635  for (int n = 0; n <= 20; n++)
636  {
637  // Prepare a random sequence of 'n' codepoints.
638  TIntV v; v.Gen(n);
639  for (int i = 0; i < n; i++) {
640  TIntV& chars = (rnd.GetUniDevInt(100) < percIgnored) ? chIgnored : chNonIgnored;
641  int j = rnd.GetUniDevInt(chars.Len());
642  v.Add(chars[j]); }
643  // Run the tests with this sequence.
645  }
646  }
647 }
648 
// Tests FindNextWordBoundary / FindWordBoundaries (or, when 'sentence' is true,
// FindNextSentenceBoundary / FindSentenceBoundaries) against the break-test
// file located under basePath + GetAuxiliaryDir().
//   basePath - directory containing the Unicode data files.
//   sentence - true selects the sentence-break test file and functions,
//              false selects the word-break ones.
// Every test line is replayed five times, with 0..4 random codepoints
// prepended. A mismatch between the expected and the predicted boundaries is
// printed to stderr and followed by Fail.
649 void TUniChDb::TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence)
650 {
651  TUcdFileReader reader; TStrV fields;
652  reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), (sentence ? GetSentenceBreakTestFn() : GetWordBreakTestFn())));
653  int nLines = 0; TRnd rnd = TRnd(123);
654  while (reader.GetNextLine(fields))
655  {
656  nLines += 1;
657  IAssert(fields.Len() == 1);
658  TStrV parts; fields[0].SplitOnWs(parts);
659  const int n = parts.Len(); IAssert((n % 2) == 1);
660  TIntV chars; TBoolV isBreak, isPredicted, isPredicted2;
661  // Each line is a sequence of codepoints, with a \times or \div in between each
662  // pair of codepoints (as well as at the beginning and the end of the sequence) to
663  // indicate whether a boundary exists there or not.
664  for (int i = 0; i < n; i++)
665  {
666  const TStr& s = parts[i];
// Even-indexed tokens are boundary markers, odd-indexed tokens are codepoints.
667  if ((i % 2) == 0) {
668  if (s == "\xc3\x97") // multiplication sign (U+00D7) in UTF-8
669  isBreak.Add(false);
670  else if (s == "\xc3\xb7") // division sign (U+00F7) in UTF-8
671  isBreak.Add(true);
672  else FailR(s.CStr()); }
673  else chars.Add(reader.ParseCodePoint(s));
674  }
// m codepoints are surrounded by m + 1 boundary slots; the first and the last
// slot must be marked as breaks.
675  const int m = n / 2; IAssert(chars.Len() == m); IAssert(isBreak.Len() == m + 1);
676  IAssert(isBreak[0]); IAssert(isBreak[m]);
677  isPredicted.Gen(m + 1); isPredicted.PutAll(false);
678  if (AlwaysFalse()) { printf("%3d", nLines); for (int i = 0; i < m; i++) printf(" %04x", int(chars[i])); printf("\n"); }
679  // We'll insert a few random characters at the beginning of the sequence
680  // so that srcPos doesn't always begin at 0.
681  for (int nBefore = 0; nBefore < 5; nBefore++)
682  {
// NOTE(review): this uses a two-argument TIntV::Add; confirm that it stores the
// random codepoint rather than treating the second argument as something else
// (e.g. a resize hint) and storing 0.
683  TIntV chars2; for (int i = 0; i < nBefore; i++) chars2.Add(0, rnd.GetUniDevInt(0x10ffff + 1));
684  chars2.AddV(chars);
685  // Use FindNextBoundary to find all the word boundaries.
// When a prefix exists, the scan starts on its last codepoint (nBefore - 1).
686  size_t position = (nBefore > 0 ? nBefore - 1 : nBefore); size_t prevPosition = position;
687  while (sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position))
688  {
689  IAssert(prevPosition < position);
690  IAssert(position <= size_t(nBefore + m));
691  isPredicted[int(position) - nBefore] = true;
692  prevPosition = position;
693  }
694  IAssert(position == size_t(nBefore + m));
// Cross-check against the all-at-once variant, which fills isPredicted2.
695  if (sentence) FindSentenceBoundaries(chars2, nBefore, m, isPredicted2);
696  else FindWordBoundaries(chars2, nBefore, m, isPredicted2);
697  IAssert(isPredicted2.Len() == m + 1);
698  bool ok = true;
699  // If we start at 0, the word boundary at the beginning of the sequence was
700  // not found explicitly, so we'll add it now.
701  if (nBefore == 0) isPredicted[0] = true;
702  // Compare the predicted and the true boundaries.
703  for (int i = 0; i <= m; i++) {
704  if (isBreak[i] != isPredicted[i]) ok = false;
705  IAssert(isPredicted2[i] == isPredicted[i]); }
706  FILE *f = stderr;
707  if (! ok)
708  {
709  fprintf(f, "\nError in line %d:\n", nLines);
710  fprintf(f, "True: ");
// NOTE(review): 'chars' holds exactly m elements (asserted above), so
// chars[i + nBefore] here and in the "Predicted" loop below can index past the
// end when nBefore > 0; chars2[i + nBefore] or chars[i] looks intended -- confirm.
711  for (int i = 0; i <= m; i++) {
712  fprintf(f, "%s ", (isBreak[i] ? "|" : "."));
713  if (i < m) fprintf(f, "%04x ", int(chars[i + nBefore])); }
714  fprintf(f, "\nPredicted: ");
715  for (int i = 0; i <= m; i++) {
716  fprintf(f, "%s ", (isPredicted[i] ? "|" : "."));
717  if (i < m) {
718  const int cp = chars[i + nBefore];
// NOTE(review): the declaration of 's' (listing line 719) is missing from this listing.
720  if (IsWbIgnored(cp)) s = "*" + s;
721  fprintf(f, "%4s ", s.CStr()); }}
722  fprintf(f, "\n");
723  Fail;
724  }
725  // Test FindNextBoundary if we start in the middle of the sequence,
726  // i.e. not at an existing boundary.
727  for (int i = 0; i < m; i++) {
728  position = i + nBefore; bool ok = sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position);
729  IAssert(ok); // at the very least, there should be the 'boundary' at nBefore + m
730  IAssert(size_t(i + nBefore) < position); IAssert(position <= size_t(nBefore + m));
// Convert back to an index into isBreak: nothing between the start and the
// reported position may be a true break, and the reported position must be one.
731  position -= nBefore;
732  for (int j = i + 1; j < int(position); j++)
733  IAssert(! isBreak[j]);
734  IAssert(isBreak[int(position)]); }
735  }
736  }
737  reader.Close();
738  printf("TUniChDb::TestFindNext%sBoundary: %d lines processed.\n", (sentence ? "Sentence" : "Word"), nLines);
739 }
740 
741 //-----------------------------------------------------------------------------
742 // TUniChDb -- composition and decomposition
743 //-----------------------------------------------------------------------------
744 
745 void TUniChDb::TestComposition(const TStr& basePath)
746 {
747  TUcdFileReader reader; TStrV fields; int nLines = 0;
748  reader.Open(CombinePath(basePath, GetNormalizationTestFn()));
749  bool inPart1 = false; TIntH testedInPart1;
750  while (reader.GetNextLine(fields))
751  {
752  nLines += 1;
753  if (fields.Len() == 1) {
754  IAssert(fields[0].IsPrefix("@Part"));
755  inPart1 = (fields[0] == "@Part1"); continue; }
756  IAssert(fields.Len() == 6);
757  IAssert(fields[5].Len() == 0);
758  TIntV c1, c2, c3, c4, c5;
759  reader.ParseCodePointList(fields[0], c1);
760  reader.ParseCodePointList(fields[1], c2);
761  reader.ParseCodePointList(fields[2], c3);
762  reader.ParseCodePointList(fields[3], c4);
763  reader.ParseCodePointList(fields[4], c5);
764  TIntV v;
765 #define AssE_(v1, v2, expl) AssertEq(v1, v2, TStr(expl) + " (line " + TInt::GetStr(nLines) + ")", 0)
766 #define NFC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFC(" #operand ")")
767 #define NFD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFD(" #operand ")")
768 #define NFKC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKC(" #operand ")")
769 #define NFKD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKD(" #operand ")")
770  // NFD:
771  NFD_(c3, c1); // c3 == NFD(c1)
772  NFD_(c3, c2); // c3 == NFD(c2)
773  NFD_(c3, c3); // c3 == NFD(c3)
774  NFD_(c5, c4); // c5 == NFD(c4)
775  NFD_(c5, c5); // c5 == NFD(c5)
776  // NFC:
777  NFC_(c2, c1); // c2 == NFC(c1)
778  NFC_(c2, c2); // c2 == NFC(c2)
779  NFC_(c2, c3); // c2 == NFC(c3)
780  NFC_(c4, c4); // c4 == NFC(c4)
781  NFC_(c4, c5); // c4 == NFC(c5)
782  // NFKD:
783  NFKD_(c5, c1); // c5 == NFKD(c1)
784  NFKD_(c5, c2); // c5 == NFKD(c2)
785  NFKD_(c5, c3); // c5 == NFKD(c3)
786  NFKD_(c5, c4); // c5 == NFKD(c4)
787  NFKD_(c5, c5); // c5 == NFKD(c5)
788  // NFKC:
789  NFKC_(c4, c1); // c4 == NFKC(c1)
790  NFKC_(c4, c2); // c4 == NFKC(c2)
791  NFKC_(c4, c3); // c4 == NFKC(c3)
792  NFKC_(c4, c4); // c4 == NFKC(c4)
793  NFKC_(c4, c5); // c4 == NFKC(c5)
794  //
795  if (inPart1) {
796  IAssert(c1.Len() == 1);
797  testedInPart1.AddKey(c1[0]); }
798  }
799  reader.Close();
800  // Test other individual codepoints that were not mentioned in part 1.
801  int nOther = 0;
802  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
803  {
804  const int cp = h.GetKey(i), nLines = -1;
805  if (testedInPart1.IsKey(cp)) continue;
806  TIntV x, v; x.Add(cp);
807  NFC_(x, x); // x == NFC(x)
808  NFD_(x, x); // x == NFD(x)
809  NFKC_(x, x); // x == NFKC(x)
810  NFKD_(x, x); // x == NFKD(x)
811  nOther += 1;
812  }
813 #undef AssE_
814 #undef NFC_
815 #undef NFD_
816 #undef NFKC_
817 #undef NFKD_
818  printf("TUniChDb::TestComposition: %d lines processed + %d other individual codepoints.\n", nLines, nOther);
819 }
820 
821 //-----------------------------------------------------------------------------
822 // TUniChDb -- case conversion tests
823 //-----------------------------------------------------------------------------
824 
// Converts 'source' to lowercase, titlecase and uppercase with GetCaseConverted
// and compares each result with the expected one.
//   source                 - input text; the expected strings are parsed as codepoint
//                            lists, presumably 'source' is too (that line is missing below).
//   trueLc, trueTc, trueUc - expected lowercase / titlecase / uppercase results,
//                            parsed with TUcdFileReader::ParseCodePointList.
//   turkic, lithuanian     - passed through to GetCaseConverted.
// On a mismatch the source, the expected and the actual codepoints are printed
// to stderr in hex and IAssert(ok) fires.
825 void TUniChDb::TestCaseConversion(const TStr& source, const TStr& trueLc,
826  const TStr& trueTc, const TStr& trueUc,
827  bool turkic, bool lithuanian)
828 {
829  TIntV src;
// NOTE(review): listing line 830 is missing here; presumably it fills 'src' from 'source' -- confirm.
831  FILE *f = stderr;
// One pass per conversion kind: 0 = lower, 1 = title, 2 = upper.
832  for (int i = 0; i < 3; i++)
833  {
834  TCaseConversion how = (i == 0) ? ccLower : (i == 1) ? ccTitle : ccUpper;
835  const TStr &trueDestS = (how == ccLower ? trueLc : how == ccTitle ? trueTc : trueUc);
836  TIntV trueDest; TUcdFileReader::ParseCodePointList(trueDestS, trueDest);
837  TIntV dest;
838  GetCaseConverted(src, 0, src.Len(), dest, true, how, turkic, lithuanian);
// Element-wise comparison; the inner 'i' shadows the outer loop variable.
839  bool ok = (dest.Len() == trueDest.Len());
840  if (ok) for (int i = 0; i < dest.Len() && ok; i++) ok = ok && (dest[i] == trueDest[i]);
841  if (ok) continue;
// Mismatch: dump the input, the expected output and our output, then fail.
842  fprintf(f, "%s(", (how == ccLower ? "toLowercase" : how == ccTitle ? "toTitlecase" : "toUppercase"));
843  for (int i = 0; i < src.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(src[i]));
844  fprintf(f, ")\nCorrect: (");
845  for (int i = 0; i < trueDest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(trueDest[i]));
846  fprintf(f, ")\nOur output:(");
847  for (int i = 0; i < dest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(dest[i]));
848  fprintf(f, ")\n");
849  IAssert(ok);
850  }
851 }
852 
854 {
// NOTE(review): the function header (listing line 853) and the opening line of
// every call below (listing lines 863, 871, 879, 885, 891, 900, 907, 915, 922)
// are missing from this listing. Each remaining argument group is
// (source, lowercase, titlecase, uppercase, turkic, lithuanian), which matches
// the parameter list of TestCaseConversion -- presumably that is the callee.
// All strings are space-separated hexadecimal codepoints built by concatenating
// the named constants declared below.
855  // Because no thorough case-conversion test files have been provided as part
856  // of the Unicode standard, we'll have to test things on a few test cases of our own.
857  // - First, test some unconditional special mappings, such as 'ss', 'ffl', 'dz', etc.
858  const TStr F = "0046 ", L = "004C ", S = "0053 ", T = "0054 ", W = "0057 ";
859  const TStr f = "0066 ", l = "006c ", s = "0073 ", t = "0074 ", w = "0077 ";
860  const TStr ss = "00df ", ffl = "fb04 ", longs = "017f ", longst = "fb05 ", wRing = "1e98 ", Ring = "030a ";
861  const TStr DZ = "01c4 ", Dz = "01c5 ", dz = "01c6 ";
862  const TStr space = "0020 ", Grave = "0300 ";
864  F + L + s + t + space + Dz + w + T + ss + wRing + space + longs + DZ + space + dz + longst, // source
865  f + l + s + t + space + dz + w + t + ss + wRing + space + longs + dz + space + dz + longst, // lowercase
866  F + l + s + t + space + Dz + w + t + ss + wRing + space + S + dz + space + Dz + longst, // titlecase
867  F + L + S + T + space + DZ + W + T + S + S + W + Ring + space + S + DZ + space + DZ + S + T, // uppercase
868  false, false);
869  // - Dotted I, dotless i, etc., but with turkic == false.
870  const TStr I = "0049 ", J = "004a ", i = "0069 ", j = "006a ", iDotless = "0131 ", IDot = "0130 ", DotA = "0307 ";
872  s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + s, // source
873  s + i + t + i + w + iDotless + f + i + DotA + l + space + iDotless + DotA + f + i + DotA + s, // lowercase
874  S + i + t + i + w + iDotless + f + i + DotA + l + space + I + DotA + f + i + DotA + s, // titlecase
875  S + I + T + I + W + I + F + IDot + L + space + I + DotA + F + I + DotA + S, // uppercase
876  false, false);
877  // - Sigma (final vs. non-final forms).
878  const TStr Sigma = "03a3 ", sigma = "03c3 ", fsigma = "03c2 ";
880  Sigma + s + space + s + Sigma + space + s + Sigma + s + space + Sigma + S + Sigma + space + Sigma, // source
881  sigma + s + space + s + fsigma + space + s + sigma + s + space + sigma + s + fsigma + space + sigma, // lowercase
882  Sigma + s + space + S + fsigma + space + S + sigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase
883  Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
884  false, false);
886  sigma + s + space + s + sigma + space + s + sigma + s + space + sigma + S + sigma + space + sigma, // source
887  sigma + s + space + s + sigma + space + s + sigma + s + space + sigma + s + sigma + space + sigma, // lowercase
888  Sigma + s + space + S + sigma + space + S + sigma + s + space + Sigma + s + sigma + space + Sigma, // titlecase
889  Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
890  false, false);
892  fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + S + fsigma + space + fsigma, // source
893  fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + s + fsigma + space + fsigma, // lowercase
894  Sigma + s + space + S + fsigma + space + S + fsigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase
895  Sigma + S + space + S + Sigma + space + S + Sigma + S + space + Sigma + S + Sigma + space + Sigma, // uppercase
896  false, false);
897  const TStr nonSA = "0315 0321 0322 "; // characters that are neither ccStarter nor ccAbove
898  // Special case mappings for Turkic languages:
899  // - After_I
901  s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + s, // source
902  s + iDotless + t + i + w + iDotless + f + i + l + space + iDotless + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // lowercase
903  S + iDotless + t + i + w + iDotless + f + i + l + space + I + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // titlecase
904  S + I + T + IDot + W + I + F + IDot + L + space + I + DotA + F + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + S, // uppercase
905  true, false); // turkic
906  // - Not_Before_Dot
908  I + Grave + t + I + DotA + f + I + nonSA + DotA + j + space + I + nonSA + DotA + space + I + Grave + t, // source
909  iDotless + Grave + t + i + f + i + nonSA + j + space + i + nonSA + space + iDotless + Grave + t, // lowercase
910  I + Grave + t + i + f + i + nonSA + j + space + I + nonSA + DotA + space + I + Grave + t, // titlecase
911  I + Grave + T + I + DotA + F + I + nonSA + DotA + J + space + I + nonSA + DotA + space + I + Grave + T, // uppercase
912  true, false); // turkic
913  // Special case mappings for Lithuanian:
914  // - After_Soft_Dotted [note: I + DotA turns into i + DotA + DotA when lowercasing due to More_Above]
916  i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + I + DotA + t + DotA + i + DotA + Grave, // source
917  i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // lowercase
918  I + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // titlecase
919  I + T + I + Grave + DotA + F + I + DotA + F + I + nonSA + I + DotA + T + DotA + I + Grave, // uppercase
920  false, true); // lithuanian
921  // - More_Above [note: j + DotA turns into just J when uppercasing due to After_Soft_Dotted]
923  J + Grave + space + J + nonSA + DotA + space + j + Grave + space + j + DotA + space + J + nonSA + J + nonSA + Grave + space + j + nonSA, // source
924  j + DotA + Grave + space + j + DotA + nonSA + DotA + space + j + Grave + space + j + DotA + space + j + nonSA + j + DotA + nonSA + Grave + space + j + nonSA, // lowercase
925  J + Grave + space + J + nonSA + DotA + space + J + Grave + space + J + space + J + nonSA + j + DotA + nonSA + Grave + space + J + nonSA, // titlecase
926  J + Grave + space + J + nonSA + DotA + space + J + Grave + space + J + space + J + nonSA + J + nonSA + Grave + space + J + nonSA, // uppercase
927  false, true); // lithuanian
928  // SoftDotted [^ Starter Above]* 0307 --(uc,tc)--> without 0307
929  // SoftDotted [^ Starter Above]* 0307 --(
930  //TestCaseConversion("", "", "", "", false, false);
931 }
932 
933 //-----------------------------------------------------------------------------
934 // TUniChDb -- initialization from the text files
935 //-----------------------------------------------------------------------------
936 
938 {
// NOTE(review): the function header (listing line 937) and listing lines 942,
// 944, 946 and 947 are missing from this listing. LoadTxt calls this helper as
// LoadTxt_ProcessDecomposition(ci, fields[5]); where 'dec' is declared and
// filled cannot be seen from here.
// An empty decomposition field means there is nothing to do.
939  if (s.Empty()) return;
// A field starting with '<' carries a "<...>" tag; the text after the closing
// '>' is kept (and trimmed) as the mapping proper.
940  if (s[0] == '<') {
941  int i = s.SearchCh('>'); IAssert(i > 0);
943  s = s.GetSubStr(i + 1, s.Len() - 1); s.ToTrunc(); }
945  IAssert(dec.Len() > 0);
948 }
949 
// Reads the property-list file named by GetPropListFn() under 'basePath' and
// sets the corresponding property flag on every listed codepoint in 'h'.
// Each line must have two fields: a codepoint range and a property name.
// Names map either to a ucfPr* value (stored in 'prop', applied with
// SetProperty) or to a ucfPx* value (stored in 'propx', applied with
// SetPropertyX); an unrecognised name triggers FailR. Codepoints not yet in
// 'h' are added and handed to helper.SetCat. Prints a summary to stdout.
950 void TUniChDb::InitPropList(const TStr& basePath)
951 {
952  TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0;
953  reader.Open(CombinePath(basePath, GetPropListFn()));
954  TSubcatHelper helper(*this);
955  while (reader.GetNextLine(fields))
956  {
957  IAssert(fields.Len() == 2);
958  int from, to; reader.ParseCodePointRange(fields[0], from, to);
959  TStr s = fields[1];
// NOTE(review): the declaration of 'prop' and 'propx' (listing line 960) is
// missing from this listing; the tests 'if (prop)' / 'if (propx)' further down
// suggest both start out as zero -- confirm.
961  if (s == "White_Space") prop = ucfPrWhiteSpace;
962  else if (s == "Bidi_Control") prop = ucfPrBidiControl;
963  else if (s == "Join_Control") prop = ucfPrJoinControl;
964  else if (s == "Dash") prop = ucfPrDash;
965  else if (s == "Hyphen") prop = ucfPrHyphen;
966  else if (s == "Quotation_Mark") prop = ucfPrQuotationMark;
967  else if (s == "Terminal_Punctuation") prop = ucfPrTerminalPunctuation;
968  else if (s == "Other_Math") propx = ucfPxOtherMath;
969  else if (s == "Hex_Digit") prop = ucfPrHexDigit;
970  else if (s == "ASCII_Hex_Digit") prop = ucfPrAsciiHexDigit;
971  else if (s == "Other_Alphabetic") propx = ucfPxOtherAlphabetic;
972  else if (s == "Ideographic") prop = ucfPrIdeographic;
973  else if (s == "Diacritic") prop = ucfPrDiacritic;
974  else if (s == "Extender") prop = ucfPrExtender;
975  else if (s == "Other_Lowercase") propx = ucfPxOtherLowercase;
976  else if (s == "Other_Uppercase") propx = ucfPxOtherUppercase;
977  else if (s == "Noncharacter_Code_Point") prop = ucfPrNoncharacterCodePoint;
978  else if (s == "Other_Grapheme_Extend") propx = ucfPxOtherGraphemeExtend;
979  else if (s == "IDS_Binary_Operator") propx = ucfPxIdsBinaryOperator;
980  else if (s == "IDS_Trinary_Operator") propx = ucfPxIdsTrinaryOperator;
981  else if (s == "Radical") propx = ucfPxRadical;
982  else if (s == "Unified_Ideograph") propx = ucfPxUnifiedIdeograph;
983  else if (s == "Other_Default_Ignorable_Code_Point") propx = ucfPxOtherDefaultIgnorableCodePoint;
984  else if (s == "Deprecated") prop = ucfPrDeprecated;
985  else if (s == "Soft_Dotted") prop = ucfPrSoftDotted;
986  else if (s == "Logical_Order_Exception") prop = ucfPrLogicalOrderException;
987  else if (s == "Other_ID_Start") propx = ucfPxOtherIdStart;
988  else if (s == "Other_ID_Continue") propx = ucfPxOtherIdContinue;
989  else if (s == "STerm") prop = ucfPrSTerm;
990  else if (s == "Variation_Selector") prop = ucfPrVariationSelector;
991  else if (s == "Pattern_White_Space") prop = ucfPrPatternWhiteSpace;
992  else if (s == "Pattern_Syntax") prop = ucfPrPatternSyntax;
993  else FailR(s.CStr());
994  helper.ProcessComment(reader);
// Apply the flag to the whole range, creating entries for unknown codepoints.
995  for (int cp = from; cp <= to; cp++) {
996  int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
997  TUniChInfo &ci = h[i]; helper.TestCat(cp);
// A flag that is already set means the file listed this codepoint twice for
// the same property.
998  if (prop) { IAssert(! ci.IsProperty(prop)); ci.SetProperty(prop); }
999  if (propx) { IAssert(! ci.IsPropertyX(propx)); ci.SetPropertyX(propx); }
1000  nCps++; }
1001  nLines++;
1002  }
1003  reader.Close();
1004  printf("TUniChDb::InitPropList: %d lines, %d code points.\n", nLines, nCps);
1005 }
1006 
1008 {
// NOTE(review): the function header (listing line 1007) is missing from this
// listing; the name InitDerivedCoreProperties is taken from the printf at the end.
// Reads the file named by GetDerivedCorePropsFn() under 'basePath' and sets the
// matching ucfDcp* flag on every listed codepoint in 'h'.
1009  TUcdFileReader reader; TStrV fields; int nCps = 0, nLines = 0;
1010  reader.Open(CombinePath(basePath, GetDerivedCorePropsFn()));
1011  TSubcatHelper helper(*this);
1012  while (reader.GetNextLine(fields))
1013  {
1014  IAssert(fields.Len() == 2);
1015  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1016  TStr s = fields[1];
// NOTE(review): the declaration of 'flag' (listing line 1017) is missing from this listing.
1018  if (s == "Math") flag = ucfDcpMath;
1019  else if (s == "Alphabetic") flag = ucfDcpAlphabetic;
1020  else if (s == "Lowercase") flag = ucfDcpLowercase;
1021  else if (s == "Uppercase") flag = ucfDcpUppercase;
1022  else if (s == "ID_Start") flag = ucfDcpIdStart;
1023  else if (s == "ID_Continue") flag = ucfDcpIdContinue;
1024  else if (s == "XID_Start") flag = ucfDcpXidStart;
1025  else if (s == "XID_Continue") flag = ucfDcpXidContinue;
1026  else if (s == "Default_Ignorable_Code_Point") flag = ucfDcpDefaultIgnorableCodePoint;
1027  else if (s == "Grapheme_Extend") flag = ucfDcpGraphemeExtend;
1028  else if (s == "Grapheme_Base") flag = ucfDcpGraphemeBase;
1029  else if (s == "Grapheme_Link") continue; // this flag is deprecated; test for combClass == Virama instead
1030  else FailR(s.CStr());
1031  // If we add new codepoints to the hash table, we should also set their category.
1032  // This is supposed to be provided in the comment, e.g. "# Cf SOFT HYPHEN".
1033  helper.ProcessComment(reader);
1034  //
1035  for (int cp = from; cp <= to; cp++) {
1036  int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
1037  helper.TestCat(cp);
// The flag must not be set yet, i.e. no codepoint may be listed twice for the
// same property.
1038  TUniChInfo &ci = h[i]; IAssert(! ci.IsDcpFlag(flag));
1039  ci.SetDcpFlag(flag); nCps++; }
// Note: "Grapheme_Link" lines are skipped by the 'continue' above and are
// therefore not counted in nLines.
1040  nLines++;
1041  }
1042  reader.Close();
1043  printf("TUniChDb::InitDerivedCoreProperties: %d lines, %d code points.\n", nLines, nCps);
1044 }
1045 
// Resets the lineBreak member of every entry in 'h' to 'xx' and then reloads
// the values from the file named by GetLineBreakFn() under 'basePath'.
// Each line must have two fields: a codepoint range and a two-character
// line-break class, which TUniChInfo::GetLineBreakCode packs into a ushort.
// Lines whose class equals 'xx' are skipped. Codepoints not yet in 'h' are
// added with a warning. Prints a summary to stdout.
1046 void TUniChDb::InitLineBreaks(const TStr& basePath)
1047 {
1048  // Clear old linebreak values.
// NOTE(review): the declaration of 'xx' (listing line 1049) is missing from this listing.
1050  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) h[i].lineBreak = xx;
1051  // Read LineBreak.txt.
1052  TUcdFileReader reader; TStrV fields;
1053  reader.Open(CombinePath(basePath, GetLineBreakFn()));
1054  int nLines = 0, nCps = 0;
1055  while (reader.GetNextLine(fields))
1056  {
1057  IAssert(fields.Len() == 2);
1058  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1059  TStr s = fields[1]; IAssert(s.Len() == 2);
1060  ushort us = TUniChInfo::GetLineBreakCode(s[0], s[1]);
// 'xx' lines are skipped before the counters are touched, hence the
// "excluding 'xx' values" wording in the summary.
1061  if (us == xx) continue;
1062  for (int cp = from; cp <= to; cp++) {
1063  int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp);
1064  printf("TUniChDb::InitLineBreaks: warning, adding codepoint %d, its category will remain unknown.\n", cp); }
// Each codepoint may receive a non-'xx' class only once.
1065  IAssert(h[i].lineBreak == xx);
1066  h[i].lineBreak = us; nCps++; }
1067  nLines++;
1068  }
1069  reader.Close();
1070  printf("TUniChDb::InitLineBreaks: %d lines, %d codepoints processed (excluding \'xx\' values).\n", nLines, nCps);
1071 }
1072 
// Reads the file named by GetScriptsFn() under 'basePath', registers every
// script name in 'scripts' and stores the script's key id in the 'script'
// member of each listed codepoint. The value kept in 'scripts' for a name is
// incremented once per file line that mentions it. Codepoints not yet in 'h'
// are added and handed to helper.SetCat. Prints the number of scripts to stdout.
1073 void TUniChDb::InitScripts(const TStr& basePath)
1074 {
1075  TUcdFileReader reader; TStrV fields;
1076  reader.Open(CombinePath(basePath, GetScriptsFn()));
1077  TSubcatHelper helper(*this);
1078  while (reader.GetNextLine(fields))
1079  {
// NOTE(review): unlike the sibling Init* readers, there is no
// IAssert(fields.Len() == 2) before fields[1] is accessed.
1080  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1081  TStr scriptName = fields[1];
// Look the script up, registering it with a zero count on first sight.
1082  int scriptNo = scripts.GetKeyId(scriptName);
1083  if (scriptNo < 0) { scriptNo = scripts.AddKey(scriptName); scripts[scriptNo] = 0; }
1084  IAssert(scriptNo >= 0 && scriptNo < SCHAR_MAX); // because TUniChInfo.script is a signed char
1085  scripts[scriptNo] += 1;
1086  helper.ProcessComment(reader);
1087  for (int cp = from; cp <= to; cp++) {
1088  int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
1089  helper.TestCat(cp);
1090  TUniChInfo &ci = h[i]; ci.script = scriptNo; }
1091  }
1092  reader.Close();
// NOTE(review): listing line 1093 is missing from this listing.
1094  printf("TUniChDb::InitScripts: %d scripts: ", scripts.Len());
// Per-script details are printed only if AlwaysFalse() returns true.
1095  if (AlwaysFalse()) for (int i = scripts.FFirstKeyId(); scripts.FNextKeyId(i); )
1096  printf(" %d:%s (%d)", i, scripts.GetKey(i).CStr(), int(scripts[i]));
1097  printf("\n");
1098 }
1099 
1101 {
// NOTE(review): the function header (listing line 1100) and listing lines
// 1114-1115, 1124, 1148, 1155, 1184 and 1191 are missing from this listing.
// LoadTxt calls this function as InitWordAndSentenceBoundaryFlags(basePath).
// Step 1 recomputes the word-boundary (Wb) and sentence-boundary (Sb) flags of
// every entry in 'h' from its category, script, line-break class and
// properties. Steps 2 and 3 rebuild the same flags from a property file into a
// temporary hash 'hh' and call Fail on the first codepoint whose flags differ.
1102  // UAX #29, sec. 4.1 and 5.1.
1103  // Note: these flags can also be initialized from auxiliary\\WordBreakProperty.txt.
1104  int katakana = GetScriptByName(GetScriptNameKatakana()); IAssert(katakana >= 0);
1105  int hiragana = GetScriptByName(GetScriptNameHiragana()); IAssert(hiragana >= 0);
1106  // Clear any existing word-boundary flags and initialize them again.
1107  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1108  {
1109  const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
1110  ci.ClrWbAndSbFlags();
1111  // Word-boundary flags.
1112  if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetWbFlag(ucfWbFormat);
1113  if (ci.script == katakana) ci.SetWbFlag(ucfWbKatakana);
// NOTE(review): listing lines 1114-1115 are missing here.
1116  if (ci.subCat == ucPunctuationConnector) ci.SetWbFlag(ucfWbExtendNumLet);
1117  // Sentence-boundary flags. Some are identical to some word-boundary flags.
1118  if (cp == 0xa || cp == 0xd || cp == 0x85 || cp == 0x2028 || cp == 0x2029) ci.SetSbFlag(ucfSbSep);
1119  if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetSbFlag(ucfSbFormat);
1120  if (ci.IsWhiteSpace() && ! ci.IsSbFlag(ucfSbSep) && cp != 0xa0) ci.SetSbFlag(ucfSbSp);
1121  if (ci.IsLowercase() && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbLower);
1122  if (ci.IsUppercase() || ci.subCat == ucLetterTitlecase) ci.SetSbFlag(ucfSbUpper);
1123  if ((ci.IsAlphabetic() || cp == 0xa0 || cp == 0x5f3) && ! ci.IsSbFlag(ucfSbLower) && ! ci.IsSbFlag(ucfSbUpper) && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbOLetter);
// NOTE(review): listing line 1124 is missing here.
1125  if (cp == 0x2e) ci.SetSbFlag(ucfSbATerm);
1126  // Note: UAX #29 says that if the property STerm = true, then the character should belong to the STerm class for
1127  // the purposes of sentence-boundary detection. Now in PropList.txt there is no doubt that 002E has the STerm
1128  // property; thus, it should also belong to the STerm sentence-boundary class. However, in
1129  // SentenceBreakProperty.txt, 002E is only listed in the ATerm class, but not in the STerm class.
1130  if (ci.IsSTerminal() && cp != 0x2e) ci.SetSbFlag(ucfSbSTerm);
1131  if ((ci.subCat == ucPunctuationOpen || ci.subCat == ucPunctuationClose || ci.lineBreak == TUniChInfo::LineBreak_Quotation) && cp != 0x5f3 && ! ci.IsSbFlag(ucfSbATerm) && ! ci.IsSbFlag(ucfSbSTerm)) ci.SetSbFlag(ucfSbClose);
1132  }
1133  // Some additional characters for Katakana and MidLetter.
// 'VB' starts a comma-built TIntV (see TVectorBuilder2 at the top of the file).
1134  TIntV v = (VB, 0x3031, 0x3032, 0x3033, 0x3034, 0x3035, 0x309b, 0x309c, 0x30a0, 0x30fc, 0xff70, 0xff9e, 0xff9f);
1135  for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbKatakana);
1136  v = (VB, 0x27, 0xb7, 0x5f4, 0x2019, 0x2027, 0x3a);
1137  for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbMidLetter);
1138  // WbALetter depends on Katakana, so it cannot be initialized earlier.
1139  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1140  {
1141  const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
1142  if ((ci.IsAlphabetic() || cp == 0x5f3) && ! ci.IsIdeographic() && ! ci.IsWbFlag(ucfWbKatakana) && ci.lineBreak != TUniChInfo::LineBreak_ComplexContext && ci.script != hiragana && ! ci.IsGraphemeExtend())
1143  ci.SetWbFlag(ucfWbALetter);
1144  }
1145  // An alternative is to extract the flags from WordBreakProperty.txt.
1146  // The results should be the same.
1147  {TUcdFileReader reader; TStrV fields;
// NOTE(review): listing line 1148 (presumably the reader.Open call) is missing here.
1149  THash<TInt, TInt> hh;
1150  while (reader.GetNextLine(fields))
1151  {
1152  IAssert(fields.Len() == 2);
1153  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1154  TStr s = fields[1];
// NOTE(review): the declaration of 'flag' (listing line 1155) is missing here.
1156  if (s == "Format") flag = ucfWbFormat;
1157  else if (s == "Katakana") flag = ucfWbKatakana;
1158  else if (s == "ALetter") flag = ucfWbALetter;
1159  else if (s == "MidLetter") flag = ucfWbMidLetter;
1160  else if (s == "MidNum") flag = ucfWbMidNum;
1161  else if (s == "Numeric") flag = ucfWbNumeric;
1162  else if (s == "ExtendNumLet") flag = ucfWbExtendNumLet;
1163  else FailR(s.CStr());
// Accumulate (OR together) the flags listed for each codepoint.
1164  for (int c = from; c <= to; c++) {
1165  int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
1166  else hh[i].Val |= flag; }
1167  }
1168  reader.Close();
// Compare over the union of the codepoints known to 'h' and to 'hh'; a
// codepoint absent from one side counts as having no flags there.
1169  TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
1170  for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
1171  cps.Sort(); cps.Merge();
1172  for (int i = 0; i < cps.Len(); i++)
1173  {
1174  int cp = cps[i];
1175  int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetWbFlags();
1176  int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
// NOTE(review): a sentence-boundary constant (ucfSbSep) is masked out of
// word-boundary flag sets here; confirm this is intentional.
1177  flags1 &= ~ucfSbSep; flags2 &= ~ucfSbSep;
1178  if (flags1 != flags2) {
1179  printf("cp = %04x: flags1 = %08x flags2 = %08x xor = %08x\n", cp, flags1, flags2, flags1 ^ flags2);
1180  Fail; }
1181  }}
1182  // Likewise, for sentence boundary flags we have SentenceBreakProperty.txt.
1183  {TUcdFileReader reader; TStrV fields;
// NOTE(review): listing line 1184 (presumably the reader.Open call) is missing here.
1185  THash<TInt, TInt> hh;
1186  while (reader.GetNextLine(fields))
1187  {
1188  IAssert(fields.Len() == 2);
1189  int from, to; reader.ParseCodePointRange(fields[0], from, to);
1190  TStr s = fields[1];
// NOTE(review): the declaration of 'flag' (listing line 1191) is missing here.
1192  if (s == "Sep") flag = ucfSbSep;
1193  else if (s == "Format") flag = ucfSbFormat;
1194  else if (s == "Sp") flag = ucfSbSp;
1195  else if (s == "Lower") flag = ucfSbLower;
1196  else if (s == "Upper") flag = ucfSbUpper;
1197  else if (s == "OLetter") flag = ucfSbOLetter;
1198  else if (s == "Numeric") flag = ucfSbNumeric;
1199  else if (s == "ATerm") flag = ucfSbATerm;
1200  else if (s == "STerm") flag = ucfSbSTerm;
1201  else if (s == "Close") flag = ucfSbClose;
1202  else FailR(s.CStr());
1203  for (int c = from; c <= to; c++) {
1204  int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
1205  else hh[i].Val |= flag; }
1206  }
1207  reader.Close();
1208  TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
1209  for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
1210  cps.Sort(); cps.Merge();
1211  for (int i = 0; i < cps.Len(); i++)
1212  {
1213  int cp = cps[i];
1214  int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetSbFlags();
1215  int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
1216  if (flags1 != flags2) {
1217  printf("cp = %04x: flags1 = %08x [%s] flags2 = %08x [%s] xor = %08x\n", cp,
1218  flags1, TUniChInfo::GetSbFlagsStr(flags1).CStr(),
1219  flags2, TUniChInfo::GetSbFlagsStr(flags2).CStr(),
1220  flags1 ^ flags2);
1221  Fail; }
1222  }}
1223 }
1224 
1225 void TUniChDb::InitSpecialCasing(const TStr& basePath)
1226 {
1227  TUcdFileReader reader; TStrV fields;
1228  reader.Open(CombinePath(basePath, GetSpecialCasingFn()));
1229  while (reader.GetNextLine(fields))
1230  {
1231  IAssert(fields.Len() == 5 || fields.Len() == 6);
1232  IAssert(fields.Last().Empty());
1233  // Skip conditional mappings -- they will be hardcoded in the GetCaseConverted method.
1234  TStr conditions = "";
1235  if (fields.Len() == 6) conditions = fields[4];
1236  conditions.ToTrunc(); if (! conditions.Empty()) continue;
1237  // Keep the other mappings.
1238  const int cp = reader.ParseCodePoint(fields[0]);
1239  TIntV v; reader.ParseCodePointList(fields[1], v);
1240  specialCasingLower.AddDat(cp, v);
1241  reader.ParseCodePointList(fields[2], v);
1242  specialCasingTitle.AddDat(cp, v);
1243  reader.ParseCodePointList(fields[3], v);
1244  specialCasingUpper.AddDat(cp, v);
1245  }
1246  reader.Close();
1247 }
1248 
1249 void TUniChDb::LoadTxt(const TStr& basePath)
1250 {
1251  Clr();
1252  // Set up a hash table with enough ports that there will be more or less no chains longer than 1 element.
1253  h = THash<TInt, TUniChInfo>(196613, true);
1254  //
1256  //
1257  TUcdFileReader reader; TStrV fields; TIntH seen;
1258  reader.Open(CombinePath(basePath, GetUnicodeDataFn()));
1259  while (reader.GetNextLine(fields))
1260  {
1261  // Codepoint.
1262  int cp = reader.ParseCodePoint(fields[0]);
1263  IAssert(! seen.IsKey(cp)); seen.AddKey(cp);
1264  TUniChInfo& ci = h.AddDat(cp);
1265  // Name.
1266  ci.nameOffset = charNames.AddStr(fields[1]);
1267  // Category.
1268  TStr& s = fields[2]; IAssert(s.Len() == 2);
1269  ci.chCat = s[0]; ci.chSubCat = s[1];
1270  // Canonical combining class.
1271  s = fields[3]; IAssert(s.Len() > 0);
1272  int i; bool ok = s.IsInt(true, TUCh::Mn, TUCh::Mx, i); IAssertR(ok, s);
1273  ci.combClass = (uchar) i;
1274  // Decomposition type and mapping.
1275  LoadTxt_ProcessDecomposition(ci, fields[5]);
1276  // Simple case mappings.
1277  s = fields[12]; ci.simpleUpperCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
1278  s = fields[13]; ci.simpleLowerCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
1279  s = fields[14]; ci.simpleTitleCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
1280  //
1281  ci.InitAfterLoad(); // initializes ci.cat, ci.subCat
1282  }
1283  reader.Close();
1284  //
1285  InitScripts(basePath);
1286  //
1287  InitPropList(basePath);
1288  InitDerivedCoreProperties(basePath);
1289  InitLineBreaks(basePath);
1290  InitSpecialCasing(basePath);
1291  // Process the composition exclusions (UAX #15, sec. 6).
1292  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1293  {
1294  TUniChInfo& ci = h[i];
1295  int ofs = ci.decompOffset; if (ofs < 0) continue;
1296  int n = 0; while (decompositions[ofs + n] >= 0) n++;
1297  IAssert(n > 0);
1298  // Singleton decompositions.
1299  if (n == 1) { ci.flags |= ucfCompositionExclusion; continue; }
1300  // Non-starter decompositions.
1301  int cp1 = decompositions[ofs];
1302  IAssert(h.IsKey(cp1));
1303  uchar ccc = h.GetDat(cp1).combClass;
1304  if (ccc != TUniChInfo::ccStarter) { ci.flags |= ucfCompositionExclusion; continue; }
1305  }
1306  // Process the composition exclusion table.
1307  reader.Open(CombinePath(basePath, GetCompositionExclusionsFn()));
1308  int nExclusionTable = 0;
1309  while (reader.GetNextLine(fields))
1310  {
1311  IAssert(fields.Len() == 1);
1312  int cp = reader.ParseCodePoint(fields[0]);
1313  int i = h.GetKeyId(cp); IAssert(i >= 0);
1314  h[i].flags |= ucfCompositionExclusion;
1315  nExclusionTable++;
1316  }
1317  reader.Close();
1318  // Prepare the inverted index for composition pairs.
1319  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1320  {
1321  int cp = h.GetKey(i);
1322  TUniChInfo& ci = h[i];
1323  int ofs = ci.decompOffset; if (ofs < 0) continue;
1324  if (ci.IsCompositionExclusion()) continue;
1325  if (ci.IsCompatibilityDecomposition()) continue;
1326  int n = 0; while (decompositions[ofs + n] >= 0) n++;
1327  if (n != 2) continue;
1328  TIntPr pr = TIntPr(decompositions[ofs], decompositions[ofs + 1]);
1329  IAssert(! inverseDec.IsKey(pr));
1331  inverseDec.AddDat(pr, cp);
1332  }
1333  printf("TUniChDb(%s): %d chars in h, %d in decomp inverse index; %d in decomp vector; %d in exclusion table\n",
1334  basePath.CStr(), h.Len(), inverseDec.Len(), decompositions.Len(), nExclusionTable);
1335  // Before calling InitWordAndSentenceBoundaryFlags(), scripts must have been initialized, as well as
1336  // flags such as Alphabetic, Word_Break, and Grapheme_Extend.
1337  InitWordAndSentenceBoundaryFlags(basePath); // Note: scripts must have been initialized by this point.
1338  // Make sure that Hangul combined characters are treated as starters.
1339  for (int cp = HangulSBase; cp < HangulSBase + HangulSCount; cp++)
1340  {
1341  int j = h.GetKeyId(cp); if (j < 0) continue;
1342  TUniChInfo& ci = h[j];
1345  }
1346  // There should be no more additions to 'h' beyond this point.
1347  const int oldHLen = h.Len();
1348  // Provide default (identity) case mappings if any were missing from UnicodeData.txt
1349  // (or if any entirely new characters were added later, e.g. while reading LineBreaks.txt).
1351  for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
1352  {
1353  int cp = h.GetKey(i); TUniChInfo &ci = h[i];
1354  if (ci.simpleLowerCaseMapping < 0) ci.simpleLowerCaseMapping = cp;
1355  if (ci.simpleUpperCaseMapping < 0) ci.simpleUpperCaseMapping = cp;
1356  if (ci.simpleTitleCaseMapping < 0) ci.simpleTitleCaseMapping = cp;
1357  if (ci.script < 0) ci.script = scriptUnknown;
1358  }
1359  IAssert(h.Len() == oldHLen);
1360 }
1361 
1362 void TUniChDb::SaveBin(const TStr& fnBinUcd)
1363 {
1364  PSOut SOut=TFOut::New(fnBinUcd);
1365  Save(*SOut);
1366 }
1367 
1369 {
1371 }
1372 
1373 //-----------------------------------------------------------------------------
1374 // TUniChDb -- main test driver
1375 //-----------------------------------------------------------------------------
1376 
// Self-test driver for the character database.
// Sequence, as written below: parse the text data files under basePath, save the
// database to the binary file, rebuild this object from scratch, reload it from
// that binary file, then run the individual test routines.
// basePath: passed to CombinePath() to locate the binary file, to LoadTxt(), and
// to the test routines that take a path.
1377 void TUniChDb::Test(const TStr& basePath)
1378 {
1379  TStr fnBin = CombinePath(basePath, GetBinFn());
// 'true ||' short-circuits the existence check, so this branch always runs:
// the text files are re-parsed and fnBin is rewritten on every call.
1380  if (true || ! TFile::Exists(fnBin))
1381  {
1382  // Test LoadTxt.
1383  LoadTxt(basePath);
1384  // Test Save.
// The extra braces end SOut's lifetime before the reload below
// (presumably this is what closes the output file -- confirm in TFOut).
1385  {PSOut SOut = TFOut::New(fnBin);
1386  Save(*SOut);}
1387  }
1388  // Test Load.
// Destroy this object and default-construct a new one in the same storage,
// so that Load() fills a freshly constructed database.
1389  this->~TUniChDb();
1390  new(this) TUniChDb();
1391  {PSIn SIn = TFIn::New(fnBin);
1392  Load(*SIn);}
1393  // Test the case folding.
1394  caseFolding.Test();
1395  // Test the word breaking.
// NOTE(review): no statement follows the comment above in this listing; its
// line numbering skips 1396 -- check the original file for the missing call.
1397  // Test the sentence breaking.
1398  TestFindNextWordOrSentenceBoundary(basePath, true);
1399  TestFindNextWordOrSentenceBoundary(basePath, false);
1400  // Test composition and decomposition.
1401  TestComposition(basePath);
1402  // Test the case conversions.
// NOTE(review): likewise, listing line 1403 is absent here -- check the
// original file for the call that belongs to the comment above.
1404 }
1405 
1406 //-----------------------------------------------------------------------------
1407 // T8BitCodec -- a class for converting between 8-bit encodings and Unicode
1408 //-----------------------------------------------------------------------------
1409 
1410 //-----------------------------------------------------------------------------
1411 // ISO-8859-2
1412 //-----------------------------------------------------------------------------
1413 
// ISO-8859-2 byte -> Unicode code point.
// 96 entries (6 rows of 16); per the row labels, entry i belongs to byte 0xa0 + i,
// so the table spans bytes 0xa0..0xff.
// NOTE(review): bytes below 0xa0 are not covered by this table; presumably the
// codec handles them without a lookup -- confirm in unicode.h.
1414 const int TEncoding_ISO8859_2::toUnicodeTable[6 * 16] =
1415 {
1416  /* 0xa0 */ 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
1417  /* 0xb0 */ 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
1418  /* 0xc0 */ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
1419  /* 0xd0 */ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
1420  /* 0xe0 */ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
1421  /* 0xf0 */ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9
1422 };
1423 
// Unicode code point -> ISO-8859-2 byte, first range.
// 224 entries (14 rows of 16); per the row labels, entry i belongs to code point
// U+00a0 + i, so the table spans U+00a0..U+017f.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1424 const int TEncoding_ISO8859_2::fromUnicodeTable1[14 * 16] = {
1425  /* U+00a0 */ 0x00a0, -1, -1, -1, 0x00a4, -1, -1, 0x00a7, 0x00a8, -1, -1, -1, -1, 0x00ad, -1, -1,
1426  /* U+00b0 */ 0x00b0, -1, -1, -1, 0x00b4, -1, -1, -1, 0x00b8, -1, -1, -1, -1, -1, -1, -1,
1427  /* U+00c0 */ -1, 0x00c1, 0x00c2, -1, 0x00c4, -1, -1, 0x00c7, -1, 0x00c9, -1, 0x00cb, -1, 0x00cd, 0x00ce, -1,
1428  /* U+00d0 */ -1, -1, -1, 0x00d3, 0x00d4, -1, 0x00d6, 0x00d7, -1, -1, 0x00da, -1, 0x00dc, 0x00dd, -1, 0x00df,
1429  /* U+00e0 */ -1, 0x00e1, 0x00e2, -1, 0x00e4, -1, -1, 0x00e7, -1, 0x00e9, -1, 0x00eb, -1, 0x00ed, 0x00ee, -1,
1430  /* U+00f0 */ -1, -1, -1, 0x00f3, 0x00f4, -1, 0x00f6, 0x00f7, -1, -1, 0x00fa, -1, 0x00fc, 0x00fd, -1, -1,
1431  /* U+0100 */ -1, -1, 0x00c3, 0x00e3, 0x00a1, 0x00b1, 0x00c6, 0x00e6, -1, -1, -1, -1, 0x00c8, 0x00e8, 0x00cf, 0x00ef,
1432  /* U+0110 */ 0x00d0, 0x00f0, -1, -1, -1, -1, -1, -1, 0x00ca, 0x00ea, 0x00cc, 0x00ec, -1, -1, -1, -1,
1433  /* U+0120 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1434  /* U+0130 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00c5, 0x00e5, -1, -1, 0x00a5, 0x00b5, -1,
1435  /* U+0140 */ -1, 0x00a3, 0x00b3, 0x00d1, 0x00f1, -1, -1, 0x00d2, 0x00f2, -1, -1, -1, -1, -1, -1, -1,
1436  /* U+0150 */ 0x00d5, 0x00f5, -1, -1, 0x00c0, 0x00e0, -1, -1, 0x00d8, 0x00f8, 0x00a6, 0x00b6, -1, -1, 0x00aa, 0x00ba,
1437  /* U+0160 */ 0x00a9, 0x00b9, 0x00de, 0x00fe, 0x00ab, 0x00bb, -1, -1, -1, -1, -1, -1, -1, -1, 0x00d9, 0x00f9,
1438  /* U+0170 */ 0x00db, 0x00fb, -1, -1, -1, -1, -1, -1, -1, 0x00ac, 0x00bc, 0x00af, 0x00bf, 0x00ae, 0x00be, -1
1439 };
1440 
// Unicode code point -> ISO-8859-2 byte, second range.
// 32 entries (2 rows of 16); per the row labels, entry i belongs to code point
// U+02c0 + i, so the table spans U+02c0..U+02df.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1441 const int TEncoding_ISO8859_2::fromUnicodeTable2[2 * 16] = {
1442  /* U+02c0 */ -1, -1, -1, -1, -1, -1, -1, 0x00b7, -1, -1, -1, -1, -1, -1, -1, -1,
1443  /* U+02d0 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00a2, 0x00ff, -1, 0x00b2, -1, 0x00bd, -1, -1
1444 };
1445 
1446 //-----------------------------------------------------------------------------
1447 // ISO-8859-3
1448 //-----------------------------------------------------------------------------
1449 
// ISO-8859-3 byte -> Unicode code point.
// 96 entries (6 rows of 16); per the row labels, entry i belongs to byte 0xa0 + i,
// so the table spans bytes 0xa0..0xff.
// Seven bytes carry -1 instead of a code point: 0xa5, 0xae, 0xbe, 0xc3, 0xd0,
// 0xe3 and 0xf0 (presumably reported as undecodable by the codec -- confirm in
// unicode.h).
1450 const int TEncoding_ISO8859_3::toUnicodeTable[6 * 16] = {
1451  /* 0xa0 */ 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, -1, 0x0124, 0x00a7, 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, -1, 0x017b,
1452  /* 0xb0 */ 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, -1, 0x017c,
1453  /* 0xc0 */ 0x00c0, 0x00c1, 0x00c2, -1, 0x00c4, 0x010a, 0x0108, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
1454  /* 0xd0 */ -1, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df,
1455  /* 0xe0 */ 0x00e0, 0x00e1, 0x00e2, -1, 0x00e4, 0x010b, 0x0109, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
1456  /* 0xf0 */ -1, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9
1457 };
1458 
// Unicode code point -> ISO-8859-3 byte, first range.
// 224 entries (14 rows of 16); per the row labels, entry i belongs to code point
// U+00a0 + i, so the table spans U+00a0..U+017f.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1459 const int TEncoding_ISO8859_3::fromUnicodeTable1[14 * 16] = {
1460  /* U+00a0 */ 0x00a0, -1, -1, 0x00a3, 0x00a4, -1, -1, 0x00a7, 0x00a8, -1, -1, -1, -1, 0x00ad, -1, -1,
1461  /* U+00b0 */ 0x00b0, -1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, -1, 0x00b7, 0x00b8, -1, -1, -1, -1, 0x00bd, -1, -1,
1462  /* U+00c0 */ 0x00c0, 0x00c1, 0x00c2, -1, 0x00c4, -1, -1, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
1463  /* U+00d0 */ -1, 0x00d1, 0x00d2, 0x00d3, 0x00d4, -1, 0x00d6, 0x00d7, -1, 0x00d9, 0x00da, 0x00db, 0x00dc, -1, -1, 0x00df,
1464  /* U+00e0 */ 0x00e0, 0x00e1, 0x00e2, -1, 0x00e4, -1, -1, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
1465  /* U+00f0 */ -1, 0x00f1, 0x00f2, 0x00f3, 0x00f4, -1, 0x00f6, 0x00f7, -1, 0x00f9, 0x00fa, 0x00fb, 0x00fc, -1, -1, -1,
1466  /* U+0100 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00c6, 0x00e6, 0x00c5, 0x00e5, -1, -1, -1, -1,
1467  /* U+0110 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00d8, 0x00f8, 0x00ab, 0x00bb,
1468  /* U+0120 */ 0x00d5, 0x00f5, -1, -1, 0x00a6, 0x00b6, 0x00a1, 0x00b1, -1, -1, -1, -1, -1, -1, -1, -1,
1469  /* U+0130 */ 0x00a9, 0x00b9, -1, -1, 0x00ac, 0x00bc, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1470  /* U+0140 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1471  /* U+0150 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00de, 0x00fe, 0x00aa, 0x00ba,
1472  /* U+0160 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00dd, 0x00fd, -1, -1,
1473  /* U+0170 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00af, 0x00bf, -1, -1, -1,
1474 };
1476  /* U+02d8 */ 0x00a2, 0x00ff
1477 };
1478 
1479 //-----------------------------------------------------------------------------
1480 // ISO-8859-4
1481 //-----------------------------------------------------------------------------
1482 
// ISO-8859-4 byte -> Unicode code point.
// 96 entries (6 rows of 16); per the row labels, entry i belongs to byte 0xa0 + i,
// so the table spans bytes 0xa0..0xff.
// NOTE(review): bytes below 0xa0 are not covered by this table; presumably the
// codec handles them without a lookup -- confirm in unicode.h.
1483 const int TEncoding_ISO8859_4::toUnicodeTable[6 * 16] = {
1484  /* 0xa0 */ 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af,
1485  /* 0xb0 */ 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b,
1486  /* 0xc0 */ 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a,
1487  /* 0xd0 */ 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df,
1488  /* 0xe0 */ 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b,
1489  /* 0xf0 */ 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9
1490 };
1491 
// Unicode code point -> ISO-8859-4 byte, first range.
// 224 entries (14 rows of 16); per the row labels, entry i belongs to code point
// U+00a0 + i, so the table spans U+00a0..U+017f.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1492 const int TEncoding_ISO8859_4::fromUnicodeTable1[14 * 16] = {
1493  /* U+00a0 */ 0x00a0, -1, -1, -1, 0x00a4, -1, -1, 0x00a7, 0x00a8, -1, -1, -1, -1, 0x00ad, -1, 0x00af,
1494  /* U+00b0 */ 0x00b0, -1, -1, -1, 0x00b4, -1, -1, -1, 0x00b8, -1, -1, -1, -1, -1, -1, -1,
1495  /* U+00c0 */ -1, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, -1, -1, 0x00c9, -1, 0x00cb, -1, 0x00cd, 0x00ce, -1,
1496  /* U+00d0 */ -1, -1, -1, -1, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, -1, 0x00da, 0x00db, 0x00dc, -1, -1, 0x00df,
1497  /* U+00e0 */ -1, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, -1, -1, 0x00e9, -1, 0x00eb, -1, 0x00ed, 0x00ee, -1,
1498  /* U+00f0 */ -1, -1, -1, -1, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, -1, 0x00fa, 0x00fb, 0x00fc, -1, -1, -1,
1499  /* U+0100 */ 0x00c0, 0x00e0, -1, -1, 0x00a1, 0x00b1, -1, -1, -1, -1, -1, -1, 0x00c8, 0x00e8, -1, -1,
1500  /* U+0110 */ 0x00d0, 0x00f0, 0x00aa, 0x00ba, -1, -1, 0x00cc, 0x00ec, 0x00ca, 0x00ea, -1, -1, -1, -1, -1, -1,
1501  /* U+0120 */ -1, -1, 0x00ab, 0x00bb, -1, -1, -1, -1, 0x00a5, 0x00b5, 0x00cf, 0x00ef, -1, -1, 0x00c7, 0x00e7,
1502  /* U+0130 */ -1, -1, -1, -1, -1, -1, 0x00d3, 0x00f3, 0x00a2, -1, -1, 0x00a6, 0x00b6, -1, -1, -1,
1503  /* U+0140 */ -1, -1, -1, -1, -1, 0x00d1, 0x00f1, -1, -1, -1, 0x00bd, 0x00bf, 0x00d2, 0x00f2, -1, -1,
1504  /* U+0150 */ -1, -1, -1, -1, -1, -1, 0x00a3, 0x00b3, -1, -1, -1, -1, -1, -1, -1, -1,
1505  /* U+0160 */ 0x00a9, 0x00b9, -1, -1, -1, -1, 0x00ac, 0x00bc, 0x00dd, 0x00fd, 0x00de, 0x00fe, -1, -1, -1, -1,
1506  /* U+0170 */ -1, -1, 0x00d9, 0x00f9, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00ae, 0x00be, -1,
1507 };
1508 
// Unicode code point -> ISO-8859-4 byte, second range.
// 32 entries (2 rows of 16); per the row labels, entry i belongs to code point
// U+02c0 + i, so the table spans U+02c0..U+02df.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1509 const int TEncoding_ISO8859_4::fromUnicodeTable2[2 * 16] = {
1510  /* U+02c0 */ -1, -1, -1, -1, -1, -1, -1, 0x00b7, -1, -1, -1, -1, -1, -1, -1, -1,
1511  /* U+02d0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00ff, -1, 0x00b2, -1, -1, -1, -1
1512 };
1513 
1514 //-----------------------------------------------------------------------------
1515 // CP 437
1516 //-----------------------------------------------------------------------------
1517 
// CP437 byte -> Unicode code point.
// 128 entries (8 rows of 16); per the row labels, entry i belongs to byte 0x80 + i,
// so the table spans bytes 0x80..0xff.
// NOTE(review): bytes below 0x80 are not covered by this table; presumably the
// codec handles them without a lookup -- confirm in unicode.h.
1518 const int TEncoding_CP437::toUnicodeTable[8 * 16] = {
1519  /* 0x80 */ 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
1520  /* 0x90 */ 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192,
1521  /* 0xa0 */ 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
1522  /* 0xb0 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,
1523  /* 0xc0 */ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,
1524  /* 0xd0 */ 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,
1525  /* 0xe0 */ 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229,
1526  /* 0xf0 */ 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0
1527 };
1528 
// Unicode code point -> CP437 byte, first range.
// 96 entries (6 rows of 16); per the row labels, entry i belongs to code point
// U+00a0 + i, so the table spans U+00a0..U+00ff.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1529 const int TEncoding_CP437::fromUnicodeTable1[6 * 16] = {
1530  /* U+00a0 */ 0x00ff, 0x00ad, 0x009b, 0x009c, -1, 0x009d, -1, -1, -1, -1, 0x00a6, 0x00ae, 0x00aa, -1, -1, -1,
1531  /* U+00b0 */ 0x00f8, 0x00f1, 0x00fd, -1, -1, 0x00e6, -1, 0x00fa, -1, -1, 0x00a7, 0x00af, 0x00ac, 0x00ab, -1, 0x00a8,
1532  /* U+00c0 */ -1, -1, -1, -1, 0x008e, 0x008f, 0x0092, 0x0080, -1, 0x0090, -1, -1, -1, -1, -1, -1,
1533  /* U+00d0 */ -1, 0x00a5, -1, -1, -1, -1, 0x0099, -1, -1, -1, -1, -1, 0x009a, -1, -1, 0x00e1,
1534  /* U+00e0 */ 0x0085, 0x00a0, 0x0083, -1, 0x0084, 0x0086, 0x0091, 0x0087, 0x008a, 0x0082, 0x0088, 0x0089, 0x008d, 0x00a1, 0x008c, 0x008b,
1535  /* U+00f0 */ -1, 0x00a4, 0x0095, 0x00a2, 0x0093, -1, 0x0094, 0x00f6, -1, 0x0097, 0x00a3, 0x0096, 0x0081, -1, -1, 0x0098,
1536 };
1537 
// Unicode code point -> CP437 byte, second range.
// 64 entries (4 rows of 16); per the row labels, entry i belongs to code point
// U+0390 + i, so the table spans U+0390..U+03cf.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1538 const int TEncoding_CP437::fromUnicodeTable2[4 * 16] = {
1539  /* U+0390 */ -1, -1, -1, 0x00e2, -1, -1, -1, -1, 0x00e9, -1, -1, -1, -1, -1, -1, -1,
1540  /* U+03a0 */ -1, -1, -1, 0x00e4, -1, -1, 0x00e8, -1, -1, 0x00ea, -1, -1, -1, -1, -1, -1,
1541  /* U+03b0 */ -1, 0x00e0, -1, -1, 0x00eb, 0x00ee, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1542  /* U+03c0 */ 0x00e3, -1, -1, 0x00e5, 0x00e7, -1, 0x00ed, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1543 };
1544 
// Unicode code point -> CP437 byte, third range.
// 96 entries (6 rows of 16); per the row labels, entry i belongs to code point
// U+2210 + i, so the table spans U+2210..U+226f.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1545 const int TEncoding_CP437::fromUnicodeTable3[6 * 16] = {
1546  /* U+2210 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00f9, 0x00fb, -1, -1, -1, 0x00ec, -1,
1547  /* U+2220 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00ef, -1, -1, -1, -1, -1, -1,
1548  /* U+2230 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1549  /* U+2240 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00f7, -1, -1, -1, -1, -1, -1, -1,
1550  /* U+2250 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1551  /* U+2260 */ -1, 0x00f0, -1, -1, 0x00f3, 0x00f2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1552 };
1553 
// Unicode code point -> CP437 byte, fourth range.
// 176 entries (11 rows of 16); per the row labels, entry i belongs to code point
// U+2500 + i, so the table spans U+2500..U+25af.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1554 const int TEncoding_CP437::fromUnicodeTable4[11 * 16] = {
1555  /* U+2500 */ 0x00c4, -1, 0x00b3, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00da, -1, -1, -1,
1556  /* U+2510 */ 0x00bf, -1, -1, -1, 0x00c0, -1, -1, -1, 0x00d9, -1, -1, -1, 0x00c3, -1, -1, -1,
1557  /* U+2520 */ -1, -1, -1, -1, 0x00b4, -1, -1, -1, -1, -1, -1, -1, 0x00c2, -1, -1, -1,
1558  /* U+2530 */ -1, -1, -1, -1, 0x00c1, -1, -1, -1, -1, -1, -1, -1, 0x00c5, -1, -1, -1,
1559  /* U+2540 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1560  /* U+2550 */ 0x00cd, 0x00ba, 0x00d5, 0x00d6, 0x00c9, 0x00b8, 0x00b7, 0x00bb, 0x00d4, 0x00d3, 0x00c8, 0x00be, 0x00bd, 0x00bc, 0x00c6, 0x00c7,
1561  /* U+2560 */ 0x00cc, 0x00b5, 0x00b6, 0x00b9, 0x00d1, 0x00d2, 0x00cb, 0x00cf, 0x00d0, 0x00ca, 0x00d8, 0x00d7, 0x00ce, -1, -1, -1,
1562  /* U+2570 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1563  /* U+2580 */ 0x00df, -1, -1, -1, 0x00dc, -1, -1, -1, 0x00db, -1, -1, -1, 0x00dd, -1, -1, -1,
1564  /* U+2590 */ 0x00de, 0x00b0, 0x00b1, 0x00b2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1565  /* U+25a0 */ 0x00fe, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
1566 };
// The commented-out rows below cover the six code points of toUnicodeTable that
// fall outside fromUnicodeTable1..4: U+0192, U+207f, U+20a7, U+2310, U+2320, U+2321.
// NOTE(review): presumably the codec maps these individually instead of through
// a table -- confirm in unicode.h.
1567 // /* U+0190 */ -1, -1, 0x009f, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1568 // /* U+2070 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00fc,
1569 // /* U+20a0 */ -1, -1, -1, -1, -1, -1, -1, 0x009e, -1, -1, -1, -1, -1, -1, -1, -1,
1570 // /* U+2310 */ 0x00a9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1571 // /* U+2320 */ 0x00f4, 0x00f5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1572 
1572 
1573 //-----------------------------------------------------------------------------
1574 // CP 852
1575 //-----------------------------------------------------------------------------
1576 
// CP852 byte -> Unicode code point.
// 128 entries (8 rows of 16); per the row labels, entry i belongs to byte 0x80 + i,
// so the table spans bytes 0x80..0xff.
// NOTE(review): bytes below 0x80 are not covered by this table; presumably the
// codec handles them without a lookup -- confirm in unicode.h.
1577 const int TEncoding_CP852::toUnicodeTable[8 * 16] = {
1578  /* 0x80 */ 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x016f, 0x0107, 0x00e7, 0x0142, 0x00eb, 0x0150, 0x0151, 0x00ee, 0x0179, 0x00c4, 0x0106,
1579  /* 0x90 */ 0x00c9, 0x0139, 0x013a, 0x00f4, 0x00f6, 0x013d, 0x013e, 0x015a, 0x015b, 0x00d6, 0x00dc, 0x0164, 0x0165, 0x0141, 0x00d7, 0x010d,
1580  /* 0xa0 */ 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x0104, 0x0105, 0x017d, 0x017e, 0x0118, 0x0119, 0x00ac, 0x017a, 0x010c, 0x015f, 0x00ab, 0x00bb,
1581  /* 0xb0 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x011a, 0x015e, 0x2563, 0x2551, 0x2557, 0x255d, 0x017b, 0x017c, 0x2510,
1582  /* 0xc0 */ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x0102, 0x0103, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
1583  /* 0xd0 */ 0x0111, 0x0110, 0x010e, 0x00cb, 0x010f, 0x0147, 0x00cd, 0x00ce, 0x011b, 0x2518, 0x250c, 0x2588, 0x2584, 0x0162, 0x016e, 0x2580,
1584  /* 0xe0 */ 0x00d3, 0x00df, 0x00d4, 0x0143, 0x0144, 0x0148, 0x0160, 0x0161, 0x0154, 0x00da, 0x0155, 0x0170, 0x00fd, 0x00dd, 0x0163, 0x00b4,
1585  /* 0xf0 */ 0x00ad, 0x02dd, 0x02db, 0x02c7, 0x02d8, 0x00a7, 0x00f7, 0x00b8, 0x00b0, 0x00a8, 0x02d9, 0x0171, 0x0158, 0x0159, 0x25a0, 0x00a0
1586 };
1587 
// Unicode code point -> CP852 byte, first range.
// 224 entries (14 rows of 16); per the row labels, entry i belongs to code point
// U+00a0 + i, so the table spans U+00a0..U+017f.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1588 const int TEncoding_CP852::fromUnicodeTable1[14 * 16] = {
1589  /* U+00a0 */ 0x00ff, -1, -1, -1, 0x00cf, -1, -1, 0x00f5, 0x00f9, -1, -1, 0x00ae, 0x00aa, 0x00f0, -1, -1,
1590  /* U+00b0 */ 0x00f8, -1, -1, -1, 0x00ef, -1, -1, -1, 0x00f7, -1, -1, 0x00af, -1, -1, -1, -1,
1591  /* U+00c0 */ -1, 0x00b5, 0x00b6, -1, 0x008e, -1, -1, 0x0080, -1, 0x0090, -1, 0x00d3, -1, 0x00d6, 0x00d7, -1,
1592  /* U+00d0 */ -1, -1, -1, 0x00e0, 0x00e2, -1, 0x0099, 0x009e, -1, -1, 0x00e9, -1, 0x009a, 0x00ed, -1, 0x00e1,
1593  /* U+00e0 */ -1, 0x00a0, 0x0083, -1, 0x0084, -1, -1, 0x0087, -1, 0x0082, -1, 0x0089, -1, 0x00a1, 0x008c, -1,
1594  /* U+00f0 */ -1, -1, -1, 0x00a2, 0x0093, -1, 0x0094, 0x00f6, -1, -1, 0x00a3, -1, 0x0081, 0x00ec, -1, -1,
1595  /* U+0100 */ -1, -1, 0x00c6, 0x00c7, 0x00a4, 0x00a5, 0x008f, 0x0086, -1, -1, -1, -1, 0x00ac, 0x009f, 0x00d2, 0x00d4,
1596  /* U+0110 */ 0x00d1, 0x00d0, -1, -1, -1, -1, -1, -1, 0x00a8, 0x00a9, 0x00b7, 0x00d8, -1, -1, -1, -1,
1597  /* U+0120 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1598  /* U+0130 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0091, 0x0092, -1, -1, 0x0095, 0x0096, -1,
1599  /* U+0140 */ -1, 0x009d, 0x0088, 0x00e3, 0x00e4, -1, -1, 0x00d5, 0x00e5, -1, -1, -1, -1, -1, -1, -1,
1600  /* U+0150 */ 0x008a, 0x008b, -1, -1, 0x00e8, 0x00ea, -1, -1, 0x00fc, 0x00fd, 0x0097, 0x0098, -1, -1, 0x00b8, 0x00ad,
1601  /* U+0160 */ 0x00e6, 0x00e7, 0x00dd, 0x00ee, 0x009b, 0x009c, -1, -1, -1, -1, -1, -1, -1, -1, 0x00de, 0x0085,
1602  /* U+0170 */ 0x00eb, 0x00fb, -1, -1, -1, -1, -1, -1, -1, 0x008d, 0x00ab, 0x00bd, 0x00be, 0x00a6, 0x00a7, -1
1603 };
1604 
// Unicode code point -> CP852 byte, second range.
// 32 entries (2 rows of 16); per the row labels, entry i belongs to code point
// U+02c0 + i, so the table spans U+02c0..U+02df.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1605 const int TEncoding_CP852::fromUnicodeTable2[2* 16] = {
1606  /* U+02c0 */ -1, -1, -1, -1, -1, -1, -1, 0x00f3, -1, -1, -1, -1, -1, -1, -1, -1,
1607  /* U+02d0 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00f4, 0x00fa, -1, 0x00f2, -1, 0x00f1, -1, -1
1608 };
1609 
// Unicode code point -> CP852 byte, third range.
// 176 entries (11 rows of 16); per the row labels, entry i belongs to code point
// U+2500 + i, so the table spans U+2500..U+25af.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1610 const int TEncoding_CP852::fromUnicodeTable3[11 * 16] = {
1611  /* U+2500 */ 0x00c4, -1, 0x00b3, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00da, -1, -1, -1,
1612  /* U+2510 */ 0x00bf, -1, -1, -1, 0x00c0, -1, -1, -1, 0x00d9, -1, -1, -1, 0x00c3, -1, -1, -1,
1613  /* U+2520 */ -1, -1, -1, -1, 0x00b4, -1, -1, -1, -1, -1, -1, -1, 0x00c2, -1, -1, -1,
1614  /* U+2530 */ -1, -1, -1, -1, 0x00c1, -1, -1, -1, -1, -1, -1, -1, 0x00c5, -1, -1, -1,
1615  /* U+2540 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1616  /* U+2550 */ 0x00cd, 0x00ba, -1, -1, 0x00c9, -1, -1, 0x00bb, -1, -1, 0x00c8, -1, -1, 0x00bc, -1, -1,
1617  /* U+2560 */ 0x00cc, -1, -1, 0x00b9, -1, -1, 0x00cb, -1, -1, 0x00ca, -1, -1, 0x00ce, -1, -1, -1,
1618  /* U+2570 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1619  /* U+2580 */ 0x00df, -1, -1, -1, 0x00dc, -1, -1, -1, 0x00db, -1, -1, -1, -1, -1, -1, -1,
1620  /* U+2590 */ -1, 0x00b0, 0x00b1, 0x00b2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1621  /* U+25a0 */ 0x00fe, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
1622 };
1623 
1624 //-----------------------------------------------------------------------------
1625 // Windows-1250
1626 //-----------------------------------------------------------------------------
1627 
// Windows-1250 byte -> Unicode code point.
// 128 entries (8 rows of 16); per the row labels, entry i belongs to byte 0x80 + i,
// so the table spans bytes 0x80..0xff.
// Five bytes carry -1 instead of a code point: 0x81, 0x83, 0x88, 0x90 and 0x98
// (presumably reported as undecodable by the codec -- confirm in unicode.h).
1628 const int TEncoding_CP1250::toUnicodeTable[8 * 16] = {
1629  /* 0x80 */ 0x20ac, -1, 0x201a, -1, 0x201e, 0x2026, 0x2020, 0x2021, -1, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179,
1630  /* 0x90 */ -1, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, -1, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a,
1631  /* 0xa0 */ 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b,
1632  /* 0xb0 */ 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c,
1633  /* 0xc0 */ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
1634  /* 0xd0 */ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
1635  /* 0xe0 */ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
1636  /* 0xf0 */ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9
1637 };
1638 
// Unicode code point -> Windows-1250 byte, first range.
// 224 entries (14 rows of 16); per the row labels, entry i belongs to code point
// U+00a0 + i, so the table spans U+00a0..U+017f.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1639 const int TEncoding_CP1250::fromUnicodeTable1[14 * 16] = {
1640  /* U+00a0 */ 0x00a0, -1, -1, -1, 0x00a4, -1, 0x00a6, 0x00a7, 0x00a8, 0x00a9, -1, 0x00ab, 0x00ac, 0x00ad, 0x00ae, -1,
1641  /* U+00b0 */ 0x00b0, 0x00b1, -1, -1, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, -1, -1, 0x00bb, -1, -1, -1, -1,
1642  /* U+00c0 */ -1, 0x00c1, 0x00c2, -1, 0x00c4, -1, -1, 0x00c7, -1, 0x00c9, -1, 0x00cb, -1, 0x00cd, 0x00ce, -1,
1643  /* U+00d0 */ -1, -1, -1, 0x00d3, 0x00d4, -1, 0x00d6, 0x00d7, -1, -1, 0x00da, -1, 0x00dc, 0x00dd, -1, 0x00df,
1644  /* U+00e0 */ -1, 0x00e1, 0x00e2, -1, 0x00e4, -1, -1, 0x00e7, -1, 0x00e9, -1, 0x00eb, -1, 0x00ed, 0x00ee, -1,
1645  /* U+00f0 */ -1, -1, -1, 0x00f3, 0x00f4, -1, 0x00f6, 0x00f7, -1, -1, 0x00fa, -1, 0x00fc, 0x00fd, -1, -1,
1646  /* U+0100 */ -1, -1, 0x00c3, 0x00e3, 0x00a5, 0x00b9, 0x00c6, 0x00e6, -1, -1, -1, -1, 0x00c8, 0x00e8, 0x00cf, 0x00ef,
1647  /* U+0110 */ 0x00d0, 0x00f0, -1, -1, -1, -1, -1, -1, 0x00ca, 0x00ea, 0x00cc, 0x00ec, -1, -1, -1, -1,
1648  /* U+0120 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 /* blank */,
1649  /* U+0130 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x00c5, 0x00e5, -1, -1, 0x00bc, 0x00be, -1,
1650  /* U+0140 */ -1, 0x00a3, 0x00b3, 0x00d1, 0x00f1, -1, -1, 0x00d2, 0x00f2, -1, -1, -1, -1, -1, -1, -1,
1651  /* U+0150 */ 0x00d5, 0x00f5, -1, -1, 0x00c0, 0x00e0, -1, -1, 0x00d8, 0x00f8, 0x008c, 0x009c, -1, -1, 0x00aa, 0x00ba,
1652  /* U+0160 */ 0x008a, 0x009a, 0x00de, 0x00fe, 0x008d, 0x009d, -1, -1, -1, -1, -1, -1, -1, -1, 0x00d9, 0x00f9,
1653  /* U+0170 */ 0x00db, 0x00fb, -1, -1, -1, -1, -1, -1, -1, 0x008f, 0x009f, 0x00af, 0x00bf, 0x008e, 0x009e, -1,
1654 };
1655 
// Unicode code point -> Windows-1250 byte, second range.
// 32 entries (2 rows of 16); per the row labels, entry i belongs to code point
// U+02c0 + i, so the table spans U+02c0..U+02df.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1656 const int TEncoding_CP1250::fromUnicodeTable2[2 * 16] = {
1657  /* U+02c0 */ -1, -1, -1, -1, -1, -1, -1, 0x00a1, -1, -1, -1, -1, -1, -1, -1, -1,
1658  /* U+02d0 */ -1, -1, -1, -1, -1, -1, -1, -1, 0x00a2, 0x00ff, -1, 0x00b2, -1, 0x00bd, -1, -1,
1659 };
1660 
// Unicode code point -> Windows-1250 byte, third range.
// 48 entries (3 rows of 16); per the row labels, entry i belongs to code point
// U+2010 + i, so the table spans U+2010..U+203f.
// An entry is -1 when its code point does not occur in toUnicodeTable.
1661 const int TEncoding_CP1250::fromUnicodeTable3[3 * 16] = {
1662  /* U+2010 */ -1, -1, -1, 0x0096, 0x0097, -1, -1, -1, 0x0091, 0x0092, 0x0082, -1, 0x0093, 0x0094, 0x0084, -1,
1663  /* U+2020 */ 0x0086, 0x0087, 0x0095, -1, -1, -1, 0x0085, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1664  /* U+2030 */ 0x0089, -1, -1, -1, -1, -1, -1, -1, -1, 0x008b, 0x009b, -1, -1, -1, -1, -1,
1665 };
// The commented-out rows below cover the two code points of toUnicodeTable that
// fall outside fromUnicodeTable1..3: U+20ac and U+2122.
// NOTE(review): presumably the codec maps these individually instead of through
// a table -- confirm in unicode.h.
1666 // /* U+20a0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0080, -1, -1, -1,
1667 // /* U+2120 */ -1, -1, 0x0099, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
1668 
1669 //-----------------------------------------------------------------------------
1670 // YU-ASCII
1671 //-----------------------------------------------------------------------------
1672 
// Parallel arrays with ten entries each: position i of uniChars holds a Unicode
// code point and position i of yuAsciiChars the 7-bit character paired with it.
// The comment lines above and below the arrays name the entries column by column.
// NOTE(review): presumably the YU-ASCII codec substitutes one for the other in
// each direction and passes everything else through -- confirm in unicode.h.
1673 // C acute c acute C caron c caron S caron s caron Z caron z caron D stroke d stroke
1674 const int TEncoding_YuAscii::uniChars[10] = { 0x106, 0x107, 0x10c, 0x10d, 0x160, 0x161, 0x17d, 0x17e, 0x110, 0x111 };
1675 const int TEncoding_YuAscii::yuAsciiChars[10] = { 0x5d, 0x7d, 0x5e, 0x7e, 0x5b, 0x7b, 0x40, 0x60, 0x5c, 0x7c };
1676 // ']' '}' '^' '~' '[' '{' '@' '`' '\\' '|'
1677 
1678 
1679 //-----------------------------------------------------------------------------
1680 // TUnicode - codec registry
1681 //-----------------------------------------------------------------------------
1682 
1684 {
1685  ClrCodecs();
1686  RegisterCodec("ISO-8859-1 ISO_8859-1 ISO_8859-1:1987 ISO-IR-100 CP819 IBM819 LATIN1 L1 csISOLatin1 ISO8859-1 ISO8859_1 CP28591", TCodecBase::New<TCodec_ISO8859_1>());
1687  RegisterCodec("ISO-8859-2 ISO_8859-2 ISO_8859-2:1987 ISO-IR-101 LATIN2 L2 csISOLatin2 ISO8859-2 ISO8859_2 CP28592", TCodecBase::New<TCodec_ISO8859_2>());
1688  RegisterCodec("ISO-8859-3 ISO_8859-3 ISO_8859-3:1988 ISO-IR-109 LATIN3 L3 csISOLatin3 ISO8859-3 ISO8859_3 CP28593", TCodecBase::New<TCodec_ISO8859_3>());
1689  RegisterCodec("ISO-8859-4 ISO_8859-4 ISO_8859-4:1988 ISO-IR-110 LATIN4 L4 csISOLatin4 ISO8859-4 ISO8859_4 CP28594", TCodecBase::New<TCodec_ISO8859_4>());
1690  RegisterCodec("YUASCII YU-ASCII YU_ASCII", TCodecBase::New<TCodec_YuAscii>());
1691  RegisterCodec("CP1250 Windows-1250 MS-EE", TCodecBase::New<TCodec_CP1250>());
1692  RegisterCodec("CP852 cp852_DOSLatin2 DOSLatin2", TCodecBase::New<TCodec_CP852>());
1693  RegisterCodec("CP437 cp437_DOSLatinUS DOSLatinUS", TCodecBase::New<TCodec_CP437>());
1694 }
1695 
1696 void TUnicode::EncodeUtf8(const uint& c, TChA& dest) {
1697  if (c > 0x10ffff) {
1698  throw TExcept::New(TStr::Fmt("Unkown Unicode character %u", c)); }
1699  if (c < 0x80u)
1700  dest.AddCh(char(c & 0xffu));
1701  else if (c < 0x800u) {
1702  dest.AddCh(char(TUniCodec::_1100_0000 | ((c >> 6) & TUniCodec::_0001_1111)));
1703  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1704  else if (c < 0x10000u) {
1705  dest.AddCh(char(TUniCodec::_1110_0000 | ((c >> 12) & TUniCodec::_0000_1111)));
1706  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
1707  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1708  else if (c < 0x200000u) {
1709  dest.AddCh(char(TUniCodec::_1111_0000 | ((c >> 18) & TUniCodec::_0000_0111)));
1710  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
1711  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
1712  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1713  else if (c < 0x4000000u) {
1714  dest.AddCh(char(TUniCodec::_1111_1000 | ((c >> 24) & TUniCodec::_0000_0011)));
1715  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 18) & TUniCodec::_0011_1111)));
1716  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
1717  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
1718  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1719  else {
1720  dest.AddCh(char(TUniCodec::_1111_1100 | ((c >> 30) & TUniCodec::_0000_0011)));
1721  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 24) & TUniCodec::_0011_1111)));
1722  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 18) & TUniCodec::_0011_1111)));
1723  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
1724  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
1725  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1726 }
1727 
1729  TChA ChA; EncodeUtf8(Ch, ChA); return ChA;
1730 }
void InitAfterLoad()
Definition: unicode.cpp:1368
#define IAssert(Cond)
Definition: bd.h:262
int GetWbFlags() const
Definition: unicode.h:1118
static int SwapBytes(int x)
Definition: unicode.h:250
TPair< TInt, TInt > TIntPr
Definition: ds.h:83
void InitCodecs()
Definition: unicode.cpp:1683
int SearchCh(const char &Ch, const int &BChN=0) const
Definition: dt.cpp:1043
static PExcept New(const TStr &MsgStr, const TStr &LocStr=TStr())
Definition: ut.h:169
void Clr()
Definition: unicode.h:1276
void TestDecodeUtf16(TRnd &rnd, const TStr &testCaseDesc, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom)
Definition: unicode.cpp:341
static const int fromUnicodeTable1[6 *16]
Definition: unicode.h:510
bool strict
Definition: unicode.h:83
enum TUniChProperties_ TUniChProperties
static const uint Mn
Definition: dt.h:1246
#define IAssertR(Cond, Reason)
Definition: bd.h:265
size_t srcIdx
Definition: unicode.h:32
static PSOut New(const TStr &FNm, const bool &Append=false)
Definition: fl.cpp:442
int Len() const
Definition: dt.h:490
void SetPropertyX(const TUniChPropertiesX flag)
Definition: unicode.h:1108
bool IsInt(const bool &Check, const int &MnVal, const int &MxVal, int &Val) const
Definition: dt.cpp:1159
int GetScriptByName(const TStr &scriptName) const
Definition: unicode.h:1322
void Merge()
Sorts the vector and only keeps a single element of each value.
Definition: ds.h:1356
static TStr GetBinFn()
Definition: unicode.h:1310
void Test(const TIntV &src, const TIntV &expectedDest, const bool full, const bool turkic, FILE *f)
Definition: unicode.cpp:531
enum TUniChFlags_ TUniChFlags
bool IsCompositionExclusion() const
Definition: unicode.h:1111
#define NFC_(cmpWith, operand)
void SaveBin(const TStr &fnBinUcd)
Definition: unicode.cpp:1362
Definition: dt.h:11
bool IsDcpFlag(const TUniChFlags flag) const
Definition: unicode.h:1068
static const ushort LineBreak_Quotation
Definition: unicode.h:1032
void SetProperty(const TUniChProperties flag)
Definition: unicode.h:1085
bool IsGraphemeExtend() const
Definition: unicode.h:1077
void SetSbFlag(const TUniChFlags flag)
Definition: unicode.h:1127
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:480
static bool Exists(const TStr &FNm)
Definition: fl.cpp:1156
static TStr GetSpecialCasingFn()
Definition: unicode.h:1297
TUniChSubCategory subCat
Definition: unicode.h:1020
int GetWbFlags(const int cp) const
Definition: unicode.h:1357
static const uint Mx
Definition: dt.h:1247
void AssertEq(const TIntV &v1, const TIntV &v2, const TStr &explanation, FILE *f)
Definition: unicode.cpp:39
void SetDcpFlag(const TUniChFlags flag)
Definition: unicode.h:1070
void SetWbFlag(const TUniChFlags flag)
Definition: unicode.h:1117
enum TUnicodeErrorHandling_ TUnicodeErrorHandling
unsigned int uint
Definition: bd.h:11
uchar combClass
Definition: unicode.h:1018
#define Fail
Definition: bd.h:238
static TStr GetScriptNameKatakana()
Definition: unicode.h:1318
static const ushort LineBreak_InfixNumeric
Definition: unicode.h:1032
#define NFD_(cmpWith, operand)
static uint GetRndUint(TRnd &rnd)
Definition: unicode.cpp:62
void AddCh(const char &Ch, const int &MxLen=-1)
Definition: dt.h:271
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
void InitPropList(const TStr &basePath)
Definition: unicode.cpp:950
static const int toUnicodeTable[8 *16]
Definition: unicode.h:532
void ClrCodecs()
Definition: unicode.h:1881
enum TUniChDb::TCaseConversion_ TCaseConversion
bool IsAlphabetic() const
Definition: unicode.h:1071
int GetSbFlags(const int cp) const
Definition: unicode.h:1359
void WbFindCurOrNextNonIgnored(const TSrcVec &src, size_t &position, const size_t srcEnd) const
Definition: unicode.h:1422
static const ushort LineBreak_ComplexContext
Definition: unicode.h:1032
TIntIntVH cfFull
Definition: unicode.h:275
const TStr & GetScriptName(const int scriptId) const
Definition: unicode.h:1321
TIntIntVH specialCasingUpper
Definition: unicode.h:1271
static const int yuAsciiChars[10]
Definition: unicode.h:493
TStr GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:811
void RegisterCodec(const TStr &nameList, const PCodecBase &codec)
Definition: unicode.h:1873
void InitDerivedCoreProperties(const TStr &basePath)
Definition: unicode.cpp:1007
void InitAfterLoad()
Definition: unicode.h:1035
bool IsWhiteSpace() const
Definition: unicode.h:1104
void InitLineBreaks(const TStr &basePath)
Definition: unicode.cpp:1046
static const int uniChars[10]
Definition: unicode.h:493
void WbFindNextNonIgnored(const TSrcVec &src, size_t &position, const size_t srcEnd) const
Definition: unicode.h:1425
bool WbFindPrevNonIgnored(const TSrcVec &src, const size_t srcStart, size_t &position) const
Definition: unicode.h:1434
const TDat & GetDat(const TKey &Key) const
Definition: hash.h:262
char chCat
Definition: unicode.h:1017
static TStr GetNormalizationTestFn()
Definition: unicode.h:1309
enum TUniChPropertiesX_ TUniChPropertiesX
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
bool IsUppercase() const
Definition: unicode.h:1072
void Test(const TStr &basePath)
Definition: unicode.cpp:1377
static const int fromUnicodeTable2[2 *16]
Definition: unicode.h:532
static const int fromUnicodeTable2[4 *16]
Definition: unicode.h:510
static void ParseCodePointRange(const TStr &s, int &from, int &to)
Definition: unicode.h:1703
TIntIntVH specialCasingLower
Definition: unicode.h:1271
int simpleUpperCaseMapping
Definition: unicode.h:1022
void TestCaseConversion(const TStr &source, const TStr &trueLc, const TStr &trueTc, const TStr &trueUc, bool turkic, bool lithuanian)
Definition: unicode.cpp:825
static TStr GetUnicodeDataFn()
Definition: unicode.h:1298
THash< TIntPr, TInt > inverseDec
Definition: unicode.h:1267
bool IsPropertyX(const TUniChPropertiesX flag) const
Definition: unicode.h:1107
bool FindNextSentenceBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2636
size_t DecodeUtf16FromWords(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
Definition: unicode.h:2294
TStr GetWbFlagsStr() const
Definition: unicode.h:1120
static TStr GetScriptsFn()
Definition: unicode.h:1300
static const int fromUnicodeTable3[6 *16]
Definition: unicode.h:510
int propertiesX
Definition: unicode.h:1027
bool IsLowercase() const
Definition: unicode.h:1073
void Clr()
Definition: unicode.h:288
void TestCaseConversions()
Definition: unicode.cpp:853
int simpleTitleCaseMapping
Definition: unicode.h:1022
static PSIn New(const TStr &FNm)
Definition: fl.cpp:290
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:1022
static const int fromUnicodeTable2[2 *16]
Definition: unicode.h:547
void TestDecodeUtf8(TRnd &rnd, const TStr &testCaseDesc)
Definition: unicode.cpp:133
void Sort(const bool &Asc=true)
Sorts the elements of the vector.
Definition: ds.h:1318
bool IsCompatibilityDecomposition() const
Definition: unicode.h:1112
static TStr GetScriptNameUnknown()
Definition: unicode.h:1317
void PutAll(const TVal &Val)
Sets all elements of the vector to value Val.
Definition: ds.h:1229
ushort lineBreak
Definition: unicode.h:1028
static TStr GetSentenceBreakTestFn()
Definition: unicode.h:1307
TUniChDb()
Definition: unicode.h:1274
size_t EncodeUtf16ToBytes(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
Definition: unicode.h:2428
void Save(TSOut &SOut) const
Definition: unicode.h:1280
void TestUtf8()
Definition: unicode.cpp:194
uint GetUniDevUInt(const uint &Range=0)
Definition: dt.cpp:45
enum TUniByteOrder_ TUniByteOrder
bool FNextKeyId(int &KeyId) const
Definition: hash.h:478
static const int fromUnicodeTable3[3 *16]
Definition: unicode.h:547
TStr GetSbFlagsStr() const
Definition: unicode.h:1130
void LoadTxt_ProcessDecomposition(TUniChInfo &ci, TStr s)
Definition: unicode.cpp:937
bool FindNextWordBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2483
static const int toUnicodeTable[6 *16]
Definition: unicode.h:452
void InitSpecialCasing(const TStr &basePath)
Definition: unicode.cpp:1225
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
int FFirstKeyId() const
Definition: hash.h:278
static TStr GetDerivedCorePropsFn()
Definition: unicode.h:1301
static TStr GetWordBreakPropertyFn()
Definition: unicode.h:1306
static const int fromUnicodeTable3[11 *16]
Definition: unicode.h:532
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263
bool GetNextLine(TStrV &dest)
Definition: unicode.h:1686
int properties
Definition: unicode.h:1026
void Load(TSIn &SIn)
Definition: unicode.h:1285
void Open(const TStr &fileName)
Definition: unicode.h:1683
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:452
#define FailR(Reason)
Definition: bd.h:240
bool IsSbFlag(const TUniChFlags flag) const
Definition: unicode.h:1126
void ClrWbAndSbFlags()
Definition: unicode.h:1116
static const ushort LineBreak_Numeric
Definition: unicode.h:1032
unsigned char uchar
Definition: bd.h:10
void InitScripts(const TStr &basePath)
Definition: unicode.cpp:1073
void TestComposition(const TStr &basePath)
Definition: unicode.cpp:745
enum TUtf16BomHandling_ TUtf16BomHandling
TIntH cfTurkic
Definition: unicode.h:274
static TStr GetLineBreakFn()
Definition: unicode.h:1302
void Fold(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool full, const bool turkic) const
Definition: unicode.h:293
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:532
bool skipBom
Definition: unicode.h:89
static void ParseCodePointList(const TStr &s, TIntV &dest, bool ClrDestP=true)
Definition: unicode.h:1697
TIntH cfCommon
Definition: unicode.h:274
static TStr GetWordBreakTestFn()
Definition: unicode.h:1305
size_t DecodeUtf16FromBytes(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
Definition: unicode.h:2210
static bool IsWbIgnored(const TUniChInfo &ci)
Definition: unicode.h:1419
static const int toUnicodeTable[8 *16]
Definition: unicode.h:547
TIntV decompositions
Definition: unicode.h:1266
unsigned short ushort
Definition: bd.h:13
int GetKeyId(const TKey &Key) const
Definition: hash.h:466
Definition: dt.h:201
void SetCat(const int cp)
Definition: unicode.h:1744
int replacementChar
Definition: unicode.h:64
static const int toUnicodeTable[8 *16]
Definition: unicode.h:510
static TStr GetSentenceBreakPropertyFn()
Definition: unicode.h:1308
void LoadTxt(const TStr &fileName)
Definition: unicode.cpp:505
static bool IsMachineLittleEndian()
Definition: unicode.cpp:83
Definition: ds.h:32
int AddKey(const TKey &Key)
Definition: hash.h:373
void InitWordAndSentenceBoundaryFlags(const TStr &basePath)
Definition: unicode.cpp:1100
void TestFindNextWordOrSentenceBoundary(const TStr &basePath, bool sentence)
Definition: unicode.cpp:649
static const int toUnicodeTable[6 *16]
Definition: unicode.h:466
char chSubCat
Definition: unicode.h:1017
int simpleLowerCaseMapping
Definition: unicode.h:1022
TStr CombinePath(const TStr &s, const TStr &t)
Definition: unicode.cpp:32
static const ushort LineBreak_Unknown
Definition: unicode.h:1032
static TStr GetCompositionExclusionsFn()
Definition: unicode.h:1299
void LoadTxt(const TStr &basePath)
Definition: unicode.cpp:1249
TStrIntH scripts
Definition: unicode.h:1265
Definition: dt.h:412
size_t EncodeUtf16ToWords(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
Definition: unicode.h:2376
void ProcessComment(TUniChDb::TUcdFileReader &reader)
Definition: unicode.h:1729
bool Empty() const
Definition: dt.h:491
TStr & ToTrunc()
Definition: dt.cpp:770
void FindWordBoundaries(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
Definition: unicode.h:2561
int decompOffset
Definition: unicode.h:1023
static TStr Fmt(const char *FmtStr,...)
Definition: dt.cpp:1599
static TStr GetPropListFn()
Definition: unicode.h:1303
void TestCat(const int cp)
Definition: unicode.h:1749
#define NFKC_(cmpWith, operand)
size_t DecodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2036
TIntH cfSimple
Definition: unicode.h:274
TVec< TInt > TIntV
Definition: ds.h:1594
TStrPool charNames
Definition: unicode.h:1264
int GetSbFlags() const
Definition: unicode.h:1128
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:523
bool IsSTerminal() const
Definition: unicode.h:1101
void FindSentenceBoundaries(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
Definition: unicode.h:2793
static const uchar Mx
Definition: dt.h:1098
#define NFKD_(cmpWith, operand)
int EncodeUtf8(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1792
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
void Reserve(const TSizeTy &_MxVals)
Reserves enough memory for the vector to store _MxVals elements.
Definition: ds.h:543
static const int toUnicodeTable[6 *16]
Definition: unicode.h:480
bool IsProperty(const TUniChProperties flag) const
Definition: unicode.h:1084
void TestUtf16()
Definition: unicode.cpp:408
bool IsWbFlag(const TUniChFlags flag) const
Definition: unicode.h:1115
signed char script
Definition: unicode.h:1021
int nameOffset
Definition: unicode.h:1024
int scriptUnknown
Definition: unicode.h:1272
char * CStr()
Definition: dt.h:479
bool IsKey(const TKey &Key) const
Definition: hash.h:258
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
static const int fromUnicodeTable2[2 *16]
Definition: unicode.h:480
TUniCaseFolding caseFolding
Definition: unicode.h:1268
uint AddStr(const char *Str, const uint &Len)
Definition: dt.cpp:1711
static const int fromUnicodeTable2[2 *16]
Definition: unicode.h:452
bool IsIdeographic() const
Definition: unicode.h:1095
TIntIntVH specialCasingTitle
Definition: unicode.h:1271
int Len() const
Definition: hash.h:228
int flags
Definition: unicode.h:1025
TDat & AddDat(const TKey &Key)
Definition: hash.h:238
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:466
bool AlwaysFalse()
Definition: unicode.h:3227
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:547
static const int fromUnicodeTable4[11 *16]
Definition: unicode.h:510
static const uchar Mn
Definition: dt.h:1097
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:252
static const int fromUnicodeTable2[2]
Definition: unicode.h:466
void TestWbFindNonIgnored() const
Definition: unicode.cpp:619
static int ParseCodePoint(const TStr &s)
Definition: unicode.h:1695
static TStr GetCaseFoldingFn()
Definition: unicode.h:1296
size_t EncodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2152
static TStr GetAuxiliaryDir()
Definition: unicode.h:1304
static TStr GetScriptNameHiragana()
Definition: unicode.h:1319
static ushort GetLineBreakCode(char c1, char c2)
Definition: unicode.h:1031
TSizeTy AddV(const TVec< TVal, TSizeTy > &ValV)
Adds the elements of the vector ValV to the end of the vector.
Definition: ds.h:1110
void GetCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const
Definition: unicode.h:2817
void WordsToBytes(const TIntV &src, TIntV &dest)
Definition: unicode.cpp:274