SNAP Library 3.0, User Reference  2016-07-20 17:56:49
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
TUniCodec Class Reference

#include <unicode.h>

Public Types

enum  { DefaultReplacementChar = 0xfffd }
 

Public Member Functions

 TUniCodec ()
 
 TUniCodec (TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_)
 
template<typename TSrcVec , typename TDestCh >
size_t DecodeUtf8 (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
size_t DecodeUtf8 (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
size_t EncodeUtf8 (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
 
template<typename TSrcVec , typename TDestCh >
size_t EncodeUtf8 (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
 
template<typename TSrcVec >
TStr EncodeUtf8Str (const TSrcVec &src, size_t srcIdx, const size_t srcCount) const
 
template<typename TSrcVec >
TStr EncodeUtf8Str (const TSrcVec &src) const
 
template<typename TSrcVec , typename TDestCh >
size_t DecodeUtf16FromBytes (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
 
template<typename TSrcVec , typename TDestCh >
size_t DecodeUtf16FromWords (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
 
template<typename TSrcVec , typename TDestCh >
size_t EncodeUtf16ToWords (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
 
template<typename TSrcVec , typename TDestCh >
size_t EncodeUtf16ToBytes (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
 
void TestUtf8 ()
 
void TestUtf16 ()
 

Public Attributes

int replacementChar
 
TUnicodeErrorHandling errorHandling
 
bool strict
 
bool skipBom
 

Protected Types

enum  {
  _1000_0000 = 0x80, _1100_0000 = 0xc0, _1110_0000 = 0xe0, _1111_0000 = 0xf0,
  _1111_1000 = 0xf8, _1111_1100 = 0xfc, _1111_1110 = 0xfe, _0011_1111 = 0x3f,
  _0001_1111 = 0x1f, _0000_1111 = 0x0f, _0000_0111 = 0x07, _0000_0011 = 0x03
}
 
enum  { Utf16FirstSurrogate = 0xd800, Utf16SecondSurrogate = 0xdc00 }
 
typedef TUniVecIdx TVecIdx
 

Protected Member Functions

void TestUtf8 (bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV &src, const TIntV &expectedDest, FILE *f)
 
void TestDecodeUtf8 (TRnd &rnd, const TStr &testCaseDesc)
 
void WordsToBytes (const TIntV &src, TIntV &dest)
 
void TestUtf16 (bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV &src, const TIntV &expectedDest, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom, FILE *f)
 
void TestDecodeUtf16 (TRnd &rnd, const TStr &testCaseDesc, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom)
 

Static Protected Member Functions

static bool IsMachineLittleEndian ()
 
static uint GetRndUint (TRnd &rnd)
 
static uint GetRndUint (TRnd &rnd, uint minVal, uint maxVal)
 
static int SwapBytes (int x)
 

Friends

class TUniCaseFolding
 
class TUnicode
 

Detailed Description

Definition at line 54 of file unicode.h.

Member Typedef Documentation

typedef TUniVecIdx TUniCodec::TVecIdx
protected

Definition at line 118 of file unicode.h.

Member Enumeration Documentation

anonymous enum
Enumerator
DefaultReplacementChar 

Definition at line 59 of file unicode.h.

anonymous enum
protected
Enumerator
_1000_0000 
_1100_0000 
_1110_0000 
_1111_0000 
_1111_1000 
_1111_1100 
_1111_1110 
_0011_1111 
_0001_1111 
_0000_1111 
_0000_0111 
_0000_0011 

Definition at line 101 of file unicode.h.

101  {
102 #define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
103  DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
104  DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
105  DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
106  DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
107  DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
108  DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
109  DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
110  DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
111  DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
112  DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
113  DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
114  DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
115 #undef DefineByte
116  };
anonymous enum
protected
Enumerator
Utf16FirstSurrogate 
Utf16SecondSurrogate 

Definition at line 157 of file unicode.h.

Constructor & Destructor Documentation

TUniCodec::TUniCodec ( )
inline

Definition at line 91 of file unicode.h.

92  {
93  }
bool strict
Definition: unicode.h:83
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
bool skipBom
Definition: unicode.h:89
int replacementChar
Definition: unicode.h:64
TUniCodec::TUniCodec ( TUnicodeErrorHandling  errorHandling_,
bool  strict_,
int  replacementChar_,
bool  skipBom_ 
)
inline

Definition at line 95 of file unicode.h.

95  :
96  replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
97  {
98  }
bool strict
Definition: unicode.h:83
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
bool skipBom
Definition: unicode.h:89
int replacementChar
Definition: unicode.h:64

Member Function Documentation

template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::DecodeUtf16FromBytes ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest,
const TUtf16BomHandling  bomHandling = bomAllowed,
const TUniByteOrder  defaultByteOrder = boMachineEndian 
) const

Definition at line 2210 of file unicode.h.

2215 {
2216  IAssert(srcCount % 2 == 0);
2217  IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
2218  IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
2219  if (clrDest) dest.Clr();
2220  size_t nDecoded = 0;
2221  if (srcCount <= 0) return nDecoded;
2222  const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
2223  bool littleEndian = false;
2224  bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian()));
2225  if (bomHandling == bomIgnored) littleEndian = leDefault;
2226  else if (bomHandling == bomAllowed || bomHandling == bomRequired)
2227  {
2228  int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff;
2229  if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; }
2230  else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; }
2231  else if (bomHandling == bomAllowed) littleEndian = leDefault;
2232  else { // Report an error.
2233  switch (errorHandling) {
2234  case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead).");
2235  case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
2236  default: Fail; } }
2237  }
2238  else Fail;
2239  while (srcIdx < srcEnd)
2240  {
2241  const size_t charSrcIdx = srcIdx;
2242  uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
2243  uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
2244  if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
2245  {
2246  // c is the first character in a surrogate pair. Read the next character.
2247  if (! (srcIdx + 2 <= srcEnd)) {
2248  switch (errorHandling) {
2249  case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
2250  case uehAbort: return nDecoded;
2251  case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
2252  case uehIgnore: continue;
2253  default: Fail; } }
2254  uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
2255  uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
2256  // c2 should be the second character of the surrogate pair.
2257  if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
2258  switch (errorHandling) {
2259  case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
2260  case uehAbort: return nDecoded;
2261  // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
2262  case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue;
2263  case uehIgnore: srcIdx -= 2; continue;
2264  default: Fail; } }
2265  // c and c2 each contain 10 bits of information.
2266  uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
2267  cc += 0x10000;
2268  dest.Add(TDestCh(cc)); nDecoded++; continue;
2269  }
2270  else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
2271  switch (errorHandling) {
2272  case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
2273  case uehAbort: return nDecoded;
2274  case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
2275  case uehIgnore: continue;
2276  default: Fail; } }
2277  // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
2278  if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
2279  // Otherwise, store 'c' to the destination vector.
2280  dest.Add(TDestCh(c)); nDecoded++;
2281  }
2282  return nDecoded;
2283 }
#define IAssert(Cond)
Definition: bd.h:262
bool strict
Definition: unicode.h:83
TStr GetStr() const
Definition: dt.h:1107
TUniVecIdx TVecIdx
Definition: unicode.h:118
unsigned int uint
Definition: bd.h:11
#define Fail
Definition: bd.h:238
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
bool skipBom
Definition: unicode.h:89
int replacementChar
Definition: unicode.h:64
static bool IsMachineLittleEndian()
Definition: unicode.cpp:83
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::DecodeUtf16FromWords ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
bool  clrDest,
const TUtf16BomHandling  bomHandling = bomAllowed,
const TUniByteOrder  defaultByteOrder = boMachineEndian 
) const

Definition at line 2294 of file unicode.h.

2299 {
2300  IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
2301  IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
2302  if (clrDest) dest.Clr();
2303  size_t nDecoded = 0;
2304  if (srcCount <= 0) return nDecoded;
2305  const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
2306  bool swap = false;
2307  bool isMachineLe = IsMachineLittleEndian();
2308  bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
2309  if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe);
2310  else if (bomHandling == bomAllowed || bomHandling == bomRequired)
2311  {
2312  int c = uint(src[TVecIdx(srcIdx)]) & 0xffff;
2313  if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; }
2314  else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; }
2315  else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe);
2316  else { // Report an error.
2317  switch (errorHandling) {
2318  case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead).");
2319  case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
2320  default: Fail; } }
2321  }
2322  else Fail;
2323  while (srcIdx < srcEnd)
2324  {
2325  const size_t charSrcIdx = srcIdx;
2326  uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
2327  if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
2328  if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
2329  {
2330  // c is the first character in a surrogate pair. Read the next character.
2331  if (! (srcIdx < srcEnd)) {
2332  switch (errorHandling) {
2333  case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
2334  case uehAbort: return nDecoded;
2335  case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
2336  case uehIgnore: continue;
2337  default: Fail; } }
2338  uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
2339  if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8);
2340  // c2 should be the second character of the surrogate pair.
2341  if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
2342  switch (errorHandling) {
2343  case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
2344  case uehAbort: return nDecoded;
2345  // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
2346  case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue;
2347  case uehIgnore: srcIdx -= 1; continue;
2348  default: Fail; } }
2349  // c and c2 each contain 10 bits of information.
2350  uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
2351  cc += 0x10000;
2352  dest.Add(TDestCh(cc)); nDecoded++; continue;
2353  }
2354  else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
2355  switch (errorHandling) {
2356  case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
2357  case uehAbort: return nDecoded;
2358  case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
2359  case uehIgnore: continue;
2360  default: Fail; } }
2361  // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
2362  if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
2363  // Otherwise, store 'c' to the destination vector.
2364  dest.Add(TDestCh(c)); nDecoded++;
2365  }
2366  return nDecoded;
2367 }
#define IAssert(Cond)
Definition: bd.h:262
bool strict
Definition: unicode.h:83
TStr GetStr() const
Definition: dt.h:1107
TUniVecIdx TVecIdx
Definition: unicode.h:118
unsigned int uint
Definition: bd.h:11
#define Fail
Definition: bd.h:238
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
bool skipBom
Definition: unicode.h:89
int replacementChar
Definition: unicode.h:64
static bool IsMachineLittleEndian()
Definition: unicode.cpp:83
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::DecodeUtf8 ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest = true 
) const

Definition at line 2036 of file unicode.h.

2039 {
2040  size_t nDecoded = 0;
2041  if (clrDest) dest.Clr();
2042  const size_t origSrcIdx = srcIdx;
2043  const size_t srcEnd = srcIdx + srcCount;
2044  while (srcIdx < srcEnd)
2045  {
2046  const size_t charSrcIdx = srcIdx;
2047  uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
2048  if ((c & _1000_0000) == 0) {
2049  // c is one of the characters 0..0x7f, encoded as a single byte.
2050  dest.Add(TDestCh(c)); nDecoded++; continue; }
2051  else if ((c & _1100_0000) == _1000_0000) {
2052  // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx.
2053  // We must have been thrown into the middle of a multi-byte character.
2054  switch (errorHandling) {
2055  case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx.");
2056  case uehAbort: return nDecoded;
2057  case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
2058  case uehIgnore: continue;
2059  default: Fail; } }
2060  else
2061  {
2062  // c introduces a sequence of 2..6 bytes, depending on how many
2063  // of the most significant bits of c are set.
2064  uint nMoreBytes = 0, nBits = 0, minVal = 0;
2065  if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80;
2066  else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800;
2067  else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000;
2068  else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000;
2069  else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000;
2070  else {
2071  // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8
2072  // (which allowed the encoding of codepoints up to 2^31 - 1). However, in principle this
2073  // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh
2074  // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh.
2075  if (strict) {
2076  switch (errorHandling) {
2077  case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x.");
2078  case uehAbort: return nDecoded;
2079  // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes
2080  // and try to decode the character. Then, since 'strict' is true and
2081  // the codepoint is clearly >= 2^31, we'll notice this as an error later
2082  // and (in the case of uehReplace) insert a replacement character then.
2083  // This is probably better than inserting a replacement character right
2084  // away and then trying to read the next byte as if a new character
2085  // was beginning there -- if the current byte is really followed by five
2086  // 10xxxxxx bytes, we'll just get six replacement characters in a row.
2087  case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue;
2088  case uehIgnore: break; // continue;
2089  default: Fail; } }
2090  nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; }
2091  // Decode this multi-byte sequence.
2092  uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c.
2093  bool cancel = false;
2094  for (uint i = 0; i < nMoreBytes && ! cancel; i++) {
2095  // See if there are enough bytes left in the source vector.
2096  if (! (srcIdx < srcEnd)) {
2097  switch (errorHandling) {
2098  case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available.");
2099  case uehAbort: return nDecoded;
2100  case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue;
2101  case uehIgnore: cancel = true; continue;
2102  default: Fail; } }
2103  // Read the next byte.
2104  c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
2105  if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx.
2106  switch (errorHandling) {
2107  case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx.");
2108  case uehAbort: return nDecoded;
2109  case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue;
2110  case uehIgnore: srcIdx--; cancel = true; continue;
2111  default: Fail; } }
2112  cOut <<= 6; cOut |= (c & _0011_1111); }
2113  if (cancel) continue;
2114  if (strict) {
2115  // err1: This codepoint has been represented by more bytes than it should have been.
2116  // For example, cOut in the range 0..127 should be represented by a single byte,
2117  // not by two or more bytes.
2118  // - For example, this may happen in the "modified UTF-8" sometimes used for Java
2119  // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid
2120  // the appearance of null bytes in the encoded stream.
2121  bool err1 = (cOut < minVal);
2122  // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes.
2123  // However, later this was restricted to the codepoints 0..0x10ffff only, because only these
2124  // are valid Unicode codepoints. Thus, no more than 4 bytes are ever necessary.
2125  bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff));
2126  if (err1 || err2) switch (errorHandling) {
2127  case uehThrow:
2128  if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ").");
2129  else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid.");
2130  else { Fail; break; }
2131  case uehAbort: return nDecoded;
2132  case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
2133  case uehIgnore: continue;
2134  default: Fail; } }
2135  // Add the decoded codepoint to the destination vector.
2136  // If this is the first decoded character, and it's one of the byte-order marks
2137  // (0xfffe and 0xfeff), we will skip it (unless skipBom is false).
2138  if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) {
2139  dest.Add(cOut); nDecoded++; }
2140  } // else (multi-byte sequence)
2141  } // while
2142  return nDecoded;
2143 }
bool strict
Definition: unicode.h:83
TStr GetStr() const
Definition: dt.h:1107
TUniVecIdx TVecIdx
Definition: unicode.h:118
unsigned int uint
Definition: bd.h:11
#define Fail
Definition: bd.h:238
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
bool skipBom
Definition: unicode.h:89
int replacementChar
Definition: unicode.h:64
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::DecodeUtf8 ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
const bool  clrDest = true 
) const
inline

Definition at line 136 of file unicode.h.

136 { return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }
size_t DecodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2036
template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::EncodeUtf16ToBytes ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest,
const bool  insertBom,
const TUniByteOrder  destByteOrder = boMachineEndian 
) const

Definition at line 2428 of file unicode.h.

2432 {
2433  bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian()));
2434  size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
2435  if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 0xfe : 0xff); nEncoded++; }
2436  while (srcIdx < srcEnd)
2437  {
2438  uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
2439  if (! (c <= 0x10ffffu)) {
2440  switch (errorHandling) {
2441  case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
2442  case uehAbort: return nEncoded;
2443 #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
2444  case uehReplace: ___OutRepl; continue;
2445  case uehIgnore: continue;
2446  default: Fail; } }
2447  if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
2448  switch (errorHandling) {
2449  case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
2450  case uehAbort: return nEncoded;
2451  case uehReplace: ___OutRepl; continue;
2452  case uehIgnore: continue;
2453  default: Fail; } }
2454  if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
2455  switch (errorHandling) {
2456  case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
2457  case uehAbort: return nEncoded;
2458  case uehReplace: ___OutRepl; continue;
2459  case uehIgnore: continue;
2460  default: Fail; } }
2461 #undef ___OutRepl
2462  // If c is <= 0xffff, it can be stored directly.
2463  if (c <= 0xffffu) {
2464  if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
2465  else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); }
2466  nEncoded++; continue; }
2467  // Otherwise, represent c by a pair of surrogate characters.
2468  c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
2469  uint c1 = (c >> 10) & 1023, c2 = c & 1023;
2471  if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); }
2472  else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); }
2473  nEncoded++; continue;
2474  }
2475  return nEncoded;
2476 }
#define IAssert(Cond)
Definition: bd.h:262
TUniVecIdx TVecIdx
Definition: unicode.h:118
#define ___OutRepl
unsigned int uint
Definition: bd.h:11
#define Fail
Definition: bd.h:238
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
TStr GetStr() const
Definition: dt.h:1189
static bool IsMachineLittleEndian()
Definition: unicode.cpp:83
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::EncodeUtf16ToWords ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest,
const bool  insertBom,
const TUniByteOrder  destByteOrder = boMachineEndian 
) const

Definition at line 2376 of file unicode.h.

2380 {
2381  bool isMachineLe = IsMachineLittleEndian();
2382  bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe);
2383  size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
2384  if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; }
2385  while (srcIdx < srcEnd)
2386  {
2387  uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
2388  if (! (c <= 0x10ffffu)) {
2389  switch (errorHandling) {
2390  case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
2391  case uehAbort: return nEncoded;
2392  case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
2393  case uehIgnore: continue;
2394  default: Fail; } }
2395  if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
2396  switch (errorHandling) {
2397  case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
2398  case uehAbort: return nEncoded;
2399  case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
2400  case uehIgnore: continue;
2401  default: Fail; } }
2402  if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
2403  switch (errorHandling) {
2404  case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
2405  case uehAbort: return nEncoded;
2406  case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
2407  case uehIgnore: continue;
2408  default: Fail; } }
2409  // If c is <= 0xffff, it can be stored directly.
2410  if (c <= 0xffffu) {
2411  if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
2412  dest.Add(TDestCh(c)); nEncoded++; continue; }
2413  // Otherwise, represent c by a pair of surrogate characters.
2414  c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
2415  uint c1 = (c >> 10) & 1023, c2 = c & 1023;
2417  if (swap) {
2418  c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8);
2419  c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); }
2420  dest.Add(TDestCh(c1));
2421  dest.Add(TDestCh(c2));
2422  nEncoded++; continue;
2423  }
2424  return nEncoded;
2425 }
#define IAssert(Cond)
Definition: bd.h:262
static int SwapBytes(int x)
Definition: unicode.h:250
TUniVecIdx TVecIdx
Definition: unicode.h:118
unsigned int uint
Definition: bd.h:11
#define Fail
Definition: bd.h:238
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
TStr GetStr() const
Definition: dt.h:1189
int replacementChar
Definition: unicode.h:64
static bool IsMachineLittleEndian()
Definition: unicode.cpp:83
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::EncodeUtf8 ( const TSrcVec &  src,
size_t  srcIdx,
const size_t  srcCount,
TVec< TDestCh > &  dest,
const bool  clrDest = true 
) const

Definition at line 2152 of file unicode.h.

2155 {
2156  size_t nEncoded = 0;
2157  for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
2158  {
2159  uint c = uint(src[TVecIdx(srcIdx)]);
2160  bool err = false;
2161  if (strict && c > 0x10ffff) {
2162  err = true;
2163  switch (errorHandling) {
2164  case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed).");
2165  case uehAbort: return nEncoded;
2166  case uehReplace: c = replacementChar; break;
2167  case uehIgnore: continue;
2168  default: Fail; } }
2169  if (c < 0x80u)
2170  dest.Add(TDestCh(c & 0xffu));
2171  else if (c < 0x800u) {
2172  dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));
2173  dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2174  else if (c < 0x10000u) {
2175  dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));
2176  dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
2177  dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2178  else if (c < 0x200000u) {
2179  dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));
2180  dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
2181  dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
2182  dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2183  else if (c < 0x4000000u) {
2184  dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));
2185  dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
2186  dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
2187  dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
2188  dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2189  else {
2190  dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));
2191  dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));
2192  dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
2193  dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
2194  dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
2195  dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2196  if (! err) nEncoded++;
2197  }
2198  return nEncoded;
2199 }
bool strict
Definition: unicode.h:83
TStr GetStr() const
Definition: dt.h:1107
TUniVecIdx TVecIdx
Definition: unicode.h:118
unsigned int uint
Definition: bd.h:11
#define Fail
Definition: bd.h:238
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
int replacementChar
Definition: unicode.h:64
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
template<typename TSrcVec , typename TDestCh >
size_t TUniCodec::EncodeUtf8 ( const TSrcVec &  src,
TVec< TDestCh > &  dest,
const bool  clrDest = true 
) const
inline

Definition at line 145 of file unicode.h.

145 { return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }
size_t EncodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2152
// UTF-8-encodes 'srcCount' elements of 'src', starting at 'srcIdx', and returns the bytes as a TStr.
// Definition at line 149 of file unicode.h (inline).
template<typename TSrcVec>
TStr EncodeUtf8Str(const TSrcVec& src, size_t srcIdx, const size_t srcCount) const
{
  TVec<char> temp;
  EncodeUtf8(src, srcIdx, srcCount, temp);
  // The TStr is built from a plain char pointer, so the buffer has to end with a 0 byte
  // (the whole-vector overload of EncodeUtf8Str already appends one).  Appending it also
  // guarantees that temp[0] exists when nothing was encoded.
  temp.Add(0);
  TStr retVal = &(temp[0]);
  return retVal;
}
Definition: dt.h:412
size_t EncodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2152
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:429
// UTF-8-encodes the whole of 'src' and returns the result as a TStr.
// Definition at line 150 of file unicode.h (inline).
template<typename TSrcVec>
TStr EncodeUtf8Str(const TSrcVec& src) const
{
  TVec<char> utf8Buf;
  EncodeUtf8(src, utf8Buf);
  utf8Buf.Add(0); // terminating 0 so that the buffer can be read as a C string
  char *firstCh = &(utf8Buf[0]);
  TStr encoded = firstCh;
  return encoded;
}
Definition: dt.h:412
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
size_t EncodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2152
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:429
// Assembles a uint from four successive draws of rnd.GetUniDevUInt(256), eight bits per draw;
// the first draw ends up in the most significant byte.
// Definition at line 62 of file unicode.cpp (static, protected).
uint TUniCodec::GetRndUint(TRnd& rnd)
{
  uint result = 0;
  for (int byteNo = 0; byteNo < 4; byteNo++) {
    const uint nextByte = rnd.GetUniDevUInt(256) & 0xff;
    result = (result << 8) | nextByte;
  }
  return result;
}
unsigned int uint
Definition: bd.h:11
uint GetUniDevUInt(const uint &Range=0)
Definition: dt.cpp:45
// Returns a random value from the closed interval [minVal, maxVal].
// Candidates are taken from GetRndUint(rnd), reduced with a bit mask that covers the number of
// admissible values, and redrawn until one falls inside the interval (rejection sampling).
// Definition at line 71 of file unicode.cpp (static, protected).
uint TUniCodec::GetRndUint(TRnd& rnd, uint minVal, uint maxVal)
{
  // The full uint range needs no reduction at all (and 'maxVal - minVal + 1' would wrap to 0).
  const bool wholeRange = (minVal == TUInt::Mn && maxVal == TUInt::Mx);
  if (wholeRange) { return GetRndUint(rnd); }
  const uint nValues = maxVal - minVal + 1;
  const uint topBit = uint(1) << (8 * sizeof(uint) - 1);
  // More than half of all uints are admissible: no mask helps, so keep every bit.
  uint bitMask = ~uint(0);
  if (nValues <= topBit) {
    // Smallest power of two that is >= nValues, minus one.
    bitMask = 1;
    while (bitMask < nValues) { bitMask <<= 1; }
    bitMask -= 1;
  }
  for (;;) {
    const uint candidate = GetRndUint(rnd) & bitMask;
    if (candidate < nValues) { return minVal + candidate; }
  }
}
static const uint Mn
Definition: dt.h:1153
static const uint Mx
Definition: dt.h:1154
unsigned int uint
Definition: bd.h:11
static uint GetRndUint(TRnd &rnd)
Definition: unicode.cpp:62
// Reports whether the first byte of an int holding the value 1 is 1, i.e. whether the
// least significant byte is stored first.  The answer is computed on the first call and
// cached in function-local statics for later calls.
// Definition at line 83 of file unicode.cpp (static, protected).
bool TUniCodec::IsMachineLittleEndian()
{
  static bool answerKnown = false, littleEndian;
  if (! answerKnown) {
    int probe = 1;
    littleEndian = (*(char *)&probe == 1);
    answerKnown = true;
  }
  return littleEndian;
}
// Exchanges the two low-order bytes of x; all bits above the low 16 are discarded.
// Definition at line 250 of file unicode.h (inline, static, protected).
static int SwapBytes(int x)
{
  const int highByte = (x >> 8) & 0xff;
  const int lowByte = x & 0xff;
  return (lowByte << 8) | highByte;
}
void TUniCodec::TestDecodeUtf16 ( TRnd rnd,
const TStr testCaseDesc,
const TUtf16BomHandling  bomHandling,
const TUniByteOrder  defaultByteOrder,
const bool  insertBom 
)
protected

Definition at line 341 of file unicode.cpp.

// Builds one randomized UTF-16 decoding test case from 'testCaseDesc' and runs it via TestUtf16(true, ...).
// For every descriptor character a code point is drawn with GetRndUint, its UTF-16 words are appended
// to 'src' (byte-swapped when the requested default byte order differs from the machine's), and
// 'expectedDest' / 'expectedRetVal' / 'expectedAbort' are updated to what the decoder is expected to
// produce under the current 'strict', 'skipBom', 'errorHandling' and 'replacementChar' members.
// The order of the GetRndUint calls determines the generated data, so statements must not be reordered.
345 {
346  TIntV src; TIntV expectedDest; int expectedRetVal = 0;
347  bool expectedAbort = false;
// 'f' is a null pointer here, so the optional logging below is switched off.
348  FILE *f = 0;
349  bool isMachineLe = IsMachineLittleEndian();
350  bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
// The source words only need swapping when the byte order being tested is not the machine's own.
351  bool swap = (isMachineLe != isDefaultLe);
352  if (insertBom) {
353  src.Add(swap ? 0xfffe : 0xfeff);
354  if (! skipBom) { expectedRetVal += 1; expectedDest.Add(0xfeff); } }
355  else if (bomHandling == bomRequired) {
// A required BOM is missing: nothing is expected to be decoded.  The -1 is converted to
// size_t when it is passed to TestUtf16 below.
356  expectedAbort = true; expectedRetVal = -1; }
357  // testCaseDesc should consist of single characters or pairs of characters, 'c[e]', where:
358  // - 'c' defines the range from which the codepoint should be taken ('A'..'E', 'X'..'Y');
359  // - 'e' defines how many words will be removed from the end of the encoded sequence for this codepoint.
360  // (absent = 0, 'a' = 1).
361  for (int i = 0; i < testCaseDesc.Len(); )
362  {
363  const char c = testCaseDesc[i++];
364  uint cp = 0; int nWords = -1;
365  if (c == 'X' || c == 'Y') IAssert(i > 1); // if you want a BOM at the beginning of your data, use insertBom -- if we permit X and Y here, predicting the expectedDest and expectedRetVal gets more complicated
366  if (c == 'A') { cp = GetRndUint(rnd, 0u, Utf16FirstSurrogate - 1); nWords = 1; } // characters below the first surrogate range
367  else if (c == 'B') { cp = GetRndUint(rnd, Utf16FirstSurrogate, Utf16FirstSurrogate + 1023); nWords = 1; } // the first surrogate range
368  else if (c == 'C') { cp = GetRndUint(rnd, Utf16SecondSurrogate, Utf16SecondSurrogate + 1023); nWords = 1; } // the second surrogate range
369  else if (c == 'D') { do { cp = GetRndUint(rnd, Utf16SecondSurrogate + 1024, 0xffffu); } while (cp == 0xfffe || cp == 0xfeff); nWords = 1; } // above the second surrogate range, but still in the BMP
370  else if (c == 'E') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); nWords = 2; } // above the BMP, but still within the range for UTF-16
371  else if (c == 'X') { cp = 0xfffe; nWords = 1; }
372  else if (c == 'Y') { cp = 0xfeff; nWords = 1; }
373  else Fail;
// A lone first surrogate ('B') must not be followed directly by a second surrogate ('C');
// presumably because the two words would then combine into a valid pair and invalidate the prediction.
374  if (c == 'B' && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
375  // Process 'e'.
376  int nToDel = 0;
377  if (i < testCaseDesc.Len()) {
378  const char e = testCaseDesc[i];
379  if (e >= 'a') { i += 1; nToDel = 1; }}
380  IAssert((nWords == 1 && nToDel == 0) || (nWords == 2 && (nToDel == 0 || nToDel == 1)));
// Same restriction for a truncated pair: the first surrogate that is left behind must not be followed by 'C'.
381  if (nWords == 2 && nToDel == 1 && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
382  // Will an error occur during the decoding of this codepoint?
383  bool errHere = false;
384  if (Utf16FirstSurrogate <= cp && cp <= Utf16FirstSurrogate + 1023) errHere = true;
385  else if (cp > 0x10ffff) { Fail; errHere = true; }
386  else if (nToDel > 0) errHere = true;
387  else if (strict && (Utf16SecondSurrogate <= cp && cp <= Utf16SecondSurrogate + 1023)) errHere = true;
388  // Update 'expectedDest' and 'expectedRetVal'.
// Once an abort is expected, later characters are still added to 'src' but no longer to the expectations.
389  if (! expectedAbort) {
390  if (! errHere) {
391  if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
392  else { expectedDest.Add(cp); expectedRetVal += 1; } }
393  else if (errorHandling == uehReplace) {
394  expectedDest.Add(replacementChar); }
395  if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
396  // Update 'src'.
397  if (nWords == 1) src.Add(swap ? SwapBytes(cp) : cp);
398  else {
// Split the code point into its surrogate pair; the second word is left out when 'e' asked for truncation.
399  int c1 = ((cp - 0x10000) >> 10) & 1023; c1 += Utf16FirstSurrogate;
400  int c2 = (cp - 0x10000) & 1023; c2 += Utf16SecondSurrogate;
401  src.Add(swap ? SwapBytes(c1) : c1);
402  if (nToDel == 0) src.Add(swap ? SwapBytes(c2) : c2); }
403  }
404  if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
// insertBom is passed as false here: TestUtf16 only uses that argument for encoding and for its log output.
405  TestUtf16(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, bomHandling, defaultByteOrder, false, f);
406 }
#define IAssert(Cond)
Definition: bd.h:262
static int SwapBytes(int x)
Definition: unicode.h:250
bool strict
Definition: unicode.h:83
int Len() const
Definition: dt.h:487
unsigned int uint
Definition: bd.h:11
#define Fail
Definition: bd.h:238
static uint GetRndUint(TRnd &rnd)
Definition: unicode.cpp:62
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
bool skipBom
Definition: unicode.h:89
int replacementChar
Definition: unicode.h:64
static bool IsMachineLittleEndian()
Definition: unicode.cpp:83
void TestUtf16()
Definition: unicode.cpp:408
char * CStr()
Definition: dt.h:476
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
void TUniCodec::TestDecodeUtf8 ( TRnd rnd,
const TStr testCaseDesc 
)
protected

Definition at line 133 of file unicode.cpp.

134 {
135  TIntV src; TIntV expectedDest; int expectedRetVal = 0;
136  bool expectedAbort = false;
137  FILE *f = 0; // stderr
138  // testCaseDesc should consist of pairs or triples of characters, 'cd[e]', where:
139  // - 'c' defines the range from which the codepoint should be taken ('A'..'H', 'X'..'Z');
140  // - 'd' defines how many bytes the codepoint should be encoded with ('1'..'6');
141  // - 'e' defines how many bytes will be removed from the end of the encoded sequence for this codepoint.
142  // (absent = 0, 'a' = 1, 'b' = 2 and so on).
143  for (int i = 0; i < testCaseDesc.Len(); )
144  {
145  IAssert(i + 2 <= testCaseDesc.Len());
146  const char c = testCaseDesc[i], d = testCaseDesc[i + 1]; i += 2;
147  uint cp = 0; int nBytes = -1, minBytes = -1; bool eighties = false;
148  IAssert('1' <= d && d <= '6'); nBytes = d - '0';
149  if (c == 'A') { cp = GetRndUint(rnd, 0u, 0x7fu); minBytes = 1; } // 1 byte
150  else if (c == 'B') { cp = GetRndUint(rnd, 0x80u, 0x7ffu); minBytes = 2; } // 2 bytes
151  else if (c == 'C') { cp = GetRndUint(rnd, 0x800u, 0xffffu); minBytes = 3; } // 3 bytes
152  else if (c == 'D') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); minBytes = 4; } // 4 bytes, valid Unicode
153  else if (c == 'E') { cp = GetRndUint(rnd, 0x110000u, 0x1fffffu); minBytes = 4; } // 4 bytes, invalid Unicode
154  else if (c == 'F') { cp = GetRndUint(rnd, 0x200000u, 0x3ffffffu); minBytes = 5; } // 5 bytes
155  else if (c == 'G') { cp = GetRndUint(rnd, 0x4000000u, 0x7fffffffu); minBytes = 6; } // 6 bytes, 31 bits
156  else if (c == 'H') { cp = GetRndUint(rnd, 0x80000000u, 0xffffffffu); minBytes = 6; } // 6 bytes, 32 bits
157  else if (c == 'X') { cp = 0xfffe; minBytes = 3; }
158  else if (c == 'Y') { cp = 0xfeff; minBytes = 3; }
159  else if (c == 'Z') { eighties = true; minBytes = 1; } // insert several random 10xxxxxx bytes (= 0x80 | random(0..0x3f))
160  else Fail;
161  IAssert(nBytes >= minBytes);
162  // Process 'e'.
163  int nToDel = 0;
164  if (i < testCaseDesc.Len()) {
165  const char e = testCaseDesc[i];
166  if (e >= 'a' && e <= 'e') { i += 1; nToDel = e - 'a' + 1; }}
167  IAssert(nToDel < nBytes);
168  // Will an error occur during the decoding of this codepoint?
169  bool errHere = false;
170  if (eighties) errHere = true;
171  else if (nToDel > 0) errHere = true;
172  else if (strict && (cp >= 0x10ffff || nBytes > minBytes)) errHere = true;
173  // Update 'expectedDest' and 'expetedRetVal'.
174  if (! expectedAbort) {
175  if (! errHere) {
176  if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
177  else { expectedDest.Add(cp); expectedRetVal += 1; } }
178  else if (errorHandling == uehReplace) {
179  if (eighties) for (int j = 0; j < nBytes; j++) expectedDest.Add(replacementChar);
180  else expectedDest.Add(replacementChar); }
181  if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
182  // Update 'src'.
183  if (eighties) for (int j = 0; j < nBytes; j++) src.Add(GetRndUint(rnd, 0x80, 0xff));
184  else if (nBytes == 1) src.Add(cp);
185  else {
186  int mask = (1 << nBytes) - 1; mask <<= (8 - nBytes);
187  src.Add(mask | (uint(cp) >> (6 * (nBytes - 1))));
188  for (int j = 1; j < nBytes - nToDel; j++) src.Add(0x80 | ((cp >> (6 * (nBytes - j - 1))) & _0011_1111)); }
189  }
190  if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
191  TestUtf8(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, f);
192 }
#define IAssert(Cond)
Definition: bd.h:262
bool strict
Definition: unicode.h:83
int Len() const
Definition: dt.h:487
unsigned int uint
Definition: bd.h:11
#define Fail
Definition: bd.h:238
static uint GetRndUint(TRnd &rnd)
Definition: unicode.cpp:62
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
void TestUtf8()
Definition: unicode.cpp:194
bool skipBom
Definition: unicode.h:89
int replacementChar
Definition: unicode.h:64
char * CStr()
Definition: dt.h:476
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
void TUniCodec::TestUtf16 ( bool  decode,
size_t  expectedRetVal,
bool  expectedThrow,
const TIntV src,
const TIntV expectedDest,
const TUtf16BomHandling  bomHandling,
const TUniByteOrder  defaultByteOrder,
const bool  insertBom,
FILE *  f 
)
protected

Definition at line 284 of file unicode.cpp.

287 {
288  TIntV srcBytes, expectedDestBytes;
289  WordsToBytes(src, srcBytes); WordsToBytes(expectedDest, expectedDestBytes);
290  TIntV dest;
291  if (f) {
292  fprintf(f, "Settings: %s %s %s %s %s replacementChar = %x \n",
293  (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
294  (strict ? "STRICT" : ""), (decode ? (skipBom ? "skipBom" : "") : (insertBom ? "insrtBom" : "")),
295  (bomHandling == bomAllowed ? "bomAllowed" : bomHandling == bomRequired ? "bomRequired" : "bomIgnored"),
296  (defaultByteOrder == boBigEndian ? "boBigEndian" : defaultByteOrder == boLittleEndian ? "boLittleEndian" : "boMachineEndian"),
298  fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %04x" : " %x"), uint(src[i])); }
299  for (int useBytes = 0; useBytes < 2; useBytes++)
300  {
301  const char *fmt = (useBytes ? " %02x" : " %04x");
302  try
303  {
304  dest.Clr();
305  size_t retVal;
306  if (! useBytes) {
307  if (decode) retVal = DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder);
308  else retVal = EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
309  else {
310  if (decode) retVal = DecodeUtf16FromBytes(srcBytes, 0, srcBytes.Len(), dest, true, bomHandling, defaultByteOrder);
311  else retVal = EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
312  const TIntV& ed = (useBytes && ! decode ? expectedDestBytes : expectedDest);
313  if (f) {
314  fprintf(f, "\n -> dest: "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(dest[i]));
315  fprintf(f, "\n expDest "); for (int i = 0; i < ed.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(ed[i]));
316  fprintf(f, "\n retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
317  bool ok = true;
318  if (retVal != expectedRetVal) ok = false;
319  if (dest.Len() != ed.Len()) ok = false;
320  if (ok) for (int i = 0; i < dest.Len(); i++) if (dest[i] != ed[i]) ok = false;
321  if (! ok)
322  {
323  printf("!!!\n");
324  }
325  IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
326  IAssert(dest.Len() == ed.Len());
327  for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == ed[i]);
328  }
329  catch (TUnicodeException e)
330  {
331  if (f) {
332  fprintf(f, "\n -> expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(expectedDest[i]));
333  fprintf(f, "\n exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
334  IAssert(expectedThrow);
335  }
336  }
337 }
#define IAssert(Cond)
Definition: bd.h:262
bool strict
Definition: unicode.h:83
size_t srcIdx
Definition: unicode.h:32
unsigned int uint
Definition: bd.h:11
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
size_t DecodeUtf16FromWords(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
Definition: unicode.h:2294
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
size_t EncodeUtf16ToBytes(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
Definition: unicode.h:2428
bool skipBom
Definition: unicode.h:89
size_t DecodeUtf16FromBytes(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
Definition: unicode.h:2210
int replacementChar
Definition: unicode.h:64
size_t EncodeUtf16ToWords(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
Definition: unicode.h:2376
char * CStr()
Definition: dt.h:476
void WordsToBytes(const TIntV &src, TIntV &dest)
Definition: unicode.cpp:274
void TUniCodec::TestUtf16 ( )

Definition at line 408 of file unicode.cpp.

409 {
410  TIntV utf16ReplCh; utf16ReplCh.Add(replacementChar);
411  for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
412  for (int strict_ = 0; strict_ < 2; strict_++)
413  for (int errMode_ = 0; errMode_ < 4; errMode_++)
414  for (int bomHandling_ = 0; bomHandling_ < 3; bomHandling_++)
415  for (int byteOrder_ = 0; byteOrder_ < 3; byteOrder_++)
416  for (int insertBom_ = 0; insertBom_ < 2; insertBom_++)
417  {
418  strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
419  bool insertBom = (insertBom_ == 1);
420  TUniByteOrder byteOrder = (TUniByteOrder) byteOrder_;
421  TUtf16BomHandling bomHandling = (TUtf16BomHandling) bomHandling_;
422  TRnd rnd = TRnd(123);
423  // Test DecodeUtf16 on various random UTF-16-encoded sequences.
424  for (int i = 0; i < 10; i++)
425  {
426  TestDecodeUtf16(rnd, "A", bomHandling, byteOrder, insertBom);
427  TestDecodeUtf16(rnd, "AAA", bomHandling, byteOrder, insertBom);
428  TestDecodeUtf16(rnd, "B", bomHandling, byteOrder, insertBom);
429  TestDecodeUtf16(rnd, "DDAADADAAADDDAA", bomHandling, byteOrder, insertBom);
430  TestDecodeUtf16(rnd, "DEEEDAAEEDADEEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
431  TestDecodeUtf16(rnd, "DEaEaEDAAEaEDADEaEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
432  TestDecodeUtf16(rnd, "CABDEBACCEaB", bomHandling, byteOrder, insertBom);
433  TestDecodeUtf16(rnd, "EaEEEEaBBACABXABYXXEaYDDXBDCEA", bomHandling, byteOrder, insertBom);
434  TestDecodeUtf16(rnd, "EaEEEEaBDCAAXADYXXEaYDDXDCEA", bomHandling, byteOrder, insertBom);
435  }
436  //continue;
437  // Test both DecodeUtf16 and EncodeUtf16 systematically on various characters
438  // close to powers of 2.
439  TIntV src, expectedDest, src2;
440  expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
441  for (int pow = 8; pow <= 32; pow++)
442  {
443  uint uFrom, uTo;
444  if (pow == 8) uFrom = 0, uTo = 1u << pow;
445  else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
446  else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
447  printf("%u..%u \r", uFrom, uTo);
448  for (uint u = uFrom; ; u++)
449  {
450  int nWords = 0;
451  if (u < 0x10000) nWords = 1;
452  else nWords = 2;
453  bool isMachineLe = IsMachineLittleEndian(), isDestLe = (byteOrder == boLittleEndian || (byteOrder == boMachineEndian && isMachineLe));
454  bool swap = (isMachineLe != isDestLe);
455  bool err = (u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023) || (strict && Utf16SecondSurrogate <= u && u <= Utf16SecondSurrogate + 1023);
456  src.Gen(3, (err ? 0 : nWords) + (insertBom ? 1 : 0));
457  if (insertBom) src[0] = (swap ? 0xfffe : 0xfeff);
458  if (! ((u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023)))
459  {
460  // Try to encode 'u' and see if it gets decoded correctly.
461  if (nWords == 1) src[insertBom ? 1 : 0] = (swap ? SwapBytes(u) : u);
462  else {
463  int u1 = Utf16FirstSurrogate + (((u - 0x10000) >> 10) & 1023);
464  int u2 = Utf16SecondSurrogate + ((u - 0x10000) & 1023);
465  src[insertBom ? 1 : 0] = (swap ? SwapBytes(u1) : u1);
466  src[insertBom ? 2 : 1] = (swap ? SwapBytes(u2) : u2); }
467  if (! ((u == 0xfffe || u == 0xfeff) && bomHandling == bomAllowed && ! insertBom)) // this will just create a mess when decoding
468  {
469  expectedDest.Reserve(2, 0);
470  if (insertBom && ! skipBom) expectedDest.Add(0xfeff);
471  if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
472  else if (! err) expectedDest.Add(u);
473  int erv = (err ? 0 : expectedDest.Len());
474  if (skipBom && (u == 0xfeff || u == 0xfffe) && ! insertBom) expectedDest.Clr(), erv = 0;
475  bool errD = err;
476  if (bomHandling == bomRequired && ! insertBom) {
477  expectedDest.Clr(false);
478  if (u == 0xfeff || u == 0xfffe) { erv = (skipBom ? 0 : 1); if (! skipBom) expectedDest.Add(0xfeff); }
479  else { erv = -1; errD = true;
480  /*if (errorHandling == uehReplace) expectedDest.Add(replacementChar);*/ }}
481  TestUtf16(true, erv, (errD && errorHandling == uehThrow), src, expectedDest, bomHandling, byteOrder, insertBom, 0);
482  }
483  }
484  // We can also test the UTF-16 encoder.
485  src2[0] = u;
486  if (err) {
487  src.Clr(false); if (insertBom) src.Add(swap ? 0xfffe : 0xfeff);
488  if (errorHandling == uehReplace) {
490  /*if (byteOrder == boBigEndian || (byteOrder == boMachineEndian && ! TUniCodec::IsMachineLittleEndian())) { src.Add((replacementChar >> 8) & 0xff); src.Add(replacementChar & 0xff); }
491  else { src.Add(replacementChar & 0xff); src.Add((replacementChar >> 8) & 0xff); } */
492  }}
493  TestUtf16(false, (err ? 0 : 1) + (insertBom ? 1 : 0), (err && errorHandling == uehThrow), src2, src, bomHandling, byteOrder, insertBom, 0);
494  //
495  if (u == uTo) break;
496  }
497  }
498  }
499 }
static int SwapBytes(int x)
Definition: unicode.h:250
void TestDecodeUtf16(TRnd &rnd, const TStr &testCaseDesc, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom)
Definition: unicode.cpp:341
bool strict
Definition: unicode.h:83
Definition: dt.h:11
static const uint Mx
Definition: dt.h:1154
enum TUnicodeErrorHandling_ TUnicodeErrorHandling
unsigned int uint
Definition: bd.h:11
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
enum TUniByteOrder_ TUniByteOrder
enum TUtf16BomHandling_ TUtf16BomHandling
bool skipBom
Definition: unicode.h:89
int replacementChar
Definition: unicode.h:64
static bool IsMachineLittleEndian()
Definition: unicode.cpp:83
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:495
void Reserve(const TSizeTy &_MxVals)
Reserves enough memory for the vector to store _MxVals elements.
Definition: ds.h:515
void TestUtf16()
Definition: unicode.cpp:408
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
void TUniCodec::TestUtf8 ( bool  decode,
size_t  expectedRetVal,
bool  expectedThrow,
const TIntV src,
const TIntV expectedDest,
FILE *  f 
)
protected

Definition at line 99 of file unicode.cpp.

100 {
101  TIntV dest;
102  if (f) {
103  fprintf(f, "Settings: %s %s %s replacementChar = %x\n",
104  (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
105  (strict ? "STRICT" : ""), (skipBom ? "skipBom" : ""), uint(replacementChar));
106  fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %02x" : " %x"), uint(src[i])); }
107  try
108  {
109  size_t retVal = (decode ? DecodeUtf8(src, 0, src.Len(), dest, true) : EncodeUtf8(src, 0, src.Len(), dest, true));
110  if (f) {
111  fprintf(f, "\n -> dest: "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" : " %02x"), uint(dest[i]));
112  fprintf(f, "\n expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : " %02x"), uint(expectedDest[i]));
113  fprintf(f, "\n retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
114  if (retVal != expectedRetVal)
115  printf("!!!");
116  IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
117  if (dest.Len() != expectedDest.Len())
118  printf("!!!");
119  IAssert(dest.Len() == expectedDest.Len());
120  for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]);
121  }
122  catch (TUnicodeException e)
123  {
124  if (f) {
125  fprintf(f, "\n -> expDest "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, " %x", uint(expectedDest[i]));
126  fprintf(f, "\n exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
127  IAssert(expectedThrow);
128  }
129 }
#define IAssert(Cond)
Definition: bd.h:262
bool strict
Definition: unicode.h:83
size_t srcIdx
Definition: unicode.h:32
unsigned int uint
Definition: bd.h:11
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
bool skipBom
Definition: unicode.h:89
int replacementChar
Definition: unicode.h:64
size_t DecodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2036
char * CStr()
Definition: dt.h:476
size_t EncodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2152
void TUniCodec::TestUtf8 ( )

Definition at line 194 of file unicode.cpp.

// Self-test of the UTF-8 codec.  Iterates over every combination of skipBom, strict and the four
// error-handling modes; for each combination it runs randomized decoding tests (TestDecodeUtf8)
// followed by systematic encode/decode checks on code points in the neighbourhood of every power
// of two from 2^8 to 2^32.
// NOTE: this overwrites the codec's 'strict', 'errorHandling' and 'skipBom' members.
195 {
// UTF-8 encoding of the replacement character; used further down as the expected encoder output in uehReplace mode.
196  TIntV utf8ReplCh; EncodeUtf8((TVectorBuilder(), replacementChar).v, 0, 1, utf8ReplCh, true);
197  for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
198  for (int strict_ = 0; strict_ < 2; strict_++)
199  for (int errMode_ = 0; errMode_ < 4; errMode_++)
200  {
201  strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
// Same seed for every combination of settings, so each one sees the same random data.
202  TRnd rnd = TRnd(123);
203  // Test DecodeUtf8 on various random UTF-8-encoded sequences.
204  for (int i = 0; i < 10; i++)
205  {
206  TestDecodeUtf8(rnd, "X3A1A2A3A4A5A6B2B3B4B5B6C3C4C5C6D4D5D6E5E6F6G6");
207  TestDecodeUtf8(rnd, "X3A5dA6d");
208  TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A1G6H6Y3X3A1");
209  TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A2G6H6Y3X3A1");
210  TestDecodeUtf8(rnd, "Y3A1B2C3D4E4F5A1G6H6Y3X3A1");
211  TestDecodeUtf8(rnd, "A1B2C3D4E4F5A1G6H6Y3X3A1");
212  TestDecodeUtf8(rnd, "G6A1A1D4E4A1B2");
213  TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2");
214  TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2D4a");
215  TestDecodeUtf8(rnd, "X3A1B2C3D5E4F5A1G6H6Y3X3A1");
216  TestDecodeUtf8(rnd, "X3A1B2C3D4E5F5A1G6H6Y3X3A1");
217  TestDecodeUtf8(rnd, "X3A1B2C3D4aE4F5A1G6H6Y3X3A1");
218  TestDecodeUtf8(rnd, "X3A1B2C3D4bE4F5A1G6H6Y3X3A1");
219  TestDecodeUtf8(rnd, "X3A2aA3aA4aA5aA6aB2aB3aB4aB5aB6aC3aC4aC5aC6aD4aD5aD6aE5aE6aF6aG6a");
220  TestDecodeUtf8(rnd, "X3A3bA4bA5bA6aB3bB4bB5bB6bC3bC4bC5bC6bD4bD5bD6bE5bE6bF6bG6b");
221  TestDecodeUtf8(rnd, "X3A4cA5cA6cB4cB5cB6cC4cC5cC6cD4cD5cD6cE5cE6cF6cG6c");
222  TestDecodeUtf8(rnd, "X3A5dA6dB5dB6dC5dC6dD5dD6dE5dE6dF6dG6d");
223  TestDecodeUtf8(rnd, "X3A6eB6eC6eD6eE6eF6eG6e");
224  }
225  // Test both DecodeUtf8 and EncodeUtf8 systematically on various characters
226  // close to powers of 2.
227  TIntV src, expectedDest, src2;
228  expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
229  for (int pow = 8; pow <= 32; pow++)
230  {
// Range of code points to try for this power: 2^pow +/- 256, clipped at 0 and at TUInt::Mx.
231  uint uFrom, uTo;
232  if (pow == 8) uFrom = 0, uTo = 1u << pow;
233  else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
234  else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
235  printf("%u..%u \r", uFrom, uTo);
// The loop is left from the bottom (u == uTo) so that uTo == TUInt::Mx cannot overflow the counter.
236  for (uint u = uFrom; ; u++)
237  {
// Shortest UTF-8 length that can hold u.
238  int nBytes = 0;
239  if (u < (1u << 7)) nBytes = 1;
240  else if (u < (1u << 11)) nBytes = 2;
241  else if (u < (1u << 16)) nBytes = 3;
242  else if (u < (1u << 21)) nBytes = 4;
243  else if (u < (1u << 26)) nBytes = 5;
244  else nBytes = 6;
// Encode u by hand into 'src': a lead byte with nBytes one-bits at the top, then six payload bits per continuation byte.
245  src.Gen(6, nBytes);
246  if (nBytes == 1) src[0] = u;
247  else {
248  src[0] = (((1 << nBytes) - 1) << (8 - nBytes)) | (u >> (6 * (nBytes - 1)));
249  for (int i = 1; i < nBytes; i++) src[i] = 0x80 | ((u >> (6 * (nBytes - i - 1))) & _0011_1111); }
// Only strict mode is expected to reject values above 0x10ffff.
250  bool err = (strict && u > 0x10ffff);
251  expectedDest.Reserve(1, 0);
252  if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
253  else if (! err) expectedDest.Add(u);
254  int erv = (err ? 0 : 1);
// With skipBom set, a byte-order mark at the very start is expected to be dropped from the output.
255  if (skipBom && (u == 0xfeff || u == 0xfffe)) expectedDest.Clr(), erv = 0;
256  TestUtf8(true, erv, (err && errorHandling == uehThrow), src, expectedDest, 0);
257  // We can also test the UTF-8 encoder.
// 'src' (the hand-made encoding) now serves as the expected encoder output; for a rejected
// character it becomes the encoded replacement character or stays empty.
258  src2[0] = u;
259  if (err) {
260  if (errorHandling == uehReplace) src = utf8ReplCh;
261  else src.Clr(false); }
262  TestUtf8(false, (err ? 0 : 1), (err && errorHandling == uehThrow), src2, src, 0);
263  //
264  if (u == uTo) break;
265  }
266  }
267  }
268 }
bool strict
Definition: unicode.h:83
Definition: dt.h:11
static const uint Mx
Definition: dt.h:1154
enum TUnicodeErrorHandling_ TUnicodeErrorHandling
unsigned int uint
Definition: bd.h:11
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
void TestDecodeUtf8(TRnd &rnd, const TStr &testCaseDesc)
Definition: unicode.cpp:133
void TestUtf8()
Definition: unicode.cpp:194
bool skipBom
Definition: unicode.h:89
int replacementChar
Definition: unicode.h:64
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:495
void Reserve(const TSizeTy &_MxVals)
Reserves enough memory for the vector to store _MxVals elements.
Definition: ds.h:515
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
size_t EncodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2152
void TUniCodec::WordsToBytes ( const TIntV & src,
TIntV & dest 
)
protected

Definition at line 274 of file unicode.cpp.

275 {
276  dest.Clr();
277  bool isLE = IsMachineLittleEndian();
278  for (int i = 0; i < src.Len(); i++) {
279  int c = src[i] & 0xffff;
280  if (isLE) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
281  else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } }
282 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
static bool IsMachineLittleEndian()
Definition: unicode.cpp:83
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574

Friends And Related Function Documentation

friend class TUniCaseFolding
friend

Definition at line 120 of file unicode.h.

friend class TUnicode
friend

Definition at line 121 of file unicode.h.

Member Data Documentation

TUnicodeErrorHandling TUniCodec::errorHandling

Definition at line 66 of file unicode.h.

int TUniCodec::replacementChar

Definition at line 64 of file unicode.h.

bool TUniCodec::skipBom

Definition at line 89 of file unicode.h.

bool TUniCodec::strict

Definition at line 83 of file unicode.h.


The documentation for this class was generated from the following files: