#include <unicode.h>

Public Types
enum	{ DefaultReplacementChar = 0xfffd }

Public Member Functions
	TUniCodec ()

	TUniCodec (TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_)

template<typename TSrcVec , typename TDestCh >
size_t	DecodeUtf8 (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const

template<typename TSrcVec , typename TDestCh >
size_t	DecodeUtf8 (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const

template<typename TSrcVec , typename TDestCh >
size_t	EncodeUtf8 (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const

template<typename TSrcVec , typename TDestCh >
size_t	EncodeUtf8 (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const

template<typename TSrcVec >
TStr	EncodeUtf8Str (const TSrcVec &src, size_t srcIdx, const size_t srcCount) const

template<typename TSrcVec >
TStr	EncodeUtf8Str (const TSrcVec &src) const

template<typename TSrcVec , typename TDestCh >
size_t	DecodeUtf16FromBytes (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const

template<typename TSrcVec , typename TDestCh >
size_t	DecodeUtf16FromWords (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const

template<typename TSrcVec , typename TDestCh >
size_t	EncodeUtf16ToWords (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const

template<typename TSrcVec , typename TDestCh >
size_t	EncodeUtf16ToBytes (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const

void	TestUtf8 ()

void	TestUtf16 ()

Public Attributes
int	replacementChar

TUnicodeErrorHandling	errorHandling

bool	strict

bool	skipBom

Protected Types
enum	{ _1000_0000 = 0x80, _1100_0000 = 0xc0, _1110_0000 = 0xe0, _1111_0000 = 0xf0, _1111_1000 = 0xf8, _1111_1100 = 0xfc, _1111_1110 = 0xfe, _0011_1111 = 0x3f, _0001_1111 = 0x1f, _0000_1111 = 0x0f, _0000_0111 = 0x07, _0000_0011 = 0x03 }

enum	{ Utf16FirstSurrogate = 0xd800, Utf16SecondSurrogate = 0xdc00 }

typedef TUniVecIdx	TVecIdx

Protected Member Functions
void	TestUtf8 (bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV &src, const TIntV &expectedDest, FILE *f)

void	TestDecodeUtf8 (TRnd &rnd, const TStr &testCaseDesc)

void	WordsToBytes (const TIntV &src, TIntV &dest)

void	TestUtf16 (bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV &src, const TIntV &expectedDest, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom, FILE *f)

void	TestDecodeUtf16 (TRnd &rnd, const TStr &testCaseDesc, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom)

Static Protected Member Functions
static bool	IsMachineLittleEndian ()

static uint	GetRndUint (TRnd &rnd)

static uint	GetRndUint (TRnd &rnd, uint minVal, uint maxVal)

static int	SwapBytes (int x)

Friends
class	TUniCaseFolding

class	TUnicode

Detailed Description

Definition at line 54 of file unicode.h.

Member Typedef Documentation

typedef TUniVecIdx TUniCodec::TVecIdx

protected

Definition at line 118 of file unicode.h.

Member Enumeration Documentation

anonymous enum

Enumerator
DefaultReplacementChar

Definition at line 59 of file unicode.h.

59 { DefaultReplacementChar = 0xfffd };

TUniCodec::DefaultReplacementChar

Definition: unicode.h:59

anonymous enum

protected

Enumerator
_1000_0000
_1100_0000
_1110_0000
_1111_0000
_1111_1000
_1111_1100
_1111_1110
_0011_1111
_0001_1111
_0000_1111
_0000_0111
_0000_0011

Definition at line 101 of file unicode.h.

              {
                 // Named bit-pattern constants.  DefineByte(b7, ..., b0) declares an enumerator called
                 // _<b7><b6><b5><b4>_<b3><b2><b1><b0> whose value is the byte made of those eight bits;
                 // for example DefineByte(1, 1, 0, 0, 0, 0, 0, 0) declares _1100_0000 = 0xc0.
                 // The macro exists only for the duration of this enum and is #undef'd right after it.
 #define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
                 // Patterns with 1..7 leading one-bits (0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe).
                 // DecodeUtf8 uses them both as masks and as expected prefixes when classifying bytes.
                 DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
                 DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
                 DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
                 DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
                 DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
                 DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
                 DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
                 // Low-bit masks (0x3f, 0x1f, 0x0f, 0x07, 0x03).  DecodeUtf8 uses _0011_1111 to extract
                 // the six payload bits of a 10xxxxxx byte; the remaining masks are presumably used by
                 // code that is not shown here -- TODO confirm.
                 DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
                 DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
                 DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
                 DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
                 DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
 #undef DefineByte
         };

anonymous enum

protected

Enumerator
Utf16FirstSurrogate
Utf16SecondSurrogate

Definition at line 157 of file unicode.h.

              {
                 // Lowest value of the first-surrogate range.  The UTF-16 decoders treat the 1024 values
                 // Utf16FirstSurrogate .. Utf16FirstSurrogate + 1023 as the first half of a surrogate pair.
                 Utf16FirstSurrogate = 0xd800,
                 // Lowest value of the second-surrogate range; the decoders require the second half of a
                 // pair to lie in Utf16SecondSurrogate .. Utf16SecondSurrogate + 1023.
                 Utf16SecondSurrogate = 0xdc00
         };

Constructor & Destructor Documentation

TUniCodec::TUniCodec ( )

inline

Definition at line 91 of file unicode.h.

                     : replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true)
         {
                 // Default configuration: errors are ignored (uehIgnore), strict checking is off,
                 // a leading byte-order mark is skipped, and the replacement character is
                 // DefaultReplacementChar (0xfffd).  Nothing else needs to be set up.
         }

TUniCodec::TUniCodec	(	TUnicodeErrorHandling	errorHandling_,
		bool	strict_,
		int	replacementChar_,
		bool	skipBom_
	)

inline

Definition at line 95 of file unicode.h.

                                                                                                            :
                 replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
         {
                 // The four settings are stored exactly as given; no validation is performed here.
         }

Member Function Documentation

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::DecodeUtf16FromBytes	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest,
		const TUtf16BomHandling	bomHandling = `bomAllowed`,
		const TUniByteOrder	defaultByteOrder = `boMachineEndian`
	)		const

Definition at line 2210 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), boBigEndian, boLittleEndian, boMachineEndian, bomAllowed, bomIgnored, bomRequired, TVec< TVal, TSizeTy >::Clr(), Fail, TInt::GetStr(), IAssert, IsMachineLittleEndian(), uehAbort, uehIgnore, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.

Referenced by TUnicode::DecodeUtf16FromBytes(), and TestUtf16().

 {
         IAssert(srcCount % 2 == 0);
         IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
         IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
         if (clrDest) dest.Clr();
         size_t nDecoded = 0;
         if (srcCount <= 0) return nDecoded;
         const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
         bool littleEndian = false;
         bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian()));
         if (bomHandling == bomIgnored) littleEndian = leDefault;
         else if (bomHandling == bomAllowed || bomHandling == bomRequired)
         {
                 int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff;
                 if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; }
                 else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; }
                 else if (bomHandling == bomAllowed) littleEndian = leDefault;
                 else { // Report an error.
                         switch (errorHandling) {
                         case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead).");
                         case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
                         default: Fail; } }
         }
         else Fail;
         while (srcIdx < srcEnd)
         {
                 const size_t charSrcIdx = srcIdx;
                 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
                 uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
                 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
                 {
                         // c is the first character in a surrogate pair.  Read the next character.
                         if (! (srcIdx + 2 <= srcEnd)) {
                                 switch (errorHandling) {
                                 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
                                 case uehAbort: return nDecoded;
                                 case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                                 case uehIgnore: continue;
                                 default: Fail; } }
                         uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
                         uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
                         // c2 should be the second character of the surrogate pair.
                         if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
                                 switch (errorHandling) {
                                 case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
                                 case uehAbort: return nDecoded;
                                 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
                                 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue;
                                 case uehIgnore: srcIdx -= 2; continue;
                                 default: Fail; } }
                         // c and c2 each contain 10 bits of information.
                         uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
                         cc += 0x10000;
                         dest.Add(TDestCh(cc)); nDecoded++; continue;
                 }
                 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
                         switch (errorHandling) {
                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
                         case uehAbort: return nDecoded;
                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                         case uehIgnore: continue;
                         default: Fail; } }
                 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
                 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
                 // Otherwise, store 'c' to the destination vector.
                 dest.Add(TDestCh(c)); nDecoded++;
         }
         return nDecoded;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::DecodeUtf16FromWords	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		bool	clrDest,
		const TUtf16BomHandling	bomHandling = `bomAllowed`,
		const TUniByteOrder	defaultByteOrder = `boMachineEndian`
	)		const

Definition at line 2294 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), boBigEndian, boLittleEndian, boMachineEndian, bomAllowed, bomIgnored, bomRequired, TVec< TVal, TSizeTy >::Clr(), Fail, TInt::GetStr(), IAssert, IsMachineLittleEndian(), uehAbort, uehIgnore, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.

Referenced by TUnicode::DecodeUtf16FromWords(), and TestUtf16().

 {
         IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
         IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
         if (clrDest) dest.Clr();
         size_t nDecoded = 0;
         if (srcCount <= 0) return nDecoded;
         const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
         bool swap = false;
         bool isMachineLe = IsMachineLittleEndian();
         bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
         if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe);
         else if (bomHandling == bomAllowed || bomHandling == bomRequired)
         {
                 int c = uint(src[TVecIdx(srcIdx)]) & 0xffff;
                 if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; }
                 else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; }
                 else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe);
                 else { // Report an error.
                         switch (errorHandling) {
                         case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead).");
                         case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
                         default: Fail; } }
         }
         else Fail;
         while (srcIdx < srcEnd)
         {
                 const size_t charSrcIdx = srcIdx;
                 uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
                 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
                 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
                 {
                         // c is the first character in a surrogate pair.  Read the next character.
                         if (! (srcIdx < srcEnd)) {
                                 switch (errorHandling) {
                                 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
                                 case uehAbort: return nDecoded;
                                 case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                                 case uehIgnore: continue;
                                 default: Fail; } }
                         uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
                         if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8);
                         // c2 should be the second character of the surrogate pair.
                         if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
                                 switch (errorHandling) {
                                 case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
                                 case uehAbort: return nDecoded;
                                 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
                                 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue;
                                 case uehIgnore: srcIdx -= 1; continue;
                                 default: Fail; } }
                         // c and c2 each contain 10 bits of information.
                         uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
                         cc += 0x10000;
                         dest.Add(TDestCh(cc)); nDecoded++; continue;
                 }
                 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
                         switch (errorHandling) {
                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
                         case uehAbort: return nDecoded;
                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                         case uehIgnore: continue;
                         default: Fail; } }
                 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
                 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
                 // Otherwise, store 'c' to the destination vector.
                 dest.Add(TDestCh(c)); nDecoded++;
         }
         return nDecoded;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::DecodeUtf8	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`
	)		const

Definition at line 2036 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), Fail, TInt::GetStr(), uehAbort, uehIgnore, uehReplace, and uehThrow.

Referenced by TUnicode::DecodeUtf8(), TUniChDb::SbEx_AddUtf8(), and TestUtf8().

 {
         // Decodes the UTF-8 byte sequence src[srcIdx .. srcIdx + srcCount - 1] (only the low 8 bits of
         // each element are used) and appends the resulting codepoints to 'dest', which is cleared
         // first if clrDest is true.
         //
         // Returns the number of codepoints successfully decoded; replacement characters and a skipped
         // byte-order mark are not counted.  On malformed input the behaviour follows 'errorHandling':
         //   uehThrow   - throw TUnicodeException;
         //   uehAbort   - stop and return the count decoded so far;
         //   uehReplace - append 'replacementChar' and resume decoding;
         //   uehIgnore  - silently skip the offending bytes and resume decoding.
         // When 'strict' is true, over-long encodings, sequences longer than 4 bytes, codepoints above
         // 0x10ffff and lead bytes of the form 1111111x are treated as errors as well.
         size_t nDecoded = 0;
         if (clrDest) dest.Clr();
         const size_t origSrcIdx = srcIdx;
         const size_t srcEnd = srcIdx + srcCount;
         while (srcIdx < srcEnd)
         {
                 // charSrcIdx remembers where the current character starts (used for error reports and BOM detection).
                 const size_t charSrcIdx = srcIdx;
                 uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
                 if ((c & _1000_0000) == 0) {
                         // c is one of the characters 0..0x7f, encoded as a single byte.
                         dest.Add(TDestCh(c)); nDecoded++; continue; }
                 else if ((c & _1100_0000) == _1000_0000) {
                         // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx.
                         // We must have been thrown into the middle of a multi-byte character.
                         switch (errorHandling) {
                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx.");
                         case uehAbort: return nDecoded;
                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                         case uehIgnore: continue;
                         default: Fail; } }
                 else
                 {
                         // c introduces a sequence of 2..6 bytes, depending on how many
                         // of the most significant bits of c are set.
                         // nMoreBytes = number of 10xxxxxx bytes that must follow; nBits = number of payload bits
                         // in the lead byte; minVal = smallest codepoint that legitimately needs this many bytes.
                         uint nMoreBytes = 0, nBits = 0, minVal = 0;
                         if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80;
                         else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800;
                         else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000;
                         else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000;
                         else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000;
                         else {
                                 // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8
                                 // (which allowed the encoding of codepoints up to 2^31 - 1).  However, in principle this
                                 // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh
                                 // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh.
                                 if (strict)  {
                                         switch (errorHandling) {
                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x.");
                                         case uehAbort: return nDecoded;
                                         // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes
                                         // and try to decode the character.  Then, since 'strict' is true and
                                         // the codepoint is clearly >= 2^31, we'll notice this as an error later
                                         // and (in the case of uehReplace) insert a replacement character then.
                                         // This is probably better than inserting a replacement character right
                                         // away and then trying to read the next byte as if a new character
                                         // was beginning there -- if the current byte is really followed by five
                                         // 10xxxxxx bytes, we'll just get six replacement characters in a row.
                                         case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue;
                                         case uehIgnore: break; // continue;
                                         default: Fail; } }
                                 // nBits = 2 keeps the low two bits "1a" of the lead byte, so the decoded 32-bit value has its msb set.
                                 nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; }
                         // Decode this multi-byte sequence.
                         uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c.
                         // 'cancel' is set when the sequence turns out to be malformed and has to be abandoned;
                         // the 'continue' statements inside the switches below then end this for-loop via its condition.
                         bool cancel = false;
                         for (uint i = 0; i < nMoreBytes && ! cancel; i++) {
                                 // See if there are enough bytes left in the source vector.
                                 if (! (srcIdx < srcEnd)) {
                                         switch (errorHandling) {
                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available.");
                                         case uehAbort: return nDecoded;
                                         case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue;
                                         case uehIgnore: cancel = true; continue;
                                         default: Fail; } }
                                 // Read the next byte.
                                 c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
                                 if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx.
                                         switch (errorHandling) {
                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx.");
                                         case uehAbort: return nDecoded;
                                         // The offending byte is pushed back (srcIdx--) so that the outer loop re-reads it
                                         // as the possible start of a new character.
                                         case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue;
                                         case uehIgnore: srcIdx--; cancel = true; continue;
                                         default: Fail; } }
                                 // Append the six payload bits of this continuation byte.
                                 cOut <<= 6; cOut |= (c & _0011_1111); }
                         if (cancel) continue;
                         if (strict) {
                                 // err1: This codepoint has been represented by more bytes than it should have been.
                                 // For example, cOut in the range 0..127 should be represented by a single byte,
                                 // not by two or more bytes.
                                 // - For example, this may happen in the "modified UTF-8" sometimes used for Java
                                 // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid
                                 // the appearance of null bytes in the encoded stream.
                                 bool err1 = (cOut < minVal);
                                 // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes.
                                 // However, later this was restricted to the codepoints 0..0x10ffff only, because only these
                                 // are valid Unicode codepoints.  Thus, no more than 4 bytes are ever necessary.
                                 bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff));
                                 if (err1 || err2) switch (errorHandling) {
                                         case uehThrow:
                                                 if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ").");
                                                 else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid.");
                                                 else { Fail; break; }
                                         case uehAbort: return nDecoded;
                                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                                         case uehIgnore: continue;
                                         default: Fail; } }
                         // Add the decoded codepoint to the destination vector.
                         // If this is the first decoded character, and it's one of the byte-order marks
                         // (0xfffe and 0xfeff), we will skip it (unless skipBom is false).
                         if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) {
                                 dest.Add(cOut); nDecoded++; }
                 } // else (multi-byte sequence)
         } // while
         return nDecoded;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::DecodeUtf8	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`
	)		const

inline

Definition at line 136 of file unicode.h.

References DecodeUtf8().

Referenced by DecodeUtf8().

136 { return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }

TUniCodec::DecodeUtf8

size_t DecodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const

Definition: unicode.h:2036

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::EncodeUtf16ToBytes	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest,
		const bool	insertBom,
		const TUniByteOrder	destByteOrder = `boMachineEndian`
	)		const

Definition at line 2428 of file unicode.h.

References ___OutRepl, TVec< TVal, TSizeTy >::Add(), boLittleEndian, boMachineEndian, Fail, TUInt::GetStr(), IAssert, IsMachineLittleEndian(), uehAbort, uehIgnore, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.

Referenced by TUnicode::EncodeUtf16ToBytes(), and TestUtf16().

 {
         bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian()));
         size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
         if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 0xfe : 0xff); nEncoded++; }
         while (srcIdx < srcEnd)
         {
                 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
                 if (! (c <= 0x10ffffu)) {
                         switch (errorHandling) {
                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
                         case uehAbort: return nEncoded;
 #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
                         case uehReplace: ___OutRepl; continue;
                         case uehIgnore: continue;
                         default: Fail; } }
                 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
                         switch (errorHandling) {
                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
                         case uehAbort: return nEncoded;
                         case uehReplace: ___OutRepl; continue;
                         case uehIgnore: continue;
                         default: Fail; } }
                 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
                         switch (errorHandling) {
                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
                         case uehAbort: return nEncoded;
                         case uehReplace: ___OutRepl; continue;
                         case uehIgnore: continue;
                         default: Fail; } }
 #undef ___OutRepl
                 // If c is <= 0xffff, it can be stored directly.
                 if (c <= 0xffffu) {
                         if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
                         else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); }
                         nEncoded++; continue; }
                 // Otherwise, represent c by a pair of surrogate characters.
                 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
                 uint c1 = (c >> 10) & 1023, c2 = c & 1023;
                 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
                 if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); }
                 else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); }
                 nEncoded++; continue;
         }
         return nEncoded;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::EncodeUtf16ToWords	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest,
		const bool	insertBom,
		const TUniByteOrder	destByteOrder = `boMachineEndian`
	)		const

Definition at line 2376 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), boBigEndian, boLittleEndian, Fail, TUInt::GetStr(), IAssert, IsMachineLittleEndian(), SwapBytes(), uehAbort, uehIgnore, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.

Referenced by TUnicode::EncodeUtf16ToWords(), and TestUtf16().

 {
         bool isMachineLe = IsMachineLittleEndian();
         bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe);
         size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
         if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; }
         while (srcIdx < srcEnd)
         {
                 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
                 if (! (c <= 0x10ffffu)) {
                         switch (errorHandling) {
                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
                         case uehAbort: return nEncoded;
                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
                         case uehIgnore: continue;
                         default: Fail; } }
                 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
                         switch (errorHandling) {
                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
                         case uehAbort: return nEncoded;
                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
                         case uehIgnore: continue;
                         default: Fail; } }
                 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
                         switch (errorHandling) {
                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
                         case uehAbort: return nEncoded;
                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
                         case uehIgnore: continue;
                         default: Fail; } }
                 // If c is <= 0xffff, it can be stored directly.
                 if (c <= 0xffffu) {
                         if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
                         dest.Add(TDestCh(c)); nEncoded++; continue; }
                 // Otherwise, represent c by a pair of surrogate characters.
                 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
                 uint c1 = (c >> 10) & 1023, c2 = c & 1023;
                 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
                 if (swap) {
                         c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8);
                         c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); }
                 dest.Add(TDestCh(c1));
                 dest.Add(TDestCh(c2));
                 nEncoded++; continue;
         }
         return nEncoded;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::EncodeUtf8	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`
	)		const

Definition at line 2152 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), Fail, TInt::GetStr(), replacementChar, uehAbort, uehIgnore, uehReplace, and uehThrow.

Referenced by TUnicode::EncodeUtf8(), EncodeUtf8Str(), and TestUtf8().

 {
         size_t nEncoded = 0;
         for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
         {
                 uint c = uint(src[TVecIdx(srcIdx)]);
                 bool err = false;
                 if (strict && c > 0x10ffff) {
                         err = true;
                         switch (errorHandling) {
                         case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed).");
                         case uehAbort: return nEncoded;
                         case uehReplace: c = replacementChar; break;
                         case uehIgnore: continue;
                         default: Fail; } }
                 if (c < 0x80u)
                         dest.Add(TDestCh(c & 0xffu));
                 else if (c < 0x800u) {
                         dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));
                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                 else if (c < 0x10000u) {
                         dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));
                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                 else if (c < 0x200000u) {
                         dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));
                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                 else if (c < 0x4000000u) {
                         dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));
                         dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                 else {
                         dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));
                         dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));
                         dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                 if (! err) nEncoded++;
         }
         return nEncoded;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::EncodeUtf8	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`
	)		const

inline

Definition at line 145 of file unicode.h.

References EncodeUtf8().

Referenced by EncodeUtf8().

145 { // Convenience overload: encode all of 'src', i.e. the range starting at index 0 and src.Len() elements long.
    const size_t wholeLen = src.Len();
    return EncodeUtf8(src, 0, wholeLen, dest, clrDest); }

TUniCodec::EncodeUtf8

size_t EncodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const

Definition: unicode.h:2152

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

TStr TUniCodec::EncodeUtf8Str	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount
	)		const

inline

Definition at line 149 of file unicode.h.

References EncodeUtf8().

Referenced by TUnicode::EncodeUtf8Str().

149 { // Encodes src[srcIdx .. srcIdx + srcCount - 1] as UTF-8 and returns the bytes as a TStr.
    // The buffer is handed to TStr as a plain char pointer, so it must be terminated with a 0 byte
    // first (as the single-argument overload does); this also keeps temp[0] valid when nothing
    // was encoded.
    TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }

TStr

Definition: dt.h:412

TUniCodec::EncodeUtf8

size_t EncodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const

Definition: unicode.h:2152

TVec

Vector is a sequence of TVal objects representing an array that can change in size.

Definition: ds.h:430

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

TStr TUniCodec::EncodeUtf8Str ( const TSrcVec & src ) const

inline

Definition at line 150 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), and EncodeUtf8().

150 { // Encodes all of 'src' as UTF-8 into a scratch byte vector, appends a terminating 0 byte
    // and builds the returned TStr from the resulting char buffer.
    TVec<char> utf8Buf;
    EncodeUtf8(src, utf8Buf);
    utf8Buf.Add(0);
    return TStr(&(utf8Buf[0])); }

TStr

Definition: dt.h:412

TVec::Add

TSizeTy Add()

Adds a new element at the end of the vector, after its current last element.

Definition: ds.h:602

TUniCodec::EncodeUtf8

size_t EncodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const

Definition: unicode.h:2152

TVec

Vector is a sequence of TVal objects representing an array that can change in size.

Definition: ds.h:430

Here is the call graph for this function:

uint TUniCodec::GetRndUint ( TRnd & rnd )

staticprotected

Definition at line 62 of file unicode.cpp.

References TRnd::GetUniDevUInt().

Referenced by GetRndUint(), TestDecodeUtf16(), and TestDecodeUtf8().

 {
         uint u = rnd.GetUniDevUInt(256) & 0xff;
         u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
         u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
         u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
         return u;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

uint TUniCodec::GetRndUint	(	TRnd &	rnd,
		uint	minVal,
		uint	maxVal
	)

staticprotected

Definition at line 71 of file unicode.cpp.

References GetRndUint(), TUInt::Mn, and TUInt::Mx.

 {
         if (minVal == TUInt::Mn && maxVal == TUInt::Mx) return GetRndUint(rnd);
         uint range = maxVal - minVal + 1;
         if (range > (uint(1) << (8 * sizeof(uint) - 1)))
                 while (true) { uint u = GetRndUint(rnd); if (u < range) return minVal + u; }
         uint mask = 1;
         while (mask < range) mask <<= 1;
         mask -= 1;
         while (true) { uint u = GetRndUint(rnd) & mask; if (u < range) return minVal + u; }
 }

Here is the call graph for this function:

bool TUniCodec::IsMachineLittleEndian ( )

staticprotected

Definition at line 83 of file unicode.cpp.

Referenced by DecodeUtf16FromBytes(), DecodeUtf16FromWords(), EncodeUtf16ToBytes(), EncodeUtf16ToWords(), TestDecodeUtf16(), TestUtf16(), and WordsToBytes().

 {
         static bool isLE, initialized = false;
         if (initialized) return isLE;
         int i = 1;
         if(*(char *)&i == 1) isLE = true;
         else isLE = false;
 
         initialized = true;
         return isLE;
 }

Here is the caller graph for this function:

static int TUniCodec::SwapBytes ( int x )

inlinestaticprotected

Definition at line 250 of file unicode.h.

Referenced by EncodeUtf16ToWords(), TestDecodeUtf16(), and TestUtf16().

250 { // Exchanges the two low-order bytes of x; any bits above bit 15 are dropped from the result.

251 const int formerHigh = (x >> 8) & 0xff; const int formerLow = x & 0xff; return formerHigh | (formerLow << 8); }

Here is the caller graph for this function:

void TUniCodec::TestDecodeUtf16	(	TRnd &	rnd,
		const TStr &	testCaseDesc,
		const TUtf16BomHandling	bomHandling,
		const TUniByteOrder	defaultByteOrder,
		const bool	insertBom
	)

protected

Definition at line 341 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Add(), boLittleEndian, boMachineEndian, bomRequired, TStr::CStr(), errorHandling, Fail, GetRndUint(), IAssert, IsMachineLittleEndian(), TStr::Len(), TVec< TVal, TSizeTy >::Len(), replacementChar, skipBom, strict, SwapBytes(), TestUtf16(), uehAbort, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.

Referenced by TestUtf16().

 {
         // Builds one randomized UTF-16 decoding test case from 'testCaseDesc' and passes it to TestUtf16.
         // - 'src' collects the UTF-16 words to decode (byte-swapped if the default byte order differs from the machine's);
         // - 'expectedDest' collects the codepoints the decoder is expected to output;
         // - 'expectedRetVal' is the return value the decoder is expected to produce.
         TIntV src; TIntV expectedDest; int expectedRetVal = 0;
         bool expectedAbort = false;
         FILE *f = 0; // debug output stream; it stays null in this function, so nothing is printed
         bool isMachineLe = IsMachineLittleEndian();
         bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
         bool swap = (isMachineLe != isDefaultLe);
         if (insertBom) {
                 // Start the input with a BOM; it is expected in the output only if the decoder does not skip it.
                 src.Add(swap ? 0xfffe : 0xfeff);
                 if (! skipBom) { expectedRetVal += 1; expectedDest.Add(0xfeff); } }
         else if (bomHandling == bomRequired) {
                 // A BOM is required but none is inserted: nothing is expected to be decoded.
                 // Note that the -1 is converted to size_t when it is passed to TestUtf16.
                 expectedAbort = true; expectedRetVal = -1; }
         // testCaseDesc should consist of single characters or pairs of characters, 'c[e]', where:
         // - 'c' defines the range from which the codepoint should be taken ('A'..'E', 'X'..'Y');
         // - 'e' defines how many words will be removed from the end of the encoded sequence for this codepoint.
         //   (absent = 0, 'a' = 1).
         for (int i = 0; i < testCaseDesc.Len(); )
         {
                 const char c = testCaseDesc[i++];
                 uint cp = 0; int nWords = -1;
                 if (c == 'X' || c == 'Y') IAssert(i > 1); // if you want a BOM at the beginning of your data, use insertBom -- if we permit X and Y here, predicting the expectedDest and expectedRetVal gets more complicated
                 if (c == 'A') { cp = GetRndUint(rnd, 0u, Utf16FirstSurrogate - 1); nWords = 1; } // characters below the first surrogate range
                 else if (c == 'B') { cp = GetRndUint(rnd, Utf16FirstSurrogate, Utf16FirstSurrogate + 1023); nWords = 1; } // the first surrogate range
                 else if (c == 'C') { cp = GetRndUint(rnd, Utf16SecondSurrogate, Utf16SecondSurrogate + 1023); nWords = 1; } // the second surrogate range
                 else if (c == 'D') { do { cp = GetRndUint(rnd, Utf16SecondSurrogate + 1024, 0xffffu); } while (cp == 0xfffe || cp == 0xfeff); nWords = 1; } // above the second surrogate range, but still in the BMP
                 else if (c == 'E') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); nWords = 2; } // above the BMP, but still within the range for UTF-16
                 else if (c == 'X') { cp = 0xfffe; nWords = 1; }
                 else if (c == 'Y') { cp = 0xfeff; nWords = 1; }
                 else Fail;
                 // A lone first surrogate ('B') must not be followed by a second surrogate ('C'),
                 // since the two together would form a valid pair.
                 if (c == 'B' && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
                 // Process 'e'.
                 int nToDel = 0;
                 if (i < testCaseDesc.Len()) {
                         const char e = testCaseDesc[i];
                         if (e >= 'a') { i += 1; nToDel = 1; }}
                 IAssert((nWords == 1 && nToDel == 0) || (nWords == 2 && (nToDel == 0 || nToDel == 1)));
                 // Likewise, a truncated pair (only its first surrogate remains) must not be followed by 'C'.
                 if (nWords == 2 && nToDel == 1 && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
                 // Will an error occur during the decoding of this codepoint?
                 bool errHere = false;
                 if (Utf16FirstSurrogate <= cp && cp <= Utf16FirstSurrogate + 1023) errHere = true;
                 else if (cp > 0x10ffff) { Fail; errHere = true; }
                 else if (nToDel > 0) errHere = true;
                 else if (strict && (Utf16SecondSurrogate <= cp && cp <= Utf16SecondSurrogate + 1023)) errHere = true;
                 // Update 'expectedDest' and 'expectedRetVal'.
                 if (! expectedAbort) {
                         if (! errHere) {
                                 // A BOM-like codepoint at the very start of the input produces no output when skipBom is set.
                                 if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
                                 else { expectedDest.Add(cp); expectedRetVal += 1; } }
                         else if (errorHandling == uehReplace) {
                                 expectedDest.Add(replacementChar); }
                         // With uehAbort or uehThrow nothing after the first error is expected in the output.
                         if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
                 // Update 'src'.
                 if (nWords == 1) src.Add(swap ? SwapBytes(cp) : cp);
                 else {
                         // Split the codepoint into a surrogate pair; the second word is omitted if 'e' asked for it.
                         int c1 = ((cp - 0x10000) >> 10) & 1023; c1 += Utf16FirstSurrogate;
                         int c2 = (cp - 0x10000) & 1023; c2 += Utf16SecondSurrogate;
                         src.Add(swap ? SwapBytes(c1) : c1);
                         if (nToDel == 0) src.Add(swap ? SwapBytes(c2) : c2); }
         }
         if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
         // An exception is expected only if an error was predicted and the error mode is uehThrow.
         TestUtf16(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, bomHandling, defaultByteOrder, false, f);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniCodec::TestDecodeUtf8	(	TRnd &	rnd,
		const TStr &	testCaseDesc
	)

protected

Definition at line 133 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Add(), TStr::CStr(), errorHandling, Fail, GetRndUint(), IAssert, TStr::Len(), TVec< TVal, TSizeTy >::Len(), replacementChar, skipBom, strict, TestUtf8(), uehAbort, uehReplace, and uehThrow.

Referenced by TestUtf8().

 {
         // Builds one randomized UTF-8 decoding test case from 'testCaseDesc' and passes it to TestUtf8.
         // - 'src' collects the bytes to decode;
         // - 'expectedDest' collects the codepoints the decoder is expected to output;
         // - 'expectedRetVal' is the return value the decoder is expected to produce.
         TIntV src; TIntV expectedDest; int expectedRetVal = 0;
         bool expectedAbort = false;
         FILE *f = 0; // stderr
         // testCaseDesc should consist of pairs or triples of characters, 'cd[e]', where:
         // - 'c' defines the range from which the codepoint should be taken ('A'..'H', 'X'..'Z');
         // - 'd' defines how many bytes the codepoint should be encoded with ('1'..'6');
         // - 'e' defines how many bytes will be removed from the end of the encoded sequence for this codepoint.
         //   (absent = 0, 'a' = 1, 'b' = 2 and so on).
         for (int i = 0; i < testCaseDesc.Len(); )
         {
                 IAssert(i + 2 <= testCaseDesc.Len());
                 const char c = testCaseDesc[i], d = testCaseDesc[i + 1]; i += 2;
                 // 'minBytes' is the shortest encoding length for the chosen range; asking for more bytes yields an overlong sequence.
                 uint cp = 0; int nBytes = -1, minBytes = -1; bool eighties = false;
                 IAssert('1' <= d && d <= '6'); nBytes = d - '0';
                 if (c == 'A') { cp = GetRndUint(rnd, 0u, 0x7fu); minBytes = 1; } // 1 byte
                 else if (c == 'B') { cp = GetRndUint(rnd, 0x80u, 0x7ffu); minBytes = 2; } // 2 bytes
                 else if (c == 'C') { cp = GetRndUint(rnd, 0x800u, 0xffffu); minBytes = 3; } // 3 bytes
                 else if (c == 'D') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); minBytes = 4; } // 4 bytes, valid Unicode
                 else if (c == 'E') { cp = GetRndUint(rnd, 0x110000u, 0x1fffffu); minBytes = 4; } // 4 bytes, invalid Unicode
                 else if (c == 'F') { cp = GetRndUint(rnd, 0x200000u, 0x3ffffffu); minBytes = 5; } // 5 bytes
                 else if (c == 'G') { cp = GetRndUint(rnd, 0x4000000u, 0x7fffffffu); minBytes = 6; } // 6 bytes, 31 bits
                 else if (c == 'H') { cp = GetRndUint(rnd, 0x80000000u, 0xffffffffu); minBytes = 6; } // 6 bytes, 32 bits
                 else if (c == 'X') { cp = 0xfffe; minBytes = 3; }
                 else if (c == 'Y') { cp = 0xfeff; minBytes = 3; }
                 else if (c == 'Z') { eighties = true; minBytes = 1; } // insert several random 10xxxxxx bytes (= 0x80 | random(0..0x3f))
                 else Fail;
                 IAssert(nBytes >= minBytes);
                 // Process 'e'.
                 int nToDel = 0;
                 if (i < testCaseDesc.Len()) {
                         const char e = testCaseDesc[i];
                         if (e >= 'a' && e <= 'e') { i += 1; nToDel = e - 'a' + 1; }}
                 IAssert(nToDel < nBytes);
                 // Will an error occur during the decoding of this codepoint?
                 bool errHere = false;
                 if (eighties) errHere = true;
                 else if (nToDel > 0) errHere = true;
                 // NOTE(review): 'cp >= 0x10ffff' also flags 0x10ffff itself, which range 'D' above labels
                 // as valid Unicode -- confirm whether '>' was intended.
                 else if (strict && (cp >= 0x10ffff || nBytes > minBytes)) errHere = true;
                 // Update 'expectedDest' and 'expectedRetVal'.
                 if (! expectedAbort) {
                         if (! errHere) {
                                 // A BOM-like codepoint at the very start of the input produces no output when skipBom is set.
                                 if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
                                 else { expectedDest.Add(cp); expectedRetVal += 1; } }
                         else if (errorHandling == uehReplace) {
                                 // Each stray byte of a 'Z' run is expected to be replaced individually.
                                 if (eighties) for (int j = 0; j < nBytes; j++) expectedDest.Add(replacementChar);
                                 else expectedDest.Add(replacementChar); }
                         // With uehAbort or uehThrow nothing after the first error is expected in the output.
                         if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
                 // Update 'src'.
                 // NOTE(review): the 'Z' bytes are drawn from 0x80..0xff, which is wider than the
                 // 0x80 | random(0..0x3f) range described at the 'Z' branch above -- confirm which is intended.
                 if (eighties) for (int j = 0; j < nBytes; j++) src.Add(GetRndUint(rnd, 0x80, 0xff));
                 else if (nBytes == 1) src.Add(cp);
                 else {
                         // Lead byte: 'nBytes' one-bits at the top followed by the uppermost bits of cp;
                         // then the continuation bytes (6 bits each), minus the last 'nToDel' of them.
                         int mask = (1 << nBytes) - 1; mask <<= (8 - nBytes);
                         src.Add(mask | (uint(cp) >> (6 * (nBytes - 1))));
                         for (int j = 1; j < nBytes - nToDel; j++) src.Add(0x80 | ((cp >> (6 * (nBytes - j - 1))) & _0011_1111)); }
         }
         if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
         // An exception is expected only if an error was predicted and the error mode is uehThrow.
         TestUtf8(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, f);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniCodec::TestUtf16	(	bool	decode,
		size_t	expectedRetVal,
		bool	expectedThrow,
		const TIntV &	src,
		const TIntV &	expectedDest,
		const TUtf16BomHandling	bomHandling,
		const TUniByteOrder	defaultByteOrder,
		const bool	insertBom,
		FILE *	f
	)

protected

Definition at line 284 of file unicode.cpp.

References boBigEndian, boLittleEndian, bomAllowed, bomRequired, TVec< TVal, TSizeTy >::Clr(), TStr::CStr(), DecodeUtf16FromBytes(), DecodeUtf16FromWords(), EncodeUtf16ToBytes(), EncodeUtf16ToWords(), errorHandling, IAssert, TVec< TVal, TSizeTy >::Len(), TUnicodeException::message, replacementChar, skipBom, TUnicodeException::srcChar, TUnicodeException::srcIdx, strict, uehAbort, uehIgnore, uehReplace, uehThrow, and WordsToBytes().

 {
         TIntV srcBytes, expectedDestBytes;
         WordsToBytes(src, srcBytes); WordsToBytes(expectedDest, expectedDestBytes);
         TIntV dest;
         if (f) {
                 fprintf(f, "Settings: %s  %s  %s  %s  %s replacementChar = %x  \n",
                         (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
                         (strict ? "STRICT" : ""), (decode ? (skipBom ? "skipBom" : "") : (insertBom ? "insrtBom" : "")),
                         (bomHandling == bomAllowed ? "bomAllowed" : bomHandling == bomRequired ? "bomRequired" : "bomIgnored"),
                         (defaultByteOrder == boBigEndian ? "boBigEndian" : defaultByteOrder == boLittleEndian ? "boLittleEndian" : "boMachineEndian"),
                         uint(replacementChar));
                 fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %04x" : " %x"), uint(src[i])); }
         for (int useBytes = 0; useBytes < 2; useBytes++)
         {
                 const char *fmt = (useBytes ? " %02x" : " %04x");
                 try
                 {
                         dest.Clr();
                         size_t retVal;
                         if (! useBytes) {
                                 if (decode) retVal = DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder);
                                 else retVal = EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
                         else {
                                 if (decode) retVal = DecodeUtf16FromBytes(srcBytes, 0, srcBytes.Len(), dest, true, bomHandling, defaultByteOrder);
                                 else retVal = EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
                         const TIntV& ed = (useBytes && ! decode ? expectedDestBytes : expectedDest);
                         if (f) {
                                 fprintf(f, "\n -> dest:    "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" :  fmt), uint(dest[i]));
                                 fprintf(f, "\n    expDest  "); for (int i = 0; i < ed.Len(); i++) fprintf(f, (decode ? " %x" :  fmt), uint(ed[i]));
                                 fprintf(f, "\n    retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
                         bool ok = true;
                         if (retVal != expectedRetVal) ok = false;
                         if (dest.Len() != ed.Len()) ok = false;
                         if (ok) for (int i = 0; i < dest.Len(); i++) if (dest[i] != ed[i]) ok = false;
                         if (! ok)
                         {
                                 printf("!!!\n");
                         }
                         IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
                         IAssert(dest.Len() == ed.Len());
                         for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == ed[i]);
                 }
                 catch (TUnicodeException e)
                 {
                         if (f) {
                                 fprintf(f, "\n -> expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(expectedDest[i]));
                                 fprintf(f, "\n    exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
                         IAssert(expectedThrow);
                 }
         }
 }

Here is the call graph for this function:

void TUniCodec::TestUtf16 ( )

Definition at line 408 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Add(), boLittleEndian, boMachineEndian, bomAllowed, bomRequired, TVec< TVal, TSizeTy >::Clr(), errorHandling, TVec< TVal, TSizeTy >::Gen(), IsMachineLittleEndian(), TVec< TVal, TSizeTy >::Len(), TUInt::Mx, replacementChar, TVec< TVal, TSizeTy >::Reserve(), skipBom, strict, SwapBytes(), TestDecodeUtf16(), uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.

Referenced by TestDecodeUtf16().

 {
         // Runs the UTF-16 tests for every combination of skipBom, strict, error-handling mode,
         // BOM handling, byte order and insertBom.
         // Note that this overwrites the members 'strict', 'errorHandling' and 'skipBom'.
         TIntV utf16ReplCh; utf16ReplCh.Add(replacementChar);
         for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
         for (int strict_ = 0; strict_ < 2; strict_++)
         for (int errMode_ = 0; errMode_ < 4; errMode_++)
         for (int bomHandling_ = 0; bomHandling_ < 3; bomHandling_++)
         for (int byteOrder_ = 0; byteOrder_ < 3; byteOrder_++)
         for (int insertBom_ = 0; insertBom_ < 2; insertBom_++)
         {
                 strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
                 bool insertBom = (insertBom_ == 1);
                 TUniByteOrder byteOrder = (TUniByteOrder) byteOrder_;
                 TUtf16BomHandling bomHandling = (TUtf16BomHandling) bomHandling_;
                 // The generator is re-created with the same seed for every combination of settings.
                 TRnd rnd = TRnd(123);
                 // Test DecodeUtf16 on various random UTF-16-encoded sequences.
                 for (int i = 0; i < 10; i++)
                 {
                         TestDecodeUtf16(rnd, "A", bomHandling, byteOrder, insertBom);
                         TestDecodeUtf16(rnd, "AAA", bomHandling, byteOrder, insertBom);
                         TestDecodeUtf16(rnd, "B", bomHandling, byteOrder, insertBom);
                         TestDecodeUtf16(rnd, "DDAADADAAADDDAA", bomHandling, byteOrder, insertBom);
                         TestDecodeUtf16(rnd, "DEEEDAAEEDADEEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
                         TestDecodeUtf16(rnd, "DEaEaEDAAEaEDADEaEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
                         TestDecodeUtf16(rnd, "CABDEBACCEaB", bomHandling, byteOrder, insertBom);
                         TestDecodeUtf16(rnd, "EaEEEEaBBACABXABYXXEaYDDXBDCEA", bomHandling, byteOrder, insertBom);
                         TestDecodeUtf16(rnd, "EaEEEEaBDCAAXADYXXEaYDDXDCEA", bomHandling, byteOrder, insertBom);
                 }
                 //continue;
                 // Test both DecodeUtf16 and EncodeUtf16 systematically on various characters
                 // close to powers of 2.
                 TIntV src, expectedDest, src2;
                 expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
                 for (int pow = 8; pow <= 32; pow++)
                 {
                         // Test the values within 256 of 2^pow; the first and the last window are
                         // clipped to 0 and to TUInt::Mx respectively.
                         uint uFrom, uTo;
                         if (pow == 8) uFrom = 0, uTo = 1u << pow;
                         else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
                         else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
                         printf("%u..%u          \r", uFrom, uTo);
                         // The loop has no condition and is left via the 'break' at the bottom, so that
                         // 'uTo' itself is tested and u cannot wrap around when uTo == TUInt::Mx.
                         for (uint u = uFrom; ; u++)
                         {
                                 int nWords = 0;
                                 if (u < 0x10000) nWords = 1;
                                 else nWords = 2;
                                 bool isMachineLe = IsMachineLittleEndian(), isDestLe = (byteOrder == boLittleEndian || (byteOrder == boMachineEndian && isMachineLe));
                                 bool swap = (isMachineLe != isDestLe);
                                 // 'err' tells whether 'u' is expected to be rejected by the codec.
                                 bool err = (u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023) || (strict && Utf16SecondSurrogate <= u && u <= Utf16SecondSurrogate + 1023);
                                 // presumably Gen(3, n) allocates room for 3 elements and sets the length to n -- TODO confirm
                                 src.Gen(3, (err ? 0 : nWords) + (insertBom ? 1 : 0));
                                 if (insertBom) src[0] = (swap ? 0xfffe : 0xfeff);
                                 // The decoder can only be tested on values that have a UTF-16 encoding at all.
                                 if (! ((u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023)))
                                 {
                                         // Try to encode 'u' and see if it gets decoded correctly.
                                         if (nWords == 1) src[insertBom ? 1 : 0] = (swap ? SwapBytes(u) : u);
                                         else {
                                                 int u1 = Utf16FirstSurrogate + (((u - 0x10000) >> 10) & 1023);
                                                 int u2 = Utf16SecondSurrogate + ((u - 0x10000) & 1023);
                                                 src[insertBom ? 1 : 0] = (swap ? SwapBytes(u1) : u1);
                                                 src[insertBom ? 2 : 1] = (swap ? SwapBytes(u2) : u2); }
                                         if (! ((u == 0xfffe || u == 0xfeff) && bomHandling == bomAllowed && ! insertBom)) // this will just create a mess when decoding
                                         {
                                                 // Work out the expected output and return value ('erv') of the decoder.
                                                 expectedDest.Reserve(2, 0);
                                                 if (insertBom && ! skipBom) expectedDest.Add(0xfeff);
                                                 if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
                                                 else if (! err) expectedDest.Add(u);
                                                 int erv = (err ? 0 : expectedDest.Len());
                                                 if (skipBom && (u == 0xfeff || u == 0xfffe) && ! insertBom) expectedDest.Clr(), erv = 0;
                                                 bool errD = err;
                                                 // If a BOM is required but was not inserted, only a 'u' that is itself a BOM can be decoded.
                                                 if (bomHandling == bomRequired && ! insertBom) {
                                                         expectedDest.Clr(false);
                                                         if (u == 0xfeff || u == 0xfffe) { erv = (skipBom ? 0 : 1); if (! skipBom) expectedDest.Add(0xfeff); }
                                                         else { erv = -1; errD = true;
                                                                 /*if (errorHandling == uehReplace) expectedDest.Add(replacementChar);*/ }}
                                                 TestUtf16(true, erv, (errD && errorHandling == uehThrow), src, expectedDest, bomHandling, byteOrder, insertBom, 0);
                                         }
                                 }
                                 // We can also test the UTF-16 encoder.
                                 // Here 'src2' holds the codepoint to encode and 'src' serves as the expected output.
                                 src2[0] = u;
                                 if (err) {
                                         src.Clr(false); if (insertBom) src.Add(swap ? 0xfffe : 0xfeff);
                                         if (errorHandling == uehReplace) {
                                                 src.Add(swap ? SwapBytes(replacementChar) : replacementChar);
                                                 /*if (byteOrder == boBigEndian || (byteOrder == boMachineEndian && ! TUniCodec::IsMachineLittleEndian())) { src.Add((replacementChar >> 8) & 0xff); src.Add(replacementChar & 0xff); }
                                                 else { src.Add(replacementChar & 0xff); src.Add((replacementChar >> 8) & 0xff); } */
                                         }}
                                 TestUtf16(false, (err ? 0 : 1) + (insertBom ? 1 : 0), (err && errorHandling == uehThrow), src2, src, bomHandling, byteOrder, insertBom, 0);
                                 //
                                 if (u == uTo) break;
                         }
                 }
         }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniCodec::TestUtf8	(	bool	decode,
		size_t	expectedRetVal,
		bool	expectedThrow,
		const TIntV &	src,
		const TIntV &	expectedDest,
		FILE *	f
	)

protected

Definition at line 99 of file unicode.cpp.

References TStr::CStr(), DecodeUtf8(), EncodeUtf8(), errorHandling, IAssert, TVec< TVal, TSizeTy >::Len(), TUnicodeException::message, replacementChar, skipBom, TUnicodeException::srcChar, TUnicodeException::srcIdx, strict, uehAbort, uehIgnore, uehReplace, and uehThrow.

 {
         TIntV dest;
         if (f) {
                 fprintf(f, "Settings: %s  %s  %s   replacementChar = %x\n",
                         (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
                         (strict ? "STRICT" : ""), (skipBom ? "skipBom" : ""), uint(replacementChar));
                 fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %02x" : " %x"), uint(src[i])); }
         try
         {
                 size_t retVal = (decode ? DecodeUtf8(src, 0, src.Len(), dest, true) : EncodeUtf8(src, 0, src.Len(), dest, true));
                 if (f) {
                         fprintf(f, "\n -> dest:    "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" :  " %02x"), uint(dest[i]));
                         fprintf(f, "\n    expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" :  " %02x"), uint(expectedDest[i]));
                         fprintf(f, "\n    retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
                 if (retVal != expectedRetVal)
                         printf("!!!");
                 IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
                 if (dest.Len() != expectedDest.Len())
                         printf("!!!");
                 IAssert(dest.Len() == expectedDest.Len());
                 for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]);
         }
         catch (TUnicodeException e)
         {
                 if (f) {
                         fprintf(f, "\n -> expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, " %x", uint(expectedDest[i]));
                         fprintf(f, "\n    exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
                 IAssert(expectedThrow);
         }
 }

Here is the call graph for this function:

void TUniCodec::TestUtf8 ( )

Definition at line 194 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), EncodeUtf8(), errorHandling, TVec< TVal, TSizeTy >::Gen(), TUInt::Mx, replacementChar, TVec< TVal, TSizeTy >::Reserve(), skipBom, strict, TestDecodeUtf8(), uehReplace, and uehThrow.

Referenced by TestDecodeUtf8().

 {
         TIntV utf8ReplCh; EncodeUtf8((TVectorBuilder(), replacementChar).v, 0, 1, utf8ReplCh, true);
         for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
         for (int strict_ = 0; strict_ < 2; strict_++)
         for (int errMode_ = 0; errMode_ < 4; errMode_++)
         {
                 strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
                 TRnd rnd = TRnd(123);
                 // Test DecodeUtf8 on various random UTF-8-encoded sequences.
                 for (int i = 0; i < 10; i++)
                 {
                         TestDecodeUtf8(rnd, "X3A1A2A3A4A5A6B2B3B4B5B6C3C4C5C6D4D5D6E5E6F6G6");
                         TestDecodeUtf8(rnd, "X3A5dA6d");
                         TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A1G6H6Y3X3A1");
                         TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A2G6H6Y3X3A1");
                         TestDecodeUtf8(rnd, "Y3A1B2C3D4E4F5A1G6H6Y3X3A1");
                         TestDecodeUtf8(rnd, "A1B2C3D4E4F5A1G6H6Y3X3A1");
                         TestDecodeUtf8(rnd, "G6A1A1D4E4A1B2");
                         TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2");
                         TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2D4a");
                         TestDecodeUtf8(rnd, "X3A1B2C3D5E4F5A1G6H6Y3X3A1");
                         TestDecodeUtf8(rnd, "X3A1B2C3D4E5F5A1G6H6Y3X3A1");
                         TestDecodeUtf8(rnd, "X3A1B2C3D4aE4F5A1G6H6Y3X3A1");
                         TestDecodeUtf8(rnd, "X3A1B2C3D4bE4F5A1G6H6Y3X3A1");
                         TestDecodeUtf8(rnd, "X3A2aA3aA4aA5aA6aB2aB3aB4aB5aB6aC3aC4aC5aC6aD4aD5aD6aE5aE6aF6aG6a");
                         TestDecodeUtf8(rnd, "X3A3bA4bA5bA6aB3bB4bB5bB6bC3bC4bC5bC6bD4bD5bD6bE5bE6bF6bG6b");
                         TestDecodeUtf8(rnd, "X3A4cA5cA6cB4cB5cB6cC4cC5cC6cD4cD5cD6cE5cE6cF6cG6c");
                         TestDecodeUtf8(rnd, "X3A5dA6dB5dB6dC5dC6dD5dD6dE5dE6dF6dG6d");
                         TestDecodeUtf8(rnd, "X3A6eB6eC6eD6eE6eF6eG6e");
                 }
                 // Test both DecodeUtf8 and EncodeUtf8 systematically on various characters
                 // close to powers of 2.
                 TIntV src, expectedDest, src2;
                 expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
                 for (int pow = 8; pow <= 32; pow++)
                 {
                         uint uFrom, uTo;
                         if (pow == 8) uFrom = 0, uTo = 1u << pow;
                         else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
                         else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
                         printf("%u..%u          \r", uFrom, uTo);
                         for (uint u = uFrom; ; u++)
                         {
                                 int nBytes = 0;
                                 if (u < (1u << 7)) nBytes = 1;
                                 else if (u < (1u << 11)) nBytes = 2;
                                 else if (u < (1u << 16)) nBytes = 3;
                                 else if (u < (1u << 21)) nBytes = 4;
                                 else if (u < (1u << 26)) nBytes = 5;
                                 else nBytes = 6;
                                 src.Gen(6, nBytes);
                                 if (nBytes == 1) src[0] = u;
                                 else {
                                         src[0] = (((1 << nBytes) - 1) << (8 - nBytes)) | (u >> (6 * (nBytes - 1)));
                                         for (int i = 1; i < nBytes; i++) src[i] = 0x80 | ((u >> (6 * (nBytes - i - 1))) & _0011_1111); }
                                 bool err = (strict && u > 0x10ffff);
                                 expectedDest.Reserve(1, 0);
                                 if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
                                 else if (! err) expectedDest.Add(u);
                                 int erv = (err ? 0 : 1);
                                 if (skipBom && (u == 0xfeff || u == 0xfffe)) expectedDest.Clr(), erv = 0;
                                 TestUtf8(true, erv, (err && errorHandling == uehThrow), src, expectedDest, 0);
                                 // We can also test the UTF-8 encoder.
                                 src2[0] = u;
                                 if (err) {
                                         if (errorHandling == uehReplace) src = utf8ReplCh;
                                         else src.Clr(false); }
                                 TestUtf8(false, (err ? 0 : 1), (err && errorHandling == uehThrow), src2, src, 0);
                                 //
                                 if (u == uTo) break;
                         }
                 }
         }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniCodec::WordsToBytes	(	const TIntV &	src,
		TIntV &	dest
	)

protected

Definition at line 274 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), IsMachineLittleEndian(), and TVec< TVal, TSizeTy >::Len().

Referenced by TestUtf16().

 {
         dest.Clr();
         bool isLE = IsMachineLittleEndian();
         for (int i = 0; i < src.Len(); i++) {
                 int c = src[i] & 0xffff;
                 if (isLE) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
                 else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } }
 }