SNAP Library 3.0, User Reference  2016-07-20 17:56:49
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
TUnicode Class Reference

#include <unicode.h>

Public Types

typedef TUniChDb::TCaseConversion TCaseConversion
 

Public Member Functions

 TUnicode ()
 
 TUnicode (const TStr &fnBinUcd)
 
void Init ()
 
int DecodeUtf8 (const TIntV &src, TIntV &dest) const
 
int DecodeUtf8 (const TStr &src, TIntV &dest) const
 
int EncodeUtf8 (const TIntV &src, TIntV &dest) const
 
TStr EncodeUtf8Str (const TIntV &src) const
 
int DecodeUtf16FromBytes (const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
 
int DecodeUtf16FromWords (const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
 
int EncodeUtf16ToWords (const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
 
int EncodeUtf16ToBytes (const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
 
void RegisterCodec (const TStr &nameList, const PCodecBase &codec)
 
void UnregisterCodec (const TStr &nameList)
 
void ClrCodecs ()
 
void InitCodecs ()
 
PCodecBase GetCodec (const TStr &name) const
 
void GetAllCodecs (TCodecBaseV &dest) const
 
bool FindNextWordBoundary (const TIntV &src, int &position) const
 
void FindWordBoundaries (const TIntV &src, TBoolV &dest) const
 
bool FindNextSentenceBoundary (const TIntV &src, int &position) const
 
void FindSentenceBoundaries (const TIntV &src, TBoolV &dest) const
 
void ClrSentenceBoundaryExceptions ()
 
void UseEnglishSentenceBoundaryExceptions ()
 
void Decompose (const TIntV &src, TIntV &dest, bool compatibility) const
 
void Compose (const TIntV &src, TIntV &dest) const
 
void DecomposeAndCompose (const TIntV &src, TIntV &dest, bool compatibility) const
 
int ExtractStarters (const TIntV &src, TIntV &dest) const
 
int ExtractStarters (TIntV &src) const
 
void GetLowerCase (const TIntV &src, TIntV &dest) const
 
void GetUpperCase (const TIntV &src, TIntV &dest) const
 
void GetTitleCase (const TIntV &src, TIntV &dest) const
 
void GetSimpleLowerCase (const TIntV &src, TIntV &dest) const
 
void GetSimpleUpperCase (const TIntV &src, TIntV &dest) const
 
void GetSimpleTitleCase (const TIntV &src, TIntV &dest) const
 
void ToSimpleUpperCase (TIntV &src) const
 
void ToSimpleLowerCase (TIntV &src) const
 
void ToSimpleTitleCase (TIntV &src) const
 
void GetCaseFolded (const TIntV &src, TIntV &dest, const bool full=true) const
 
void ToCaseFolded (TIntV &src) const
 
TStr GetUtf8CaseFolded (const TStr &s) const
 
DECLARE_FORWARDED_PROPERTY_METHODS ___UniFwd2 (IsPrivateUse, IsSurrogate)
 
TUniChCategory GetCat (const int cp) const
 
TUniChSubCategory GetSubCat (const int cp) const
 
const char * GetCharName (const int cp) const
 
TStr GetCharNameS (const int cp) const
 

Static Public Member Functions

static void EncodeUtf8 (const uint &Ch, TChA &Dest)
 
static TStr EncodeUtf8 (const uint &Ch)
 

Public Attributes

TUniCodec codec
 
TUniChDb ucd
 
T8BitCodec< TEncoding_ISO8859_1 > iso8859_1
 
T8BitCodec< TEncoding_ISO8859_2 > iso8859_2
 
T8BitCodec< TEncoding_ISO8859_3 > iso8859_3
 
T8BitCodec< TEncoding_ISO8859_4 > iso8859_4
 
T8BitCodec< TEncoding_YuAscii > yuAscii
 
T8BitCodec< TEncoding_CP1250 > cp1250
 
T8BitCodec< TEncoding_CP852 > cp852
 
T8BitCodec< TEncoding_CP437 > cp437
 

Static Protected Member Functions

static TStr NormalizeCodecName (const TStr &name)
 

Protected Attributes

THash< TStr, PCodecBase > codecs
 

Detailed Description

Definition at line 1771 of file unicode.h.

Member Typedef Documentation

Constructor & Destructor Documentation

TUnicode::TUnicode ( )
inline

Definition at line 1777 of file unicode.h.

1777 { Init(); }
void Init()
Definition: unicode.h:1779
TUnicode::TUnicode ( const TStr fnBinUcd)
inline explicit

Definition at line 1778 of file unicode.h.

1778 { ucd.LoadBin(fnBinUcd); Init(); }
void Init()
Definition: unicode.h:1779
TUniChDb ucd
Definition: unicode.h:1775
void LoadBin(const TStr &fnBin)
Definition: unicode.h:1291

Member Function Documentation

DECLARE_FORWARDED_PROPERTY_METHODS TUnicode::___UniFwd2 ( IsPrivateUse  ,
IsSurrogate   
) const
inline

Definition at line 2018 of file unicode.h.

2020  { return ucd.GetCat(cp); }
TUniChCategory GetCat(const int cp) const
Definition: unicode.h:1353
TUniChDb ucd
Definition: unicode.h:1775
void TUnicode::ClrCodecs ( )
inline

Definition at line 1881 of file unicode.h.

1881 { codecs.Clr(); }
THash< TStr, PCodecBase > codecs
Definition: unicode.h:1869
void TUnicode::ClrSentenceBoundaryExceptions ( )
inline

Definition at line 1924 of file unicode.h.

1924 { ucd.SbEx_Clr(); }
TUniChDb ucd
Definition: unicode.h:1775
void SbEx_Clr()
Definition: unicode.h:1489
void TUnicode::Compose ( const TIntV src,
TIntV dest 
) const
inline

Definition at line 1941 of file unicode.h.

1941 { return ucd.Compose(src, dest, true); }
void Compose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const
Definition: unicode.h:3158
TUniChDb ucd
Definition: unicode.h:1775
int TUnicode::DecodeUtf16FromBytes ( const TIntV src,
TIntV dest,
const TUtf16BomHandling  bomHandling = bomAllowed,
const TUniByteOrder  defaultByteOrder = boMachineEndian 
) const
inline

Definition at line 1810 of file unicode.h.

1812  {
1813  return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TUniCodec codec
Definition: unicode.h:1774
size_t DecodeUtf16FromBytes(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
Definition: unicode.h:2210
int TUnicode::DecodeUtf16FromWords ( const TIntV src,
TIntV dest,
const TUtf16BomHandling  bomHandling = bomAllowed,
const TUniByteOrder  defaultByteOrder = boMachineEndian 
) const
inline

Definition at line 1823 of file unicode.h.

1825  {
1826  return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
size_t DecodeUtf16FromWords(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
Definition: unicode.h:2294
TUniCodec codec
Definition: unicode.h:1774
int TUnicode::DecodeUtf8 ( const TIntV src,
TIntV dest 
) const
inline

Definition at line 1787 of file unicode.h.

1787 { return (int) codec.DecodeUtf8(src, dest); }
TUniCodec codec
Definition: unicode.h:1774
size_t DecodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2036
int TUnicode::DecodeUtf8 ( const TStr src,
TIntV dest 
) const
inline

Definition at line 1788 of file unicode.h.

1788 { return (int) codec.DecodeUtf8(src, dest); }
TUniCodec codec
Definition: unicode.h:1774
size_t DecodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2036
void TUnicode::Decompose ( const TIntV src,
TIntV dest,
bool  compatibility 
) const
inline

Definition at line 1934 of file unicode.h.

1934 { ucd.Decompose(src, dest, compatibility, true); }
TUniChDb ucd
Definition: unicode.h:1775
void Decompose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
Definition: unicode.h:3126
void TUnicode::DecomposeAndCompose ( const TIntV src,
TIntV dest,
bool  compatibility 
) const
inline

Definition at line 1946 of file unicode.h.

1946 { return ucd.DecomposeAndCompose(src, dest, compatibility); }
TUniChDb ucd
Definition: unicode.h:1775
void DecomposeAndCompose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
Definition: unicode.h:3148
int TUnicode::EncodeUtf16ToBytes ( const TIntV src,
TIntV dest,
const bool  insertBom,
const TUniByteOrder  destByteOrder = boMachineEndian 
) const
inline

Definition at line 1838 of file unicode.h.

1839  {
1840  return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TUniCodec codec
Definition: unicode.h:1774
size_t EncodeUtf16ToBytes(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
Definition: unicode.h:2428
int TUnicode::EncodeUtf16ToWords ( const TIntV src,
TIntV dest,
const bool  insertBom,
const TUniByteOrder  destByteOrder = boMachineEndian 
) const
inline

Definition at line 1834 of file unicode.h.

1835  {
1836  return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TUniCodec codec
Definition: unicode.h:1774
size_t EncodeUtf16ToWords(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
Definition: unicode.h:2376
int TUnicode::EncodeUtf8 ( const TIntV src,
TIntV dest 
) const
inline

Definition at line 1792 of file unicode.h.

1792 { return (int) codec.EncodeUtf8(src, dest); }
TUniCodec codec
Definition: unicode.h:1774
size_t EncodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2152
void TUnicode::EncodeUtf8 ( const uint Ch,
TChA Dest 
)
static

Definition at line 1696 of file unicode.cpp.

1696  {
1697  if (c > 0x10ffff) {
1698  throw TExcept::New(TStr::Fmt("Unkown Unicode character %u", c)); }
1699  if (c < 0x80u)
1700  dest.AddCh(char(c & 0xffu));
1701  else if (c < 0x800u) {
1702  dest.AddCh(char(TUniCodec::_1100_0000 | ((c >> 6) & TUniCodec::_0001_1111)));
1703  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1704  else if (c < 0x10000u) {
1705  dest.AddCh(char(TUniCodec::_1110_0000 | ((c >> 12) & TUniCodec::_0000_1111)));
1706  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
1707  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1708  else if (c < 0x200000u) {
1709  dest.AddCh(char(TUniCodec::_1111_0000 | ((c >> 18) & TUniCodec::_0000_0111)));
1710  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
1711  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
1712  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1713  else if (c < 0x4000000u) {
1714  dest.AddCh(char(TUniCodec::_1111_1000 | ((c >> 24) & TUniCodec::_0000_0011)));
1715  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 18) & TUniCodec::_0011_1111)));
1716  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
1717  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
1718  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1719  else {
1720  dest.AddCh(char(TUniCodec::_1111_1100 | ((c >> 30) & TUniCodec::_0000_0011)));
1721  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 24) & TUniCodec::_0011_1111)));
1722  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 18) & TUniCodec::_0011_1111)));
1723  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
1724  dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
1725  dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
1726 }
static PExcept New(const TStr &MsgStr, const TStr &LocStr=TStr())
Definition: ut.h:169
static TStr Fmt(const char *FmtStr,...)
Definition: dt.cpp:1599
TStr TUnicode::EncodeUtf8 ( const uint Ch)
static

Definition at line 1728 of file unicode.cpp.

1728  {
1729  TChA ChA; EncodeUtf8(Ch, ChA); return ChA;
1730 }
Definition: dt.h:201
int EncodeUtf8(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1792
TStr TUnicode::EncodeUtf8Str ( const TIntV src) const
inline

Definition at line 1796 of file unicode.h.

1796 { return codec.EncodeUtf8Str(src); }
TStr EncodeUtf8Str(const TSrcVec &src, size_t srcIdx, const size_t srcCount) const
Definition: unicode.h:149
TUniCodec codec
Definition: unicode.h:1774
int TUnicode::ExtractStarters ( const TIntV src,
TIntV dest 
) const
inline

Definition at line 1951 of file unicode.h.

1951 { return (int) ucd.ExtractStarters(src, dest); }
TUniChDb ucd
Definition: unicode.h:1775
size_t ExtractStarters(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const
Definition: unicode.h:3215
int TUnicode::ExtractStarters ( TIntV src) const
inline

Definition at line 1953 of file unicode.h.

1953 { return (int) ucd.ExtractStarters(src); }
TUniChDb ucd
Definition: unicode.h:1775
size_t ExtractStarters(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const
Definition: unicode.h:3215
bool TUnicode::FindNextSentenceBoundary ( const TIntV src,
int &  position 
) const
inline

Definition at line 1916 of file unicode.h.

1916  {
1917  if (position < 0) { position = 0; return true; }
1918  size_t position_; bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TUniChDb ucd
Definition: unicode.h:1775
bool FindNextSentenceBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2636
bool TUnicode::FindNextWordBoundary ( const TIntV src,
int &  position 
) const
inline

Definition at line 1901 of file unicode.h.

1901  {
1902  if (position < 0) { position = 0; return true; }
1903  size_t position_; bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TUniChDb ucd
Definition: unicode.h:1775
bool FindNextWordBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2483
void TUnicode::FindSentenceBoundaries ( const TIntV src,
TBoolV dest 
) const
inline

Definition at line 1922 of file unicode.h.

1922 { ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TUniChDb ucd
Definition: unicode.h:1775
void FindSentenceBoundaries(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
Definition: unicode.h:2793
void TUnicode::FindWordBoundaries ( const TIntV src,
TBoolV dest 
) const
inline

Definition at line 1907 of file unicode.h.

1907 { ucd.FindWordBoundaries(src, 0, src.Len(), dest); }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TUniChDb ucd
Definition: unicode.h:1775
void FindWordBoundaries(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
Definition: unicode.h:2561
void TUnicode::GetAllCodecs ( TCodecBaseV dest) const
inline

Definition at line 1887 of file unicode.h.

1887  {
1888  dest.Clr();
1889  for (int i = codecs.FFirstKeyId(); codecs.FNextKeyId(i); ) {
1890  PCodecBase codec = codecs[i]; bool found = false;
1891  for (int j = 0; j < dest.Len(); j++) if (dest[j]() == codec()) { found = true; break; }
1892  if (! found) dest.Add(codec); }}
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
TUniCodec codec
Definition: unicode.h:1774
Definition: bd.h:196
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
THash< TStr, PCodecBase > codecs
Definition: unicode.h:1869
void TUnicode::GetCaseFolded ( const TIntV src,
TIntV dest,
const bool  full = true 
) const
inline

Definition at line 1989 of file unicode.h.

1989 { return ucd.GetCaseFolded(src, dest, true, full, false); }
TUniChDb ucd
Definition: unicode.h:1775
void GetCaseFolded(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool full, const bool turkic=false) const
Definition: unicode.h:1629
const char* TUnicode::GetCharName ( const int  cp) const
inline

Definition at line 2024 of file unicode.h.

2024 { return ucd.GetCharName(cp); }
const char * GetCharName(const int cp) const
Definition: unicode.h:1331
TUniChDb ucd
Definition: unicode.h:1775
TStr TUnicode::GetCharNameS ( const int  cp) const
inline

Definition at line 2025 of file unicode.h.

2025 { return ucd.GetCharNameS(cp); }
TUniChDb ucd
Definition: unicode.h:1775
TStr GetCharNameS(const int cp) const
Definition: unicode.h:1332
PCodecBase TUnicode::GetCodec ( const TStr name) const
inline

Definition at line 1883 of file unicode.h.

1883  {
1884  TStr s = NormalizeCodecName(name);
1885  PCodecBase p; if (! codecs.IsKeyGetDat(s, p)) p.Clr();
1886  return p; }
void Clr()
Definition: bd.h:502
static TStr NormalizeCodecName(const TStr &name)
Definition: unicode.h:1870
Definition: dt.h:412
Definition: bd.h:196
THash< TStr, PCodecBase > codecs
Definition: unicode.h:1869
void TUnicode::GetLowerCase ( const TIntV src,
TIntV dest 
) const
inline

Definition at line 1965 of file unicode.h.

1965 { ucd.GetLowerCase(src, dest, true, false, false); }
void GetLowerCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Definition: unicode.h:1590
TUniChDb ucd
Definition: unicode.h:1775
void TUnicode::GetSimpleLowerCase ( const TIntV src,
TIntV dest 
) const
inline

Definition at line 1972 of file unicode.h.

1972 { ucd.GetSimpleLowerCase(src, dest, true); }
TUniChDb ucd
Definition: unicode.h:1775
void GetSimpleLowerCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:1601
void TUnicode::GetSimpleTitleCase ( const TIntV src,
TIntV dest 
) const
inline

Definition at line 1974 of file unicode.h.

1974 { ucd.GetSimpleTitleCase(src, dest, true); }
void GetSimpleTitleCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:1603
TUniChDb ucd
Definition: unicode.h:1775
void TUnicode::GetSimpleUpperCase ( const TIntV src,
TIntV dest 
) const
inline

Definition at line 1973 of file unicode.h.

1973 { ucd.GetSimpleUpperCase(src, dest, true); }
void GetSimpleUpperCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:1602
TUniChDb ucd
Definition: unicode.h:1775
TUniChSubCategory TUnicode::GetSubCat ( const int  cp) const
inline

Definition at line 2021 of file unicode.h.

2021 { return ucd.GetSubCat(cp); }
TUniChSubCategory GetSubCat(const int cp) const
Definition: unicode.h:1354
TUniChDb ucd
Definition: unicode.h:1775
void TUnicode::GetTitleCase ( const TIntV src,
TIntV dest 
) const
inline

Definition at line 1967 of file unicode.h.

1967 { ucd.GetTitleCase(src, dest, true, false, false); }
TUniChDb ucd
Definition: unicode.h:1775
void GetTitleCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Definition: unicode.h:1592
void TUnicode::GetUpperCase ( const TIntV src,
TIntV dest 
) const
inline

Definition at line 1966 of file unicode.h.

1966 { ucd.GetUpperCase(src, dest, true, false, false); }
void GetUpperCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Definition: unicode.h:1591
TUniChDb ucd
Definition: unicode.h:1775
TStr TUnicode::GetUtf8CaseFolded ( const TStr s) const
inline

Definition at line 1994 of file unicode.h.

1994  {
1995  bool isAscii = true;
1996  for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; }
1997  if (isAscii) return s.GetLc();
1998  TIntV src; DecodeUtf8(s, src);
1999  TIntV dest; GetCaseFolded(src, dest);
2000  return EncodeUtf8Str(dest); }
int Len() const
Definition: dt.h:487
TStr EncodeUtf8Str(const TIntV &src) const
Definition: unicode.h:1796
void GetCaseFolded(const TIntV &src, TIntV &dest, const bool full=true) const
Definition: unicode.h:1989
TStr GetLc() const
Definition: dt.h:499
int DecodeUtf8(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1787
void TUnicode::Init ( )
inline

Definition at line 1779 of file unicode.h.

1779 { InitCodecs(); }
void InitCodecs()
Definition: unicode.cpp:1683
void TUnicode::InitCodecs ( )

Definition at line 1683 of file unicode.cpp.

1684 {
1685  ClrCodecs();
1686  RegisterCodec("ISO-8859-1 ISO_8859-1 ISO_8859-1:1987 ISO-IR-100 CP819 IBM819 LATIN1 L1 csISOLatin1 ISO8859-1 ISO8859_1 CP28591", TCodecBase::New<TCodec_ISO8859_1>());
1687  RegisterCodec("ISO-8859-2 ISO_8859-2 ISO_8859-2:1987 ISO-IR-101 LATIN2 L2 csISOLatin2 ISO8859-2 ISO8859_2 CP28592", TCodecBase::New<TCodec_ISO8859_2>());
1688  RegisterCodec("ISO-8859-3 ISO_8859-3 ISO_8859-3:1988 ISO-IR-109 LATIN3 L3 csISOLatin3 ISO8859-3 ISO8859_3 CP28593", TCodecBase::New<TCodec_ISO8859_3>());
1689  RegisterCodec("ISO-8859-4 ISO_8859-4 ISO_8859-4:1988 ISO-IR-110 LATIN4 L4 csISOLatin4 ISO8859-4 ISO8859_4 CP28594", TCodecBase::New<TCodec_ISO8859_4>());
1690  RegisterCodec("YUASCII YU-ASCII YU_ASCII", TCodecBase::New<TCodec_YuAscii>());
1691  RegisterCodec("CP1250 Windows-1250 MS-EE", TCodecBase::New<TCodec_CP1250>());
1692  RegisterCodec("CP852 cp852_DOSLatin2 DOSLatin2", TCodecBase::New<TCodec_CP852>());
1693  RegisterCodec("CP437 cp437_DOSLatinUS DOSLatinUS", TCodecBase::New<TCodec_CP437>());
1694 }
void ClrCodecs()
Definition: unicode.h:1881
void RegisterCodec(const TStr &nameList, const PCodecBase &codec)
Definition: unicode.h:1873
static TStr TUnicode::NormalizeCodecName ( const TStr name)
inline static protected

Definition at line 1870 of file unicode.h.

1870  {
1871  TStr s = name.GetLc(); s.ChangeStrAll("_", ""); s.ChangeStrAll("-", ""); return s; }
int ChangeStrAll(const TStr &SrcStr, const TStr &DstStr, const bool &FromStartP=false)
Definition: dt.cpp:1141
TStr GetLc() const
Definition: dt.h:499
Definition: dt.h:412
void TUnicode::RegisterCodec ( const TStr nameList,
const PCodecBase codec 
)
inline

Definition at line 1873 of file unicode.h.

1873  {
1874  TStrV names; nameList.SplitOnWs(names);
1875  for (int i = 0; i < names.Len(); i++)
1876  codecs.AddDat(NormalizeCodecName(names[i]), codec); }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
static TStr NormalizeCodecName(const TStr &name)
Definition: unicode.h:1870
void SplitOnWs(TStrV &StrV) const
Definition: dt.cpp:972
THash< TStr, PCodecBase > codecs
Definition: unicode.h:1869
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:429
void TUnicode::ToCaseFolded ( TIntV src) const
inline

Definition at line 1992 of file unicode.h.

1992 { return ucd.ToCaseFolded(src, false); }
TUniChDb ucd
Definition: unicode.h:1775
void ToCaseFolded(TSrcVec &src, size_t srcIdx, const size_t srcCount, const bool turkic=false) const
Definition: unicode.h:1636
void TUnicode::ToSimpleLowerCase ( TIntV src) const
inline

Definition at line 1978 of file unicode.h.

1978 { ucd.ToSimpleLowerCase(src); }
TUniChDb ucd
Definition: unicode.h:1775
void ToSimpleLowerCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const
Definition: unicode.h:1610
void TUnicode::ToSimpleTitleCase ( TIntV src) const
inline

Definition at line 1979 of file unicode.h.

1979 { ucd.ToSimpleTitleCase(src); }
void ToSimpleTitleCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const
Definition: unicode.h:1611
TUniChDb ucd
Definition: unicode.h:1775
void TUnicode::ToSimpleUpperCase ( TIntV src) const
inline

Definition at line 1977 of file unicode.h.

1977 { ucd.ToSimpleUpperCase(src); }
void ToSimpleUpperCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const
Definition: unicode.h:1609
TUniChDb ucd
Definition: unicode.h:1775
void TUnicode::UnregisterCodec ( const TStr nameList)
inline

Definition at line 1877 of file unicode.h.

1877  {
1878  TStrV names; nameList.SplitOnWs(names);
1879  for (int i = 0; i < names.Len(); i++)
1880  codecs.DelKey(NormalizeCodecName(names[i])); }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
static TStr NormalizeCodecName(const TStr &name)
Definition: unicode.h:1870
void SplitOnWs(TStrV &StrV) const
Definition: dt.cpp:972
THash< TStr, PCodecBase > codecs
Definition: unicode.h:1869
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:429
void TUnicode::UseEnglishSentenceBoundaryExceptions ( )
inline

Definition at line 1925 of file unicode.h.

1925 { ucd.SbEx_SetStdEnglish(); }
TUniChDb ucd
Definition: unicode.h:1775
int SbEx_SetStdEnglish()
Definition: unicode.h:1499

Member Data Documentation

TUniCodec TUnicode::codec

Definition at line 1774 of file unicode.h.

THash<TStr, PCodecBase> TUnicode::codecs
protected

Definition at line 1869 of file unicode.h.

T8BitCodec<TEncoding_CP1250> TUnicode::cp1250

Definition at line 1851 of file unicode.h.

T8BitCodec<TEncoding_CP437> TUnicode::cp437

Definition at line 1853 of file unicode.h.

T8BitCodec<TEncoding_CP852> TUnicode::cp852

Definition at line 1852 of file unicode.h.

T8BitCodec<TEncoding_ISO8859_1> TUnicode::iso8859_1

Definition at line 1846 of file unicode.h.

T8BitCodec<TEncoding_ISO8859_2> TUnicode::iso8859_2

Definition at line 1847 of file unicode.h.

T8BitCodec<TEncoding_ISO8859_3> TUnicode::iso8859_3

Definition at line 1848 of file unicode.h.

T8BitCodec<TEncoding_ISO8859_4> TUnicode::iso8859_4

Definition at line 1849 of file unicode.h.

TUniChDb TUnicode::ucd

Definition at line 1775 of file unicode.h.

T8BitCodec<TEncoding_YuAscii> TUnicode::yuAscii

Definition at line 1850 of file unicode.h.


The documentation for this class was generated from the following files: