SNAP Library, User Reference  2013-01-07 14:03:36
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
TUnicode Class Reference

#include <unicode.h>

List of all members.

Public Types

typedef TUniChDb::TCaseConversion TCaseConversion

Public Member Functions

 TUnicode ()
 TUnicode (const TStr &fnBinUcd)
void Init ()
int DecodeUtf8 (const TIntV &src, TIntV &dest) const
int DecodeUtf8 (const TStr &src, TIntV &dest) const
int EncodeUtf8 (const TIntV &src, TIntV &dest) const
TStr EncodeUtf8Str (const TIntV &src) const
int DecodeUtf16FromBytes (const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
int DecodeUtf16FromWords (const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
int EncodeUtf16ToWords (const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
int EncodeUtf16ToBytes (const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
void RegisterCodec (const TStr &nameList, const PCodecBase &codec)
void UnregisterCodec (const TStr &nameList)
void ClrCodecs ()
void InitCodecs ()
PCodecBase GetCodec (const TStr &name) const
void GetAllCodecs (TCodecBaseV &dest) const
bool FindNextWordBoundary (const TIntV &src, int &position) const
void FindWordBoundaries (const TIntV &src, TBoolV &dest) const
bool FindNextSentenceBoundary (const TIntV &src, int &position) const
void FindSentenceBoundaries (const TIntV &src, TBoolV &dest) const
void ClrSentenceBoundaryExceptions ()
void UseEnglishSentenceBoundaryExceptions ()
void Decompose (const TIntV &src, TIntV &dest, bool compatibility) const
void Compose (const TIntV &src, TIntV &dest) const
void DecomposeAndCompose (const TIntV &src, TIntV &dest, bool compatibility) const
int ExtractStarters (const TIntV &src, TIntV &dest) const
int ExtractStarters (TIntV &src) const
void GetLowerCase (const TIntV &src, TIntV &dest) const
void GetUpperCase (const TIntV &src, TIntV &dest) const
void GetTitleCase (const TIntV &src, TIntV &dest) const
void GetSimpleLowerCase (const TIntV &src, TIntV &dest) const
void GetSimpleUpperCase (const TIntV &src, TIntV &dest) const
void GetSimpleTitleCase (const TIntV &src, TIntV &dest) const
void ToSimpleUpperCase (TIntV &src) const
void ToSimpleLowerCase (TIntV &src) const
void ToSimpleTitleCase (TIntV &src) const
void GetCaseFolded (const TIntV &src, TIntV &dest, const bool full=true) const
void ToCaseFolded (TIntV &src) const
TStr GetUtf8CaseFolded (const TStr &s) const
DECLARE_FORWARDED_PROPERTY_METHODS ___UniFwd2 (IsPrivateUse, IsSurrogate) TUniChCategory GetCat(const int cp) const
TUniChSubCategory GetSubCat (const int cp) const
const char * GetCharName (const int cp) const
TStr GetCharNameS (const int cp) const

Public Attributes

TUniCodec codec
TUniChDb ucd
T8BitCodec< TEncoding_ISO8859_1 > iso8859_1
T8BitCodec< TEncoding_ISO8859_2 > iso8859_2
T8BitCodec< TEncoding_ISO8859_3 > iso8859_3
T8BitCodec< TEncoding_ISO8859_4 > iso8859_4
T8BitCodec< TEncoding_YuAscii > yuAscii
T8BitCodec< TEncoding_CP1250 > cp1250
T8BitCodec< TEncoding_CP852 > cp852
T8BitCodec< TEncoding_CP437 > cp437

Static Protected Member Functions

static TStr NormalizeCodecName (const TStr &name)

Protected Attributes

THash< TStr, PCodecBase > codecs

Detailed Description

Definition at line 1770 of file unicode.h.


Member Typedef Documentation


Constructor & Destructor Documentation

TUnicode::TUnicode ( ) [inline]

Definition at line 1776 of file unicode.h.

{ Init(); }
TUnicode::TUnicode ( const TStr & fnBinUcd) [inline, explicit]

Definition at line 1777 of file unicode.h.

{ ucd.LoadBin(fnBinUcd); Init(); }

Member Function Documentation

DECLARE_FORWARDED_PROPERTY_METHODS TUnicode::___UniFwd2 ( IsPrivateUse  ,
IsSurrogate   
) const [inline]

Definition at line 2013 of file unicode.h.

                                                  { return ucd.GetCat(cp); }
void TUnicode::ClrCodecs ( ) [inline]

Definition at line 1876 of file unicode.h.

{ codecs.Clr(); }

void TUnicode::ClrSentenceBoundaryExceptions ( ) [inline]

Definition at line 1919 of file unicode.h.

{ ucd.SbEx_Clr(); }
void TUnicode::Compose ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1936 of file unicode.h.

{ return ucd.Compose(src, dest, true); }
int TUnicode::DecodeUtf16FromBytes ( const TIntV & src,
TIntV & dest,
const TUtf16BomHandling  bomHandling = bomAllowed,
const TUniByteOrder  defaultByteOrder = boMachineEndian 
) const [inline]

Definition at line 1805 of file unicode.h.

                                                                              {
                        return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
int TUnicode::DecodeUtf16FromWords ( const TIntV & src,
TIntV & dest,
const TUtf16BomHandling  bomHandling = bomAllowed,
const TUniByteOrder  defaultByteOrder = boMachineEndian 
) const [inline]

Definition at line 1818 of file unicode.h.

                                                                              {
                        return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
int TUnicode::DecodeUtf8 ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1786 of file unicode.h.

{ return (int) codec.DecodeUtf8(src, dest); }
int TUnicode::DecodeUtf8 ( const TStr & src,
TIntV & dest 
) const [inline]

Definition at line 1787 of file unicode.h.

{ return (int) codec.DecodeUtf8(src, dest); }
void TUnicode::Decompose ( const TIntV & src,
TIntV & dest,
bool  compatibility 
) const [inline]

Definition at line 1929 of file unicode.h.

{ ucd.Decompose(src, dest, compatibility, true); }
void TUnicode::DecomposeAndCompose ( const TIntV & src,
TIntV & dest,
bool  compatibility 
) const [inline]

Definition at line 1941 of file unicode.h.

{ return ucd.DecomposeAndCompose(src, dest, compatibility); }
int TUnicode::EncodeUtf16ToBytes ( const TIntV & src,
TIntV & dest,
const bool  insertBom,
const TUniByteOrder  destByteOrder = boMachineEndian 
) const [inline]

Definition at line 1833 of file unicode.h.

                                                                           {
                        return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
int TUnicode::EncodeUtf16ToWords ( const TIntV & src,
TIntV & dest,
const bool  insertBom,
const TUniByteOrder  destByteOrder = boMachineEndian 
) const [inline]

Definition at line 1829 of file unicode.h.

                                                                           {
                        return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
int TUnicode::EncodeUtf8 ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1791 of file unicode.h.

{ return (int) codec.EncodeUtf8(src, dest); }
TStr TUnicode::EncodeUtf8Str ( const TIntV & src) const [inline]

Definition at line 1795 of file unicode.h.

{ return codec.EncodeUtf8Str(src); }
int TUnicode::ExtractStarters ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1946 of file unicode.h.

{ return (int) ucd.ExtractStarters(src, dest); }
int TUnicode::ExtractStarters ( TIntV & src) const [inline]

Definition at line 1948 of file unicode.h.

{ return (int) ucd.ExtractStarters(src); }
bool TUnicode::FindNextSentenceBoundary ( const TIntV & src,
int &  position 
) const [inline]

Definition at line 1911 of file unicode.h.

                                                                             {
                if (position < 0) { position = 0; return true; }
                size_t position_; bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }
bool TUnicode::FindNextWordBoundary ( const TIntV & src,
int &  position 
) const [inline]

Definition at line 1896 of file unicode.h.

                                                                         {
                if (position < 0) { position = 0; return true; }
                size_t position_; bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }
void TUnicode::FindSentenceBoundaries ( const TIntV & src,
TBoolV & dest 
) const [inline]

Definition at line 1917 of file unicode.h.

{ ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); }
void TUnicode::FindWordBoundaries ( const TIntV & src,
TBoolV & dest 
) const [inline]

Definition at line 1902 of file unicode.h.

{ ucd.FindWordBoundaries(src, 0, src.Len(), dest); }
void TUnicode::GetAllCodecs ( TCodecBaseV & dest) const [inline]

Definition at line 1882 of file unicode.h.

                                                   {
                dest.Clr();
                for (int i = codecs.FFirstKeyId(); codecs.FNextKeyId(i); ) {
                        PCodecBase codec = codecs[i]; bool found = false;
                        for (int j = 0; j < dest.Len(); j++) if (dest[j]() == codec()) { found = true; break; }
                        if (! found) dest.Add(codec); }}
void TUnicode::GetCaseFolded ( const TIntV & src,
TIntV & dest,
const bool  full = true 
) const [inline]

Definition at line 1984 of file unicode.h.

{ return ucd.GetCaseFolded(src, dest, true, full, false); }
const char* TUnicode::GetCharName ( const int  cp) const [inline]

Definition at line 2019 of file unicode.h.

{ return ucd.GetCharName(cp); }
TStr TUnicode::GetCharNameS ( const int  cp) const [inline]

Definition at line 2020 of file unicode.h.

{ return ucd.GetCharNameS(cp); }
PCodecBase TUnicode::GetCodec ( const TStr & name) const [inline]

Definition at line 1878 of file unicode.h.

                                                    {
                TStr s = NormalizeCodecName(name);
                PCodecBase p; if (! codecs.IsKeyGetDat(s, p)) p.Clr();
                return p; }
void TUnicode::GetLowerCase ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1960 of file unicode.h.

{ ucd.GetLowerCase(src, dest, true, false, false); }
void TUnicode::GetSimpleLowerCase ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1967 of file unicode.h.

{ ucd.GetSimpleLowerCase(src, dest, true); }
void TUnicode::GetSimpleTitleCase ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1969 of file unicode.h.

{ ucd.GetSimpleTitleCase(src, dest, true); }
void TUnicode::GetSimpleUpperCase ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1968 of file unicode.h.

{ ucd.GetSimpleUpperCase(src, dest, true); }
TUniChSubCategory TUnicode::GetSubCat ( const int  cp) const [inline]

Definition at line 2016 of file unicode.h.

{ return ucd.GetSubCat(cp); }
void TUnicode::GetTitleCase ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1962 of file unicode.h.

{ ucd.GetTitleCase(src, dest, true, false, false); }
void TUnicode::GetUpperCase ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1961 of file unicode.h.

{ ucd.GetUpperCase(src, dest, true, false, false); }
TStr TUnicode::GetUtf8CaseFolded ( const TStr & s) const [inline]

Definition at line 1989 of file unicode.h.

                                                    {
                bool isAscii = true;
                for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; }
                if (isAscii) return s.GetLc();
                TIntV src; DecodeUtf8(s, src);
                TIntV dest; GetCaseFolded(src, dest);
                return EncodeUtf8Str(dest); }
void TUnicode::Init ( ) [inline]

Definition at line 1778 of file unicode.h.

{ InitCodecs(); }

void TUnicode::InitCodecs ( )

Definition at line 1687 of file unicode.cpp.

{
        ClrCodecs();
        RegisterCodec("ISO-8859-1 ISO_8859-1 ISO_8859-1:1987 ISO-IR-100 CP819 IBM819 LATIN1 L1 csISOLatin1 ISO8859-1 ISO8859_1 CP28591", TCodecBase::New<TCodec_ISO8859_1>());
        RegisterCodec("ISO-8859-2 ISO_8859-2 ISO_8859-2:1987 ISO-IR-101 LATIN2 L2 csISOLatin2 ISO8859-2 ISO8859_2 CP28592", TCodecBase::New<TCodec_ISO8859_2>());
        RegisterCodec("ISO-8859-3 ISO_8859-3 ISO_8859-3:1988 ISO-IR-109 LATIN3 L3 csISOLatin3 ISO8859-3 ISO8859_3 CP28593", TCodecBase::New<TCodec_ISO8859_3>());
        RegisterCodec("ISO-8859-4 ISO_8859-4 ISO_8859-4:1988 ISO-IR-110 LATIN4 L4 csISOLatin4 ISO8859-4 ISO8859_4 CP28594", TCodecBase::New<TCodec_ISO8859_4>());
        RegisterCodec("YUASCII YU-ASCII YU_ASCII", TCodecBase::New<TCodec_YuAscii>());
        RegisterCodec("CP1250 Windows-1250 MS-EE", TCodecBase::New<TCodec_CP1250>());
        RegisterCodec("CP852 cp852_DOSLatin2 DOSLatin2", TCodecBase::New<TCodec_CP852>());
        RegisterCodec("CP437 cp437_DOSLatinUS DOSLatinUS", TCodecBase::New<TCodec_CP437>());
}
static TStr TUnicode::NormalizeCodecName ( const TStr & name) [inline, static, protected]

Definition at line 1865 of file unicode.h.

                                                                {
                TStr s = name.GetLc(); s.ChangeStrAll("_", ""); s.ChangeStrAll("-", ""); return s; }
void TUnicode::RegisterCodec ( const TStr & nameList,
const PCodecBase & codec 
) [inline]

Definition at line 1868 of file unicode.h.

                                                                          {
                TStrV names; nameList.SplitOnWs(names);
                for (int i = 0; i < names.Len(); i++)
                        codecs.AddDat(NormalizeCodecName(names[i]), codec); }
void TUnicode::ToCaseFolded ( TIntV & src) const [inline]

Definition at line 1987 of file unicode.h.

{ return ucd.ToCaseFolded(src, false); }
void TUnicode::ToSimpleLowerCase ( TIntV & src) const [inline]

Definition at line 1973 of file unicode.h.

void TUnicode::ToSimpleTitleCase ( TIntV & src) const [inline]

Definition at line 1974 of file unicode.h.

void TUnicode::ToSimpleUpperCase ( TIntV & src) const [inline]

Definition at line 1972 of file unicode.h.

void TUnicode::UnregisterCodec ( const TStr & nameList) [inline]

Definition at line 1872 of file unicode.h.

                                                   {
                TStrV names; nameList.SplitOnWs(names);
                for (int i = 0; i < names.Len(); i++)
                        codecs.DelKey(NormalizeCodecName(names[i])); }

void TUnicode::UseEnglishSentenceBoundaryExceptions ( ) [inline]

Definition at line 1920 of file unicode.h.


Member Data Documentation

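TUniCodec TUnicode::codec

Definition at line 1773 of file unicode.h.

THash<TStr, PCodecBase> TUnicode::codecs [protected]

Definition at line 1864 of file unicode.h.

TUniChDb TUnicode::ucd

Definition at line 1774 of file unicode.h.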


The documentation for this class was generated from the following files: