SNAP Library, User Reference
2012-10-15 15:06:59
SNAP, a general purpose network analysis and graph mining library
|
Go to the source code of this file.
Classes | |
class | THtmlLx |
Enumerations | |
enum | THtmlLxChTy { hlctSpace, hlctAlpha, hlctNum, hlctSym, hlctLTag, hlctRTag, hlctEof } |
enum | THtmlLxSym { hsyUndef, hsyStr, hsyNum, hsySSym, hsyUrl, hsyBTag, hsyETag, hsyMTag, hsyEof } |
enum | THtmlDocType { hdtAll, hdtStr, hdtStrNum, hdtTag, hdtA, hdtHRef, hdtUL } |
Functions | |
ClassHdTP (THtmlTok, PHtmlTok) ClassHdTP(THtmlDoc | |
void | SetUcCh (const char &UcCh, const char &LcCh) |
void | SetUcCh (const TStr &Str) |
void | SetChTy (const THtmlLxChTy &ChTy, const TStr &Str) |
void | SetEscStr (const TStr &SrcStr, const TStr &DstStr) |
THtmlLxChDef () | |
THtmlLxChDef (TSIn &SIn) | |
static PHtmlLxChDef | Load (TSIn &SIn) |
void | Save (TSOut &SOut) |
THtmlLxChDef & | operator= (const THtmlLxChDef &) |
int | GetChTy (const char &Ch) const |
bool | IsEoln (const char &Ch) const |
bool | IsWs (const char &Ch) const |
bool | IsSpace (const char &Ch) const |
bool | IsAlpha (const char &Ch) const |
bool | IsNum (const char &Ch) const |
bool | IsAlNum (const char &Ch) const |
bool | IsSym (const char &Ch) const |
bool | IsUrl (const char &Ch) const |
bool | IsUc (const char &Ch) const |
bool | IsLc (const char &Ch) const |
char | GetUc (const char &Ch) const |
char | GetLc (const char &Ch) const |
void | GetUcChA (TChA &ChA) const |
void | GetLcChA (TChA &ChA) const |
TStr | GetUcStr (const TStr &Str) const |
TStr | GetLcStr (const TStr &Str) const |
TStr | GetEscStr (const TStr &Str) const |
static PHtmlLxChDef | GetChDef () |
static THtmlLxChDef & | GetChDefRef () |
static TStr | GetCSZFromYuascii (const TChA &ChA) |
static TStr | GetCSZFromWin1250 (const TChA &ChA) |
static TStr | GetWin1250FromYuascii (const TChA &ChA) |
static TStr | GetIsoCeFromYuascii (const TChA &ChA) |
THtmlTok () | |
THtmlTok (const THtmlLxSym &_Sym) | |
THtmlTok (const THtmlLxSym &_Sym, const TStr &_Str) | |
THtmlTok (const THtmlLxSym &_Sym, const TStr &_Str, const THtmlLx::TArgNmValV &_ArgNmValV) | |
THtmlTok (TSIn &) | |
THtmlTok & | operator= (const THtmlTok &) |
THtmlLxSym | GetSym () const |
TStr | GetStr () const |
TStr | GetFullStr () const |
bool | IsArg (const TStr &ArgNm) const |
TStr | GetArg (const TStr &ArgNm) const |
TStr | GetArg (const TStr &ArgNm, const TStr &DfArgVal) const |
bool | IsUrlTok (TStr &RelUrlStr) const |
bool | IsRedirUrlTok () const |
void | SaveTxt (const PSOut &SOut, const bool &TxtMode=true) |
static bool | IsBreakTag (const TStr &TagNm) |
static bool | IsBreakTok (const PHtmlTok &Tok) |
static bool | IsHTag (const TStr &TagNm, int &HTagN) |
static PHtmlTok | GetHTok (const bool &IsBTag, const int &HTagN) |
THtmlDoc () | |
THtmlDoc (const PSIn &SIn, const THtmlDocType &Type=hdtAll, const bool &DoUc=true) | |
static PHtmlDoc | New (const PSIn &SIn, const THtmlDocType &Type=hdtAll, const bool &DoUc=true) |
THtmlDoc (TSIn &) | |
THtmlDoc & | operator= (const THtmlDoc &) |
int | GetToks () const |
PHtmlTok | GetTok (const int &TokN) const |
PHtmlTok | GetTok (const int &TokN, THtmlLxSym &Sym, TStr &Str) const |
void | AddTokV (const THtmlTokV &_TokV) |
static TStr | GetTxtLnDoc (const TStr &HtmlStr) |
static TStr | GetTxtLnDoc (const TStr &HtmlStr, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutTagsP) |
static PHtmlDoc | LoadTxt (const TStr &FNm, const THtmlDocType &Type=hdtAll, const bool &DoUc=true) |
static void | SaveHtmlToTxt (const TStr &HtmlStr, const PSOut &TxtSOut, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutToksP) |
static void | SaveHtmlToTxt (const TStr &HtmlStr, const TStr &TxtFNm, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutToksP) |
static void | SaveHtmlToXml (const TStr &HtmlStr, const PSOut &XmlSOut, const TStr &BaseUrlStr, const bool &OutTextP, const bool &OutUrlP, const bool &OutToksP, const bool &OutTagsP, const bool &OutArgsP) |
static void | SaveHtmlToXml (const TStr &HtmlStr, const TStr &XmlFNm, const TStr &BaseUrlStr, const bool &OutTextP, const bool &OutUrlP, const bool &OutToksP, const bool &OutTagsP, const bool &OutArgsP) |
static TLxSym | GetLxSym (const THtmlLxSym &HtmlLxSym, const TChA &ChA) |
static bool | _IsTagRedir (const TStr &TagStr, const TStr &ArgNm, THtmlLx &Lx, const TStr &BaseUrlStr, const TStr &RedirUrlStr) |
static TStr | GetRedirHtmlDocStr (const TStr &HtmlStr, const TStr &BaseUrlStr, const TStr &RedirUrlStr) |
THtmlHldV (const PHtmlDoc &_RefHtmlDoc, const int &HldWnLen=10) | |
THtmlHldV (TSIn &) | |
THtmlHldV & | operator= (const THtmlHldV &) |
PHtmlDoc | GetRefHtmlDoc () |
int | GetHlds () |
PHtmlDoc | GetHld (const int &HldN) |
TWebPg () | |
TWebPg (const TStrV &_UrlStrV, const TStrV &_IpNumV, const PHttpResp &_HttpResp) | |
static PWebPg | New (const TStrV &UrlStrV, const TStrV &IpNumV, const PHttpResp &HttpResp) |
static PWebPg | New (const TStrV &UrlStrV, const PHttpResp &HttpResp) |
static PWebPg | New (const TStr &UrlStr, const PHttpResp &HttpResp) |
~TWebPg () | |
TWebPg (TSIn &) | |
TWebPg & | operator= (const TWebPg &) |
int | GetUrls () const |
TStr | GetUrlStr (const int &UrlN=-1) const |
PUrl | GetUrl (const int &UrlN=-1) const |
int | GetIps () const |
TStr | GetIpNum (const int &IpN=-1) const |
PHttpResp | GetHttpResp () const |
TStr | GetHttpHdStr () const |
TStr | GetHttpBodyAsStr () const |
void | GetOutUrlV (TUrlV &OutUrlV, TUrlV &OutRedirUrlV) const |
void | GetOutUrlV (TUrlV &OutUrlV) const |
void | GetOutDescUrlStrKdV (TStrKdV &OutDescUrlStrKdV) const |
void | PutFetchMSecs (const uint64 &_FetchMSecs) |
uint64 | GetFetchMSecs () const |
void | SaveAsHttpBody (const TStr &FNm) const |
void | SaveAsHttp (const TStr &FNm) const |
bool | IsTxt () const |
Variables | |
ClassTP(THtmlLxChDef, PHtmlLxChDef) private TChV | UcChV |
TChV | LcChV |
TStrStrH | EscStrH |
static PHtmlLxChDef | ChDef |
ClassTPV(THtmlTok, PHtmlTok, THtmlTokV) private TStr | Str |
THtmlLx::TArgNmValV | ArgNmValV |
static const TStr | ATagNm = "<A>" |
static const TStr | AreaTagNm = "<AREA>" |
static const TStr | BrTagNm = "<BR>" |
static const TStr | CardTagNm = "<CARD>" |
static const TStr | CenterTagNm = "<CENTER>" |
static const TStr | FrameTagNm = "<FRAME>" |
static const TStr | H1TagNm = "<H1>" |
static const TStr | H2TagNm = "<H2>" |
static const TStr | H3TagNm = "<H3>" |
static const TStr | H4TagNm = "<H4>" |
static const TStr | H5TagNm = "<H5>" |
static const TStr | H6TagNm = "<H6>" |
static const TStr | ImgTagNm = "<IMG>" |
static const TStr | LiTagNm = "<LI>" |
static const TStr | MetaTagNm = "<META>" |
static const TStr | PTagNm = "<P>" |
static const TStr | UlTagNm = "<UL>" |
static const TStr | TitleTagNm = "<TITLE>" |
static const TStr | TitleETagNm = "</TITLE>" |
static const TStr | AltArgNm = "ALT" |
static const TStr | HRefArgNm = "HREF" |
static const TStr | SrcArgNm = "SRC" |
static const TStr | TitleArgNm = "TITLE" |
static const TStr | HttpEquivArgNm = "HTTP-EQUIV" |
ClassTP(THtmlHldV, PHtmlHldV) private THtmlDocV | HldV |
ClassTPV(TWebPg, PWebPg, TWebPgV) private TStrV | IpNumV |
PHttpResp | HttpResp |
uint64 | FetchMSecs |
enum THtmlDocType |
enum THtmlLxChTy |
enum THtmlLxSym |
bool THtmlDoc::_IsTagRedir | ( | const TStr & | TagStr, |
const TStr & | ArgNm, | ||
THtmlLx & | Lx, | ||
const TStr & | BaseUrlStr, | ||
const TStr & | RedirUrlStr | ||
) | [static] |
Definition at line 1106 of file html.cpp.
{
  // Rewrites argument ArgNm of the current begin-tag in Lx so that it goes
  // through RedirUrlStr. Returns true only if the tag was actually rewritten.
  IAssert(Lx.Sym==hsyBTag);
  // Guard: the current tag must be TagStr and must carry the argument.
  if (!((Lx.ChA==TagStr)&&(Lx.IsArg(ArgNm)))){return false;}
  // Resolve the argument value against the base url; only urls accepted by
  // IsOk(usHttp) are rewritten.
  TStr ArgValStr=Lx.GetArg(ArgNm);
  PUrl AbsUrl=TUrl::New(ArgValStr, BaseUrlStr);
  if (!AbsUrl->IsOk(usHttp)){return false;}
  // Build the redirection url from RedirUrlStr with the resolved url passed
  // under the name "url", and store it back into the tag argument.
  TStr AbsUrlStr=AbsUrl->GetUrlStr();
  PUrlEnv RedirEnv=TUrlEnv::New(RedirUrlStr, "url", AbsUrlStr);
  Lx.PutArg(ArgNm, RedirEnv->GetFullUrlStr());
  return true;
}
static PHtmlLxChDef GetChDef | ( | ) | [static] |
static THtmlLxChDef& GetChDefRef | ( | ) | [static] |
int GetChTy | ( | const char & | Ch | ) | const |
TStr THtmlLxChDef::GetCSZFromWin1250 | ( | const TChA & | ChA | ) | [static] |
Definition at line 132 of file html.cpp.
{
  // Copies ChA, replacing six byte codes (232, 200, 154, 138, 158, 142) with
  // the plain letters c, C, s, S, z, Z; every other byte is copied unchanged.
  // The two tables below are paired by position.
  static const uchar SrcChs[]={232, 200, 154, 138, 158, 142};
  static const char DstChs[]={'c', 'C', 's', 'S', 'z', 'Z'};
  const int Maps=int(sizeof(SrcChs)/sizeof(SrcChs[0]));
  TChA DstChA;
  for (int ChN=0; ChN<ChA.Len(); ChN++){
    const uchar Ch=ChA[ChN];
    // linear search is fine for a six-entry table
    int MapN=0;
    while ((MapN<Maps)&&(SrcChs[MapN]!=Ch)){MapN++;}
    if (MapN<Maps){DstChA+=DstChs[MapN];}
    else {DstChA+=Ch;}
  }
  return DstChA;
}
TStr THtmlLxChDef::GetCSZFromYuascii | ( | const TChA & | ChA | ) | [static] |
Definition at line 111 of file html.cpp.
{
  // Copies ChA, replacing the ten punctuation characters listed below with
  // plain letters (c, C, d, D, s, S, z, Z); all other characters pass through.
  TChA DstChA;
  const int Chs=ChA.Len();
  for (int ChN=0; ChN<Chs; ChN++){
    const char SrcCh=ChA[ChN];
    char DstCh=SrcCh; // default: copy unchanged
    switch (SrcCh){
      case '~': case '}': DstCh='c'; break;
      case '^': case ']': DstCh='C'; break;
      case '|': DstCh='d'; break;
      case '\\': DstCh='D'; break;
      case '{': DstCh='s'; break;
      case '[': DstCh='S'; break;
      case '`': DstCh='z'; break;
      case '@': DstCh='Z'; break;
      default: break;
    }
    DstChA+=DstCh;
  }
  return DstChA;
}
TStr THtmlLxChDef::GetEscStr | ( | const TStr & | Str | ) | const |
uint64 GetFetchMSecs | ( | ) | const |
Definition at line 375 of file html.h.
// Accessor: returns the FetchMSecs member unchanged (the value stored by PutFetchMSecs).
{return FetchMSecs;}
TStr THtmlTok::GetFullStr | ( | ) | const |
Definition at line 628 of file html.cpp.
{
  // Returns the textual form of the token:
  //  - begin-tag with arguments: tag text followed by NAME="VALUE" pairs and '>'
  //  - end-tag: "</" followed by the stored tag text without its first character
  //  - anything else: the stored string as returned by GetStr()
  const bool IsBTagWithArgs=(Sym==hsyBTag)&&(ArgNmValV.Len()>0);
  if (IsBTagWithArgs){
    TChA TagChA;
    // copy Str up to index Len()-2, i.e. without its final character if
    // GetSubStr's end index is inclusive (presumably the closing '>')
    TagChA+=Str.GetSubStr(0, Str.Len()-2);
    const int Args=ArgNmValV.Len();
    for (int ArgN=0; ArgN<Args; ArgN++){
      TagChA+=' ';
      TagChA+=ArgNmValV[ArgN].Key;
      TagChA+='=';
      TagChA+='"';
      TagChA+=ArgNmValV[ArgN].Dat;
      TagChA+='"';
    }
    TagChA+='>';
    return TagChA;
  }
  if (Sym==hsyETag){
    TChA TagChA;
    TagChA+='<';
    TagChA+='/';
    TagChA+=Str.GetSubStr(1, Str.Len()-1);
    return TagChA;
  }
  return GetStr();
}
PHtmlTok THtmlTok::GetHTok | ( | const bool & | IsBTag, |
const int & | HTagN | ||
) | [static] |
Definition at line 762 of file html.cpp.
{
  // Creates a heading token <H1>..<H6>: a begin-tag when IsBTag is true,
  // an end-tag otherwise. Any HTagN outside 1..6 triggers Fail.
  THtmlLxSym HTagSym;
  if (IsBTag){HTagSym=hsyBTag;} else {HTagSym=hsyETag;}
  TStr HTagNm;
  if (HTagN==1){HTagNm=H1TagNm;}
  else if (HTagN==2){HTagNm=H2TagNm;}
  else if (HTagN==3){HTagNm=H3TagNm;}
  else if (HTagN==4){HTagNm=H4TagNm;}
  else if (HTagN==5){HTagNm=H5TagNm;}
  else if (HTagN==6){HTagNm=H6TagNm;}
  else {Fail;}
  return PHtmlTok(new THtmlTok(HTagSym, HTagNm));
}
TStr GetHttpBodyAsStr | ( | ) | const |
Definition at line 366 of file html.h.
{
  // Convenience accessor: body of the stored http response as a string.
  PHttpResp Resp=GetHttpResp();
  return Resp->GetBodyAsStr();
}
TStr GetHttpHdStr | ( | ) | const |
Definition at line 365 of file html.h.
{
  // Convenience accessor: header string of the stored http response.
  PHttpResp Resp=GetHttpResp();
  return Resp->GetHdStr();
}
PHttpResp GetHttpResp | ( | ) | const |
TStr THtmlLxChDef::GetIsoCeFromYuascii | ( | const TChA & | ChA | ) | [static] |
Definition at line 170 of file html.cpp.
{
  // Copies ChA, replacing each of the ten punctuation characters in SrcChs
  // with the byte code at the same position in DstChs (presumably the
  // ISO-8859-2 codes of the corresponding accented letters - verify against
  // a code-page table). All other characters are copied unchanged.
  static const char SrcChs[]={'~', '^', '}', ']', '|', '\\', '{', '[', '`', '@'};
  static const uchar DstChs[]={232, 200, 230, 198, 240, 208, 185, 169, 190, 174};
  const int Maps=int(sizeof(SrcChs)/sizeof(SrcChs[0]));
  TChA DstChA;
  for (int ChN=0; ChN<ChA.Len(); ChN++){
    const char Ch=ChA[ChN];
    // linear search is fine for a ten-entry table
    int MapN=0;
    while ((MapN<Maps)&&(SrcChs[MapN]!=Ch)){MapN++;}
    if (MapN<Maps){DstChA+=DstChs[MapN];}
    else {DstChA+=Ch;}
  }
  return DstChA;
}
char GetLc | ( | const char & | Ch | ) | const |
TLxSym THtmlDoc::GetLxSym | ( | const THtmlLxSym & | HtmlLxSym, |
const TChA & | ChA | ||
) | [static] |
void TWebPg::GetOutDescUrlStrKdV | ( | TStrKdV & | OutDescUrlStrKdV | ) | const |
Definition at line 1258 of file html.cpp.
{
  // Fills OutDescUrlStrKdV with (description, url) pairs, one per <A> begin-tag
  // of the page body whose url resolves successfully. The description is the
  // space-joined text of the string/number/symbol tokens up to the closing </A>.
  OutDescUrlStrKdV.Clr();
  // page url (base for relative links) and page body
  TStr PgUrlStr=GetUrlStr();
  TStr BodyStr=GetHttpBodyAsStr();
  // tokenize the body
  PSIn BodySIn=TStrIn::New(BodyStr);
  PHtmlDoc HtmlDoc=THtmlDoc::New(BodySIn);
  PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr;
  const int Toks=HtmlDoc->GetToks();
  int TokN=0;
  while (TokN<Toks){
    Tok=HtmlDoc->GetTok(TokN, TokSym, TokStr); TokN++;
    // only <A> begin-tags are of interest
    if (!((TokSym==hsyBTag)&&(TokStr==THtmlTok::ATagNm))){continue;}
    TStr RelUrlStr;
    if (!Tok->IsUrlTok(RelUrlStr)){continue;}
    PUrl Url=TUrl::New(RelUrlStr, PgUrlStr);
    if (!Url->IsOk()){continue;}
    // gather the anchor text; the scan consumes tokens from the outer loop
    TChA DescChA;
    while (TokN<Toks){
      Tok=HtmlDoc->GetTok(TokN, TokSym, TokStr); TokN++;
      if ((TokSym==hsyETag)&&(TokStr==THtmlTok::ATagNm)){break;}
      const bool IsTxtTok=(TokSym==hsyStr)||(TokSym==hsyNum)||(TokSym==hsySSym);
      if (IsTxtTok){
        if (!DescChA.Empty()){DescChA+=' ';}
        DescChA+=TokStr;
      }
    }
    OutDescUrlStrKdV.Add(TStrKd(DescChA, Url->GetUrlStr()));
  }
}
void TWebPg::GetOutUrlV | ( | TUrlV & | OutUrlV, |
TUrlV & | OutRedirUrlV | ||
) | const |
Definition at line 1230 of file html.cpp.
{
  // Collects the outgoing urls of the page body.
  //   OutUrlV      - receives every url taken from a begin-tag for which
  //                  IsUrlTok succeeds and that passes IsOk(usHttp)
  //   OutRedirUrlV - receives the subset whose token reports IsRedirUrlTok()
  // Both vectors are cleared first.
  OutUrlV.Clr(); OutRedirUrlV.Clr();
  // page url (base for resolving relative links) and page body
  TStr UrlStr=GetUrlStr();
  TStr HtmlStr=GetHttpBodyAsStr();
  // tokenize the body
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
  // traverse tokens (the formerly declared outer 'Tok' was never used: it was
  // shadowed by the loop-local variable, so it has been removed)
  for (int TokN=0; TokN<HtmlDoc->GetToks(); TokN++){
    PHtmlTok Tok=HtmlDoc->GetTok(TokN);
    if (Tok->GetSym()!=hsyBTag){continue;}
    TStr RelUrlStr;
    if (!Tok->IsUrlTok(RelUrlStr)){continue;}
    PUrl Url=TUrl::New(RelUrlStr, UrlStr);
    if (!Url->IsOk(usHttp)){continue;}
    OutUrlV.Add(Url);
    if (Tok->IsRedirUrlTok()){OutRedirUrlV.Add(Url);}
  }
}
void GetOutUrlV | ( | TUrlV & | OutUrlV | ) | const |
Definition at line 369 of file html.h.
{
  // Single-vector overload: forwards to the two-vector version and discards
  // the redirection urls.
  TUrlV DiscardedRedirUrlV;
  GetOutUrlV(OutUrlV, DiscardedRedirUrlV);
}
TStr THtmlDoc::GetRedirHtmlDocStr | ( | const TStr & | HtmlStr, |
const TStr & | BaseUrlStr, | ||
const TStr & | RedirUrlStr | ||
) | [static] |
Definition at line 1126 of file html.cpp.
{
  // Re-emits HtmlStr symbol by symbol; the HREF argument of <A>/<AREA> and the
  // SRC argument of <FRAME>/<IMG> begin-tags are rewritten by _IsTagRedir.
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  TMOut RedirSOut;
  THtmlLx Lx(HtmlSIn);
  while (Lx.GetSym()!=hsyEof){
    // keep the white-space that preceded the symbol
    RedirSOut.PutStr(Lx.PreSpaceChA);
    // _IsTagRedir asserts a begin-tag, so it is only tried for begin-tags;
    // the || chain stops at the first tag kind that was rewritten
    bool IsRewritten=false;
    if (Lx.Sym==hsyBTag){
      IsRewritten=
       (_IsTagRedir(THtmlTok::ATagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr))||
       (_IsTagRedir(THtmlTok::AreaTagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr))||
       (_IsTagRedir(THtmlTok::FrameTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr))||
       (_IsTagRedir(THtmlTok::ImgTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr));
    }
    if (IsRewritten){
      // a rewritten tag is emitted from its (updated) arguments
      RedirSOut.PutStr(Lx.GetFullBTagStr());
    } else {
      // everything else is copied as lexed
      RedirSOut.PutStr(Lx.SymChA());
    }
  }
  return RedirSOut.GetAsStr();
}
PHtmlDoc GetRefHtmlDoc | ( | ) |
THtmlLxSym GetSym | ( | ) | const |
PHtmlTok GetTok | ( | const int & | TokN | ) | const |
PHtmlTok GetTok | ( | const int & | TokN, |
THtmlLxSym & | Sym, | ||
TStr & | Str | ||
) | const |
static TStr GetTxtLnDoc | ( | const TStr & | HtmlStr | ) | [static] |
static TStr GetTxtLnDoc | ( | const TStr & | HtmlStr, |
const TStr & | BaseUrlStr, | ||
const bool & | OutUrlP, | ||
const bool & | OutTagsP | ||
) | [static] |
char GetUc | ( | const char & | Ch | ) | const |
TStr TLxChDef::GetUcStr | ( | const TStr & | Str | ) | const |
PUrl GetUrl | ( | const int & | UrlN = -1 | ) | const |
TStr THtmlLxChDef::GetWin1250FromYuascii | ( | const TChA & | ChA | ) | [static] |
Definition at line 149 of file html.cpp.
{
  // Copies ChA, replacing the ten punctuation characters listed below. Six of
  // them map to byte codes above 127 (232, 200, 154, 138, 158, 142), the other
  // four to the plain letters c, C, d, D. All other characters pass through.
  TChA DstChA;
  const int Chs=ChA.Len();
  for (int ChN=0; ChN<Chs; ChN++){
    const char SrcCh=ChA[ChN];
    char DstCh=SrcCh; // default: copy unchanged
    switch (SrcCh){
      case '~': DstCh=char(uchar(232)); break;
      case '^': DstCh=char(uchar(200)); break;
      case '}': DstCh='c'; break;
      case ']': DstCh='C'; break;
      case '|': DstCh='d'; break;
      case '\\': DstCh='D'; break;
      case '{': DstCh=char(uchar(154)); break;
      case '[': DstCh=char(uchar(138)); break;
      case '`': DstCh=char(uchar(158)); break;
      case '@': DstCh=char(uchar(142)); break;
      default: break;
    }
    DstChA+=DstCh;
  }
  return DstChA;
}
bool IsAlNum | ( | const char & | Ch | ) | const |
bool IsAlpha | ( | const char & | Ch | ) | const |
bool IsArg | ( | const TStr & | ArgNm | ) | const |
Definition at line 203 of file html.h.
{
  // True when ArgNmValV contains an entry matching ArgNm
  // (SearchForw reports -1 when nothing is found).
  const int ArgN=ArgNmValV.SearchForw(TStrKd(ArgNm));
  return ArgN!=-1;
}
bool THtmlTok::IsBreakTag | ( | const TStr & | TagNm | ) | [static] |
Definition at line 726 of file html.cpp.
{
  // Returns true when TagNm is one of the tags registered below as a "break"
  // tag. The lookup table is built on the first call and reused afterwards.
  // The original list registered "<HEAD>" twice; the duplicate is dropped.
  // NOTE(review): the lazy initialization is not synchronized - confirm that
  // the first call cannot happen concurrently from several threads.
  static const char* const BreakTagNms[]={
    "<H1>", "<H2>", "<H3>", "<H4>", "<H5>", "<H6>",
    "<BR>", "<HR>", "<P>",
    "<DL>", "<UL>", "<OL>", "<LI>", "<DT>", "<DD>",
    "<HEAD>", "<TITLE>", "<META>", "<SCRIPT>", "<BODY>"};
  static TStrH BreakTagNmH(50);
  if (BreakTagNmH.Len()==0){
    const int Tags=int(sizeof(BreakTagNms)/sizeof(BreakTagNms[0]));
    for (int TagN=0; TagN<Tags; TagN++){
      BreakTagNmH.AddKey(TStr(BreakTagNms[TagN]));
    }
  }
  return BreakTagNmH.IsKey(TagNm);
}
bool THtmlTok::IsBreakTok | ( | const PHtmlTok & | Tok | ) | [static] |
bool IsEoln | ( | const char & | Ch | ) | const |
bool THtmlTok::IsHTag | ( | const TStr & | TagNm, |
int & | HTagN | ||
) | [static] |
bool IsLc | ( | const char & | Ch | ) | const |
bool IsNum | ( | const char & | Ch | ) | const |
bool THtmlTok::IsRedirUrlTok | ( | ) | const |
bool IsSpace | ( | const char & | Ch | ) | const |
bool IsSym | ( | const char & | Ch | ) | const |
bool TWebPg::IsTxt | ( | ) | const |
Definition at line 1310 of file html.cpp.
{
  // Heuristic text detection: the response must either carry no content type
  // or a text content type, and more than 90% of the first (at most) 100 body
  // characters must be printable ASCII, tab, line-feed or carriage-return.
  //
  // Fix: the ratio used to be computed as PrintChs/(ChN+1). The extra 1 in the
  // denominator meant that an all-printable body shorter than 10 characters
  // could never exceed 0.9 and was reported as non-text. The ratio is now
  // taken over the characters actually examined; an empty body still yields
  // false, as before.
  if (HttpResp->IsContType()&&(!HttpResp->IsContType(THttp::TextFldVal))){
    return false;
  }
  const int MxChkChs=100;      // number of leading characters inspected
  const double MnPrintPrb=0.9; // required share of printable characters
  TStr Str=HttpResp->GetBodyAsStr();
  const int StrLen=Str.Len();
  const int ChkChs=(StrLen<MxChkChs) ? StrLen : MxChkChs;
  if (ChkChs==0){return false;} // nothing to examine; also avoids dividing by zero
  int PrintChs=0;
  for (int ChN=0; ChN<ChkChs; ChN++){
    const char Ch=Str[ChN];
    if (((' '<=Ch)&&(Ch<='~'))||(Ch==TCh::TabCh)||(Ch==TCh::LfCh)||(Ch==TCh::CrCh)){
      PrintChs++;
    }
  }
  const double PrintPrb=double(PrintChs)/double(ChkChs);
  return PrintPrb>MnPrintPrb;
}
bool IsUc | ( | const char & | Ch | ) | const |
bool IsUrl | ( | const char & | Ch | ) | const |
bool THtmlTok::IsUrlTok | ( | TStr & | RelUrlStr | ) | const |
Definition at line 648 of file html.cpp.
{
  // Tests whether this token is a begin-tag that refers to a url and, if so,
  // stores the (possibly relative) url in RelUrlStr:
  //   <A>/<AREA> with HREF, <FRAME>/<IMG> with SRC,
  //   <META HTTP-EQUIV="REFRESH" CONTENT="...URL=...">
  if (GetSym()!=hsyBTag){return false;}
  TStr TagNm=GetStr();
  // link-like tags: the url is the HREF value
  if (((TagNm==ATagNm)||(TagNm==AreaTagNm))&&(IsArg(HRefArgNm))){
    RelUrlStr=GetArg(HRefArgNm);
    return true;
  }
  // embedding tags: the url is the SRC value
  if (((TagNm==FrameTagNm)||(TagNm==ImgTagNm))&&(IsArg(SrcArgNm))){
    RelUrlStr=GetArg(SrcArgNm);
    return true;
  }
  // meta refresh: the url follows "URL=" inside the CONTENT value
  if ((TagNm==MetaTagNm)&&(IsArg(HttpEquivArgNm))){
    TStr HttpEquivArgVal=GetArg(HttpEquivArgNm).GetUc();
    if (!((HttpEquivArgVal=="REFRESH")&&IsArg("CONTENT"))){return false;}
    TStr ContentStr=GetArg("CONTENT");
    TStr LeftStr; TStr RightStr; TStr UrlEqStr="URL=";
    // the split runs on an upper-cased copy; only the length of the left
    // part is used, so the url itself is cut from the original-case string
    ContentStr.GetUc().SplitOnStr(LeftStr, UrlEqStr, RightStr);
    const int UrlBChN=LeftStr.Len()+UrlEqStr.Len();
    RelUrlStr=ContentStr.GetSubStr(UrlBChN, ContentStr.Len());
    return !RelUrlStr.Empty();
  }
  return false;
}
bool IsWs | ( | const char & | Ch | ) | const |
Definition at line 34 of file html.h.
{
  // White-space test: space, tab, carriage-return or line-feed.
  if (Ch==' '){return true;}
  if (Ch==TCh::TabCh){return true;}
  if (Ch==TCh::CrCh){return true;}
  return Ch==TCh::LfCh;
}
static PHtmlLxChDef Load | ( | TSIn & | SIn | ) | [static] |
Definition at line 25 of file html.h.
// Factory used for loading: heap-allocates a THtmlLxChDef constructed from the
// input stream SIn and returns it (the member index declares the return type
// of Load as PHtmlLxChDef).
{return new THtmlLxChDef(SIn);}
static PHtmlDoc LoadTxt | ( | const TStr & | FNm, |
const THtmlDocType & | Type = hdtAll , |
||
const bool & | DoUc = true |
||
) | [static] |
static PHtmlDoc New | ( | const PSIn & | SIn, |
const THtmlDocType & | Type = hdtAll , |
||
const bool & | DoUc = true |
||
) | [static] |
THtmlLxChDef& operator= | ( | const THtmlLxChDef & | ) |
void PutFetchMSecs | ( | const uint64 & | _FetchMSecs | ) |
Definition at line 374 of file html.h.
// Mutator: stores _FetchMSecs in the FetchMSecs member; no validation is performed.
{FetchMSecs=_FetchMSecs;}
void TWebPg::SaveAsHttp | ( | const TStr & | FNm | ) | const |
Definition at line 1303 of file html.cpp.
{
  // Writes the stored http response to file FNm via HttpResp->SaveTxt.
  PSOut FileSOut=TFOut::New(FNm); // output file
  HttpResp->SaveTxt(FileSOut);
}
void TWebPg::SaveAsHttpBody | ( | const TStr & | FNm | ) | const |
Definition at line 1296 of file html.cpp.
{
  // Writes only the body of the stored http response to file FNm
  // via HttpResp->SaveBody.
  PSOut FileSOut=TFOut::New(FNm); // output file
  HttpResp->SaveBody(FileSOut);
}
static void SaveHtmlToTxt | ( | const TStr & | HtmlStr, |
const PSOut & | TxtSOut, | ||
const TStr & | BaseUrlStr, | ||
const bool & | OutUrlP, | ||
const bool & | OutToksP | ||
) | [static] |
static void SaveHtmlToTxt | ( | const TStr & | HtmlStr, |
const TStr & | TxtFNm, | ||
const TStr & | BaseUrlStr, | ||
const bool & | OutUrlP, | ||
const bool & | OutToksP | ||
) | [static] |
static void SaveHtmlToXml | ( | const TStr & | HtmlStr, |
const PSOut & | XmlSOut, | ||
const TStr & | BaseUrlStr, | ||
const bool & | OutTextP, | ||
const bool & | OutUrlP, | ||
const bool & | OutToksP, | ||
const bool & | OutTagsP, | ||
const bool & | OutArgsP | ||
) | [static] |
static void SaveHtmlToXml | ( | const TStr & | HtmlStr, |
const TStr & | XmlFNm, | ||
const TStr & | BaseUrlStr, | ||
const bool & | OutTextP, | ||
const bool & | OutUrlP, | ||
const bool & | OutToksP, | ||
const bool & | OutTagsP, | ||
const bool & | OutArgsP | ||
) | [static] |
void SetChTy | ( | const THtmlLxChTy & | ChTy, |
const TStr & | Str | ||
) |
void THtmlLxChDef::SetEscStr | ( | const TStr & | SrcStr, |
const TStr & | DstStr | ||
) |
void THtmlLxChDef::SetUcCh | ( | const char & | UcCh, |
const char & | LcCh | ||
) |
Definition at line 3 of file html.cpp.
{
  // Registers UcCh as the upper-case form of LcCh and, when still possible,
  // LcCh as the lower-case form of UcCh. Both tables are indexed by the
  // character code offset from TCh::Mn.
  const int LcChN=LcCh-TCh::Mn;
  const int UcChN=UcCh-TCh::Mn;
  // upper-case table: several lower-case characters may share one upper-case
  // character, but a slot may only be overwritten while it is still unset or
  // maps the character to itself
  IAssert((UcChV[LcChN]==TCh(0))||(UcChV[LcChN]==TCh(LcCh)));
  UcChV[LcChN]=TCh(UcCh);
  // lower-case table: an upper-case character keeps the first lower-case
  // character assigned to it; later assignments are silently ignored
  const bool IsLcSlotFree=(LcChV[UcChN]==TCh(0))||(LcChV[UcChN]==TCh(UcCh));
  if (IsLcSlotFree){
    LcChV[UcChN]=TCh(LcCh);
  }
}
void TLxChDef::SetUcCh | ( | const TStr & | Str | ) |
THtmlDoc::THtmlDoc | ( | const PSIn & | SIn, |
const THtmlDocType & | Type = hdtAll , |
||
const bool & | DoUc = true |
||
) |
Definition at line 779 of file html.cpp.
: TokV(1000, 0){
  // Lexes SIn and stores the tokens selected by Type in TokV; an end-of-file
  // token is always appended last. DoUc is forwarded to Lx.GetTok.
  THtmlLx Lx(SIn);
  bool InUL=false; // hdtUL only: true between <UL> and </UL>
  while (Lx.GetSym()!=hsyEof){
    const bool IsBTag=(Lx.Sym==hsyBTag);
    const bool IsETag=(Lx.Sym==hsyETag);
    bool MkTok=false;
    if (Type==hdtAll){
      MkTok=true;
    } else if (Type==hdtStr){
      MkTok=(Lx.Sym==hsyStr);
    } else if (Type==hdtStrNum){
      MkTok=(Lx.Sym==hsyStr)||(Lx.Sym==hsyNum);
    } else if (Type==hdtTag){
      MkTok=IsBTag||IsETag;
    } else if (Type==hdtA){
      MkTok=IsBTag&&(Lx.UcChA==THtmlTok::ATagNm);
    } else if (Type==hdtHRef){
      // begin-tags that may carry a url
      MkTok=IsBTag&&
       ((Lx.UcChA==THtmlTok::ATagNm)||(Lx.UcChA==THtmlTok::AreaTagNm)||
        (Lx.UcChA==THtmlTok::FrameTagNm)||(Lx.UcChA==THtmlTok::ImgTagNm)||
        (Lx.UcChA==THtmlTok::MetaTagNm));
    } else if (Type==hdtUL){
      // the order matters: <UL> switches collection on before the decision,
      // </UL> switches it off after, so both delimiting tags are kept
      if (IsBTag&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=true;}
      MkTok=InUL;
      if (IsETag&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=false;}
    } else {
      Fail;
    }
    if (MkTok){TokV.Add(Lx.GetTok(DoUc));}
  }
  TokV.Add(PHtmlTok(new THtmlTok(hsyEof)));
}
THtmlHldV::THtmlHldV | ( | const PHtmlDoc & | _RefHtmlDoc, |
const int & | HldWnLen = 10 |
||
) |
Definition at line 1148 of file html.cpp.
: RefHtmlDoc(_RefHtmlDoc), HldV(){
  // Builds one small html-document per <A> begin-tag of the reference
  // document. Each document consists of: the current <TITLE> tokens, the
  // currently collected <H1>..<H6> tokens, the text before the anchor, the
  // anchor itself and the text after the anchor.
  //
  // Fix: the "before" and "after" loops used to test ATokV.Len() (the anchor's
  // own tokens) against HldWnLen, so the window length never limited the
  // context being collected. Each loop now tests the vector it fills.
  bool IsTitleAct=false; THtmlTokV TitleTokV; // tokens of the open/last <TITLE>
  bool IsHAct=false; int ActHTagN=-1;         // open heading and its level (1..6)
  TVec<THtmlTokV> HTokV(6);                   // heading tokens, one slot per level
  PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr;
  for (int TokN=0; TokN<RefHtmlDoc->GetToks(); TokN++){
    Tok=RefHtmlDoc->GetTok(TokN, TokSym, TokStr);
    if ((TokSym==hsyBTag)&&(TokStr==THtmlTok::ATagNm)){
      // collect tokens before, inside and after <a> ... </a> tags
      int ATokN; PHtmlTok ATok; THtmlLxSym ATokSym; TStr ATokStr;
      // inside <A> tags: everything except symbol tokens, up to and
      // including the closing </A> (or the end of the document)
      THtmlTokV ATokV; ATokN=TokN;
      forever{
        ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
        if (ATokSym!=hsySSym){ATokV.Add(ATok);}
        if ((ATokSym==hsyETag)&&(ATokStr==THtmlTok::ATagNm)){break;}
        ATokN++;
        if (ATokN>=RefHtmlDoc->GetToks()){break;}
      }
      // NOTE(review): the "after" loop below increments before reading, so
      // starting from ATokN+1 skips the token directly behind </A> - confirm
      // whether that is intended. Left unchanged here.
      int ETagATokN=ATokN+1;
      // before <A> tags: walk backwards until a break token, the start of the
      // document or a full window; tokens are stored nearest-first
      THtmlTokV PrevATokV; ATokN=TokN;
      forever{
        ATokN--;
        if (ATokN<0){break;}
        ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
        if (THtmlTok::IsBreakTok(ATok)){break;}
        if ((ATokSym==hsyStr)||(ATokSym==hsyNum)){PrevATokV.Add(ATok);}
        if (PrevATokV.Len()>=HldWnLen){break;}
      }
      // after <A> tags: walk forwards until a break token, the end of the
      // document or a full window
      THtmlTokV NextATokV; ATokN=ETagATokN;
      forever{
        ATokN++;
        if (ATokN>=RefHtmlDoc->GetToks()){break;}
        ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
        if (THtmlTok::IsBreakTok(ATok)){break;}
        if ((ATokSym==hsyStr)||(ATokSym==hsyNum)){NextATokV.Add(ATok);}
        if (NextATokV.Len()>=HldWnLen){break;}
      }
      // construct html-document with hyper-link context
      PHtmlDoc HtmlDoc=PHtmlDoc(new THtmlDoc());
      HtmlDoc->AddTokV(TitleTokV);
      for (int HTagN=1; HTagN<=6; HTagN++){HtmlDoc->AddTokV(HTokV[HTagN-1]);}
      HtmlDoc->AddTokV(PrevATokV);
      HtmlDoc->AddTokV(ATokV);
      HtmlDoc->AddTokV(NextATokV);
      HldV.Add(HtmlDoc);
      // NOTE(review): every constructed document is also written to standard
      // output; this looks like leftover debugging - kept to preserve behavior.
      HtmlDoc->SaveTxt(TSOut::StdOut);
    } else if (TokSym==hsyBTag){
      int HTagN;
      if (TokStr==THtmlTok::TitleTagNm){
        // a new title replaces whatever was collected before
        IsTitleAct=true; TitleTokV.Clr(); TitleTokV.Add(Tok);
      } else if (THtmlTok::IsHTag(TokStr, HTagN)){
        if (IsHAct){ // conclude previous <H?> tag if left open
          HTokV[ActHTagN-1].Add(THtmlTok::GetHTok(false, ActHTagN));
        }
        IsHAct=true; ActHTagN=HTagN;
        // a new heading invalidates its own level and all deeper levels
        {for (int ClrHTagN=ActHTagN; ClrHTagN<=6; ClrHTagN++){
          HTokV[ClrHTagN-1].Clr();}}
        HTokV[ActHTagN-1].Add(Tok);
      }
    } else if (TokSym==hsyETag){
      int HTagN;
      if (TokStr==THtmlTok::TitleTagNm){
        if (IsTitleAct){TitleTokV.Add(Tok); IsTitleAct=false;}
      } else if (THtmlTok::IsHTag(TokStr, HTagN)){
        if (IsHAct){HTokV[ActHTagN-1].Add(Tok); IsHAct=false;}
      }
    } else if (TokSym!=hsySSym){
      // plain content goes to whichever title/heading is currently open
      if (IsTitleAct){TitleTokV.Add(Tok);}
      if (IsHAct){HTokV[ActHTagN-1].Add(Tok);}
    }
  }
}
THtmlLxChDef::THtmlLxChDef | ( | ) |
Definition at line 48 of file html.cpp.
: ChTyV(TCh::Vals), UcChV(TCh::Vals), LcChV(TCh::Vals), EscStrH(100){
  // Default character definition for the html lexer: fills the character-type
  // table, the upper/lower-case tables and the escape-sequence table.
  //
  // Fix: in this listing the escape-sequence keys had been entity-decoded
  // (e.g. the key for the double quote read """), which is not valid C++ and
  // made key and value identical for '&', '<' and '>'. The keys are restored
  // to their entity spelling. The standard "&AElig;" spelling is registered in
  // addition to the legacy key "&Aelig", which is kept for compatibility.

  // Character-Types
  ChTyV.PutAll(TInt(hlctSpace));
  SetChTy(hlctAlpha, "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
  SetChTy(hlctAlpha, "abcdefghijklmnopqrstuvwxyz");
  SetChTy(hlctAlpha, "@_");
  SetChTy(hlctNum, "0123456789");
  SetChTy(hlctSym, "`~!#$%^&*()-=+[{]}\\|;:'\",<.>/?");
  // '<' and '>' were part of the symbol set above; the later calls win
  SetChTy(hlctLTag, "<");
  SetChTy(hlctRTag, ">");
  SetChTy(hlctEof, TStr(TCh::EofCh));
  // every character outside 0..127 is treated as alphabetic
  for (int Ch=TCh::Mn; Ch<=TCh::Mx; Ch++){
    if ((Ch<0)||(127<Ch)){SetChTy(hlctAlpha, TStr(TCh(char(Ch))));}}
  //SetChTy(hlctSpace, TStr(TCh(char(160))));

  // Upper-Case
  // start with every character mapped to itself, then register the pairs
  {for (int Ch=TCh::Mn; Ch<=TCh::Mx; Ch++){
    SetUcCh(char(Ch), char(Ch));}}
  SetUcCh("Aa"); SetUcCh("\xc0\xe0"); SetUcCh("\xc1\xe1"); SetUcCh("\xc2\xe2");
  SetUcCh("\xc3\xe3"); SetUcCh("\xc4\xe4"); SetUcCh("\xc5\xe5"); SetUcCh("\xc6\xe6");
  SetUcCh("Bb"); SetUcCh("Cc"); SetUcCh("\xc7\xe7"); SetUcCh("Dd");
  SetUcCh("\xd0\xf0"); SetUcCh("Ee"); SetUcCh("\xc8\xe8"); SetUcCh("\xc9\xe9");
  SetUcCh("\xca\xea"); SetUcCh("\xcb\xeb"); SetUcCh("Ff"); SetUcCh("Gg");
  SetUcCh("Hh"); SetUcCh("Ii"); SetUcCh("\xcc\xec"); SetUcCh("\xcd\xed");
  SetUcCh("\xce\xee"); SetUcCh("\xcf\xef"); SetUcCh("Jj"); SetUcCh("Kk");
  SetUcCh("Ll"); SetUcCh("Mm"); SetUcCh("Nn"); SetUcCh("\xd1\xf1");
  SetUcCh("Oo"); SetUcCh("\xd2\xf2"); SetUcCh("\xd3\xf3"); SetUcCh("\xd4\xf4");
  SetUcCh("\xd5\xf5"); SetUcCh("\xd6\xf6"); SetUcCh("\xd8\xf8"); SetUcCh("Pp");
  SetUcCh("Qq"); SetUcCh("Rr"); SetUcCh("Ss"); SetUcCh("\x8a\x9a");
  SetUcCh("Tt"); SetUcCh("Uu"); SetUcCh("\xd9\xf9"); SetUcCh("\xda\xfa");
  SetUcCh("\xdb\xfb"); SetUcCh("\xdc\xfc"); SetUcCh("Vv"); SetUcCh("Ww");
  SetUcCh("Xx"); SetUcCh("Yy\xff"); SetUcCh("\xdd\xfd"); SetUcCh("Zz");
  SetUcCh("\x8e\x9e");
  // ISO-CE
  //SetUcCh(uchar(169), uchar(185)); /*Sh - \xa9\xb9*/
  //SetUcCh(uchar(174), uchar(190)); /*Zh - \xae\xbe*/
  //SetUcCh(uchar(200), uchar(232)); /*Ch - \xc8\xe8*/
  //SetUcCh(uchar(198), uchar(230)); /*Cs - \xc6\xe6*/
  //SetUcCh(uchar(208), uchar(240)); /*Dz - \xd0\xf0*/

  // Annoying Unicode-characters
  //SetChTy(hlctSpace, "\xc2\xef");

  // Escape-Sequences
  SetEscStr("&quot;", "\""); SetEscStr("&amp;", "&");
  SetEscStr("&lt;", "<"); SetEscStr("&gt;", ">");
  SetEscStr("&nbsp;", " ");

  SetEscStr("&auml;", "\xe4"); SetEscStr("&Auml;", "\xc4");
  SetEscStr("&ouml;", "\xf6"); SetEscStr("&Ouml;", "\xd6");
  SetEscStr("&uuml;", "\xfc"); SetEscStr("&Uuml;", "\xdc");
  SetEscStr("&aring;", "\xe5"); SetEscStr("&Aring;", "\xc5");
  SetEscStr("&oslash;", "\xf8"); SetEscStr("&Oslash;", "\xd8");
  // legacy key (no ';', non-standard case) kept as-is; standard form added
  SetEscStr("&Aelig", "\xc6"); SetEscStr("&AElig;", "\xc6");
  SetEscStr("&aelig;", "\xe6");

  // accented e/a are reduced to their plain ASCII letters
  SetEscStr("&eacute;", "e"); SetEscStr("&Eacute;", "E");
  SetEscStr("&egrave;", "e"); SetEscStr("&Egrave;", "E");
  SetEscStr("&agrave;", "a"); SetEscStr("&Agrave;", "A");
}
THtmlLxChDef | ( | TSIn & | SIn | ) |
THtmlTok | ( | const THtmlLxSym & | _Sym | ) |
THtmlTok | ( | const THtmlLxSym & | _Sym, |
const TStr & | _Str | ||
) |
THtmlTok | ( | const THtmlLxSym & | _Sym, |
const TStr & | _Str, | ||
const THtmlLx::TArgNmValV & | _ArgNmValV | ||
) |
const TStr THtmlTok::AltArgNm = "ALT" [static] |
const TStr THtmlTok::AreaTagNm = "<AREA>" [static] |
const TStr THtmlTok::ATagNm = "<A>" [static] |
const TStr THtmlTok::BrTagNm = "<BR>" [static] |
const TStr THtmlTok::CardTagNm = "<CARD>" [static] |
const TStr THtmlTok::CenterTagNm = "<CENTER>" [static] |
const TStr THtmlTok::FrameTagNm = "<FRAME>" [static] |
const TStr THtmlTok::H1TagNm = "<H1>" [static] |
const TStr THtmlTok::H2TagNm = "<H2>" [static] |
const TStr THtmlTok::H3TagNm = "<H3>" [static] |
const TStr THtmlTok::H4TagNm = "<H4>" [static] |
const TStr THtmlTok::H5TagNm = "<H5>" [static] |
const TStr THtmlTok::H6TagNm = "<H6>" [static] |
const TStr THtmlTok::HRefArgNm = "HREF" [static] |
const TStr THtmlTok::HttpEquivArgNm = "HTTP-EQUIV" [static] |
const TStr THtmlTok::ImgTagNm = "<IMG>" [static] |
const TStr THtmlTok::LiTagNm = "<LI>" [static] |
const TStr THtmlTok::MetaTagNm = "<META>" [static] |
const TStr THtmlTok::PTagNm = "<P>" [static] |
const TStr THtmlTok::SrcArgNm = "SRC" [static] |
const TStr THtmlTok::TitleArgNm = "TITLE" [static] |
const TStr THtmlTok::TitleETagNm = "</TITLE>" [static] |
const TStr THtmlTok::TitleTagNm = "<TITLE>" [static] |
const TStr THtmlTok::UlTagNm = "<UL>" [static] |