|
SNAP Library, User Reference
2012-10-15 15:06:59
SNAP, a general purpose network analysis and graph mining library
|
Go to the source code of this file.
Classes | |
| class | THtmlLx |
Enumerations | |
| enum | THtmlLxChTy { hlctSpace, hlctAlpha, hlctNum, hlctSym, hlctLTag, hlctRTag, hlctEof } |
| enum | THtmlLxSym { hsyUndef, hsyStr, hsyNum, hsySSym, hsyUrl, hsyBTag, hsyETag, hsyMTag, hsyEof } |
| enum | THtmlDocType { hdtAll, hdtStr, hdtStrNum, hdtTag, hdtA, hdtHRef, hdtUL } |
Functions | |
| ClassHdTP (THtmlTok, PHtmlTok) ClassHdTP(THtmlDoc | |
| void | SetUcCh (const char &UcCh, const char &LcCh) |
| void | SetUcCh (const TStr &Str) |
| void | SetChTy (const THtmlLxChTy &ChTy, const TStr &Str) |
| void | SetEscStr (const TStr &SrcStr, const TStr &DstStr) |
| THtmlLxChDef () | |
| THtmlLxChDef (TSIn &SIn) | |
| static PHtmlLxChDef | Load (TSIn &SIn) |
| void | Save (TSOut &SOut) |
| THtmlLxChDef & | operator= (const THtmlLxChDef &) |
| int | GetChTy (const char &Ch) const |
| bool | IsEoln (const char &Ch) const |
| bool | IsWs (const char &Ch) const |
| bool | IsSpace (const char &Ch) const |
| bool | IsAlpha (const char &Ch) const |
| bool | IsNum (const char &Ch) const |
| bool | IsAlNum (const char &Ch) const |
| bool | IsSym (const char &Ch) const |
| bool | IsUrl (const char &Ch) const |
| bool | IsUc (const char &Ch) const |
| bool | IsLc (const char &Ch) const |
| char | GetUc (const char &Ch) const |
| char | GetLc (const char &Ch) const |
| void | GetUcChA (TChA &ChA) const |
| void | GetLcChA (TChA &ChA) const |
| TStr | GetUcStr (const TStr &Str) const |
| TStr | GetLcStr (const TStr &Str) const |
| TStr | GetEscStr (const TStr &Str) const |
| static PHtmlLxChDef | GetChDef () |
| static THtmlLxChDef & | GetChDefRef () |
| static TStr | GetCSZFromYuascii (const TChA &ChA) |
| static TStr | GetCSZFromWin1250 (const TChA &ChA) |
| static TStr | GetWin1250FromYuascii (const TChA &ChA) |
| static TStr | GetIsoCeFromYuascii (const TChA &ChA) |
| THtmlTok () | |
| THtmlTok (const THtmlLxSym &_Sym) | |
| THtmlTok (const THtmlLxSym &_Sym, const TStr &_Str) | |
| THtmlTok (const THtmlLxSym &_Sym, const TStr &_Str, const THtmlLx::TArgNmValV &_ArgNmValV) | |
| THtmlTok (TSIn &) | |
| THtmlTok & | operator= (const THtmlTok &) |
| THtmlLxSym | GetSym () const |
| TStr | GetStr () const |
| TStr | GetFullStr () const |
| bool | IsArg (const TStr &ArgNm) const |
| TStr | GetArg (const TStr &ArgNm) const |
| TStr | GetArg (const TStr &ArgNm, const TStr &DfArgVal) const |
| bool | IsUrlTok (TStr &RelUrlStr) const |
| bool | IsRedirUrlTok () const |
| void | SaveTxt (const PSOut &SOut, const bool &TxtMode=true) |
| static bool | IsBreakTag (const TStr &TagNm) |
| static bool | IsBreakTok (const PHtmlTok &Tok) |
| static bool | IsHTag (const TStr &TagNm, int &HTagN) |
| static PHtmlTok | GetHTok (const bool &IsBTag, const int &HTagN) |
| THtmlDoc () | |
| THtmlDoc (const PSIn &SIn, const THtmlDocType &Type=hdtAll, const bool &DoUc=true) | |
| static PHtmlDoc | New (const PSIn &SIn, const THtmlDocType &Type=hdtAll, const bool &DoUc=true) |
| THtmlDoc (TSIn &) | |
| THtmlDoc & | operator= (const THtmlDoc &) |
| int | GetToks () const |
| PHtmlTok | GetTok (const int &TokN) const |
| PHtmlTok | GetTok (const int &TokN, THtmlLxSym &Sym, TStr &Str) const |
| void | AddTokV (const THtmlTokV &_TokV) |
| static TStr | GetTxtLnDoc (const TStr &HtmlStr) |
| static TStr | GetTxtLnDoc (const TStr &HtmlStr, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutTagsP) |
| static PHtmlDoc | LoadTxt (const TStr &FNm, const THtmlDocType &Type=hdtAll, const bool &DoUc=true) |
| static void | SaveHtmlToTxt (const TStr &HtmlStr, const PSOut &TxtSOut, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutToksP) |
| static void | SaveHtmlToTxt (const TStr &HtmlStr, const TStr &TxtFNm, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutToksP) |
| static void | SaveHtmlToXml (const TStr &HtmlStr, const PSOut &XmlSOut, const TStr &BaseUrlStr, const bool &OutTextP, const bool &OutUrlP, const bool &OutToksP, const bool &OutTagsP, const bool &OutArgsP) |
| static void | SaveHtmlToXml (const TStr &HtmlStr, const TStr &XmlFNm, const TStr &BaseUrlStr, const bool &OutTextP, const bool &OutUrlP, const bool &OutToksP, const bool &OutTagsP, const bool &OutArgsP) |
| static TLxSym | GetLxSym (const THtmlLxSym &HtmlLxSym, const TChA &ChA) |
| static bool | _IsTagRedir (const TStr &TagStr, const TStr &ArgNm, THtmlLx &Lx, const TStr &BaseUrlStr, const TStr &RedirUrlStr) |
| static TStr | GetRedirHtmlDocStr (const TStr &HtmlStr, const TStr &BaseUrlStr, const TStr &RedirUrlStr) |
| THtmlHldV (const PHtmlDoc &_RefHtmlDoc, const int &HldWnLen=10) | |
| THtmlHldV (TSIn &) | |
| THtmlHldV & | operator= (const THtmlHldV &) |
| PHtmlDoc | GetRefHtmlDoc () |
| int | GetHlds () |
| PHtmlDoc | GetHld (const int &HldN) |
| TWebPg () | |
| TWebPg (const TStrV &_UrlStrV, const TStrV &_IpNumV, const PHttpResp &_HttpResp) | |
| static PWebPg | New (const TStrV &UrlStrV, const TStrV &IpNumV, const PHttpResp &HttpResp) |
| static PWebPg | New (const TStrV &UrlStrV, const PHttpResp &HttpResp) |
| static PWebPg | New (const TStr &UrlStr, const PHttpResp &HttpResp) |
| ~TWebPg () | |
| TWebPg (TSIn &) | |
| TWebPg & | operator= (const TWebPg &) |
| int | GetUrls () const |
| TStr | GetUrlStr (const int &UrlN=-1) const |
| PUrl | GetUrl (const int &UrlN=-1) const |
| int | GetIps () const |
| TStr | GetIpNum (const int &IpN=-1) const |
| PHttpResp | GetHttpResp () const |
| TStr | GetHttpHdStr () const |
| TStr | GetHttpBodyAsStr () const |
| void | GetOutUrlV (TUrlV &OutUrlV, TUrlV &OutRedirUrlV) const |
| void | GetOutUrlV (TUrlV &OutUrlV) const |
| void | GetOutDescUrlStrKdV (TStrKdV &OutDescUrlStrKdV) const |
| void | PutFetchMSecs (const uint64 &_FetchMSecs) |
| uint64 | GetFetchMSecs () const |
| void | SaveAsHttpBody (const TStr &FNm) const |
| void | SaveAsHttp (const TStr &FNm) const |
| bool | IsTxt () const |
Variables | |
| ClassTP(THtmlLxChDef, PHtmlLxChDef) private TChV | UcChV |
| TChV | LcChV |
| TStrStrH | EscStrH |
| static PHtmlLxChDef | ChDef |
| ClassTPV(THtmlTok, PHtmlTok, THtmlTokV) private TStr | Str |
| THtmlLx::TArgNmValV | ArgNmValV |
| static const TStr | ATagNm = "<A>" |
| static const TStr | AreaTagNm = "<AREA>" |
| static const TStr | BrTagNm = "<BR>" |
| static const TStr | CardTagNm = "<CARD>" |
| static const TStr | CenterTagNm = "<CENTER>" |
| static const TStr | FrameTagNm = "<FRAME>" |
| static const TStr | H1TagNm = "<H1>" |
| static const TStr | H2TagNm = "<H2>" |
| static const TStr | H3TagNm = "<H3>" |
| static const TStr | H4TagNm = "<H4>" |
| static const TStr | H5TagNm = "<H5>" |
| static const TStr | H6TagNm = "<H6>" |
| static const TStr | ImgTagNm = "<IMG>" |
| static const TStr | LiTagNm = "<LI>" |
| static const TStr | MetaTagNm = "<META>" |
| static const TStr | PTagNm = "<P>" |
| static const TStr | UlTagNm = "<UL>" |
| static const TStr | TitleTagNm = "<TITLE>" |
| static const TStr | TitleETagNm = "</TITLE>" |
| static const TStr | AltArgNm = "ALT" |
| static const TStr | HRefArgNm = "HREF" |
| static const TStr | SrcArgNm = "SRC" |
| static const TStr | TitleArgNm = "TITLE" |
| static const TStr | HttpEquivArgNm = "HTTP-EQUIV" |
| ClassTP(THtmlHldV, PHtmlHldV) private THtmlDocV | HldV |
| ClassTPV(TWebPg, PWebPg, TWebPgV) private TStrV | IpNumV |
| PHttpResp | HttpResp |
| uint64 | FetchMSecs |
| enum THtmlDocType |
| enum THtmlLxChTy |
| enum THtmlLxSym |
| bool THtmlDoc::_IsTagRedir | ( | const TStr & | TagStr, |
| const TStr & | ArgNm, | ||
| THtmlLx & | Lx, | ||
| const TStr & | BaseUrlStr, | ||
| const TStr & | RedirUrlStr | ||
| ) | [static] |
Definition at line 1106 of file html.cpp.
{
  // Rewrites the url argument of the begin-tag the lexer is positioned on so
  // that it points through a redirector.
  //   TagStr      - tag name to match, in the upper-case "<NAME>" form of the
  //                 THtmlTok tag-name constants
  //   ArgNm       - name of the tag argument that carries the url
  //   Lx          - lexer positioned on a begin-tag; its argument is replaced in place
  //   BaseUrlStr  - base url handed to TUrl::New together with the argument value
  //   RedirUrlStr - redirector url; the resolved url is attached to it under "url"
  // Returns true only when the tag matched and its argument was rewritten.
  IAssert(Lx.Sym==hsyBTag);
  // Match on the upper-cased tag name, as the THtmlDoc constructor does: the
  // tag-name constants are upper-case, so comparing the raw-case Lx.ChA would
  // skip tags written as <a ...> or <img ...>.
  if (!(Lx.UcChA==TagStr)){return false;}
  if (!Lx.IsArg(ArgNm)){return false;}
  TStr RelUrlStr=Lx.GetArg(ArgNm);
  PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
  // leave the tag untouched unless the url passes the usHttp check
  if (!Url->IsOk(usHttp)){return false;}
  TStr UrlStr=Url->GetUrlStr();
  PUrlEnv RedirUrlEnv=TUrlEnv::New(RedirUrlStr, "url", UrlStr);
  Lx.PutArg(ArgNm, RedirUrlEnv->GetFullUrlStr());
  return true;
}
| static PHtmlLxChDef GetChDef | ( | ) | [static] |
| static THtmlLxChDef& GetChDefRef | ( | ) | [static] |
| int GetChTy | ( | const char & | Ch | ) | const |
| TStr THtmlLxChDef::GetCSZFromWin1250 | ( | const TChA & | ChA | ) | [static] |
Definition at line 132 of file html.cpp.
{
  // Copies ChA, replacing the byte values 232/200, 154/138 and 158/142 with
  // plain 'c'/'C', 's'/'S' and 'z'/'Z'; every other byte is copied unchanged.
  TChA AsciiChA;
  const int SrcChs=ChA.Len();
  for (int SrcChN=0; SrcChN<SrcChs; SrcChN++){
    const uchar SrcCh=ChA[SrcChN];
    if (SrcCh==232){AsciiChA+='c';}
    else if (SrcCh==200){AsciiChA+='C';}
    else if (SrcCh==154){AsciiChA+='s';}
    else if (SrcCh==138){AsciiChA+='S';}
    else if (SrcCh==158){AsciiChA+='z';}
    else if (SrcCh==142){AsciiChA+='Z';}
    else {AsciiChA+=SrcCh;}
  }
  return AsciiChA;
}
| TStr THtmlLxChDef::GetCSZFromYuascii | ( | const TChA & | ChA | ) | [static] |
Definition at line 111 of file html.cpp.
{
  // Copies ChA, replacing each character listed in FromChV with the plain
  // letter at the same position in ToChV; other characters pass through.
  static const char FromChV[]={'~', '^', '}', ']', '|', '\\', '{', '[', '`', '@'};
  static const char ToChV[]=  {'c', 'C', 'c', 'C', 'd', 'D',  's', 'S', 'z', 'Z'};
  const int Maps=int(sizeof(FromChV)/sizeof(FromChV[0]));
  TChA AsciiChA;
  const int SrcChs=ChA.Len();
  for (int SrcChN=0; SrcChN<SrcChs; SrcChN++){
    const char SrcCh=ChA[SrcChN];
    char OutCh=SrcCh;
    for (int MapN=0; MapN<Maps; MapN++){
      if (SrcCh==FromChV[MapN]){OutCh=ToChV[MapN]; break;}
    }
    AsciiChA+=OutCh;
  }
  return AsciiChA;
}
| TStr THtmlLxChDef::GetEscStr | ( | const TStr & | Str | ) | const |
| uint64 GetFetchMSecs | ( | ) | const |
Definition at line 375 of file html.h.
{
  // hand back the fetch time stored in FetchMSecs (presumably milliseconds)
  return FetchMSecs;
}
| TStr THtmlTok::GetFullStr | ( | ) | const |
Definition at line 628 of file html.cpp.
{
  // Returns the token text with tag decoration restored: begin-tags that have
  // arguments get their NAME="VALUE" pairs re-attached, end-tags get the
  // "</" prefix, and every other token is returned as GetStr().
  if ((Sym==hsyBTag)&&(ArgNmValV.Len()>0)){
    TChA TagChA;
    // presumably the tag text up to, but excluding, the closing '>' (which is
    // appended again after the arguments) -- confirm GetSubStr bounds
    TagChA+=Str.GetSubStr(0, Str.Len()-2);
    const int Args=ArgNmValV.Len();
    for (int ArgN=0; ArgN<Args; ArgN++){
      TagChA+=' ';
      TagChA+=ArgNmValV[ArgN].Key;
      TagChA+='=';
      TagChA+='"';
      TagChA+=ArgNmValV[ArgN].Dat;
      TagChA+='"';
    }
    TagChA+='>';
    return TagChA;
  }
  if (Sym==hsyETag){
    TChA TagChA;
    TagChA+='<';
    TagChA+='/';
    TagChA+=Str.GetSubStr(1, Str.Len()-1);
    return TagChA;
  }
  return GetStr();
}
| PHtmlTok THtmlTok::GetHTok | ( | const bool & | IsBTag, |
| const int & | HTagN | ||
| ) | [static] |
Definition at line 762 of file html.cpp.
{
  // Creates a begin- or end-tag token for heading level HTagN (1..6);
  // any other level triggers Fail.
  const TStr* const HTagNmV[]={
   &H1TagNm, &H2TagNm, &H3TagNm, &H4TagNm, &H5TagNm, &H6TagNm};
  THtmlLxSym HTagSym;
  if (IsBTag){HTagSym=hsyBTag;} else {HTagSym=hsyETag;}
  TStr HTagNm;
  if ((1<=HTagN)&&(HTagN<=6)){
    HTagNm=*HTagNmV[HTagN-1];
  } else {
    Fail;
  }
  return PHtmlTok(new THtmlTok(HTagSym, HTagNm));
}
| TStr GetHttpBodyAsStr | ( | ) | const |
Definition at line 366 of file html.h.
{
  // body of the stored HTTP response, as a string
  PHttpResp Resp=GetHttpResp();
  return Resp->GetBodyAsStr();
}
| TStr GetHttpHdStr | ( | ) | const |
Definition at line 365 of file html.h.
{
  // header part of the stored HTTP response, as a string
  PHttpResp Resp=GetHttpResp();
  return Resp->GetHdStr();
}
| PHttpResp GetHttpResp | ( | ) | const |
| TStr THtmlLxChDef::GetIsoCeFromYuascii | ( | const TChA & | ChA | ) | [static] |
Definition at line 170 of file html.cpp.
{
  // Copies ChA, replacing each character listed in FromChV with the byte at
  // the same position in ToChV; other characters pass through unchanged.
  static const char FromChV[]={'~', '^', '}', ']', '|', '\\', '{', '[', '`', '@'};
  static const char ToChV[]={
   char(uchar(232)), char(uchar(200)), char(uchar(230)), char(uchar(198)),
   char(uchar(240)), char(uchar(208)), char(uchar(185)), char(uchar(169)),
   char(uchar(190)), char(uchar(174))};
  const int Maps=int(sizeof(FromChV)/sizeof(FromChV[0]));
  TChA IsoChA;
  const int SrcChs=ChA.Len();
  for (int SrcChN=0; SrcChN<SrcChs; SrcChN++){
    const char SrcCh=ChA[SrcChN];
    char OutCh=SrcCh;
    for (int MapN=0; MapN<Maps; MapN++){
      if (SrcCh==FromChV[MapN]){OutCh=ToChV[MapN]; break;}
    }
    IsoChA+=OutCh;
  }
  return IsoChA;
}
| char GetLc | ( | const char & | Ch | ) | const |
| TLxSym THtmlDoc::GetLxSym | ( | const THtmlLxSym & | HtmlLxSym, |
| const TChA & | ChA | ||
| ) | [static] |
| void TWebPg::GetOutDescUrlStrKdV | ( | TStrKdV & | OutDescUrlStrKdV | ) | const |
Definition at line 1258 of file html.cpp.
{
  // Fills OutDescUrlStrKdV with one (description, url) pair per <A> begin-tag
  // whose url resolves against this page's url; the description is the
  // space-joined text of the string/number/symbol tokens up to the closing </A>.
  OutDescUrlStrKdV.Clr();
  // page url (base for relative links) and page body
  TStr PgUrlStr=GetUrlStr();
  TStr PgHtmlStr=GetHttpBodyAsStr();
  // tokenize the body
  PSIn PgSIn=TStrIn::New(PgHtmlStr);
  PHtmlDoc PgDoc=THtmlDoc::New(PgSIn);
  const int Toks=PgDoc->GetToks();
  PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr;
  int TokN=0;
  while (TokN<Toks){
    Tok=PgDoc->GetTok(TokN, TokSym, TokStr); TokN++;
    // only <A> begin-tags carrying a usable url open a description
    if (TokSym!=hsyBTag){continue;}
    if (!(TokStr==THtmlTok::ATagNm)){continue;}
    TStr RelUrlStr;
    if (!Tok->IsUrlTok(RelUrlStr)){continue;}
    PUrl Url=TUrl::New(RelUrlStr, PgUrlStr);
    if (!Url->IsOk()){continue;}
    // gather the anchor text; TokN is shared with the outer loop, so the
    // outer scan resumes right after the tokens consumed here
    TChA DescChA;
    bool InAnchor=true;
    while (InAnchor&&(TokN<Toks)){
      Tok=PgDoc->GetTok(TokN, TokSym, TokStr); TokN++;
      if ((TokSym==hsyETag)&&(TokStr==THtmlTok::ATagNm)){
        InAnchor=false;
      } else
      if ((TokSym==hsyStr)||(TokSym==hsyNum)||(TokSym==hsySSym)){
        if (!DescChA.Empty()){DescChA+=' ';}
        DescChA+=TokStr;
      }
    }
    OutDescUrlStrKdV.Add(TStrKd(DescChA, Url->GetUrlStr()));
  }
}
| void TWebPg::GetOutUrlV | ( | TUrlV & | OutUrlV, |
| TUrlV & | OutRedirUrlV | ||
| ) | const |
Definition at line 1230 of file html.cpp.
{
  // Collects the outgoing urls of this page.
  //   OutUrlV      - receives every url found in a url-carrying begin-tag
  //                  (see THtmlTok::IsUrlTok) that passes the usHttp check
  //   OutRedirUrlV - receives the subset whose token is a redirect
  //                  (see THtmlTok::IsRedirUrlTok)
  // Both vectors are cleared first.
  OutUrlV.Clr(); OutRedirUrlV.Clr();
  // take interesting web-page components
  TStr UrlStr=GetUrlStr();
  TStr HtmlStr=GetHttpBodyAsStr();
  // prepare html parsing
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
  // traverse html; the token is declared once, inside the loop (the unused
  // outer declaration that the loop-local one shadowed has been removed)
  for (int TokN=0; TokN<HtmlDoc->GetToks(); TokN++){
    PHtmlTok Tok=HtmlDoc->GetTok(TokN);
    if (Tok->GetSym()!=hsyBTag){continue;}
    TStr RelUrlStr;
    if (!Tok->IsUrlTok(RelUrlStr)){continue;}
    // relative urls are resolved against the url of this page
    PUrl Url=TUrl::New(RelUrlStr, UrlStr);
    if (!Url->IsOk(usHttp)){continue;}
    OutUrlV.Add(Url);
    if (Tok->IsRedirUrlTok()){
      OutRedirUrlV.Add(Url);
    }
  }
}
| void GetOutUrlV | ( | TUrlV & | OutUrlV | ) | const |
Definition at line 369 of file html.h.
{
  // delegate to the two-vector overload; the redirect urls land in a scratch
  // vector that is dropped on return
  TUrlV IgnoredRedirUrlV;
  GetOutUrlV(OutUrlV, IgnoredRedirUrlV);
}
| TStr THtmlDoc::GetRedirHtmlDocStr | ( | const TStr & | HtmlStr, |
| const TStr & | BaseUrlStr, | ||
| const TStr & | RedirUrlStr | ||
| ) | [static] |
Definition at line 1126 of file html.cpp.
{
  // Re-emits HtmlStr symbol by symbol (each preceded by its leading
  // white-space) with the url argument of <A>/<AREA> (HREF) and
  // <FRAME>/<IMG> (SRC) begin-tags rewritten by _IsTagRedir().
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  TMOut RedirSOut;
  THtmlLx Lx(HtmlSIn);
  for (;;){
    if (Lx.GetSym()==hsyEof){break;}
    RedirSOut.PutStr(Lx.PreSpaceChA);
    // try the four tag/argument combinations in turn; the first hit wins
    bool IsRewritten=false;
    if (Lx.Sym==hsyBTag){
      IsRewritten=
       _IsTagRedir(THtmlTok::ATagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr)||
       _IsTagRedir(THtmlTok::AreaTagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr)||
       _IsTagRedir(THtmlTok::FrameTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr)||
       _IsTagRedir(THtmlTok::ImgTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr);
    }
    if (IsRewritten){
      // rewritten tag: emit it rebuilt from its (updated) arguments
      RedirSOut.PutStr(Lx.GetFullBTagStr());
    } else {
      // anything else: emit the symbol text as the lexer holds it
      RedirSOut.PutStr(Lx.SymChA());
    }
  }
  return RedirSOut.GetAsStr();
}
| PHtmlDoc GetRefHtmlDoc | ( | ) |
| THtmlLxSym GetSym | ( | ) | const |
| PHtmlTok GetTok | ( | const int & | TokN | ) | const |
| PHtmlTok GetTok | ( | const int & | TokN, |
| THtmlLxSym & | Sym, | ||
| TStr & | Str | ||
| ) | const |
| static TStr GetTxtLnDoc | ( | const TStr & | HtmlStr | ) | [static] |
| static TStr GetTxtLnDoc | ( | const TStr & | HtmlStr, |
| const TStr & | BaseUrlStr, | ||
| const bool & | OutUrlP, | ||
| const bool & | OutTagsP | ||
| ) | [static] |
| char GetUc | ( | const char & | Ch | ) | const |
| TStr TLxChDef::GetUcStr | ( | const TStr & | Str | ) | const |
| PUrl GetUrl | ( | const int & | UrlN = -1 | ) | const |
| TStr THtmlLxChDef::GetWin1250FromYuascii | ( | const TChA & | ChA | ) | [static] |
Definition at line 149 of file html.cpp.
{
  // Copies ChA, replacing each character listed in FromChV with the value at
  // the same position in ToChV; other characters pass through unchanged.
  // Note that '}', ']', '|' and '\\' map to plain ASCII letters here.
  static const char FromChV[]={'~', '^', '}', ']', '|', '\\', '{', '[', '`', '@'};
  static const char ToChV[]={
   char(uchar(232)), char(uchar(200)), 'c', 'C', 'd', 'D',
   char(uchar(154)), char(uchar(138)), char(uchar(158)), char(uchar(142))};
  const int Maps=int(sizeof(FromChV)/sizeof(FromChV[0]));
  TChA WinChA;
  const int SrcChs=ChA.Len();
  for (int SrcChN=0; SrcChN<SrcChs; SrcChN++){
    const char SrcCh=ChA[SrcChN];
    char OutCh=SrcCh;
    for (int MapN=0; MapN<Maps; MapN++){
      if (SrcCh==FromChV[MapN]){OutCh=ToChV[MapN]; break;}
    }
    WinChA+=OutCh;
  }
  return WinChA;
}
| bool IsAlNum | ( | const char & | Ch | ) | const |
| bool IsAlpha | ( | const char & | Ch | ) | const |
| bool IsArg | ( | const TStr & | ArgNm | ) | const |
Definition at line 203 of file html.h.
{
  // the argument is present when a forward search of ArgNmValV for an entry
  // keyed by ArgNm yields an index other than -1
  const int ArgN=ArgNmValV.SearchForw(TStrKd(ArgNm));
  return ArgN!=-1;
}
| bool THtmlTok::IsBreakTag | ( | const TStr & | TagNm | ) | [static] |
Definition at line 726 of file html.cpp.
{
  // Tells whether TagNm (in the "<NAME>" form listed below) is one of the
  // tags treated as a break.
  // Each name is listed once; the original inserted "<HEAD>" twice.
  static const char* const BreakTagNmV[]={
   "<H1>", "<H2>", "<H3>", "<H4>", "<H5>", "<H6>",
   "<BR>", "<HR>", "<P>", "<DL>", "<UL>", "<OL>",
   "<LI>", "<DT>", "<DD>", "<HEAD>", "<TITLE>", "<META>",
   "<SCRIPT>", "<BODY>"};
  static TStrH BreakTagNmH(50);
  // The table is filled on the first call only.
  // NOTE(review): this lazy fill is not synchronized -- confirm that no
  // concurrent first calls can happen.
  if (BreakTagNmH.Len()==0){
    const int BreakTagNms=int(sizeof(BreakTagNmV)/sizeof(BreakTagNmV[0]));
    for (int BreakTagNmN=0; BreakTagNmN<BreakTagNms; BreakTagNmN++){
      BreakTagNmH.AddKey(TStr(BreakTagNmV[BreakTagNmN]));
    }
  }
  return BreakTagNmH.IsKey(TagNm);
}
| bool THtmlTok::IsBreakTok | ( | const PHtmlTok & | Tok | ) | [static] |
| bool IsEoln | ( | const char & | Ch | ) | const |
| bool THtmlTok::IsHTag | ( | const TStr & | TagNm, |
| int & | HTagN | ||
| ) | [static] |
| bool IsLc | ( | const char & | Ch | ) | const |
| bool IsNum | ( | const char & | Ch | ) | const |
| bool THtmlTok::IsRedirUrlTok | ( | ) | const |
| bool IsSpace | ( | const char & | Ch | ) | const |
| bool IsSym | ( | const char & | Ch | ) | const |
| bool TWebPg::IsTxt | ( | ) | const |
Definition at line 1310 of file html.cpp.
{
  // Heuristic text check: the response must either declare no content type
  // or a text one, and more than 90% of the first (at most) 100 body
  // characters must be printable ASCII, tab, line-feed or carriage-return.
  // An empty body is reported as non-text.
  if ((!HttpResp->IsContType())||HttpResp->IsContType(THttp::TextFldVal)){
    TStr Str=HttpResp->GetBodyAsStr();
    const int StrLen=Str.Len();
    const int SampleChs=(StrLen<100)?StrLen:100;
    // nothing to sample; also keeps the division below well defined
    if (SampleChs==0){return false;}
    int PrintChs=0;
    for (int ChN=0; ChN<SampleChs; ChN++){
      const char Ch=Str[ChN];
      if (((' '<=Ch)&&(Ch<='~'))||(Ch==TCh::TabCh)||(Ch==TCh::LfCh)||(Ch==TCh::CrCh)){
        PrintChs++;}
    }
    // divide by the number of characters actually inspected; the former
    // ChN+1 denominator made short all-printable bodies fall below 0.9
    const double PrintPrb=double(PrintChs)/double(SampleChs);
    return PrintPrb>0.9;
  } else {
    return false;
  }
}
| bool IsUc | ( | const char & | Ch | ) | const |
| bool IsUrl | ( | const char & | Ch | ) | const |
| bool THtmlTok::IsUrlTok | ( | TStr & | RelUrlStr | ) | const |
Definition at line 648 of file html.cpp.
{
  // Extracts the url carried by this token into RelUrlStr and returns true
  // when one is found. Recognized: <A>/<AREA> with HREF, <FRAME>/<IMG> with
  // SRC, and <META HTTP-EQUIV=REFRESH> whose CONTENT holds "URL=...".
  if (GetSym()!=hsyBTag){return false;}
  TStr TagNm=GetStr();
  // tags whose url sits directly in one argument
  const bool IsHRefTag=(TagNm==ATagNm)||(TagNm==AreaTagNm);
  const bool IsSrcTag=(TagNm==FrameTagNm)||(TagNm==ImgTagNm);
  if (IsHRefTag||IsSrcTag){
    const TStr& UrlArgNm=IsHRefTag?HRefArgNm:SrcArgNm;
    if (!IsArg(UrlArgNm)){return false;}
    RelUrlStr=GetArg(UrlArgNm);
    return true;
  }
  // meta refresh
  if (!(TagNm==MetaTagNm)){return false;}
  if (!IsArg(HttpEquivArgNm)){return false;}
  TStr HttpEquivArgVal=GetArg(HttpEquivArgNm).GetUc();
  if (!(HttpEquivArgVal=="REFRESH")){return false;}
  if (!IsArg("CONTENT")){return false;}
  TStr ContentStr=GetArg("CONTENT");
  TStr LeftStr; TStr RightStr; TStr UrlEqStr="URL=";
  // "URL=" is located on an upper-cased copy; the url itself is then cut
  // from the original-case ContentStr at the same offset
  ContentStr.GetUc().SplitOnStr(LeftStr, UrlEqStr, RightStr);
  const int UrlBChN=LeftStr.Len()+UrlEqStr.Len();
  RelUrlStr=ContentStr.GetSubStr(UrlBChN, ContentStr.Len());
  return !RelUrlStr.Empty();
}
| bool IsWs | ( | const char & | Ch | ) | const |
Definition at line 34 of file html.h.
{
  // white-space means: blank, tab, carriage-return or line-feed
  if (Ch==' '){return true;}
  if (Ch==TCh::TabCh){return true;}
  if (Ch==TCh::CrCh){return true;}
  return Ch==TCh::LfCh;
}
| static PHtmlLxChDef Load | ( | TSIn & | SIn | ) | [static] |
Definition at line 25 of file html.h.
{
  // build a new character definition from the input stream and hand it back
  THtmlLxChDef* LoadedChDef=new THtmlLxChDef(SIn);
  return LoadedChDef;
}
| static PHtmlDoc LoadTxt | ( | const TStr & | FNm, |
| const THtmlDocType & | Type = hdtAll, |
||
| const bool & | DoUc = true |
||
| ) | [static] |
| static PHtmlDoc New | ( | const PSIn & | SIn, |
| const THtmlDocType & | Type = hdtAll, |
||
| const bool & | DoUc = true |
||
| ) | [static] |
| THtmlLxChDef& operator= | ( | const THtmlLxChDef & | ) |
| void PutFetchMSecs | ( | const uint64 & | _FetchMSecs | ) |
Definition at line 374 of file html.h.
{
  // remember the fetch time for this page (presumably milliseconds)
  FetchMSecs=_FetchMSecs;
}
| void TWebPg::SaveAsHttp | ( | const TStr & | FNm | ) | const |
Definition at line 1303 of file html.cpp.
{
  // open FNm for output and let the HTTP response write itself via SaveTxt
  PSOut HttpSOut=TFOut::New(FNm);
  HttpResp->SaveTxt(HttpSOut);
}
| void TWebPg::SaveAsHttpBody | ( | const TStr & | FNm | ) | const |
Definition at line 1296 of file html.cpp.
{
  // open FNm for output and let the HTTP response write its body via SaveBody
  PSOut BodySOut=TFOut::New(FNm);
  HttpResp->SaveBody(BodySOut);
}
| static void SaveHtmlToTxt | ( | const TStr & | HtmlStr, |
| const PSOut & | TxtSOut, | ||
| const TStr & | BaseUrlStr, | ||
| const bool & | OutUrlP, | ||
| const bool & | OutToksP | ||
| ) | [static] |
| static void SaveHtmlToTxt | ( | const TStr & | HtmlStr, |
| const TStr & | TxtFNm, | ||
| const TStr & | BaseUrlStr, | ||
| const bool & | OutUrlP, | ||
| const bool & | OutToksP | ||
| ) | [static] |
| static void SaveHtmlToXml | ( | const TStr & | HtmlStr, |
| const PSOut & | XmlSOut, | ||
| const TStr & | BaseUrlStr, | ||
| const bool & | OutTextP, | ||
| const bool & | OutUrlP, | ||
| const bool & | OutToksP, | ||
| const bool & | OutTagsP, | ||
| const bool & | OutArgsP | ||
| ) | [static] |
| static void SaveHtmlToXml | ( | const TStr & | HtmlStr, |
| const TStr & | XmlFNm, | ||
| const TStr & | BaseUrlStr, | ||
| const bool & | OutTextP, | ||
| const bool & | OutUrlP, | ||
| const bool & | OutToksP, | ||
| const bool & | OutTagsP, | ||
| const bool & | OutArgsP | ||
| ) | [static] |
| void SetChTy | ( | const THtmlLxChTy & | ChTy, |
| const TStr & | Str | ||
| ) |
| void THtmlLxChDef::SetEscStr | ( | const TStr & | SrcStr, |
| const TStr & | DstStr | ||
| ) |
| void THtmlLxChDef::SetUcCh | ( | const char & | UcCh, |
| const char & | LcCh | ||
| ) |
Definition at line 3 of file html.cpp.
{
  // Registers UcCh as the upper-case form of LcCh and, when UcCh has no
  // lower-case form of its own yet, LcCh as the lower-case form of UcCh.
  // Both tables are indexed by the character's offset from TCh::Mn.
  const int LcChN=LcCh-TCh::Mn;
  const int UcChN=UcCh-TCh::Mn;
  // upper-case table: several lower-case characters may share one upper-case
  // character, but an entry may only be set while it is still empty or
  // still maps the character to itself
  IAssert(
   (UcChV[LcChN]==TCh(0))||
   (UcChV[LcChN]==TCh(LcCh)));
  UcChV[LcChN]=TCh(UcCh);
  // lower-case table: an upper-case character keeps the first lower-case
  // character assigned to it, so later calls leave the entry alone
  const bool IsLcUnset=
   (LcChV[UcChN]==TCh(0))||(LcChV[UcChN]==TCh(UcCh));
  if (IsLcUnset){
    LcChV[UcChN]=TCh(LcCh);
  }
}
| void TLxChDef::SetUcCh | ( | const TStr & | Str | ) |
| THtmlDoc::THtmlDoc | ( | const PSIn & | SIn, |
| const THtmlDocType & | Type = hdtAll, |
||
| const bool & | DoUc = true |
||
| ) |
Definition at line 779 of file html.cpp.
:
  TokV(1000, 0){
  // Tokenizes SIn and keeps the symbols selected by Type; DoUc is forwarded
  // to Lx.GetTok(). An hsyEof token is always appended at the end.
  THtmlLx Lx(SIn);
  bool MkTok=false; bool InUL=false;
  while (Lx.GetSym()!=hsyEof){
    const bool IsBTag=(Lx.Sym==hsyBTag);
    const bool IsETag=(Lx.Sym==hsyETag);
    if (Type==hdtAll){
      MkTok=true;
    } else
    if (Type==hdtStr){
      MkTok=(Lx.Sym==hsyStr);
    } else
    if (Type==hdtStrNum){
      MkTok=(Lx.Sym==hsyStr)||(Lx.Sym==hsyNum);
    } else
    if (Type==hdtTag){
      MkTok=IsBTag||IsETag;
    } else
    if (Type==hdtA){
      MkTok=IsBTag&&(Lx.UcChA==THtmlTok::ATagNm);
    } else
    if (Type==hdtHRef){
      // begin-tags that can carry a url
      MkTok=IsBTag&&
       ((Lx.UcChA==THtmlTok::ATagNm)||(Lx.UcChA==THtmlTok::AreaTagNm)||
        (Lx.UcChA==THtmlTok::FrameTagNm)||(Lx.UcChA==THtmlTok::ImgTagNm)||
        (Lx.UcChA==THtmlTok::MetaTagNm));
    } else
    if (Type==hdtUL){
      // keep everything from <UL> through </UL>, both tags included: the
      // flag is raised before and lowered after MkTok is taken from it
      if (IsBTag&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=true;}
      MkTok=InUL;
      if (IsETag&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=false;}
    } else {
      Fail;
    }
    if (MkTok){TokV.Add(Lx.GetTok(DoUc));}
  }
  TokV.Add(PHtmlTok(new THtmlTok(hsyEof)));
}
| THtmlHldV::THtmlHldV | ( | const PHtmlDoc & | _RefHtmlDoc, |
| const int & | HldWnLen = 10 |
||
| ) |
Definition at line 1148 of file html.cpp.
: RefHtmlDoc(_RefHtmlDoc), HldV(){
  // For every <A> begin-tag in the reference document, builds a small
  // document made of: the current title tokens, the tokens of the active
  // heading levels, up to HldWnLen string/number tokens before the anchor,
  // the anchor tokens themselves, and up to HldWnLen string/number tokens
  // after it. Each such document is appended to HldV.
  bool IsTitleAct=false; THtmlTokV TitleTokV;
  bool IsHAct=false; int ActHTagN=-1;
  TVec<THtmlTokV> HTokV(6);
  PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr;
  for (int TokN=0; TokN<RefHtmlDoc->GetToks(); TokN++){
    Tok=RefHtmlDoc->GetTok(TokN, TokSym, TokStr);
    if ((TokSym==hsyBTag)&&(TokStr==THtmlTok::ATagNm)){
      // collect tokens before, inside and after <a> ... </a> tags
      int ATokN; PHtmlTok ATok; THtmlLxSym ATokSym; TStr ATokStr;
      // inside <A> tags
      THtmlTokV ATokV; ATokN=TokN;
      forever{
        ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
        if (ATokSym!=hsySSym){ATokV.Add(ATok);}
        if ((ATokSym==hsyETag)&&(ATokStr==THtmlTok::ATagNm)){break;}
        ATokN++;
        if (ATokN>=RefHtmlDoc->GetToks()){break;}
      }
      // NOTE(review): this is one past the </A> token, and the after-scan
      // below increments before reading, so the token directly following
      // </A> is never examined -- confirm this is intended.
      int ETagATokN=ATokN+1;
      // before <A> tags (walking backwards, so tokens are stored nearest-first)
      THtmlTokV PrevATokV; ATokN=TokN;
      forever{
        ATokN--;
        if (ATokN<0){break;}
        ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
        if (THtmlTok::IsBreakTok(ATok)){break;}
        if ((ATokSym==hsyStr)||(ATokSym==hsyNum)){PrevATokV.Add(ATok);}
        // bound the window by what was collected here; the original tested
        // ATokV.Len(), which does not change inside this loop
        if (PrevATokV.Len()>=HldWnLen){break;}
      }
      // after <A> tags
      THtmlTokV NextATokV; ATokN=ETagATokN;
      forever{
        ATokN++;
        if (ATokN>=RefHtmlDoc->GetToks()){break;}
        ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
        if (THtmlTok::IsBreakTok(ATok)){break;}
        if ((ATokSym==hsyStr)||(ATokSym==hsyNum)){NextATokV.Add(ATok);}
        // same fix as above: bound by NextATokV, not by ATokV
        if (NextATokV.Len()>=HldWnLen){break;}
      }
      // construct html-document with hyper-link context
      PHtmlDoc HtmlDoc=PHtmlDoc(new THtmlDoc());
      HtmlDoc->AddTokV(TitleTokV);
      for (int HTagN=1; HTagN<=6; HTagN++){HtmlDoc->AddTokV(HTokV[HTagN-1]);}
      HtmlDoc->AddTokV(PrevATokV);
      HtmlDoc->AddTokV(ATokV);
      HtmlDoc->AddTokV(NextATokV);
      HldV.Add(HtmlDoc);
      // NOTE(review): every context document is also written to standard
      // output here; this looks like leftover debug output -- confirm
      // before removing.
      HtmlDoc->SaveTxt(TSOut::StdOut);
    } else
    if (TokSym==hsyBTag){
      int HTagN;
      if (TokStr==THtmlTok::TitleTagNm){
        // a new title replaces whatever was collected before
        IsTitleAct=true; TitleTokV.Clr(); TitleTokV.Add(Tok);
      } else
      if (THtmlTok::IsHTag(TokStr, HTagN)){
        if (IsHAct){// conclude previous <H?> tag if left open
          HTokV[ActHTagN-1].Add(THtmlTok::GetHTok(false, ActHTagN));}
        IsHAct=true; ActHTagN=HTagN;
        // a heading resets its own level and every deeper one
        {for (int HTagN=ActHTagN; HTagN<=6; HTagN++){HTokV[HTagN-1].Clr();}}
        HTokV[ActHTagN-1].Add(Tok);
      }
    } else
    if (TokSym==hsyETag){
      int HTagN;
      if (TokStr==THtmlTok::TitleTagNm){
        if (IsTitleAct){TitleTokV.Add(Tok); IsTitleAct=false;}
      } else
      if (THtmlTok::IsHTag(TokStr, HTagN)){
        if (IsHAct){HTokV[ActHTagN-1].Add(Tok); IsHAct=false;}
      }
    } else
    if (TokSym!=hsySSym){
      // ordinary content goes to whichever of title/heading is open
      if (IsTitleAct){TitleTokV.Add(Tok);}
      if (IsHAct){HTokV[ActHTagN-1].Add(Tok);}
    }
  }
}
| THtmlLxChDef::THtmlLxChDef | ( | ) |
Definition at line 48 of file html.cpp.
: ChTyV(TCh::Vals), UcChV(TCh::Vals), LcChV(TCh::Vals), EscStrH(100){
  // Builds the default character definition: the character-type table, the
  // upper/lower-case tables and the escape-sequence table.
  //
  // The escape-sequence names below are written out as HTML entity
  // references; in the extracted listing they had been decoded to the
  // characters they denote, which left literals such as """ that do not
  // compile.

  // Character-Types
  ChTyV.PutAll(TInt(hlctSpace));
  SetChTy(hlctAlpha, "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
  SetChTy(hlctAlpha, "abcdefghijklmnopqrstuvwxyz");
  SetChTy(hlctAlpha, "@_");
  SetChTy(hlctNum, "0123456789");
  SetChTy(hlctSym, "`~!#$%^&*()-=+[{]}\\|;:'\",<.>/?");
  // '<' and '>' were part of the symbol set above; these calls re-type them
  SetChTy(hlctLTag, "<"); SetChTy(hlctRTag, ">");
  SetChTy(hlctEof, TStr(TCh::EofCh));
  // everything outside 0..127 counts as alphabetic
  for (int Ch=TCh::Mn; Ch<=TCh::Mx; Ch++){
    if ((Ch<0)||(127<Ch)){SetChTy(hlctAlpha, TStr(TCh(char(Ch))));}}
  //SetChTy(hlctSpace, TStr(TCh(char(160))));

  // Upper-Case
  // start from the identity mapping, then register the real pairs
  {for (int Ch=TCh::Mn; Ch<=TCh::Mx; Ch++){
    SetUcCh(char(Ch), char(Ch));}}
  SetUcCh("Aa"); SetUcCh("\xc0\xe0"); SetUcCh("\xc1\xe1"); SetUcCh("\xc2\xe2");
  SetUcCh("\xc3\xe3"); SetUcCh("\xc4\xe4"); SetUcCh("\xc5\xe5"); SetUcCh("\xc6\xe6");
  SetUcCh("Bb"); SetUcCh("Cc"); SetUcCh("\xc7\xe7"); SetUcCh("Dd");
  SetUcCh("\xd0\xf0"); SetUcCh("Ee"); SetUcCh("\xc8\xe8"); SetUcCh("\xc9\xe9");
  SetUcCh("\xca\xea"); SetUcCh("\xcb\xeb"); SetUcCh("Ff"); SetUcCh("Gg");
  SetUcCh("Hh"); SetUcCh("Ii"); SetUcCh("\xcc\xec"); SetUcCh("\xcd\xed");
  SetUcCh("\xce\xee"); SetUcCh("\xcf\xef"); SetUcCh("Jj"); SetUcCh("Kk");
  SetUcCh("Ll"); SetUcCh("Mm"); SetUcCh("Nn"); SetUcCh("\xd1\xf1");
  SetUcCh("Oo"); SetUcCh("\xd2\xf2"); SetUcCh("\xd3\xf3"); SetUcCh("\xd4\xf4");
  SetUcCh("\xd5\xf5"); SetUcCh("\xd6\xf6"); SetUcCh("\xd8\xf8"); SetUcCh("Pp");
  SetUcCh("Qq"); SetUcCh("Rr"); SetUcCh("Ss"); SetUcCh("\x8a\x9a");
  SetUcCh("Tt"); SetUcCh("Uu"); SetUcCh("\xd9\xf9"); SetUcCh("\xda\xfa");
  SetUcCh("\xdb\xfb"); SetUcCh("\xdc\xfc"); SetUcCh("Vv"); SetUcCh("Ww");
  SetUcCh("Xx"); SetUcCh("Yy\xff"); SetUcCh("\xdd\xfd"); SetUcCh("Zz");
  SetUcCh("\x8e\x9e");
  // ISO-CE
  //SetUcCh(uchar(169), uchar(185)); /*Sh - \xa9\xb9*/
  //SetUcCh(uchar(174), uchar(190)); /*Zh - \xae\xbe*/
  //SetUcCh(uchar(200), uchar(232)); /*Ch - \xc8\xe8*/
  //SetUcCh(uchar(198), uchar(230)); /*Cs - \xc6\xe6*/
  //SetUcCh(uchar(208), uchar(240)); /*Dz - \xd0\xf0*/

  // Annoying Unicode-characters
  //SetChTy(hlctSpace, "\xc2\xef");

  // Escape-Sequences
  SetEscStr("&quot;", "\""); SetEscStr("&amp;", "&");
  SetEscStr("&lt;", "<"); SetEscStr("&gt;", ">");
  SetEscStr("&nbsp;", " ");

  SetEscStr("&auml;", "\xe4"); SetEscStr("&Auml;", "\xc4");
  SetEscStr("&ouml;", "\xf6"); SetEscStr("&Ouml;", "\xd6");
  SetEscStr("&uuml;", "\xfc"); SetEscStr("&Uuml;", "\xdc");
  SetEscStr("&aring;", "\xe5"); SetEscStr("&Aring;", "\xc5");
  SetEscStr("&oslash;", "\xf8"); SetEscStr("&Oslash;", "\xd8");
  // NOTE(review): kept exactly as found -- no trailing ';' and not the
  // standard spelling "&AElig;"; confirm whether this entry can ever match.
  SetEscStr("&Aelig", "\xc6"); SetEscStr("&aelig;", "\xe6");

  // accented e/a are folded to their plain letters
  SetEscStr("&eacute;", "e"); SetEscStr("&Eacute;", "E");
  SetEscStr("&egrave;", "e"); SetEscStr("&Egrave;", "E");
  SetEscStr("&agrave;", "a"); SetEscStr("&Agrave;", "A");
}
| THtmlLxChDef | ( | TSIn & | SIn | ) |
| THtmlTok | ( | const THtmlLxSym & | _Sym | ) |
| THtmlTok | ( | const THtmlLxSym & | _Sym, |
| const TStr & | _Str | ||
| ) |
| THtmlTok | ( | const THtmlLxSym & | _Sym, |
| const TStr & | _Str, | ||
| const THtmlLx::TArgNmValV & | _ArgNmValV | ||
| ) |
const TStr THtmlTok::AltArgNm = "ALT" [static] |
const TStr THtmlTok::AreaTagNm = "<AREA>" [static] |
const TStr THtmlTok::ATagNm = "<A>" [static] |
const TStr THtmlTok::BrTagNm = "<BR>" [static] |
const TStr THtmlTok::CardTagNm = "<CARD>" [static] |
const TStr THtmlTok::CenterTagNm = "<CENTER>" [static] |
const TStr THtmlTok::FrameTagNm = "<FRAME>" [static] |
const TStr THtmlTok::H1TagNm = "<H1>" [static] |
const TStr THtmlTok::H2TagNm = "<H2>" [static] |
const TStr THtmlTok::H3TagNm = "<H3>" [static] |
const TStr THtmlTok::H4TagNm = "<H4>" [static] |
const TStr THtmlTok::H5TagNm = "<H5>" [static] |
const TStr THtmlTok::H6TagNm = "<H6>" [static] |
const TStr THtmlTok::HRefArgNm = "HREF" [static] |
const TStr THtmlTok::HttpEquivArgNm = "HTTP-EQUIV" [static] |
const TStr THtmlTok::ImgTagNm = "<IMG>" [static] |
const TStr THtmlTok::LiTagNm = "<LI>" [static] |
const TStr THtmlTok::MetaTagNm = "<META>" [static] |
const TStr THtmlTok::PTagNm = "<P>" [static] |
const TStr THtmlTok::SrcArgNm = "SRC" [static] |
const TStr THtmlTok::TitleArgNm = "TITLE" [static] |
const TStr THtmlTok::TitleETagNm = "</TITLE>" [static] |
const TStr THtmlTok::TitleTagNm = "<TITLE>" [static] |
const TStr THtmlTok::UlTagNm = "<UL>" [static] |